OpenZFS 9580 - Add a hash-table on top of nvlist to speed-up operations

= Motivation

While dealing with another performance issue (see 126118f) we noticed
that we spend a lot of time in various places in the kernel when
constructing long nvlists. The problem is that when an nvlist is created
with the NV_UNIQUE_NAME set (which is the case most of the time), we do
a linear search through the whole list to ensure uniqueness for every
entry we add.

An example of the above scenario can be seen in the following
flamegraph, where more than have the time of the zfsdev_ioctl() is spent
on constructing nvlists.  Flamegraph:
https://sdimitro.github.io/img/flame/sdimitro_snap_unmount3.svg

Adding a table to speed up lookups will help situations where we just
construct an nvlist (like the scenario above), in addition to regular
lookups and removals.

= What this patch does

In this diff we've implemented a hash-table on top of the nvlist code
that converts most nvlist operations from O(# number of entries) to
O(1)* (the start is for amortized time as the hash-table grows and
shrinks depending on the # of entries - plain lookup is strictly O(1)).

= Performance Analysis

To analyze the performance improvement I just used the setup from the
snapshot deletion issue mentioned above in the Motivation section.
Basically I created 10K filesystems with one snapshot each and then I
just used the API of libZFS_Core to pass down an nvlist of all the
snapshots to have them deleted. The reason I used my own driver program
was to have clean performance results of what actually happens in the
kernel. The flamegraphs and wall clock times mentioned below were
gathered from the start to the end of the driver program's run. Between
trials the testpool used was completely destroyed, the system was
rebooted and the testpool was completely recreated. The reason for this
dance was to get consistent results.

== Results (before patch):

=== Sampling Flamegraphs

[Trial 1] https://sdimitro.github.io/img/flame/DLPX-53417/trial-A.svg
[Trial 2] https://sdimitro.github.io/img/flame/DLPX-53417/trial-A2.svg
[Trial 3] https://sdimitro.github.io/img/flame/DLPX-53417/trial-A3.svg

=== Wall clock times (in seconds)

```
[Trial 4]
real        5.3
user        0.4
sys         2.3

[Trial 5]
real        8.2
user        0.4
sys         2.4

[Trial 6]
real        6.0
user        0.5
sys         2.3
```

== Results (after patch):

=== Sampling Flamegraphs

[Trial 1] https://sdimitro.github.io/img/flame/DLPX-53417/trial-Ae.svg
[Trial 2] https://sdimitro.github.io/img/flame/DLPX-53417/trial-A2e.svg
[Trial 3] https://sdimitro.github.io/img/flame/DLPX-53417/trial-A3e.svg

=== Wall clock times (in seconds)

```
[Trial 4]
real        4.9
user        0.0
sys         0.9

[Trial 5]
real        3.8
user        0.0
sys         0.9

[Trial 6]
real        3.6
user        0.0
sys         0.9
```

== Analysis

The results between the trials are consistent so in this sections I will
only talk about the flamegraph results from trial-1 and the wall-clock
results from trial-4.

From trial-1 we can see that zfs_dev_ioctl() goes from 2,331 to 996
samples counts.  Specifically, the samples from fnvlist_add_nvlist() and
spa_history_log_nvl() are almost gone (~500 & ~800 to 5 & 5 samples),
leaving zfs_ioc_destroy_snaps() to dominate most samples from
zfs_dev_ioctl().

From trial-4 we see that the user time dropped to 0 secods. I believe
the consistent 0.4 seconds before my patch was applied was due to my
driver program constructing the long nvlist of snapshots so it can pass
it to the kernel. As for the system time, the effect there is more clear
(2.3 down to 0.9 seconds).

Porting Notes:
* DATA_TYPE_DONTCARE case added to switch in fm_nvprintr() and
  zpool_do_events_nvprint().

Authored by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Sebastien Roy <sebastien.roy@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>

OpenZFS-issue: https://www.illumos.org/issues/9580
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/b5eca7b1
Closes #7748
This commit is contained in:
Serapheim Dimitropoulos 2017-12-05 09:57:42 -08:00 committed by Brian Behlendorf
parent 1897bc0d48
commit 6b64382b17
6 changed files with 348 additions and 55 deletions

View File

@ -8087,6 +8087,7 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth)
case DATA_TYPE_BOOLEAN_ARRAY:
case DATA_TYPE_BYTE_ARRAY:
case DATA_TYPE_DOUBLE:
case DATA_TYPE_DONTCARE:
case DATA_TYPE_UNKNOWN:
printf(gettext("<unknown>"));
break;

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_NVPAIR_H
@ -35,6 +35,7 @@ extern "C" {
#endif
typedef enum {
DATA_TYPE_DONTCARE = -1,
DATA_TYPE_UNKNOWN = 0,
DATA_TYPE_BOOLEAN,
DATA_TYPE_BYTE,

View File

@ -24,11 +24,13 @@
* Use is subject to license terms.
*/
/*
* Copyright (c) 2017 by Delphix. All rights reserved.
*/
#ifndef _NVPAIR_IMPL_H
#define _NVPAIR_IMPL_H
#ifdef __cplusplus
extern "C" {
#endif
@ -47,16 +49,27 @@ typedef struct i_nvp i_nvp_t;
struct i_nvp {
union {
uint64_t _nvi_align; /* ensure alignment */
/* ensure alignment */
uint64_t _nvi_align;
struct {
i_nvp_t *_nvi_next; /* pointer to next nvpair */
i_nvp_t *_nvi_prev; /* pointer to prev nvpair */
/* pointer to next nvpair */
i_nvp_t *_nvi_next;
/* pointer to prev nvpair */
i_nvp_t *_nvi_prev;
/* next pair in table bucket */
i_nvp_t *_nvi_hashtable_next;
} _nvi;
} _nvi_un;
nvpair_t nvi_nvp; /* nvpair */
/* nvpair */
nvpair_t nvi_nvp;
};
#define nvi_next _nvi_un._nvi._nvi_next
#define nvi_prev _nvi_un._nvi._nvi_prev
#define nvi_hashtable_next _nvi_un._nvi._nvi_hashtable_next
typedef struct {
i_nvp_t *nvp_list; /* linked list of nvpairs */
@ -64,6 +77,10 @@ typedef struct {
i_nvp_t *nvp_curr; /* current walker nvpair */
nv_alloc_t *nvp_nva; /* pluggable allocator */
uint32_t nvp_stat; /* internal state */
i_nvp_t **nvp_hashtable; /* table of entries used for lookup */
uint32_t nvp_nbuckets; /* # of buckets in hash table */
uint32_t nvp_nentries; /* # of entries in hash table */
} nvpriv_t;
#ifdef __cplusplus

View File

@ -10,6 +10,7 @@
*/
/*
* Copyright (c) 2014, Joyent, Inc.
* Copyright (c) 2017 by Delphix. All rights reserved.
*/
#include <stdio.h>
@ -394,8 +395,10 @@ nvlist_print_json(FILE *fp, nvlist_t *nvl)
}
case DATA_TYPE_UNKNOWN:
case DATA_TYPE_DONTCARE:
return (-1);
}
}
FPRINTF(fp, "}");

View File

@ -139,6 +139,8 @@ int nvpair_max_recursion = 20;
int nvpair_max_recursion = 100;
#endif
uint64_t nvlist_hashtable_init_size = (1 << 4);
int
nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...)
{
@ -246,6 +248,291 @@ nv_priv_alloc_embedded(nvpriv_t *priv)
return (emb_priv);
}
static int
nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets)
{
ASSERT3P(priv->nvp_hashtable, ==, NULL);
ASSERT0(priv->nvp_nbuckets);
ASSERT0(priv->nvp_nentries);
i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *));
if (tab == NULL)
return (ENOMEM);
priv->nvp_hashtable = tab;
priv->nvp_nbuckets = buckets;
return (0);
}
static void
nvt_tab_free(nvpriv_t *priv)
{
i_nvp_t **tab = priv->nvp_hashtable;
if (tab == NULL) {
ASSERT0(priv->nvp_nbuckets);
ASSERT0(priv->nvp_nentries);
return;
}
nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *));
priv->nvp_hashtable = NULL;
priv->nvp_nbuckets = 0;
priv->nvp_nentries = 0;
}
static uint32_t
nvt_hash(const char *p)
{
uint32_t g, hval = 0;
while (*p) {
hval = (hval << 4) + *p++;
if ((g = (hval & 0xf0000000)) != 0)
hval ^= g >> 24;
hval &= ~g;
}
return (hval);
}
static boolean_t
nvt_nvpair_match(nvpair_t *nvp1, nvpair_t *nvp2, uint32_t nvflag)
{
boolean_t match = B_FALSE;
if (nvflag & NV_UNIQUE_NAME_TYPE) {
if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 &&
NVP_TYPE(nvp1) == NVP_TYPE(nvp2))
match = B_TRUE;
} else {
ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME);
if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0)
match = B_TRUE;
}
return (match);
}
static nvpair_t *
nvt_lookup_name_type(nvlist_t *nvl, const char *name, data_type_t type)
{
nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
ASSERT(priv != NULL);
i_nvp_t **tab = priv->nvp_hashtable;
if (tab == NULL) {
ASSERT3P(priv->nvp_list, ==, NULL);
ASSERT0(priv->nvp_nbuckets);
ASSERT0(priv->nvp_nentries);
return (NULL);
} else {
ASSERT(priv->nvp_nbuckets != 0);
}
uint64_t hash = nvt_hash(name);
uint64_t index = hash & (priv->nvp_nbuckets - 1);
ASSERT3U(index, <, priv->nvp_nbuckets);
i_nvp_t *entry = tab[index];
for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) {
if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 &&
(type == DATA_TYPE_DONTCARE ||
NVP_TYPE(&e->nvi_nvp) == type))
return (&e->nvi_nvp);
}
return (NULL);
}
static nvpair_t *
nvt_lookup_name(nvlist_t *nvl, const char *name)
{
return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE));
}
static int
nvt_resize(nvpriv_t *priv, uint32_t new_size)
{
i_nvp_t **tab = priv->nvp_hashtable;
/*
* Migrate all the entries from the current table
* to a newly-allocated table with the new size by
* re-adjusting the pointers of their entries.
*/
uint32_t size = priv->nvp_nbuckets;
uint32_t new_mask = new_size - 1;
ASSERT(ISP2(new_size));
i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *));
if (new_tab == NULL)
return (ENOMEM);
uint32_t nentries = 0;
for (uint32_t i = 0; i < size; i++) {
i_nvp_t *next, *e = tab[i];
while (e != NULL) {
next = e->nvi_hashtable_next;
uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp));
uint32_t index = hash & new_mask;
e->nvi_hashtable_next = new_tab[index];
new_tab[index] = e;
nentries++;
e = next;
}
tab[i] = NULL;
}
ASSERT3U(nentries, ==, priv->nvp_nentries);
nvt_tab_free(priv);
priv->nvp_hashtable = new_tab;
priv->nvp_nbuckets = new_size;
priv->nvp_nentries = nentries;
return (0);
}
static boolean_t
nvt_needs_togrow(nvpriv_t *priv)
{
/*
* Grow only when we have more elements than buckets
* and the # of buckets doesn't overflow.
*/
return (priv->nvp_nentries > priv->nvp_nbuckets &&
(UINT32_MAX >> 1) >= priv->nvp_nbuckets);
}
/*
* Allocate a new table that's twice the size of the old one,
* and migrate all the entries from the old one to the new
* one by re-adjusting their pointers.
*/
static int
nvt_grow(nvpriv_t *priv)
{
uint32_t current_size = priv->nvp_nbuckets;
/* ensure we won't overflow */
ASSERT3U(UINT32_MAX >> 1, >=, current_size);
return (nvt_resize(priv, current_size << 1));
}
static boolean_t
nvt_needs_toshrink(nvpriv_t *priv)
{
/*
* Shrink only when the # of elements is less than or
* equal to 1/4 the # of buckets. Never shrink less than
* nvlist_hashtable_init_size.
*/
ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size);
if (priv->nvp_nbuckets == nvlist_hashtable_init_size)
return (B_FALSE);
return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2));
}
/*
* Allocate a new table that's half the size of the old one,
* and migrate all the entries from the old one to the new
* one by re-adjusting their pointers.
*/
static int
nvt_shrink(nvpriv_t *priv)
{
uint32_t current_size = priv->nvp_nbuckets;
/* ensure we won't overflow */
ASSERT3U(current_size, >=, nvlist_hashtable_init_size);
return (nvt_resize(priv, current_size >> 1));
}
static int
nvt_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
{
nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
if (nvt_needs_toshrink(priv)) {
int err = nvt_shrink(priv);
if (err != 0)
return (err);
}
i_nvp_t **tab = priv->nvp_hashtable;
char *name = NVP_NAME(nvp);
uint64_t hash = nvt_hash(name);
uint64_t index = hash & (priv->nvp_nbuckets - 1);
ASSERT3U(index, <, priv->nvp_nbuckets);
i_nvp_t *bucket = tab[index];
for (i_nvp_t *prev = NULL, *e = bucket;
e != NULL; prev = e, e = e->nvi_hashtable_next) {
if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_flag)) {
if (prev != NULL) {
prev->nvi_hashtable_next =
e->nvi_hashtable_next;
} else {
ASSERT3P(e, ==, bucket);
tab[index] = e->nvi_hashtable_next;
}
e->nvi_hashtable_next = NULL;
priv->nvp_nentries--;
break;
}
}
return (0);
}
static int
nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
{
nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
/* initialize nvpair table now if it doesn't exist. */
if (priv->nvp_hashtable == NULL) {
int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size);
if (err != 0)
return (err);
}
/*
* if we don't allow duplicate entries, make sure to
* unlink any existing entries from the table.
*/
if (nvl->nvl_nvflag != 0) {
int err = nvt_remove_nvpair(nvl, nvp);
if (err != 0)
return (err);
}
if (nvt_needs_togrow(priv)) {
int err = nvt_grow(priv);
if (err != 0)
return (err);
}
i_nvp_t **tab = priv->nvp_hashtable;
char *name = NVP_NAME(nvp);
uint64_t hash = nvt_hash(name);
uint64_t index = hash & (priv->nvp_nbuckets - 1);
ASSERT3U(index, <, priv->nvp_nbuckets);
i_nvp_t *bucket = tab[index];
/* insert link at the beginning of the bucket */
i_nvp_t *new_entry = NVPAIR2I_NVP(nvp);
ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL);
new_entry->nvi_hashtable_next = bucket;
tab[index] = new_entry;
priv->nvp_nentries++;
return (0);
}
static void
nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
{
@ -590,6 +877,7 @@ nvlist_free(nvlist_t *nvl)
else
nvl->nvl_priv = 0;
nvt_tab_free(priv);
nv_mem_free(priv, priv, sizeof (nvpriv_t));
}
@ -644,26 +932,14 @@ nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva)
int
nvlist_remove_all(nvlist_t *nvl, const char *name)
{
nvpriv_t *priv;
i_nvp_t *curr;
int error = ENOENT;
if (nvl == NULL || name == NULL ||
(priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
return (EINVAL);
curr = priv->nvp_list;
while (curr != NULL) {
nvpair_t *nvp = &curr->nvi_nvp;
curr = curr->nvi_next;
if (strcmp(name, NVP_NAME(nvp)) != 0)
continue;
nvp_buf_unlink(nvl, nvp);
nvpair_free(nvp);
nvp_buf_free(nvl, nvp);
nvpair_t *nvp;
while ((nvp = nvt_lookup_name(nvl, name)) != NULL) {
VERIFY0(nvlist_remove_nvpair(nvl, nvp));
error = 0;
}
@ -676,28 +952,14 @@ nvlist_remove_all(nvlist_t *nvl, const char *name)
int
nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
{
nvpriv_t *priv;
i_nvp_t *curr;
if (nvl == NULL || name == NULL ||
(priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
return (EINVAL);
curr = priv->nvp_list;
while (curr != NULL) {
nvpair_t *nvp = &curr->nvi_nvp;
nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
if (nvp == NULL)
return (ENOENT);
if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) {
nvp_buf_unlink(nvl, nvp);
nvpair_free(nvp);
nvp_buf_free(nvl, nvp);
return (0);
}
curr = curr->nvi_next;
}
return (ENOENT);
return (nvlist_remove_nvpair(nvl, nvp));
}
int
@ -706,6 +968,10 @@ nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
if (nvl == NULL || nvp == NULL)
return (EINVAL);
int err = nvt_remove_nvpair(nvl, nvp);
if (err != 0)
return (err);
nvp_buf_unlink(nvl, nvp);
nvpair_free(nvp);
nvp_buf_free(nvl, nvp);
@ -983,6 +1249,12 @@ nvlist_add_common(nvlist_t *nvl, const char *name,
else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE)
(void) nvlist_remove(nvl, name, type);
err = nvt_add_nvpair(nvl, nvp);
if (err != 0) {
nvpair_free(nvp);
nvp_buf_free(nvl, nvp);
return (err);
}
nvp_buf_link(nvl, nvp);
return (0);
@ -1335,25 +1607,17 @@ static int
nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type,
uint_t *nelem, void *data)
{
nvpriv_t *priv;
nvpair_t *nvp;
i_nvp_t *curr;
if (name == NULL || nvl == NULL ||
(priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
return (EINVAL);
if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE)))
return (ENOTSUP);
for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
nvp = &curr->nvi_nvp;
nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
if (nvp == NULL)
return (ENOENT);
if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type)
return (nvpair_value_common(nvp, type, nelem, data));
}
return (ENOENT);
return (nvpair_value_common(nvp, type, nelem, data));
}
int
@ -2111,6 +2375,12 @@ nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl)
return (EFAULT);
}
err = nvt_add_nvpair(nvl, nvp);
if (err != 0) {
nvpair_free(nvp);
nvp_buf_free(nvl, nvp);
return (err);
}
nvp_buf_link(nvl, nvp);
}
return (err);

View File

@ -393,6 +393,7 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
break;
case DATA_TYPE_UNKNOWN:
case DATA_TYPE_DONTCARE:
c = fm_printf(d + 1, c, cols, "<unknown>");
break;
}