Add visibility in to txg sync behavior

This change is an attempt to add visibility in to how txgs are being
formed on a system, in real time. To do this, a list was added to the
in memory SPA data structure for a pool, with each element on the list
corresponding to txg. These entries are then exported through the kstat
interface, which can then be interpreted in userspace.

For each txg, the following information is exported:

 * Unique txg number (uint64_t)
 * The time the txd was born (hrtime_t)
   (*not* wall clock time; relative to the other entries on the list)
 * The current txg state ((O)pen/(Q)uiescing/(S)yncing/(C)ommitted)
 * The number of reserved bytes for the txg (uint64_t)
 * The number of bytes read during the txg (uint64_t)
 * The number of bytes written during the txg (uint64_t)
 * The number of read operations during the txg (uint64_t)
 * The number of write operations during the txg (uint64_t)
 * The time the txg was closed (hrtime_t)
 * The time the txg was quiesced (hrtime_t)
 * The time the txg was synced (hrtime_t)

Note that while the raw kstat now stores relative hrtimes for the
open, quiesce, and sync times.  Those relative times are used to
calculate how long each state took and these deltas and printed by
output handlers.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Brian Behlendorf 2013-10-01 09:50:50 -07:00
parent 1421c89142
commit 0b1401ee91
3 changed files with 322 additions and 1 deletions

View File

@ -548,12 +548,26 @@ typedef struct spa_stats_history {
typedef struct spa_stats {
spa_stats_history_t read_history;
spa_stats_history_t txg_history;
} spa_stats_t;
typedef enum txg_state {
TXG_STATE_BIRTH = 0,
TXG_STATE_OPEN = 1,
TXG_STATE_QUIESCED = 2,
TXG_STATE_SYNCED = 3,
TXG_STATE_COMMITTED = 4,
} txg_state_t;
extern void spa_stats_init(spa_t *spa);
extern void spa_stats_destroy(spa_t *spa);
extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb,
uint32_t aflags);
extern void spa_txg_history_add(spa_t *spa, uint64_t txg);
extern int spa_txg_history_set(spa_t *spa, uint64_t txg,
txg_state_t completed_state, hrtime_t completed_time);
extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved);
/* Pool configuration locks */
extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);

View File

@ -32,6 +32,11 @@ int zfs_read_history = 0;
*/
int zfs_read_history_hits = 0;
/*
* Keeps stats on the last N txgs, disabled by default.
*/
int zfs_txg_history = 0;
/*
* ==========================================================================
* SPA Read History Routines
@ -227,15 +232,289 @@ spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags)
mutex_exit(&ssh->lock);
}
/*
* ==========================================================================
* SPA TXG History Routines
* ==========================================================================
*/
/*
* Txg statistics - Information exported regarding each txg sync
*/
typedef struct spa_txg_history {
uint64_t txg; /* txg id */
txg_state_t state; /* active txg state */
uint64_t nread; /* number of bytes read */
uint64_t nwritten; /* number of bytes written */
uint64_t reads; /* number of read operations */
uint64_t writes; /* number of write operations */
uint64_t nreserved; /* number of bytes reserved */
hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */
list_node_t sth_link;
} spa_txg_history_t;
static int
spa_txg_history_headers(char *buf, size_t size)
{
size = snprintf(buf, size - 1, "%-8s %-16s %-5s %-12s %-12s %-12s "
"%-8s %-8s %-12s %-12s %-12s\n", "txg", "birth", "state",
"nreserved", "nread", "nwritten", "reads", "writes",
"otime", "qtime", "stime");
buf[size] = '\0';
return (0);
}
static int
spa_txg_history_data(char *buf, size_t size, void *data)
{
spa_txg_history_t *sth = (spa_txg_history_t *)data;
uint64_t open = 0, quiesce = 0, sync = 0;
char state;
switch (sth->state) {
case TXG_STATE_BIRTH: state = 'B'; break;
case TXG_STATE_OPEN: state = 'O'; break;
case TXG_STATE_QUIESCED: state = 'Q'; break;
case TXG_STATE_SYNCED: state = 'S'; break;
case TXG_STATE_COMMITTED: state = 'C'; break;
default: state = '?'; break;
}
if (sth->times[TXG_STATE_OPEN])
open = sth->times[TXG_STATE_OPEN] -
sth->times[TXG_STATE_BIRTH];
if (sth->times[TXG_STATE_QUIESCED])
quiesce = sth->times[TXG_STATE_QUIESCED] -
sth->times[TXG_STATE_OPEN];
if (sth->times[TXG_STATE_SYNCED])
sync = sth->times[TXG_STATE_SYNCED] -
sth->times[TXG_STATE_QUIESCED];
size = snprintf(buf, size - 1, "%-8llu %-16llu %-5c %-12llu "
"%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu\n",
(longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
(u_longlong_t)sth->nreserved,
(u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
(u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
(u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)sync);
buf[size] = '\0';
return (0);
}
/*
* Calculate the address for the next spa_stats_history_t entry. The
* ssh->lock will be held until ksp->ks_ndata entries are processed.
*/
static void *
spa_txg_history_addr(kstat_t *ksp, loff_t n)
{
spa_t *spa = ksp->ks_private;
spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
ASSERT(MUTEX_HELD(&ssh->lock));
if (n == 0)
ssh->private = list_tail(&ssh->list);
else if (ssh->private)
ssh->private = list_prev(&ssh->list, ssh->private);
return (ssh->private);
}
/*
* When the kstat is written discard all spa_txg_history_t entires. The
* ssh->lock will be held until ksp->ks_ndata entries are processed.
*/
static int
spa_txg_history_update(kstat_t *ksp, int rw)
{
spa_t *spa = ksp->ks_private;
spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
ASSERT(MUTEX_HELD(&ssh->lock));
if (rw == KSTAT_WRITE) {
spa_txg_history_t *sth;
while ((sth = list_remove_head(&ssh->list))) {
ssh->size--;
kmem_free(sth, sizeof(spa_txg_history_t));
}
ASSERT3U(ssh->size, ==, 0);
}
ksp->ks_ndata = ssh->size;
ksp->ks_data_size = ssh->size * sizeof(spa_txg_history_t);
return (0);
}
static void
spa_txg_history_init(spa_t *spa)
{
spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
char name[KSTAT_STRLEN];
kstat_t *ksp;
mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&ssh->list, sizeof (spa_txg_history_t),
offsetof(spa_txg_history_t, sth_link));
ssh->count = 0;
ssh->size = 0;
ssh->private = NULL;
(void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));
name[KSTAT_STRLEN-1] = '\0';
ksp = kstat_create(name, 0, "txgs", "misc",
KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
ssh->kstat = ksp;
if (ksp) {
ksp->ks_lock = &ssh->lock;
ksp->ks_data = NULL;
ksp->ks_private = spa;
ksp->ks_update = spa_txg_history_update;
kstat_set_raw_ops(ksp, spa_txg_history_headers,
spa_txg_history_data, spa_txg_history_addr);
kstat_install(ksp);
}
}
static void
spa_txg_history_destroy(spa_t *spa)
{
spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
spa_txg_history_t *sth;
kstat_t *ksp;
ksp = ssh->kstat;
if (ksp)
kstat_delete(ksp);
mutex_enter(&ssh->lock);
while ((sth = list_remove_head(&ssh->list))) {
ssh->size--;
kmem_free(sth, sizeof(spa_txg_history_t));
}
ASSERT3U(ssh->size, ==, 0);
list_destroy(&ssh->list);
mutex_exit(&ssh->lock);
mutex_destroy(&ssh->lock);
}
/*
* Add a new txg to historical record.
*/
void
spa_txg_history_add(spa_t *spa, uint64_t txg)
{
spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
spa_txg_history_t *sth, *rm;
if (zfs_txg_history == 0 && ssh->size == 0)
return;
sth = kmem_zalloc(sizeof(spa_txg_history_t), KM_PUSHPAGE);
sth->txg = txg;
sth->state = TXG_STATE_OPEN;
sth->times[TXG_STATE_BIRTH] = gethrtime();
mutex_enter(&ssh->lock);
list_insert_head(&ssh->list, sth);
ssh->size++;
while (ssh->size > zfs_txg_history) {
ssh->size--;
rm = list_remove_tail(&ssh->list);
kmem_free(rm, sizeof(spa_txg_history_t));
}
mutex_exit(&ssh->lock);
}
/*
* Set txg state completion time and increment current state.
*/
int
spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
hrtime_t completed_time)
{
spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
spa_txg_history_t *sth;
int error = ENOENT;
if (zfs_txg_history == 0)
return (0);
mutex_enter(&ssh->lock);
for (sth = list_head(&ssh->list); sth != NULL;
sth = list_next(&ssh->list, sth)) {
if (sth->txg == txg) {
sth->times[completed_state] = completed_time;
sth->state++;
error = 0;
break;
}
}
mutex_exit(&ssh->lock);
return (error);
}
/*
* Set txg IO stats.
*/
int
spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved)
{
spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
spa_txg_history_t *sth;
int error = ENOENT;
if (zfs_txg_history == 0)
return (0);
mutex_enter(&ssh->lock);
for (sth = list_head(&ssh->list); sth != NULL;
sth = list_next(&ssh->list, sth)) {
if (sth->txg == txg) {
sth->nread = nread;
sth->nwritten = nwritten;
sth->reads = reads;
sth->writes = writes;
sth->nreserved = nreserved;
error = 0;
break;
}
}
mutex_exit(&ssh->lock);
return (error);
}
void
spa_stats_init(spa_t *spa)
{
spa_read_history_init(spa);
spa_txg_history_init(spa);
}
void
spa_stats_destroy(spa_t *spa)
{
spa_txg_history_destroy(spa);
spa_read_history_destroy(spa);
}
@ -245,4 +524,7 @@ MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads");
module_param(zfs_read_history_hits, int, 0644);
MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history");
module_param(zfs_txg_history, int, 0644);
MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs");
#endif

View File

@ -27,6 +27,7 @@
#include <sys/zfs_context.h>
#include <sys/txg_impl.h>
#include <sys/dmu_impl.h>
#include <sys/spa_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
@ -363,6 +364,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
ASSERT(txg == tx->tx_open_txg);
tx->tx_open_txg++;
spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime());
spa_txg_history_add(dp->dp_spa, tx->tx_open_txg);
/*
* Now that we've incremented tx_open_txg, we can let threads
* enter the next transaction group.
@ -380,6 +384,8 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
cv_wait(&tc->tc_cv[g], &tc->tc_lock);
mutex_exit(&tc->tc_lock);
}
spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime());
}
static void
@ -451,6 +457,7 @@ txg_sync_thread(dsl_pool_t *dp)
spa_t *spa = dp->dp_spa;
tx_state_t *tx = &dp->dp_tx;
callb_cpr_t cpr;
vdev_stat_t *vs1, *vs2;
uint64_t start, delta;
#ifdef _KERNEL
@ -464,6 +471,9 @@ txg_sync_thread(dsl_pool_t *dp)
txg_thread_enter(tx, &cpr);
vs1 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE);
vs2 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE);
start = delta = 0;
for (;;) {
uint64_t timer, timeout;
@ -499,8 +509,13 @@ txg_sync_thread(dsl_pool_t *dp)
txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
}
if (tx->tx_exiting)
if (tx->tx_exiting) {
kmem_free(vs2, sizeof(vdev_stat_t));
kmem_free(vs1, sizeof(vdev_stat_t));
txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
}
vdev_get_stats(spa->spa_root_vdev, vs1);
/*
* Consume the quiesced txg which has been handed off to
@ -529,6 +544,16 @@ txg_sync_thread(dsl_pool_t *dp)
* Dispatch commit callbacks to worker threads.
*/
txg_dispatch_callbacks(dp, txg);
vdev_get_stats(spa->spa_root_vdev, vs2);
spa_txg_history_set_io(spa, txg,
vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ],
vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE],
vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ],
vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE],
dp->dp_space_towrite[txg & TXG_MASK] +
dp->dp_tempreserved[txg & TXG_MASK] / 2);
spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime());
}
}