3112 ztest does not honor ZFS_DEBUG
3113 ztest should use watchpoints to protect frozen arc bufs
3114 some leaked nvlists in zfsdev_ioctl

Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Matt Amdur <Matt.Amdur@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <chris.siden@delphix.com>
Approved by: Eric Schrock <eric.schrock@delphix.com>

References:
  https://www.illumos.org/issues/3112
  https://www.illumos.org/issues/3113
  https://www.illumos.org/issues/3114
  illumos/illumos-gate@cd1c8b85eb

The /proc/self/ctl watchpoint interface is specific to Solaris.
Therefore, the #3113 implementation was reworked to use the more
portable mprotect(2) system call.  When the pages are watched they
are marked read-only for protection.  Any write to the protected
address range immediately triggers a SIGSEGV.  The pages are marked
writable again when they are unwatched.

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1489
This commit is contained in:
Matthew Ahrens 2013-05-16 14:18:06 -07:00 committed by Brian Behlendorf
parent 03c6040bee
commit 498877baf5
6 changed files with 89 additions and 12 deletions

View File

@ -6210,11 +6210,12 @@ main(int argc, char **argv)
(void) setvbuf(stdout, NULL, _IOLBF, 0); (void) setvbuf(stdout, NULL, _IOLBF, 0);
dprintf_setup(&argc, argv);
ztest_fd_rand = open("/dev/urandom", O_RDONLY); ztest_fd_rand = open("/dev/urandom", O_RDONLY);
ASSERT3S(ztest_fd_rand, >=, 0); ASSERT3S(ztest_fd_rand, >=, 0);
if (!fd_data_str) { if (!fd_data_str) {
dprintf_setup(&argc, argv);
process_options(argc, argv); process_options(argc, argv);
setup_data_fd(); setup_data_fd();

View File

@ -136,6 +136,7 @@ int arc_buf_size(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag); void arc_release(arc_buf_t *buf, void *tag);
int arc_released(arc_buf_t *buf); int arc_released(arc_buf_t *buf);
int arc_has_callback(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf);
void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused);
void arc_buf_freeze(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf);
void arc_buf_thaw(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf);
boolean_t arc_buf_eviction_needed(arc_buf_t *buf); boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
@ -183,6 +184,10 @@ extern int zfs_write_limit_shift;
extern unsigned long zfs_write_limit_max; extern unsigned long zfs_write_limit_max;
extern kmutex_t zfs_write_limit_lock; extern kmutex_t zfs_write_limit_lock;
#ifndef _KERNEL
extern boolean_t arc_watch;
#endif
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -97,6 +97,8 @@
#include <dirent.h> #include <dirent.h>
#include <time.h> #include <time.h>
#include <ctype.h> #include <ctype.h>
#include <signal.h>
#include <sys/mman.h>
#include <sys/note.h> #include <sys/note.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/cred.h> #include <sys/cred.h>

View File

@ -145,6 +145,11 @@
#include <sys/dmu_tx.h> #include <sys/dmu_tx.h>
#include <zfs_fletcher.h> #include <zfs_fletcher.h>
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
#endif
static kmutex_t arc_reclaim_thr_lock; static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit; static uint8_t arc_thread_exit;
@ -569,6 +574,7 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type); static int arc_evict_needed(arc_buf_contents_t type);
static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
arc_buf_contents_t type); arc_buf_contents_t type);
static void arc_buf_watch(arc_buf_t *buf);
static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
@ -1060,6 +1066,37 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
fletcher_2_native(buf->b_data, buf->b_hdr->b_size, fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
buf->b_hdr->b_freeze_cksum); buf->b_hdr->b_freeze_cksum);
mutex_exit(&buf->b_hdr->b_freeze_lock); mutex_exit(&buf->b_hdr->b_freeze_lock);
arc_buf_watch(buf);
}
#ifndef _KERNEL
/*
 * SIGSEGV handler for userland builds, written with the three-argument
 * signature required of an SA_SIGINFO handler.  It panics and reports
 * the faulting address taken from si->si_addr.
 *
 *	sig	signal number (unused)
 *	si	signal information; only si_addr is read
 *	unused	third handler argument (unused)
 */
/* ARGSUSED */
void
arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
{
	/* %lx consumes an unsigned long, so cast to exactly that type. */
	panic("Got SIGSEGV at address: 0x%lx\n",
	    (unsigned long) si->si_addr);
}
#endif
/*
 * Remove write protection from a buffer's data by restoring read/write
 * access to the b_hdr->b_size bytes starting at b_data.
 *
 * This does nothing in kernel builds, or when arc_watch is not set.
 *
 * The mprotect() call is deliberately made outside of ASSERT0(): in a
 * build where assertions are compiled out, an expression placed inside
 * the macro would never be evaluated and the pages would stay read-only.
 */
/* ARGSUSED */
static void
arc_buf_unwatch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch) {
		int err = mprotect(buf->b_data, buf->b_hdr->b_size,
		    PROT_READ | PROT_WRITE);

		ASSERT0(err);
		/* Keeps 'err' referenced if ASSERT0() expands to nothing. */
		(void) err;
	}
#endif
}
/*
 * Write-protect a buffer's data by marking the b_hdr->b_size bytes
 * starting at b_data read-only.
 *
 * This does nothing in kernel builds, or when arc_watch is not set.
 *
 * The mprotect() call is deliberately made outside of ASSERT0(): in a
 * build where assertions are compiled out, an expression placed inside
 * the macro would never be evaluated and no protection would be applied.
 */
/* ARGSUSED */
static void
arc_buf_watch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch) {
		int err = mprotect(buf->b_data, buf->b_hdr->b_size,
		    PROT_READ);

		ASSERT0(err);
		/* Keeps 'err' referenced if ASSERT0() expands to nothing. */
		(void) err;
	}
#endif
}
void void
@ -1080,6 +1117,8 @@ arc_buf_thaw(arc_buf_t *buf)
} }
mutex_exit(&buf->b_hdr->b_freeze_lock); mutex_exit(&buf->b_hdr->b_freeze_lock);
arc_buf_unwatch(buf);
} }
void void
@ -1097,6 +1136,7 @@ arc_buf_freeze(arc_buf_t *buf)
buf->b_hdr->b_state == arc_anon); buf->b_hdr->b_state == arc_anon);
arc_cksum_compute(buf, B_FALSE); arc_cksum_compute(buf, B_FALSE);
mutex_exit(hash_lock); mutex_exit(hash_lock);
} }
static void static void
@ -1504,21 +1544,22 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
* the buffer is placed on l2arc_free_on_write to be freed later. * the buffer is placed on l2arc_free_on_write to be freed later.
*/ */
static void static void
arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
void *data, size_t size)
{ {
arc_buf_hdr_t *hdr = buf->b_hdr;
if (HDR_L2_WRITING(hdr)) { if (HDR_L2_WRITING(hdr)) {
l2arc_data_free_t *df; l2arc_data_free_t *df;
df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE); df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE);
df->l2df_data = data; df->l2df_data = buf->b_data;
df->l2df_size = size; df->l2df_size = hdr->b_size;
df->l2df_func = free_func; df->l2df_func = free_func;
mutex_enter(&l2arc_free_on_write_mtx); mutex_enter(&l2arc_free_on_write_mtx);
list_insert_head(l2arc_free_on_write, df); list_insert_head(l2arc_free_on_write, df);
mutex_exit(&l2arc_free_on_write_mtx); mutex_exit(&l2arc_free_on_write_mtx);
ARCSTAT_BUMP(arcstat_l2_free_on_write); ARCSTAT_BUMP(arcstat_l2_free_on_write);
} else { } else {
free_func(data, size); free_func(buf->b_data, hdr->b_size);
} }
} }
@ -1534,16 +1575,15 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
arc_buf_contents_t type = buf->b_hdr->b_type; arc_buf_contents_t type = buf->b_hdr->b_type;
arc_cksum_verify(buf); arc_cksum_verify(buf);
arc_buf_unwatch(buf);
if (!recycle) { if (!recycle) {
if (type == ARC_BUFC_METADATA) { if (type == ARC_BUFC_METADATA) {
arc_buf_data_free(buf->b_hdr, zio_buf_free, arc_buf_data_free(buf, zio_buf_free);
buf->b_data, size);
arc_space_return(size, ARC_SPACE_DATA); arc_space_return(size, ARC_SPACE_DATA);
} else { } else {
ASSERT(type == ARC_BUFC_DATA); ASSERT(type == ARC_BUFC_DATA);
arc_buf_data_free(buf->b_hdr, arc_buf_data_free(buf, zio_data_buf_free);
zio_data_buf_free, buf->b_data, size);
ARCSTAT_INCR(arcstat_data_size, -size); ARCSTAT_INCR(arcstat_data_size, -size);
atomic_add_64(&arc_size, -size); atomic_add_64(&arc_size, -size);
} }
@ -2908,6 +2948,7 @@ arc_read_done(zio_t *zio)
} }
arc_cksum_compute(buf, B_FALSE); arc_cksum_compute(buf, B_FALSE);
arc_buf_watch(buf);
if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
/* /*
@ -3542,6 +3583,7 @@ arc_release(arc_buf_t *buf, void *tag)
} }
hdr->b_datacnt -= 1; hdr->b_datacnt -= 1;
arc_cksum_verify(buf); arc_cksum_verify(buf);
arc_buf_unwatch(buf);
mutex_exit(hash_lock); mutex_exit(hash_lock);

View File

@ -1630,6 +1630,23 @@ spa_init(int mode)
spa_mode_global = mode; spa_mode_global = mode;
#ifndef _KERNEL
if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
struct sigaction sa;
sa.sa_flags = SA_SIGINFO;
sigemptyset(&sa.sa_mask);
sa.sa_sigaction = arc_buf_sigsegv;
if (sigaction(SIGSEGV, &sa, NULL) == -1) {
perror("could not enable watchpoints: "
"sigaction(SIGSEGV, ...) = ");
} else {
arc_watch = B_TRUE;
}
}
#endif
fm_init(); fm_init();
refcount_init(); refcount_init();
unique_init(); unique_init();

View File

@ -169,11 +169,21 @@ zio_init(void)
while (p2 & (p2 - 1)) while (p2 & (p2 - 1))
p2 &= p2 - 1; p2 &= p2 - 1;
#ifndef _KERNEL
/*
* If we are using watchpoints, put each buffer on its own page,
* to eliminate the performance overhead of trapping to the
* kernel when modifying a non-watched buffer that shares the
* page with a watched buffer.
*/
if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
continue;
#endif
if (size <= 4 * SPA_MINBLOCKSIZE) { if (size <= 4 * SPA_MINBLOCKSIZE) {
align = SPA_MINBLOCKSIZE; align = SPA_MINBLOCKSIZE;
} else if (P2PHASE(size, PAGESIZE) == 0) { } else if (IS_P2ALIGNED(size, PAGESIZE)) {
align = PAGESIZE; align = PAGESIZE;
} else if (P2PHASE(size, p2 >> 2) == 0) { } else if (IS_P2ALIGNED(size, p2 >> 2)) {
align = p2 >> 2; align = p2 >> 2;
} }