From 8a02d01e85556bbe3a1c6947bc11b8ef028d4023 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 17 Dec 2021 09:52:13 -0800 Subject: [PATCH] Fix zvol_open() lock inversion When restructuring the zvol_open() logic for the Linux 5.13 kernel a lock inversion was accidentally introduced. In the updated code the spa_namespace_lock is now taken before the zv_suspend_lock allowing the following scenario to occur: down_read <=== waiting for zv_suspend_lock zvol_open <=== holds spa_namespace_lock __blkdev_get blkdev_get_by_dev blkdev_open ... mutex_lock <== waiting for spa_namespace_lock spa_open_common spa_open dsl_pool_hold dmu_objset_hold_flags dmu_objset_hold dsl_prop_get dsl_prop_get_integer zvol_create_minor dmu_recv_end zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend() zfs_ioc_recv ... This commit resolves the issue by moving the acquisition of the spa_namespace_lock back to after the zv_suspend_lock which restores the original ordering. Additionally, as part of this change the error exit paths were simplified where possible. Reviewed-by: Tony Hutter Reviewed-by: Rich Ercolani Signed-off-by: Brian Behlendorf Closes #12863 --- module/os/linux/zfs/zvol_os.c | 121 ++++++++++++++++------------------ 1 file changed, 58 insertions(+), 63 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 44caadd587..69479b3f79 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) { zvol_state_t *zv; int error = 0; - boolean_t drop_suspend = B_TRUE; - boolean_t drop_namespace = B_FALSE; + boolean_t drop_suspend = B_FALSE; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); hrtime_t start = gethrtime(); @@ -517,7 +516,36 @@ retry: return (SET_ERROR(-ENXIO)); } - if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { + mutex_enter(&zv->zv_state_lock); + /* + * Make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 0) { + rw_exit(&zv->zv_suspend_lock); + } else { + drop_suspend = B_TRUE; + } + } else { + drop_suspend = B_TRUE; + } + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count == 0) { + boolean_t drop_namespace = B_FALSE; + + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); + /* * In all other call paths the spa_namespace_lock is taken * before the bdev->bd_mutex lock. However, on open(2) @@ -542,84 +570,51 @@ retry: * the kernel so the only option is to return the error for * the caller to handle it. */ - if (!mutex_tryenter(&spa_namespace_lock)) { - rw_exit(&zvol_state_lock); + if (!mutex_owned(&spa_namespace_lock)) { + if (!mutex_tryenter(&spa_namespace_lock)) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); #ifdef HAVE_BLKDEV_GET_ERESTARTSYS - schedule(); - return (SET_ERROR(-ERESTARTSYS)); -#else - if ((gethrtime() - start) > timeout) + schedule(); return (SET_ERROR(-ERESTARTSYS)); +#else + if ((gethrtime() - start) > timeout) + return (SET_ERROR(-ERESTARTSYS)); - schedule_timeout(MSEC_TO_TICK(10)); - goto retry; + schedule_timeout(MSEC_TO_TICK(10)); + goto retry; #endif - } else { - drop_namespace = B_TRUE; - } - } - - mutex_enter(&zv->zv_state_lock); - /* - * make sure zvol is not suspended during first open - * (hold zv_suspend_lock) and respect proper lock acquisition - * ordering - zv_suspend_lock before zv_state_lock - */ - if (zv->zv_open_count == 0) { - if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { - mutex_exit(&zv->zv_state_lock); - rw_enter(&zv->zv_suspend_lock, RW_READER); - mutex_enter(&zv->zv_state_lock); - /* check to see if zv_suspend_lock is needed */ - if (zv->zv_open_count != 0) { - rw_exit(&zv->zv_suspend_lock); - drop_suspend = B_FALSE; + } else { + drop_namespace = B_TRUE; } } - } else { - drop_suspend = B_FALSE; - } - rw_exit(&zvol_state_lock); - ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - - if (zv->zv_open_count == 0) { - ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); - if (error) - goto out_mutex; + + if (drop_namespace) + mutex_exit(&spa_namespace_lock); } - if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { - error = -EROFS; - goto out_open_count; - } + if (error == 0) { + if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + if (zv->zv_open_count == 0) + zvol_last_close(zv); - zv->zv_open_count++; + error = SET_ERROR(-EROFS); + } else { + zv->zv_open_count++; + } + } mutex_exit(&zv->zv_state_lock); - if (drop_namespace) - mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); - zfs_check_media_change(bdev); + if (error == 0) + zfs_check_media_change(bdev); - return (0); - -out_open_count: - if (zv->zv_open_count == 0) - zvol_last_close(zv); - -out_mutex: - mutex_exit(&zv->zv_state_lock); - if (drop_namespace) - mutex_exit(&spa_namespace_lock); - if (drop_suspend) - rw_exit(&zv->zv_suspend_lock); - - return (SET_ERROR(error)); + return (error); } static void