From 5016b82a49eb06cbe2002db7bd8a5501ba4ef6d1 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 7 May 2012 12:00:56 +0200 Subject: [PATCH] drbd: fix race between drbdadm invalidate/verify and finishing resync When a resync or online verify is finished or aborted, drbd does a bulk write-out of changed bitmap pages. If *in that very moment* a new verify or resync is triggered, this can race: ASSERT( !test_bit(BITMAP_IO, &mdev->flags) ) in drbd_main.c FIXME going to queue 'set_n_write from StartingSync' but 'write from resync_finished' still pending? and similar. This can be observed with e.g. tight invalidate loops in test scripts, and probably has no real-life implication. Still, that race can be solved by first quiescen the device, before starting a new resync or verify. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 3a8fa89f673a..cbd45de533cb 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2372,6 +2372,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); @@ -2390,6 +2391,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); } + drbd_resume_io(mdev); out: drbd_adm_finish(info, retcode); @@ -2435,6 +2437,11 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) mdev = adm_ctx.mdev; + /* If there is still bitmap IO pending, probably because of a previous + * resync just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); + wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); + retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); if (retcode < SS_SUCCESS) { if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { @@ -2450,6 +2457,7 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); } + drbd_resume_io(mdev); out: drbd_adm_finish(info, retcode); @@ -2903,8 +2911,10 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) } /* If there is still bitmap IO pending, e.g. previous resync or verify * just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); + drbd_resume_io(mdev); out: drbd_adm_finish(info, retcode); return 0; -- 2.39.5