]> git.karo-electronics.de Git - mv-sheeva.git/commitdiff
drbd: fix potential data divergence after multiple failures
authorLars Ellenberg <lars.ellenberg@linbit.com>
Mon, 18 Oct 2010 21:04:07 +0000 (23:04 +0200)
committerPhilipp Reisner <philipp.reisner@linbit.com>
Fri, 22 Oct 2010 13:50:27 +0000 (15:50 +0200)
If we get an IO-error during an activity log transaction,
if we failed to write the bitmap of the evicted extent,
we must not write the transaction itself.
If we failed to write the transaction,
we must not even submit the corresponding bio,
as its extent is not yet marked in the activity log.

Otherwise, if this was a disconneted Primary (degraded cluster), which
now lost its disk as well, and we later re-attach the same backend
storage, we possibly "forget" to resync some parts of the disk that
potentially have been changed.

On the receiving side, when receiving from a peer with unhealthy disk,
checking for pdsk == D_DISKLESS is not enough, we need to set out of
sync and do AL transactions for everything pdsk < D_INCONSISTENT on the
receiving side.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
drivers/block/drbd/drbd_actlog.c
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c

index ac04ef97eac29a0203e38d08a942c1733e5d9285..bd925180a2b07167f8702264125e25b8716f3659 100644 (file)
@@ -284,18 +284,32 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
        u32 xor_sum = 0;
 
        if (!get_ldev(mdev)) {
-               dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
+               dev_err(DEV,
+                       "disk is %s, cannot start al transaction (-%d +%d)\n",
+                       drbd_disk_str(mdev->state.disk), evicted, new_enr);
                complete(&((struct update_al_work *)w)->event);
                return 1;
        }
        /* do we have to do a bitmap write, first?
         * TODO reduce maximum latency:
         * submit both bios, then wait for both,
-        * instead of doing two synchronous sector writes. */
+        * instead of doing two synchronous sector writes.
+        * For now, we must not write the transaction,
+        * if we cannot write out the bitmap of the evicted extent. */
        if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
                drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
 
-       mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
+       /* The bitmap write may have failed, causing a state change. */
+       if (mdev->state.disk < D_INCONSISTENT) {
+               dev_err(DEV,
+                       "disk is %s, cannot write al transaction (-%d +%d)\n",
+                       drbd_disk_str(mdev->state.disk), evicted, new_enr);
+               complete(&((struct update_al_work *)w)->event);
+               put_ldev(mdev);
+               return 1;
+       }
+
+       mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
        buffer = (struct al_transaction *)page_address(mdev->md_io_page);
 
        buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
@@ -739,7 +753,7 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
        unsigned int enr;
        unsigned long add = 0;
        char ppb[10];
-       int i;
+       int i, tmp;
 
        wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
 
@@ -747,7 +761,9 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
                enr = lc_element_by_index(mdev->act_log, i)->lc_number;
                if (enr == LC_FREE)
                        continue;
-               add += drbd_bm_ALe_set_all(mdev, enr);
+               tmp = drbd_bm_ALe_set_all(mdev, enr);
+               dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
+               add += tmp;
        }
 
        lc_unlock(mdev->act_log);
index 04a823b01da506a4e0c615c9bc7dc5252f103c32..1146faa7ae38df1a4a9e16cb7db179c77f1c1bed 100644 (file)
@@ -1995,10 +1995,11 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
                break;
        }
 
-       if (mdev->state.pdsk == D_DISKLESS) {
+       if (mdev->state.pdsk < D_INCONSISTENT) {
                /* In case we have the only disk of the cluster, */
                drbd_set_out_of_sync(mdev, e->sector, e->size);
                e->flags |= EE_CALL_AL_COMPLETE_IO;
+               e->flags &= ~EE_MAY_SET_IN_SYNC;
                drbd_al_begin_io(mdev, e->sector);
        }
 
index 9e91a2545fc869273d39ca9737904e56d5f203b9..d26b213dbf15acbfa9a6b9c037429ceca7f77de5 100644 (file)
@@ -942,12 +942,21 @@ allocate_barrier:
        if (local) {
                req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
 
-               if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
-                                    : rw == READ  ? DRBD_FAULT_DT_RD
-                                    :               DRBD_FAULT_DT_RA))
+               /* State may have changed since we grabbed our reference on the
+                * mdev->ldev member. Double check, and short-circuit to endio.
+                * In case the last activity log transaction failed to get on
+                * stable storage, and this is a WRITE, we may not even submit
+                * this bio. */
+               if (get_ldev(mdev)) {
+                       if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
+                                            : rw == READ  ? DRBD_FAULT_DT_RD
+                                            :               DRBD_FAULT_DT_RA))
+                               bio_endio(req->private_bio, -EIO);
+                       else
+                               generic_make_request(req->private_bio);
+                       put_ldev(mdev);
+               } else
                        bio_endio(req->private_bio, -EIO);
-               else
-                       generic_make_request(req->private_bio);
        }
 
        /* we need to plug ALWAYS since we possibly need to kick lo_dev.