/*
   drbd_worker.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>

#include "drbd_int.h"
#include "drbd_req.h"

static int w_make_ov_request(struct drbd_work *w, int cancel);
/* endio handlers:
 *   drbd_md_io_complete (defined here)
 *   drbd_request_endio (defined here)
 *   drbd_peer_request_endio (defined here)
 *   bm_async_io_complete (defined in drbd_bitmap.c)
 *
 * For all these callbacks, note the following:
 * The callbacks will be called in irq context by the IDE drivers,
 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
 * Try to get the locking right :)
 */
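
/* A sketch of what "getting the locking right" means here, assuming the
 * req_lock can be taken both from process context and from these
 * completion contexts: any lock shared with a completion handler must be
 * taken with the irq-safe variants,
 *
 *	unsigned long flags;
 *
 *	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
 *	...update counters, move list entries...
 *	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
 *
 * as the "sec_final" helpers below do; a plain spin_lock() taken in
 * process context could deadlock against the same lock taken from irq
 * context on the same CPU. */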
/* About the global_state_lock
   Each state transition on a device holds a read lock. In case we have
   to evaluate the sync-after dependencies, we grab a write lock, because
   we need stable states on all devices for that. */
rwlock_t global_state_lock;
/* used for synchronous meta data and bitmap IO
 * submitted by drbd_md_sync_page_io()
 */
void drbd_md_io_complete(struct bio *bio, int error)
{
	struct drbd_md_io *md_io;

	md_io = (struct drbd_md_io *)bio->bi_private;
	md_io->error = error;

	complete(&md_io->event);
}
/* reads on behalf of the partner,
 * "submitted" by the receiver
 */
void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_conf *mdev = peer_req->w.mdev;

	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
	mdev->read_cnt += peer_req->i.size >> 9;
	list_del(&peer_req->w.list);
	if (list_empty(&mdev->read_ee))
		wake_up(&mdev->ee_wait);
	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
		__drbd_chk_io_error(mdev, false);
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);

	drbd_queue_work(&mdev->tconn->data.work, &peer_req->w);
	put_ldev(mdev);
}
/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver, final stage.  */
static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_conf *mdev = peer_req->w.mdev;
	struct drbd_interval i;
	int do_wake;
	u64 block_id;
	int do_al_complete_io;

	/* after we moved peer_req to done_ee,
	 * we may no longer access it,
	 * it may be freed/reused already!
	 * (as soon as we release the req_lock) */
	i = peer_req->i;
	do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
	block_id = peer_req->block_id;

	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
	mdev->writ_cnt += peer_req->i.size >> 9;
	list_del(&peer_req->w.list); /* has been on active_ee or sync_ee */
	list_add_tail(&peer_req->w.list, &mdev->done_ee);

	/*
	 * Do not remove from the write_requests tree here: we did not send the
	 * Ack yet and did not wake possibly waiting conflicting requests.
	 * Removal from the tree happens in "drbd_process_done_ee" within the
	 * appropriate w.cb (e_end_block/e_end_resync_block) or in
	 * _drbd_clear_done_ee.
	 */

	do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee);

	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
		__drbd_chk_io_error(mdev, false);
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);

	if (block_id == ID_SYNCER)
		drbd_rs_complete_io(mdev, i.sector);

	if (do_wake)
		wake_up(&mdev->ee_wait);

	if (do_al_complete_io)
		drbd_al_complete_io(mdev, &i);

	wake_asender(mdev->tconn);
	put_ldev(mdev);
}
/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver.
 */
void drbd_peer_request_endio(struct bio *bio, int error)
{
	struct drbd_peer_request *peer_req = bio->bi_private;
	struct drbd_conf *mdev = peer_req->w.mdev;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);
	int is_write = bio_data_dir(bio) == WRITE;

	if (error && __ratelimit(&drbd_ratelimit_state))
		dev_warn(DEV, "%s: error=%d s=%llus\n",
				is_write ? "write" : "read", error,
				(unsigned long long)peer_req->i.sector);
	if (!error && !uptodate) {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
					is_write ? "write" : "read",
					(unsigned long long)peer_req->i.sector);
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	if (error)
		set_bit(__EE_WAS_ERROR, &peer_req->flags);

	bio_put(bio); /* no need for the bio anymore */
	if (atomic_dec_and_test(&peer_req->pending_bios)) {
		if (is_write)
			drbd_endio_write_sec_final(peer_req);
		else
			drbd_endio_read_sec_final(peer_req);
	}
}
/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
 */
void drbd_request_endio(struct bio *bio, int error)
{
	unsigned long flags;
	struct drbd_request *req = bio->bi_private;
	struct drbd_conf *mdev = req->w.mdev;
	struct bio_and_error m;
	enum drbd_req_event what;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);

	if (!error && !uptodate) {
		dev_warn(DEV, "p %s: setting error to -EIO\n",
			 bio_data_dir(bio) == WRITE ? "write" : "read");
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	/* to avoid recursion in __req_mod */
	if (unlikely(error)) {
		what = (bio_data_dir(bio) == WRITE)
			? WRITE_COMPLETED_WITH_ERROR
			: (bio_rw(bio) == READ)
			  ? READ_COMPLETED_WITH_ERROR
			  : READ_AHEAD_COMPLETED_WITH_ERROR;
	} else
		what = COMPLETED_OK;

	bio_put(req->private_bio);
	req->private_bio = ERR_PTR(error);

	/* not req_mod(), we need irqsave here! */
	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
	__req_mod(req, what, &m);
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);

	if (m.bio)
		complete_master_bio(mdev, &m);
}
int w_read_retry_remote(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_conf *mdev = w->mdev;

	/* We should not detach for read io-errors,
	 * but try to WRITE the P_DATA_REPLY to the failed location,
	 * to give the disk the chance to relocate that block */

	spin_lock_irq(&mdev->tconn->req_lock);
	if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
		_req_mod(req, READ_RETRY_REMOTE_CANCELED);
		spin_unlock_irq(&mdev->tconn->req_lock);
		return 0;
	}
	spin_unlock_irq(&mdev->tconn->req_lock);

	return w_send_read_req(w, 0);
}
void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm,
		  struct drbd_peer_request *peer_req, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct page *page = peer_req->pages;
	struct page *tmp;
	unsigned len;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	while ((tmp = page_chain_next(page))) {
		/* all but the last page will be fully used */
		sg_set_page(&sg, page, PAGE_SIZE, 0);
		crypto_hash_update(&desc, &sg, sg.length);
		page = tmp;
	}
	/* and now the last, possibly only partially used page */
	len = peer_req->i.size & (PAGE_SIZE - 1);
	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
	crypto_hash_update(&desc, &sg, sg.length);
	crypto_hash_final(&desc, digest);
}
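
/* Worked example, assuming 4 KiB pages: for peer_req->i.size == 9216
 * (two full pages plus 1024 bytes), the loop above hashes two pages of
 * PAGE_SIZE each, and len = 9216 & (PAGE_SIZE - 1) = 1024 covers the
 * tail.  If i.size is an exact multiple of PAGE_SIZE, len is 0 and the
 * "len ?: PAGE_SIZE" fallback hashes the full last page instead. */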
void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct bio_vec *bvec;
	int i;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	__bio_for_each_segment(bvec, bio, i, 0) {
		sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
		crypto_hash_update(&desc, &sg, sg.length);
	}
	crypto_hash_final(&desc, digest);
}
/* MAYBE merge common code with w_e_end_ov_req */
static int w_e_send_csum(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
	int digest_size;
	void *digest;
	int err = 0;

	if (unlikely(cancel))
		goto out;

	if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
		goto out;

	digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
	digest = kmalloc(digest_size, GFP_NOIO);
	if (digest) {
		sector_t sector = peer_req->i.sector;
		unsigned int size = peer_req->i.size;
		drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
		/* Free peer_req and pages before send.
		 * In case we block on congestion, we could otherwise run into
		 * some distributed deadlock, if the other side blocks on
		 * congestion as well, because our receiver blocks in
		 * drbd_alloc_pages due to pp_in_use > max_buffers. */
		drbd_free_peer_req(mdev, peer_req);
		peer_req = NULL;
		inc_rs_pending(mdev);
		err = drbd_send_drequest_csum(mdev, sector, size,
					      digest, digest_size,
					      P_CSUM_RS_REQUEST);
		kfree(digest);
	} else {
		dev_err(DEV, "kmalloc() of digest failed.\n");
		err = -ENOMEM;
	}

out:
	if (peer_req)
		drbd_free_peer_req(mdev, peer_req);

	if (err)
		dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
	return err;
}
#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
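
/* GFP_TRY omits __GFP_WAIT: such an allocation fails fast rather than
 * sleeping in reclaim, and __GFP_NOWARN suppresses the usual allocation
 * failure warning.  read_for_csum() below treats a failed allocation as
 * "defer and retry later". */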
static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
{
	struct drbd_peer_request *peer_req;

	if (!get_ldev(mdev))
		return -EIO;

	if (drbd_rs_should_slow_down(mdev, sector))
		goto defer;

	/* GFP_TRY, because if there is no memory available right now, this may
	 * be rescheduled for later. It is "only" background resync, after all. */
	peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER /* unused */, sector,
				       size, GFP_TRY);
	if (!peer_req)
		goto defer;

	peer_req->w.cb = w_e_send_csum;
	spin_lock_irq(&mdev->tconn->req_lock);
	list_add(&peer_req->w.list, &mdev->read_ee);
	spin_unlock_irq(&mdev->tconn->req_lock);

	atomic_add(size >> 9, &mdev->rs_sect_ev);
	if (drbd_submit_peer_request(mdev, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
		return 0;

	/* If it failed because of ENOMEM, retry should help.  If it failed
	 * because bio_add_page failed (probably broken lower level driver),
	 * retry may or may not help.
	 * If it does not, you may need to force disconnect. */
	spin_lock_irq(&mdev->tconn->req_lock);
	list_del(&peer_req->w.list);
	spin_unlock_irq(&mdev->tconn->req_lock);

	drbd_free_peer_req(mdev, peer_req);
defer:
	put_ldev(mdev);
	return -EAGAIN;
}
int w_resync_timer(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	switch (mdev->state.conn) {
	case C_VERIFY_S:
		w_make_ov_request(w, cancel);
		break;
	case C_SYNC_TARGET:
		w_make_resync_request(w, cancel);
		break;
	}

	return 0;
}
void resync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	if (list_empty(&mdev->resync_work.list))
		drbd_queue_work(&mdev->tconn->data.work, &mdev->resync_work);
}
static void fifo_set(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] = value;
}

static int fifo_push(struct fifo_buffer *fb, int value)
{
	int ov;

	ov = fb->values[fb->head_index];
	fb->values[fb->head_index++] = value;

	if (fb->head_index >= fb->size)
		fb->head_index = 0;

	return ov;
}

static void fifo_add_val(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] += value;
}
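
/* A minimal sketch of how the plan FIFO cycles, assuming fb->size == 3
 * and all values initially 0:
 *
 *	fifo_push(fb, 5);	// returns 0; values {5,0,0}, head_index 1
 *	fifo_push(fb, 7);	// returns 0; values {5,7,0}, head_index 2
 *	fifo_push(fb, 0);	// returns 0; values {5,7,0}, head_index wraps to 0
 *	fifo_push(fb, 0);	// returns 5: the value planned 3 steps ago
 *
 * drbd_rs_controller() below only ever pushes 0 and spreads corrections
 * over all slots with fifo_add_val(), so each fifo_push() returns the
 * share of past corrections that falls due in the current step. */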
static int drbd_rs_controller(struct drbd_conf *mdev)
{
	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
	unsigned int want;     /* The number of sectors we want in the proxy */
	int req_sect;          /* Number of sectors to request in this turn */
	int correction;        /* Number of sectors more we need in the proxy */
	int cps;               /* correction per invocation of drbd_rs_controller() */
	int steps;             /* Number of time steps to plan ahead */
	int curr_corr;
	int max_sect;

	sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
	mdev->rs_in_flight -= sect_in;

	spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */

	steps = mdev->rs_plan_s.size; /* (mdev->ldev->dc.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */

	if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
		want = ((mdev->ldev->dc.resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
	} else { /* normal path */
		want = mdev->ldev->dc.c_fill_target ? mdev->ldev->dc.c_fill_target :
			sect_in * mdev->ldev->dc.c_delay_target * HZ / (SLEEP_TIME * 10);
	}

	correction = want - mdev->rs_in_flight - mdev->rs_planed;

	/* Plan ahead */
	cps = correction / steps;
	fifo_add_val(&mdev->rs_plan_s, cps);
	mdev->rs_planed += cps * steps;

	/* What we do in this step */
	curr_corr = fifo_push(&mdev->rs_plan_s, 0);
	spin_unlock(&mdev->peer_seq_lock);
	mdev->rs_planed -= curr_corr;

	req_sect = sect_in + curr_corr;
	if (req_sect < 0)
		req_sect = 0;

	max_sect = (mdev->ldev->dc.c_max_rate * 2 * SLEEP_TIME) / HZ;
	if (req_sect > max_sect)
		req_sect = max_sect;

	/*
	dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
		 sect_in, mdev->rs_in_flight, want, correction,
		 steps, cps, mdev->rs_planed, curr_corr, req_sect);
	*/

	return req_sect;
}
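
/* Worked example of one controller step, under assumed settings:
 * c_fill_target = 1000 sectors, steps = 10, sect_in = 400, and nothing
 * in flight or planned yet.  Then
 *
 *	want       = 1000
 *	correction = 1000 - 0 - 0 = 1000 sectors
 *	cps        = 1000 / 10   = 100 sectors added to each plan slot
 *	curr_corr  = whatever the plan FIFO yields for this step
 *	req_sect   = sect_in + curr_corr, clamped to max_sect
 *
 * i.e. the controller tries to keep c_fill_target sectors "in the pipe",
 * spreading each correction over c_plan_ahead time steps and never
 * exceeding c_max_rate. */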
static int drbd_rs_number_requests(struct drbd_conf *mdev)
{
	int number;

	if (mdev->rs_plan_s.size) { /* mdev->ldev->dc.c_plan_ahead */
		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
		mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
	} else {
		mdev->c_sync_rate = mdev->ldev->dc.resync_rate;
		number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
	}

	/* ignore the amount of pending requests, the resync controller should
	 * throttle down to the incoming reply rate soon enough anyway. */
	return number;
}
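
/* Worked example for the fixed-rate branch, assuming SLEEP_TIME is
 * HZ/10 and BM_BLOCK_SIZE is 4 KiB: with resync_rate = 400 (KiB/s),
 *
 *	number = (HZ/10) * 400 / (4 * HZ) = 10 requests per timer tick,
 *
 * and 10 requests * 4 KiB * 10 ticks/s = 400 KiB/s, matching the
 * configured rate. */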
int w_make_resync_request(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
	int max_bio_size;
	int number, rollback_i, size;
	int align, queued, sndbuf;
	int i = 0;

	if (unlikely(cancel))
		return 0;

	if (mdev->rs_total == 0) {
		/* empty resync? */
		drbd_resync_finished(mdev);
		return 0;
	}

	if (!get_ldev(mdev)) {
		/* Since we only need to access mdev->rsync, a
		   get_ldev_if_state(mdev, D_FAILED) would be sufficient, but
		   continuing resync with a broken disk makes no sense at
		   all */
		dev_err(DEV, "Disk broke down during resync!\n");
		return 0;
	}

	max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
	number = drbd_rs_number_requests(mdev);
	if (number == 0)
		goto requeue;

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests when half of the send buffer is filled */
		mutex_lock(&mdev->tconn->data.mutex);
		if (mdev->tconn->data.socket) {
			queued = mdev->tconn->data.socket->sk->sk_wmem_queued;
			sndbuf = mdev->tconn->data.socket->sk->sk_sndbuf;
		} else {
			queued = 1;
			sndbuf = 0;
		}
		mutex_unlock(&mdev->tconn->data.mutex);
		if (queued > sndbuf / 2)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);

		if (bit == DRBD_END_OF_BITMAP) {
			mdev->bm_resync_fo = drbd_bm_bits(mdev);
			put_ldev(mdev);
			return 0;
		}

		sector = BM_BIT_TO_SECT(bit);

		if (drbd_rs_should_slow_down(mdev, sector) ||
		    drbd_try_rs_begin_io(mdev, sector)) {
			mdev->bm_resync_fo = bit;
			goto requeue;
		}
		mdev->bm_resync_fo = bit + 1;

		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
			drbd_rs_complete_io(mdev, sector);
			goto next_sector;
		}

#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 */
		align = 1;
		rollback_i = i;
		for (;;) {
			if (size + BM_BLOCK_SIZE > max_bio_size)
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(mdev, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			mdev->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;
		if (mdev->tconn->agreed_pro_version >= 89 && mdev->tconn->csums_tfm) {
			switch (read_for_csum(mdev, sector, size)) {
			case -EIO: /* Disk failure */
				put_ldev(mdev);
				return -EIO;
			case -EAGAIN: /* allocation failed, or ldev busy */
				drbd_rs_complete_io(mdev, sector);
				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
				i = rollback_i;
				goto requeue;
			case 0:
				/* everything ok */
				break;
			default:
				BUG();
			}
		} else {
			int err;

			inc_rs_pending(mdev);
			err = drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
						 sector, size, ID_SYNCER);
			if (err) {
				dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(mdev);
				put_ldev(mdev);
				return err;
			}
		}
	}

	if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		put_ldev(mdev);
		return 0;
	}

requeue:
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(mdev);
	return 0;
}
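
/* The merge loop above only grows a request while it stays naturally
 * aligned.  Sketch, assuming BM_BLOCK_SIZE = 4 KiB (8 sectors): with
 * align = 1, "sector & ((1 << (align+3)) - 1)" requires 16-sector (8 KiB)
 * alignment before any merging happens; once size has grown to
 * BM_BLOCK_SIZE << align, align is bumped, so ever larger requests are
 * only assembled at correspondingly larger aligned offsets.  That keeps
 * big resync requests aligned for typical software RAID stripe sizes. */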
static int w_make_ov_request(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	int number, i, size;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);

	if (unlikely(cancel))
		return 1;

	number = drbd_rs_number_requests(mdev);

	sector = mdev->ov_position;
	for (i = 0; i < number; i++) {
		if (sector >= capacity)
			return 1;

		size = BM_BLOCK_SIZE;

		if (drbd_rs_should_slow_down(mdev, sector) ||
		    drbd_try_rs_begin_io(mdev, sector)) {
			mdev->ov_position = sector;
			goto requeue;
		}

		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		inc_rs_pending(mdev);
		if (drbd_send_ov_request(mdev, sector, size)) {
			dec_rs_pending(mdev);
			return 0;
		}
		sector += BM_SECT_PER_BIT;
	}
	mdev->ov_position = sector;

requeue:
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	return 1;
}
int w_ov_finished(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	kfree(w);
	ov_out_of_sync_print(mdev);
	drbd_resync_finished(mdev);

	return 0;
}

static int w_resync_finished(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	kfree(w);

	drbd_resync_finished(mdev);

	return 0;
}
static void ping_peer(struct drbd_conf *mdev)
{
	struct drbd_tconn *tconn = mdev->tconn;

	clear_bit(GOT_PING_ACK, &tconn->flags);
	request_ping(tconn);
	wait_event(tconn->ping_wait,
		   test_bit(GOT_PING_ACK, &tconn->flags) || mdev->state.conn < C_CONNECTED);
}
int drbd_resync_finished(struct drbd_conf *mdev)
{
	unsigned long db, dt, dbdt;
	unsigned long n_oos;
	union drbd_state os, ns;
	struct drbd_work *w;
	char *khelper_cmd = NULL;
	int verify_done = 0;

	/* Remove all elements from the resync LRU. Since future actions
	 * might set bits in the (main) bitmap, then the entries in the
	 * resync LRU would be wrong. */
	if (drbd_rs_del_all(mdev)) {
		/* In case this is not possible now, most probably because
		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
		 * queue (or even the read operations for those packets
		 * are not finished by now). Retry in 100ms. */

		schedule_timeout_interruptible(HZ / 10);
		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
		if (w) {
			w->cb = w_resync_finished;
			drbd_queue_work(&mdev->tconn->data.work, w);
			return 1;
		}
		dev_err(DEV, "Both drbd_rs_del_all() and kmalloc(w) failed.\n");
	}

	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
	if (dt <= 0)
		dt = 1;
	db = mdev->rs_total;
	dbdt = Bit2KB(db/dt);
	mdev->rs_paused /= HZ;

	if (!get_ldev(mdev))
		goto out;

	ping_peer(mdev);

	spin_lock_irq(&mdev->tconn->req_lock);
	os = drbd_read_state(mdev);

	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);

	/* This protects us against multiple calls (that can happen in the presence
	   of application IO), and against connectivity loss just before we arrive here. */
	if (os.conn <= C_CONNECTED)
		goto out_unlock;

	ns = os;
	ns.conn = C_CONNECTED;

	dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
	     verify_done ? "Online verify " : "Resync",
	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);

	n_oos = drbd_bm_total_weight(mdev);

	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
		if (n_oos) {
			dev_alert(DEV, "Online verify found %lu %dk blocks out of sync!\n",
			      n_oos, Bit2KB(1));
			khelper_cmd = "out-of-sync";
		}
	} else {
		D_ASSERT((n_oos - mdev->rs_failed) == 0);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
			khelper_cmd = "after-resync-target";

		if (mdev->tconn->csums_tfm && mdev->rs_total) {
			const unsigned long s = mdev->rs_same_csum;
			const unsigned long t = mdev->rs_total;
			const int ratio =
				(t == 0)     ? 0 :
				(t < 100000) ? ((s*100)/t) : (s/(t/100));
			dev_info(DEV, "%u %% had equal checksums, eliminated: %luK; "
			     "transferred %luK total %luK\n",
			     ratio,
			     Bit2KB(mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total - mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total));
		}
	}

	if (mdev->rs_failed) {
		dev_info(DEV, "            %lu failed blocks\n", mdev->rs_failed);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			ns.disk = D_INCONSISTENT;
			ns.pdsk = D_UP_TO_DATE;
		} else {
			ns.disk = D_UP_TO_DATE;
			ns.pdsk = D_INCONSISTENT;
		}
	} else {
		ns.disk = D_UP_TO_DATE;
		ns.pdsk = D_UP_TO_DATE;

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			if (mdev->p_uuid) {
				int i;
				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
					_drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
				drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
				_drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
			} else {
				dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
			}
		}

		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
			/* for verify runs, we don't update uuids here,
			 * so there would be nothing to report. */
			drbd_uuid_set_bm(mdev, 0UL);
			drbd_print_uuids(mdev, "updated UUIDs");
			if (mdev->p_uuid) {
				/* Now the two UUID sets are equal, update what we
				 * know of the peer. */
				int i;
				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
					mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
			}
		}
	}

	_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
out_unlock:
	spin_unlock_irq(&mdev->tconn->req_lock);
	put_ldev(mdev);
out:
	mdev->rs_total  = 0;
	mdev->rs_failed = 0;
	mdev->rs_paused = 0;
	if (verify_done)
		mdev->ov_start_sector = 0;

	drbd_md_sync(mdev);

	if (khelper_cmd)
		drbd_khelper(mdev, khelper_cmd);

	return 1;
}
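
/* Worked example for the rate computation above, assuming Bit2KB()
 * scales by 4 KiB per bitmap bit: a resync of db = 262144 bits (1 GiB)
 * that took dt = 64 seconds of unpaused wall clock time reports
 * dbdt = Bit2KB(262144 / 64) = 16384 K/sec in the log line. */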
static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_peer_request *peer_req)
{
	if (drbd_peer_req_has_active_page(peer_req)) {
		/* This might happen if sendpage() has not finished */
		int i = (peer_req->i.size + PAGE_SIZE - 1) >> PAGE_SHIFT;
		atomic_add(i, &mdev->pp_in_use_by_net);
		atomic_sub(i, &mdev->pp_in_use);
		spin_lock_irq(&mdev->tconn->req_lock);
		list_add_tail(&peer_req->w.list, &mdev->net_ee);
		spin_unlock_irq(&mdev->tconn->req_lock);
		wake_up(&drbd_pp_wait);
	} else
		drbd_free_peer_req(mdev, peer_req);
}
/**
 * w_e_end_data_req() - Worker callback to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
 * @w:		work object.
 * @cancel:	The connection will be closed anyway.
 */
int w_e_end_data_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
	int err;

	if (unlikely(cancel)) {
		drbd_free_peer_req(mdev, peer_req);
		dec_unacked(mdev);
		return 0;
	}

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		err = drbd_send_block(mdev, P_DATA_REPLY, peer_req);
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
			    (unsigned long long)peer_req->i.sector);

		err = drbd_send_ack(mdev, P_NEG_DREPLY, peer_req);
	}

	dec_unacked(mdev);

	move_to_net_ee_or_free(mdev, peer_req);

	if (err)
		dev_err(DEV, "drbd_send_block() failed\n");
	return err;
}
/**
 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
 * @w:		work object.
 * @cancel:	The connection will be closed anyway.
 */
int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
	int err;

	if (unlikely(cancel)) {
		drbd_free_peer_req(mdev, peer_req);
		dec_unacked(mdev);
		return 0;
	}

	if (get_ldev_if_state(mdev, D_FAILED)) {
		drbd_rs_complete_io(mdev, peer_req->i.sector);
		put_ldev(mdev);
	}

	if (mdev->state.conn == C_AHEAD) {
		err = drbd_send_ack(mdev, P_RS_CANCEL, peer_req);
	} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
			inc_rs_pending(mdev);
			err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
		} else {
			if (__ratelimit(&drbd_ratelimit_state))
				dev_err(DEV, "Not sending RSDataReply, "
				    "partner DISKLESS!\n");
			err = 0;
		}
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
			    (unsigned long long)peer_req->i.sector);

		err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);

		/* update resync data with failure */
		drbd_rs_failed_io(mdev, peer_req->i.sector, peer_req->i.size);
	}

	dec_unacked(mdev);

	move_to_net_ee_or_free(mdev, peer_req);

	if (err)
		dev_err(DEV, "drbd_send_block() failed\n");
	return err;
}
int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
	struct digest_info *di;
	int digest_size;
	void *digest = NULL;
	int err, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_peer_req(mdev, peer_req);
		dec_unacked(mdev);
		return 0;
	}

	if (get_ldev(mdev)) {
		drbd_rs_complete_io(mdev, peer_req->i.sector);
		put_ldev(mdev);
	}

	di = peer_req->digest;

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		/* quick hack to try to avoid a race against reconfiguration.
		 * a real fix would be much more involved,
		 * introducing more locking mechanisms */
		if (mdev->tconn->csums_tfm) {
			digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
			D_ASSERT(digest_size == di->digest_size);
			digest = kmalloc(digest_size, GFP_NOIO);
		}
		if (digest) {
			drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}

		if (eq) {
			drbd_set_in_sync(mdev, peer_req->i.sector, peer_req->i.size);
			/* rs_same_csums unit is BM_BLOCK_SIZE */
			mdev->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
			err = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, peer_req);
		} else {
			inc_rs_pending(mdev);
			peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
			peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
			kfree(di);
			err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
		}
	} else {
		err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
	}

	dec_unacked(mdev);
	move_to_net_ee_or_free(mdev, peer_req);

	if (err)
		dev_err(DEV, "drbd_send_block/ack() failed\n");
	return err;
}
int w_e_end_ov_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
	sector_t sector = peer_req->i.sector;
	unsigned int size = peer_req->i.size;
	int digest_size;
	void *digest;
	int err = 0;

	if (unlikely(cancel))
		goto out;

	digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
	digest = kmalloc(digest_size, GFP_NOIO);
	if (!digest) {
		err = 1;	/* terminate the connection in case the allocation failed */
		goto out;
	}

	if (likely(!(peer_req->flags & EE_WAS_ERROR)))
		drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);
	else
		memset(digest, 0, digest_size);

	/* Free peer_req and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
	drbd_free_peer_req(mdev, peer_req);
	peer_req = NULL;
	inc_rs_pending(mdev);
	err = drbd_send_drequest_csum(mdev, sector, size, digest, digest_size, P_OV_REPLY);
	if (err)
		dec_rs_pending(mdev);
	kfree(digest);

out:
	if (peer_req)
		drbd_free_peer_req(mdev, peer_req);
	dec_unacked(mdev);
	return err;
}
void drbd_ov_out_of_sync_found(struct drbd_conf *mdev, sector_t sector, int size)
{
	if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
		mdev->ov_last_oos_size += size>>9;
	} else {
		mdev->ov_last_oos_start = sector;
		mdev->ov_last_oos_size = size>>9;
	}
	drbd_set_out_of_sync(mdev, sector, size);
}
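
/* Example of the run merging above, assuming 4 KiB blocks (8 sectors):
 * out-of-sync reports at sectors 1000, 1008 and 1016 collapse into a
 * single run with ov_last_oos_start = 1000 and ov_last_oos_size = 24,
 * because each report starts exactly at start + size of the previous
 * run; a report at sector 2000 would then begin a fresh run. */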
int w_e_end_ov_reply(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
	struct digest_info *di;
	void *digest;
	sector_t sector = peer_req->i.sector;
	unsigned int size = peer_req->i.size;
	int digest_size;
	int err, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_peer_req(mdev, peer_req);
		dec_unacked(mdev);
		return 0;
	}

	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
	 * the resync lru has been cleaned up already */
	if (get_ldev(mdev)) {
		drbd_rs_complete_io(mdev, peer_req->i.sector);
		put_ldev(mdev);
	}

	di = peer_req->digest;

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
			drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);

			D_ASSERT(digest_size == di->digest_size);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}
	}

	/* Free peer_req and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
	drbd_free_peer_req(mdev, peer_req);
	if (!eq)
		drbd_ov_out_of_sync_found(mdev, sector, size);
	else
		ov_out_of_sync_print(mdev);

	err = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
			       eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);

	dec_unacked(mdev);

	--mdev->ov_left;

	/* let's advance progress step marks only for every other megabyte */
	if ((mdev->ov_left & 0x200) == 0x200)
		drbd_advance_rs_marks(mdev, mdev->ov_left);

	if (mdev->ov_left == 0) {
		ov_out_of_sync_print(mdev);
		drbd_resync_finished(mdev);
	}

	return err;
}
int w_prev_work_done(struct drbd_work *w, int cancel)
{
	struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);

	complete(&b->done);
	return 0;
}
int w_send_barrier(struct drbd_work *w, int cancel)
{
	struct drbd_socket *sock;
	struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
	struct drbd_conf *mdev = w->mdev;
	struct p_barrier *p;

	/* really avoid racing with tl_clear.  w.cb may have been referenced
	 * just before it was reassigned and re-queued, so double check that.
	 * actually, this race was harmless, since we only try to send the
	 * barrier packet here, and otherwise do nothing with the object.
	 * but compare with the head of w_clear_epoch */
	spin_lock_irq(&mdev->tconn->req_lock);
	if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
		cancel = 1;
	spin_unlock_irq(&mdev->tconn->req_lock);
	if (cancel)
		return 0;

	sock = &mdev->tconn->data;
	p = drbd_prepare_command(mdev, sock);
	if (!p)
		return -EIO;
	p->barrier = b->br_number;
	/* inc_ap_pending was done where this was queued.
	 * dec_ap_pending will be done in got_BarrierAck
	 * or (on connection loss) in w_clear_epoch. */
	return drbd_send_command(mdev, sock, P_BARRIER, sizeof(*p), NULL, 0);
}
int w_send_write_hint(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	struct drbd_socket *sock;

	if (cancel)
		return 0;
	sock = &mdev->tconn->data;
	if (!drbd_prepare_command(mdev, sock))
		return -EIO;
	return drbd_send_command(mdev, sock, P_UNPLUG_REMOTE, 0, NULL, 0);
}
int w_send_out_of_sync(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_conf *mdev = w->mdev;
	int err;

	if (unlikely(cancel)) {
		req_mod(req, SEND_CANCELED);
		return 0;
	}

	err = drbd_send_out_of_sync(mdev, req);
	req_mod(req, OOS_HANDED_TO_NETWORK);

	return err;
}
/**
 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
 * @w:		work object.
 * @cancel:	The connection will be closed anyway.
 */
int w_send_dblock(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_conf *mdev = w->mdev;
	int err;

	if (unlikely(cancel)) {
		req_mod(req, SEND_CANCELED);
		return 0;
	}

	err = drbd_send_dblock(mdev, req);
	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);

	return err;
}
/**
 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
 * @w:		work object.
 * @cancel:	The connection will be closed anyway.
 */
int w_send_read_req(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_conf *mdev = w->mdev;
	int err;

	if (unlikely(cancel)) {
		req_mod(req, SEND_CANCELED);
		return 0;
	}

	err = drbd_send_drequest(mdev, P_DATA_REQUEST, req->i.sector, req->i.size,
				 (unsigned long)req);

	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);

	return err;
}
int w_restart_disk_io(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_conf *mdev = w->mdev;

	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
		drbd_al_begin_io(mdev, &req->i);
	/* Calling drbd_al_begin_io() out of the worker might deadlock
	   theoretically. Practically it cannot deadlock, since this is
	   only used when unfreezing IO; all the extents of the requests
	   that made it into the TL are already active. */

	drbd_req_make_private_bio(req, req->master_bio);
	req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
	generic_make_request(req->private_bio);

	return 0;
}
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
	struct drbd_conf *odev = mdev;

	while (1) {
		if (odev->ldev->dc.resync_after == -1)
			return 1;
		odev = minor_to_mdev(odev->ldev->dc.resync_after);
		if (!expect(odev))
			return 1;
		if ((odev->state.conn >= C_SYNC_SOURCE &&
		     odev->state.conn <= C_PAUSED_SYNC_T) ||
		    odev->state.aftr_isp || odev->state.peer_isp ||
		    odev->state.user_isp)
			return 0;
	}
}
/**
 * _drbd_pause_after() - Pause resync on all devices that may not resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and after_state_ch).
 */
static int _drbd_pause_after(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

	idr_for_each_entry(&minors, odev, i) {
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (!_drbd_may_sync_now(odev))
			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
			       != SS_NOTHING_TO_DO);
	}

	return rv;
}
/**
 * _drbd_resume_next() - Resume resync on all devices that may resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and worker).
 */
static int _drbd_resume_next(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

	idr_for_each_entry(&minors, odev, i) {
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (odev->state.aftr_isp) {
			if (_drbd_may_sync_now(odev))
				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
							CS_HARD, NULL)
				       != SS_NOTHING_TO_DO);
		}
	}
	return rv;
}
void resume_next_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_resume_next(mdev);
	write_unlock_irq(&global_state_lock);
}

void suspend_other_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_pause_after(mdev);
	write_unlock_irq(&global_state_lock);
}
static int sync_after_error(struct drbd_conf *mdev, int o_minor)
{
	struct drbd_conf *odev;

	if (o_minor == -1)
		return NO_ERROR;
	if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
		return ERR_SYNC_AFTER;

	/* check for loops */
	odev = minor_to_mdev(o_minor);
	while (1) {
		if (odev == mdev)
			return ERR_SYNC_AFTER_CYCLE;

		/* dependency chain ends here, no cycles. */
		if (odev->ldev->dc.resync_after == -1)
			return NO_ERROR;

		/* follow the dependency chain */
		odev = minor_to_mdev(odev->ldev->dc.resync_after);
	}
}
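
/* Example, assuming three minors chained via resync-after: minor 2
 * syncs after minor 1, minor 1 after minor 0, minor 0 after nobody
 * (resync_after == -1).  Walking up from minor 2 terminates at minor 0
 * with NO_ERROR.  Requesting resync-after = 2 for minor 0 would make
 * the walk revisit the starting device and is rejected with
 * ERR_SYNC_AFTER_CYCLE. */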
int drbd_alter_sa(struct drbd_conf *mdev, int na)
{
	int changes;
	int retcode;

	write_lock_irq(&global_state_lock);
	retcode = sync_after_error(mdev, na);
	if (retcode == NO_ERROR) {
		mdev->ldev->dc.resync_after = na;
		do {
			changes  = _drbd_pause_after(mdev);
			changes |= _drbd_resume_next(mdev);
		} while (changes);
	}
	write_unlock_irq(&global_state_lock);
	return retcode;
}
void drbd_rs_controller_reset(struct drbd_conf *mdev)
{
	atomic_set(&mdev->rs_sect_in, 0);
	atomic_set(&mdev->rs_sect_ev, 0);
	mdev->rs_in_flight = 0;
	mdev->rs_planed = 0;
	spin_lock(&mdev->peer_seq_lock);
	fifo_set(&mdev->rs_plan_s, 0);
	spin_unlock(&mdev->peer_seq_lock);
}
void start_resync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	drbd_queue_work(&mdev->tconn->data.work, &mdev->start_resync_work);
}
int w_start_resync(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;

	if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
		dev_warn(DEV, "w_start_resync later...\n");
		mdev->start_resync_timer.expires = jiffies + HZ/10;
		add_timer(&mdev->start_resync_timer);
		return 0;
	}

	drbd_start_resync(mdev, C_SYNC_SOURCE);
	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags);
	return 0;
}
/**
 * drbd_start_resync() - Start the resync process
 * @mdev:	DRBD device.
 * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
 *
 * This function might bring you directly into one of the
 * C_PAUSED_SYNC_* states.
 */
void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
{
	union drbd_state ns;
	int r;

	if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn < C_AHEAD) {
		dev_err(DEV, "Resync already running!\n");
		return;
	}

	if (mdev->state.conn < C_AHEAD) {
		/* In case a previous resync run was aborted by an IO error/detach on the peer. */
		drbd_rs_cancel_all(mdev);
		/* This should be done when we abort the resync. We definitely do not
		   want to have this for connections going back and forth between
		   Ahead/Behind and SyncSource/SyncTarget */
	}

	if (!test_bit(B_RS_H_DONE, &mdev->flags)) {
		if (side == C_SYNC_TARGET) {
			/* Since application IO was locked out during C_WF_BITMAP_T and
			   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
			   we check whether we may make the data inconsistent. */
			r = drbd_khelper(mdev, "before-resync-target");
			r = (r >> 8) & 0xff;
			if (r > 0) {
				dev_info(DEV, "before-resync-target handler returned %d, "
					 "dropping connection.\n", r);
				conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
				return;
			}
		} else /* C_SYNC_SOURCE */ {
			r = drbd_khelper(mdev, "before-resync-source");
			r = (r >> 8) & 0xff;
			if (r > 0) {
				if (r == 3) {
					dev_info(DEV, "before-resync-source handler returned %d, "
						 "ignoring. Old userland tools?", r);
				} else {
					dev_info(DEV, "before-resync-source handler returned %d, "
						 "dropping connection.\n", r);
					conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
					return;
				}
			}
		}
	}

	if (current == mdev->tconn->worker.task) {
		/* The worker should not sleep waiting for state_mutex,
		   that can take long */
		if (!mutex_trylock(mdev->state_mutex)) {
			set_bit(B_RS_H_DONE, &mdev->flags);
			mdev->start_resync_timer.expires = jiffies + HZ/5;
			add_timer(&mdev->start_resync_timer);
			return;
		}
	} else {
		mutex_lock(mdev->state_mutex);
	}
	clear_bit(B_RS_H_DONE, &mdev->flags);

	if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
		mutex_unlock(mdev->state_mutex);
		return;
	}

	write_lock_irq(&global_state_lock);
	ns = drbd_read_state(mdev);

	ns.aftr_isp = !_drbd_may_sync_now(mdev);

	ns.conn = side;

	if (side == C_SYNC_TARGET)
		ns.disk = D_INCONSISTENT;
	else /* side == C_SYNC_SOURCE */
		ns.pdsk = D_INCONSISTENT;

	r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
	ns = drbd_read_state(mdev);

	if (ns.conn < C_CONNECTED)
		r = SS_UNKNOWN_ERROR;

	if (r == SS_SUCCESS) {
		unsigned long tw = drbd_bm_total_weight(mdev);
		unsigned long now = jiffies;
		int i;

		mdev->rs_failed    = 0;
		mdev->rs_paused    = 0;
		mdev->rs_same_csum = 0;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->rs_total     = tw;
		mdev->rs_start     = now;
		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = tw;
			mdev->rs_mark_time[i] = now;
		}
		_drbd_pause_after(mdev);
	}
	write_unlock_irq(&global_state_lock);

	if (r == SS_SUCCESS) {
		dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
		     drbd_conn_str(ns.conn),
		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
		     (unsigned long) mdev->rs_total);
		if (side == C_SYNC_TARGET)
			mdev->bm_resync_fo = 0;

		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
		 * with w_send_oos, or the sync target will get confused as to
		 * how many bits to resync.  We cannot do that always, because for an
		 * empty resync and protocol < 95, we need to do it here, as we call
		 * drbd_resync_finished from here in that case.
		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
		 * and from after_state_ch otherwise. */
		if (side == C_SYNC_SOURCE && mdev->tconn->agreed_pro_version < 96)
			drbd_gen_and_send_sync_uuid(mdev);

		if (mdev->tconn->agreed_pro_version < 95 && mdev->rs_total == 0) {
			/* This still has a race (about when exactly the peers
			 * detect connection loss) that can lead to a full sync
			 * on next handshake. In 8.3.9 we fixed this with explicit
			 * resync-finished notifications, but the fix
			 * introduces a protocol change.  Sleeping for some
			 * time longer than the ping interval + timeout on the
			 * SyncSource, to give the SyncTarget the chance to
			 * detect connection loss, then waiting for a ping
			 * response (implicit in drbd_resync_finished) reduces
			 * the race considerably, but does not solve it. */
			if (side == C_SYNC_SOURCE) {
				struct net_conf *nc;
				int timeo;

				rcu_read_lock();
				nc = rcu_dereference(mdev->tconn->net_conf);
				timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
				rcu_read_unlock();
				schedule_timeout_interruptible(timeo);
			}
			drbd_resync_finished(mdev);
		}

		drbd_rs_controller_reset(mdev);
		/* ns.conn may already be != mdev->state.conn,
		 * we may have been paused in between, or become paused until
		 * the timer triggers.
		 * No matter, that is handled in resync_timer_fn() */
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);

		drbd_md_sync(mdev);
	}
	put_ldev(mdev);
	mutex_unlock(mdev->state_mutex);
}
int drbd_worker(struct drbd_thread *thi)
{
	struct drbd_tconn *tconn = thi->tconn;
	struct drbd_work *w = NULL;
	struct drbd_conf *mdev;
	struct net_conf *nc;
	LIST_HEAD(work_list);
	int vnr, intr = 0;
	int cork;

	while (get_t_state(thi) == RUNNING) {
		drbd_thread_current_set_cpu(thi);

		if (down_trylock(&tconn->data.work.s)) {
			mutex_lock(&tconn->data.mutex);

			rcu_read_lock();
			nc = rcu_dereference(tconn->net_conf);
			cork = nc ? !nc->no_cork : 0;
			rcu_read_unlock();

			if (tconn->data.socket && cork)
				drbd_tcp_uncork(tconn->data.socket);
			mutex_unlock(&tconn->data.mutex);

			intr = down_interruptible(&tconn->data.work.s);

			mutex_lock(&tconn->data.mutex);
			if (tconn->data.socket && cork)
				drbd_tcp_cork(tconn->data.socket);
			mutex_unlock(&tconn->data.mutex);
		}

		if (intr) {
			flush_signals(current);
			if (get_t_state(thi) == RUNNING) {
				conn_warn(tconn, "Worker got an unexpected signal\n");
				continue;
			}
			break;
		}

		if (get_t_state(thi) != RUNNING)
			break;
		/* With this break, we have done a down() but not consumed
		   the entry from the list. The cleanup code takes care of
		   this...   */

		w = NULL;
		spin_lock_irq(&tconn->data.work.q_lock);
		if (list_empty(&tconn->data.work.q)) {
			/* something terribly wrong in our logic.
			 * we were able to down() the semaphore,
			 * but the list is empty... doh.
			 *
			 * what is the best thing to do now?
			 * try again from scratch, restarting the receiver,
			 * asender, whatnot? could break even more ugly,
			 * e.g. when we are primary, but no good local data.
			 *
			 * I'll try to get away just starting over this loop.
			 */
			conn_warn(tconn, "Work list unexpectedly empty\n");
			spin_unlock_irq(&tconn->data.work.q_lock);
			continue;
		}
		w = list_entry(tconn->data.work.q.next, struct drbd_work, list);
		list_del_init(&w->list);
		spin_unlock_irq(&tconn->data.work.q_lock);

		if (w->cb(w, tconn->cstate < C_WF_REPORT_PARAMS)) {
			/* dev_warn(DEV, "worker: a callback failed! \n"); */
			if (tconn->cstate >= C_WF_REPORT_PARAMS)
				conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
		}
	}

	spin_lock_irq(&tconn->data.work.q_lock);
	while (!list_empty(&tconn->data.work.q)) {
		list_splice_init(&tconn->data.work.q, &work_list);
		spin_unlock_irq(&tconn->data.work.q_lock);

		while (!list_empty(&work_list)) {
			w = list_entry(work_list.next, struct drbd_work, list);
			list_del_init(&w->list);
			w->cb(w, 1);
		}

		spin_lock_irq(&tconn->data.work.q_lock);
	}
	sema_init(&tconn->data.work.s, 0);
	/* DANGEROUS race: if someone did queue his work within the spinlock,
	 * but up() ed outside the spinlock, we could get an up() on the
	 * semaphore without corresponding list entry.
	 * So run it again ...
	 */
	spin_unlock_irq(&tconn->data.work.q_lock);

	/* _drbd_set_state only uses stop_nowait.
	 * wait here for the exiting receiver. */
	drbd_thread_stop(&tconn->receiver);

	down_read(&drbd_cfg_rwsem);
	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
		D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
		drbd_mdev_cleanup(mdev);
	}
	up_read(&drbd_cfg_rwsem);

	clear_bit(OBJECT_DYING, &tconn->flags);
	clear_bit(CONFIG_PENDING, &tconn->flags);
	wake_up(&tconn->ping_wait);

	return 0;
}