1 /*
2    drbd_receiver.c
3
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10    drbd is free software; you can redistribute it and/or modify
11    it under the terms of the GNU General Public License as published by
12    the Free Software Foundation; either version 2, or (at your option)
13    any later version.
14
15    drbd is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with drbd; see the file COPYING.  If not, write to
22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24
25
26 #include <linux/module.h>
27
28 #include <linux/uaccess.h>
29 #include <net/sock.h>
30
31 #include <linux/drbd.h>
32 #include <linux/fs.h>
33 #include <linux/file.h>
34 #include <linux/in.h>
35 #include <linux/mm.h>
36 #include <linux/memcontrol.h>
37 #include <linux/mm_inline.h>
38 #include <linux/slab.h>
39 #include <uapi/linux/sched/types.h>
40 #include <linux/sched/signal.h>
41 #include <linux/pkt_sched.h>
42 #define __KERNEL_SYSCALLS__
43 #include <linux/unistd.h>
44 #include <linux/vmalloc.h>
45 #include <linux/random.h>
46 #include <linux/string.h>
47 #include <linux/scatterlist.h>
48 #include "drbd_int.h"
49 #include "drbd_protocol.h"
50 #include "drbd_req.h"
51 #include "drbd_vli.h"
52
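/* Optional protocol features advertised during the feature handshake
 * (drbd_do_features()); only features supported by both peers end up
 * being used on the connection. */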
53 #define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME)
54
55 struct packet_info {
56         enum drbd_packet cmd;
57         unsigned int size;
58         unsigned int vnr;
59         void *data;
60 };
61
62 enum finish_epoch {
63         FE_STILL_LIVE,
64         FE_DESTROYED,
65         FE_RECYCLED,
66 };
67
68 static int drbd_do_features(struct drbd_connection *connection);
69 static int drbd_do_auth(struct drbd_connection *connection);
70 static int drbd_disconnected(struct drbd_peer_device *);
71 static void conn_wait_active_ee_empty(struct drbd_connection *connection);
72 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
73 static int e_end_block(struct drbd_work *, int);
74
75
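/* Best-effort allocation mask: highmem pages are fine, no reclaim/write-out
 * is triggered (see the comment in __drbd_alloc_pages()), and failures stay
 * silent; drbd_alloc_pages() simply retries later. */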
76 #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
77
78 /*
79  * some helper functions to deal with single linked page lists,
80  * page->private being our "next" pointer.
81  */
82
83 /* If at least n pages are linked at head, get n pages off.
84  * Otherwise, don't modify head, and return NULL.
85  * Locking is the responsibility of the caller.
86  */
87 static struct page *page_chain_del(struct page **head, int n)
88 {
89         struct page *page;
90         struct page *tmp;
91
92         BUG_ON(!n);
93         BUG_ON(!head);
94
95         page = *head;
96
97         if (!page)
98                 return NULL;
99
100         while (page) {
101                 tmp = page_chain_next(page);
102                 if (--n == 0)
103                         break; /* found sufficient pages */
104                 if (tmp == NULL)
105                         /* insufficient pages, don't use any of them. */
106                         return NULL;
107                 page = tmp;
108         }
109
110         /* add end of list marker for the returned list */
111         set_page_private(page, 0);
112         /* actual return value, and adjustment of head */
113         page = *head;
114         *head = tmp;
115         return page;
116 }
117
118 /* may be used outside of locks to find the tail of a (usually short)
119  * "private" page chain, before adding it back to a global chain head
120  * with page_chain_add() under a spinlock. */
121 static struct page *page_chain_tail(struct page *page, int *len)
122 {
123         struct page *tmp;
124         int i = 1;
125         while ((tmp = page_chain_next(page)))
126                 ++i, page = tmp;
127         if (len)
128                 *len = i;
129         return page;
130 }
131
132 static int page_chain_free(struct page *page)
133 {
134         struct page *tmp;
135         int i = 0;
136         page_chain_for_each_safe(page, tmp) {
137                 put_page(page);
138                 ++i;
139         }
140         return i;
141 }
142
143 static void page_chain_add(struct page **head,
144                 struct page *chain_first, struct page *chain_last)
145 {
146 #if 1
147         struct page *tmp;
148         tmp = page_chain_tail(chain_first, NULL);
149         BUG_ON(tmp != chain_last);
150 #endif
151
152         /* add chain to head */
153         set_page_private(chain_last, (unsigned long)*head);
154         *head = chain_first;
155 }
156
157 static struct page *__drbd_alloc_pages(struct drbd_device *device,
158                                        unsigned int number)
159 {
160         struct page *page = NULL;
161         struct page *tmp = NULL;
162         unsigned int i = 0;
163
164         /* Yes, testing drbd_pp_vacant outside the lock is racy.
165          * So what. It saves a spin_lock. */
166         if (drbd_pp_vacant >= number) {
167                 spin_lock(&drbd_pp_lock);
168                 page = page_chain_del(&drbd_pp_pool, number);
169                 if (page)
170                         drbd_pp_vacant -= number;
171                 spin_unlock(&drbd_pp_lock);
172                 if (page)
173                         return page;
174         }
175
176         /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
177          * "criss-cross" setup, that might cause write-out on some other DRBD,
178          * which in turn might block on the other node at this very place.  */
179         for (i = 0; i < number; i++) {
180                 tmp = alloc_page(GFP_TRY);
181                 if (!tmp)
182                         break;
183                 set_page_private(tmp, (unsigned long)page);
184                 page = tmp;
185         }
186
187         if (i == number)
188                 return page;
189
190         /* Not enough pages immediately available this time.
191          * No need to jump around here, drbd_alloc_pages will retry this
192          * function "soon". */
193         if (page) {
194                 tmp = page_chain_tail(page, NULL);
195                 spin_lock(&drbd_pp_lock);
196                 page_chain_add(&drbd_pp_pool, page, tmp);
197                 drbd_pp_vacant += i;
198                 spin_unlock(&drbd_pp_lock);
199         }
200         return NULL;
201 }
202
203 static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
204                                            struct list_head *to_be_freed)
205 {
206         struct drbd_peer_request *peer_req, *tmp;
207
208         /* The EEs are always appended to the end of the list. Since
209            they are sent in order over the wire, they have to finish
210            in order. As soon as we see the first unfinished one, we can
211            stop examining the list... */
212
213         list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
214                 if (drbd_peer_req_has_active_page(peer_req))
215                         break;
216                 list_move(&peer_req->w.list, to_be_freed);
217         }
218 }
219
220 static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
221 {
222         LIST_HEAD(reclaimed);
223         struct drbd_peer_request *peer_req, *t;
224
225         spin_lock_irq(&device->resource->req_lock);
226         reclaim_finished_net_peer_reqs(device, &reclaimed);
227         spin_unlock_irq(&device->resource->req_lock);
228         list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
229                 drbd_free_net_peer_req(device, peer_req);
230 }
231
232 static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
233 {
234         struct drbd_peer_device *peer_device;
235         int vnr;
236
237         rcu_read_lock();
238         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
239                 struct drbd_device *device = peer_device->device;
240                 if (!atomic_read(&device->pp_in_use_by_net))
241                         continue;
242
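                /* Pin the device with a kref so it cannot disappear while we
                 * temporarily leave the RCU read-side critical section for the
                 * actual reclaim work. */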
243                 kref_get(&device->kref);
244                 rcu_read_unlock();
245                 drbd_reclaim_net_peer_reqs(device);
246                 kref_put(&device->kref, drbd_destroy_device);
247                 rcu_read_lock();
248         }
249         rcu_read_unlock();
250 }
251
252 /**
253  * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
254  * @peer_device: DRBD peer device.
255  * @number:     number of pages requested
256  * @retry:      whether to retry, if not enough pages are available right now
257  *
258  * Tries to allocate number pages, first from our own page pool, then from
259  * the kernel.
260  * Possibly retry until DRBD frees sufficient pages somewhere else.
261  *
262  * If this allocation would exceed the max_buffers setting, we throttle
263  * allocation (schedule_timeout) to give the system some room to breathe.
264  *
265  * We do not use max-buffers as a hard limit, because it could lead to
266  * congestion and further to a distributed deadlock during online-verify or
267  * (checksum based) resync, if the max-buffers, socket buffer sizes and
268  * resync-rate settings are mis-configured.
269  *
270  * Returns a page chain linked via page->private.
271  */
272 struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
273                               bool retry)
274 {
275         struct drbd_device *device = peer_device->device;
276         struct page *page = NULL;
277         struct net_conf *nc;
278         DEFINE_WAIT(wait);
279         unsigned int mxb;
280
281         rcu_read_lock();
282         nc = rcu_dereference(peer_device->connection->net_conf);
283         mxb = nc ? nc->max_buffers : 1000000;
284         rcu_read_unlock();
285
286         if (atomic_read(&device->pp_in_use) < mxb)
287                 page = __drbd_alloc_pages(device, number);
288
289         /* Try to keep the fast path fast, but occasionally we need
290          * to reclaim the pages we lent to the network stack. */
291         if (page && atomic_read(&device->pp_in_use_by_net) > 512)
292                 drbd_reclaim_net_peer_reqs(device);
293
294         while (page == NULL) {
295                 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
296
297                 drbd_reclaim_net_peer_reqs(device);
298
299                 if (atomic_read(&device->pp_in_use) < mxb) {
300                         page = __drbd_alloc_pages(device, number);
301                         if (page)
302                                 break;
303                 }
304
305                 if (!retry)
306                         break;
307
308                 if (signal_pending(current)) {
309                         drbd_warn(device, "drbd_alloc_pages interrupted!\n");
310                         break;
311                 }
312
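                /* Waited a full HZ/10 without being woken up: stop honouring
                 * the max-buffers soft limit, to avoid the distributed
                 * deadlock described above drbd_alloc_pages(). */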
313                 if (schedule_timeout(HZ/10) == 0)
314                         mxb = UINT_MAX;
315         }
316         finish_wait(&drbd_pp_wait, &wait);
317
318         if (page)
319                 atomic_add(number, &device->pp_in_use);
320         return page;
321 }
322
323 /* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
324  * Is also used from inside another spin_lock_irq(&resource->req_lock);
325  * Either links the page chain back to the global pool,
326  * or returns all pages to the system. */
327 static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
328 {
329         atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
330         int i;
331
332         if (page == NULL)
333                 return;
334
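        /* If the global pool is already well stocked, release the pages to
         * the system; otherwise put the whole chain back into drbd_pp_pool. */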
335         if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
336                 i = page_chain_free(page);
337         else {
338                 struct page *tmp;
339                 tmp = page_chain_tail(page, &i);
340                 spin_lock(&drbd_pp_lock);
341                 page_chain_add(&drbd_pp_pool, page, tmp);
342                 drbd_pp_vacant += i;
343                 spin_unlock(&drbd_pp_lock);
344         }
345         i = atomic_sub_return(i, a);
346         if (i < 0)
347                 drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
348                         is_net ? "pp_in_use_by_net" : "pp_in_use", i);
349         wake_up(&drbd_pp_wait);
350 }
351
352 /*
353 You need to hold the req_lock:
354  _drbd_wait_ee_list_empty()
355
356 You must not have the req_lock:
357  drbd_free_peer_req()
358  drbd_alloc_peer_req()
359  drbd_free_peer_reqs()
360  drbd_ee_fix_bhs()
361  drbd_finish_peer_reqs()
362  drbd_clear_done_ee()
363  drbd_wait_ee_list_empty()
364 */
365
366 /* normal: payload_size == request size (bi_size)
367  * w_same: payload_size == logical_block_size
368  * trim: payload_size == 0 */
369 struct drbd_peer_request *
370 drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
371                     unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
372 {
373         struct drbd_device *device = peer_device->device;
374         struct drbd_peer_request *peer_req;
375         struct page *page = NULL;
376         unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT;
377
378         if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
379                 return NULL;
380
381         peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
382         if (!peer_req) {
383                 if (!(gfp_mask & __GFP_NOWARN))
384                         drbd_err(device, "%s: allocation failed\n", __func__);
385                 return NULL;
386         }
387
388         if (nr_pages) {
389                 page = drbd_alloc_pages(peer_device, nr_pages,
390                                         gfpflags_allow_blocking(gfp_mask));
391                 if (!page)
392                         goto fail;
393         }
394
395         memset(peer_req, 0, sizeof(*peer_req));
396         INIT_LIST_HEAD(&peer_req->w.list);
397         drbd_clear_interval(&peer_req->i);
398         peer_req->i.size = request_size;
399         peer_req->i.sector = sector;
400         peer_req->submit_jif = jiffies;
401         peer_req->peer_device = peer_device;
402         peer_req->pages = page;
403         /*
404          * The block_id is opaque to the receiver.  It is not endianness
405          * converted, and sent back to the sender unchanged.
406          */
407         peer_req->block_id = id;
408
409         return peer_req;
410
411  fail:
412         mempool_free(peer_req, drbd_ee_mempool);
413         return NULL;
414 }
415
416 void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
417                        int is_net)
418 {
419         might_sleep();
420         if (peer_req->flags & EE_HAS_DIGEST)
421                 kfree(peer_req->digest);
422         drbd_free_pages(device, peer_req->pages, is_net);
423         D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
424         D_ASSERT(device, drbd_interval_empty(&peer_req->i));
425         if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
426                 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
427                 drbd_al_complete_io(device, &peer_req->i);
428         }
429         mempool_free(peer_req, drbd_ee_mempool);
430 }
431
432 int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
433 {
434         LIST_HEAD(work_list);
435         struct drbd_peer_request *peer_req, *t;
436         int count = 0;
437         int is_net = list == &device->net_ee;
438
439         spin_lock_irq(&device->resource->req_lock);
440         list_splice_init(list, &work_list);
441         spin_unlock_irq(&device->resource->req_lock);
442
443         list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
444                 __drbd_free_peer_req(device, peer_req, is_net);
445                 count++;
446         }
447         return count;
448 }
449
450 /*
451  * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
452  */
453 static int drbd_finish_peer_reqs(struct drbd_device *device)
454 {
455         LIST_HEAD(work_list);
456         LIST_HEAD(reclaimed);
457         struct drbd_peer_request *peer_req, *t;
458         int err = 0;
459
460         spin_lock_irq(&device->resource->req_lock);
461         reclaim_finished_net_peer_reqs(device, &reclaimed);
462         list_splice_init(&device->done_ee, &work_list);
463         spin_unlock_irq(&device->resource->req_lock);
464
465         list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
466                 drbd_free_net_peer_req(device, peer_req);
467
468         /* possible callbacks here:
469          * e_end_block, and e_end_resync_block, e_send_superseded.
470          * all ignore the last argument.
471          */
472         list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
473                 int err2;
474
475                 /* list_del not necessary, next/prev members not touched */
476                 err2 = peer_req->w.cb(&peer_req->w, !!err);
477                 if (!err)
478                         err = err2;
479                 drbd_free_peer_req(device, peer_req);
480         }
481         wake_up(&device->ee_wait);
482
483         return err;
484 }
485
486 static void _drbd_wait_ee_list_empty(struct drbd_device *device,
487                                      struct list_head *head)
488 {
489         DEFINE_WAIT(wait);
490
491         /* avoids spin_lock/unlock
492          * and calling prepare_to_wait in the fast path */
493         while (!list_empty(head)) {
494                 prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
495                 spin_unlock_irq(&device->resource->req_lock);
496                 io_schedule();
497                 finish_wait(&device->ee_wait, &wait);
498                 spin_lock_irq(&device->resource->req_lock);
499         }
500 }
501
502 static void drbd_wait_ee_list_empty(struct drbd_device *device,
503                                     struct list_head *head)
504 {
505         spin_lock_irq(&device->resource->req_lock);
506         _drbd_wait_ee_list_empty(device, head);
507         spin_unlock_irq(&device->resource->req_lock);
508 }
509
510 static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
511 {
512         struct kvec iov = {
513                 .iov_base = buf,
514                 .iov_len = size,
515         };
516         struct msghdr msg = {
517                 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
518         };
519         return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags);
520 }
521
522 static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
523 {
524         int rv;
525
526         rv = drbd_recv_short(connection->data.socket, buf, size, 0);
527
528         if (rv < 0) {
529                 if (rv == -ECONNRESET)
530                         drbd_info(connection, "sock was reset by peer\n");
531                 else if (rv != -ERESTARTSYS)
532                         drbd_err(connection, "sock_recvmsg returned %d\n", rv);
533         } else if (rv == 0) {
534                 if (test_bit(DISCONNECT_SENT, &connection->flags)) {
535                         long t;
536                         rcu_read_lock();
537                         t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
538                         rcu_read_unlock();
539
540                         t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);
541
542                         if (t)
543                                 goto out;
544                 }
545                 drbd_info(connection, "sock was shut down by peer\n");
546         }
547
548         if (rv != size)
549                 conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
550
551 out:
552         return rv;
553 }
554
555 static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
556 {
557         int err;
558
559         err = drbd_recv(connection, buf, size);
560         if (err != size) {
561                 if (err >= 0)
562                         err = -EIO;
563         } else
564                 err = 0;
565         return err;
566 }
567
568 static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
569 {
570         int err;
571
572         err = drbd_recv_all(connection, buf, size);
573         if (err && !signal_pending(current))
574                 drbd_warn(connection, "short read (expected size %d)\n", (int)size);
575         return err;
576 }
577
578 /* quoting tcp(7):
579  *   On individual connections, the socket buffer size must be set prior to the
580  *   listen(2) or connect(2) calls in order to have it take effect.
581  * This is our wrapper to do so.
582  */
583 static void drbd_setbufsize(struct socket *sock, unsigned int snd,
584                 unsigned int rcv)
585 {
586         /* open coded SO_SNDBUF, SO_RCVBUF */
587         if (snd) {
588                 sock->sk->sk_sndbuf = snd;
589                 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
590         }
591         if (rcv) {
592                 sock->sk->sk_rcvbuf = rcv;
593                 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
594         }
595 }
596
597 static struct socket *drbd_try_connect(struct drbd_connection *connection)
598 {
599         const char *what;
600         struct socket *sock;
601         struct sockaddr_in6 src_in6;
602         struct sockaddr_in6 peer_in6;
603         struct net_conf *nc;
604         int err, peer_addr_len, my_addr_len;
605         int sndbuf_size, rcvbuf_size, connect_int;
606         int disconnect_on_error = 1;
607
608         rcu_read_lock();
609         nc = rcu_dereference(connection->net_conf);
610         if (!nc) {
611                 rcu_read_unlock();
612                 return NULL;
613         }
614         sndbuf_size = nc->sndbuf_size;
615         rcvbuf_size = nc->rcvbuf_size;
616         connect_int = nc->connect_int;
617         rcu_read_unlock();
618
619         my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
620         memcpy(&src_in6, &connection->my_addr, my_addr_len);
621
622         if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
623                 src_in6.sin6_port = 0;
624         else
625                 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
626
627         peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
628         memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
629
630         what = "sock_create_kern";
631         err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family,
632                                SOCK_STREAM, IPPROTO_TCP, &sock);
633         if (err < 0) {
634                 sock = NULL;
635                 goto out;
636         }
637
638         sock->sk->sk_rcvtimeo =
639         sock->sk->sk_sndtimeo = connect_int * HZ;
640         drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
641
642        /* explicitly bind to the configured IP as source IP
643         *  for the outgoing connections.
644         *  This is needed for multihomed hosts and to be
645         *  able to use lo: interfaces for drbd.
646         * Make sure to use 0 as port number, so linux selects
647         *  a free one dynamically.
648         */
649         what = "bind before connect";
650         err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
651         if (err < 0)
652                 goto out;
653
654         /* connect may fail, peer not yet available.
655          * stay C_WF_CONNECTION, don't go Disconnecting! */
656         disconnect_on_error = 0;
657         what = "connect";
658         err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
659
660 out:
661         if (err < 0) {
662                 if (sock) {
663                         sock_release(sock);
664                         sock = NULL;
665                 }
666                 switch (-err) {
667                         /* timeout, busy, signal pending */
668                 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
669                 case EINTR: case ERESTARTSYS:
670                         /* peer not (yet) available, network problem */
671                 case ECONNREFUSED: case ENETUNREACH:
672                 case EHOSTDOWN:    case EHOSTUNREACH:
673                         disconnect_on_error = 0;
674                         break;
675                 default:
676                         drbd_err(connection, "%s failed, err = %d\n", what, err);
677                 }
678                 if (disconnect_on_error)
679                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
680         }
681
682         return sock;
683 }
684
685 struct accept_wait_data {
686         struct drbd_connection *connection;
687         struct socket *s_listen;
688         struct completion door_bell;
689         void (*original_sk_state_change)(struct sock *sk);
690
691 };
692
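/* sk_state_change callback installed on the listen socket: ring the door bell
 * (wake drbd_wait_for_connect()) as soon as an incoming connection reaches
 * TCP_ESTABLISHED, then chain to the original callback. */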
693 static void drbd_incoming_connection(struct sock *sk)
694 {
695         struct accept_wait_data *ad = sk->sk_user_data;
696         void (*state_change)(struct sock *sk);
697
698         state_change = ad->original_sk_state_change;
699         if (sk->sk_state == TCP_ESTABLISHED)
700                 complete(&ad->door_bell);
701         state_change(sk);
702 }
703
704 static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
705 {
706         int err, sndbuf_size, rcvbuf_size, my_addr_len;
707         struct sockaddr_in6 my_addr;
708         struct socket *s_listen;
709         struct net_conf *nc;
710         const char *what;
711
712         rcu_read_lock();
713         nc = rcu_dereference(connection->net_conf);
714         if (!nc) {
715                 rcu_read_unlock();
716                 return -EIO;
717         }
718         sndbuf_size = nc->sndbuf_size;
719         rcvbuf_size = nc->rcvbuf_size;
720         rcu_read_unlock();
721
722         my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
723         memcpy(&my_addr, &connection->my_addr, my_addr_len);
724
725         what = "sock_create_kern";
726         err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family,
727                                SOCK_STREAM, IPPROTO_TCP, &s_listen);
728         if (err) {
729                 s_listen = NULL;
730                 goto out;
731         }
732
733         s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
734         drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
735
736         what = "bind before listen";
737         err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
738         if (err < 0)
739                 goto out;
740
741         ad->s_listen = s_listen;
742         write_lock_bh(&s_listen->sk->sk_callback_lock);
743         ad->original_sk_state_change = s_listen->sk->sk_state_change;
744         s_listen->sk->sk_state_change = drbd_incoming_connection;
745         s_listen->sk->sk_user_data = ad;
746         write_unlock_bh(&s_listen->sk->sk_callback_lock);
747
748         what = "listen";
749         err = s_listen->ops->listen(s_listen, 5);
750         if (err < 0)
751                 goto out;
752
753         return 0;
754 out:
755         if (s_listen)
756                 sock_release(s_listen);
757         if (err < 0) {
758                 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
759                         drbd_err(connection, "%s failed, err = %d\n", what, err);
760                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
761                 }
762         }
763
764         return -EIO;
765 }
766
767 static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
768 {
769         write_lock_bh(&sk->sk_callback_lock);
770         sk->sk_state_change = ad->original_sk_state_change;
771         sk->sk_user_data = NULL;
772         write_unlock_bh(&sk->sk_callback_lock);
773 }
774
775 static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
776 {
777         int timeo, connect_int, err = 0;
778         struct socket *s_estab = NULL;
779         struct net_conf *nc;
780
781         rcu_read_lock();
782         nc = rcu_dereference(connection->net_conf);
783         if (!nc) {
784                 rcu_read_unlock();
785                 return NULL;
786         }
787         connect_int = nc->connect_int;
788         rcu_read_unlock();
789
790         timeo = connect_int * HZ;
791         /* 28.5% random jitter */
792         timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7;
793
794         err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
795         if (err <= 0)
796                 return NULL;
797
798         err = kernel_accept(ad->s_listen, &s_estab, 0);
799         if (err < 0) {
800                 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
801                         drbd_err(connection, "accept failed, err = %d\n", err);
802                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
803                 }
804         }
805
806         if (s_estab)
807                 unregister_state_change(s_estab->sk, ad);
808
809         return s_estab;
810 }
811
812 static int decode_header(struct drbd_connection *, void *, struct packet_info *);
813
814 static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
815                              enum drbd_packet cmd)
816 {
817         if (!conn_prepare_command(connection, sock))
818                 return -EIO;
819         return conn_send_command(connection, sock, cmd, 0, NULL, 0);
820 }
821
822 static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
823 {
824         unsigned int header_size = drbd_header_size(connection);
825         struct packet_info pi;
826         struct net_conf *nc;
827         int err;
828
829         rcu_read_lock();
830         nc = rcu_dereference(connection->net_conf);
831         if (!nc) {
832                 rcu_read_unlock();
833                 return -EIO;
834         }
835         sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
836         rcu_read_unlock();
837
838         err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
839         if (err != header_size) {
840                 if (err >= 0)
841                         err = -EIO;
842                 return err;
843         }
844         err = decode_header(connection, connection->data.rbuf, &pi);
845         if (err)
846                 return err;
847         return pi.cmd;
848 }
849
850 /**
851  * drbd_socket_okay() - Free the socket if its connection is not okay
852  * @sock:       pointer to the pointer to the socket.
853  */
854 static bool drbd_socket_okay(struct socket **sock)
855 {
856         int rr;
857         char tb[4];
858
859         if (!*sock)
860                 return false;
861
862         rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
863
864         if (rr > 0 || rr == -EAGAIN) {
865                 return true;
866         } else {
867                 sock_release(*sock);
868                 *sock = NULL;
869                 return false;
870         }
871 }
872
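/* Both sockets must exist; wait sock_check_timeo (ping_timeo as fallback)
 * and then verify that both are still usable. */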
873 static bool connection_established(struct drbd_connection *connection,
874                                    struct socket **sock1,
875                                    struct socket **sock2)
876 {
877         struct net_conf *nc;
878         int timeout;
879         bool ok;
880
881         if (!*sock1 || !*sock2)
882                 return false;
883
884         rcu_read_lock();
885         nc = rcu_dereference(connection->net_conf);
886         timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
887         rcu_read_unlock();
888         schedule_timeout_interruptible(timeout);
889
890         ok = drbd_socket_okay(sock1);
891         ok = drbd_socket_okay(sock2) && ok;
892
893         return ok;
894 }
895
896 /* Gets called if a connection is established, or if a new minor gets created
897    in a connection */
898 int drbd_connected(struct drbd_peer_device *peer_device)
899 {
900         struct drbd_device *device = peer_device->device;
901         int err;
902
903         atomic_set(&device->packet_seq, 0);
904         device->peer_seq = 0;
905
906         device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
907                 &peer_device->connection->cstate_mutex :
908                 &device->own_state_mutex;
909
910         err = drbd_send_sync_param(peer_device);
911         if (!err)
912                 err = drbd_send_sizes(peer_device, 0, 0);
913         if (!err)
914                 err = drbd_send_uuids(peer_device);
915         if (!err)
916                 err = drbd_send_current_state(peer_device);
917         clear_bit(USE_DEGR_WFC_T, &device->flags);
918         clear_bit(RESIZE_PENDING, &device->flags);
919         atomic_set(&device->ap_in_flight, 0);
920         mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
921         return err;
922 }
923
924 /*
925  * return values:
926  *   1 yes, we have a valid connection
927  *   0 oops, did not work out, please try again
928  *  -1 peer talks different language,
929  *     no point in trying again, please go standalone.
930  *  -2 We do not have a network config...
931  */
932 static int conn_connect(struct drbd_connection *connection)
933 {
934         struct drbd_socket sock, msock;
935         struct drbd_peer_device *peer_device;
936         struct net_conf *nc;
937         int vnr, timeout, h;
938         bool discard_my_data, ok;
939         enum drbd_state_rv rv;
940         struct accept_wait_data ad = {
941                 .connection = connection,
942                 .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
943         };
944
945         clear_bit(DISCONNECT_SENT, &connection->flags);
946         if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
947                 return -2;
948
949         mutex_init(&sock.mutex);
950         sock.sbuf = connection->data.sbuf;
951         sock.rbuf = connection->data.rbuf;
952         sock.socket = NULL;
953         mutex_init(&msock.mutex);
954         msock.sbuf = connection->meta.sbuf;
955         msock.rbuf = connection->meta.rbuf;
956         msock.socket = NULL;
957
958         /* Assume that the peer only understands protocol 80 until we know better.  */
959         connection->agreed_pro_version = 80;
960
961         if (prepare_listen_socket(connection, &ad))
962                 return 0;
963
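        /* Each side both initiates an outgoing connection and accepts an
         * incoming one, until a usable pair of sockets ("data" and "meta")
         * has been established; crossed initial packets are sorted out below,
         * with a random retry to break the tie. */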
964         do {
965                 struct socket *s;
966
967                 s = drbd_try_connect(connection);
968                 if (s) {
969                         if (!sock.socket) {
970                                 sock.socket = s;
971                                 send_first_packet(connection, &sock, P_INITIAL_DATA);
972                         } else if (!msock.socket) {
973                                 clear_bit(RESOLVE_CONFLICTS, &connection->flags);
974                                 msock.socket = s;
975                                 send_first_packet(connection, &msock, P_INITIAL_META);
976                         } else {
977                                 drbd_err(connection, "Logic error in conn_connect()\n");
978                                 goto out_release_sockets;
979                         }
980                 }
981
982                 if (connection_established(connection, &sock.socket, &msock.socket))
983                         break;
984
985 retry:
986                 s = drbd_wait_for_connect(connection, &ad);
987                 if (s) {
988                         int fp = receive_first_packet(connection, s);
989                         drbd_socket_okay(&sock.socket);
990                         drbd_socket_okay(&msock.socket);
991                         switch (fp) {
992                         case P_INITIAL_DATA:
993                                 if (sock.socket) {
994                                         drbd_warn(connection, "initial packet S crossed\n");
995                                         sock_release(sock.socket);
996                                         sock.socket = s;
997                                         goto randomize;
998                                 }
999                                 sock.socket = s;
1000                                 break;
1001                         case P_INITIAL_META:
1002                                 set_bit(RESOLVE_CONFLICTS, &connection->flags);
1003                                 if (msock.socket) {
1004                                         drbd_warn(connection, "initial packet M crossed\n");
1005                                         sock_release(msock.socket);
1006                                         msock.socket = s;
1007                                         goto randomize;
1008                                 }
1009                                 msock.socket = s;
1010                                 break;
1011                         default:
1012                                 drbd_warn(connection, "Error receiving initial packet\n");
1013                                 sock_release(s);
1014 randomize:
1015                                 if (prandom_u32() & 1)
1016                                         goto retry;
1017                         }
1018                 }
1019
1020                 if (connection->cstate <= C_DISCONNECTING)
1021                         goto out_release_sockets;
1022                 if (signal_pending(current)) {
1023                         flush_signals(current);
1024                         smp_rmb();
1025                         if (get_t_state(&connection->receiver) == EXITING)
1026                                 goto out_release_sockets;
1027                 }
1028
1029                 ok = connection_established(connection, &sock.socket, &msock.socket);
1030         } while (!ok);
1031
1032         if (ad.s_listen)
1033                 sock_release(ad.s_listen);
1034
1035         sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
1036         msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
1037
1038         sock.socket->sk->sk_allocation = GFP_NOIO;
1039         msock.socket->sk->sk_allocation = GFP_NOIO;
1040
1041         sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
1042         msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
1043
1044         /* NOT YET ...
1045          * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
1046          * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1047          * first set it to the P_CONNECTION_FEATURES timeout,
1048          * which we set to 4x the configured ping_timeout. */
1049         rcu_read_lock();
1050         nc = rcu_dereference(connection->net_conf);
1051
1052         sock.socket->sk->sk_sndtimeo =
1053         sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
1054
1055         msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
1056         timeout = nc->timeout * HZ / 10;
1057         discard_my_data = nc->discard_my_data;
1058         rcu_read_unlock();
1059
1060         msock.socket->sk->sk_sndtimeo = timeout;
1061
1062         /* we don't want delays.
1063          * we use TCP_CORK where appropriate, though */
1064         drbd_tcp_nodelay(sock.socket);
1065         drbd_tcp_nodelay(msock.socket);
1066
1067         connection->data.socket = sock.socket;
1068         connection->meta.socket = msock.socket;
1069         connection->last_received = jiffies;
1070
1071         h = drbd_do_features(connection);
1072         if (h <= 0)
1073                 return h;
1074
1075         if (connection->cram_hmac_tfm) {
1076                 /* drbd_request_state(device, NS(conn, WFAuth)); */
1077                 switch (drbd_do_auth(connection)) {
1078                 case -1:
1079                         drbd_err(connection, "Authentication of peer failed\n");
1080                         return -1;
1081                 case 0:
1082                         drbd_err(connection, "Authentication of peer failed, trying again.\n");
1083                         return 0;
1084                 }
1085         }
1086
1087         connection->data.socket->sk->sk_sndtimeo = timeout;
1088         connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1089
1090         if (drbd_send_protocol(connection) == -EOPNOTSUPP)
1091                 return -1;
1092
1093         /* Prevent a race between resync-handshake and
1094          * being promoted to Primary.
1095          *
1096          * Grab and release the state mutex, so we know that any current
1097          * drbd_set_role() is finished, and any incoming drbd_set_role
1098          * will see the STATE_SENT flag, and wait for it to be cleared.
1099          */
1100         idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1101                 mutex_lock(peer_device->device->state_mutex);
1102
1103         set_bit(STATE_SENT, &connection->flags);
1104
1105         idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1106                 mutex_unlock(peer_device->device->state_mutex);
1107
1108         rcu_read_lock();
1109         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1110                 struct drbd_device *device = peer_device->device;
1111                 kref_get(&device->kref);
1112                 rcu_read_unlock();
1113
1114                 if (discard_my_data)
1115                         set_bit(DISCARD_MY_DATA, &device->flags);
1116                 else
1117                         clear_bit(DISCARD_MY_DATA, &device->flags);
1118
1119                 drbd_connected(peer_device);
1120                 kref_put(&device->kref, drbd_destroy_device);
1121                 rcu_read_lock();
1122         }
1123         rcu_read_unlock();
1124
1125         rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
1126         if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
1127                 clear_bit(STATE_SENT, &connection->flags);
1128                 return 0;
1129         }
1130
1131         drbd_thread_start(&connection->ack_receiver);
1132         /* opencoded create_singlethread_workqueue(),
1133          * to be able to use format string arguments */
1134         connection->ack_sender =
1135                 alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
1136         if (!connection->ack_sender) {
1137                 drbd_err(connection, "Failed to create workqueue ack_sender\n");
1138                 return 0;
1139         }
1140
1141         mutex_lock(&connection->resource->conf_update);
1142         /* The discard_my_data flag is a single-shot modifier to the next
1143          * connection attempt, the handshake of which is now well underway.
1144          * No need for rcu style copying of the whole struct
1145          * just to clear a single value. */
1146         connection->net_conf->discard_my_data = 0;
1147         mutex_unlock(&connection->resource->conf_update);
1148
1149         return h;
1150
1151 out_release_sockets:
1152         if (ad.s_listen)
1153                 sock_release(ad.s_listen);
1154         if (sock.socket)
1155                 sock_release(sock.socket);
1156         if (msock.socket)
1157                 sock_release(msock.socket);
1158         return -1;
1159 }
1160
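/* Parse one of the three on-the-wire header formats (p_header80, p_header95,
 * p_header100), selected by the agreed protocol version, into *pi. */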
1161 static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
1162 {
1163         unsigned int header_size = drbd_header_size(connection);
1164
1165         if (header_size == sizeof(struct p_header100) &&
1166             *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
1167                 struct p_header100 *h = header;
1168                 if (h->pad != 0) {
1169                         drbd_err(connection, "Header padding is not zero\n");
1170                         return -EINVAL;
1171                 }
1172                 pi->vnr = be16_to_cpu(h->volume);
1173                 pi->cmd = be16_to_cpu(h->command);
1174                 pi->size = be32_to_cpu(h->length);
1175         } else if (header_size == sizeof(struct p_header95) &&
1176                    *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
1177                 struct p_header95 *h = header;
1178                 pi->cmd = be16_to_cpu(h->command);
1179                 pi->size = be32_to_cpu(h->length);
1180                 pi->vnr = 0;
1181         } else if (header_size == sizeof(struct p_header80) &&
1182                    *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
1183                 struct p_header80 *h = header;
1184                 pi->cmd = be16_to_cpu(h->command);
1185                 pi->size = be16_to_cpu(h->length);
1186                 pi->vnr = 0;
1187         } else {
1188                 drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
1189                          be32_to_cpu(*(__be32 *)header),
1190                          connection->agreed_pro_version);
1191                 return -EINVAL;
1192         }
1193         pi->data = header + header_size;
1194         return 0;
1195 }
1196
1197 static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
1198 {
1199         void *buffer = connection->data.rbuf;
1200         int err;
1201
1202         err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
1203         if (err)
1204                 return err;
1205
1206         err = decode_header(connection, buffer, pi);
1207         connection->last_received = jiffies;
1208
1209         return err;
1210 }
1211
1212 /* This is blkdev_issue_flush, but asynchronous.
1213  * We want to submit to all component volumes in parallel,
1214  * then wait for all completions.
1215  */
1216 struct issue_flush_context {
1217         atomic_t pending;
1218         int error;
1219         struct completion done;
1220 };
1221 struct one_flush_context {
1222         struct drbd_device *device;
1223         struct issue_flush_context *ctx;
1224 };
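/* Note: drbd_flush() initializes ctx->pending to 1 and drops that initial
 * reference only after submitting all flushes, so ctx->done cannot complete
 * before the last submit_one_flush(). */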
1225
1226 void one_flush_endio(struct bio *bio)
1227 {
1228         struct one_flush_context *octx = bio->bi_private;
1229         struct drbd_device *device = octx->device;
1230         struct issue_flush_context *ctx = octx->ctx;
1231
1232         if (bio->bi_status) {
1233                 ctx->error = blk_status_to_errno(bio->bi_status);
1234                 drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status);
1235         }
1236         kfree(octx);
1237         bio_put(bio);
1238
1239         clear_bit(FLUSH_PENDING, &device->flags);
1240         put_ldev(device);
1241         kref_put(&device->kref, drbd_destroy_device);
1242
1243         if (atomic_dec_and_test(&ctx->pending))
1244                 complete(&ctx->done);
1245 }
1246
1247 static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
1248 {
1249         struct bio *bio = bio_alloc(GFP_NOIO, 0);
1250         struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
1251         if (!bio || !octx) {
1252                 drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n");
1253                 /* FIXME: what else can I do now?  disconnecting or detaching
1254                  * really does not help to improve the state of the world, either.
1255                  */
1256                 kfree(octx);
1257                 if (bio)
1258                         bio_put(bio);
1259
1260                 ctx->error = -ENOMEM;
1261                 put_ldev(device);
1262                 kref_put(&device->kref, drbd_destroy_device);
1263                 return;
1264         }
1265
1266         octx->device = device;
1267         octx->ctx = ctx;
1268         bio->bi_bdev = device->ldev->backing_bdev;
1269         bio->bi_private = octx;
1270         bio->bi_end_io = one_flush_endio;
1271         bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH;
1272
1273         device->flush_jif = jiffies;
1274         set_bit(FLUSH_PENDING, &device->flags);
1275         atomic_inc(&ctx->pending);
1276         submit_bio(bio);
1277 }
1278
1279 static void drbd_flush(struct drbd_connection *connection)
1280 {
1281         if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
1282                 struct drbd_peer_device *peer_device;
1283                 struct issue_flush_context ctx;
1284                 int vnr;
1285
1286                 atomic_set(&ctx.pending, 1);
1287                 ctx.error = 0;
1288                 init_completion(&ctx.done);
1289
1290                 rcu_read_lock();
1291                 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1292                         struct drbd_device *device = peer_device->device;
1293
1294                         if (!get_ldev(device))
1295                                 continue;
1296                         kref_get(&device->kref);
1297                         rcu_read_unlock();
1298
1299                         submit_one_flush(device, &ctx);
1300
1301                         rcu_read_lock();
1302                 }
1303                 rcu_read_unlock();
1304
1305                 /* Do we want to add a timeout,
1306                  * if disk-timeout is set? */
1307                 if (!atomic_dec_and_test(&ctx.pending))
1308                         wait_for_completion(&ctx.done);
1309
1310                 if (ctx.error) {
1311                         /* would rather check on EOPNOTSUPP, but that is not reliable.
1312                          * don't try again for ANY return value != 0
1313                          * if (rv == -EOPNOTSUPP) */
1314                         /* Any error is already reported by bio_endio callback. */
1315                         drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
1316                 }
1317         }
1318 }
1319
1320 /**
1321  * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
1322  * @connection: DRBD connection.
1323  * @epoch:      Epoch object.
1324  * @ev:         Epoch event.
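 *
 * An epoch is considered finished once it is non-empty, has no active
 * requests left, and has seen its barrier number (or this is a cleanup
 * event); a P_BARRIER_ACK is then sent to the peer, except during cleanup.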
1325  */
1326 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
1327                                                struct drbd_epoch *epoch,
1328                                                enum epoch_event ev)
1329 {
1330         int epoch_size;
1331         struct drbd_epoch *next_epoch;
1332         enum finish_epoch rv = FE_STILL_LIVE;
1333
1334         spin_lock(&connection->epoch_lock);
1335         do {
1336                 next_epoch = NULL;
1337
1338                 epoch_size = atomic_read(&epoch->epoch_size);
1339
1340                 switch (ev & ~EV_CLEANUP) {
1341                 case EV_PUT:
1342                         atomic_dec(&epoch->active);
1343                         break;
1344                 case EV_GOT_BARRIER_NR:
1345                         set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1346                         break;
1347                 case EV_BECAME_LAST:
1348                         /* nothing to do */
1349                         break;
1350                 }
1351
1352                 if (epoch_size != 0 &&
1353                     atomic_read(&epoch->active) == 0 &&
1354                     (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
1355                         if (!(ev & EV_CLEANUP)) {
1356                                 spin_unlock(&connection->epoch_lock);
1357                                 drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
1358                                 spin_lock(&connection->epoch_lock);
1359                         }
1360 #if 0
1361                         /* FIXME: dec unacked on connection, once we have
1362                          * something to count pending connection packets in. */
1363                         if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
1364                                 dec_unacked(epoch->connection);
1365 #endif
1366
1367                         if (connection->current_epoch != epoch) {
1368                                 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1369                                 list_del(&epoch->list);
1370                                 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1371                                 connection->epochs--;
1372                                 kfree(epoch);
1373
1374                                 if (rv == FE_STILL_LIVE)
1375                                         rv = FE_DESTROYED;
1376                         } else {
1377                                 epoch->flags = 0;
1378                                 atomic_set(&epoch->epoch_size, 0);
1379                                 /* atomic_set(&epoch->active, 0); is already zero */
1380                                 if (rv == FE_STILL_LIVE)
1381                                         rv = FE_RECYCLED;
1382                         }
1383                 }
1384
1385                 if (!next_epoch)
1386                         break;
1387
1388                 epoch = next_epoch;
1389         } while (1);
1390
1391         spin_unlock(&connection->epoch_lock);
1392
1393         return rv;
1394 }
1395
1396 static enum write_ordering_e
1397 max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
1398 {
1399         struct disk_conf *dc;
1400
1401         dc = rcu_dereference(bdev->disk_conf);
1402
1403         if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
1404                 wo = WO_DRAIN_IO;
1405         if (wo == WO_DRAIN_IO && !dc->disk_drain)
1406                 wo = WO_NONE;
1407
1408         return wo;
1409 }
1410
1411 /**
1412  * drbd_bump_write_ordering() - Fall back to another write ordering method
1413  * @resource:   DRBD resource.
1414  * @wo:         Write ordering method to try.
1415  */
1416 void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
1417                               enum write_ordering_e wo)
1418 {
1419         struct drbd_device *device;
1420         enum write_ordering_e pwo;
1421         int vnr;
1422         static char *write_ordering_str[] = {
1423                 [WO_NONE] = "none",
1424                 [WO_DRAIN_IO] = "drain",
1425                 [WO_BDEV_FLUSH] = "flush",
1426         };
1427
1428         pwo = resource->write_ordering;
1429         if (wo != WO_BDEV_FLUSH)
1430                 wo = min(pwo, wo);
1431         rcu_read_lock();
1432         idr_for_each_entry(&resource->devices, device, vnr) {
1433                 if (get_ldev(device)) {
1434                         wo = max_allowed_wo(device->ldev, wo);
1435                         if (device->ldev == bdev)
1436                                 bdev = NULL;
1437                         put_ldev(device);
1438                 }
1439         }
1440
1441         if (bdev)
1442                 wo = max_allowed_wo(bdev, wo);
1443
1444         rcu_read_unlock();
1445
1446         resource->write_ordering = wo;
1447         if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
1448                 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
1449 }
1450
1451 static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
1452 {
1453         struct block_device *bdev = device->ldev->backing_bdev;
1454
1455         if (blkdev_issue_zeroout(bdev, peer_req->i.sector, peer_req->i.size >> 9,
1456                         GFP_NOIO, 0))
1457                 peer_req->flags |= EE_WAS_ERROR;
1458
1459         drbd_endio_write_sec_final(peer_req);
1460 }
1461
1462 static void drbd_issue_peer_wsame(struct drbd_device *device,
1463                                   struct drbd_peer_request *peer_req)
1464 {
1465         struct block_device *bdev = device->ldev->backing_bdev;
1466         sector_t s = peer_req->i.sector;
1467         sector_t nr = peer_req->i.size >> 9;
1468         if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages))
1469                 peer_req->flags |= EE_WAS_ERROR;
1470         drbd_endio_write_sec_final(peer_req);
1471 }
1472
1473
1474 /**
1475  * drbd_submit_peer_request() - submit a peer request to the local backing device
1476  * @device:     DRBD device.
1477  * @peer_req:   peer request
1478  * @op, @op_flags: request operation and flags, see bio->bi_opf
1479  *
1480  * May spread the pages to multiple bios,
1481  * depending on bio_add_page restrictions.
1482  *
1483  * Returns 0 if all bios have been submitted,
1484  * -ENOMEM if we could not allocate enough bios,
1485  * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1486  *  single page to an empty bio (which should never happen and likely indicates
1487  *  that the lower level IO stack is in some way broken). This has been observed
1488  *  on certain Xen deployments.
1489  */
1490 /* TODO allocate from our own bio_set. */
1491 int drbd_submit_peer_request(struct drbd_device *device,
1492                              struct drbd_peer_request *peer_req,
1493                              const unsigned op, const unsigned op_flags,
1494                              const int fault_type)
1495 {
1496         struct bio *bios = NULL;
1497         struct bio *bio;
1498         struct page *page = peer_req->pages;
1499         sector_t sector = peer_req->i.sector;
1500         unsigned data_size = peer_req->i.size;
1501         unsigned n_bios = 0;
1502         unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
1503         int err = -ENOMEM;
1504
1505         /* TRIM/DISCARD: for now, always use the helper function
1506          * blkdev_issue_zeroout().
1507          * It's synchronous, but it does the right thing wrt. bio splitting.
1508          * Correctness first, performance later.  Next step is to code an
1509          * asynchronous variant of the same.
1510          */
1511         if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) {
1512                 /* wait for all pending IO completions, before we start
1513                  * zeroing things out. */
1514                 conn_wait_active_ee_empty(peer_req->peer_device->connection);
1515                 /* add it to the active list now,
1516                  * so we can find it to present it in debugfs */
1517                 peer_req->submit_jif = jiffies;
1518                 peer_req->flags |= EE_SUBMITTED;
1519
1520                 /* If this was a resync request from receive_rs_deallocated(),
1521                  * it is already on the sync_ee list */
1522                 if (list_empty(&peer_req->w.list)) {
1523                         spin_lock_irq(&device->resource->req_lock);
1524                         list_add_tail(&peer_req->w.list, &device->active_ee);
1525                         spin_unlock_irq(&device->resource->req_lock);
1526                 }
1527
1528                 if (peer_req->flags & EE_IS_TRIM)
1529                         drbd_issue_peer_discard(device, peer_req);
1530                 else /* EE_WRITE_SAME */
1531                         drbd_issue_peer_wsame(device, peer_req);
1532                 return 0;
1533         }
1534
1535         /* In most cases, we will only need one bio.  But in case the lower
1536          * level restrictions happen to be different at this offset on this
1537          * side than those of the sending peer, we may need to submit the
1538          * request in more than one bio.
1539          *
1540          * Plain bio_alloc is good enough here, this is no DRBD internally
1541          * generated bio, but a bio allocated on behalf of the peer.
1542          */
1543 next_bio:
1544         bio = bio_alloc(GFP_NOIO, nr_pages);
1545         if (!bio) {
1546                 drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
1547                 goto fail;
1548         }
1549         /* > peer_req->i.sector, unless this is the first bio */
1550         bio->bi_iter.bi_sector = sector;
1551         bio->bi_bdev = device->ldev->backing_bdev;
1552         bio_set_op_attrs(bio, op, op_flags);
1553         bio->bi_private = peer_req;
1554         bio->bi_end_io = drbd_peer_request_endio;
1555
1556         bio->bi_next = bios;
1557         bios = bio;
1558         ++n_bios;
1559
1560         page_chain_for_each(page) {
1561                 unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
1562                 if (!bio_add_page(bio, page, len, 0))
1563                         goto next_bio;
1564                 data_size -= len;
1565                 sector += len >> 9;
1566                 --nr_pages;
1567         }
1568         D_ASSERT(device, data_size == 0);
1569         D_ASSERT(device, page == NULL);
1570
1571         atomic_set(&peer_req->pending_bios, n_bios);
1572         /* for debugfs: update timestamp, mark as submitted */
1573         peer_req->submit_jif = jiffies;
1574         peer_req->flags |= EE_SUBMITTED;
1575         do {
1576                 bio = bios;
1577                 bios = bios->bi_next;
1578                 bio->bi_next = NULL;
1579
1580                 drbd_generic_make_request(device, fault_type, bio);
1581         } while (bios);
1582         return 0;
1583
1584 fail:
1585         while (bios) {
1586                 bio = bios;
1587                 bios = bios->bi_next;
1588                 bio_put(bio);
1589         }
1590         return err;
1591 }
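/* For illustration (not part of the driver logic): a 12 KiB peer request
 * gives nr_pages = (12288 + PAGE_SIZE - 1) >> PAGE_SHIFT = 3 with 4 KiB
 * pages.  Usually all three pages fit into a single bio; only when
 * bio_add_page() refuses a page because of queue limits at this offset is
 * another bio allocated at next_bio: and chained via bi_next, and
 * pending_bios records how many bios were submitted for this peer request.
 */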
1592
1593 static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
1594                                              struct drbd_peer_request *peer_req)
1595 {
1596         struct drbd_interval *i = &peer_req->i;
1597
1598         drbd_remove_interval(&device->write_requests, i);
1599         drbd_clear_interval(i);
1600
1601         /* Wake up any processes waiting for this peer request to complete.  */
1602         if (i->waiting)
1603                 wake_up(&device->misc_wait);
1604 }
1605
1606 static void conn_wait_active_ee_empty(struct drbd_connection *connection)
1607 {
1608         struct drbd_peer_device *peer_device;
1609         int vnr;
1610
1611         rcu_read_lock();
1612         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1613                 struct drbd_device *device = peer_device->device;
1614
1615                 kref_get(&device->kref);
1616                 rcu_read_unlock();
1617                 drbd_wait_ee_list_empty(device, &device->active_ee);
1618                 kref_put(&device->kref, drbd_destroy_device);
1619                 rcu_read_lock();
1620         }
1621         rcu_read_unlock();
1622 }
1623
1624 static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
1625 {
1626         int rv;
1627         struct p_barrier *p = pi->data;
1628         struct drbd_epoch *epoch;
1629
1630         /* FIXME these are unacked on connection,
1631          * not a specific (peer)device.
1632          */
1633         connection->current_epoch->barrier_nr = p->barrier;
1634         connection->current_epoch->connection = connection;
1635         rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
1636
1637         /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1638          * the activity log, which means it would not be resynced in case the
1639          * R_PRIMARY crashes now.
1640          * Therefore we must send the barrier_ack after the barrier request was
1641          * completed. */
1642         switch (connection->resource->write_ordering) {
1643         case WO_NONE:
1644                 if (rv == FE_RECYCLED)
1645                         return 0;
1646
1647                 /* receiver context, in the writeout path of the other node.
1648                  * avoid potential distributed deadlock */
1649                 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1650                 if (epoch)
1651                         break;
1652                 else
1653                         drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
1654                         /* Fall through */
1655
1656         case WO_BDEV_FLUSH:
1657         case WO_DRAIN_IO:
1658                 conn_wait_active_ee_empty(connection);
1659                 drbd_flush(connection);
1660
1661                 if (atomic_read(&connection->current_epoch->epoch_size)) {
1662                         epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1663                         if (epoch)
1664                                 break;
1665                 }
1666
1667                 return 0;
1668         default:
1669                 drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
1670                          connection->resource->write_ordering);
1671                 return -EIO;
1672         }
1673
1674         epoch->flags = 0;
1675         atomic_set(&epoch->epoch_size, 0);
1676         atomic_set(&epoch->active, 0);
1677
1678         spin_lock(&connection->epoch_lock);
1679         if (atomic_read(&connection->current_epoch->epoch_size)) {
1680                 list_add(&epoch->list, &connection->current_epoch->list);
1681                 connection->current_epoch = epoch;
1682                 connection->epochs++;
1683         } else {
1684                 /* The current_epoch got recycled while we allocated this one... */
1685                 kfree(epoch);
1686         }
1687         spin_unlock(&connection->epoch_lock);
1688
1689         return 0;
1690 }
1691
1692 /* quick wrapper in case payload size != request_size (write same) */
1693 static void drbd_csum_ee_size(struct crypto_ahash *h,
1694                               struct drbd_peer_request *r, void *d,
1695                               unsigned int payload_size)
1696 {
1697         unsigned int tmp = r->i.size;
1698         r->i.size = payload_size;
1699         drbd_csum_ee(h, r, d);
1700         r->i.size = tmp;
1701 }
1702
1703 /* used from receive_RSDataReply (recv_resync_read)
1704  * and from receive_Data.
1705  * data_size: actual payload ("data in")
1706  *      for normal writes that is bi_size.
1707  *      for discards, that is zero.
1708  *      for write same, it is logical_block_size.
1709  * both trim and write same have the bi_size ("data len to be affected")
1710  * as extra argument in the packet header.
1711  */
1712 static struct drbd_peer_request *
1713 read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1714               struct packet_info *pi) __must_hold(local)
1715 {
1716         struct drbd_device *device = peer_device->device;
1717         const sector_t capacity = drbd_get_capacity(device->this_bdev);
1718         struct drbd_peer_request *peer_req;
1719         struct page *page;
1720         int digest_size, err;
1721         unsigned int data_size = pi->size, ds;
1722         void *dig_in = peer_device->connection->int_dig_in;
1723         void *dig_vv = peer_device->connection->int_dig_vv;
1724         unsigned long *data;
1725         struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
1726         struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL;
1727
1728         digest_size = 0;
1729         if (!trim && peer_device->connection->peer_integrity_tfm) {
1730                 digest_size = crypto_ahash_digestsize(peer_device->connection->peer_integrity_tfm);
1731                 /*
1732                  * FIXME: Receive the incoming digest into the receive buffer
1733                  *        here, together with its struct p_data?
1734                  */
1735                 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
1736                 if (err)
1737                         return NULL;
1738                 data_size -= digest_size;
1739         }
1740
1741         /* assume request_size == data_size, but special case trim and wsame. */
1742         ds = data_size;
1743         if (trim) {
1744                 if (!expect(data_size == 0))
1745                         return NULL;
1746                 ds = be32_to_cpu(trim->size);
1747         } else if (wsame) {
1748                 if (data_size != queue_logical_block_size(device->rq_queue)) {
1749                         drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n",
1750                                 data_size, queue_logical_block_size(device->rq_queue));
1751                         return NULL;
1752                 }
1753                 if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) {
1754                         drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n",
1755                                 data_size, bdev_logical_block_size(device->ldev->backing_bdev));
1756                         return NULL;
1757                 }
1758                 ds = be32_to_cpu(wsame->size);
1759         }
1760
1761         if (!expect(IS_ALIGNED(ds, 512)))
1762                 return NULL;
1763         if (trim || wsame) {
1764                 if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
1765                         return NULL;
1766         } else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
1767                 return NULL;
1768
1769         /* even though we trust our peer,
1770          * we sometimes have to double check. */
1771         if (sector + (ds>>9) > capacity) {
1772                 drbd_err(device, "request from peer beyond end of local disk: "
1773                         "capacity: %llus < sector: %llus + size: %u\n",
1774                         (unsigned long long)capacity,
1775                         (unsigned long long)sector, ds);
1776                 return NULL;
1777         }
1778
1779         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1780          * "criss-cross" setup, that might cause write-out on some other DRBD,
1781          * which in turn might block on the other node at this very place.  */
1782         peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
1783         if (!peer_req)
1784                 return NULL;
1785
1786         peer_req->flags |= EE_WRITE;
1787         if (trim) {
1788                 peer_req->flags |= EE_IS_TRIM;
1789                 return peer_req;
1790         }
1791         if (wsame)
1792                 peer_req->flags |= EE_WRITE_SAME;
1793
1794         /* receive payload size bytes into page chain */
1795         ds = data_size;
1796         page = peer_req->pages;
1797         page_chain_for_each(page) {
1798                 unsigned len = min_t(int, ds, PAGE_SIZE);
1799                 data = kmap(page);
1800                 err = drbd_recv_all_warn(peer_device->connection, data, len);
1801                 if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
1802                         drbd_err(device, "Fault injection: Corrupting data on receive\n");
1803                         data[0] = data[0] ^ (unsigned long)-1;
1804                 }
1805                 kunmap(page);
1806                 if (err) {
1807                         drbd_free_peer_req(device, peer_req);
1808                         return NULL;
1809                 }
1810                 ds -= len;
1811         }
1812
1813         if (digest_size) {
1814                 drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
1815                 if (memcmp(dig_in, dig_vv, digest_size)) {
1816                         drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
1817                                 (unsigned long long)sector, data_size);
1818                         drbd_free_peer_req(device, peer_req);
1819                         return NULL;
1820                 }
1821         }
1822         device->recv_cnt += data_size >> 9;
1823         return peer_req;
1824 }
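/* For illustration (not part of the driver logic): with a peer integrity
 * algorithm producing, say, a 20 byte digest and pi->size = 4116, the
 * digest is received into dig_in first and data_size drops to 4096; those
 * 4096 payload bytes fill the page chain, and drbd_csum_ee_size()
 * recomputes the digest over exactly that payload for the memcmp()
 * against dig_in.
 */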
1825
1826 /* drbd_drain_block() just takes a data block
1827  * out of the socket input buffer, and discards it.
1828  */
1829 static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
1830 {
1831         struct page *page;
1832         int err = 0;
1833         void *data;
1834
1835         if (!data_size)
1836                 return 0;
1837
1838         page = drbd_alloc_pages(peer_device, 1, 1);
1839
1840         data = kmap(page);
1841         while (data_size) {
1842                 unsigned int len = min_t(int, data_size, PAGE_SIZE);
1843
1844                 err = drbd_recv_all_warn(peer_device->connection, data, len);
1845                 if (err)
1846                         break;
1847                 data_size -= len;
1848         }
1849         kunmap(page);
1850         drbd_free_pages(peer_device->device, page, 0);
1851         return err;
1852 }
1853
1854 static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
1855                            sector_t sector, int data_size)
1856 {
1857         struct bio_vec bvec;
1858         struct bvec_iter iter;
1859         struct bio *bio;
1860         int digest_size, err, expect;
1861         void *dig_in = peer_device->connection->int_dig_in;
1862         void *dig_vv = peer_device->connection->int_dig_vv;
1863
1864         digest_size = 0;
1865         if (peer_device->connection->peer_integrity_tfm) {
1866                 digest_size = crypto_ahash_digestsize(peer_device->connection->peer_integrity_tfm);
1867                 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
1868                 if (err)
1869                         return err;
1870                 data_size -= digest_size;
1871         }
1872
1873         /* optimistically update recv_cnt.  if receiving fails below,
1874          * we disconnect anyways, and counters will be reset. */
1875         peer_device->device->recv_cnt += data_size>>9;
1876
1877         bio = req->master_bio;
1878         D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
1879
1880         bio_for_each_segment(bvec, bio, iter) {
1881                 void *mapped = kmap(bvec.bv_page) + bvec.bv_offset;
1882                 expect = min_t(int, data_size, bvec.bv_len);
1883                 err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
1884                 kunmap(bvec.bv_page);
1885                 if (err)
1886                         return err;
1887                 data_size -= expect;
1888         }
1889
1890         if (digest_size) {
1891                 drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
1892                 if (memcmp(dig_in, dig_vv, digest_size)) {
1893                         drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
1894                         return -EINVAL;
1895                 }
1896         }
1897
1898         D_ASSERT(peer_device->device, data_size == 0);
1899         return 0;
1900 }
1901
1902 /*
1903  * e_end_resync_block() is called in ack_sender context via
1904  * drbd_finish_peer_reqs().
1905  */
1906 static int e_end_resync_block(struct drbd_work *w, int unused)
1907 {
1908         struct drbd_peer_request *peer_req =
1909                 container_of(w, struct drbd_peer_request, w);
1910         struct drbd_peer_device *peer_device = peer_req->peer_device;
1911         struct drbd_device *device = peer_device->device;
1912         sector_t sector = peer_req->i.sector;
1913         int err;
1914
1915         D_ASSERT(device, drbd_interval_empty(&peer_req->i));
1916
1917         if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1918                 drbd_set_in_sync(device, sector, peer_req->i.size);
1919                 err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
1920         } else {
1921                 /* Record failure to sync */
1922                 drbd_rs_failed_io(device, sector, peer_req->i.size);
1923
1924                 err  = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
1925         }
1926         dec_unacked(device);
1927
1928         return err;
1929 }
1930
1931 static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
1932                             struct packet_info *pi) __releases(local)
1933 {
1934         struct drbd_device *device = peer_device->device;
1935         struct drbd_peer_request *peer_req;
1936
1937         peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
1938         if (!peer_req)
1939                 goto fail;
1940
1941         dec_rs_pending(device);
1942
1943         inc_unacked(device);
1944         /* corresponding dec_unacked() in e_end_resync_block()
1945          * respective _drbd_clear_done_ee */
1946
1947         peer_req->w.cb = e_end_resync_block;
1948         peer_req->submit_jif = jiffies;
1949
1950         spin_lock_irq(&device->resource->req_lock);
1951         list_add_tail(&peer_req->w.list, &device->sync_ee);
1952         spin_unlock_irq(&device->resource->req_lock);
1953
1954         atomic_add(pi->size >> 9, &device->rs_sect_ev);
1955         if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, 0,
1956                                      DRBD_FAULT_RS_WR) == 0)
1957                 return 0;
1958
1959         /* don't care for the reason here */
1960         drbd_err(device, "submit failed, triggering re-connect\n");
1961         spin_lock_irq(&device->resource->req_lock);
1962         list_del(&peer_req->w.list);
1963         spin_unlock_irq(&device->resource->req_lock);
1964
1965         drbd_free_peer_req(device, peer_req);
1966 fail:
1967         put_ldev(device);
1968         return -EIO;
1969 }
1970
1971 static struct drbd_request *
1972 find_request(struct drbd_device *device, struct rb_root *root, u64 id,
1973              sector_t sector, bool missing_ok, const char *func)
1974 {
1975         struct drbd_request *req;
1976
1977         /* Request object according to our peer */
1978         req = (struct drbd_request *)(unsigned long)id;
1979         if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
1980                 return req;
1981         if (!missing_ok) {
1982                 drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
1983                         (unsigned long)id, (unsigned long long)sector);
1984         }
1985         return NULL;
1986 }
1987
1988 static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
1989 {
1990         struct drbd_peer_device *peer_device;
1991         struct drbd_device *device;
1992         struct drbd_request *req;
1993         sector_t sector;
1994         int err;
1995         struct p_data *p = pi->data;
1996
1997         peer_device = conn_peer_device(connection, pi->vnr);
1998         if (!peer_device)
1999                 return -EIO;
2000         device = peer_device->device;
2001
2002         sector = be64_to_cpu(p->sector);
2003
2004         spin_lock_irq(&device->resource->req_lock);
2005         req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
2006         spin_unlock_irq(&device->resource->req_lock);
2007         if (unlikely(!req))
2008                 return -EIO;
2009
2010         /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
2011          * special casing it there for the various failure cases.
2012          * still no race with drbd_fail_pending_reads */
2013         err = recv_dless_read(peer_device, req, sector, pi->size);
2014         if (!err)
2015                 req_mod(req, DATA_RECEIVED);
2016         /* else: nothing. handled from drbd_disconnect...
2017          * I don't think we may complete this just yet
2018          * in case we are "on-disconnect: freeze" */
2019
2020         return err;
2021 }
2022
2023 static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
2024 {
2025         struct drbd_peer_device *peer_device;
2026         struct drbd_device *device;
2027         sector_t sector;
2028         int err;
2029         struct p_data *p = pi->data;
2030
2031         peer_device = conn_peer_device(connection, pi->vnr);
2032         if (!peer_device)
2033                 return -EIO;
2034         device = peer_device->device;
2035
2036         sector = be64_to_cpu(p->sector);
2037         D_ASSERT(device, p->block_id == ID_SYNCER);
2038
2039         if (get_ldev(device)) {
2040                 /* data is submitted to disk within recv_resync_read.
2041                  * corresponding put_ldev done below on error,
2042                  * or in drbd_peer_request_endio. */
2043                 err = recv_resync_read(peer_device, sector, pi);
2044         } else {
2045                 if (__ratelimit(&drbd_ratelimit_state))
2046                         drbd_err(device, "Can not write resync data to local disk.\n");
2047
2048                 err = drbd_drain_block(peer_device, pi->size);
2049
2050                 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
2051         }
2052
2053         atomic_add(pi->size >> 9, &device->rs_sect_in);
2054
2055         return err;
2056 }
2057
2058 static void restart_conflicting_writes(struct drbd_device *device,
2059                                        sector_t sector, int size)
2060 {
2061         struct drbd_interval *i;
2062         struct drbd_request *req;
2063
2064         drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2065                 if (!i->local)
2066                         continue;
2067                 req = container_of(i, struct drbd_request, i);
2068                 if (req->rq_state & RQ_LOCAL_PENDING ||
2069                     !(req->rq_state & RQ_POSTPONED))
2070                         continue;
2071                 /* as it is RQ_POSTPONED, this will cause it to
2072                  * be queued on the retry workqueue. */
2073                 __req_mod(req, CONFLICT_RESOLVED, NULL);
2074         }
2075 }
2076
2077 /*
2078  * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
2079  */
2080 static int e_end_block(struct drbd_work *w, int cancel)
2081 {
2082         struct drbd_peer_request *peer_req =
2083                 container_of(w, struct drbd_peer_request, w);
2084         struct drbd_peer_device *peer_device = peer_req->peer_device;
2085         struct drbd_device *device = peer_device->device;
2086         sector_t sector = peer_req->i.sector;
2087         int err = 0, pcmd;
2088
2089         if (peer_req->flags & EE_SEND_WRITE_ACK) {
2090                 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
2091                         pcmd = (device->state.conn >= C_SYNC_SOURCE &&
2092                                 device->state.conn <= C_PAUSED_SYNC_T &&
2093                                 peer_req->flags & EE_MAY_SET_IN_SYNC) ?
2094                                 P_RS_WRITE_ACK : P_WRITE_ACK;
2095                         err = drbd_send_ack(peer_device, pcmd, peer_req);
2096                         if (pcmd == P_RS_WRITE_ACK)
2097                                 drbd_set_in_sync(device, sector, peer_req->i.size);
2098                 } else {
2099                         err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
2100                         /* we expect it to be marked out of sync anyways...
2101                          * maybe assert this?  */
2102                 }
2103                 dec_unacked(device);
2104         }
2105
2106         /* we delete from the conflict detection hash _after_ we sent out the
2107          * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
2108         if (peer_req->flags & EE_IN_INTERVAL_TREE) {
2109                 spin_lock_irq(&device->resource->req_lock);
2110                 D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
2111                 drbd_remove_epoch_entry_interval(device, peer_req);
2112                 if (peer_req->flags & EE_RESTART_REQUESTS)
2113                         restart_conflicting_writes(device, sector, peer_req->i.size);
2114                 spin_unlock_irq(&device->resource->req_lock);
2115         } else
2116                 D_ASSERT(device, drbd_interval_empty(&peer_req->i));
2117
2118         drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
2119
2120         return err;
2121 }
2122
2123 static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
2124 {
2125         struct drbd_peer_request *peer_req =
2126                 container_of(w, struct drbd_peer_request, w);
2127         struct drbd_peer_device *peer_device = peer_req->peer_device;
2128         int err;
2129
2130         err = drbd_send_ack(peer_device, ack, peer_req);
2131         dec_unacked(peer_device->device);
2132
2133         return err;
2134 }
2135
2136 static int e_send_superseded(struct drbd_work *w, int unused)
2137 {
2138         return e_send_ack(w, P_SUPERSEDED);
2139 }
2140
2141 static int e_send_retry_write(struct drbd_work *w, int unused)
2142 {
2143         struct drbd_peer_request *peer_req =
2144                 container_of(w, struct drbd_peer_request, w);
2145         struct drbd_connection *connection = peer_req->peer_device->connection;
2146
2147         return e_send_ack(w, connection->agreed_pro_version >= 100 ?
2148                              P_RETRY_WRITE : P_SUPERSEDED);
2149 }
2150
2151 static bool seq_greater(u32 a, u32 b)
2152 {
2153         /*
2154          * We assume 32-bit wrap-around here.
2155          * For 24-bit wrap-around, we would have to shift:
2156          *  a <<= 8; b <<= 8;
2157          */
2158         return (s32)a - (s32)b > 0;
2159 }
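/* Worked example: seq_greater(1, 0xffffffff) is true, because
 * (s32)1 - (s32)0xffffffff == 1 - (-1) == 2 > 0, i.e. a sequence number
 * that just wrapped past zero still counts as newer.  Sequence numbers
 * more than 2^31 apart compare the "wrong" way round, which this scheme
 * knowingly accepts.
 */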
2160
2161 static u32 seq_max(u32 a, u32 b)
2162 {
2163         return seq_greater(a, b) ? a : b;
2164 }
2165
2166 static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
2167 {
2168         struct drbd_device *device = peer_device->device;
2169         unsigned int newest_peer_seq;
2170
2171         if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
2172                 spin_lock(&device->peer_seq_lock);
2173                 newest_peer_seq = seq_max(device->peer_seq, peer_seq);
2174                 device->peer_seq = newest_peer_seq;
2175                 spin_unlock(&device->peer_seq_lock);
2176                 /* wake up only if we actually changed device->peer_seq */
2177                 if (peer_seq == newest_peer_seq)
2178                         wake_up(&device->seq_wait);
2179         }
2180 }
2181
2182 static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
2183 {
2184         return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
2185 }
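/* Worked example: s1/s2 are 512-byte sectors while l1/l2 are byte lengths,
 * hence the >> 9.  overlaps(0, 4096, 4, 4096) is true, since [0,8) and
 * [4,12) share sectors 4..7; overlaps(0, 4096, 8, 4096) is false, the two
 * 8-sector ranges merely touch at sector 8.
 */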
2186
2187 /* maybe change sync_ee into interval trees as well? */
2188 static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
2189 {
2190         struct drbd_peer_request *rs_req;
2191         bool rv = false;
2192
2193         spin_lock_irq(&device->resource->req_lock);
2194         list_for_each_entry(rs_req, &device->sync_ee, w.list) {
2195                 if (overlaps(peer_req->i.sector, peer_req->i.size,
2196                              rs_req->i.sector, rs_req->i.size)) {
2197                         rv = true;
2198                         break;
2199                 }
2200         }
2201         spin_unlock_irq(&device->resource->req_lock);
2202
2203         return rv;
2204 }
2205
2206 /* Called from receive_Data.
2207  * Synchronize packets on sock with packets on msock.
2208  *
2209  * This is here so even when a P_DATA packet traveling via sock overtook an Ack
2210  * packet traveling on msock, they are still processed in the order they have
2211  * been sent.
2212  *
2213  * Note: we don't care for Ack packets overtaking P_DATA packets.
2214  *
2215  * In case peer_seq is larger than device->peer_seq, there are
2216  * outstanding packets on the msock. We wait for them to arrive.
2217  * In case we are the logically next packet, we update device->peer_seq
2218  * ourselves. Correctly handles 32bit wrap around.
2219  *
2220  * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
2221  * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
2222  * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
2223  * 1<<11 == 2048 seconds aka ages for the 32bit wrap around...
2224  *
2225  * returns 0 if we may process the packet,
2226  * -ERESTARTSYS if we were interrupted (by disconnect signal). */
2227 static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
2228 {
2229         struct drbd_device *device = peer_device->device;
2230         DEFINE_WAIT(wait);
2231         long timeout;
2232         int ret = 0, tp;
2233
2234         if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
2235                 return 0;
2236
2237         spin_lock(&device->peer_seq_lock);
2238         for (;;) {
2239                 if (!seq_greater(peer_seq - 1, device->peer_seq)) {
2240                         device->peer_seq = seq_max(device->peer_seq, peer_seq);
2241                         break;
2242                 }
2243
2244                 if (signal_pending(current)) {
2245                         ret = -ERESTARTSYS;
2246                         break;
2247                 }
2248
2249                 rcu_read_lock();
2250                 tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
2251                 rcu_read_unlock();
2252
2253                 if (!tp)
2254                         break;
2255
2256                 /* Only need to wait if two_primaries is enabled */
2257                 prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
2258                 spin_unlock(&device->peer_seq_lock);
2259                 rcu_read_lock();
2260                 timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
2261                 rcu_read_unlock();
2262                 timeout = schedule_timeout(timeout);
2263                 spin_lock(&device->peer_seq_lock);
2264                 if (!timeout) {
2265                         ret = -ETIMEDOUT;
2266                         drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
2267                         break;
2268                 }
2269         }
2270         spin_unlock(&device->peer_seq_lock);
2271         finish_wait(&device->seq_wait, &wait);
2272         return ret;
2273 }
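/* For illustration: if device->peer_seq is 41 and a write with peer_seq 42
 * arrives, seq_greater(41, 41) is false, so the packet is processed right
 * away and device->peer_seq becomes 42.  If peer_seq 44 arrives first,
 * packets 42 and 43 are still outstanding, so we sleep on seq_wait (only
 * with two primaries configured), for at most ping_timeo/10 seconds per
 * wait, before giving up with -ETIMEDOUT.
 */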
2274
2275 /* see also bio_flags_to_wire()
2276  * We map bio REQ_* flags to DP_* data packet flags and back semantically,
2277  * because the peer may run a different kernel version. */
2278 static unsigned long wire_flags_to_bio_flags(u32 dpf)
2279 {
2280         return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
2281                 (dpf & DP_FUA ? REQ_FUA : 0) |
2282                 (dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
2283 }
2284
2285 static unsigned long wire_flags_to_bio_op(u32 dpf)
2286 {
2287         if (dpf & DP_DISCARD)
2288                 return REQ_OP_WRITE_ZEROES;
2289         else
2290                 return REQ_OP_WRITE;
2291 }
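/* For illustration: a write received with dp_flags = DP_RW_SYNC | DP_FUA
 * is resubmitted locally as REQ_OP_WRITE with REQ_SYNC | REQ_FUA; with
 * DP_DISCARD set, the operation becomes REQ_OP_WRITE_ZEROES instead.
 */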
2292
2293 static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
2294                                     unsigned int size)
2295 {
2296         struct drbd_interval *i;
2297
2298     repeat:
2299         drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2300                 struct drbd_request *req;
2301                 struct bio_and_error m;
2302
2303                 if (!i->local)
2304                         continue;
2305                 req = container_of(i, struct drbd_request, i);
2306                 if (!(req->rq_state & RQ_POSTPONED))
2307                         continue;
2308                 req->rq_state &= ~RQ_POSTPONED;
2309                 __req_mod(req, NEG_ACKED, &m);
2310                 spin_unlock_irq(&device->resource->req_lock);
2311                 if (m.bio)
2312                         complete_master_bio(device, &m);
2313                 spin_lock_irq(&device->resource->req_lock);
2314                 goto repeat;
2315         }
2316 }
2317
2318 static int handle_write_conflicts(struct drbd_device *device,
2319                                   struct drbd_peer_request *peer_req)
2320 {
2321         struct drbd_connection *connection = peer_req->peer_device->connection;
2322         bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
2323         sector_t sector = peer_req->i.sector;
2324         const unsigned int size = peer_req->i.size;
2325         struct drbd_interval *i;
2326         bool equal;
2327         int err;
2328
2329         /*
2330          * Inserting the peer request into the write_requests tree will prevent
2331          * new conflicting local requests from being added.
2332          */
2333         drbd_insert_interval(&device->write_requests, &peer_req->i);
2334
2335     repeat:
2336         drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2337                 if (i == &peer_req->i)
2338                         continue;
2339                 if (i->completed)
2340                         continue;
2341
2342                 if (!i->local) {
2343                         /*
2344                          * Our peer has sent a conflicting remote request; this
2345                          * should not happen in a two-node setup.  Wait for the
2346                          * earlier peer request to complete.
2347                          */
2348                         err = drbd_wait_misc(device, i);
2349                         if (err)
2350                                 goto out;
2351                         goto repeat;
2352                 }
2353
2354                 equal = i->sector == sector && i->size == size;
2355                 if (resolve_conflicts) {
2356                         /*
2357                          * If the peer request is fully contained within the
2358                          * overlapping request, it can be considered overwritten
2359                          * and thus superseded; otherwise, it will be retried
2360                          * once all overlapping requests have completed.
2361                          */
2362                         bool superseded = i->sector <= sector && i->sector +
2363                                        (i->size >> 9) >= sector + (size >> 9);
2364
2365                         if (!equal)
2366                                 drbd_alert(device, "Concurrent writes detected: "
2367                                                "local=%llus +%u, remote=%llus +%u, "
2368                                                "assuming %s came first\n",
2369                                           (unsigned long long)i->sector, i->size,
2370                                           (unsigned long long)sector, size,
2371                                           superseded ? "local" : "remote");
2372
2373                         peer_req->w.cb = superseded ? e_send_superseded :
2374                                                    e_send_retry_write;
2375                         list_add_tail(&peer_req->w.list, &device->done_ee);
2376                         queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);
2377
2378                         err = -ENOENT;
2379                         goto out;
2380                 } else {
2381                         struct drbd_request *req =
2382                                 container_of(i, struct drbd_request, i);
2383
2384                         if (!equal)
2385                                 drbd_alert(device, "Concurrent writes detected: "
2386                                                "local=%llus +%u, remote=%llus +%u\n",
2387                                           (unsigned long long)i->sector, i->size,
2388                                           (unsigned long long)sector, size);
2389
2390                         if (req->rq_state & RQ_LOCAL_PENDING ||
2391                             !(req->rq_state & RQ_POSTPONED)) {
2392                                 /*
2393                                  * Wait for the node with the discard flag to
2394                                  * decide if this request has been superseded
2395                                  * or needs to be retried.
2396                                  * Requests that have been superseded will
2397                                  * disappear from the write_requests tree.
2398                                  *
2399                                  * In addition, wait for the conflicting
2400                                  * request to finish locally before submitting
2401                                  * the conflicting peer request.
2402                                  */
2403                                 err = drbd_wait_misc(device, &req->i);
2404                                 if (err) {
2405                                         _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
2406                                         fail_postponed_requests(device, sector, size);
2407                                         goto out;
2408                                 }
2409                                 goto repeat;
2410                         }
2411                         /*
2412                          * Remember to restart the conflicting requests after
2413                          * the new peer request has completed.
2414                          */
2415                         peer_req->flags |= EE_RESTART_REQUESTS;
2416                 }
2417         }
2418         err = 0;
2419
2420     out:
2421         if (err)
2422                 drbd_remove_epoch_entry_interval(device, peer_req);
2423         return err;
2424 }
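/* Worked example: with RESOLVE_CONFLICTS set, a local request covering
 * sectors [0,16) and an incoming peer write covering [4,12) overlap; the
 * peer write is fully contained, "superseded" is true, and
 * e_send_superseded() will answer it with P_SUPERSEDED.  Had the peer
 * write covered [8,24) instead, it would only partially overlap,
 * "superseded" would be false, and e_send_retry_write() would ask the
 * peer to retry once the local request has completed.
 */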
2425
2426 /* mirrored write */
2427 static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
2428 {
2429         struct drbd_peer_device *peer_device;
2430         struct drbd_device *device;
2431         struct net_conf *nc;
2432         sector_t sector;
2433         struct drbd_peer_request *peer_req;
2434         struct p_data *p = pi->data;
2435         u32 peer_seq = be32_to_cpu(p->seq_num);
2436         int op, op_flags;
2437         u32 dp_flags;
2438         int err, tp;
2439
2440         peer_device = conn_peer_device(connection, pi->vnr);
2441         if (!peer_device)
2442                 return -EIO;
2443         device = peer_device->device;
2444
2445         if (!get_ldev(device)) {
2446                 int err2;
2447
2448                 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2449                 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
2450                 atomic_inc(&connection->current_epoch->epoch_size);
2451                 err2 = drbd_drain_block(peer_device, pi->size);
2452                 if (!err)
2453                         err = err2;
2454                 return err;
2455         }
2456
2457         /*
2458          * Corresponding put_ldev done either below (on various errors), or in
2459          * drbd_peer_request_endio, if we successfully submit the data at the
2460          * end of this function.
2461          */
2462
2463         sector = be64_to_cpu(p->sector);
2464         peer_req = read_in_block(peer_device, p->block_id, sector, pi);
2465         if (!peer_req) {
2466                 put_ldev(device);
2467                 return -EIO;
2468         }
2469
2470         peer_req->w.cb = e_end_block;
2471         peer_req->submit_jif = jiffies;
2472         peer_req->flags |= EE_APPLICATION;
2473
2474         dp_flags = be32_to_cpu(p->dp_flags);
2475         op = wire_flags_to_bio_op(dp_flags);
2476         op_flags = wire_flags_to_bio_flags(dp_flags);
2477         if (pi->cmd == P_TRIM) {
2478                 D_ASSERT(peer_device, peer_req->i.size > 0);
2479                 D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
2480                 D_ASSERT(peer_device, peer_req->pages == NULL);
2481         } else if (peer_req->pages == NULL) {
2482                 D_ASSERT(device, peer_req->i.size == 0);
2483                 D_ASSERT(device, dp_flags & DP_FLUSH);
2484         }
2485
2486         if (dp_flags & DP_MAY_SET_IN_SYNC)
2487                 peer_req->flags |= EE_MAY_SET_IN_SYNC;
2488
2489         spin_lock(&connection->epoch_lock);
2490         peer_req->epoch = connection->current_epoch;
2491         atomic_inc(&peer_req->epoch->epoch_size);
2492         atomic_inc(&peer_req->epoch->active);
2493         spin_unlock(&connection->epoch_lock);
2494
2495         rcu_read_lock();
2496         nc = rcu_dereference(peer_device->connection->net_conf);
2497         tp = nc->two_primaries;
2498         if (peer_device->connection->agreed_pro_version < 100) {
2499                 switch (nc->wire_protocol) {
2500                 case DRBD_PROT_C:
2501                         dp_flags |= DP_SEND_WRITE_ACK;
2502                         break;
2503                 case DRBD_PROT_B:
2504                         dp_flags |= DP_SEND_RECEIVE_ACK;
2505                         break;
2506                 }
2507         }
2508         rcu_read_unlock();
2509
2510         if (dp_flags & DP_SEND_WRITE_ACK) {
2511                 peer_req->flags |= EE_SEND_WRITE_ACK;
2512                 inc_unacked(device);
2513                 /* corresponding dec_unacked() in e_end_block()
2514                  * respective _drbd_clear_done_ee */
2515         }
2516
2517         if (dp_flags & DP_SEND_RECEIVE_ACK) {
2518                 /* I really don't like it that the receiver thread
2519                  * sends on the msock, but anyways */
2520                 drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
2521         }
2522
2523         if (tp) {
2524                 /* two primaries implies protocol C */
2525                 D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
2526                 peer_req->flags |= EE_IN_INTERVAL_TREE;
2527                 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2528                 if (err)
2529                         goto out_interrupted;
2530                 spin_lock_irq(&device->resource->req_lock);
2531                 err = handle_write_conflicts(device, peer_req);
2532                 if (err) {
2533                         spin_unlock_irq(&device->resource->req_lock);
2534                         if (err == -ENOENT) {
2535                                 put_ldev(device);
2536                                 return 0;
2537                         }
2538                         goto out_interrupted;
2539                 }
2540         } else {
2541                 update_peer_seq(peer_device, peer_seq);
2542                 spin_lock_irq(&device->resource->req_lock);
2543         }
2544         /* TRIM and WRITE_SAME are processed synchronously:
2545          * drbd_submit_peer_request() waits for all pending requests,
2546          * i.e. for active_ee to become empty;
2547          * better not add ourselves here. */
2548         if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0)
2549                 list_add_tail(&peer_req->w.list, &device->active_ee);
2550         spin_unlock_irq(&device->resource->req_lock);
2551
2552         if (device->state.conn == C_SYNC_TARGET)
2553                 wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
2554
2555         if (device->state.pdsk < D_INCONSISTENT) {
2556                 /* In case we have the only disk of the cluster: mark the range out of sync and cover it by the activity log. */
2557                 drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
2558                 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
2559                 drbd_al_begin_io(device, &peer_req->i);
2560                 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2561         }
2562
2563         err = drbd_submit_peer_request(device, peer_req, op, op_flags,
2564                                        DRBD_FAULT_DT_WR);
2565         if (!err)
2566                 return 0;
2567
2568         /* don't care for the reason here */
2569         drbd_err(device, "submit failed, triggering re-connect\n");
2570         spin_lock_irq(&device->resource->req_lock);
2571         list_del(&peer_req->w.list);
2572         drbd_remove_epoch_entry_interval(device, peer_req);
2573         spin_unlock_irq(&device->resource->req_lock);
2574         if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
2575                 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
2576                 drbd_al_complete_io(device, &peer_req->i);
2577         }
2578
2579 out_interrupted:
2580         drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
2581         put_ldev(device);
2582         drbd_free_peer_req(device, peer_req);
2583         return err;
2584 }
2585
2586 /* We may throttle resync, if the lower device seems to be busy,
2587  * and current sync rate is above c_min_rate.
2588  *
2589  * To decide whether or not the lower device is busy, we use a scheme similar
2590  * to MD RAID's is_mddev_idle(): if the partition stats reveal "significant"
2591  * activity (more than 64 sectors) that we cannot account for with our own
2592  * resync activity, it obviously is "busy".
2593  *
2594  * The current sync rate used here uses only the most recent two step marks,
2595  * to have a short time average so we can react faster.
2596  */
2597 bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
2598                 bool throttle_if_app_is_waiting)
2599 {
2600         struct lc_element *tmp;
2601         bool throttle = drbd_rs_c_min_rate_throttle(device);
2602
2603         if (!throttle || throttle_if_app_is_waiting)
2604                 return throttle;
2605
2606         spin_lock_irq(&device->al_lock);
2607         tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
2608         if (tmp) {
2609                 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
2610                 if (test_bit(BME_PRIORITY, &bm_ext->flags))
2611                         throttle = false;
2612                 /* Do not slow down if app IO is already waiting for this extent,
2613                  * and our progress is necessary for application IO to complete. */
2614         }
2615         spin_unlock_irq(&device->al_lock);
2616
2617         return throttle;
2618 }
2619
2620 bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
2621 {
2622         struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
2623         unsigned long db, dt, dbdt;
2624         unsigned int c_min_rate;
2625         int curr_events;
2626
2627         rcu_read_lock();
2628         c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
2629         rcu_read_unlock();
2630
2631         /* feature disabled? */
2632         if (c_min_rate == 0)
2633                 return false;
2634
2635         curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2636                       (int)part_stat_read(&disk->part0, sectors[1]) -
2637                         atomic_read(&device->rs_sect_ev);
2638
2639         if (atomic_read(&device->ap_actlog_cnt)
2640             || curr_events - device->rs_last_events > 64) {
2641                 unsigned long rs_left;
2642                 int i;
2643
2644                 device->rs_last_events = curr_events;
2645
2646                 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2647                  * approx. */
2648                 i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
2649
2650                 if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
2651                         rs_left = device->ov_left;
2652                 else
2653                         rs_left = drbd_bm_total_weight(device) - device->rs_failed;
2654
2655                 dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
2656                 if (!dt)
2657                         dt++;
2658                 db = device->rs_mark_left[i] - rs_left;
2659                 dbdt = Bit2KB(db/dt);
2660
2661                 if (dbdt > c_min_rate)
2662                         return true;
2663         }
2664         return false;
2665 }
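/* For illustration (assuming the usual 4 KiB of storage per bitmap bit):
 * if the recent sync marks show db = 25600 bits cleared over dt = 10
 * seconds, dbdt = Bit2KB(25600 / 10) = 10240 KiB/s; with c_min_rate set
 * to, say, 4096 KiB/s this returns true and the resync gets throttled,
 * while c_min_rate = 0 disables the check entirely.
 */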
2666
2667 static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
2668 {
2669         struct drbd_peer_device *peer_device;
2670         struct drbd_device *device;
2671         sector_t sector;
2672         sector_t capacity;
2673         struct drbd_peer_request *peer_req;
2674         struct digest_info *di = NULL;
2675         int size, verb;
2676         unsigned int fault_type;
2677         struct p_block_req *p = pi->data;
2678
2679         peer_device = conn_peer_device(connection, pi->vnr);
2680         if (!peer_device)
2681                 return -EIO;
2682         device = peer_device->device;
2683         capacity = drbd_get_capacity(device->this_bdev);
2684
2685         sector = be64_to_cpu(p->sector);
2686         size   = be32_to_cpu(p->blksize);
2687
2688         if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
2689                 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2690                                 (unsigned long long)sector, size);
2691                 return -EINVAL;
2692         }
2693         if (sector + (size>>9) > capacity) {
2694                 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2695                                 (unsigned long long)sector, size);
2696                 return -EINVAL;
2697         }
2698
2699         if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
2700                 verb = 1;
2701                 switch (pi->cmd) {
2702                 case P_DATA_REQUEST:
2703                         drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
2704                         break;
2705                 case P_RS_THIN_REQ:
2706                 case P_RS_DATA_REQUEST:
2707                 case P_CSUM_RS_REQUEST:
2708                 case P_OV_REQUEST:
2709                         drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
2710                         break;
2711                 case P_OV_REPLY:
2712                         verb = 0;
2713                         dec_rs_pending(device);
2714                         drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
2715                         break;
2716                 default:
2717                         BUG();
2718                 }
2719                 if (verb && __ratelimit(&drbd_ratelimit_state))
2720                         drbd_err(device, "Can not satisfy peer's read request, "
2721                             "no local data.\n");
2722
2723                 /* drain the possibly present payload */
2724                 return drbd_drain_block(peer_device, pi->size);
2725         }
2726
2727         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2728          * "criss-cross" setup, that might cause write-out on some other DRBD,
2729          * which in turn might block on the other node at this very place.  */
2730         peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
2731                         size, GFP_NOIO);
2732         if (!peer_req) {
2733                 put_ldev(device);
2734                 return -ENOMEM;
2735         }
2736
2737         switch (pi->cmd) {
2738         case P_DATA_REQUEST:
2739                 peer_req->w.cb = w_e_end_data_req;
2740                 fault_type = DRBD_FAULT_DT_RD;
2741                 /* application IO, don't drbd_rs_begin_io */
2742                 peer_req->flags |= EE_APPLICATION;
2743                 goto submit;
2744
2745         case P_RS_THIN_REQ:
2746                 /* If at some point in the future we have a smart way to
2747                    find out if this data block is completely deallocated,
2748                    then we would do something smarter here than reading
2749                    the block... */
2750                 peer_req->flags |= EE_RS_THIN_REQ;
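                /* fall through */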
2751         case P_RS_DATA_REQUEST:
2752                 peer_req->w.cb = w_e_end_rsdata_req;
2753                 fault_type = DRBD_FAULT_RS_RD;
2754                 /* used in the sector offset progress display */
2755                 device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2756                 break;
2757
2758         case P_OV_REPLY:
2759         case P_CSUM_RS_REQUEST:
2760                 fault_type = DRBD_FAULT_RS_RD;
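                     /* Single allocation: the digest payload received below is
                      * stored directly behind struct digest_info, see the
                      * di->digest assignment. */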
2761                 di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
2762                 if (!di)
2763                         goto out_free_e;
2764
2765                 di->digest_size = pi->size;
2766                 di->digest = (((char *)di)+sizeof(struct digest_info));
2767
2768                 peer_req->digest = di;
2769                 peer_req->flags |= EE_HAS_DIGEST;
2770
2771                 if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
2772                         goto out_free_e;
2773
2774                 if (pi->cmd == P_CSUM_RS_REQUEST) {
2775                         D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
2776                         peer_req->w.cb = w_e_end_csum_rs_req;
2777                         /* used in the sector offset progress display */
2778                         device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2779                         /* remember to report stats in drbd_resync_finished */
2780                         device->use_csums = true;
2781                 } else if (pi->cmd == P_OV_REPLY) {
2782                         /* track progress, we may need to throttle */
2783                         atomic_add(size >> 9, &device->rs_sect_in);
2784                         peer_req->w.cb = w_e_end_ov_reply;
2785                         dec_rs_pending(device);
2786                         /* drbd_rs_begin_io done when we sent this request,
2787                          * but accounting still needs to be done. */
2788                         goto submit_for_resync;
2789                 }
2790                 break;
2791
2792         case P_OV_REQUEST:
2793                 if (device->ov_start_sector == ~(sector_t)0 &&
2794                     peer_device->connection->agreed_pro_version >= 90) {
2795                         unsigned long now = jiffies;
2796                         int i;
2797                         device->ov_start_sector = sector;
2798                         device->ov_position = sector;
2799                         device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
2800                         device->rs_total = device->ov_left;
2801                         for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2802                                 device->rs_mark_left[i] = device->ov_left;
2803                                 device->rs_mark_time[i] = now;
2804                         }
2805                         drbd_info(device, "Online Verify start sector: %llu\n",
2806                                         (unsigned long long)sector);
2807                 }
2808                 peer_req->w.cb = w_e_end_ov_req;
2809                 fault_type = DRBD_FAULT_RS_RD;
2810                 break;
2811
2812         default:
2813                 BUG();
2814         }
2815
2816         /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2817          * wrt the receiver, but it is not as straightforward as it may seem.
2818          * Various places in the resync start and stop logic assume resync
2819          * requests are processed in order, requeuing this on the worker thread
2820          * introduces a bunch of new code for synchronization between threads.
2821          *
2822          * Unlimited throttling before drbd_rs_begin_io may stall the resync
2823          * "forever", throttling after drbd_rs_begin_io will lock that extent
2824          * for application writes for the same time.  For now, just throttle
2825          * here, where the rest of the code expects the receiver to sleep for
2826          * a while, anyway.
2827          */
2828
2829         /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2830          * this defers syncer requests for some time, before letting at least
2831          * one request through.  The resync controller on the receiving side
2832          * will adapt to the incoming rate accordingly.
2833          *
2834          * We cannot throttle here if remote is Primary/SyncTarget:
2835          * we would also throttle its application reads.
2836          * In that case, throttling is done on the SyncTarget only.
2837          */
2838
2839         /* Even though this may be a resync request, we do add to "read_ee";
2840          * "sync_ee" is only used for resync WRITEs.
2841          * Add to list early, so debugfs can find this request
2842          * even if we have to sleep below. */
2843         spin_lock_irq(&device->resource->req_lock);
2844         list_add_tail(&peer_req->w.list, &device->read_ee);
2845         spin_unlock_irq(&device->resource->req_lock);
2846
2847         update_receiver_timing_details(connection, drbd_rs_should_slow_down);
2848         if (device->state.peer != R_PRIMARY &&
2849             drbd_rs_should_slow_down(device, sector, false))
2850                 schedule_timeout_uninterruptible(HZ/10);
2851         update_receiver_timing_details(connection, drbd_rs_begin_io);
2852         if (drbd_rs_begin_io(device, sector))
2853                 goto out_free_e;
2854
2855 submit_for_resync:
2856         atomic_add(size >> 9, &device->rs_sect_ev);
2857
2858 submit:
2859         update_receiver_timing_details(connection, drbd_submit_peer_request);
2860         inc_unacked(device);
2861         if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0,
2862                                      fault_type) == 0)
2863                 return 0;
2864
2865         /* don't care for the reason here */
2866         drbd_err(device, "submit failed, triggering re-connect\n");
2867
2868 out_free_e:
2869         spin_lock_irq(&device->resource->req_lock);
2870         list_del(&peer_req->w.list);
2871         spin_unlock_irq(&device->resource->req_lock);
2872         /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2873
2874         put_ldev(device);
2875         drbd_free_peer_req(device, peer_req);
2876         return -EIO;
2877 }
2878
2879 /**
2880  * drbd_asb_recover_0p  -  Recover after split-brain with no remaining primaries
2881  */
2882 static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
2883 {
2884         struct drbd_device *device = peer_device->device;
2885         int self, peer, rv = -100;
2886         unsigned long ch_self, ch_peer;
2887         enum drbd_after_sb_p after_sb_0p;
2888
2889         self = device->ldev->md.uuid[UI_BITMAP] & 1;
2890         peer = device->p_uuid[UI_BITMAP] & 1;
2891
2892         ch_peer = device->p_uuid[UI_SIZE];
2893         ch_self = device->comm_bm_set;
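         /* ch_peer/ch_self: number of changed (out-of-sync) blocks on each side;
          * the peer's count arrives in the otherwise unused UI_SIZE uuid slot,
          * comm_bm_set is the count we reported at connect time.  Used below by
          * the least-changes strategies. */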
2894
2895         rcu_read_lock();
2896         after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
2897         rcu_read_unlock();
2898         switch (after_sb_0p) {
2899         case ASB_CONSENSUS:
2900         case ASB_DISCARD_SECONDARY:
2901         case ASB_CALL_HELPER:
2902         case ASB_VIOLENTLY:
2903                 drbd_err(device, "Configuration error.\n");
2904                 break;
2905         case ASB_DISCONNECT:
2906                 break;
2907         case ASB_DISCARD_YOUNGER_PRI:
2908                 if (self == 0 && peer == 1) {
2909                         rv = -1;
2910                         break;
2911                 }
2912                 if (self == 1 && peer == 0) {
2913                         rv =  1;
2914                         break;
2915                 }
2916                 /* Else fall through to one of the other strategies... */
2917         case ASB_DISCARD_OLDER_PRI:
2918                 if (self == 0 && peer == 1) {
2919                         rv = 1;
2920                         break;
2921                 }
2922                 if (self == 1 && peer == 0) {
2923                         rv = -1;
2924                         break;
2925                 }
2926                 /* Else fall through to one of the other strategies... */
2927                 drbd_warn(device, "Discard younger/older primary did not find a decision\n"
2928                      "Using discard-least-changes instead\n");
2929         case ASB_DISCARD_ZERO_CHG:
2930                 if (ch_peer == 0 && ch_self == 0) {
2931                         rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
2932                                 ? -1 : 1;
2933                         break;
2934                 } else {
2935                         if (ch_peer == 0) { rv =  1; break; }
2936                         if (ch_self == 0) { rv = -1; break; }
2937                 }
2938                 if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
2939                         break;
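                        /* else: fall through to discard-least-changes */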
2940         case ASB_DISCARD_LEAST_CHG:
2941                 if      (ch_self < ch_peer)
2942                         rv = -1;
2943                 else if (ch_self > ch_peer)
2944                         rv =  1;
2945                 else /* ( ch_self == ch_peer ) */
2946                      /* Well, then use something else. */
2947                         rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
2948                                 ? -1 : 1;
2949                 break;
2950         case ASB_DISCARD_LOCAL:
2951                 rv = -1;
2952                 break;
2953         case ASB_DISCARD_REMOTE:
2954                 rv =  1;
2955         }
2956
2957         return rv;
2958 }
2959
2960 /**
2961  * drbd_asb_recover_1p  -  Recover after split-brain with one remaining primary
2962  */
2963 static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
2964 {
2965         struct drbd_device *device = peer_device->device;
2966         int hg, rv = -100;
2967         enum drbd_after_sb_p after_sb_1p;
2968
2969         rcu_read_lock();
2970         after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
2971         rcu_read_unlock();
2972         switch (after_sb_1p) {
2973         case ASB_DISCARD_YOUNGER_PRI:
2974         case ASB_DISCARD_OLDER_PRI:
2975         case ASB_DISCARD_LEAST_CHG:
2976         case ASB_DISCARD_LOCAL:
2977         case ASB_DISCARD_REMOTE:
2978         case ASB_DISCARD_ZERO_CHG:
2979                 drbd_err(device, "Configuration error.\n");
2980                 break;
2981         case ASB_DISCONNECT:
2982                 break;
2983         case ASB_CONSENSUS:
2984                 hg = drbd_asb_recover_0p(peer_device);
2985                 if (hg == -1 && device->state.role == R_SECONDARY)
2986                         rv = hg;
2987                 if (hg == 1  && device->state.role == R_PRIMARY)
2988                         rv = hg;
2989                 break;
2990         case ASB_VIOLENTLY:
2991                 rv = drbd_asb_recover_0p(peer_device);
2992                 break;
2993         case ASB_DISCARD_SECONDARY:
2994                 return device->state.role == R_PRIMARY ? 1 : -1;
2995         case ASB_CALL_HELPER:
2996                 hg = drbd_asb_recover_0p(peer_device);
2997                 if (hg == -1 && device->state.role == R_PRIMARY) {
2998                         enum drbd_state_rv rv2;
2999
3000                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
3001                           * we might be here in C_WF_REPORT_PARAMS which is transient.
3002                           * we do not need to wait for the after state change work either. */
3003                         rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
3004                         if (rv2 != SS_SUCCESS) {
3005                                 drbd_khelper(device, "pri-lost-after-sb");
3006                         } else {
3007                                 drbd_warn(device, "Successfully gave up primary role.\n");
3008                                 rv = hg;
3009                         }
3010                 } else
3011                         rv = hg;
3012         }
3013
3014         return rv;
3015 }
3016
3017 /**
3018  * drbd_asb_recover_2p  -  Recover after split-brain with two remaining primaries
3019  */
3020 static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
3021 {
3022         struct drbd_device *device = peer_device->device;
3023         int hg, rv = -100;
3024         enum drbd_after_sb_p after_sb_2p;
3025
3026         rcu_read_lock();
3027         after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
3028         rcu_read_unlock();
3029         switch (after_sb_2p) {
3030         case ASB_DISCARD_YOUNGER_PRI:
3031         case ASB_DISCARD_OLDER_PRI:
3032         case ASB_DISCARD_LEAST_CHG:
3033         case ASB_DISCARD_LOCAL:
3034         case ASB_DISCARD_REMOTE:
3035         case ASB_CONSENSUS:
3036         case ASB_DISCARD_SECONDARY:
3037         case ASB_DISCARD_ZERO_CHG:
3038                 drbd_err(device, "Configuration error.\n");
3039                 break;
3040         case ASB_VIOLENTLY:
3041                 rv = drbd_asb_recover_0p(peer_device);
3042                 break;
3043         case ASB_DISCONNECT:
3044                 break;
3045         case ASB_CALL_HELPER:
3046                 hg = drbd_asb_recover_0p(peer_device);
3047                 if (hg == -1) {
3048                         enum drbd_state_rv rv2;
3049
3050                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
3051                           * we might be here in C_WF_REPORT_PARAMS which is transient.
3052                           * we do not need to wait for the after state change work either. */
3053                         rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
3054                         if (rv2 != SS_SUCCESS) {
3055                                 drbd_khelper(device, "pri-lost-after-sb");
3056                         } else {
3057                                 drbd_warn(device, "Successfully gave up primary role.\n");
3058                                 rv = hg;
3059                         }
3060                 } else
3061                         rv = hg;
3062         }
3063
3064         return rv;
3065 }
3066
3067 static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
3068                            u64 bits, u64 flags)
3069 {
3070         if (!uuid) {
3071                 drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
3072                 return;
3073         }
3074         drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
3075              text,
3076              (unsigned long long)uuid[UI_CURRENT],
3077              (unsigned long long)uuid[UI_BITMAP],
3078              (unsigned long long)uuid[UI_HISTORY_START],
3079              (unsigned long long)uuid[UI_HISTORY_END],
3080              (unsigned long long)bits,
3081              (unsigned long long)flags);
3082 }
3083
3084 /*
3085   100   after split brain try auto recover
3086     2   C_SYNC_SOURCE set BitMap
3087     1   C_SYNC_SOURCE use BitMap
3088     0   no Sync
3089    -1   C_SYNC_TARGET use BitMap
3090    -2   C_SYNC_TARGET set BitMap
3091  -100   after split brain, disconnect
3092 -1000   unrelated data
3093 -1091   requires proto 91
3094 -1096   requires proto 96
3095  */
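/*
 * Example: a return value of -1 (e.g. by rule 50, our current UUID matching the
 * peer's bitmap UUID) makes drbd_sync_handshake() below choose C_WF_BITMAP_T,
 * i.e. we become sync target using the existing bitmap; an absolute value of 2
 * additionally forces a full sync (whole bitmap set).
 */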
3096
3097 static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
3098 {
3099         struct drbd_peer_device *const peer_device = first_peer_device(device);
3100         struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
3101         u64 self, peer;
3102         int i, j;
3103
3104         self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
3105         peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
3106
3107         *rule_nr = 10;
3108         if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
3109                 return 0;
3110
3111         *rule_nr = 20;
3112         if ((self == UUID_JUST_CREATED || self == (u64)0) &&
3113              peer != UUID_JUST_CREATED)
3114                 return -2;
3115
3116         *rule_nr = 30;
3117         if (self != UUID_JUST_CREATED &&
3118             (peer == UUID_JUST_CREATED || peer == (u64)0))
3119                 return 2;
3120
3121         if (self == peer) {
3122                 int rct, dc; /* roles at crash time */
3123
3124                 if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
3125
3126                         if (connection->agreed_pro_version < 91)
3127                                 return -1091;
3128
3129                         if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
3130                             (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
3131                                 drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
3132                                 drbd_uuid_move_history(device);
3133                                 device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
3134                                 device->ldev->md.uuid[UI_BITMAP] = 0;
3135
3136                                 drbd_uuid_dump(device, "self", device->ldev->md.uuid,
3137                                                device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
3138                                 *rule_nr = 34;
3139                         } else {
3140                                 drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
3141                                 *rule_nr = 36;
3142                         }
3143
3144                         return 1;
3145                 }
3146
3147                 if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
3148
3149                         if (connection->agreed_pro_version < 91)
3150                                 return -1091;
3151
3152                         if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
3153                             (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
3154                                 drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
3155
3156                                 device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
3157                                 device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
3158                                 device->p_uuid[UI_BITMAP] = 0UL;
3159
3160                                 drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
3161                                 *rule_nr = 35;
3162                         } else {
3163                                 drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
3164                                 *rule_nr = 37;
3165                         }
3166
3167                         return -1;
3168                 }
3169
3170                 /* Common power [off|failure] */
3171                 rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
3172                         (device->p_uuid[UI_FLAGS] & 2);
3173                 /* lowest bit is set when we were primary,
3174                  * next bit (weight 2) is set when peer was primary */
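                 /* e.g. rct == 3: both this node and the peer were primary at
                  * crash time; arbitrated further down via RESOLVE_CONFLICTS. */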
3175                 *rule_nr = 40;
3176
3177                 /* Neither has the "crashed primary" flag set,
3178                  * only a replication link hiccup. */
3179                 if (rct == 0)
3180                         return 0;
3181
3182                 /* Current UUID equal and no bitmap uuid; does not necessarily
3183                  * mean this was a "simultaneous hard crash", maybe IO was
3184                  * frozen, so no UUID-bump happened.
3185                  * This is a protocol change, overload DRBD_FF_WSAME as flag
3186                  * for "new-enough" peer DRBD version. */
3187                 if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
3188                         *rule_nr = 41;
3189                         if (!(connection->agreed_features & DRBD_FF_WSAME)) {
3190                                 drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
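                                 /* encoded as -(0x10000 | required_proto | (required_features << 8));
                                  * decoded and reported by drbd_sync_handshake() */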
3191                                 return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
3192                         }
3193                         if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
3194                                 /* At least one has the "crashed primary" bit set,
3195                                  * both are primary now, but neither has rotated its UUIDs?
3196                                  * "Can not happen." */
3197                                 drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
3198                                 return -100;
3199                         }
3200                         if (device->state.role == R_PRIMARY)
3201                                 return 1;
3202                         return -1;
3203                 }
3204
3205                 /* Both are secondary.
3206                  * Really looks like recovery from simultaneous hard crash.
3207                  * Check which had been primary before, and arbitrate. */
3208                 switch (rct) {
3209                 case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
3210                 case 1: /*  self_pri && !peer_pri */ return 1;
3211                 case 2: /* !self_pri &&  peer_pri */ return -1;
3212                 case 3: /*  self_pri &&  peer_pri */
3213                         dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
3214                         return dc ? -1 : 1;
3215                 }
3216         }
3217
3218         *rule_nr = 50;
3219         peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
3220         if (self == peer)
3221                 return -1;
3222
3223         *rule_nr = 51;
3224         peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
3225         if (self == peer) {
3226                 if (connection->agreed_pro_version < 96 ?
3227                     (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
3228                     (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
3229                     peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
3230                         /* The last P_SYNC_UUID did not get through. Undo the peer's UUID
3231                            modifications from its last start of resync as sync source. */
3232
3233                         if (connection->agreed_pro_version < 91)
3234                                 return -1091;
3235
3236                         device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
3237                         device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];
3238
3239                         drbd_info(device, "Lost last syncUUID packet, corrected:\n");
3240                         drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
3241
3242                         return -1;
3243                 }
3244         }
3245
3246         *rule_nr = 60;
3247         self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
3248         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
3249                 peer = device->p_uuid[i] & ~((u64)1);
3250                 if (self == peer)
3251                         return -2;
3252         }
3253
3254         *rule_nr = 70;
3255         self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
3256         peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
3257         if (self == peer)
3258                 return 1;
3259
3260         *rule_nr = 71;
3261         self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
3262         if (self == peer) {
3263                 if (connection->agreed_pro_version < 96 ?
3264                     (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
3265                     (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
3266                     self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
3267                         /* The last P_SYNC_UUID did not get through. Undo the modifications
3268                            of our UUIDs from our last start of resync as sync source. */
3269
3270                         if (connection->agreed_pro_version < 91)
3271                                 return -1091;
3272
3273                         __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
3274                         __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);
3275
3276                         drbd_info(device, "Last syncUUID did not get through, corrected:\n");
3277                         drbd_uuid_dump(device, "self", device->ldev->md.uuid,
3278                                        device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
3279
3280                         return 1;
3281                 }
3282         }
3283
3284
3285         *rule_nr = 80;
3286         peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
3287         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
3288                 self = device->ldev->md.uuid[i] & ~((u64)1);
3289                 if (self == peer)
3290                         return 2;
3291         }
3292
3293         *rule_nr = 90;
3294         self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
3295         peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
3296         if (self == peer && self != ((u64)0))
3297                 return 100;
3298
3299         *rule_nr = 100;
3300         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
3301                 self = device->ldev->md.uuid[i] & ~((u64)1);
3302                 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
3303                         peer = device->p_uuid[j] & ~((u64)1);
3304                         if (self == peer)
3305                                 return -100;
3306                 }
3307         }
3308
3309         return -1000;
3310 }
3311
3312 /* drbd_sync_handshake() returns the new conn state on success, or
3313    C_MASK on failure.
3314  */
3315 static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
3316                                            enum drbd_role peer_role,
3317                                            enum drbd_disk_state peer_disk) __must_hold(local)
3318 {
3319         struct drbd_device *device = peer_device->device;
3320         enum drbd_conns rv = C_MASK;
3321         enum drbd_disk_state mydisk;
3322         struct net_conf *nc;
3323         int hg, rule_nr, rr_conflict, tentative;
3324
3325         mydisk = device->state.disk;
3326         if (mydisk == D_NEGOTIATING)
3327                 mydisk = device->new_state_tmp.disk;
3328
3329         drbd_info(device, "drbd_sync_handshake:\n");
3330
3331         spin_lock_irq(&device->ldev->md.uuid_lock);
3332         drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
3333         drbd_uuid_dump(device, "peer", device->p_uuid,
3334                        device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
3335
3336         hg = drbd_uuid_compare(device, peer_role, &rule_nr);
3337         spin_unlock_irq(&device->ldev->md.uuid_lock);
3338
3339         drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
3340
3341         if (hg == -1000) {
3342                 drbd_alert(device, "Unrelated data, aborting!\n");
3343                 return C_MASK;
3344         }
3345         if (hg < -0x10000) {
3346                 int proto, fflags;
3347                 hg = -hg;
3348                 proto = hg & 0xff;
3349                 fflags = (hg >> 8) & 0xff;
3350                 drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n",
3351                                         proto, fflags);
3352                 return C_MASK;
3353         }
3354         if (hg < -1000) {
3355                 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
3356                 return C_MASK;
3357         }
3358
3359         if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
3360             (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
3361                 int f = (hg == -100) || abs(hg) == 2;
3362                 hg = mydisk > D_INCONSISTENT ? 1 : -1;
3363                 if (f)
3364                         hg = hg*2;
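                 /* |hg| == 2 requests a full sync (whole bitmap set) instead of a
                  * bitmap based resync, see the rule table above drbd_uuid_compare() */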
3365                 drbd_info(device, "Becoming sync %s due to disk states.\n",
3366                      hg > 0 ? "source" : "target");
3367         }
3368
3369         if (abs(hg) == 100)
3370                 drbd_khelper(device, "initial-split-brain");
3371
3372         rcu_read_lock();
3373         nc = rcu_dereference(peer_device->connection->net_conf);
3374
3375         if (hg == 100 || (hg == -100 && nc->always_asbp)) {
3376                 int pcount = (device->state.role == R_PRIMARY)
3377                            + (peer_role == R_PRIMARY);
3378                 int forced = (hg == -100);
3379
3380                 switch (pcount) {
3381                 case 0:
3382                         hg = drbd_asb_recover_0p(peer_device);
3383                         break;
3384                 case 1:
3385                         hg = drbd_asb_recover_1p(peer_device);
3386                         break;
3387                 case 2:
3388                         hg = drbd_asb_recover_2p(peer_device);
3389                         break;
3390                 }
3391                 if (abs(hg) < 100) {
3392                         drbd_warn(device, "Split-Brain detected, %d primaries, "
3393                              "automatically solved. Sync from %s node\n",
3394                              pcount, (hg < 0) ? "peer" : "this");
3395                         if (forced) {
3396                                 drbd_warn(device, "Doing a full sync, since"
3397                                      " UUIDs were ambiguous.\n");
3398                                 hg = hg*2;
3399                         }
3400                 }
3401         }
3402
3403         if (hg == -100) {
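                 /* bit 0 of the peer's UI_FLAGS carries its discard-my-data setting */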
3404                 if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
3405                         hg = -1;
3406                 if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
3407                         hg = 1;
3408
3409                 if (abs(hg) < 100)
3410                         drbd_warn(device, "Split-Brain detected, manually solved. "
3411                              "Sync from %s node\n",
3412                              (hg < 0) ? "peer" : "this");
3413         }
3414         rr_conflict = nc->rr_conflict;
3415         tentative = nc->tentative;
3416         rcu_read_unlock();
3417
3418         if (hg == -100) {
3419                 /* FIXME this log message is not correct if we end up here
3420                  * after an attempted attach on a diskless node.
3421                  * We just refuse to attach -- well, we drop the "connection"
3422                  * to that disk, in a way... */
3423                 drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
3424                 drbd_khelper(device, "split-brain");
3425                 return C_MASK;
3426         }
3427
3428         if (hg > 0 && mydisk <= D_INCONSISTENT) {
3429                 drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
3430                 return C_MASK;
3431         }
3432
3433         if (hg < 0 && /* by intention we do not use mydisk here. */
3434             device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
3435                 switch (rr_conflict) {
3436                 case ASB_CALL_HELPER:
3437                         drbd_khelper(device, "pri-lost");
3438                         /* fall through */
3439                 case ASB_DISCONNECT:
3440                         drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
3441                         return C_MASK;
3442                 case ASB_VIOLENTLY:
3443                         drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
3444                              " assumption\n");
3445                 }
3446         }
3447
3448         if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
3449                 if (hg == 0)
3450                         drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
3451                 else
3452                         drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.\n",
3453                                  drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
3454                                  abs(hg) >= 2 ? "full" : "bit-map based");
3455                 return C_MASK;
3456         }
3457
3458         if (abs(hg) >= 2) {
3459                 drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
3460                 if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
3461                                         BM_LOCKED_SET_ALLOWED))
3462                         return C_MASK;
3463         }
3464
3465         if (hg > 0) { /* become sync source. */
3466                 rv = C_WF_BITMAP_S;
3467         } else if (hg < 0) { /* become sync target */
3468                 rv = C_WF_BITMAP_T;
3469         } else {
3470                 rv = C_CONNECTED;
3471                 if (drbd_bm_total_weight(device)) {
3472                         drbd_info(device, "No resync, but %lu bits in bitmap!\n",
3473                              drbd_bm_total_weight(device));
3474                 }
3475         }
3476
3477         return rv;
3478 }
3479
3480 static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
3481 {
3482         /* the peer's ASB_DISCARD_REMOTE pairs with our ASB_DISCARD_LOCAL; that is valid */
3483         if (peer == ASB_DISCARD_REMOTE)
3484                 return ASB_DISCARD_LOCAL;
3485
3486         /* any other combination involving ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL is invalid */
3487         if (peer == ASB_DISCARD_LOCAL)
3488                 return ASB_DISCARD_REMOTE;
3489
3490         /* everything else is valid if they are equal on both sides. */
3491         return peer;
3492 }
3493
3494 static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
3495 {
3496         struct p_protocol *p = pi->data;
3497         enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
3498         int p_proto, p_discard_my_data, p_two_primaries, cf;
3499         struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
3500         char integrity_alg[SHARED_SECRET_MAX] = "";
3501         struct crypto_ahash *peer_integrity_tfm = NULL;
3502         void *int_dig_in = NULL, *int_dig_vv = NULL;
3503
3504         p_proto         = be32_to_cpu(p->protocol);
3505         p_after_sb_0p   = be32_to_cpu(p->after_sb_0p);
3506         p_after_sb_1p   = be32_to_cpu(p->after_sb_1p);
3507         p_after_sb_2p   = be32_to_cpu(p->after_sb_2p);
3508         p_two_primaries = be32_to_cpu(p->two_primaries);
3509         cf              = be32_to_cpu(p->conn_flags);
3510         p_discard_my_data = cf & CF_DISCARD_MY_DATA;
3511
3512         if (connection->agreed_pro_version >= 87) {
3513                 int err;
3514
3515                 if (pi->size > sizeof(integrity_alg))
3516                         return -EIO;
3517                 err = drbd_recv_all(connection, integrity_alg, pi->size);
3518                 if (err)
3519                         return err;
3520                 integrity_alg[SHARED_SECRET_MAX - 1] = 0;
3521         }
3522
3523         if (pi->cmd != P_PROTOCOL_UPDATE) {
3524                 clear_bit(CONN_DRY_RUN, &connection->flags);
3525
3526                 if (cf & CF_DRY_RUN)
3527                         set_bit(CONN_DRY_RUN, &connection->flags);
3528
3529                 rcu_read_lock();
3530                 nc = rcu_dereference(connection->net_conf);
3531
3532                 if (p_proto != nc->wire_protocol) {
3533                         drbd_err(connection, "incompatible %s settings\n", "protocol");
3534                         goto disconnect_rcu_unlock;
3535                 }
3536
3537                 if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
3538                         drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
3539                         goto disconnect_rcu_unlock;
3540                 }
3541
3542                 if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
3543                         drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
3544                         goto disconnect_rcu_unlock;
3545                 }
3546
3547                 if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
3548                         drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
3549                         goto disconnect_rcu_unlock;
3550                 }
3551
3552                 if (p_discard_my_data && nc->discard_my_data) {
3553                         drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
3554                         goto disconnect_rcu_unlock;
3555                 }
3556
3557                 if (p_two_primaries != nc->two_primaries) {
3558                         drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
3559                         goto disconnect_rcu_unlock;
3560                 }
3561
3562                 if (strcmp(integrity_alg, nc->integrity_alg)) {
3563                         drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
3564                         goto disconnect_rcu_unlock;
3565                 }
3566
3567                 rcu_read_unlock();
3568         }
3569
3570         if (integrity_alg[0]) {
3571                 int hash_size;
3572
3573                 /*
3574                  * We can only change the peer data integrity algorithm
3575                  * here.  Changing our own data integrity algorithm
3576                  * requires that we send a P_PROTOCOL_UPDATE packet at
3577                  * the same time; otherwise, the peer has no way to
3578                  * tell between which packets the algorithm should
3579                  * change.
3580                  */
3581
3582                 peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
3583                 if (IS_ERR(peer_integrity_tfm)) {
3584                         peer_integrity_tfm = NULL;
3585                         drbd_err(connection, "peer data-integrity-alg %s not supported\n",
3586                                  integrity_alg);
3587                         goto disconnect;
3588                 }
3589
3590                 hash_size = crypto_ahash_digestsize(peer_integrity_tfm);
3591                 int_dig_in = kmalloc(hash_size, GFP_KERNEL);
3592                 int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
3593                 if (!(int_dig_in && int_dig_vv)) {
3594                         drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
3595                         goto disconnect;
3596                 }
3597         }
3598
3599         new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
3600         if (!new_net_conf) {
3601                 drbd_err(connection, "Allocation of new net_conf failed\n");
3602                 goto disconnect;
3603         }
3604
3605         mutex_lock(&connection->data.mutex);
3606         mutex_lock(&connection->resource->conf_update);
3607         old_net_conf = connection->net_conf;
3608         *new_net_conf = *old_net_conf;
3609
3610         new_net_conf->wire_protocol = p_proto;
3611         new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
3612         new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
3613         new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
3614         new_net_conf->two_primaries = p_two_primaries;
3615
3616         rcu_assign_pointer(connection->net_conf, new_net_conf);
3617         mutex_unlock(&connection->resource->conf_update);
3618         mutex_unlock(&connection->data.mutex);
3619
3620         crypto_free_ahash(connection->peer_integrity_tfm);
3621         kfree(connection->int_dig_in);
3622         kfree(connection->int_dig_vv);
3623         connection->peer_integrity_tfm = peer_integrity_tfm;
3624         connection->int_dig_in = int_dig_in;
3625         connection->int_dig_vv = int_dig_vv;
3626
3627         if (strcmp(old_net_conf->integrity_alg, integrity_alg))
3628                 drbd_info(connection, "peer data-integrity-alg: %s\n",
3629                           integrity_alg[0] ? integrity_alg : "(none)");
3630
3631         synchronize_rcu();
3632         kfree(old_net_conf);
3633         return 0;
3634
3635 disconnect_rcu_unlock:
3636         rcu_read_unlock();
3637 disconnect:
3638         crypto_free_ahash(peer_integrity_tfm);
3639         kfree(int_dig_in);
3640         kfree(int_dig_vv);
3641         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
3642         return -EIO;
3643 }
3644
3645 /* helper function
3646  * input: alg name, feature name
3647  * return: NULL (alg name was "")
3648  *         ERR_PTR(error) if something goes wrong
3649  *         or the crypto hash ptr, if it worked out ok. */
3650 static struct crypto_ahash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
3651                 const char *alg, const char *name)
3652 {
3653         struct crypto_ahash *tfm;
3654
3655         if (!alg[0])
3656                 return NULL;
3657
3658         tfm = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC);
3659         if (IS_ERR(tfm)) {
3660                 drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
3661                         alg, name, PTR_ERR(tfm));
3662                 return tfm;
3663         }
3664         return tfm;
3665 }
3666
3667 static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
3668 {
3669         void *buffer = connection->data.rbuf;
3670         int size = pi->size;
3671
3672         while (size) {
3673                 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
3674                 s = drbd_recv(connection, buffer, s);
3675                 if (s <= 0) {
3676                         if (s < 0)
3677                                 return s;
3678                         break;
3679                 }
3680                 size -= s;
3681         }
3682         if (size)
3683                 return -EIO;
3684         return 0;
3685 }
3686
3687 /*
3688  * config_unknown_volume  -  device configuration command for unknown volume
3689  *
3690  * When a device is added to an existing connection, the node on which the
3691  * device is added first will send configuration commands to its peer but the
3692  * peer will not know about the device yet.  It will warn and ignore these
3693  * commands.  Once the device is added on the second node, the second node will
3694  * send the same device configuration commands, but in the other direction.
3695  *
3696  * (We can also end up here if drbd is misconfigured.)
3697  */
3698 static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
3699 {
3700         drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
3701                   cmdname(pi->cmd), pi->vnr);
3702         return ignore_remaining_packet(connection, pi);
3703 }
3704
3705 static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
3706 {
3707         struct drbd_peer_device *peer_device;
3708         struct drbd_device *device;
3709         struct p_rs_param_95 *p;
3710         unsigned int header_size, data_size, exp_max_sz;
3711         struct crypto_ahash *verify_tfm = NULL;
3712         struct crypto_ahash *csums_tfm = NULL;
3713         struct net_conf *old_net_conf, *new_net_conf = NULL;
3714         struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
3715         const int apv = connection->agreed_pro_version;
3716         struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
3717         int fifo_size = 0;
3718         int err;
3719
3720         peer_device = conn_peer_device(connection, pi->vnr);
3721         if (!peer_device)
3722                 return config_unknown_volume(connection, pi);
3723         device = peer_device->device;
3724
3725         exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
3726                     : apv == 88 ? sizeof(struct p_rs_param)
3727                                         + SHARED_SECRET_MAX
3728                     : apv <= 94 ? sizeof(struct p_rs_param_89)
3729                     : /* apv >= 95 */ sizeof(struct p_rs_param_95);
3730
3731         if (pi->size > exp_max_sz) {
3732                 drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
3733                     pi->size, exp_max_sz);
3734                 return -EIO;
3735         }
3736
3737         if (apv <= 88) {
3738                 header_size = sizeof(struct p_rs_param);
3739                 data_size = pi->size - header_size;
3740         } else if (apv <= 94) {
3741                 header_size = sizeof(struct p_rs_param_89);
3742                 data_size = pi->size - header_size;
3743                 D_ASSERT(device, data_size == 0);
3744         } else {
3745                 header_size = sizeof(struct p_rs_param_95);
3746                 data_size = pi->size - header_size;
3747                 D_ASSERT(device, data_size == 0);
3748         }
3749
3750         /* initialize verify_alg and csums_alg */
3751         p = pi->data;
3752         memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
3753
3754         err = drbd_recv_all(peer_device->connection, p, header_size);
3755         if (err)
3756                 return err;
3757
3758         mutex_lock(&connection->resource->conf_update);
3759         old_net_conf = peer_device->connection->net_conf;
3760         if (get_ldev(device)) {
3761                 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3762                 if (!new_disk_conf) {
3763                         put_ldev(device);
3764                         mutex_unlock(&connection->resource->conf_update);
3765                         drbd_err(device, "Allocation of new disk_conf failed\n");
3766                         return -ENOMEM;
3767                 }
3768
3769                 old_disk_conf = device->ldev->disk_conf;
3770                 *new_disk_conf = *old_disk_conf;
3771
3772                 new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
3773         }
3774
3775         if (apv >= 88) {
3776                 if (apv == 88) {
3777                         if (data_size > SHARED_SECRET_MAX || data_size == 0) {
3778                                 drbd_err(device, "verify-alg of wrong size, "
3779                                         "peer wants %u, accepting only up to %u bytes\n",
3780                                         data_size, SHARED_SECRET_MAX);
3781                                 err = -EIO;
3782                                 goto reconnect;
3783                         }
3784
3785                         err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
3786                         if (err)
3787                                 goto reconnect;
3788                         /* we expect NUL terminated string */
3789                         /* but just in case someone tries to be evil */
3790                         D_ASSERT(device, p->verify_alg[data_size-1] == 0);
3791                         p->verify_alg[data_size-1] = 0;
3792
3793                 } else /* apv >= 89 */ {
3794                         /* we still expect NUL terminated strings */
3795                         /* but just in case someone tries to be evil */
3796                         D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3797                         D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
3798                         p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3799                         p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3800                 }
3801
3802                 if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
3803                         if (device->state.conn == C_WF_REPORT_PARAMS) {
3804                                 drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
3805                                     old_net_conf->verify_alg, p->verify_alg);
3806                                 goto disconnect;
3807                         }
3808                         verify_tfm = drbd_crypto_alloc_digest_safe(device,
3809                                         p->verify_alg, "verify-alg");
3810                         if (IS_ERR(verify_tfm)) {
3811                                 verify_tfm = NULL;
3812                                 goto disconnect;
3813                         }
3814                 }
3815
3816                 if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
3817                         if (device->state.conn == C_WF_REPORT_PARAMS) {
3818                                 drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
3819                                     old_net_conf->csums_alg, p->csums_alg);
3820                                 goto disconnect;
3821                         }
3822                         csums_tfm = drbd_crypto_alloc_digest_safe(device,
3823                                         p->csums_alg, "csums-alg");
3824                         if (IS_ERR(csums_tfm)) {
3825                                 csums_tfm = NULL;
3826                                 goto disconnect;
3827                         }
3828                 }
3829
3830                 if (apv > 94 && new_disk_conf) {
3831                         new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
3832                         new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
3833                         new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
3834                         new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
3835
3836                         fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
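                         /* c_plan_ahead is configured in units of 0.1 seconds;
                          * assuming SLEEP_TIME is one controller step (HZ/10),
                          * this sizes the plan fifo to one slot per step of the
                          * plan-ahead window. */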
3837                         if (fifo_size != device->rs_plan_s->size) {
3838                                 new_plan = fifo_alloc(fifo_size);
3839                                 if (!new_plan) {
3840                                         drbd_err(device, "kmalloc of fifo_buffer failed\n");
3841                                         put_ldev(device);
3842                                         goto disconnect;
3843                                 }
3844                         }
3845                 }
3846
3847                 if (verify_tfm || csums_tfm) {
3848                         new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
3849                         if (!new_net_conf) {
3850                                 drbd_err(device, "Allocation of new net_conf failed\n");
3851                                 goto disconnect;
3852                         }
3853
3854                         *new_net_conf = *old_net_conf;
3855
3856                         if (verify_tfm) {
3857                                 strcpy(new_net_conf->verify_alg, p->verify_alg);
3858                                 new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
3859                                 crypto_free_ahash(peer_device->connection->verify_tfm);
3860                                 peer_device->connection->verify_tfm = verify_tfm;
3861                                 drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
3862                         }
3863                         if (csums_tfm) {
3864                                 strcpy(new_net_conf->csums_alg, p->csums_alg);
3865                                 new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
3866                                 crypto_free_ahash(peer_device->connection->csums_tfm);
3867                                 peer_device->connection->csums_tfm = csums_tfm;
3868                                 drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
3869                         }
3870                         rcu_assign_pointer(connection->net_conf, new_net_conf);
3871                 }
3872         }
3873
3874         if (new_disk_conf) {
3875                 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
3876                 put_ldev(device);
3877         }
3878
3879         if (new_plan) {
3880                 old_plan = device->rs_plan_s;
3881                 rcu_assign_pointer(device->rs_plan_s, new_plan);
3882         }
3883
3884         mutex_unlock(&connection->resource->conf_update);
3885         synchronize_rcu();
3886         if (new_net_conf)
3887                 kfree(old_net_conf);
3888         kfree(old_disk_conf);
3889         kfree(old_plan);
3890
3891         return 0;
3892
3893 reconnect:
3894         if (new_disk_conf) {
3895                 put_ldev(device);
3896                 kfree(new_disk_conf);
3897         }
3898         mutex_unlock(&connection->resource->conf_update);
3899         return -EIO;
3900
3901 disconnect:
3902         kfree(new_plan);
3903         if (new_disk_conf) {
3904                 put_ldev(device);
3905                 kfree(new_disk_conf);
3906         }
3907         mutex_unlock(&connection->resource->conf_update);
3908         /* just for completeness: actually not needed,
3909          * as this is not reached if csums_tfm was ok. */
3910         crypto_free_ahash(csums_tfm);
3911         /* but free the verify_tfm again, if csums_tfm did not work out */
3912         crypto_free_ahash(verify_tfm);
3913         conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
3914         return -EIO;
3915 }
3916
3917 /* warn if the arguments differ by more than 12.5% */
3918 static void warn_if_differ_considerably(struct drbd_device *device,
3919         const char *s, sector_t a, sector_t b)
3920 {
3921         sector_t d;
3922         if (a == 0 || b == 0)
3923                 return;
3924         d = (a > b) ? (a - b) : (b - a);
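         /* a>>3 == a/8 == 12.5% of a */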
3925         if (d > (a>>3) || d > (b>>3))
3926                 drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
3927                      (unsigned long long)a, (unsigned long long)b);
3928 }
3929
3930 static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
3931 {
3932         struct drbd_peer_device *peer_device;
3933         struct drbd_device *device;
3934         struct p_sizes *p = pi->data;
3935         struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
3936         enum determine_dev_size dd = DS_UNCHANGED;
3937         sector_t p_size, p_usize, p_csize, my_usize;
3938         int ldsc = 0; /* local disk size changed */
3939         enum dds_flags ddsf;
3940
3941         peer_device = conn_peer_device(connection, pi->vnr);
3942         if (!peer_device)
3943                 return config_unknown_volume(connection, pi);
3944         device = peer_device->device;
3945
3946         p_size = be64_to_cpu(p->d_size);
3947         p_usize = be64_to_cpu(p->u_size);
3948         p_csize = be64_to_cpu(p->c_size);
3949
3950         /* just store the peer's disk size for now.
3951          * we still need to figure out whether we accept that. */
3952         device->p_size = p_size;
3953
3954         if (get_ldev(device)) {
3955                 sector_t new_size, cur_size;
3956                 rcu_read_lock();
3957                 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
3958                 rcu_read_unlock();
3959
3960                 warn_if_differ_considerably(device, "lower level device sizes",
3961                            p_size, drbd_get_max_capacity(device->ldev));
3962                 warn_if_differ_considerably(device, "user requested size",
3963                                             p_usize, my_usize);
3964
3965                 /* if this is the first connect, or an otherwise expected
3966                  * param exchange, choose the minimum */
3967                 if (device->state.conn == C_WF_REPORT_PARAMS)
3968                         p_usize = min_not_zero(my_usize, p_usize);
3969
3970                 /* Never shrink a device with usable data during connect.
3971                    But allow online shrinking if we are connected. */
3972                 new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
3973                 cur_size = drbd_get_capacity(device->this_bdev);
3974                 if (new_size < cur_size &&
3975                     device->state.disk >= D_OUTDATED &&
3976                     device->state.conn < C_CONNECTED) {
3977                         drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
3978                                         (unsigned long long)new_size, (unsigned long long)cur_size);
3979                         conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
3980                         put_ldev(device);
3981                         return -EIO;
3982                 }
3983
3984                 if (my_usize != p_usize) {
3985                         struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
3986
3987                         new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3988                         if (!new_disk_conf) {
3989                                 drbd_err(device, "Allocation of new disk_conf failed\n");
3990                                 put_ldev(device);
3991                                 return -ENOMEM;
3992                         }
3993
3994                         mutex_lock(&connection->resource->conf_update);
3995                         old_disk_conf = device->ldev->disk_conf;
3996                         *new_disk_conf = *old_disk_conf;
3997                         new_disk_conf->disk_size = p_usize;
3998
3999                         rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
4000                         mutex_unlock(&connection->resource->conf_update);
4001                         synchronize_rcu();
4002                         kfree(old_disk_conf);
4003
4004                         drbd_info(device, "Peer sets u_size to %lu sectors\n",
4005                                  (unsigned long)p_usize);
4006                 }
4007
4008                 put_ldev(device);
4009         }
4010
4011         device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
4012         /* Keep the call to drbd_reconsider_queue_parameters() before drbd_determine_dev_size().
4013            If drbd_reconsider_queue_parameters() cleared QUEUE_FLAG_DISCARD from
4014            our queue, we can then be sure that after
4015            drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
4016
4017         ddsf = be16_to_cpu(p->dds_flags);
4018         if (get_ldev(device)) {
4019                 drbd_reconsider_queue_parameters(device, device->ldev, o);
4020                 dd = drbd_determine_dev_size(device, ddsf, NULL);
4021                 put_ldev(device);
4022                 if (dd == DS_ERROR)
4023                         return -EIO;
4024                 drbd_md_sync(device);
4025         } else {
4026                 /*
4027                  * I am diskless, need to accept the peer's *current* size.
4028                  * I must NOT accept the peer's backing disk size,
4029                  * it may have been larger than mine all along...
4030                  *
4031                  * At this point, the peer knows more about my disk, or at
4032                  * least about what we last agreed upon, than myself.
4033                  * So if his c_size is less than his d_size, the most likely
4034                  * reason is that *my* d_size was smaller last time we checked.
4035                  *
4036                  * However, if he sends a zero current size,
4037                  * take his (user-capped or) backing disk size anyways.
4038                  */
4039                 drbd_reconsider_queue_parameters(device, NULL, o);
4040                 drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
4041         }
4042
4043         if (get_ldev(device)) {
4044                 if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
4045                         device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
4046                         ldsc = 1;
4047                 }
4048
4049                 put_ldev(device);
4050         }
4051
4052         if (device->state.conn > C_WF_REPORT_PARAMS) {
4053                 if (be64_to_cpu(p->c_size) !=
4054                     drbd_get_capacity(device->this_bdev) || ldsc) {
4055                         /* we have different sizes, probably peer
4056                          * needs to know my new size... */
4057                         drbd_send_sizes(peer_device, 0, ddsf);
4058                 }
4059                 if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
4060                     (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
4061                         if (device->state.pdsk >= D_INCONSISTENT &&
4062                             device->state.disk >= D_INCONSISTENT) {
4063                                 if (ddsf & DDSF_NO_RESYNC)
4064                                         drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
4065                                 else
4066                                         resync_after_online_grow(device);
4067                         } else
4068                                 set_bit(RESYNC_AFTER_NEG, &device->flags);
4069                 }
4070         }
4071
4072         return 0;
4073 }
4074
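/*
 * P_UUIDS: take over the peer's UUID set.  A Primary without usable local
 * data refuses to connect to a peer whose current UUID does not match the
 * data UUID it currently exposes.  If our current UUID was just created,
 * the peer speaks protocol >= 90 and sets the matching flag bit in
 * UI_FLAGS, skip the initial sync: clear the bitmap and go
 * UpToDate/UpToDate right away.
 */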
4075 static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
4076 {
4077         struct drbd_peer_device *peer_device;
4078         struct drbd_device *device;
4079         struct p_uuids *p = pi->data;
4080         u64 *p_uuid;
4081         int i, updated_uuids = 0;
4082
4083         peer_device = conn_peer_device(connection, pi->vnr);
4084         if (!peer_device)
4085                 return config_unknown_volume(connection, pi);
4086         device = peer_device->device;
4087
4088         p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
4089         if (!p_uuid) {
4090                 drbd_err(device, "kmalloc of p_uuid failed\n");
4091                 return -ENOMEM;
4092         }
4093
4094         for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
4095                 p_uuid[i] = be64_to_cpu(p->uuid[i]);
4096
4097         kfree(device->p_uuid);
4098         device->p_uuid = p_uuid;
4099
4100         if (device->state.conn < C_CONNECTED &&
4101             device->state.disk < D_INCONSISTENT &&
4102             device->state.role == R_PRIMARY &&
4103             (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
4104                 drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
4105                     (unsigned long long)device->ed_uuid);
4106                 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4107                 return -EIO;
4108         }
4109
4110         if (get_ldev(device)) {
4111                 int skip_initial_sync =
4112                         device->state.conn == C_CONNECTED &&
4113                         peer_device->connection->agreed_pro_version >= 90 &&
4114                         device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
4115                         (p_uuid[UI_FLAGS] & 8);
4116                 if (skip_initial_sync) {
4117                         drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
4118                         drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
4119                                         "clear_n_write from receive_uuids",
4120                                         BM_LOCKED_TEST_ALLOWED);
4121                         _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
4122                         _drbd_uuid_set(device, UI_BITMAP, 0);
4123                         _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
4124                                         CS_VERBOSE, NULL);
4125                         drbd_md_sync(device);
4126                         updated_uuids = 1;
4127                 }
4128                 put_ldev(device);
4129         } else if (device->state.disk < D_INCONSISTENT &&
4130                    device->state.role == R_PRIMARY) {
4131                 /* I am a diskless primary, the peer just created a new current UUID
4132                    for me. */
4133                 updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
4134         }
4135
4136         /* Before we test the disk state, wait until any ongoing cluster-wide
4137            state change has finished. That is important if we are primary and
4138            are detaching from our disk. We need to see the
4139            new disk state... */
4140         mutex_lock(device->state_mutex);
4141         mutex_unlock(device->state_mutex);
4142         if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
4143                 updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
4144
4145         if (updated_uuids)
4146                 drbd_print_uuids(device, "receiver updated UUIDs to");
4147
4148         return 0;
4149 }
4150
4151 /**
4152  * convert_state() - Converts the peer's view of the cluster state to our point of view
4153  * @ps:         The state as seen by the peer.
4154  */
4155 static union drbd_state convert_state(union drbd_state ps)
4156 {
4157         union drbd_state ms;
4158
4159         static enum drbd_conns c_tab[] = {
4160                 [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
4161                 [C_CONNECTED] = C_CONNECTED,
4162
4163                 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
4164                 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
4165                 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
4166                 [C_VERIFY_S]       = C_VERIFY_T,
4167                 [C_MASK]   = C_MASK,
4168         };
4169
4170         ms.i = ps.i;
4171
4172         ms.conn = c_tab[ps.conn];
4173         ms.peer = ps.role;
4174         ms.role = ps.peer;
4175         ms.pdsk = ps.disk;
4176         ms.disk = ps.pdsk;
4177         ms.peer_isp = (ps.aftr_isp | ps.user_isp);
4178
4179         return ms;
4180 }
4181
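/*
 * P_STATE_CHG_REQ: the peer asks us to perform a state change on its
 * behalf.  Convert mask/val from the peer's point of view to ours, apply
 * the change, and answer with the result via drbd_send_sr_reply().  If we
 * are the node that resolves conflicts and a local state change is already
 * in progress, answer SS_CONCURRENT_ST_CHG instead.
 */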
4182 static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
4183 {
4184         struct drbd_peer_device *peer_device;
4185         struct drbd_device *device;
4186         struct p_req_state *p = pi->data;
4187         union drbd_state mask, val;
4188         enum drbd_state_rv rv;
4189
4190         peer_device = conn_peer_device(connection, pi->vnr);
4191         if (!peer_device)
4192                 return -EIO;
4193         device = peer_device->device;
4194
4195         mask.i = be32_to_cpu(p->mask);
4196         val.i = be32_to_cpu(p->val);
4197
4198         if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
4199             mutex_is_locked(device->state_mutex)) {
4200                 drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
4201                 return 0;
4202         }
4203
4204         mask = convert_state(mask);
4205         val = convert_state(val);
4206
4207         rv = drbd_change_state(device, CS_VERBOSE, mask, val);
4208         drbd_send_sr_reply(peer_device, rv);
4209
4210         drbd_md_sync(device);
4211
4212         return 0;
4213 }
4214
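/*
 * P_CONN_ST_CHG_REQ: same as above, but for a connection-wide state
 * change; the request is applied with CS_LOCAL_ONLY and answered through
 * conn_send_sr_reply().
 */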
4215 static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
4216 {
4217         struct p_req_state *p = pi->data;
4218         union drbd_state mask, val;
4219         enum drbd_state_rv rv;
4220
4221         mask.i = be32_to_cpu(p->mask);
4222         val.i = be32_to_cpu(p->val);
4223
4224         if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
4225             mutex_is_locked(&connection->cstate_mutex)) {
4226                 conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
4227                 return 0;
4228         }
4229
4230         mask = convert_state(mask);
4231         val = convert_state(val);
4232
4233         rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
4234         conn_send_sr_reply(connection, rv);
4235
4236         return 0;
4237 }
4238
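/*
 * P_STATE: reconcile the peer's reported state with our own view.  This
 * may finish a resync or online verify, decide (via drbd_sync_handshake())
 * whether a new resync is needed after a connect or attach, and finally
 * commits the combined state with _drbd_set_state().  Irreconcilable
 * situations end in C_DISCONNECTING or C_PROTOCOL_ERROR.
 */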
4239 static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
4240 {
4241         struct drbd_peer_device *peer_device;
4242         struct drbd_device *device;
4243         struct p_state *p = pi->data;
4244         union drbd_state os, ns, peer_state;
4245         enum drbd_disk_state real_peer_disk;
4246         enum chg_state_flags cs_flags;
4247         int rv;
4248
4249         peer_device = conn_peer_device(connection, pi->vnr);
4250         if (!peer_device)
4251                 return config_unknown_volume(connection, pi);
4252         device = peer_device->device;
4253
4254         peer_state.i = be32_to_cpu(p->state);
4255
4256         real_peer_disk = peer_state.disk;
4257         if (peer_state.disk == D_NEGOTIATING) {
4258                 real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
4259                 drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
4260         }
4261
4262         spin_lock_irq(&device->resource->req_lock);
4263  retry:
4264         os = ns = drbd_read_state(device);
4265         spin_unlock_irq(&device->resource->req_lock);
4266
4267         /* If some other part of the code (ack_receiver thread, timeout)
4268          * already decided to close the connection again,
4269          * we must not "re-establish" it here. */
4270         if (os.conn <= C_TEAR_DOWN)
4271                 return -ECONNRESET;
4272
4273         /* If this is the "end of sync" confirmation, usually the peer disk
4274          * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
4275          * set) resync started in PausedSyncT, or if the timing of pause-/
4276          * unpause-sync events has been "just right", the peer disk may
4277          * transition from D_CONSISTENT to D_UP_TO_DATE as well.
4278          */
4279         if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
4280             real_peer_disk == D_UP_TO_DATE &&
4281             os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
4282                 /* If we are (becoming) SyncSource, but peer is still in sync
4283                  * preparation, ignore its uptodate-ness to avoid flapping, it
4284                  * will change to inconsistent once the peer reaches active
4285                  * syncing states.
4286                  * It may have changed syncer-paused flags, however, so we
4287                  * cannot ignore this completely. */
4288                 if (peer_state.conn > C_CONNECTED &&
4289                     peer_state.conn < C_SYNC_SOURCE)
4290                         real_peer_disk = D_INCONSISTENT;
4291
4292                 /* if peer_state changes to connected at the same time,
4293                  * it explicitly notifies us that it finished resync.
4294                  * Maybe we should finish it up, too? */
4295                 else if (os.conn >= C_SYNC_SOURCE &&
4296                          peer_state.conn == C_CONNECTED) {
4297                         if (drbd_bm_total_weight(device) <= device->rs_failed)
4298                                 drbd_resync_finished(device);
4299                         return 0;
4300                 }
4301         }
4302
4303         /* explicit verify finished notification, stop sector reached. */
4304         if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
4305             peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
4306                 ov_out_of_sync_print(device);
4307                 drbd_resync_finished(device);
4308                 return 0;
4309         }
4310
4311         /* peer says his disk is inconsistent, while we think it is uptodate,
4312          * and this happens while the peer still thinks we have a sync going on,
4313          * but we think we are already done with the sync.
4314          * We ignore this to avoid flapping pdsk.
4315          * This should not happen, if the peer is a recent version of drbd. */
4316         if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
4317             os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
4318                 real_peer_disk = D_UP_TO_DATE;
4319
4320         if (ns.conn == C_WF_REPORT_PARAMS)
4321                 ns.conn = C_CONNECTED;
4322
4323         if (peer_state.conn == C_AHEAD)
4324                 ns.conn = C_BEHIND;
4325
4326         if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
4327             get_ldev_if_state(device, D_NEGOTIATING)) {
4328                 int cr; /* consider resync */
4329
4330                 /* if we established a new connection */
4331                 cr  = (os.conn < C_CONNECTED);
4332                 /* if we had an established connection
4333                  * and one of the nodes newly attaches a disk */
4334                 cr |= (os.conn == C_CONNECTED &&
4335                        (peer_state.disk == D_NEGOTIATING ||
4336                         os.disk == D_NEGOTIATING));
4337                 /* if we have both been inconsistent, and the peer has been
4338                  * forced to be UpToDate with --overwrite-data */
4339                 cr |= test_bit(CONSIDER_RESYNC, &device->flags);
4340                 /* if we had been plain connected, and the admin requested to
4341                  * start a sync by "invalidate" or "invalidate-remote" */
4342                 cr |= (os.conn == C_CONNECTED &&
4343                                 (peer_state.conn >= C_STARTING_SYNC_S &&
4344                                  peer_state.conn <= C_WF_BITMAP_T));
4345
4346                 if (cr)
4347                         ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
4348
4349                 put_ldev(device);
4350                 if (ns.conn == C_MASK) {
4351                         ns.conn = C_CONNECTED;
4352                         if (device->state.disk == D_NEGOTIATING) {
4353                                 drbd_force_state(device, NS(disk, D_FAILED));
4354                         } else if (peer_state.disk == D_NEGOTIATING) {
4355                                 drbd_err(device, "Disk attach process on the peer node was aborted.\n");
4356                                 peer_state.disk = D_DISKLESS;
4357                                 real_peer_disk = D_DISKLESS;
4358                         } else {
4359                                 if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
4360                                         return -EIO;
4361                                 D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
4362                                 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4363                                 return -EIO;
4364                         }
4365                 }
4366         }
4367
4368         spin_lock_irq(&device->resource->req_lock);
4369         if (os.i != drbd_read_state(device).i)
4370                 goto retry;
4371         clear_bit(CONSIDER_RESYNC, &device->flags);
4372         ns.peer = peer_state.role;
4373         ns.pdsk = real_peer_disk;
4374         ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
4375         if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
4376                 ns.disk = device->new_state_tmp.disk;
4377         cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
4378         if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
4379             test_bit(NEW_CUR_UUID, &device->flags)) {
4380                 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
4381                    for temporary network outages! */
4382                 spin_unlock_irq(&device->resource->req_lock);
4383                 drbd_err(device, "Aborting Connect, cannot thaw IO with a merely Consistent peer\n");
4384                 tl_clear(peer_device->connection);
4385                 drbd_uuid_new_current(device);
4386                 clear_bit(NEW_CUR_UUID, &device->flags);
4387                 conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
4388                 return -EIO;
4389         }
4390         rv = _drbd_set_state(device, ns, cs_flags, NULL);
4391         ns = drbd_read_state(device);
4392         spin_unlock_irq(&device->resource->req_lock);
4393
4394         if (rv < SS_SUCCESS) {
4395                 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4396                 return -EIO;
4397         }
4398
4399         if (os.conn > C_WF_REPORT_PARAMS) {
4400                 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
4401                     peer_state.disk != D_NEGOTIATING) {
4402                         /* we want resync, peer has not yet decided to sync... */
4403                         /* Nowadays only used when forcing a node into primary role and
4404                            setting its disk to UpToDate with that */
4405                         drbd_send_uuids(peer_device);
4406                         drbd_send_current_state(peer_device);
4407                 }
4408         }
4409
4410         clear_bit(DISCARD_MY_DATA, &device->flags);
4411
4412         drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
4413
4414         return 0;
4415 }
4416
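/*
 * P_SYNC_UUID: the sync source announces the UUID the resync will be based
 * on.  Wait until we have reached C_WF_SYNC_UUID or C_BEHIND (or the
 * connection/disk degraded meanwhile), adopt the UUID as our current one
 * without rotating the history, and start the resync as SyncTarget.
 */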
4417 static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
4418 {
4419         struct drbd_peer_device *peer_device;
4420         struct drbd_device *device;
4421         struct p_rs_uuid *p = pi->data;
4422
4423         peer_device = conn_peer_device(connection, pi->vnr);
4424         if (!peer_device)
4425                 return -EIO;
4426         device = peer_device->device;
4427
4428         wait_event(device->misc_wait,
4429                    device->state.conn == C_WF_SYNC_UUID ||
4430                    device->state.conn == C_BEHIND ||
4431                    device->state.conn < C_CONNECTED ||
4432                    device->state.disk < D_NEGOTIATING);
4433
4434         /* D_ASSERT(device,  device->state.conn == C_WF_SYNC_UUID ); */
4435
4436         /* Here the _drbd_uuid_ functions are right, current should
4437            _not_ be rotated into the history */
4438         if (get_ldev_if_state(device, D_NEGOTIATING)) {
4439                 _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
4440                 _drbd_uuid_set(device, UI_BITMAP, 0UL);
4441
4442                 drbd_print_uuids(device, "updated sync uuid");
4443                 drbd_start_resync(device, C_SYNC_TARGET);
4444
4445                 put_ldev(device);
4446         } else
4447                 drbd_err(device, "Ignoring SyncUUID packet!\n");
4448
4449         return 0;
4450 }
4451
4452 /**
4453  * receive_bitmap_plain
4454  *
4455  * Return 0 when done, 1 when another iteration is needed, and a negative error
4456  * code upon failure.
4457  */
4458 static int
4459 receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
4460                      unsigned long *p, struct bm_xfer_ctx *c)
4461 {
4462         unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
4463                                  drbd_header_size(peer_device->connection);
4464         unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
4465                                        c->bm_words - c->word_offset);
4466         unsigned int want = num_words * sizeof(*p);
4467         int err;
4468
4469         if (want != size) {
4470                 drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
4471                 return -EIO;
4472         }
4473         if (want == 0)
4474                 return 0;
4475         err = drbd_recv_all(peer_device->connection, p, want);
4476         if (err)
4477                 return err;
4478
4479         drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
4480
4481         c->word_offset += num_words;
4482         c->bit_offset = c->word_offset * BITS_PER_LONG;
4483         if (c->bit_offset > c->bm_bits)
4484                 c->bit_offset = c->bm_bits;
4485
4486         return 1;
4487 }
4488
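/* The encoding byte of a p_compressed_bm packs three fields:
 * bits 0-3: bitmap encoding (enum drbd_bitmap_code),
 * bits 4-6: pad bits handed to bitstream_init(),
 * bit 7:    value of the first run (the "start toggle").
 */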
4489 static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
4490 {
4491         return (enum drbd_bitmap_code)(p->encoding & 0x0f);
4492 }
4493
4494 static int dcbp_get_start(struct p_compressed_bm *p)
4495 {
4496         return (p->encoding & 0x80) != 0;
4497 }
4498
4499 static int dcbp_get_pad_bits(struct p_compressed_bm *p)
4500 {
4501         return (p->encoding >> 4) & 0x7;
4502 }
4503
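/* The compressed bitmap payload is a sequence of VLI encoded run lengths,
 * alternating between runs of clear and of set bits; the value of the first
 * run comes from the start toggle in the encoding byte.  The decoder below
 * keeps a 64 bit look-ahead that it refills from the bit stream after each
 * run, and applies set runs with _drbd_bm_set_bits().
 */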
4504 /**
4505  * recv_bm_rle_bits
4506  *
4507  * Return 0 when done, 1 when another iteration is needed, and a negative error
4508  * code upon failure.
4509  */
4510 static int
4511 recv_bm_rle_bits(struct drbd_peer_device *peer_device,
4512                 struct p_compressed_bm *p,
4513                  struct bm_xfer_ctx *c,
4514                  unsigned int len)
4515 {
4516         struct bitstream bs;
4517         u64 look_ahead;
4518         u64 rl;
4519         u64 tmp;
4520         unsigned long s = c->bit_offset;
4521         unsigned long e;
4522         int toggle = dcbp_get_start(p);
4523         int have;
4524         int bits;
4525
4526         bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
4527
4528         bits = bitstream_get_bits(&bs, &look_ahead, 64);
4529         if (bits < 0)
4530                 return -EIO;
4531
4532         for (have = bits; have > 0; s += rl, toggle = !toggle) {
4533                 bits = vli_decode_bits(&rl, look_ahead);
4534                 if (bits <= 0)
4535                         return -EIO;
4536
4537                 if (toggle) {
4538                         e = s + rl - 1;
4539                         if (e >= c->bm_bits) {
4540                                 drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
4541                                 return -EIO;
4542                         }
4543                         _drbd_bm_set_bits(peer_device->device, s, e);
4544                 }
4545
4546                 if (have < bits) {
4547                         drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
4548                                 have, bits, look_ahead,
4549                                 (unsigned int)(bs.cur.b - p->code),
4550                                 (unsigned int)bs.buf_len);
4551                         return -EIO;
4552                 }
4553                 /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
4554                 if (likely(bits < 64))
4555                         look_ahead >>= bits;
4556                 else
4557                         look_ahead = 0;
4558                 have -= bits;
4559
4560                 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
4561                 if (bits < 0)
4562                         return -EIO;
4563                 look_ahead |= tmp << have;
4564                 have += bits;
4565         }
4566
4567         c->bit_offset = s;
4568         bm_xfer_ctx_bit_to_word_offset(c);
4569
4570         return (s != c->bm_bits);
4571 }
4572
4573 /**
4574  * decode_bitmap_c
4575  *
4576  * Return 0 when done, 1 when another iteration is needed, and a negative error
4577  * code upon failure.
4578  */
4579 static int
4580 decode_bitmap_c(struct drbd_peer_device *peer_device,
4581                 struct p_compressed_bm *p,
4582                 struct bm_xfer_ctx *c,
4583                 unsigned int len)
4584 {
4585         if (dcbp_get_code(p) == RLE_VLI_Bits)
4586                 return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
4587
4588         /* other variants had been implemented for evaluation,
4589          * but have been dropped as this one turned out to be "best"
4590          * during all our tests. */
4591
4592         drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
4593         conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
4594         return -EIO;
4595 }
4596
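/* Log how well the bitmap exchange compressed: compare the bytes actually
 * transferred (plain + RLE) against what an uncompressed transfer would
 * have needed, and print the saving in tenths of a percent.
 */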
4597 void INFO_bm_xfer_stats(struct drbd_device *device,
4598                 const char *direction, struct bm_xfer_ctx *c)
4599 {
4600         /* what would it take to transfer it "plaintext" */
4601         unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
4602         unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
4603         unsigned int plain =
4604                 header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
4605                 c->bm_words * sizeof(unsigned long);
4606         unsigned int total = c->bytes[0] + c->bytes[1];
4607         unsigned int r;
4608
4609         /* total cannot be zero, but just in case: */
4610         if (total == 0)
4611                 return;
4612
4613         /* don't report if not compressed */
4614         if (total >= plain)
4615                 return;
4616
4617         /* total < plain. check for overflow, still */
4618         r = (total > UINT_MAX/1000) ? (total / (plain/1000))
4619                                     : (1000 * total / plain);
4620
4621         if (r > 1000)
4622                 r = 1000;
4623
4624         r = 1000 - r;
4625         drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
4626              "total %u; compression: %u.%u%%\n",
4627                         direction,
4628                         c->bytes[1], c->packets[1],
4629                         c->bytes[0], c->packets[0],
4630                         total, r/10, r % 10);
4631 }
4632
4633 /* Since we are processing the bitfield from lower addresses to higher,
4634    it does not matter whether we process it in 32 bit chunks or 64 bit
4635    chunks as long as it is little endian. (Understand it as a byte stream,
4636    beginning with the lowest byte...) If we used big endian,
4637    we would need to process it from the highest address to the lowest,
4638    in order to be agnostic to the 32 vs 64 bit issue.
4639
4640    Returns 0 on success, or a negative error code on failure. */
4641 static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
4642 {
4643         struct drbd_peer_device *peer_device;
4644         struct drbd_device *device;
4645         struct bm_xfer_ctx c;
4646         int err;
4647
4648         peer_device = conn_peer_device(connection, pi->vnr);
4649         if (!peer_device)
4650                 return -EIO;
4651         device = peer_device->device;
4652
4653         drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
4654         /* you are supposed to send additional out-of-sync information
4655          * if you actually set bits during this phase */
4656
4657         c = (struct bm_xfer_ctx) {
4658                 .bm_bits = drbd_bm_bits(device),
4659                 .bm_words = drbd_bm_words(device),
4660         };
4661
4662         for (;;) {
4663                 if (pi->cmd == P_BITMAP)
4664                         err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
4665                 else if (pi->cmd == P_COMPRESSED_BITMAP) {
4666                         /* MAYBE: sanity check that we speak proto >= 90,
4667                          * and the feature is enabled! */
4668                         struct p_compressed_bm *p = pi->data;
4669
4670                         if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
4671                                 drbd_err(device, "ReportCBitmap packet too large\n");
4672                                 err = -EIO;
4673                                 goto out;
4674                         }
4675                         if (pi->size <= sizeof(*p)) {
4676                                 drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
4677                                 err = -EIO;
4678                                 goto out;
4679                         }
4680                         err = drbd_recv_all(peer_device->connection, p, pi->size);
4681                         if (err)
4682                                goto out;
4683                         err = decode_bitmap_c(peer_device, p, &c, pi->size);
4684                 } else {
4685                         drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)\n", pi->cmd);
4686                         err = -EIO;
4687                         goto out;
4688                 }
4689
4690                 c.packets[pi->cmd == P_BITMAP]++;
4691                 c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
4692
4693                 if (err <= 0) {
4694                         if (err < 0)
4695                                 goto out;
4696                         break;
4697                 }
4698                 err = drbd_recv_header(peer_device->connection, pi);
4699                 if (err)
4700                         goto out;
4701         }
4702
4703         INFO_bm_xfer_stats(device, "receive", &c);
4704
4705         if (device->state.conn == C_WF_BITMAP_T) {
4706                 enum drbd_state_rv rv;
4707
4708                 err = drbd_send_bitmap(device);
4709                 if (err)
4710                         goto out;
4711                 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
4712                 rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
4713                 D_ASSERT(device, rv == SS_SUCCESS);
4714         } else if (device->state.conn != C_WF_BITMAP_S) {
4715                 /* admin may have requested C_DISCONNECTING,
4716                  * other threads may have noticed network errors */
4717                 drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
4718                     drbd_conn_str(device->state.conn));
4719         }
4720         err = 0;
4721
4722  out:
4723         drbd_bm_unlock(device);
4724         if (!err && device->state.conn == C_WF_BITMAP_S)
4725                 drbd_start_resync(device, C_SYNC_SOURCE);
4726         return err;
4727 }
4728
4729 static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
4730 {
4731         drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
4732                  pi->cmd, pi->size);
4733
4734         return ignore_remaining_packet(connection, pi);
4735 }
4736
4737 static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
4738 {
4739         /* Make sure we've acked all the TCP data associated
4740          * with the data requests being unplugged */
4741         drbd_tcp_quickack(connection->data.socket);
4742
4743         return 0;
4744 }
4745
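/*
 * P_OUT_OF_SYNC: the peer marks a range as out of sync (sent e.g. while it
 * is Ahead and we are Behind); record it in our local bitmap.  Other
 * connection states only trigger an assertion-style error message.
 */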
4746 static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
4747 {
4748         struct drbd_peer_device *peer_device;
4749         struct drbd_device *device;
4750         struct p_block_desc *p = pi->data;
4751
4752         peer_device = conn_peer_device(connection, pi->vnr);
4753         if (!peer_device)
4754                 return -EIO;
4755         device = peer_device->device;
4756
4757         switch (device->state.conn) {
4758         case C_WF_SYNC_UUID:
4759         case C_WF_BITMAP_T:
4760         case C_BEHIND:
4761                 break;
4762         default:
4763                 drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
4764                                 drbd_conn_str(device->state.conn));
4765         }
4766
4767         drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
4768
4769         return 0;
4770 }
4771
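/*
 * P_RS_DEALLOCATED (thin resync): instead of shipping zeroed-out data, the
 * sync source announces that this range is deallocated.  Submit a
 * REQ_OP_WRITE_ZEROES locally and ack it like a normal resync write; if we
 * have no disk or the submission fails, reply with a negative ack.
 */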
4772 static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
4773 {
4774         struct drbd_peer_device *peer_device;
4775         struct p_block_desc *p = pi->data;
4776         struct drbd_device *device;
4777         sector_t sector;
4778         int size, err = 0;
4779
4780         peer_device = conn_peer_device(connection, pi->vnr);
4781         if (!peer_device)
4782                 return -EIO;
4783         device = peer_device->device;
4784
4785         sector = be64_to_cpu(p->sector);
4786         size = be32_to_cpu(p->blksize);
4787
4788         dec_rs_pending(device);
4789
4790         if (get_ldev(device)) {
4791                 struct drbd_peer_request *peer_req;
4792                 const int op = REQ_OP_WRITE_ZEROES;
4793
4794                 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
4795                                                size, 0, GFP_NOIO);
4796                 if (!peer_req) {
4797                         put_ldev(device);
4798                         return -ENOMEM;
4799                 }
4800
4801                 peer_req->w.cb = e_end_resync_block;
4802                 peer_req->submit_jif = jiffies;
4803                 peer_req->flags |= EE_IS_TRIM;
4804
4805                 spin_lock_irq(&device->resource->req_lock);
4806                 list_add_tail(&peer_req->w.list, &device->sync_ee);
4807                 spin_unlock_irq(&device->resource->req_lock);
4808
4809                 atomic_add(pi->size >> 9, &device->rs_sect_ev);
4810                 err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR);
4811
4812                 if (err) {
4813                         spin_lock_irq(&device->resource->req_lock);
4814                         list_del(&peer_req->w.list);
4815                         spin_unlock_irq(&device->resource->req_lock);
4816
4817                         drbd_free_peer_req(device, peer_req);
4818                         put_ldev(device);
4819                         err = 0;
4820                         goto fail;
4821                 }
4822
4823                 inc_unacked(device);
4824
4825                 /* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
4826                    as well as drbd_rs_complete_io() */
4827         } else {
4828         fail:
4829                 drbd_rs_complete_io(device, sector);
4830                 drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
4831         }
4832
4833         atomic_add(size >> 9, &device->rs_sect_in);
4834
4835         return err;
4836 }
4837
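/* Dispatch table for the data socket: for each packet type, whether a
 * payload beyond the sub-header is allowed, the size of that sub-header,
 * and the handler to call.
 */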
4838 struct data_cmd {
4839         int expect_payload;
4840         unsigned int pkt_size;
4841         int (*fn)(struct drbd_connection *, struct packet_info *);
4842 };
4843
4844 static struct data_cmd drbd_cmd_handler[] = {
4845         [P_DATA]            = { 1, sizeof(struct p_data), receive_Data },
4846         [P_DATA_REPLY]      = { 1, sizeof(struct p_data), receive_DataReply },
4847         [P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply },
4848         [P_BARRIER]         = { 0, sizeof(struct p_barrier), receive_Barrier },
4849         [P_BITMAP]          = { 1, 0, receive_bitmap },
4850         [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap },
4851         [P_UNPLUG_REMOTE]   = { 0, 0, receive_UnplugRemote },
4852         [P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
4853         [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4854         [P_SYNC_PARAM]      = { 1, 0, receive_SyncParam },
4855         [P_SYNC_PARAM89]    = { 1, 0, receive_SyncParam },
4856         [P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
4857         [P_UUIDS]           = { 0, sizeof(struct p_uuids), receive_uuids },
4858         [P_SIZES]           = { 0, sizeof(struct p_sizes), receive_sizes },
4859         [P_STATE]           = { 0, sizeof(struct p_state), receive_state },
4860         [P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
4861         [P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
4862         [P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
4863         [P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
4864         [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4865         [P_RS_THIN_REQ]     = { 0, sizeof(struct p_block_req), receive_DataRequest },
4866         [P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
4867         [P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
4868         [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
4869         [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
4870         [P_TRIM]            = { 0, sizeof(struct p_trim), receive_Data },
4871         [P_RS_DEALLOCATED]  = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
4872         [P_WSAME]           = { 1, sizeof(struct p_wsame), receive_Data },
4873 };
4874
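/* Main loop of the receiver thread: read a packet header, validate the
 * announced size against the dispatch table, read the sub-header, and call
 * the handler.  Any error drops the connection into C_PROTOCOL_ERROR.
 */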
4875 static void drbdd(struct drbd_connection *connection)
4876 {
4877         struct packet_info pi;
4878         size_t shs; /* sub header size */
4879         int err;
4880
4881         while (get_t_state(&connection->receiver) == RUNNING) {
4882                 struct data_cmd const *cmd;
4883
4884                 drbd_thread_current_set_cpu(&connection->receiver);
4885                 update_receiver_timing_details(connection, drbd_recv_header);
4886                 if (drbd_recv_header(connection, &pi))
4887                         goto err_out;
4888
4889                 cmd = &drbd_cmd_handler[pi.cmd];
4890                 if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
4891                         drbd_err(connection, "Unexpected data packet %s (0x%04x)\n",
4892                                  cmdname(pi.cmd), pi.cmd);
4893                         goto err_out;
4894                 }
4895
4896                 shs = cmd->pkt_size;
4897                 if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
4898                         shs += sizeof(struct o_qlim);
4899                 if (pi.size > shs && !cmd->expect_payload) {
4900                         drbd_err(connection, "No payload expected %s l:%d\n",
4901                                  cmdname(pi.cmd), pi.size);
4902                         goto err_out;
4903                 }
4904                 if (pi.size < shs) {
4905                         drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
4906                                  cmdname(pi.cmd), (int)shs, pi.size);
4907                         goto err_out;
4908                 }
4909
4910                 if (shs) {
4911                         update_receiver_timing_details(connection, drbd_recv_all_warn);
4912                         err = drbd_recv_all_warn(connection, pi.data, shs);
4913                         if (err)
4914                                 goto err_out;
4915                         pi.size -= shs;
4916                 }
4917
4918                 update_receiver_timing_details(connection, cmd->fn);
4919                 err = cmd->fn(connection, &pi);
4920                 if (err) {
4921                         drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
4922                                  cmdname(pi.cmd), err, pi.size);
4923                         goto err_out;
4924                 }
4925         }
4926         return;
4927
4928     err_out:
4929         conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
4930 }
4931
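/* Tear down a lost connection: stop the ack receiver and ack sender, close
 * the sockets, run the per-volume cleanup in drbd_disconnected(), possibly
 * try to outdate the peer, and move the connection to C_UNCONNECTED (or all
 * the way to C_STANDALONE if an administrative disconnect was requested).
 */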
4932 static void conn_disconnect(struct drbd_connection *connection)
4933 {
4934         struct drbd_peer_device *peer_device;
4935         enum drbd_conns oc;
4936         int vnr;
4937
4938         if (connection->cstate == C_STANDALONE)
4939                 return;
4940
4941         /* We are about to start the cleanup after connection loss.
4942          * Make sure drbd_make_request knows about that.
4943          * Usually we should be in some network failure state already,
4944          * but just in case we are not, we fix it up here.
4945          */
4946         conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
4947
4948         /* ack_receiver does not clean up anything. it must not interfere, either */
4949         drbd_thread_stop(&connection->ack_receiver);
4950         if (connection->ack_sender) {
4951                 destroy_workqueue(connection->ack_sender);
4952                 connection->ack_sender = NULL;
4953         }
4954         drbd_free_sock(connection);
4955
4956         rcu_read_lock();
4957         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
4958                 struct drbd_device *device = peer_device->device;
4959                 kref_get(&device->kref);
4960                 rcu_read_unlock();
4961                 drbd_disconnected(peer_device);
4962                 kref_put(&device->kref, drbd_destroy_device);
4963                 rcu_read_lock();
4964         }
4965         rcu_read_unlock();
4966
4967         if (!list_empty(&connection->current_epoch->list))
4968                 drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
4969         /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
4970         atomic_set(&connection->current_epoch->epoch_size, 0);
4971         connection->send.seen_any_write_yet = false;
4972
4973         drbd_info(connection, "Connection closed\n");
4974
4975         if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
4976                 conn_try_outdate_peer_async(connection);
4977
4978         spin_lock_irq(&connection->resource->req_lock);
4979         oc = connection->cstate;
4980         if (oc >= C_UNCONNECTED)
4981                 _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
4982
4983         spin_unlock_irq(&connection->resource->req_lock);
4984
4985         if (oc == C_DISCONNECTING)
4986                 conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
4987 }
4988
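/* Per-volume cleanup after a connection was lost: wait for in-flight peer
 * requests, cancel resync bookkeeping, flush the sender work queue, forget
 * the peer's UUIDs, clear the transfer log unless IO is suspended, and
 * write out the bitmap.
 */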
4989 static int drbd_disconnected(struct drbd_peer_device *peer_device)
4990 {
4991         struct drbd_device *device = peer_device->device;
4992         unsigned int i;
4993
4994         /* wait for current activity to cease. */
4995         spin_lock_irq(&device->resource->req_lock);
4996         _drbd_wait_ee_list_empty(device, &device->active_ee);
4997         _drbd_wait_ee_list_empty(device, &device->sync_ee);
4998         _drbd_wait_ee_list_empty(device, &device->read_ee);
4999         spin_unlock_irq(&device->resource->req_lock);
5000
5001         /* We do not have data structures that would allow us to
5002          * get the rs_pending_cnt down to 0 again.
5003          *  * On C_SYNC_TARGET we do not have any data structures describing
5004          *    the pending RSDataRequest's we have sent.
5005          *  * On C_SYNC_SOURCE there is no data structure that tracks
5006          *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
5007          *  And no, it is not the sum of the reference counts in the
5008          *  resync_LRU. The resync_LRU tracks the whole operation including
5009          *  the disk-IO, while the rs_pending_cnt only tracks the blocks
5010          *  on the fly. */
5011         drbd_rs_cancel_all(device);
5012         device->rs_total = 0;
5013         device->rs_failed = 0;
5014         atomic_set(&device->rs_pending_cnt, 0);
5015         wake_up(&device->misc_wait);
5016
5017         del_timer_sync(&device->resync_timer);
5018         resync_timer_fn((unsigned long)device);
5019
5020         /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
5021          * w_make_resync_request etc. which may still be on the worker queue
5022          * to be "canceled" */
5023         drbd_flush_workqueue(&peer_device->connection->sender_work);
5024
5025         drbd_finish_peer_reqs(device);
5026
5027         /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
5028            might have queued work again. The one before drbd_finish_peer_reqs() is
5029            necessary to reclaim net_ee in drbd_finish_peer_reqs(). */
5030         drbd_flush_workqueue(&peer_device->connection->sender_work);
5031
5032         /* need to do it again, drbd_finish_peer_reqs() may have populated it
5033          * again via drbd_try_clear_on_disk_bm(). */
5034         drbd_rs_cancel_all(device);
5035
5036         kfree(device->p_uuid);
5037         device->p_uuid = NULL;
5038
5039         if (!drbd_suspended(device))
5040                 tl_clear(peer_device->connection);
5041
5042         drbd_md_sync(device);
5043
5044         if (get_ldev(device)) {
5045                 drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
5046                                 "write from disconnected", BM_LOCKED_CHANGE_ALLOWED);
5047                 put_ldev(device);
5048         }
5049
5050         /* tcp_close and release of sendpage pages can be deferred.  I don't
5051          * want to use SO_LINGER, because apparently it can be deferred for
5052          * more than 20 seconds (longest time I checked).
5053          *
5054          * Actually we don't care for exactly when the network stack does its
5055          * put_page(), but release our reference on these pages right here.
5056          */
5057         i = drbd_free_peer_reqs(device, &device->net_ee);
5058         if (i)
5059                 drbd_info(device, "net_ee not empty, killed %u entries\n", i);
5060         i = atomic_read(&device->pp_in_use_by_net);
5061         if (i)
5062                 drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
5063         i = atomic_read(&device->pp_in_use);
5064         if (i)
5065                 drbd_info(device, "pp_in_use = %d, expected 0\n", i);
5066
5067         D_ASSERT(device, list_empty(&device->read_ee));
5068         D_ASSERT(device, list_empty(&device->active_ee));
5069         D_ASSERT(device, list_empty(&device->sync_ee));
5070         D_ASSERT(device, list_empty(&device->done_ee));
5071
5072         return 0;
5073 }
5074
5075 /*
5076  * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
5077  * we can agree on is stored in agreed_pro_version.
5078  *
5079  * feature flags and the reserved array should be enough room for future
5080  * enhancements of the handshake protocol, and possible plugins...
5081  *
5082  * for now, they are expected to be zero, but ignored.
5083  */
5084 static int drbd_send_features(struct drbd_connection *connection)
5085 {
5086         struct drbd_socket *sock;
5087         struct p_connection_features *p;
5088
5089         sock = &connection->data;
5090         p = conn_prepare_command(connection, sock);
5091         if (!p)
5092                 return -EIO;
5093         memset(p, 0, sizeof(*p));
5094         p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
5095         p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
5096         p->feature_flags = cpu_to_be32(PRO_FEATURES);
5097         return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
5098 }
5099
5100 /*
5101  * return values:
5102  *   1 yes, we have a valid connection
5103  *   0 oops, did not work out, please try again
5104  *  -1 peer talks different language,
5105  *     no point in trying again, please go standalone.
5106  */
5107 static int drbd_do_features(struct drbd_connection *connection)
5108 {
5109         /* ASSERT current == connection->receiver ... */
5110         struct p_connection_features *p;
5111         const int expect = sizeof(struct p_connection_features);
5112         struct packet_info pi;
5113         int err;
5114
5115         err = drbd_send_features(connection);
5116         if (err)
5117                 return 0;
5118
5119         err = drbd_recv_header(connection, &pi);
5120         if (err)
5121                 return 0;
5122
5123         if (pi.cmd != P_CONNECTION_FEATURES) {
5124                 drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
5125                          cmdname(pi.cmd), pi.cmd);
5126                 return -1;
5127         }
5128
5129         if (pi.size != expect) {
5130                 drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
5131                      expect, pi.size);
5132                 return -1;
5133         }
5134
5135         p = pi.data;
5136         err = drbd_recv_all_warn(connection, p, expect);
5137         if (err)
5138                 return 0;
5139
5140         p->protocol_min = be32_to_cpu(p->protocol_min);
5141         p->protocol_max = be32_to_cpu(p->protocol_max);
5142         if (p->protocol_max == 0)
5143                 p->protocol_max = p->protocol_min;
5144
5145         if (PRO_VERSION_MAX < p->protocol_min ||
5146             PRO_VERSION_MIN > p->protocol_max)
5147                 goto incompat;
5148
5149         connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
5150         connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
5151
5152         drbd_info(connection, "Handshake successful: "
5153              "Agreed network protocol version %d\n", connection->agreed_pro_version);
5154
5155         drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n",
5156                   connection->agreed_features,
5157                   connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
5158                   connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
5159                   connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" :
5160                   connection->agreed_features ? "" : " none");
5161
5162         return 1;
5163
5164  incompat:
5165         drbd_err(connection, "incompatible DRBD dialects: "
5166             "I support %d-%d, peer supports %d-%d\n",
5167             PRO_VERSION_MIN, PRO_VERSION_MAX,
5168             p->protocol_min, p->protocol_max);
5169         return -1;
5170 }
5171
5172 #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
5173 static int drbd_do_auth(struct drbd_connection *connection)
5174 {
5175         drbd_err(connection, "This kernel was built without CONFIG_CRYPTO_HMAC.\n");
5176         drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
5177         return -1;
5178 }
5179 #else
5180 #define CHALLENGE_LEN 64
5181
5182 /* Return value:
5183         1 - auth succeeded,
5184         0 - failed, try again (network error),
5185         -1 - auth failed, don't try again.
5186 */
5187
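/* Challenge-response authentication with the shared secret from the net
 * config: both sides send a random challenge and prove knowledge of the
 * secret by returning the HMAC of the peer's challenge keyed with it.  The
 * peer's answer is then verified against a locally computed expected value
 * (right_response, further below).
 */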
5188 static int drbd_do_auth(struct drbd_connection *connection)
5189 {
5190         struct drbd_socket *sock;
5191         char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
5192         char *response = NULL;
5193         char *right_response = NULL;
5194         char *peers_ch = NULL;
5195         unsigned int key_len;
5196         char secret[SHARED_SECRET_MAX]; /* 64 byte */
5197         unsigned int resp_size;
5198         SHASH_DESC_ON_STACK(desc, connection->cram_hmac_tfm);
5199         struct packet_info pi;
5200         struct net_conf *nc;
5201         int err, rv;
5202
5203         /* FIXME: Put the challenge/response into the preallocated socket buffer.  */
5204
5205         rcu_read_lock();
5206         nc = rcu_dereference(connection->net_conf);
5207         key_len = strlen(nc->shared_secret);
5208         memcpy(secret, nc->shared_secret, key_len);
5209         rcu_read_unlock();
5210
5211         desc->tfm = connection->cram_hmac_tfm;
5212         desc->flags = 0;
5213
5214         rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
5215         if (rv) {
5216                 drbd_err(connection, "crypto_shash_setkey() failed with %d\n", rv);
5217                 rv = -1;
5218                 goto fail;
5219         }
5220
5221         get_random_bytes(my_challenge, CHALLENGE_LEN);
5222
5223         sock = &connection->data;
5224         if (!conn_prepare_command(connection, sock)) {
5225                 rv = 0;
5226                 goto fail;
5227         }
5228         rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
5229                                 my_challenge, CHALLENGE_LEN);
5230         if (!rv)
5231                 goto fail;
5232
5233         err = drbd_recv_header(connection, &pi);
5234         if (err) {
5235                 rv = 0;
5236                 goto fail;
5237         }
5238
5239         if (pi.cmd != P_AUTH_CHALLENGE) {
5240                 drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
5241                          cmdname(pi.cmd), pi.cmd);
5242                 rv = 0;
5243                 goto fail;
5244         }
5245
5246         if (pi.size > CHALLENGE_LEN * 2) {
5247                 drbd_err(connection, "AuthChallenge payload too big.\n");
5248                 rv = -1;
5249                 goto fail;
5250         }
5251
5252         if (pi.size < CHALLENGE_LEN) {
5253                 drbd_err(connection, "AuthChallenge payload too small.\n");
5254                 rv = -1;
5255                 goto fail;
5256         }
5257
5258         peers_ch = kmalloc(pi.size, GFP_NOIO);
5259         if (peers_ch == NULL) {
5260                 drbd_err(connection, "kmalloc of peers_ch failed\n");
5261                 rv = -1;
5262                 goto fail;
5263         }
5264
5265         err = drbd_recv_all_warn(connection, peers_ch, pi.size);
5266         if (err) {
5267                 rv = 0;
5268                 goto fail;
5269         }
5270
5271         if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
5272                 drbd_err(connection, "Peer presented the same challenge!\n");
5273                 rv = -1;
5274                 goto fail;
5275         }
5276
5277         resp_size = crypto_shash_digestsize(connection->cram_hmac_tfm);
5278         response = kmalloc(resp_size, GFP_NOIO);
5279         if (response == NULL) {
5280                 drbd_err(connection, "kmalloc of response failed\n");
5281                 rv = -1;
5282                 goto fail;
5283         }
5284
5285         rv = crypto_shash_digest(desc, peers_ch, pi.size, response);
5286         if (rv) {
5287                 drbd_err(connection, "crypto_shash_digest() failed with %d\n", rv);
5288                 rv = -1;
5289                 goto fail;
5290         }
5291
5292         if (!conn_prepare_command(connection, sock)) {
5293                 rv = 0;
5294                 goto fail;
5295         }
5296         rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
5297                                 response, resp_size);
5298         if (!rv)
5299                 goto fail;
5300
5301         err = drbd_recv_header(connection, &pi);
5302         if (err) {
5303                 rv = 0;
5304                 goto fail;
5305         }
5306
5307         if (pi.cmd != P_AUTH_RESPONSE) {
5308                 drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
5309                          cmdname(pi.cmd), pi.cmd);
5310                 rv = 0;
5311                 goto fail;
5312         }
5313
5314         if (pi.size != resp_size) {
5315                 drbd_err(connection, "AuthResponse payload has wrong size\n");
5316                 rv = 0;
5317                 goto fail;
5318         }
5319
5320         err = drbd_recv_all_warn(connection, response, resp_size);
5321         if (err) {
5322                 rv = 0;
5323                 goto fail;
5324         }
5325
5326         right_response = kmalloc(resp_size, GFP_NOIO);
5327         if (right_response == NULL) {
5328                 drbd_err(connection, "kmalloc of right_response failed\n");
5329                 rv = -1;
5330                 goto fail;
5331         }
5332
5333         rv = crypto_shash_digest(desc, my_challenge, CHALLENGE_LEN,
5334                                  right_response);
5335         if (rv) {
5336                 drbd_err(connection, "crypto_shash_digest() failed with %d\n", rv);
5337                 rv = -1;
5338                 goto fail;
5339         }
5340
5341         rv = !memcmp(response, right_response, resp_size);
5342
5343         if (rv)
5344                 drbd_info(connection, "Peer authenticated using a %d byte HMAC\n",
5345                      resp_size);
5346         else
5347                 rv = -1;
5348
5349  fail:
5350         kfree(peers_ch);
5351         kfree(response);
5352         kfree(right_response);
5353         shash_desc_zero(desc);
5354
5355         return rv;
5356 }
5357 #endif
5358
5359 int drbd_receiver(struct drbd_thread *thi)
5360 {
5361         struct drbd_connection *connection = thi->connection;
5362         int h;
5363
5364         drbd_info(connection, "receiver (re)started\n");
5365
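             /*
              * conn_connect() result, as handled by the loop below:
              *   h >  0   handshake done, process packets in drbdd()
              *   h == 0   transient failure; disconnect, wait a second, retry
              *   h == -1  give up and discard the network configuration
              */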
5366         do {
5367                 h = conn_connect(connection);
5368                 if (h == 0) {
5369                         conn_disconnect(connection);
5370                         schedule_timeout_interruptible(HZ);
5371                 }
5372                 if (h == -1) {
5373                         drbd_warn(connection, "Discarding network configuration.\n");
5374                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
5375                 }
5376         } while (h == 0);
5377
5378         if (h > 0)
5379                 drbdd(connection);
5380
5381         conn_disconnect(connection);
5382
5383         drbd_info(connection, "receiver terminated\n");
5384         return 0;
5385 }
5386
5387 /* ********* acknowledge sender ******** */
5388
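     /*
      * The got_*() handlers below run in the context of drbd_ack_receiver()
      * and are dispatched via the ack_receiver_tbl[] table further down.
      * They process packets arriving on the meta socket: pings, block and
      * barrier acks, negative acks, state change replies and online-verify
      * results.
      */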
5389 static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
5390 {
5391         struct p_req_state_reply *p = pi->data;
5392         int retcode = be32_to_cpu(p->retcode);
5393
5394         if (retcode >= SS_SUCCESS) {
5395                 set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
5396         } else {
5397                 set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
5398                 drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
5399                          drbd_set_st_err_str(retcode), retcode);
5400         }
5401         wake_up(&connection->ping_wait);
5402
5403         return 0;
5404 }
5405
5406 static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
5407 {
5408         struct drbd_peer_device *peer_device;
5409         struct drbd_device *device;
5410         struct p_req_state_reply *p = pi->data;
5411         int retcode = be32_to_cpu(p->retcode);
5412
5413         peer_device = conn_peer_device(connection, pi->vnr);
5414         if (!peer_device)
5415                 return -EIO;
5416         device = peer_device->device;
5417
5418         if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
5419                 D_ASSERT(device, connection->agreed_pro_version < 100);
5420                 return got_conn_RqSReply(connection, pi);
5421         }
5422
5423         if (retcode >= SS_SUCCESS) {
5424                 set_bit(CL_ST_CHG_SUCCESS, &device->flags);
5425         } else {
5426                 set_bit(CL_ST_CHG_FAIL, &device->flags);
5427                 drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
5428                         drbd_set_st_err_str(retcode), retcode);
5429         }
5430         wake_up(&device->state_wait);
5431
5432         return 0;
5433 }
5434
5435 static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
5436 {
5437         return drbd_send_ping_ack(connection);
5438
5439 }
5440
5441 static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
5442 {
5443         /* restore idle timeout */
5444         connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
5445         if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
5446                 wake_up(&connection->ping_wait);
5447
5448         return 0;
5449 }
5450
5451 static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
5452 {
5453         struct drbd_peer_device *peer_device;
5454         struct drbd_device *device;
5455         struct p_block_ack *p = pi->data;
5456         sector_t sector = be64_to_cpu(p->sector);
5457         int blksize = be32_to_cpu(p->blksize);
5458
5459         peer_device = conn_peer_device(connection, pi->vnr);
5460         if (!peer_device)
5461                 return -EIO;
5462         device = peer_device->device;
5463
5464         D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
5465
5466         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5467
5468         if (get_ldev(device)) {
5469                 drbd_rs_complete_io(device, sector);
5470                 drbd_set_in_sync(device, sector, blksize);
5471                 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
5472                 device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
5473                 put_ldev(device);
5474         }
5475         dec_rs_pending(device);
5476         atomic_add(blksize >> 9, &device->rs_sect_in);
5477
5478         return 0;
5479 }
5480
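     /*
      * Look up the request identified by (id, sector) in the given tree
      * (read_requests or write_requests), apply the req_mod() event "what"
      * to it under req_lock, and complete the master bio if that transition
      * finished the request.  Returns -EIO if the request is not (or no
      * longer) in the tree.
      */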
5481 static int
5482 validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
5483                               struct rb_root *root, const char *func,
5484                               enum drbd_req_event what, bool missing_ok)
5485 {
5486         struct drbd_request *req;
5487         struct bio_and_error m;
5488
5489         spin_lock_irq(&device->resource->req_lock);
5490         req = find_request(device, root, id, sector, missing_ok, func);
5491         if (unlikely(!req)) {
5492                 spin_unlock_irq(&device->resource->req_lock);
5493                 return -EIO;
5494         }
5495         __req_mod(req, what, &m);
5496         spin_unlock_irq(&device->resource->req_lock);
5497
5498         if (m.bio)
5499                 complete_master_bio(device, &m);
5500         return 0;
5501 }
5502
5503 static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
5504 {
5505         struct drbd_peer_device *peer_device;
5506         struct drbd_device *device;
5507         struct p_block_ack *p = pi->data;
5508         sector_t sector = be64_to_cpu(p->sector);
5509         int blksize = be32_to_cpu(p->blksize);
5510         enum drbd_req_event what;
5511
5512         peer_device = conn_peer_device(connection, pi->vnr);
5513         if (!peer_device)
5514                 return -EIO;
5515         device = peer_device->device;
5516
5517         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5518
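             /*
              * block_id == ID_SYNCER marks the ack of a resync request rather
              * than of an application write: there is no struct drbd_request
              * to look up, only the bitmap and the resync-pending counter
              * need updating.
              */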
5519         if (p->block_id == ID_SYNCER) {
5520                 drbd_set_in_sync(device, sector, blksize);
5521                 dec_rs_pending(device);
5522                 return 0;
5523         }
5524         switch (pi->cmd) {
5525         case P_RS_WRITE_ACK:
5526                 what = WRITE_ACKED_BY_PEER_AND_SIS;
5527                 break;
5528         case P_WRITE_ACK:
5529                 what = WRITE_ACKED_BY_PEER;
5530                 break;
5531         case P_RECV_ACK:
5532                 what = RECV_ACKED_BY_PEER;
5533                 break;
5534         case P_SUPERSEDED:
5535                 what = CONFLICT_RESOLVED;
5536                 break;
5537         case P_RETRY_WRITE:
5538                 what = POSTPONE_WRITE;
5539                 break;
5540         default:
5541                 BUG();
5542         }
5543
5544         return validate_req_change_req_state(device, p->block_id, sector,
5545                                              &device->write_requests, __func__,
5546                                              what, false);
5547 }
5548
5549 static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
5550 {
5551         struct drbd_peer_device *peer_device;
5552         struct drbd_device *device;
5553         struct p_block_ack *p = pi->data;
5554         sector_t sector = be64_to_cpu(p->sector);
5555         int size = be32_to_cpu(p->blksize);
5556         int err;
5557
5558         peer_device = conn_peer_device(connection, pi->vnr);
5559         if (!peer_device)
5560                 return -EIO;
5561         device = peer_device->device;
5562
5563         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5564
5565         if (p->block_id == ID_SYNCER) {
5566                 dec_rs_pending(device);
5567                 drbd_rs_failed_io(device, sector, size);
5568                 return 0;
5569         }
5570
5571         err = validate_req_change_req_state(device, p->block_id, sector,
5572                                             &device->write_requests, __func__,
5573                                             NEG_ACKED, true);
5574         if (err) {
5575                 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
5576                    The master bio might already be completed, therefore the
5577                    request is no longer in the collision hash. */
5578                 /* In Protocol B we might already have got a P_RECV_ACK
5579                    but then get a P_NEG_ACK afterwards. */
5580                 drbd_set_out_of_sync(device, sector, size);
5581         }
5582         return 0;
5583 }
5584
5585 static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
5586 {
5587         struct drbd_peer_device *peer_device;
5588         struct drbd_device *device;
5589         struct p_block_ack *p = pi->data;
5590         sector_t sector = be64_to_cpu(p->sector);
5591
5592         peer_device = conn_peer_device(connection, pi->vnr);
5593         if (!peer_device)
5594                 return -EIO;
5595         device = peer_device->device;
5596
5597         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5598
5599         drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
5600             (unsigned long long)sector, be32_to_cpu(p->blksize));
5601
5602         return validate_req_change_req_state(device, p->block_id, sector,
5603                                              &device->read_requests, __func__,
5604                                              NEG_ACKED, false);
5605 }
5606
5607 static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
5608 {
5609         struct drbd_peer_device *peer_device;
5610         struct drbd_device *device;
5611         sector_t sector;
5612         int size;
5613         struct p_block_ack *p = pi->data;
5614
5615         peer_device = conn_peer_device(connection, pi->vnr);
5616         if (!peer_device)
5617                 return -EIO;
5618         device = peer_device->device;
5619
5620         sector = be64_to_cpu(p->sector);
5621         size = be32_to_cpu(p->blksize);
5622
5623         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5624
5625         dec_rs_pending(device);
5626
5627         if (get_ldev_if_state(device, D_FAILED)) {
5628                 drbd_rs_complete_io(device, sector);
5629                 switch (pi->cmd) {
5630                 case P_NEG_RS_DREPLY:
5631                         drbd_rs_failed_io(device, sector, size); /* fall through */
5632                 case P_RS_CANCEL:
5633                         break;
5634                 default:
5635                         BUG();
5636                 }
5637                 put_ldev(device);
5638         }
5639
5640         return 0;
5641 }
5642
5643 static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
5644 {
5645         struct p_barrier_ack *p = pi->data;
5646         struct drbd_peer_device *peer_device;
5647         int vnr;
5648
5649         tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
5650
5651         rcu_read_lock();
5652         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5653                 struct drbd_device *device = peer_device->device;
5654
5655                 if (device->state.conn == C_AHEAD &&
5656                     atomic_read(&device->ap_in_flight) == 0 &&
5657                     !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
5658                         device->start_resync_timer.expires = jiffies + HZ;
5659                         add_timer(&device->start_resync_timer);
5660                 }
5661         }
5662         rcu_read_unlock();
5663
5664         return 0;
5665 }
5666
5667 static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
5668 {
5669         struct drbd_peer_device *peer_device;
5670         struct drbd_device *device;
5671         struct p_block_ack *p = pi->data;
5672         struct drbd_device_work *dw;
5673         sector_t sector;
5674         int size;
5675
5676         peer_device = conn_peer_device(connection, pi->vnr);
5677         if (!peer_device)
5678                 return -EIO;
5679         device = peer_device->device;
5680
5681         sector = be64_to_cpu(p->sector);
5682         size = be32_to_cpu(p->blksize);
5683
5684         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5685
5686         if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
5687                 drbd_ov_out_of_sync_found(device, sector, size);
5688         else
5689                 ov_out_of_sync_print(device);
5690
5691         if (!get_ldev(device))
5692                 return 0;
5693
5694         drbd_rs_complete_io(device, sector);
5695         dec_rs_pending(device);
5696
5697         --device->ov_left;
5698
5699         /* let's advance progress step marks only for every other megabyte */
5700         if ((device->ov_left & 0x200) == 0x200)
5701                 drbd_advance_rs_marks(device, device->ov_left);
5702
5703         if (device->ov_left == 0) {
5704                 dw = kmalloc(sizeof(*dw), GFP_NOIO);
5705                 if (dw) {
5706                         dw->w.cb = w_ov_finished;
5707                         dw->device = device;
5708                         drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
5709                 } else {
5710                         drbd_err(device, "kmalloc(dw) failed.\n");
5711                         ov_out_of_sync_print(device);
5712                         drbd_resync_finished(device);
5713                 }
5714         }
5715         put_ldev(device);
5716         return 0;
5717 }
5718
5719 static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
5720 {
5721         return 0;
5722 }
5723
5724 struct meta_sock_cmd {
5725         size_t pkt_size;
5726         int (*fn)(struct drbd_connection *connection, struct packet_info *);
5727 };
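
     /*
      * ack_receiver_tbl[] below maps an enum drbd_packet command to its
      * expected payload size (excluding the header) and the handler to call.
      * drbd_ack_receiver() indexes it directly with the received command.
      */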
5728
5729 static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
5730 {
5731         long t;
5732         struct net_conf *nc;
5733
5734         rcu_read_lock();
5735         nc = rcu_dereference(connection->net_conf);
5736         t = ping_timeout ? nc->ping_timeo : nc->ping_int;
5737         rcu_read_unlock();
5738
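             /*
              * ping_int is configured in seconds, ping_timeo in tenths of a
              * second; hence the extra division by ten below (compare the
              * ping_timeo * HZ/10 computation in drbd_ack_receiver()).
              */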
5739         t *= HZ;
5740         if (ping_timeout)
5741                 t /= 10;
5742
5743         connection->meta.socket->sk->sk_rcvtimeo = t;
5744 }
5745
5746 static void set_ping_timeout(struct drbd_connection *connection)
5747 {
5748         set_rcvtimeo(connection, 1);
5749 }
5750
5751 static void set_idle_timeout(struct drbd_connection *connection)
5752 {
5753         set_rcvtimeo(connection, 0);
5754 }
5755
5756 static struct meta_sock_cmd ack_receiver_tbl[] = {
5757         [P_PING]            = { 0, got_Ping },
5758         [P_PING_ACK]        = { 0, got_PingAck },
5759         [P_RECV_ACK]        = { sizeof(struct p_block_ack), got_BlockAck },
5760         [P_WRITE_ACK]       = { sizeof(struct p_block_ack), got_BlockAck },
5761         [P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
5762         [P_SUPERSEDED]      = { sizeof(struct p_block_ack), got_BlockAck },
5763         [P_NEG_ACK]         = { sizeof(struct p_block_ack), got_NegAck },
5764         [P_NEG_DREPLY]      = { sizeof(struct p_block_ack), got_NegDReply },
5765         [P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply },
5766         [P_OV_RESULT]       = { sizeof(struct p_block_ack), got_OVResult },
5767         [P_BARRIER_ACK]     = { sizeof(struct p_barrier_ack), got_BarrierAck },
5768         [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
5769         [P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
5770         [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
5771         [P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply },
5772         [P_CONN_ST_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_conn_RqSReply },
5773         [P_RETRY_WRITE]     = { sizeof(struct p_block_ack), got_BlockAck },
5774 };
5775
5776 int drbd_ack_receiver(struct drbd_thread *thi)
5777 {
5778         struct drbd_connection *connection = thi->connection;
5779         struct meta_sock_cmd *cmd = NULL;
5780         struct packet_info pi;
5781         unsigned long pre_recv_jif;
5782         int rv;
5783         void *buf    = connection->meta.rbuf;
5784         int received = 0;
5785         unsigned int header_size = drbd_header_size(connection);
5786         int expect   = header_size;
5787         bool ping_timeout_active = false;
5788         struct sched_param param = { .sched_priority = 2 };
5789
5790         rv = sched_setscheduler(current, SCHED_RR, &param);
5791         if (rv < 0)
5792                 drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv);
5793
5794         while (get_t_state(thi) == RUNNING) {
5795                 drbd_thread_current_set_cpu(thi);
5796
5797                 conn_reclaim_net_peer_reqs(connection);
5798
5799                 if (test_and_clear_bit(SEND_PING, &connection->flags)) {
5800                         if (drbd_send_ping(connection)) {
5801                                 drbd_err(connection, "drbd_send_ping has failed\n");
5802                                 goto reconnect;
5803                         }
5804                         set_ping_timeout(connection);
5805                         ping_timeout_active = true;
5806                 }
5807
5808                 pre_recv_jif = jiffies;
5809                 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
5810
5811                 /* Note:
5812                  * -EINTR        (on meta) we got a signal
5813                  * -EAGAIN       (on meta) rcvtimeo expired
5814                  * -ECONNRESET   other side closed the connection
5815                  * -ERESTARTSYS  (on data) we got a signal
5816                  * rv <  0       other than above: unexpected error!
5817                  * rv == expected: full header or command
5818                  * rv <  expected: "woken" by signal during receive
5819                  * rv == 0       : "connection shut down by peer"
5820                  */
5821                 if (likely(rv > 0)) {
5822                         received += rv;
5823                         buf      += rv;
5824                 } else if (rv == 0) {
5825                         if (test_bit(DISCONNECT_SENT, &connection->flags)) {
5826                                 long t;
5827                                 rcu_read_lock();
5828                                 t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
5829                                 rcu_read_unlock();
5830
5831                                 t = wait_event_timeout(connection->ping_wait,
5832                                                        connection->cstate < C_WF_REPORT_PARAMS,
5833                                                        t);
5834                                 if (t)
5835                                         break;
5836                         }
5837                         drbd_err(connection, "meta connection shut down by peer.\n");
5838                         goto reconnect;
5839                 } else if (rv == -EAGAIN) {
5840                         /* If the data socket received something meanwhile,
5841                          * that is good enough: peer is still alive. */
5842                         if (time_after(connection->last_received, pre_recv_jif))
5843                                 continue;
5844                         if (ping_timeout_active) {
5845                                 drbd_err(connection, "PingAck did not arrive in time.\n");
5846                                 goto reconnect;
5847                         }
5848                         set_bit(SEND_PING, &connection->flags);
5849                         continue;
5850                 } else if (rv == -EINTR) {
5851                         /* maybe drbd_thread_stop(): the while condition will notice.
5852                          * maybe woken for send_ping: we'll send a ping above,
5853                          * and change the rcvtimeo */
5854                         flush_signals(current);
5855                         continue;
5856                 } else {
5857                         drbd_err(connection, "sock_recvmsg returned %d\n", rv);
5858                         goto reconnect;
5859                 }
5860
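                     /*
                      * Two-step receive: first collect a full header (expect ==
                      * header_size, cmd == NULL), decode it and grow "expect" by
                      * the payload size from ack_receiver_tbl[]; once the payload
                      * is complete as well, dispatch to cmd->fn() below.
                      */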
5861                 if (received == expect && cmd == NULL) {
5862                         if (decode_header(connection, connection->meta.rbuf, &pi))
5863                                 goto reconnect;
5864                         cmd = &ack_receiver_tbl[pi.cmd];
5865                         if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
5866                                 drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
5867                                          cmdname(pi.cmd), pi.cmd);
5868                                 goto disconnect;
5869                         }
5870                         expect = header_size + cmd->pkt_size;
5871                         if (pi.size != expect - header_size) {
5872                                 drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
5873                                         pi.cmd, pi.size);
5874                                 goto reconnect;
5875                         }
5876                 }
5877                 if (received == expect) {
5878                         bool err;
5879
5880                         err = cmd->fn(connection, &pi);
5881                         if (err) {
5882                                 drbd_err(connection, "%pf failed\n", cmd->fn);
5883                                 goto reconnect;
5884                         }
5885
5886                         connection->last_received = jiffies;
5887
5888                         if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
5889                                 set_idle_timeout(connection);
5890                                 ping_timeout_active = false;
5891                         }
5892
5893                         buf      = connection->meta.rbuf;
5894                         received = 0;
5895                         expect   = header_size;
5896                         cmd      = NULL;
5897                 }
5898         }
5899
5900         if (0) {
5901 reconnect:
5902                 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
5903                 conn_md_sync(connection);
5904         }
5905         if (0) {
5906 disconnect:
5907                 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
5908         }
5909
5910         drbd_info(connection, "ack_receiver terminated\n");
5911
5912         return 0;
5913 }
5914
5915 void drbd_send_acks_wf(struct work_struct *ws)
5916 {
5917         struct drbd_peer_device *peer_device =
5918                 container_of(ws, struct drbd_peer_device, send_acks_work);
5919         struct drbd_connection *connection = peer_device->connection;
5920         struct drbd_device *device = peer_device->device;
5921         struct net_conf *nc;
5922         int tcp_cork, err;
5923
5924         rcu_read_lock();
5925         nc = rcu_dereference(connection->net_conf);
5926         tcp_cork = nc->tcp_cork;
5927         rcu_read_unlock();
5928
5929         if (tcp_cork)
5930                 drbd_tcp_cork(connection->meta.socket);
5931
5932         err = drbd_finish_peer_reqs(device);
5933         kref_put(&device->kref, drbd_destroy_device);
5934         /* The matching get is in drbd_endio_write_sec_final(); it is needed to keep
5935            the struct work_struct send_acks_work in the peer_device object alive. */
5936
5937         if (err) {
5938                 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
5939                 return;
5940         }
5941
5942         if (tcp_cork)
5943                 drbd_tcp_uncork(connection->meta.socket);
5944
5945         return;
5946 }