]> git.karo-electronics.de Git - mv-sheeva.git/blobdiff - net/ceph/messenger.c
Merge tag 'v2.6.38' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
[mv-sheeva.git] / net / ceph / messenger.c
index b6ff4a1519ab664c43d9bff871dc95d8efb52daa..05f357828a2fb8de1bb2be9f4103c90058484259 100644 (file)
@@ -96,7 +96,7 @@ struct workqueue_struct *ceph_msgr_wq;
 
 int ceph_msgr_init(void)
 {
-       ceph_msgr_wq = create_workqueue("ceph-msgr");
+       ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0);
        if (!ceph_msgr_wq) {
                pr_err("msgr_init failed to create workqueue\n");
                return -ENOMEM;
@@ -252,8 +252,12 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
 {
        struct kvec iov = {buf, len};
        struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+       int r;
 
-       return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+       r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+       if (r == -EAGAIN)
+               r = 0;
+       return r;
 }
 
 /*
@@ -264,13 +268,17 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
                     size_t kvlen, size_t len, int more)
 {
        struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+       int r;
 
        if (more)
                msg.msg_flags |= MSG_MORE;
        else
                msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
 
-       return kernel_sendmsg(sock, &msg, iov, kvlen, len);
+       r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
+       if (r == -EAGAIN)
+               r = 0;
+       return r;
 }
 
 
@@ -328,7 +336,6 @@ static void reset_connection(struct ceph_connection *con)
                ceph_msg_put(con->out_msg);
                con->out_msg = NULL;
        }
-       con->out_keepalive_pending = false;
        con->in_seq = 0;
        con->in_seq_acked = 0;
 }
@@ -847,6 +854,8 @@ static int write_partial_msg_pages(struct ceph_connection *con)
                    (msg->pages || msg->pagelist || msg->bio || in_trail))
                        kunmap(page);
 
+               if (ret == -EAGAIN)
+                       ret = 0;
                if (ret <= 0)
                        goto out;
 
@@ -1238,8 +1247,6 @@ static int process_connect(struct ceph_connection *con)
                     con->auth_retry);
                if (con->auth_retry == 2) {
                        con->error_msg = "connect authorization failure";
-                       reset_connection(con);
-                       set_bit(CLOSED, &con->state);
                        return -1;
                }
                con->auth_retry = 1;
@@ -1705,14 +1712,6 @@ more:
 
        /* open the socket first? */
        if (con->sock == NULL) {
-               /*
-                * if we were STANDBY and are reconnecting _this_
-                * connection, bump connect_seq now.  Always bump
-                * global_seq.
-                */
-               if (test_and_clear_bit(STANDBY, &con->state))
-                       con->connect_seq++;
-
                prepare_write_banner(msgr, con);
                prepare_write_connect(msgr, con, 1);
                prepare_read_banner(con);
@@ -1737,16 +1736,12 @@ more_kvec:
        if (con->out_skip) {
                ret = write_partial_skip(con);
                if (ret <= 0)
-                       goto done;
-               if (ret < 0) {
-                       dout("try_write write_partial_skip err %d\n", ret);
-                       goto done;
-               }
+                       goto out;
        }
        if (con->out_kvec_left) {
                ret = write_partial_kvec(con);
                if (ret <= 0)
-                       goto done;
+                       goto out;
        }
 
        /* msg pages? */
@@ -1761,11 +1756,11 @@ more_kvec:
                if (ret == 1)
                        goto more_kvec;  /* we need to send the footer, too! */
                if (ret == 0)
-                       goto done;
+                       goto out;
                if (ret < 0) {
                        dout("try_write write_partial_msg_pages err %d\n",
                             ret);
-                       goto done;
+                       goto out;
                }
        }
 
@@ -1789,10 +1784,9 @@ do_next:
        /* Nothing to do! */
        clear_bit(WRITE_PENDING, &con->state);
        dout("try_write nothing else to write.\n");
-done:
        ret = 0;
 out:
-       dout("try_write done on %p\n", con);
+       dout("try_write done on %p ret %d\n", con, ret);
        return ret;
 }
 
@@ -1821,19 +1815,17 @@ more:
                        dout("try_read connecting\n");
                        ret = read_partial_banner(con);
                        if (ret <= 0)
-                               goto done;
-                       if (process_banner(con) < 0) {
-                               ret = -1;
                                goto out;
-                       }
+                       ret = process_banner(con);
+                       if (ret < 0)
+                               goto out;
                }
                ret = read_partial_connect(con);
                if (ret <= 0)
-                       goto done;
-               if (process_connect(con) < 0) {
-                       ret = -1;
                        goto out;
-               }
+               ret = process_connect(con);
+               if (ret < 0)
+                       goto out;
                goto more;
        }
 
@@ -1848,7 +1840,7 @@ more:
                dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
                ret = ceph_tcp_recvmsg(con->sock, buf, skip);
                if (ret <= 0)
-                       goto done;
+                       goto out;
                con->in_base_pos += ret;
                if (con->in_base_pos)
                        goto more;
@@ -1859,7 +1851,7 @@ more:
                 */
                ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
                if (ret <= 0)
-                       goto done;
+                       goto out;
                dout("try_read got tag %d\n", (int)con->in_tag);
                switch (con->in_tag) {
                case CEPH_MSGR_TAG_MSG:
@@ -1870,7 +1862,7 @@ more:
                        break;
                case CEPH_MSGR_TAG_CLOSE:
                        set_bit(CLOSED, &con->state);   /* fixme */
-                       goto done;
+                       goto out;
                default:
                        goto bad_tag;
                }
@@ -1882,13 +1874,12 @@ more:
                        case -EBADMSG:
                                con->error_msg = "bad crc";
                                ret = -EIO;
-                               goto out;
+                               break;
                        case -EIO:
                                con->error_msg = "io error";
-                               goto out;
-                       default:
-                               goto done;
+                               break;
                        }
+                       goto out;
                }
                if (con->in_tag == CEPH_MSGR_TAG_READY)
                        goto more;
@@ -1898,15 +1889,13 @@ more:
        if (con->in_tag == CEPH_MSGR_TAG_ACK) {
                ret = read_partial_ack(con);
                if (ret <= 0)
-                       goto done;
+                       goto out;
                process_ack(con);
                goto more;
        }
 
-done:
-       ret = 0;
 out:
-       dout("try_read done on %p\n", con);
+       dout("try_read done on %p ret %d\n", con, ret);
        return ret;
 
 bad_tag:
@@ -1920,20 +1909,6 @@ bad_tag:
 /*
  * Atomically queue work on a connection.  Bump @con reference to
  * avoid races with connection teardown.
- *
- * There is some trickery going on with QUEUED and BUSY because we
- * only want a _single_ thread operating on each connection at any
- * point in time, but we want to use all available CPUs.
- *
- * The worker thread only proceeds if it can atomically set BUSY.  It
- * clears QUEUED and does it's thing.  When it thinks it's done, it
- * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
- * (tries again to set BUSY).
- *
- * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
- * try to queue work.  If that fails (work is already queued, or BUSY)
- * we give up (work also already being done or is queued) but leave QUEUED
- * set so that the worker thread will loop if necessary.
  */
 static void queue_con(struct ceph_connection *con)
 {
@@ -1948,11 +1923,7 @@ static void queue_con(struct ceph_connection *con)
                return;
        }
 
-       set_bit(QUEUED, &con->state);
-       if (test_bit(BUSY, &con->state)) {
-               dout("queue_con %p - already BUSY\n", con);
-               con->ops->put(con);
-       } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
+       if (!queue_delayed_work(ceph_msgr_wq, &con->work, 0)) {
                dout("queue_con %p - already queued\n", con);
                con->ops->put(con);
        } else {
@@ -1967,18 +1938,26 @@ static void con_work(struct work_struct *work)
 {
        struct ceph_connection *con = container_of(work, struct ceph_connection,
                                                   work.work);
-       int backoff = 0;
-
-more:
-       if (test_and_set_bit(BUSY, &con->state) != 0) {
-               dout("con_work %p BUSY already set\n", con);
-               goto out;
-       }
-       dout("con_work %p start, clearing QUEUED\n", con);
-       clear_bit(QUEUED, &con->state);
 
        mutex_lock(&con->mutex);
+       if (test_and_clear_bit(BACKOFF, &con->state)) {
+               dout("con_work %p backing off\n", con);
+               if (queue_delayed_work(ceph_msgr_wq, &con->work,
+                                      round_jiffies_relative(con->delay))) {
+                       dout("con_work %p backoff %lu\n", con, con->delay);
+                       mutex_unlock(&con->mutex);
+                       return;
+               } else {
+                       con->ops->put(con);
+                       dout("con_work %p FAILED to back off %lu\n", con,
+                            con->delay);
+               }
+       }
 
+       if (test_bit(STANDBY, &con->state)) {
+               dout("con_work %p STANDBY\n", con);
+               goto done;
+       }
        if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
                dout("con_work CLOSED\n");
                con_close_socket(con);
@@ -1994,28 +1973,13 @@ more:
            try_read(con) < 0 ||
            try_write(con) < 0) {
                mutex_unlock(&con->mutex);
-               backoff = 1;
                ceph_fault(con);     /* error/fault path */
                goto done_unlocked;
        }
 
 done:
        mutex_unlock(&con->mutex);
-
 done_unlocked:
-       clear_bit(BUSY, &con->state);
-       dout("con->state=%lu\n", con->state);
-       if (test_bit(QUEUED, &con->state)) {
-               if (!backoff || test_bit(OPENING, &con->state)) {
-                       dout("con_work %p QUEUED reset, looping\n", con);
-                       goto more;
-               }
-               dout("con_work %p QUEUED reset, but just faulted\n", con);
-               clear_bit(QUEUED, &con->state);
-       }
-       dout("con_work %p done\n", con);
-
-out:
        con->ops->put(con);
 }
 
@@ -2050,10 +2014,12 @@ static void ceph_fault(struct ceph_connection *con)
        /* Requeue anything that hasn't been acked */
        list_splice_init(&con->out_sent, &con->out_queue);
 
-       /* If there are no messages in the queue, place the connection
-        * in a STANDBY state (i.e., don't try to reconnect just yet). */
-       if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
-               dout("fault setting STANDBY\n");
+       /* If there are no messages queued or keepalive pending, place
+        * the connection in a STANDBY state */
+       if (list_empty(&con->out_queue) &&
+           !test_bit(KEEPALIVE_PENDING, &con->state)) {
+               dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
+               clear_bit(WRITE_PENDING, &con->state);
                set_bit(STANDBY, &con->state);
        } else {
                /* retry after a delay. */
@@ -2061,11 +2027,24 @@ static void ceph_fault(struct ceph_connection *con)
                        con->delay = BASE_DELAY_INTERVAL;
                else if (con->delay < MAX_DELAY_INTERVAL)
                        con->delay *= 2;
-               dout("fault queueing %p delay %lu\n", con, con->delay);
                con->ops->get(con);
                if (queue_delayed_work(ceph_msgr_wq, &con->work,
-                                      round_jiffies_relative(con->delay)) == 0)
+                                      round_jiffies_relative(con->delay))) {
+                       dout("fault queued %p delay %lu\n", con, con->delay);
+               } else {
                        con->ops->put(con);
+                       dout("fault failed to queue %p delay %lu, backoff\n",
+                            con, con->delay);
+                       /*
+                        * In many cases we see a socket state change
+                        * while con_work is running and end up
+                        * queuing (non-delayed) work, such that we
+                        * can't backoff with a delay.  Set a flag so
+                        * that when con_work restarts we schedule the
+                        * delay then.
+                        */
+                       set_bit(BACKOFF, &con->state);
+               }
        }
 
 out_unlock:
@@ -2136,6 +2115,19 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr)
 }
 EXPORT_SYMBOL(ceph_messenger_destroy);
 
+static void clear_standby(struct ceph_connection *con)
+{
+       /* come back from STANDBY? */
+       if (test_and_clear_bit(STANDBY, &con->state)) {
+               mutex_lock(&con->mutex);
+               dout("clear_standby %p and ++connect_seq\n", con);
+               con->connect_seq++;
+               WARN_ON(test_bit(WRITE_PENDING, &con->state));
+               WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state));
+               mutex_unlock(&con->mutex);
+       }
+}
+
 /*
  * Queue up an outgoing message on the given connection.
  */
@@ -2168,6 +2160,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 
        /* if there wasn't anything waiting to send before, queue
         * new work */
+       clear_standby(con);
        if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
                queue_con(con);
 }
@@ -2233,6 +2226,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
  */
 void ceph_con_keepalive(struct ceph_connection *con)
 {
+       dout("con_keepalive %p\n", con);
+       clear_standby(con);
        if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
            test_and_set_bit(WRITE_PENDING, &con->state) == 0)
                queue_con(con);