[PATCH] ocfs2: add dlm_wait_for_node_death

author Kurt Hackel <kurt.hackel@oracle.com>

Thu, 19 Jan 2006 01:05:38 +0000 (17:05 -0800)

committer Mark Fasheh <mark.fasheh@oracle.com>

Thu, 16 Feb 2006 20:01:38 +0000 (12:01 -0800)
author Kurt Hackel <kurt.hackel@oracle.com>
Thu, 19 Jan 2006 01:05:38 +0000 (17:05 -0800)
committer Mark Fasheh <mark.fasheh@oracle.com>
Thu, 16 Feb 2006 20:01:38 +0000 (12:01 -0800)
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h

index 42eb53b5293be362df0b5d3a608c5f360a004122..23ceaa7127b4c117fbd3fa738a42665f4c612ab6 100644 (file)
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -208,6 +208,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
  #define DLM_LOCK_RES_IN_PROGRESS          0x00000010
  #define DLM_LOCK_RES_MIGRATING            0x00000020
  
+/* max milliseconds to wait to sync up a network failure with a node death */
+#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
+
  #define DLM_PURGE_INTERVAL_MS   (8 * 1000)
  
  struct dlm_lock_resource
@@ -658,6 +661,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
  void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
  void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
  int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
  
  void dlm_put(struct dlm_ctxt *dlm);
  struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c

index f5c2f1979ad3d68c588b9f50c4ddb806af5b5b41..f66e2d818ccdefa2c4eb20625fc77d417e464e37 100644 (file)
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -392,6 +392,11 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
         } else {
                 mlog_errno(tmpret);
                 if (dlm_is_host_down(tmpret)) {
+                       /* instead of logging the same network error over
+                        * and over, sleep here and wait for the heartbeat
+                        * to notice the node is dead.  times out after 5s. */
+                       dlm_wait_for_node_death(dlm, res->owner, 
+                                               DLM_NODE_DEATH_WAIT_MAX);
                         ret = DLM_RECOVERING;
                         mlog(0, "node %u died so returning DLM_RECOVERING "
                              "from convert message!\n", res->owner);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c

index d1a0038557a32fa2cc5ab8d698c50fb390d5fc7f..e709412e6e323de0029c2926ec035e21974cd291 100644 (file)
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -646,7 +646,19 @@ retry_lock:
                         mlog(0, "retrying lock with migration/"
                              "recovery/in progress\n");
                         msleep(100);
-                       dlm_wait_for_recovery(dlm);
+                       /* no waiting for dlm_reco_thread */
+                       if (recovery) {
+                               if (status == DLM_RECOVERING) {
+                                       mlog(0, "%s: got RECOVERING "
+                                            "for $REOCVERY lock, master "
+                                            "was %u\n", dlm->name, 
+                                            res->owner);
+                                       dlm_wait_for_node_death(dlm, res->owner, 
+                                                       DLM_NODE_DEATH_WAIT_MAX);
+                               }
+                       } else {
+                               dlm_wait_for_recovery(dlm);
+                       }
                         goto retry_lock;
                 }
  
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c

index f9ce864966ec60a90ce8c25badca9028ebccce74..ed76bda1a5344c4abb53072c08667617f7e05461 100644 (file)
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -278,6 +278,24 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
         return dead;
  }
  
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+       if (timeout) { /* nonzero timeout: bounded wait, in milliseconds */
+               mlog(ML_NOTICE, "%s: waiting %dms for notification of "
+                    "death of node %u\n", dlm->name, timeout, node);
+               wait_event_timeout(dlm->dlm_reco_thread_wq,
+                          dlm_is_node_dead(dlm, node),
+                          msecs_to_jiffies(timeout));
+       } else { /* timeout == 0: wait with no time limit */
+               mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
+                    "of death of node %u\n", dlm->name, node);
+               wait_event(dlm->dlm_reco_thread_wq,
+                          dlm_is_node_dead(dlm, node));
+       }
+       /* for now, return 0 whether the node died or the wait timed out */
+       return 0;
+}
+
  /* callers of the top-level api calls (dlmlock/dlmunlock) should
   * block on the dlm->reco.event when recovery is in progress.
   * the dlm recovery thread will set this state when it begins
author	Kurt Hackel <kurt.hackel@oracle.com>
	Thu, 19 Jan 2006 01:05:38 +0000 (17:05 -0800)
committer	Mark Fasheh <mark.fasheh@oracle.com>
	Thu, 16 Feb 2006 20:01:38 +0000 (12:01 -0800)
fs/ocfs2/dlm/dlmcommon.h		patch \| blob \| history
fs/ocfs2/dlm/dlmconvert.c		patch \| blob \| history
fs/ocfs2/dlm/dlmlock.c		patch \| blob \| history
fs/ocfs2/dlm/dlmrecovery.c		patch \| blob \| history