]> git.karo-electronics.de Git - karo-tx-linux.git/blob - fs/ocfs2/stack_user.c
ocfs2: pass ocfs2_cluster_connection to ocfs2_this_node
[karo-tx-linux.git] / fs / ocfs2 / stack_user.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * stack_user.c
5  *
6  * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
7  *
8  * Copyright (C) 2007 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation, version 2.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * General Public License for more details.
18  */
19
20 #include <linux/module.h>
21 #include <linux/fs.h>
22 #include <linux/miscdevice.h>
23 #include <linux/mutex.h>
24 #include <linux/slab.h>
25 #include <linux/reboot.h>
26 #include <asm/uaccess.h>
27
28 #include "stackglue.h"
29
30 #include <linux/dlm_plock.h>
31
32 /*
33  * The control protocol starts with a handshake.  Until the handshake
34  * is complete, the control device will fail all write(2)s.
35  *
36  * The handshake is simple.  First, the client reads until EOF.  Each line
37  * of output is a supported protocol tag.  All protocol tags are a single
38  * character followed by a two hex digit version number.  Currently the
39  * only thing supported is T01, for "Text-based version 0x01".  Next, the
40  * client writes the version they would like to use, including the newline.
41  * Thus, the protocol tag is 'T01\n'.  If the version tag written is
42  * unknown, -EINVAL is returned.  Once the negotiation is complete, the
43  * client can start sending messages.
44  *
45  * The T01 protocol has three messages.  First is the "SETN" message.
46  * It has the following syntax:
47  *
48  *  SETN<space><8-char-hex-nodenum><newline>
49  *
50  * This is 14 characters.
51  *
52  * The "SETN" message must be the first message following the protocol.
53  * It tells ocfs2_control the local node number.
54  *
55  * Next comes the "SETV" message.  It has the following syntax:
56  *
57  *  SETV<space><2-char-hex-major><space><2-char-hex-minor><newline>
58  *
59  * This is 11 characters.
60  *
61  * The "SETV" message sets the filesystem locking protocol version as
62  * negotiated by the client.  The client negotiates based on the maximum
63  * version advertised in /sys/fs/ocfs2/max_locking_protocol.  The major
64  * number from the "SETV" message must match
65  * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
66  * must be less than or equal to ...sp_max_proto.pv_minor.
67  *
68  * Once this information has been set, mounts will be allowed.  From this
69  * point on, the "DOWN" message can be sent for node down notification.
70  * It has the following syntax:
71  *
72  *  DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
73  *
74  * eg:
75  *
76  *  DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n
77  *
78  * This is 47 characters.
79  */
80
/*
 * The single protocol tag this file knows about.  It is what read(2) on
 * the control device returns and what the client must write back to
 * complete the handshake.  The length excludes the C string terminator.
 */
#define OCFS2_CONTROL_PROTO                     "T01\n"
#define OCFS2_CONTROL_PROTO_LEN                 4

/*
 * Handshake states, stored per open file in ocfs2_control_private.op_state:
 *   INVALID  - nothing done yet (value of a freshly zeroed private struct)
 *   READ     - the client has read the whole protocol list
 *   PROTOCOL - the client has written a valid protocol tag
 *   VALID    - node number and protocol version were installed globally
 */
#define OCFS2_CONTROL_HANDSHAKE_INVALID         (0)
#define OCFS2_CONTROL_HANDSHAKE_READ            (1)
#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL        (2)
#define OCFS2_CONTROL_HANDSHAKE_VALID           (3)

/*
 * Messages: the 4-character op tags, the exact byte length of each whole
 * message (a write(2) must match it exactly), and the widths of the
 * fixed-size hex fields.
 */
#define OCFS2_CONTROL_MESSAGE_OP_LEN            4
#define OCFS2_CONTROL_MESSAGE_SETNODE_OP        "SETN"
#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN 14
#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP     "SETV"
#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN      11
#define OCFS2_CONTROL_MESSAGE_DOWN_OP           "DOWN"
#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN    47
#define OCFS2_TEXT_UUID_LEN                     32
#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN        2
#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN       8
105
/*
 * How a live connection learns its node number.  In this file only
 * WITH_CONTROLD is ever assigned; user_cluster_this_node() rejects any
 * other value with -EINVAL.
 */
enum ocfs2_connection_type {
        WITH_CONTROLD,
        NO_CONTROLD
};
110
/*
 * ocfs2_live_connection ties a mounted filesystem's cluster connection to
 * the control device.  The filesystem and miscdevice sides can detach in
 * different order.
 *
 * NOTE(review): the historical comment called this structure refcounted,
 * but no reference count is visible here; ocfs2_live_connection_drop()
 * frees it directly.
 */
struct ocfs2_live_connection {
        struct list_head                oc_list;        /* on ocfs2_live_connection_list */
        struct ocfs2_cluster_connection *oc_conn;       /* set on attach, cleared on drop */
        enum ocfs2_connection_type      oc_type;
        atomic_t                        oc_this_node;   /* written by user_recover_done() */
        int                             oc_our_slot;    /* written by user_recover_done() */
};
122
/* Per-open state of /dev/ocfs2_control, kept in file->private_data. */
struct ocfs2_control_private {
        struct list_head op_list;       /* on ocfs2_control_private_list */
        int op_state;                   /* one of OCFS2_CONTROL_HANDSHAKE_* */
        int op_this_node;               /* -1 until a SETN message sets it */
        struct ocfs2_protocol_version op_proto; /* filled in by a SETV message */
};
129
/*
 * SETN<space><8-char-hex-nodenum><newline>
 *
 * Byte-for-byte layout of the message as written by the client.
 */
struct ocfs2_control_message_setn {
        char    tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
        char    space;
        char    nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
        char    newline;        /* overwritten with '\0' to terminate nodestr */
};
137
/*
 * SETV<space><2-char-hex-major><space><2-char-hex-minor><newline>
 *
 * Byte-for-byte layout of the message as written by the client.
 */
struct ocfs2_control_message_setv {
        char    tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
        char    space1;
        char    major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
        char    space2;         /* overwritten with '\0' to terminate major */
        char    minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
        char    newline;        /* overwritten with '\0' to terminate minor */
};
147
/*
 * DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
 *
 * Byte-for-byte layout of the message as written by the client.
 */
struct ocfs2_control_message_down {
        char    tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
        char    space1;
        char    uuid[OCFS2_TEXT_UUID_LEN];
        char    space2;         /* overwritten with '\0' to terminate uuid */
        char    nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
        char    newline;        /* overwritten with '\0' to terminate nodestr */
};
157
/*
 * Any one T01 message.  The raw bytes are copied in once, the common
 * leading tag is inspected, and the matching member is then parsed.
 * Its size is the upper bound enforced by ocfs2_control_cfu().
 */
union ocfs2_control_message {
        char                                    tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
        struct ocfs2_control_message_setn       u_setn;
        struct ocfs2_control_message_setv       u_setv;
        struct ocfs2_control_message_down       u_down;
};
164
/* Forward declaration; the initializer is at the bottom of the file. */
static struct ocfs2_stack_plugin ocfs2_user_plugin;

/* Number of open control files that have reached HANDSHAKE_VALID. */
static atomic_t ocfs2_control_opened;
/* Node number installed by the control daemon; -1 while unset. */
static int ocfs2_control_this_node = -1;
/* Locking protocol version installed by the control daemon. */
static struct ocfs2_protocol_version running_proto;

static LIST_HEAD(ocfs2_live_connection_list);
static LIST_HEAD(ocfs2_control_private_list);
/* Taken around accesses to the lists and globals above. */
static DEFINE_MUTEX(ocfs2_control_lock);
174
175 static inline void ocfs2_control_set_handshake_state(struct file *file,
176                                                      int state)
177 {
178         struct ocfs2_control_private *p = file->private_data;
179         p->op_state = state;
180 }
181
182 static inline int ocfs2_control_get_handshake_state(struct file *file)
183 {
184         struct ocfs2_control_private *p = file->private_data;
185         return p->op_state;
186 }
187
188 static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
189 {
190         size_t len = strlen(name);
191         struct ocfs2_live_connection *c;
192
193         BUG_ON(!mutex_is_locked(&ocfs2_control_lock));
194
195         list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) {
196                 if ((c->oc_conn->cc_namelen == len) &&
197                     !strncmp(c->oc_conn->cc_name, name, len))
198                         return c;
199         }
200
201         return NULL;
202 }
203
204 /*
205  * ocfs2_live_connection structures are created underneath the ocfs2
206  * mount path.  Since the VFS prevents multiple calls to
207  * fill_super(), we can't get dupes here.
208  */
209 static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
210                                      struct ocfs2_live_connection *c)
211 {
212         int rc = 0;
213
214         mutex_lock(&ocfs2_control_lock);
215         c->oc_conn = conn;
216
217         if (atomic_read(&ocfs2_control_opened))
218                 list_add(&c->oc_list, &ocfs2_live_connection_list);
219         else {
220                 printk(KERN_ERR
221                        "ocfs2: Userspace control daemon is not present\n");
222                 rc = -ESRCH;
223         }
224
225         mutex_unlock(&ocfs2_control_lock);
226         return rc;
227 }
228
229 /*
230  * This function disconnects the cluster connection from ocfs2_control.
231  * Afterwards, userspace can't affect the cluster connection.
232  */
233 static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
234 {
235         mutex_lock(&ocfs2_control_lock);
236         list_del_init(&c->oc_list);
237         c->oc_conn = NULL;
238         mutex_unlock(&ocfs2_control_lock);
239
240         kfree(c);
241 }
242
243 static int ocfs2_control_cfu(void *target, size_t target_len,
244                              const char __user *buf, size_t count)
245 {
246         /* The T01 expects write(2) calls to have exactly one command */
247         if ((count != target_len) ||
248             (count > sizeof(union ocfs2_control_message)))
249                 return -EINVAL;
250
251         if (copy_from_user(target, buf, target_len))
252                 return -EFAULT;
253
254         return 0;
255 }
256
257 static ssize_t ocfs2_control_validate_protocol(struct file *file,
258                                                const char __user *buf,
259                                                size_t count)
260 {
261         ssize_t ret;
262         char kbuf[OCFS2_CONTROL_PROTO_LEN];
263
264         ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN,
265                                 buf, count);
266         if (ret)
267                 return ret;
268
269         if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN))
270                 return -EINVAL;
271
272         ocfs2_control_set_handshake_state(file,
273                                           OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
274
275         return count;
276 }
277
278 static void ocfs2_control_send_down(const char *uuid,
279                                     int nodenum)
280 {
281         struct ocfs2_live_connection *c;
282
283         mutex_lock(&ocfs2_control_lock);
284
285         c = ocfs2_connection_find(uuid);
286         if (c) {
287                 BUG_ON(c->oc_conn == NULL);
288                 c->oc_conn->cc_recovery_handler(nodenum,
289                                                 c->oc_conn->cc_recovery_data);
290         }
291
292         mutex_unlock(&ocfs2_control_lock);
293 }
294
295 /*
296  * Called whenever configuration elements are sent to /dev/ocfs2_control.
297  * If all configuration elements are present, try to set the global
298  * values.  If there is a problem, return an error.  Skip any missing
299  * elements, and only bump ocfs2_control_opened when we have all elements
300  * and are successful.
301  */
302 static int ocfs2_control_install_private(struct file *file)
303 {
304         int rc = 0;
305         int set_p = 1;
306         struct ocfs2_control_private *p = file->private_data;
307
308         BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
309
310         mutex_lock(&ocfs2_control_lock);
311
312         if (p->op_this_node < 0) {
313                 set_p = 0;
314         } else if ((ocfs2_control_this_node >= 0) &&
315                    (ocfs2_control_this_node != p->op_this_node)) {
316                 rc = -EINVAL;
317                 goto out_unlock;
318         }
319
320         if (!p->op_proto.pv_major) {
321                 set_p = 0;
322         } else if (!list_empty(&ocfs2_live_connection_list) &&
323                    ((running_proto.pv_major != p->op_proto.pv_major) ||
324                     (running_proto.pv_minor != p->op_proto.pv_minor))) {
325                 rc = -EINVAL;
326                 goto out_unlock;
327         }
328
329         if (set_p) {
330                 ocfs2_control_this_node = p->op_this_node;
331                 running_proto.pv_major = p->op_proto.pv_major;
332                 running_proto.pv_minor = p->op_proto.pv_minor;
333         }
334
335 out_unlock:
336         mutex_unlock(&ocfs2_control_lock);
337
338         if (!rc && set_p) {
339                 /* We set the global values successfully */
340                 atomic_inc(&ocfs2_control_opened);
341                 ocfs2_control_set_handshake_state(file,
342                                         OCFS2_CONTROL_HANDSHAKE_VALID);
343         }
344
345         return rc;
346 }
347
348 static int ocfs2_control_get_this_node(void)
349 {
350         int rc;
351
352         mutex_lock(&ocfs2_control_lock);
353         if (ocfs2_control_this_node < 0)
354                 rc = -EINVAL;
355         else
356                 rc = ocfs2_control_this_node;
357         mutex_unlock(&ocfs2_control_lock);
358
359         return rc;
360 }
361
362 static int ocfs2_control_do_setnode_msg(struct file *file,
363                                         struct ocfs2_control_message_setn *msg)
364 {
365         long nodenum;
366         char *ptr = NULL;
367         struct ocfs2_control_private *p = file->private_data;
368
369         if (ocfs2_control_get_handshake_state(file) !=
370             OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
371                 return -EINVAL;
372
373         if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
374                     OCFS2_CONTROL_MESSAGE_OP_LEN))
375                 return -EINVAL;
376
377         if ((msg->space != ' ') || (msg->newline != '\n'))
378                 return -EINVAL;
379         msg->space = msg->newline = '\0';
380
381         nodenum = simple_strtol(msg->nodestr, &ptr, 16);
382         if (!ptr || *ptr)
383                 return -EINVAL;
384
385         if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
386             (nodenum > INT_MAX) || (nodenum < 0))
387                 return -ERANGE;
388         p->op_this_node = nodenum;
389
390         return ocfs2_control_install_private(file);
391 }
392
393 static int ocfs2_control_do_setversion_msg(struct file *file,
394                                            struct ocfs2_control_message_setv *msg)
395  {
396         long major, minor;
397         char *ptr = NULL;
398         struct ocfs2_control_private *p = file->private_data;
399         struct ocfs2_protocol_version *max =
400                 &ocfs2_user_plugin.sp_max_proto;
401
402         if (ocfs2_control_get_handshake_state(file) !=
403             OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
404                 return -EINVAL;
405
406         if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
407                     OCFS2_CONTROL_MESSAGE_OP_LEN))
408                 return -EINVAL;
409
410         if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
411             (msg->newline != '\n'))
412                 return -EINVAL;
413         msg->space1 = msg->space2 = msg->newline = '\0';
414
415         major = simple_strtol(msg->major, &ptr, 16);
416         if (!ptr || *ptr)
417                 return -EINVAL;
418         minor = simple_strtol(msg->minor, &ptr, 16);
419         if (!ptr || *ptr)
420                 return -EINVAL;
421
422         /*
423          * The major must be between 1 and 255, inclusive.  The minor
424          * must be between 0 and 255, inclusive.  The version passed in
425          * must be within the maximum version supported by the filesystem.
426          */
427         if ((major == LONG_MIN) || (major == LONG_MAX) ||
428             (major > (u8)-1) || (major < 1))
429                 return -ERANGE;
430         if ((minor == LONG_MIN) || (minor == LONG_MAX) ||
431             (minor > (u8)-1) || (minor < 0))
432                 return -ERANGE;
433         if ((major != max->pv_major) ||
434             (minor > max->pv_minor))
435                 return -EINVAL;
436
437         p->op_proto.pv_major = major;
438         p->op_proto.pv_minor = minor;
439
440         return ocfs2_control_install_private(file);
441 }
442
443 static int ocfs2_control_do_down_msg(struct file *file,
444                                      struct ocfs2_control_message_down *msg)
445 {
446         long nodenum;
447         char *p = NULL;
448
449         if (ocfs2_control_get_handshake_state(file) !=
450             OCFS2_CONTROL_HANDSHAKE_VALID)
451                 return -EINVAL;
452
453         if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
454                     OCFS2_CONTROL_MESSAGE_OP_LEN))
455                 return -EINVAL;
456
457         if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
458             (msg->newline != '\n'))
459                 return -EINVAL;
460         msg->space1 = msg->space2 = msg->newline = '\0';
461
462         nodenum = simple_strtol(msg->nodestr, &p, 16);
463         if (!p || *p)
464                 return -EINVAL;
465
466         if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
467             (nodenum > INT_MAX) || (nodenum < 0))
468                 return -ERANGE;
469
470         ocfs2_control_send_down(msg->uuid, nodenum);
471
472         return 0;
473 }
474
475 static ssize_t ocfs2_control_message(struct file *file,
476                                      const char __user *buf,
477                                      size_t count)
478 {
479         ssize_t ret;
480         union ocfs2_control_message msg;
481
482         /* Try to catch padding issues */
483         WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) !=
484                 (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1)));
485
486         memset(&msg, 0, sizeof(union ocfs2_control_message));
487         ret = ocfs2_control_cfu(&msg, count, buf, count);
488         if (ret)
489                 goto out;
490
491         if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) &&
492             !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
493                      OCFS2_CONTROL_MESSAGE_OP_LEN))
494                 ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn);
495         else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) &&
496                  !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
497                           OCFS2_CONTROL_MESSAGE_OP_LEN))
498                 ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv);
499         else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) &&
500                  !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
501                           OCFS2_CONTROL_MESSAGE_OP_LEN))
502                 ret = ocfs2_control_do_down_msg(file, &msg.u_down);
503         else
504                 ret = -EINVAL;
505
506 out:
507         return ret ? ret : count;
508 }
509
510 static ssize_t ocfs2_control_write(struct file *file,
511                                    const char __user *buf,
512                                    size_t count,
513                                    loff_t *ppos)
514 {
515         ssize_t ret;
516
517         switch (ocfs2_control_get_handshake_state(file)) {
518                 case OCFS2_CONTROL_HANDSHAKE_INVALID:
519                         ret = -EINVAL;
520                         break;
521
522                 case OCFS2_CONTROL_HANDSHAKE_READ:
523                         ret = ocfs2_control_validate_protocol(file, buf,
524                                                               count);
525                         break;
526
527                 case OCFS2_CONTROL_HANDSHAKE_PROTOCOL:
528                 case OCFS2_CONTROL_HANDSHAKE_VALID:
529                         ret = ocfs2_control_message(file, buf, count);
530                         break;
531
532                 default:
533                         BUG();
534                         ret = -EIO;
535                         break;
536         }
537
538         return ret;
539 }
540
541 /*
542  * This is a naive version.  If we ever have a new protocol, we'll expand
543  * it.  Probably using seq_file.
544  */
545 static ssize_t ocfs2_control_read(struct file *file,
546                                   char __user *buf,
547                                   size_t count,
548                                   loff_t *ppos)
549 {
550         ssize_t ret;
551
552         ret = simple_read_from_buffer(buf, count, ppos,
553                         OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN);
554
555         /* Have we read the whole protocol list? */
556         if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN)
557                 ocfs2_control_set_handshake_state(file,
558                                                   OCFS2_CONTROL_HANDSHAKE_READ);
559
560         return ret;
561 }
562
563 static int ocfs2_control_release(struct inode *inode, struct file *file)
564 {
565         struct ocfs2_control_private *p = file->private_data;
566
567         mutex_lock(&ocfs2_control_lock);
568
569         if (ocfs2_control_get_handshake_state(file) !=
570             OCFS2_CONTROL_HANDSHAKE_VALID)
571                 goto out;
572
573         if (atomic_dec_and_test(&ocfs2_control_opened)) {
574                 if (!list_empty(&ocfs2_live_connection_list)) {
575                         /* XXX: Do bad things! */
576                         printk(KERN_ERR
577                                "ocfs2: Unexpected release of ocfs2_control!\n"
578                                "       Loss of cluster connection requires "
579                                "an emergency restart!\n");
580                         emergency_restart();
581                 }
582                 /*
583                  * Last valid close clears the node number and resets
584                  * the locking protocol version
585                  */
586                 ocfs2_control_this_node = -1;
587                 running_proto.pv_major = 0;
588                 running_proto.pv_major = 0;
589         }
590
591 out:
592         list_del_init(&p->op_list);
593         file->private_data = NULL;
594
595         mutex_unlock(&ocfs2_control_lock);
596
597         kfree(p);
598
599         return 0;
600 }
601
602 static int ocfs2_control_open(struct inode *inode, struct file *file)
603 {
604         struct ocfs2_control_private *p;
605
606         p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL);
607         if (!p)
608                 return -ENOMEM;
609         p->op_this_node = -1;
610
611         mutex_lock(&ocfs2_control_lock);
612         file->private_data = p;
613         list_add(&p->op_list, &ocfs2_control_private_list);
614         mutex_unlock(&ocfs2_control_lock);
615
616         return 0;
617 }
618
/* File operations of the ocfs2_control misc device. */
static const struct file_operations ocfs2_control_fops = {
        .open    = ocfs2_control_open,
        .release = ocfs2_control_release,
        .read    = ocfs2_control_read,
        .write   = ocfs2_control_write,
        .owner   = THIS_MODULE,
        .llseek  = default_llseek,
};
627
/* The "ocfs2_control" misc device; its minor number is assigned dynamically. */
static struct miscdevice ocfs2_control_device = {
        .minor          = MISC_DYNAMIC_MINOR,
        .name           = "ocfs2_control",
        .fops           = &ocfs2_control_fops,
};
633
634 static int ocfs2_control_init(void)
635 {
636         int rc;
637
638         atomic_set(&ocfs2_control_opened, 0);
639
640         rc = misc_register(&ocfs2_control_device);
641         if (rc)
642                 printk(KERN_ERR
643                        "ocfs2: Unable to register ocfs2_control device "
644                        "(errno %d)\n",
645                        -rc);
646
647         return rc;
648 }
649
650 static void ocfs2_control_exit(void)
651 {
652         int rc;
653
654         rc = misc_deregister(&ocfs2_control_device);
655         if (rc)
656                 printk(KERN_ERR
657                        "ocfs2: Unable to deregister ocfs2_control device "
658                        "(errno %d)\n",
659                        -rc);
660 }
661
662 static void fsdlm_lock_ast_wrapper(void *astarg)
663 {
664         struct ocfs2_dlm_lksb *lksb = astarg;
665         int status = lksb->lksb_fsdlm.sb_status;
666
667         /*
668          * For now we're punting on the issue of other non-standard errors
669          * where we can't tell if the unlock_ast or lock_ast should be called.
670          * The main "other error" that's possible is EINVAL which means the
671          * function was called with invalid args, which shouldn't be possible
672          * since the caller here is under our control.  Other non-standard
673          * errors probably fall into the same category, or otherwise are fatal
674          * which means we can't carry on anyway.
675          */
676
677         if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
678                 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
679         else
680                 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
681 }
682
/*
 * Blocking callback handed to dlm_lock(): forwards @level to the
 * filesystem's blocking_ast for the lksb passed as @astarg.
 */
static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
{
        struct ocfs2_dlm_lksb *lksb = astarg;

        lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
}
689
690 static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
691                          int mode,
692                          struct ocfs2_dlm_lksb *lksb,
693                          u32 flags,
694                          void *name,
695                          unsigned int namelen)
696 {
697         int ret;
698
699         if (!lksb->lksb_fsdlm.sb_lvbptr)
700                 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
701                                              sizeof(struct dlm_lksb);
702
703         ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
704                        flags|DLM_LKF_NODLCKWT, name, namelen, 0,
705                        fsdlm_lock_ast_wrapper, lksb,
706                        fsdlm_blocking_ast_wrapper);
707         return ret;
708 }
709
710 static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
711                            struct ocfs2_dlm_lksb *lksb,
712                            u32 flags)
713 {
714         int ret;
715
716         ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
717                          flags, &lksb->lksb_fsdlm, lksb);
718         return ret;
719 }
720
/* Stack operation: return the status fs/dlm stored in the lksb. */
static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
{
        return lksb->lksb_fsdlm.sb_status;
}
725
726 static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
727 {
728         int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
729
730         return !invalid;
731 }
732
733 static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
734 {
735         if (!lksb->lksb_fsdlm.sb_lvbptr)
736                 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
737                                              sizeof(struct dlm_lksb);
738         return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
739 }
740
/* Stack operation: intentionally empty, nothing is dumped for this stack. */
static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
{
}
744
745 static int user_plock(struct ocfs2_cluster_connection *conn,
746                       u64 ino,
747                       struct file *file,
748                       int cmd,
749                       struct file_lock *fl)
750 {
751         /*
752          * This more or less just demuxes the plock request into any
753          * one of three dlm calls.
754          *
755          * Internally, fs/dlm will pass these to a misc device, which
756          * a userspace daemon will read and write to.
757          *
758          * For now, cancel requests (which happen internally only),
759          * are turned into unlocks. Most of this function taken from
760          * gfs2_lock.
761          */
762
763         if (cmd == F_CANCELLK) {
764                 cmd = F_SETLK;
765                 fl->fl_type = F_UNLCK;
766         }
767
768         if (IS_GETLK(cmd))
769                 return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
770         else if (fl->fl_type == F_UNLCK)
771                 return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
772         else
773                 return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
774 }
775
776 /*
777  * Compare a requested locking protocol version against the current one.
778  *
779  * If the major numbers are different, they are incompatible.
780  * If the current minor is greater than the request, they are incompatible.
781  * If the current minor is less than or equal to the request, they are
782  * compatible, and the requester should run at the current minor version.
783  */
784 static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
785                                struct ocfs2_protocol_version *request)
786 {
787         if (existing->pv_major != request->pv_major)
788                 return 1;
789
790         if (existing->pv_minor > request->pv_minor)
791                 return 1;
792
793         if (existing->pv_minor < request->pv_minor)
794                 request->pv_minor = existing->pv_minor;
795
796         return 0;
797 }
798
/* Lockspace recover_prep callback: intentionally a no-op. */
static void user_recover_prep(void *arg)
{
}
802
803 static void user_recover_slot(void *arg, struct dlm_slot *slot)
804 {
805         struct ocfs2_cluster_connection *conn = arg;
806         printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
807                         slot->nodeid, slot->slot);
808         conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
809
810 }
811
812 static void user_recover_done(void *arg, struct dlm_slot *slots,
813                 int num_slots, int our_slot,
814                 uint32_t generation)
815 {
816         struct ocfs2_cluster_connection *conn = arg;
817         struct ocfs2_live_connection *lc = conn->cc_private;
818         int i;
819
820         for (i = 0; i < num_slots; i++)
821                 if (slots[i].slot == our_slot) {
822                         atomic_set(&lc->oc_this_node, slots[i].nodeid);
823                         break;
824                 }
825
826         lc->oc_our_slot = our_slot;
827 }
828
/*
 * Lockspace recovery callbacks.
 * NOTE(review): nothing in this file references this table yet;
 * user_cluster_connect() passes NULL ops to dlm_new_lockspace().
 */
const struct dlm_lockspace_ops ocfs2_ls_ops = {
        .recover_prep = user_recover_prep,
        .recover_slot = user_recover_slot,
        .recover_done = user_recover_done,
};
834
835 static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
836 {
837         dlm_lockspace_t *fsdlm;
838         struct ocfs2_live_connection *lc;
839         int rc;
840
841         BUG_ON(conn == NULL);
842
843         lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
844         if (!lc) {
845                 rc = -ENOMEM;
846                 goto out;
847         }
848
849         lc->oc_type = WITH_CONTROLD;
850         rc = ocfs2_live_connection_attach(conn, lc);
851         if (rc)
852                 goto out;
853
854         /*
855          * running_proto must have been set before we allowed any mounts
856          * to proceed.
857          */
858         if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
859                 printk(KERN_ERR
860                        "Unable to mount with fs locking protocol version "
861                        "%u.%u because the userspace control daemon has "
862                        "negotiated %u.%u\n",
863                        conn->cc_version.pv_major, conn->cc_version.pv_minor,
864                        running_proto.pv_major, running_proto.pv_minor);
865                 rc = -EPROTO;
866                 ocfs2_live_connection_drop(lc);
867                 lc = NULL;
868                 goto out;
869         }
870
871         rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
872                                NULL, NULL, NULL, &fsdlm);
873         if (rc) {
874                 ocfs2_live_connection_drop(lc);
875                 lc = NULL;
876                 goto out;
877         }
878
879         conn->cc_private = lc;
880         conn->cc_lockspace = fsdlm;
881 out:
882         if (rc && lc)
883                 kfree(lc);
884         return rc;
885 }
886
887 static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
888 {
889         dlm_release_lockspace(conn->cc_lockspace, 2);
890         conn->cc_lockspace = NULL;
891         ocfs2_live_connection_drop(conn->cc_private);
892         conn->cc_private = NULL;
893         return 0;
894 }
895
896 static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
897                                   unsigned int *this_node)
898 {
899         int rc;
900         struct ocfs2_live_connection *lc = conn->cc_private;
901
902         if (lc->oc_type == WITH_CONTROLD)
903                 rc = ocfs2_control_get_this_node();
904         else
905                 rc = -EINVAL;
906         if (rc < 0)
907                 return rc;
908
909         *this_node = rc;
910         return 0;
911 }
912
/* Stack operations implemented by this file on top of fs/dlm. */
static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
        .connect        = user_cluster_connect,
        .disconnect     = user_cluster_disconnect,
        .this_node      = user_cluster_this_node,
        .dlm_lock       = user_dlm_lock,
        .dlm_unlock     = user_dlm_unlock,
        .lock_status    = user_dlm_lock_status,
        .lvb_valid      = user_dlm_lvb_valid,
        .lock_lvb       = user_dlm_lvb,
        .plock          = user_plock,
        .dump_lksb      = user_dlm_dump_lksb,
};
925
/* The "user" stack plugin that is registered with the stack glue. */
static struct ocfs2_stack_plugin ocfs2_user_plugin = {
        .sp_name        = "user",
        .sp_ops         = &ocfs2_user_plugin_ops,
        .sp_owner       = THIS_MODULE,
};
931
932
933 static int __init ocfs2_user_plugin_init(void)
934 {
935         int rc;
936
937         rc = ocfs2_control_init();
938         if (!rc) {
939                 rc = ocfs2_stack_glue_register(&ocfs2_user_plugin);
940                 if (rc)
941                         ocfs2_control_exit();
942         }
943
944         return rc;
945 }
946
/* Module exit: undo ocfs2_user_plugin_init() in reverse order. */
static void __exit ocfs2_user_plugin_exit(void)
{
        ocfs2_stack_glue_unregister(&ocfs2_user_plugin);
        ocfs2_control_exit();
}
952
/* Module metadata and entry points. */
MODULE_AUTHOR("Oracle");
MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
MODULE_LICENSE("GPL");
module_init(ocfs2_user_plugin_init);
module_exit(ocfs2_user_plugin_exit);