/*
2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu>
5 This program can be distributed under the terms of the GNU GPL.
*/
11 #include <linux/init.h>
12 #include <linux/module.h>
13 #include <linux/poll.h>
14 #include <linux/uio.h>
15 #include <linux/miscdevice.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/slab.h>
20 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
22 static kmem_cache_t *fuse_req_cachep;
24 static struct fuse_conn *fuse_get_conn(struct file *file)
27 spin_lock(&fuse_lock);
28 fc = file->private_data;
29 if (fc && !fc->connected)
31 spin_unlock(&fuse_lock);
35 static void fuse_request_init(struct fuse_req *req)
37 memset(req, 0, sizeof(*req));
38 INIT_LIST_HEAD(&req->list);
39 init_waitqueue_head(&req->waitq);
40 atomic_set(&req->count, 1);
43 struct fuse_req *fuse_request_alloc(void)
45 struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL);
47 fuse_request_init(req);
51 void fuse_request_free(struct fuse_req *req)
53 kmem_cache_free(fuse_req_cachep, req);
56 static void block_sigs(sigset_t *oldset)
60 siginitsetinv(&mask, sigmask(SIGKILL));
61 sigprocmask(SIG_BLOCK, &mask, oldset);
64 static void restore_sigs(sigset_t *oldset)
66 sigprocmask(SIG_SETMASK, oldset, NULL);
69 void fuse_reset_request(struct fuse_req *req)
71 int preallocated = req->preallocated;
72 BUG_ON(atomic_read(&req->count) != 1);
73 fuse_request_init(req);
74 req->preallocated = preallocated;
77 static void __fuse_get_request(struct fuse_req *req)
79 atomic_inc(&req->count);
82 /* Must be called with > 1 refcount */
83 static void __fuse_put_request(struct fuse_req *req)
85 BUG_ON(atomic_read(&req->count) < 2);
86 atomic_dec(&req->count);
89 static struct fuse_req *do_get_request(struct fuse_conn *fc)
93 spin_lock(&fuse_lock);
94 BUG_ON(list_empty(&fc->unused_list));
95 req = list_entry(fc->unused_list.next, struct fuse_req, list);
96 list_del_init(&req->list);
97 spin_unlock(&fuse_lock);
98 fuse_request_init(req);
99 req->preallocated = 1;
100 req->in.h.uid = current->fsuid;
101 req->in.h.gid = current->fsgid;
102 req->in.h.pid = current->pid;
106 /* This can return NULL, but only in case it's interrupted by a SIGKILL */
107 struct fuse_req *fuse_get_request(struct fuse_conn *fc)
112 atomic_inc(&fc->num_waiting);
114 intr = down_interruptible(&fc->outstanding_sem);
115 restore_sigs(&oldset);
117 atomic_dec(&fc->num_waiting);
120 return do_get_request(fc);
123 static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
125 spin_lock(&fuse_lock);
126 if (req->preallocated) {
127 atomic_dec(&fc->num_waiting);
128 list_add(&req->list, &fc->unused_list);
130 fuse_request_free(req);
132 /* If we are in debt decrease that first */
133 if (fc->outstanding_debt)
134 fc->outstanding_debt--;
136 up(&fc->outstanding_sem);
137 spin_unlock(&fuse_lock);
140 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
142 if (atomic_dec_and_test(&req->count))
143 fuse_putback_request(fc, req);
146 void fuse_release_background(struct fuse_req *req)
152 spin_lock(&fuse_lock);
153 list_del(&req->bg_entry);
154 spin_unlock(&fuse_lock);
157 static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
160 struct fuse_init_out *arg = &req->misc.init_out;
162 if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
165 fc->minor = arg->minor;
166 fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
169 /* After INIT reply is received other requests can go
170 out. So do (FUSE_MAX_OUTSTANDING - 1) number of
171 up()s on outstanding_sem. The last up() is done in
172 fuse_putback_request() */
173 for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
174 up(&fc->outstanding_sem);
178 * This function is called when a request is finished. Either a reply
179 * has arrived or it was interrupted (and not yet sent) or some error
180 * occurred during communication with userspace, or the device file
181 * was closed. In case of a background request the reference to the
182 * stored objects are released. The requester thread is woken up (if
183 * still waiting), and finally the reference to the request is
186 * Called with fuse_lock, unlocks it
188 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
190 list_del(&req->list);
191 req->state = FUSE_REQ_FINISHED;
192 spin_unlock(&fuse_lock);
193 if (req->background) {
194 down_read(&fc->sbput_sem);
196 fuse_release_background(req);
197 up_read(&fc->sbput_sem);
199 wake_up(&req->waitq);
200 if (req->in.h.opcode == FUSE_INIT)
201 process_init_reply(fc, req);
202 else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
203 /* Special case for failed iget in CREATE */
204 u64 nodeid = req->in.h.nodeid;
205 fuse_reset_request(req);
206 fuse_send_forget(fc, req, nodeid, 1);
209 fuse_put_request(fc, req);
213 * Unfortunately request interruption not just solves the deadlock
214 * problem, it causes problems too. These stem from the fact, that an
215 * interrupted request is continued to be processed in userspace,
216 * while all the locks and object references (inode and file) held
217 * during the operation are released.
219 * To release the locks is exactly why there's a need to interrupt the
220 * request, so there's not a lot that can be done about this, except
221 * introduce additional locking in userspace.
223 * More important is to keep inode and file references until userspace
224 * has replied, otherwise FORGET and RELEASE could be sent while the
225 * inode/file is still used by the filesystem.
227 * For this reason the concept of "background" request is introduced.
228 * An interrupted request is backgrounded if it has been already sent
229 * to userspace. Backgrounding involves getting an extra reference to
230 * inode(s) or file used in the request, and adding the request to
231 * fc->background list. When a reply is received for a background
232 * request, the object references are released, and the request is
233 * removed from the list. If the filesystem is unmounted while there
234 * are still background requests, the list is walked and references
235 * are released as if a reply was received.
237 * There's one more use for a background request. The RELEASE message is
238 * always sent as background, since it doesn't return an error or
241 static void background_request(struct fuse_conn *fc, struct fuse_req *req)
244 list_add(&req->bg_entry, &fc->background);
246 req->inode = igrab(req->inode);
248 req->inode2 = igrab(req->inode2);
253 /* Called with fuse_lock held. Releases, and then reacquires it. */
254 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
258 spin_unlock(&fuse_lock);
260 wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
261 restore_sigs(&oldset);
262 spin_lock(&fuse_lock);
263 if (req->state == FUSE_REQ_FINISHED)
266 req->out.h.error = -EINTR;
267 req->interrupted = 1;
269 /* This is uninterruptible sleep, because data is
270 being copied to/from the buffers of req. During
271 locked state, there mustn't be any filesystem
272 operation (e.g. page fault), since that could lead
274 spin_unlock(&fuse_lock);
275 wait_event(req->waitq, !req->locked);
276 spin_lock(&fuse_lock);
278 if (req->state == FUSE_REQ_PENDING) {
279 list_del(&req->list);
280 __fuse_put_request(req);
281 } else if (req->state == FUSE_REQ_SENT)
282 background_request(fc, req);
285 static unsigned len_args(unsigned numargs, struct fuse_arg *args)
290 for (i = 0; i < numargs; i++)
291 nbytes += args[i].size;
296 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
299 /* zero is special */
302 req->in.h.unique = fc->reqctr;
303 req->in.h.len = sizeof(struct fuse_in_header) +
304 len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
305 if (!req->preallocated) {
306 /* If request is not preallocated (either FORGET or
307 RELEASE), then still decrease outstanding_sem, so
308 user can't open infinite number of files while not
309 processing the RELEASE requests. However for
310 efficiency do it without blocking, so if down()
311 would block, just increase the debt instead */
312 if (down_trylock(&fc->outstanding_sem))
313 fc->outstanding_debt++;
315 list_add_tail(&req->list, &fc->pending);
316 req->state = FUSE_REQ_PENDING;
321 * This can only be interrupted by a SIGKILL
323 void request_send(struct fuse_conn *fc, struct fuse_req *req)
326 spin_lock(&fuse_lock);
328 req->out.h.error = -ENOTCONN;
329 else if (fc->conn_error)
330 req->out.h.error = -ECONNREFUSED;
332 queue_request(fc, req);
333 /* acquire extra reference, since request is still needed
334 after request_end() */
335 __fuse_get_request(req);
337 request_wait_answer(fc, req);
339 spin_unlock(&fuse_lock);
342 static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
344 spin_lock(&fuse_lock);
346 queue_request(fc, req);
347 spin_unlock(&fuse_lock);
349 req->out.h.error = -ENOTCONN;
350 request_end(fc, req);
354 void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
357 request_send_nowait(fc, req);
360 void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
363 spin_lock(&fuse_lock);
364 background_request(fc, req);
365 spin_unlock(&fuse_lock);
366 request_send_nowait(fc, req);
369 void fuse_send_init(struct fuse_conn *fc)
371 /* This is called from fuse_read_super() so there's guaranteed
372 to be exactly one request available */
373 struct fuse_req *req = fuse_get_request(fc);
374 struct fuse_init_in *arg = &req->misc.init_in;
375 arg->major = FUSE_KERNEL_VERSION;
376 arg->minor = FUSE_KERNEL_MINOR_VERSION;
377 req->in.h.opcode = FUSE_INIT;
379 req->in.args[0].size = sizeof(*arg);
380 req->in.args[0].value = arg;
381 req->out.numargs = 1;
382 /* Variable length arguement used for backward compatibility
383 with interface version < 7.5. Rest of init_out is zeroed
384 by do_get_request(), so a short reply is not a problem */
386 req->out.args[0].size = sizeof(struct fuse_init_out);
387 req->out.args[0].value = &req->misc.init_out;
388 request_send_background(fc, req);
392 * Lock the request. Up to the next unlock_request() there mustn't be
393 * anything that could cause a page-fault. If the request was already
394 * interrupted bail out.
396 static int lock_request(struct fuse_req *req)
400 spin_lock(&fuse_lock);
401 if (req->interrupted)
405 spin_unlock(&fuse_lock);
411 * Unlock request. If it was interrupted during being locked, the
412 * requester thread is currently waiting for it to be unlocked, so
415 static void unlock_request(struct fuse_req *req)
418 spin_lock(&fuse_lock);
420 if (req->interrupted)
421 wake_up(&req->waitq);
422 spin_unlock(&fuse_lock);
426 struct fuse_copy_state {
428 struct fuse_req *req;
429 const struct iovec *iov;
430 unsigned long nr_segs;
431 unsigned long seglen;
439 static void fuse_copy_init(struct fuse_copy_state *cs, int write,
440 struct fuse_req *req, const struct iovec *iov,
441 unsigned long nr_segs)
443 memset(cs, 0, sizeof(*cs));
447 cs->nr_segs = nr_segs;
450 /* Unmap and put previous page of userspace buffer */
451 static void fuse_copy_finish(struct fuse_copy_state *cs)
454 kunmap_atomic(cs->mapaddr, KM_USER0);
456 flush_dcache_page(cs->pg);
457 set_page_dirty_lock(cs->pg);
465 * Get another pagefull of userspace buffer, and map it to kernel
466 * address space, and lock request
468 static int fuse_copy_fill(struct fuse_copy_state *cs)
470 unsigned long offset;
473 unlock_request(cs->req);
474 fuse_copy_finish(cs);
476 BUG_ON(!cs->nr_segs);
477 cs->seglen = cs->iov[0].iov_len;
478 cs->addr = (unsigned long) cs->iov[0].iov_base;
482 down_read(¤t->mm->mmap_sem);
483 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
485 up_read(¤t->mm->mmap_sem);
489 offset = cs->addr % PAGE_SIZE;
490 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
491 cs->buf = cs->mapaddr + offset;
492 cs->len = min(PAGE_SIZE - offset, cs->seglen);
493 cs->seglen -= cs->len;
496 return lock_request(cs->req);
499 /* Do as much copy to/from userspace buffer as we can */
500 static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
502 unsigned ncpy = min(*size, cs->len);
505 memcpy(cs->buf, *val, ncpy);
507 memcpy(*val, cs->buf, ncpy);
517 * Copy a page in the request to/from the userspace buffer. Must be
520 static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
521 unsigned offset, unsigned count, int zeroing)
523 if (page && zeroing && count < PAGE_SIZE) {
524 void *mapaddr = kmap_atomic(page, KM_USER1);
525 memset(mapaddr, 0, PAGE_SIZE);
526 kunmap_atomic(mapaddr, KM_USER1);
530 if (!cs->len && (err = fuse_copy_fill(cs)))
533 void *mapaddr = kmap_atomic(page, KM_USER1);
534 void *buf = mapaddr + offset;
535 offset += fuse_copy_do(cs, &buf, &count);
536 kunmap_atomic(mapaddr, KM_USER1);
538 offset += fuse_copy_do(cs, NULL, &count);
540 if (page && !cs->write)
541 flush_dcache_page(page);
545 /* Copy pages in the request to/from userspace buffer */
546 static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
550 struct fuse_req *req = cs->req;
551 unsigned offset = req->page_offset;
552 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
554 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
555 struct page *page = req->pages[i];
556 int err = fuse_copy_page(cs, page, offset, count, zeroing);
561 count = min(nbytes, (unsigned) PAGE_SIZE);
567 /* Copy a single argument in the request to/from userspace buffer */
568 static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
572 if (!cs->len && (err = fuse_copy_fill(cs)))
574 fuse_copy_do(cs, &val, &size);
579 /* Copy request arguments to/from userspace buffer */
580 static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
581 unsigned argpages, struct fuse_arg *args,
587 for (i = 0; !err && i < numargs; i++) {
588 struct fuse_arg *arg = &args[i];
589 if (i == numargs - 1 && argpages)
590 err = fuse_copy_pages(cs, arg->size, zeroing);
592 err = fuse_copy_one(cs, arg->value, arg->size);
597 /* Wait until a request is available on the pending list */
598 static void request_wait(struct fuse_conn *fc)
600 DECLARE_WAITQUEUE(wait, current);
602 add_wait_queue_exclusive(&fc->waitq, &wait);
603 while (fc->connected && list_empty(&fc->pending)) {
604 set_current_state(TASK_INTERRUPTIBLE);
605 if (signal_pending(current))
608 spin_unlock(&fuse_lock);
610 spin_lock(&fuse_lock);
612 set_current_state(TASK_RUNNING);
613 remove_wait_queue(&fc->waitq, &wait);
617 * Read a single request into the userspace filesystem's buffer. This
618 * function waits until a request is available, then removes it from
619 * the pending list and copies request data to userspace buffer. If
620 * no reply is needed (FORGET) or request has been interrupted or
621 * there was an error during the copying then it's finished by calling
622 * request_end(). Otherwise add it to the processing list, and set
625 static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
626 unsigned long nr_segs, loff_t *off)
629 struct fuse_conn *fc;
630 struct fuse_req *req;
632 struct fuse_copy_state cs;
636 spin_lock(&fuse_lock);
637 fc = file->private_data;
646 if (list_empty(&fc->pending))
649 req = list_entry(fc->pending.next, struct fuse_req, list);
650 req->state = FUSE_REQ_READING;
651 list_move(&req->list, &fc->io);
655 /* If request is too large, reply with an error and restart the read */
656 if (iov_length(iov, nr_segs) < reqsize) {
657 req->out.h.error = -EIO;
658 /* SETXATTR is special, since it may contain too large data */
659 if (in->h.opcode == FUSE_SETXATTR)
660 req->out.h.error = -E2BIG;
661 request_end(fc, req);
664 spin_unlock(&fuse_lock);
665 fuse_copy_init(&cs, 1, req, iov, nr_segs);
666 err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
668 err = fuse_copy_args(&cs, in->numargs, in->argpages,
669 (struct fuse_arg *) in->args, 0);
670 fuse_copy_finish(&cs);
671 spin_lock(&fuse_lock);
673 if (!err && req->interrupted)
676 if (!req->interrupted)
677 req->out.h.error = -EIO;
678 request_end(fc, req);
682 request_end(fc, req);
684 req->state = FUSE_REQ_SENT;
685 list_move_tail(&req->list, &fc->processing);
686 spin_unlock(&fuse_lock);
691 spin_unlock(&fuse_lock);
695 static ssize_t fuse_dev_read(struct file *file, char __user *buf,
696 size_t nbytes, loff_t *off)
699 iov.iov_len = nbytes;
701 return fuse_dev_readv(file, &iov, 1, off);
704 /* Look up request on processing list by unique ID */
705 static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
707 struct list_head *entry;
709 list_for_each(entry, &fc->processing) {
710 struct fuse_req *req;
711 req = list_entry(entry, struct fuse_req, list);
712 if (req->in.h.unique == unique)
718 static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
721 unsigned reqsize = sizeof(struct fuse_out_header);
724 return nbytes != reqsize ? -EINVAL : 0;
726 reqsize += len_args(out->numargs, out->args);
728 if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
730 else if (reqsize > nbytes) {
731 struct fuse_arg *lastarg = &out->args[out->numargs-1];
732 unsigned diffsize = reqsize - nbytes;
733 if (diffsize > lastarg->size)
735 lastarg->size -= diffsize;
737 return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
742 * Write a single reply to a request. First the header is copied from
743 * the write buffer. The request is then searched on the processing
744 * list by the unique ID found in the header. If found, then remove
745 * it from the list and copy the rest of the buffer to the request.
746 * The request is finished by calling request_end()
748 static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
749 unsigned long nr_segs, loff_t *off)
752 unsigned nbytes = iov_length(iov, nr_segs);
753 struct fuse_req *req;
754 struct fuse_out_header oh;
755 struct fuse_copy_state cs;
756 struct fuse_conn *fc = fuse_get_conn(file);
760 fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
761 if (nbytes < sizeof(struct fuse_out_header))
764 err = fuse_copy_one(&cs, &oh, sizeof(oh));
768 if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
772 spin_lock(&fuse_lock);
773 req = request_find(fc, oh.unique);
778 if (req->interrupted) {
779 spin_unlock(&fuse_lock);
780 fuse_copy_finish(&cs);
781 spin_lock(&fuse_lock);
782 request_end(fc, req);
785 list_move(&req->list, &fc->io);
789 spin_unlock(&fuse_lock);
791 err = copy_out_args(&cs, &req->out, nbytes);
792 fuse_copy_finish(&cs);
794 spin_lock(&fuse_lock);
797 if (req->interrupted)
799 } else if (!req->interrupted)
800 req->out.h.error = -EIO;
801 request_end(fc, req);
803 return err ? err : nbytes;
806 spin_unlock(&fuse_lock);
808 fuse_copy_finish(&cs);
812 static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
813 size_t nbytes, loff_t *off)
816 iov.iov_len = nbytes;
817 iov.iov_base = (char __user *) buf;
818 return fuse_dev_writev(file, &iov, 1, off);
821 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
823 struct fuse_conn *fc = fuse_get_conn(file);
824 unsigned mask = POLLOUT | POLLWRNORM;
829 poll_wait(file, &fc->waitq, wait);
831 spin_lock(&fuse_lock);
832 if (!list_empty(&fc->pending))
833 mask |= POLLIN | POLLRDNORM;
834 spin_unlock(&fuse_lock);
839 /* Abort all requests on the given list (pending or processing) */
840 static void end_requests(struct fuse_conn *fc, struct list_head *head)
842 while (!list_empty(head)) {
843 struct fuse_req *req;
844 req = list_entry(head->next, struct fuse_req, list);
845 req->out.h.error = -ECONNABORTED;
846 request_end(fc, req);
847 spin_lock(&fuse_lock);
851 static int fuse_dev_release(struct inode *inode, struct file *file)
853 struct fuse_conn *fc;
855 spin_lock(&fuse_lock);
856 fc = file->private_data;
859 end_requests(fc, &fc->pending);
860 end_requests(fc, &fc->processing);
862 spin_unlock(&fuse_lock);
864 kobject_put(&fc->kobj);
869 struct file_operations fuse_dev_operations = {
870 .owner = THIS_MODULE,
872 .read = fuse_dev_read,
873 .readv = fuse_dev_readv,
874 .write = fuse_dev_write,
875 .writev = fuse_dev_writev,
876 .poll = fuse_dev_poll,
877 .release = fuse_dev_release,
880 static struct miscdevice fuse_miscdevice = {
883 .fops = &fuse_dev_operations,
886 int __init fuse_dev_init(void)
889 fuse_req_cachep = kmem_cache_create("fuse_request",
890 sizeof(struct fuse_req),
892 if (!fuse_req_cachep)
895 err = misc_register(&fuse_miscdevice);
897 goto out_cache_clean;
902 kmem_cache_destroy(fuse_req_cachep);
907 void fuse_dev_cleanup(void)
909 misc_deregister(&fuse_miscdevice);
910 kmem_cache_destroy(fuse_req_cachep);