2 * (C) 2001 Clemson University and The University of Chicago
4 * See COPYING in top-level directory.
8 * Linux VFS file operations.
12 #include "orangefs-kernel.h"
13 #include "orangefs-bufmap.h"
15 #include <linux/pagemap.h>
18 * Copy to client-core's address space from the buffers specified
19 * by the iovec up to total_size bytes.
20 * NOTE: the iovector can either contain addresses, which
21 * can further be kernel-space or user-space addresses,
22 * or it can contain pointers to struct pages.
/*
 * precopy_buffers - copy caller data into the shared bufmap.
 *
 * Pulls data out of @iter into @bufmap by calling
 * orangefs_bufmap_copy_from_iovec(). The failure paths visible here both
 * log the same "copy-in" message through gossip_err().
 * NOTE(review): parts of this definition are not visible in this view;
 * confirm the remaining parameters and the return convention against the
 * full source.
 */
24 static int precopy_buffers(struct orangefs_bufmap *bufmap,
26 struct iov_iter *iter,
31 * copy data from application/kernel by pulling it out
37 ret = orangefs_bufmap_copy_from_iovec(bufmap,
42 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
48 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
55 * Copy from client-core's address space to the buffers specified
56 * by the iovec up to total_size bytes.
57 * NOTE: the iovector can either contain addresses, which
58 * can further be kernel-space or user-space addresses,
59 * or it can contain pointers to struct pages.
/*
 * postcopy_buffers - copy data out of the shared bufmap to the caller.
 *
 * Pushes data from @bufmap into @iter by calling
 * orangefs_bufmap_copy_to_iovec() and logs a "copy-out" failure through
 * gossip_err().
 * NOTE(review): parts of this definition are not visible in this view;
 * confirm the remaining parameters and the return convention against the
 * full source.
 */
61 static int postcopy_buffers(struct orangefs_bufmap *bufmap,
63 struct iov_iter *iter,
68 * copy data to application/kernel by pushing it out to
69 * the iovec. NOTE; target buffers can be addresses or
70 * struct page pointers.
73 ret = orangefs_bufmap_copy_to_iovec(bufmap,
78 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
86 * handles two possible error cases, depending on context.
88 * by design, our vfs i/o errors need to be handled in one of two ways,
89 * depending on where the error occurred.
91 * if the error happens in the waitqueue code because we either timed
92 * out or a signal was raised while waiting, we need to cancel the
93 * userspace i/o operation and free the op manually. this is done to
94 * avoid having the device start writing application data to our shared
95 * bufmap pages without us expecting it.
97 * FIXME: POSSIBLE OPTIMIZATION:
98 * However, if we timed out or if we got a signal AND our upcall was never
99 * picked off the queue (i.e. we were in OP_VFS_STATE_WAITING), then we don't
100 * need to send a cancellation upcall. One way to handle this is to
101 * set error_exit to 2 in such cases and to 1 whenever a cancellation has
102 * to be sent, and have handle_error
103 * take care of this situation as well.
105 * if an orangefs sysint level error occurred and i/o has been completed,
106 * there is no need to cancel the operation, as the user has finished
107 * using the bufmap page and so there is no danger in this case. in
108 * this case, we wake up the device normally so that it may free the
111 * note the only reason this is a macro is because both read and write
112 * cases need the exact same handling code.
/*
 * handle_io_error - shared error cleanup for the read and write paths.
 *
 * Expands in the caller's scope and uses the caller's new_op, bufmap and
 * buffer_index variables directly. An op that was not serviced is
 * cancelled by tag via orangefs_cancel_op_in_progress(); the visible lines
 * also complete new_op->done and return the bufmap slot with
 * orangefs_bufmap_put().
 * NOTE(review): some continuation lines of this macro are not visible in
 * this view, so the exact branch structure should be confirmed.
 */
114 #define handle_io_error() \
116 if (!op_state_serviced(new_op)) { \
117 orangefs_cancel_op_in_progress(new_op->tag); \
119 complete(&new_op->done); \
121 orangefs_bufmap_put(bufmap, buffer_index); \
126 * Post and wait for the I/O upcall to finish
/*
 * wait_for_direct_io - issue one synchronous file I/O upcall and wait for it.
 *
 * @type:           ORANGEFS_IO_READ or ORANGEFS_IO_WRITE; stored in the upcall.
 * @inode:          inode whose refn identifies the file in the upcall.
 * @offset:         file position; *offset is stored in the upcall.
 * @iter:           iov_iter for the caller's data (presumably handed to the
 *                  pre/post copy helpers; their full argument lists are not
 *                  visible here).
 * @total_size:     byte count stored in the upcall.
 * @readahead_size: stored in the upcall unchanged.
 *
 * Flow visible here: allocate an ORANGEFS_VFS_OP_FILE_IO op, obtain a
 * shared bufmap slot, precopy for writes, service the op, postcopy for
 * reads, then take the result from downcall.resp.io.amt_complete. When
 * service_operation() returns -EAGAIN and the op was purged, the slot is
 * released and the sequence restarts at populate_shared_memory.
 * NOTE(review): error-path lines are only partly visible in this view.
 */
128 static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
129 loff_t *offset, struct iov_iter *iter,
130 size_t total_size, loff_t readahead_size)
132 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
133 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
134 struct orangefs_bufmap *bufmap = NULL;
135 struct orangefs_kernel_op_s *new_op = NULL;
136 int buffer_index = -1;
139 new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
143 /* synchronous I/O */
144 new_op->upcall.req.io.async_vfs_io = ORANGEFS_VFS_SYNC_IO;
145 new_op->upcall.req.io.readahead_size = readahead_size;
146 new_op->upcall.req.io.io_type = type;
147 new_op->upcall.req.io.refn = orangefs_inode->refn;
149 populate_shared_memory:
150 /* get a shared buffer index */
151 ret = orangefs_bufmap_get(&bufmap, &buffer_index);
153 gossip_debug(GOSSIP_FILE_DEBUG,
154 "%s: orangefs_bufmap_get failure (%ld)\n",
155 __func__, (long)ret);
158 gossip_debug(GOSSIP_FILE_DEBUG,
159 "%s(%pU): GET op %p -> buffer_index %d\n",
/* Record the shared-buffer slot and the transfer window in the upcall. */
165 new_op->uses_shared_memory = 1;
166 new_op->upcall.req.io.buf_index = buffer_index;
167 new_op->upcall.req.io.count = total_size;
168 new_op->upcall.req.io.offset = *offset;
170 gossip_debug(GOSSIP_FILE_DEBUG,
171 "%s(%pU): offset: %llu total_size: %zd\n",
177 * Stage 1: copy the buffers into client-core's address space
178 * precopy_buffers only pertains to writes.
180 if (type == ORANGEFS_IO_WRITE) {
181 ret = precopy_buffers(bufmap,
189 gossip_debug(GOSSIP_FILE_DEBUG,
190 "%s(%pU): Calling post_io_request with tag (%llu)\n",
195 /* Stage 2: Service the I/O operation */
196 ret = service_operation(new_op,
197 type == ORANGEFS_IO_WRITE ?
200 get_interruptible_flag(inode));
203 * If service_operation() returns -EAGAIN #and# the operation was
204 * purged from orangefs_request_list or htable_ops_in_progress, then
205 * we know that the client was restarted, causing the shared memory
206 * area to be wiped clean. To restart a write operation in this
207 * case, we must re-copy the data from the user's iovec to a NEW
208 * shared memory location. To restart a read operation, we must get
209 * a new shared memory location.
211 if (ret == -EAGAIN && op_state_purged(new_op)) {
212 orangefs_bufmap_put(bufmap, buffer_index);
213 gossip_debug(GOSSIP_FILE_DEBUG,
214 "%s:going to repopulate_shared_memory.\n",
216 goto populate_shared_memory;
222 * don't write an error to syslog on signaled operation
223 * termination unless we've got debugging turned on, as
224 * this can happen regularly (i.e. ctrl-c)
227 gossip_debug(GOSSIP_FILE_DEBUG,
228 "%s: returning error %ld\n", __func__,
231 gossip_err("%s: error in %s handle %pU, returning %zd\n",
233 type == ORANGEFS_IO_READ ?
234 "read from" : "write to",
240 * Stage 3: Post copy buffers from client-core's address space
241 * postcopy_buffers only pertains to reads.
243 if (type == ORANGEFS_IO_READ) {
244 ret = postcopy_buffers(bufmap,
247 new_op->downcall.resp.io.amt_complete);
250 * put error codes in downcall so that handle_io_error()
251 * preserves it properly
253 WARN_ON(!op_state_serviced(new_op));
254 new_op->downcall.status = ret;
259 gossip_debug(GOSSIP_FILE_DEBUG,
260 "%s(%pU): Amount written as returned by the sys-io call:%d\n",
263 (int)new_op->downcall.resp.io.amt_complete);
/* The amount completed, as reported in the downcall, becomes the result. */
265 ret = new_op->downcall.resp.io.amt_complete;
268 * tell the device file owner waiting on I/O that this read has
269 * completed and it can return now.
271 complete(&new_op->done);
/*
 * buffer_index is initialised to -1, so a non-negative value here
 * presumably means a bufmap slot is still held and must be returned.
 */
274 if (buffer_index >= 0) {
275 orangefs_bufmap_put(bufmap, buffer_index);
276 gossip_debug(GOSSIP_FILE_DEBUG,
277 "%s(%pU): PUT buffer_index %d\n",
278 __func__, handle, buffer_index);
286 * Common entry point for read/write/readv/writev
287 * This function will dispatch it to either the direct I/O
288 * or buffered I/O path depending on the mount options and/or
289 * augmented/extended metadata attached to the file.
290 * Note: File extended attributes override any mount options.
/*
 * do_readv_writev - transfer the contents of @iter in bufmap-sized pieces.
 *
 * @type:   ORANGEFS_IO_READ or ORANGEFS_IO_WRITE, passed to
 *          wait_for_direct_io().
 * @file:   open file; its f_mapping->host supplies the inode.
 * @offset: file position, passed through to wait_for_direct_io().
 * @iter:   caller's iov_iter; the loop runs while it still has bytes.
 *
 * Each pass caps the piece size at orangefs_bufmap_size_query() and hands
 * it to wait_for_direct_io(). A pass that completes fewer bytes than
 * requested is treated as a short I/O (see the comment ahead of that test).
 * NOTE(review): the lines between the "type == ORANGEFS_IO_READ" test and
 * the mtime update are not visible here; the mtime update presumably
 * belongs to the write branch - confirm against the full source.
 */
292 static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
293 loff_t *offset, struct iov_iter *iter)
295 struct inode *inode = file->f_mapping->host;
296 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
297 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
298 size_t count = iov_iter_count(iter);
299 ssize_t total_count = 0;
300 ssize_t ret = -EINVAL;
302 gossip_debug(GOSSIP_FILE_DEBUG,
303 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
308 if (type == ORANGEFS_IO_WRITE) {
309 gossip_debug(GOSSIP_FILE_DEBUG,
310 "%s(%pU): proceeding with offset : %llu, "
323 while (iov_iter_count(iter)) {
324 size_t each_count = iov_iter_count(iter);
327 /* how much to transfer in this loop iteration */
328 if (each_count > orangefs_bufmap_size_query())
329 each_count = orangefs_bufmap_size_query();
331 gossip_debug(GOSSIP_FILE_DEBUG,
332 "%s(%pU): size of each_count(%d)\n",
336 gossip_debug(GOSSIP_FILE_DEBUG,
337 "%s(%pU): BEFORE wait_for_io: offset is %d\n",
342 ret = wait_for_direct_io(type, inode, offset, iter,
344 gossip_debug(GOSSIP_FILE_DEBUG,
345 "%s(%pU): return from wait_for_io:%d\n",
357 gossip_debug(GOSSIP_FILE_DEBUG,
358 "%s(%pU): AFTER wait_for_io: offset is %d\n",
364 * if we got a short I/O operations,
365 * fall out and return what we got so far
367 if (amt_complete < each_count)
375 if (type == ORANGEFS_IO_READ) {
378 SetMtimeFlag(orangefs_inode);
379 inode->i_mtime = CURRENT_TIME;
380 mark_inode_dirty_sync(inode);
384 gossip_debug(GOSSIP_FILE_DEBUG,
385 "%s(%pU): Value(%d) returned.\n",
394 * Read data from a specified offset in a file (referenced by inode).
395 * Data may be placed either in a user or kernel buffer.
/*
 * orangefs_inode_read - read from @inode with a single direct I/O request.
 *
 * @inode:          inode to read from.
 * @iter:           destination iov_iter; its byte count is the request size.
 * @readahead_size: passed through to wait_for_direct_io().
 *
 * Increments g_orangefs_stats.reads, compares the request size against
 * orangefs_bufmap_size_query() and logs when it is too large, then issues
 * one wait_for_direct_io(ORANGEFS_IO_READ, ...) call for the whole request.
 * NOTE(review): the offset parameter and the handling after the
 * "count is too large" message are not visible in this view.
 */
397 ssize_t orangefs_inode_read(struct inode *inode,
398 struct iov_iter *iter,
400 loff_t readahead_size)
402 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
403 size_t count = iov_iter_count(iter);
405 ssize_t ret = -EINVAL;
407 g_orangefs_stats.reads++;
409 bufmap_size = orangefs_bufmap_size_query();
410 if (count > bufmap_size) {
411 gossip_debug(GOSSIP_FILE_DEBUG,
412 "%s: count is too large (%zd/%zd)!\n",
413 __func__, count, bufmap_size);
417 gossip_debug(GOSSIP_FILE_DEBUG,
418 "%s(%pU) %zd@%llu\n",
420 &orangefs_inode->refn.khandle,
424 ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter,
425 count, readahead_size);
429 gossip_debug(GOSSIP_FILE_DEBUG,
430 "%s(%pU): Value(%zd) returned.\n",
432 &orangefs_inode->refn.khandle,
/*
 * orangefs_file_read_iter - .read_iter handler for OrangeFS files.
 *
 * Takes a local copy of iocb->ki_pos, increments g_orangefs_stats.reads and
 * forwards the request to do_readv_writev(ORANGEFS_IO_READ, ...).
 * BUG_ON() fires if iocb->private is set.
 * NOTE(review): how the updated position and the result are propagated
 * back is not visible in this view.
 */
438 static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
440 struct file *file = iocb->ki_filp;
441 loff_t pos = *(&iocb->ki_pos);
444 BUG_ON(iocb->private);
446 gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
448 g_orangefs_stats.reads++;
450 rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
/*
 * orangefs_file_write_iter - .write_iter handler for OrangeFS files.
 *
 * Runs under the host inode's i_mutex, taken at the top and released at
 * the end. For O_APPEND files the inode size is refreshed with
 * orangefs_inode_getattr() before generic_write_checks() runs. If
 * file->f_pos is beyond the current i_size, i_size is raised to f_pos.
 * The write is then carried out by do_readv_writev(ORANGEFS_IO_WRITE, ...)
 * and g_orangefs_stats.writes is incremented.
 * NOTE(review): the error-exit lines between the visible statements are
 * not shown here; confirm that every exit path releases the mutex.
 */
456 static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
458 struct file *file = iocb->ki_filp;
462 BUG_ON(iocb->private);
464 gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
466 mutex_lock(&file->f_mapping->host->i_mutex);
468 /* Make sure generic_write_checks sees an up to date inode size. */
469 if (file->f_flags & O_APPEND) {
470 rc = orangefs_inode_getattr(file->f_mapping->host,
471 ORANGEFS_ATTR_SYS_SIZE, 0);
473 gossip_err("%s: orangefs_inode_getattr failed, rc:%zd:.\n",
479 if (file->f_pos > i_size_read(file->f_mapping->host))
480 orangefs_i_size_write(file->f_mapping->host, file->f_pos);
482 rc = generic_write_checks(iocb, iter);
485 gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
491 * if we are appending, generic_write_checks would have updated
492 * pos to the end of the file, so we will wait till now to set
495 pos = *(&iocb->ki_pos);
497 rc = do_readv_writev(ORANGEFS_IO_WRITE,
502 gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
508 g_orangefs_stats.writes++;
512 mutex_unlock(&file->f_mapping->host->i_mutex);
517 * Perform a miscellaneous operation on a file.
/*
 * orangefs_ioctl - .unlocked_ioctl handler; supports the file-flag ioctls.
 *
 * FS_IOC_GETFLAGS: reads the "user.pvfs2.meta_hint" extended attribute
 * with orangefs_inode_getxattr() and copies the flag word to user space
 * with put_user(). -ENODATA from the xattr lookup is tested separately
 * from other negative results.
 *
 * FS_IOC_SETFLAGS: fetches the flag word from user space with get_user(),
 * rejects any bit outside FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL
 * (ORANGEFS_MIRROR_FL is masked out before the test), and stores the value
 * in the same extended attribute with orangefs_inode_setxattr().
 *
 * NOTE(review): the declarations of val/uval, the assignment that carries
 * uval into val, and the return statements are not visible in this view.
 */
519 static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
525 gossip_debug(GOSSIP_FILE_DEBUG,
526 "orangefs_ioctl: called with cmd %d\n",
530 * we understand some general ioctls on files, such as the immutable
533 if (cmd == FS_IOC_GETFLAGS) {
535 ret = orangefs_inode_getxattr(file_inode(file),
536 ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
537 "user.pvfs2.meta_hint",
539 if (ret < 0 && ret != -ENODATA)
541 else if (ret == -ENODATA)
544 gossip_debug(GOSSIP_FILE_DEBUG,
545 "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n",
546 (unsigned long long)uval);
547 return put_user(uval, (int __user *)arg);
548 } else if (cmd == FS_IOC_SETFLAGS) {
550 if (get_user(uval, (int __user *)arg))
553 * ORANGEFS_MIRROR_FL is set internally when the mirroring mode
554 * is turned on for a file. The user is not allowed to turn
555 * on this bit, but the bit is present if the user first gets
556 * the flags and then updates the flags with some new
557 * settings. So, we ignore it in the following edit. bligon.
559 if ((uval & ~ORANGEFS_MIRROR_FL) &
560 (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
561 gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
565 gossip_debug(GOSSIP_FILE_DEBUG,
566 "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
567 (unsigned long long)val);
568 ret = orangefs_inode_setxattr(file_inode(file),
569 ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
570 "user.pvfs2.meta_hint",
571 &val, sizeof(val), 0);
578 * Memory map a region of a file.
/*
 * orangefs_file_mmap - .mmap handler; read-only mappings only.
 *
 * Logs the dentry name, sets VM_SEQ_READ and clears VM_RAND_READ on @vma,
 * then delegates to generic_file_readonly_mmap(), whose result is returned.
 */
580 static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
582 gossip_debug(GOSSIP_FILE_DEBUG,
583 "orangefs_file_mmap: called on %s\n",
585 (char *)file->f_path.dentry->d_name.name :
588 /* set the sequential readahead hint */
589 vma->vm_flags |= VM_SEQ_READ;
590 vma->vm_flags &= ~VM_RAND_READ;
592 /* Use readonly mmap since we cannot support writable maps. */
593 return generic_file_readonly_mmap(file, vma);
/*
 * mapping_nrpages - read the nrpages member of the object @idata points to.
 * Used by orangefs_file_release() on an inode's i_data.
 */
596 #define mapping_nrpages(idata) ((idata)->nrpages)
599 * Called to notify the module that there are no more references to
600 * this file (i.e. no processes have it open).
602 * \note Not called when each file is closed.
/*
 * orangefs_file_release - .release handler.
 *
 * Logs the dentry name, calls orangefs_flush_inode() on @inode and, when
 * the dentry's inode has a mapping with a non-zero page count, drops those
 * pages with truncate_inode_pages().
 * NOTE(review): the remaining truncate_inode_pages() arguments and the
 * return statement are not visible in this view.
 */
604 static int orangefs_file_release(struct inode *inode, struct file *file)
606 gossip_debug(GOSSIP_FILE_DEBUG,
607 "orangefs_file_release: called on %s\n",
608 file->f_path.dentry->d_name.name);
610 orangefs_flush_inode(inode);
613 * remove all associated inode pages from the page cache and mmap
614 * readahead cache (if any); this forces an expensive refresh of
615 * data for the next caller of mmap (or 'get_block' accesses)
617 if (file->f_path.dentry->d_inode &&
618 file->f_path.dentry->d_inode->i_mapping &&
619 mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
620 truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
626 * Push all data for a specific file onto permanent storage.
/*
 * orangefs_fsync - .fsync handler.
 *
 * Writes back and waits on the page-cache range [start, end] of the file's
 * mapping, then sends an ORANGEFS_VFS_OP_FSYNC upcall for the inode's refn
 * through service_operation() and finally calls orangefs_flush_inode().
 * NOTE(review): the return value of filemap_write_and_wait_range() is
 * discarded on the visible line - confirm that this is intended.
 * NOTE(review): the remaining parameters, the op release and the return
 * statement are not visible in this view.
 */
628 static int orangefs_fsync(struct file *file,
634 struct orangefs_inode_s *orangefs_inode =
635 ORANGEFS_I(file->f_path.dentry->d_inode);
636 struct orangefs_kernel_op_s *new_op = NULL;
639 filemap_write_and_wait_range(file->f_mapping, start, end);
641 new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
644 new_op->upcall.req.fsync.refn = orangefs_inode->refn;
646 ret = service_operation(new_op,
648 get_interruptible_flag(file->f_path.dentry->d_inode));
650 gossip_debug(GOSSIP_FILE_DEBUG,
651 "orangefs_fsync got return value of %d\n",
656 orangefs_flush_inode(file->f_path.dentry->d_inode);
661 * Change the file pointer position for an instance of an open file.
663 * \note If .llseek is overridden, we must acquire lock as described in
664 * Documentation/filesystems/Locking.
666 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
667 * require significant changes to the FS
/*
 * orangefs_file_llseek - .llseek handler.
 *
 * For ORANGEFS_SEEK_END the inode size is first refreshed with
 * orangefs_inode_getattr(ORANGEFS_ATTR_SYS_SIZE); the visible failure path
 * logs and calls orangefs_make_bad_inode(). The seek itself is performed
 * by generic_file_llseek(), whose result is returned.
 * NOTE(review): the NULL-inode test and the error returns are not visible
 * in this view.
 */
669 static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
672 struct inode *inode = file->f_path.dentry->d_inode;
675 gossip_err("orangefs_file_llseek: invalid inode (NULL)\n");
679 if (origin == ORANGEFS_SEEK_END) {
681 * revalidate the inode's file size.
682 * NOTE: We are only interested in file size here,
683 * so we set mask accordingly.
685 ret = orangefs_inode_getattr(inode, ORANGEFS_ATTR_SYS_SIZE, 0);
687 gossip_debug(GOSSIP_FILE_DEBUG,
688 "%s:%s:%d calling make bad inode\n",
692 orangefs_make_bad_inode(inode);
697 gossip_debug(GOSSIP_FILE_DEBUG,
698 "orangefs_file_llseek: offset is %ld | origin is %d"
699 " | inode size is %lu\n",
702 (unsigned long)file->f_path.dentry->d_inode->i_size);
704 return generic_file_llseek(file, offset, origin);
708 * Support local locks (locks that only this kernel knows about)
709 * if Orangefs was mounted -o local_lock.
/*
 * orangefs_lock - .lock handler for locally managed POSIX locks.
 *
 * Acts only when the superblock flags contain ORANGEFS_OPT_LOCAL_LOCK:
 * F_GETLK is answered with posix_test_lock(); the other visible path
 * calls posix_lock_file().
 * NOTE(review): the value returned when the option is not set is not
 * visible in this view.
 */
711 static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
715 if (ORANGEFS_SB(filp->f_inode->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) {
716 if (cmd == F_GETLK) {
718 posix_test_lock(filp, fl);
720 rc = posix_lock_file(filp, fl, NULL);
727 /** ORANGEFS implementation of VFS file operations */
/*
 * Dispatch table wiring the handlers defined in this file into
 * struct file_operations; .open uses the generic generic_file_open().
 */
728 const struct file_operations orangefs_file_operations = {
729 .llseek = orangefs_file_llseek,
730 .read_iter = orangefs_file_read_iter,
731 .write_iter = orangefs_file_write_iter,
732 .lock = orangefs_lock,
733 .unlocked_ioctl = orangefs_ioctl,
734 .mmap = orangefs_file_mmap,
735 .open = generic_file_open,
736 .release = orangefs_file_release,
737 .fsync = orangefs_fsync,