#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/aio.h>
+#include <linux/falloc.h>
#include "super.h"
#include "mds_client.h"
{
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- u64 pos, this_len;
+ u64 pos, this_len, left;
int io_align, page_align;
- int left, pages_left;
+ int pages_left;
int read;
struct page **page_pos;
int ret;
ret = 0;
hit_stripe = this_len < left;
was_short = ret >= 0 && ret < this_len;
- dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
+ dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
if (ret > 0) {
if (pos + left > inode->i_size)
left = inode->i_size - pos;
- dout("zero tail %d\n", left);
+ dout("zero tail %llu\n", left);
ceph_zero_page_vector_range(page_align + read, left,
pages);
read += left;
if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
(fi->flags & CEPH_F_SYNC))
/* hmm, this isn't really async... */
ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
&ceph_sb_to_client(inode->i_sb)->client->osdc;
ssize_t count, written = 0;
int err, want, got;
- bool hold_mutex;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- sb_start_write(inode->i_sb);
mutex_lock(&inode->i_mutex);
- hold_mutex = true;
err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
if (err)
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
(fi->flags & CEPH_F_SYNC)) {
mutex_unlock(&inode->i_mutex);
written = ceph_sync_write(file, iov->iov_base, count,
pos, &iocb->ki_pos);
+ if (written == -EOLDSNAPC) {
+ dout("aio_write %p %llx.%llx %llu~%u"
+ "got EOLDSNAPC, retrying\n",
+ inode, ceph_vinop(inode),
+ pos, (unsigned)iov->iov_len);
+ mutex_lock(&inode->i_mutex);
+ goto retry_snap;
+ }
} else {
+ /*
+ * No need to acquire the i_truncate_mutex. Because
+ * the MDS revokes Fwb caps before sending truncate
+ * message to us. We can't get Fwb cap while there
+ * are pending vmtruncate. So write and vmtruncate
+ * can not run at the same time
+ */
written = generic_file_buffered_write(iocb, iov, nr_segs,
pos, &iocb->ki_pos,
count, 0);
mutex_unlock(&inode->i_mutex);
}
- hold_mutex = false;
if (written >= 0) {
int dirty;
written = err;
}
- if (written == -EOLDSNAPC) {
- dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
- mutex_lock(&inode->i_mutex);
- hold_mutex = true;
- goto retry_snap;
- }
+ goto out_unlocked;
+
out:
- if (hold_mutex)
- mutex_unlock(&inode->i_mutex);
- sb_end_write(inode->i_sb);
+ mutex_unlock(&inode->i_mutex);
+out_unlocked:
current->backing_dev_info = NULL;
-
return written ? written : err;
}
int ret;
mutex_lock(&inode->i_mutex);
- __ceph_do_pending_vmtruncate(inode, false);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
return offset;
}
+static inline void ceph_zero_partial_page(
+ struct inode *inode, loff_t offset, unsigned size)
+{
+ struct page *page;
+ pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+
+ page = find_lock_page(inode->i_mapping, index);
+ if (page) {
+ wait_on_page_writeback(page);
+ zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+}
+
+static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
+ loff_t length)
+{
+ loff_t nearly = round_up(offset, PAGE_CACHE_SIZE);
+ if (offset < nearly) {
+ loff_t size = nearly - offset;
+ if (length < size)
+ size = length;
+ ceph_zero_partial_page(inode, offset, size);
+ offset += size;
+ length -= size;
+ }
+ if (length >= PAGE_CACHE_SIZE) {
+ loff_t size = round_down(length, PAGE_CACHE_SIZE);
+ truncate_pagecache_range(inode, offset, offset + size - 1);
+ offset += size;
+ length -= size;
+ }
+ if (length)
+ ceph_zero_partial_page(inode, offset, length);
+}
+
+static int ceph_zero_partial_object(struct inode *inode,
+ loff_t offset, loff_t *length)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_request *req;
+ int ret = 0;
+ loff_t zero = 0;
+ int op;
+
+ if (!length) {
+ op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
+ length = &zero;
+ } else {
+ op = CEPH_OSD_OP_ZERO;
+ }
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ ceph_vino(inode),
+ offset, length,
+ 1, op,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ NULL, 0, 0, false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ goto out;
+ }
+
+ ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
+ &inode->i_mtime);
+
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!ret) {
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ if (ret == -ENOENT)
+ ret = 0;
+ }
+ ceph_osdc_put_request(req);
+
+out:
+ return ret;
+}
+
+static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
+{
+ int ret = 0;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
+ s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+ u64 object_set_size = object_size * stripe_count;
+ u64 nearly, t;
+
+ /* round offset up to next period boundary */
+ nearly = offset + object_set_size - 1;
+ t = nearly;
+ nearly -= do_div(t, object_set_size);
+
+ while (length && offset < nearly) {
+ loff_t size = length;
+ ret = ceph_zero_partial_object(inode, offset, &size);
+ if (ret < 0)
+ return ret;
+ offset += size;
+ length -= size;
+ }
+ while (length >= object_set_size) {
+ int i;
+ loff_t pos = offset;
+ for (i = 0; i < stripe_count; ++i) {
+ ret = ceph_zero_partial_object(inode, pos, NULL);
+ if (ret < 0)
+ return ret;
+ pos += stripe_unit;
+ }
+ offset += object_set_size;
+ length -= object_set_size;
+ }
+ while (length) {
+ loff_t size = length;
+ ret = ceph_zero_partial_object(inode, offset, &size);
+ if (ret < 0)
+ return ret;
+ offset += size;
+ length -= size;
+ }
+ return ret;
+}
+
+static long ceph_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t length)
+{
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc =
+ &ceph_inode_to_client(inode)->client->osdc;
+ int want, got = 0;
+ int dirty;
+ int ret = 0;
+ loff_t endoff = 0;
+ loff_t size;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (ceph_snap(inode) != CEPH_NOSNAP) {
+ ret = -EROFS;
+ goto unlock;
+ }
+
+ if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) &&
+ !(mode & FALLOC_FL_PUNCH_HOLE)) {
+ ret = -ENOSPC;
+ goto unlock;
+ }
+
+ size = i_size_read(inode);
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ endoff = offset + length;
+
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
+ if (ret < 0)
+ goto unlock;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (offset < size)
+ ceph_zero_pagecache_range(inode, offset, length);
+ ret = ceph_zero_objects(inode, offset, length);
+ } else if (endoff > size) {
+ truncate_pagecache_range(inode, size, -1);
+ if (ceph_inode_set_size(inode, endoff))
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY, NULL);
+ }
+
+ if (!ret) {
+ spin_lock(&ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+
+ ceph_put_cap_refs(ci, got);
+unlock:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,
.splice_write = generic_file_splice_write,
.unlocked_ioctl = ceph_ioctl,
.compat_ioctl = ceph_ioctl,
+ .fallocate = ceph_fallocate,
};