4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2015, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include "../include/lustre_dlm.h"
45 #include "../include/lustre_lite.h"
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include "../include/lustre/ll_fiemap.h"
51 #include "../include/cl_object.h"
54 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
56 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
59 static enum llioc_iter
60 ll_iocontrol_call(struct inode *inode, struct file *file,
61 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a zero-initialized struct ll_file_data from its slab cache.
 * NOTE(review): extract is incomplete (allocation-failure check and the
 * return statement are on omitted lines) — confirm against full source.
 */
63 static struct ll_file_data *ll_file_data_get(void)
65 	struct ll_file_data *fd;
67 	fd = kmem_cache_zalloc(ll_file_data_slab, GFP_NOFS);
70 	fd->fd_write_failed = false;
/* Return an ll_file_data object to its slab cache. */
74 static void ll_file_data_put(struct ll_file_data *fd)
77 	kmem_cache_free(ll_file_data_slab, fd);
/*
 * Copy the inode's identity and attributes (mode, times, size, blocks,
 * flags, ioepoch) plus the open handle @fh into @op_data for an MDS RPC.
 * Sets MDS_DATA_MODIFIED bias when the inode carries LLIF_DATA_MODIFIED.
 * NOTE(review): extract omits some lines (e.g. a NULL check on @fh is
 * presumably present) — confirm against full source.
 */
80 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
81 			  struct lustre_handle *fh)
83 	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
84 	op_data->op_attr.ia_mode = inode->i_mode;
85 	op_data->op_attr.ia_atime = inode->i_atime;
86 	op_data->op_attr.ia_mtime = inode->i_mtime;
87 	op_data->op_attr.ia_ctime = inode->i_ctime;
88 	op_data->op_attr.ia_size = i_size_read(inode);
89 	op_data->op_attr_blocks = inode->i_blocks;
	/* ia_attr_flags lives in the Lustre-extended iattr, hence the cast */
90 	((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
91 		ll_inode_to_ext_flags(inode->i_flags);
92 	op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
94 	op_data->op_handle = *fh;
96 	if (ll_i2info(inode)->lli_flags & LLIF_DATA_MODIFIED)
97 		op_data->op_bias |= MDS_DATA_MODIFIED;
101  * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for a close RPC: select which attributes are valid,
 * close the IO epoch for write opens, and pack inode state + open handle.
 * NOTE(review): extract is incomplete; the early-return path for
 * non-FMODE_WRITE handles is on omitted lines — confirm against full source.
 */
104 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
105 			     struct obd_client_handle *och)
107 	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
108 					ATTR_MTIME | ATTR_MTIME_SET |
109 					ATTR_CTIME | ATTR_CTIME_SET;
111 	if (!(och->och_flags & FMODE_WRITE))
	/* Without Size-on-MDS support, size/blocks come from the client */
114 	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
115 		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
117 	ll_ioepoch_close(inode, op_data, &och, 0);
120 	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
121 	ll_prep_md_op_data(op_data, inode, NULL, NULL,
122 			   0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Close an MDS open handle: send the close RPC, optionally as an HSM
 * release (when @data_version is non-NULL), handle Size-on-MDS update,
 * clear the DATA_MODIFIED flag on success, and free/park @och.
 * NOTE(review): extract omits many lines (declarations of rc/epoch_close,
 * gotos, braces); comments below describe only the visible statements.
 */
125 static int ll_close_inode_openhandle(struct obd_export *md_exp,
127 				     struct obd_client_handle *och,
128 				     const __u64 *data_version)
130 	struct obd_export *exp = ll_i2mdexp(inode);
131 	struct md_op_data *op_data;
132 	struct ptlrpc_request *req = NULL;
133 	struct obd_device *obd = class_exp2obd(exp);
139 	 * XXX: in case of LMV, is this correct to access
142 		CERROR("Invalid MDC connection handle %#llx\n",
143 		       ll_i2mdexp(inode)->exp_handle.h_cookie);
148 	op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
150 	/* XXX We leak openhandle and request here. */
155 	ll_prepare_close(inode, op_data, och);
157 	/* Pass in data_version implies release. */
158 		op_data->op_bias |= MDS_HSM_RELEASE;
159 		op_data->op_data_version = *data_version;
160 		op_data->op_lease_handle = och->och_lease_handle;
161 		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
163 	epoch_close = op_data->op_flags & MF_EPOCH_CLOSE;
164 	rc = md_close(md_exp, op_data, och->och_mod, &req);
166 		/* This close must have the epoch closed. */
167 		LASSERT(epoch_close);
168 		/* MDS has instructed us to obtain Size-on-MDS attribute from
169 		 * OSTs and send setattr to back to MDS.
171 		rc = ll_som_update(inode, op_data);
173 			CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
178 		CERROR("inode %lu mdc close failed: rc = %d\n",
182 	/* DATA_MODIFIED flag was successfully sent on close, cancel data
185 	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
186 		struct ll_inode_info *lli = ll_i2info(inode);
188 		spin_lock(&lli->lli_lock);
189 		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
190 		spin_unlock(&lli->lli_lock);
194 		rc = ll_objects_destroy(req, inode);
196 			CERROR("inode %lu ll_objects destroy: rc = %d\n",
199 	if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
200 		struct mdt_body *body;
		/* Verify the server actually released the file for HSM */
202 		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
203 		if (!(body->valid & OBD_MD_FLRELEASED))
207 	ll_finish_md_op_data(op_data);
	/* SOM epoch still open for a regular write handle: defer via
	 * DONE_WRITING instead of freeing the handle now. */
210 	if (exp_connect_som(exp) && !epoch_close &&
211 	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
212 		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
214 		md_clear_open_replay_data(md_exp, och);
215 		/* Free @och if it is not waiting for DONE_WRITING. */
216 		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
219 	if (req) /* This is close request */
220 		ptlrpc_req_finished(req);
/*
 * Drop the per-mode (read/write/exec) MDS open handle for @inode if no
 * users remain, sending the close RPC to the MDS.
 * NOTE(review): extract omits lines (och pickup under the mutex, rc
 * declaration, return) — confirm against full source.
 */
224 int ll_md_real_close(struct inode *inode, fmode_t fmode)
226 	struct ll_inode_info *lli = ll_i2info(inode);
227 	struct obd_client_handle **och_p;
228 	struct obd_client_handle *och;
	/* Select the handle slot and use count matching the open mode */
232 	if (fmode & FMODE_WRITE) {
233 		och_p = &lli->lli_mds_write_och;
234 		och_usecount = &lli->lli_open_fd_write_count;
235 	} else if (fmode & FMODE_EXEC) {
236 		och_p = &lli->lli_mds_exec_och;
237 		och_usecount = &lli->lli_open_fd_exec_count;
239 		LASSERT(fmode & FMODE_READ);
240 		och_p = &lli->lli_mds_read_och;
241 		och_usecount = &lli->lli_open_fd_read_count;
244 	mutex_lock(&lli->lli_och_mutex);
245 	if (*och_usecount > 0) {
246 		/* There are still users of this handle, so skip
249 		mutex_unlock(&lli->lli_och_mutex);
255 	mutex_unlock(&lli->lli_och_mutex);
258 	/* There might be a race and this handle may already
261 		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: release group lock and lease if held, drop
 * this fd's reference on the per-mode open count, and close the MDS open
 * handle unless a matching OPEN DLM lock lets us skip the RPC.
 * NOTE(review): extract omits lines (lease_broken/lockmode declarations,
 * braces, return) — confirm against full source.
 */
268 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
271 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
272 	struct ll_inode_info *lli = ll_i2info(inode);
274 	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
275 	struct lustre_handle lockh;
276 	ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN} };
279 	/* clear group lock, if present */
280 	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
281 		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
283 	if (fd->fd_lease_och) {
286 		/* Usually the lease is not released when the
287 		 * application crashed, we need to release here.
289 		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
290 		CDEBUG(rc ? D_ERROR : D_INODE,
291 		       "Clean up lease " DFID " %d/%d\n",
292 		       PFID(&lli->lli_fid), rc, lease_broken);
294 		fd->fd_lease_och = NULL;
298 		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
303 	/* Let's see if we have good enough OPEN lock on the file and if
304 	 * we can skip talking to MDS
307 	mutex_lock(&lli->lli_och_mutex);
308 	if (fd->fd_omode & FMODE_WRITE) {
310 		LASSERT(lli->lli_open_fd_write_count);
311 		lli->lli_open_fd_write_count--;
312 	} else if (fd->fd_omode & FMODE_EXEC) {
314 		LASSERT(lli->lli_open_fd_exec_count);
315 		lli->lli_open_fd_exec_count--;
318 		LASSERT(lli->lli_open_fd_read_count);
319 		lli->lli_open_fd_read_count--;
321 	mutex_unlock(&lli->lli_och_mutex);
	/* No cached OPEN ibits lock -> must really close on the MDS */
323 	if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
324 			   LDLM_IBITS, &policy, lockmode, &lockh))
325 		rc = ll_md_real_close(inode, fd->fd_omode);
328 	LUSTRE_FPRIVATE(file) = NULL;
329 	ll_file_data_put(fd);
334 /* While this returns an error code, fput() the caller does not, so we need
335  * to make every effort to clean up all of our state here. Also, applications
336  * rarely check close errors and even if an error is returned they will not
337  * re-try the close call.
/*
 * VFS ->release() entry point: tear down remote-ACL state, stop statahead
 * if this fd started it, and perform the MDS close via ll_md_close().
 * NOTE(review): extract omits lines (rc declaration, some braces/returns)
 * — confirm against full source.
 */
339 int ll_file_release(struct inode *inode, struct file *file)
341 	struct ll_file_data *fd;
342 	struct ll_sb_info *sbi = ll_i2sbi(inode);
343 	struct ll_inode_info *lli = ll_i2info(inode);
346 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
347 	       inode->i_generation, inode);
349 #ifdef CONFIG_FS_POSIX_ACL
	/* Remote-client ACL bookkeeping applies only on the root inode */
350 	if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
351 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
353 		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
354 			fd->fd_flags &= ~LL_FILE_RMTACL;
355 			rct_del(&sbi->ll_rct, current_pid());
356 			et_search_free(&sbi->ll_et, current_pid());
361 	if (!is_root_inode(inode))
362 		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
363 	fd = LUSTRE_FPRIVATE(file);
366 	/* The last ref on @file, maybe not be the owner pid of statahead.
367 	 * Different processes can open the same dir, "ll_opendir_key" means:
368 	 * it is me that should stop the statahead thread.
370 	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
371 	    lli->lli_opendir_pid != 0)
372 		ll_stop_statahead(inode, lli->lli_opendir_key);
	/* Root inode: no MDS close needed, just drop the fd data */
374 	if (is_root_inode(inode)) {
375 		LUSTRE_FPRIVATE(file) = NULL;
376 		ll_file_data_put(fd);
380 	if (!S_ISDIR(inode->i_mode)) {
381 		lov_read_and_clear_async_rc(lli->lli_clob);
382 		lli->lli_async_rc = 0;
385 	rc = ll_md_close(sbi->ll_md_exp, inode, file);
387 	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
388 		libcfs_debug_dumplog();
/*
 * Issue an IT_OPEN intent RPC to the MDS for @dentry, optionally carrying
 * striping info (@lmm/@lmmsize for setstripe), and bind the returned lock
 * and attributes to the inode.
 * NOTE(review): extract omits lines (rc declaration, goto labels, braces)
 * — confirm against full source.
 */
393 static int ll_intent_file_open(struct dentry *dentry, void *lmm,
394 			       int lmmsize, struct lookup_intent *itp)
396 	struct inode *inode = d_inode(dentry);
397 	struct ll_sb_info *sbi = ll_i2sbi(inode);
398 	struct dentry *parent = dentry->d_parent;
399 	const char *name = dentry->d_name.name;
400 	const int len = dentry->d_name.len;
401 	struct md_op_data *op_data;
402 	struct ptlrpc_request *req;
403 	__u32 opc = LUSTRE_OPC_ANY;
406 	/* Usually we come here only for NFSD, and we want open lock. */
407 	/* We can also get here if there was cached open handle in revalidate_it
408 	 * but it disappeared while we were getting from there to ll_file_open.
409 	 * But this means this file was closed and immediately opened which
410 	 * makes a good candidate for using OPEN lock
412 	/* If lmmsize & lmm are not 0, we are just setting stripe info
413 	 * parameters. No need for the open lock
415 	if (!lmm && lmmsize == 0) {
416 		itp->it_flags |= MDS_OPEN_LOCK;
417 		if (itp->it_flags & FMODE_WRITE)
418 			opc = LUSTRE_OPC_CREATE;
421 	op_data  = ll_prep_md_op_data(NULL, d_inode(parent),
425 		return PTR_ERR(op_data);
427 	itp->it_flags |= MDS_OPEN_BY_FID;
428 	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
429 			    0 /*unused */, &req, ll_md_blocking_ast, 0);
430 	ll_finish_md_op_data(op_data);
432 		/* reason for keep own exit path - don`t flood log
433 		 * with messages with -ESTALE errors.
435 		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
436 		    it_open_error(DISP_OPEN_OPEN, itp))
438 		ll_release_openhandle(inode, itp);
442 	if (it_disposition(itp, DISP_LOOKUP_NEG)) {
447 	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
448 		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
449 		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
	/* Refresh the inode from the reply and attach the DLM lock data */
453 	rc = ll_prep_inode(&inode, req, NULL, itp);
454 	if (!rc && itp->d.lustre.it_lock_mode)
455 		ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
458 	ptlrpc_req_finished(req);
459 	ll_intent_drop_lock(itp);
465  * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
466  * not believe attributes if a few ioepoch holders exist. Attributes for
467  * previous ioepoch if new one is opened are also skipped by MDS.
469 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
	/* Record the new epoch only when it is non-zero and differs */
471 	if (ioepoch && lli->lli_ioepoch != ioepoch) {
472 		lli->lli_ioepoch = ioepoch;
473 		CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
474 		       ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDS open reply carried in the
 * intent: file handle, fid, lease lock cookie, magic and open flags;
 * then register the handle for open replay.
 */
478 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
479 		       struct obd_client_handle *och)
481 	struct ptlrpc_request *req = it->d.lustre.it_data;
482 	struct mdt_body *body;
484 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
485 	och->och_fh = body->handle;
486 	och->och_fid = body->fid1;
487 	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
488 	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
489 	och->och_flags = it->it_flags;
491 	return md_set_open_replay_data(md_exp, och, it);
/*
 * Complete the client-side part of an open: optionally fill @och from the
 * intent reply, pick up the ioepoch, and attach @fd to the file with
 * readahead state and the effective open mode.
 * NOTE(review): extract omits lines (rc declaration/checks, return) —
 * confirm against full source.
 */
494 static int ll_local_open(struct file *file, struct lookup_intent *it,
495 			 struct ll_file_data *fd, struct obd_client_handle *och)
497 	struct inode *inode = file_inode(file);
498 	struct ll_inode_info *lli = ll_i2info(inode);
500 	LASSERT(!LUSTRE_FPRIVATE(file));
505 		struct ptlrpc_request *req = it->d.lustre.it_data;
506 		struct mdt_body *body;
509 		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
513 		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
514 		ll_ioepoch_open(lli, body->ioepoch);
517 	LUSTRE_FPRIVATE(file) = fd;
518 	ll_readahead_init(inode, &fd->fd_ras);
519 	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
523 /* Open a file, and (for the very first open) create objects on the OSTs at
524  * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
525  * creation or open until ll_lov_setstripe() ioctl is called.
527  * If we already have the stripe MD locally then we don't request it in
528  * md_open(), by passing a lmm_size = 0.
530  * It is up to the application to ensure no other processes open this file
531  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
532  * used. We might be able to avoid races of that sort by getting lli_open_sem
533  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
534  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * NOTE(review): extract is heavily elided in this function (restart label,
 * goto targets, error paths); comments below annotate only visible lines.
 */
536 int ll_file_open(struct inode *inode, struct file *file)
538 	struct ll_inode_info *lli = ll_i2info(inode);
539 	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
540 					  .it_flags = file->f_flags };
541 	struct obd_client_handle **och_p = NULL;
542 	__u64 *och_usecount = NULL;
543 	struct ll_file_data *fd;
544 	int rc = 0, opendir_set = 0;
546 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
547 	       inode->i_generation, inode, file->f_flags);
549 	it = file->private_data; /* XXX: compat macro */
550 	file->private_data = NULL; /* prevent ll_local_open assertion */
552 	fd = ll_file_data_get();
	/* Directory open may become the owner of the statahead thread */
559 	if (S_ISDIR(inode->i_mode)) {
560 		spin_lock(&lli->lli_sa_lock);
561 		if (!lli->lli_opendir_key && !lli->lli_sai &&
562 		    lli->lli_opendir_pid == 0) {
563 			lli->lli_opendir_key = fd;
564 			lli->lli_opendir_pid = current_pid();
567 		spin_unlock(&lli->lli_sa_lock);
570 	if (is_root_inode(inode)) {
571 		LUSTRE_FPRIVATE(file) = fd;
	/* No usable intent from lookup: build our own IT_OPEN (oit) */
575 	if (!it || !it->d.lustre.it_disposition) {
576 		/* Convert f_flags into access mode. We cannot use file->f_mode,
577 		 * because everything but O_ACCMODE mask was stripped from
580 		if ((oit.it_flags + 1) & O_ACCMODE)
582 		if (file->f_flags & O_TRUNC)
583 			oit.it_flags |= FMODE_WRITE;
585 		/* kernel only call f_op->open in dentry_open.  filp_open calls
586 		 * dentry_open after call to open_namei that checks permissions.
587 		 * Only nfsd_open call dentry_open directly without checking
588 		 * permissions and because of that this code below is safe.
590 		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
591 			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
593 		/* We do not want O_EXCL here, presumably we opened the file
594 		 * already? XXX - NFS implications?
596 		oit.it_flags &= ~O_EXCL;
598 		/* bug20584, if "it_flags" contains O_CREAT, the file will be
599 		 * created if necessary, then "IT_CREAT" should be set to keep
602 		if (oit.it_flags & O_CREAT)
603 			oit.it_op |= IT_CREAT;
609 	/* Let's see if we have file open on MDS already. */
610 	if (it->it_flags & FMODE_WRITE) {
611 		och_p = &lli->lli_mds_write_och;
612 		och_usecount = &lli->lli_open_fd_write_count;
613 	} else if (it->it_flags & FMODE_EXEC) {
614 		och_p = &lli->lli_mds_exec_och;
615 		och_usecount = &lli->lli_open_fd_exec_count;
617 		och_p = &lli->lli_mds_read_och;
618 		och_usecount = &lli->lli_open_fd_read_count;
621 	mutex_lock(&lli->lli_och_mutex);
622 	if (*och_p) { /* Open handle is present */
623 		if (it_disposition(it, DISP_OPEN_OPEN)) {
624 			/* Well, there's extra open request that we do not need,
625 			 * let's close it somehow. This will decref request.
627 			rc = it_open_error(DISP_OPEN_OPEN, it);
629 				mutex_unlock(&lli->lli_och_mutex);
633 			ll_release_openhandle(inode, it);
637 		rc = ll_local_open(file, it, fd, NULL);
640 			mutex_unlock(&lli->lli_och_mutex);
644 		LASSERT(*och_usecount == 0);
645 		if (!it->d.lustre.it_disposition) {
646 			/* We cannot just request lock handle now, new ELC code
647 			 * means that one of other OPEN locks for this file
648 			 * could be cancelled, and since blocking ast handler
649 			 * would attempt to grab och_mutex as well, that would
650 			 * result in a deadlock
652 			mutex_unlock(&lli->lli_och_mutex);
653 			it->it_create_mode |= M_CHECK_STALE;
654 			rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
655 			it->it_create_mode &= ~M_CHECK_STALE;
661 		*och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
669 		/* md_intent_lock() didn't get a request ref if there was an
670 		 * open error, so don't do cleanup on the request here
673 		/* XXX (green): Should not we bail out on any error here, not
676 		rc = it_open_error(DISP_OPEN_OPEN, it);
680 		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
682 		rc = ll_local_open(file, it, fd, *och_p);
686 	mutex_unlock(&lli->lli_och_mutex);
689 	/* Must do this outside lli_och_mutex lock to prevent deadlock where
690 	 * different kind of OPEN lock for this same inode gets cancelled
693 	if (!S_ISREG(inode->i_mode))
	/* O_LOV_DELAY_CREATE or read-only open of a file without striping:
	 * object creation on the OSTs is deferred */
696 	if (!lli->lli_has_smd &&
697 	    (cl_is_lov_delay_create(file->f_flags) ||
698 	     (file->f_mode & FMODE_WRITE) == 0)) {
699 		CDEBUG(D_INODE, "object creation was delayed\n");
702 	cl_lov_delay_create_clear(&file->f_flags);
707 		if (och_p && *och_p) {
712 		mutex_unlock(&lli->lli_och_mutex);
715 		if (opendir_set != 0)
716 			ll_stop_statahead(inode, lli->lli_opendir_key);
717 		ll_file_data_put(fd);
719 		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
722 	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
723 		ptlrpc_req_finished(it->d.lustre.it_data);
724 		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING cancel the lock
 * asynchronously; the CANCELING branch is elided in this extract.
 * NOTE(review): switch body/braces are on omitted lines.
 */
730 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
731 				    struct ldlm_lock_desc *desc,
732 				    void *data, int flag)
735 	struct lustre_handle lockh;
738 	case LDLM_CB_BLOCKING:
739 		ldlm_lock2handle(lock, &lockh);
740 		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
742 			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
746 	case LDLM_CB_CANCELING:
754  * Acquire a lease and open the file.
/*
 * Takes a lease (read or write) on @inode via an IT_OPEN intent with
 * MDS_OPEN_LEASE; when @file is given, reuses its existing open handle so
 * the MDT treats the lease as from the same owner. Returns the new
 * obd_client_handle or an ERR_PTR.
 * NOTE(review): many error-path lines (gotos, labels, rc handling) are
 * omitted in this extract — confirm against full source.
 */
756 static struct obd_client_handle *
757 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
760 	struct lookup_intent it = { .it_op = IT_OPEN };
761 	struct ll_sb_info *sbi = ll_i2sbi(inode);
762 	struct md_op_data *op_data;
763 	struct ptlrpc_request *req;
764 	struct lustre_handle old_handle = { 0 };
765 	struct obd_client_handle *och = NULL;
	/* A lease must be exactly read or exactly write, never both */
769 	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
770 		return ERR_PTR(-EINVAL);
773 		struct ll_inode_info *lli = ll_i2info(inode);
774 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
775 		struct obd_client_handle **och_p;
778 		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
779 			return ERR_PTR(-EPERM);
781 		/* Get the openhandle of the file */
783 		mutex_lock(&lli->lli_och_mutex);
		/* Only one lease per file descriptor is allowed */
784 		if (fd->fd_lease_och) {
785 			mutex_unlock(&lli->lli_och_mutex);
790 		if (file->f_mode & FMODE_WRITE) {
791 			LASSERT(lli->lli_mds_write_och);
792 			och_p = &lli->lli_mds_write_och;
793 			och_usecount = &lli->lli_open_fd_write_count;
795 			LASSERT(lli->lli_mds_read_och);
796 			och_p = &lli->lli_mds_read_och;
797 			och_usecount = &lli->lli_open_fd_read_count;
799 		if (*och_usecount == 1) {
806 		mutex_unlock(&lli->lli_och_mutex);
807 		if (rc < 0) /* more than 1 opener */
811 		old_handle = fd->fd_och->och_fh;
814 	och = kzalloc(sizeof(*och), GFP_NOFS);
816 		return ERR_PTR(-ENOMEM);
818 	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
819 				     LUSTRE_OPC_ANY, NULL);
820 	if (IS_ERR(op_data)) {
821 		rc = PTR_ERR(op_data);
825 	/* To tell the MDT this openhandle is from the same owner */
826 	op_data->op_handle = old_handle;
828 	it.it_flags = fmode | open_flags;
829 	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
830 	rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
831 			    ll_md_blocking_lease_ast,
832 	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
833 	 * it can be cancelled which may mislead applications that the lease is
835 	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
836 	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
837 	 * doesn't deal with openhandle, so normal openhandle will be leaked.
839 			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
840 	ll_finish_md_op_data(op_data);
841 	ptlrpc_req_finished(req);
845 	if (it_disposition(&it, DISP_LOOKUP_NEG)) {
850 	rc = it_open_error(DISP_OPEN_OPEN, &it);
854 	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
855 	ll_och_fill(sbi->ll_md_exp, &it, och);
857 	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
862 	/* already get lease, handle lease lock */
863 	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
864 	if (it.d.lustre.it_lock_mode == 0 ||
865 	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
866 		/* open lock must return for lease */
867 		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
868 		       PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
869 		       it.d.lustre.it_lock_bits);
874 	ll_intent_release(&it);
	/* Error path: close the handle we just obtained and drop the lock */
878 		rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
880 			CERROR("Close openhandle returned %d\n", rc2);
882 		/* cancel open lock */
883 		if (it.d.lustre.it_lock_mode != 0) {
884 			ldlm_lock_decref_and_cancel(&och->och_lease_handle,
885 						    it.d.lustre.it_lock_mode);
886 			it.d.lustre.it_lock_mode = 0;
889 	ll_intent_release(&it);
896  * Release lease and close the file.
897  * It will check if the lease has ever broken.
/*
 * Determine whether the lease lock was cancelled (broken), report it via
 * @lease_broken, explicitly cancel the lease if still intact, then close
 * the MDS open handle.
 * NOTE(review): extract omits lines (lock put, braces, rc return) —
 * confirm against full source.
 */
899 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
902 	struct ldlm_lock *lock;
903 	bool cancelled = true;
906 	lock = ldlm_handle2lock(&och->och_lease_handle);
908 		lock_res_and_lock(lock);
909 		cancelled = ldlm_is_cancel(lock);
910 		unlock_res_and_lock(lock);
914 	CDEBUG(D_INODE, "lease for " DFID " broken? %d\n",
915 	       PFID(&ll_i2info(inode)->lli_fid), cancelled);
918 		ldlm_cli_cancel(&och->och_lease_handle, 0);
920 		*lease_broken = cancelled;
922 	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
927 /* Fills the obdo with the attributes for the lsm */
/*
 * Issue an async OST getattr for @lsm's objects, wait for completion, and
 * mask the returned valid bits to those the OSTs actually supply.
 * NOTE(review): extract omits lines (oa allocation, sync branch brace,
 * return) — confirm against full source.
 */
928 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
929 			  struct obdo *obdo, __u64 ioepoch, int sync)
931 	struct ptlrpc_request_set *set;
932 	struct obd_info oinfo = { };
939 	oinfo.oi_oa->o_oi = lsm->lsm_oi;
940 	oinfo.oi_oa->o_mode = S_IFREG;
941 	oinfo.oi_oa->o_ioepoch = ioepoch;
942 	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
943 			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
944 			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
945 			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
946 			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
947 			       OBD_MD_FLDATAVERSION;
	/* sync mode: ask the OST to take the lock server-side */
949 		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
950 		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
953 	set = ptlrpc_prep_set();
955 		CERROR("can't allocate ptlrpc set\n");
958 		rc = obd_getattr_async(exp, &oinfo, set);
960 			rc = ptlrpc_set_wait(set);
961 		ptlrpc_set_destroy(set);
	/* Keep only the attribute bits OSTs are authoritative for */
964 		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
965 					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
966 					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
967 					 OBD_MD_FLDATAVERSION);
972  * Performs the getattr on the inode and updates its fields.
973  * If @sync != 0, perform the getattr under the server-side lock.
/*
 * NOTE(review): extract omits lines (rc declaration, success-branch
 * brace, return) — confirm against full source.
 */
975 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
976 		     __u64 ioepoch, int sync)
978 	struct lov_stripe_md *lsm;
981 	lsm = ccc_inode_lsm_get(inode);
982 	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
983 			    obdo, ioepoch, sync);
985 		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
987 		obdo_refresh_inode(inode, obdo, obdo->o_valid);
988 		CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
989 		       POSTID(oi), i_size_read(inode),
990 		       (unsigned long long)inode->i_blocks,
991 		       1UL << inode->i_blkbits);
993 	ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-sourced timestamps with OST-sourced attributes (size, blocks,
 * times) under the inode size lock, keeping the newest timestamps, and
 * write the merged values back into the VFS inode.
 * NOTE(review): extract omits lines (lvb declaration, rc==0 branch brace,
 * return) — confirm against full source.
 */
997 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
999 	struct ll_inode_info *lli = ll_i2info(inode);
1000 	struct cl_object *obj = lli->lli_clob;
1001 	struct cl_attr *attr = ccc_env_thread_attr(env);
1005 	ll_inode_size_lock(inode);
1006 	/* merge timestamps the most recently obtained from mds with
1007 	 * timestamps obtained from osts
1009 	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1010 	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1011 	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1013 	lvb.lvb_size = i_size_read(inode);
1014 	lvb.lvb_blocks = inode->i_blocks;
1015 	lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1016 	lvb.lvb_atime = LTIME_S(inode->i_atime);
1017 	lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1019 	cl_object_attr_lock(obj);
1020 	rc = cl_object_attr_get(env, obj, attr);
1021 	cl_object_attr_unlock(obj);
	/* Keep the most recent timestamp from either source */
1024 		if (lvb.lvb_atime < attr->cat_atime)
1025 			lvb.lvb_atime = attr->cat_atime;
1026 		if (lvb.lvb_ctime < attr->cat_ctime)
1027 			lvb.lvb_ctime = attr->cat_ctime;
1028 		if (lvb.lvb_mtime < attr->cat_mtime)
1029 			lvb.lvb_mtime = attr->cat_mtime;
1031 		CDEBUG(D_VFSTRACE, DFID " updating i_size %llu\n",
1032 		       PFID(&lli->lli_fid), attr->cat_size);
1033 		cl_isize_write_nolock(inode, attr->cat_size);
1035 		inode->i_blocks = attr->cat_blocks;
1037 		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1038 		LTIME_S(inode->i_atime) = lvb.lvb_atime;
1039 		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1041 	ll_inode_size_unlock(inode);
/*
 * Fetch current OST attributes for @lsm and copy size/blocks/times into
 * the caller-provided stat structure (presumably a struct lov_user_stat
 * or similar — parameter list is elided in this extract).
 */
1046 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1049 	struct obdo obdo = { 0 };
1052 	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, &obdo, 0, 0);
1054 		st->st_size = obdo.o_size;
1055 		st->st_blocks = obdo.o_blocks;
1056 		st->st_mtime = obdo.o_mtime;
1057 		st->st_atime = obdo.o_atime;
1058 		st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for @file, mirroring
 * the checks in the VFS file_accessed()/touch_atime() helpers (O_NOATIME,
 * S_NOATIME, mount/superblock noatime flags).
 * NOTE(review): the "return true"/"return false" lines are omitted in
 * this extract — only the condition lines are visible.
 */
1063 static bool file_is_noatime(const struct file *file)
1065 	const struct vfsmount *mnt = file->f_path.mnt;
1066 	const struct inode *inode = file_inode(file);
1068 	/* Adapted from file_accessed() and touch_atime().*/
1069 	if (file->f_flags & O_NOATIME)
1072 	if (inode->i_flags & S_NOATIME)
1075 	if (IS_NOATIME(inode))
1078 	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1081 	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1084 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io from the file's open flags: nonblocking/append/sync
 * behaviour, target cl_object, lock requirement policy, and noatime.
 */
1090 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1092 	struct inode *inode = file_inode(file);
1094 	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
	/* write-specific setup (enclosing condition elided in extract) */
1096 		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1097 		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1098 				      file->f_flags & O_DIRECT ||
1101 	io->ci_obj = ll_i2info(inode)->lli_clob;
1102 	io->ci_lockreq = CILR_MAYBE;
1103 	if (ll_file_nolock(file)) {
1104 		io->ci_lockreq = CILR_NEVER;
1105 		io->ci_no_srvlock = 1;
1106 	} else if (file->f_flags & O_APPEND) {
1107 		io->ci_lockreq = CILR_MANDATORY;
1110 	io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: set up a cl_io for @iot at *@ppos/@count,
 * take the write mutex or trunc semaphore as needed, run cl_io_loop(),
 * handle restart for short transfers, and tally read/write statistics.
 * NOTE(review): extract omits lines (env/io/result declarations, restart
 * label, braces) — annotate only visible statements.
 */
1114 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1115 		   struct file *file, enum cl_io_type iot,
1116 		   loff_t *ppos, size_t count)
1118 	struct ll_inode_info *lli = ll_i2info(file_inode(file));
1119 	struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1124 	io = ccc_env_thread_io(env);
1125 	ll_io_init(io, file, iot == CIT_WRITE);
1127 	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1128 		struct vvp_io *vio = vvp_env_io(env);
1129 		struct ccc_io *cio = ccc_env_io(env);
1130 		int write_mutex_locked = 0;
1132 		cio->cui_fd  = LUSTRE_FPRIVATE(file);
1133 		vio->cui_io_subtype = args->via_io_subtype;
1135 		switch (vio->cui_io_subtype) {
1137 			cio->cui_iter = args->u.normal.via_iter;
1138 			cio->cui_iocb = args->u.normal.via_iocb;
			/* Non-group-locked writes serialize on lli_write_mutex;
			 * reads hold lli_trunc_sem to exclude truncation */
1139 			if ((iot == CIT_WRITE) &&
1140 			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1141 				if (mutex_lock_interruptible(&lli->
1143 					result = -ERESTARTSYS;
1146 				write_mutex_locked = 1;
1147 			} else if (iot == CIT_READ) {
1148 				down_read(&lli->lli_trunc_sem);
1152 			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1153 			vio->u.splice.cui_flags = args->u.splice.via_flags;
1156 			CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1159 		result = cl_io_loop(env, io);
1160 		if (write_mutex_locked)
1161 			mutex_unlock(&lli->lli_write_mutex);
1162 		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1163 			up_read(&lli->lli_trunc_sem);
1165 		/* cl_io_rw_init() handled IO */
1166 		result = io->ci_result;
1169 	if (io->ci_nob > 0) {
1170 		result = io->ci_nob;
1171 		*ppos = io->u.ci_wr.wr.crw_pos;
1175 	cl_io_fini(env, io);
1176 	/* If any bit been read/written (result != 0), we just return
1177 	 * short read/write instead of restart io.
1179 	if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1180 		CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
1181 		       iot == CIT_READ ? "read" : "write",
1182 		       file, *ppos, count);
1183 		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1187 	if (iot == CIT_READ) {
1189 			ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1190 					   LPROC_LL_READ_BYTES, result);
1191 	} else if (iot == CIT_WRITE) {
1193 			ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1194 					   LPROC_LL_WRITE_BYTES, result);
1195 			fd->fd_write_failed = false;
1196 		} else if (result != -ERESTARTSYS) {
1197 			fd->fd_write_failed = true;
/*
 * VFS ->read_iter(): wrap the iov_iter in vvp_io_args and run the common
 * IO engine with CIT_READ.
 * NOTE(review): env/result declarations and return are on omitted lines.
 */
1204 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1207 	struct vvp_io_args *args;
1211 	env = cl_env_get(&refcheck);
1213 		return PTR_ERR(env);
1215 	args = vvp_env_args(env, IO_NORMAL);
1216 	args->u.normal.via_iter = to;
1217 	args->u.normal.via_iocb = iocb;
1219 	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1220 				    &iocb->ki_pos, iov_iter_count(to));
1221 	cl_env_put(env, &refcheck);
1226  * Write to a file (through the page cache).
/*
 * VFS ->write_iter(): mirror of ll_file_read_iter with CIT_WRITE.
 * NOTE(review): env/result declarations and return are on omitted lines.
 */
1228 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1231 	struct vvp_io_args *args;
1235 	env = cl_env_get(&refcheck);
1237 		return PTR_ERR(env);
1239 	args = vvp_env_args(env, IO_NORMAL);
1240 	args->u.normal.via_iter = from;
1241 	args->u.normal.via_iocb = iocb;
1243 	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1244 				    &iocb->ki_pos, iov_iter_count(from));
1245 	cl_env_put(env, &refcheck);
1250  * Send file content (through pagecache) somewhere with helper
/*
 * VFS ->splice_read(): run the common IO engine in IO_SPLICE mode,
 * feeding pages into @pipe.
 * NOTE(review): env/result declarations and return are on omitted lines.
 */
1252 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1253 				   struct pipe_inode_info *pipe, size_t count,
1257 	struct vvp_io_args *args;
1261 	env = cl_env_get(&refcheck);
1263 		return PTR_ERR(env);
1265 	args = vvp_env_args(env, IO_SPLICE);
1266 	args->u.splice.via_pipe = pipe;
1267 	args->u.splice.via_flags = flags;
1269 	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1270 	cl_env_put(env, &refcheck);
/*
 * Re-create missing OST objects for @inode: clone the stripe descriptor,
 * mark the obdo with OBD_FL_RECREATE_OBJS and the target OST index
 * (carried in o_nlink), and call obd_create() under the size lock.
 * NOTE(review): extract omits lines (rc/lsm_size declarations, error
 * gotos, kvfree of lsm2, return) — confirm against full source.
 */
1274 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
1276 	struct obd_export *exp = ll_i2dtexp(inode);
1277 	struct obd_trans_info oti = { 0 };
1278 	struct obdo *oa = NULL;
1281 	struct lov_stripe_md *lsm = NULL, *lsm2;
1283 	oa = kmem_cache_zalloc(obdo_cachep, GFP_NOFS);
1287 	lsm = ccc_inode_lsm_get(inode);
1288 	if (!lsm_has_objects(lsm)) {
1293 	lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1294 		   (lsm->lsm_stripe_count));
1296 	lsm2 = libcfs_kvzalloc(lsm_size, GFP_NOFS);
1303 	oa->o_nlink = ost_idx;
1304 	oa->o_flags |= OBD_FL_RECREATE_OBJS;
1305 	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1306 	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1307 				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1308 	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1309 	memcpy(lsm2, lsm, lsm_size);
1310 	ll_inode_size_lock(inode);
1311 	rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1312 	ll_inode_size_unlock(inode);
1317 	ccc_inode_lsm_put(inode, lsm);
1318 	kmem_cache_free(obdo_cachep, oa);
/*
 * LL_IOC_RECREATE_OBJ handler: copy the ll_recreate_obj request from
 * userspace and recreate the object by id on the given OST index.
 * Root-only (CFS_CAP_SYS_ADMIN).
 */
1322 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1324 	struct ll_recreate_obj ucreat;
1327 	if (!capable(CFS_CAP_SYS_ADMIN))
1330 	if (copy_from_user(&ucreat, (struct ll_recreate_obj __user *)arg,
/* Object id lives in an MDT0 sequence for this legacy ioctl. */
1334 	ostid_set_seq_mdt0(&oi);
1335 	ostid_set_id(&oi, ucreat.lrc_id);
1336 	return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
/*
 * LL_IOC_RECREATE_FID handler: like ll_lov_recreate_obj() but the target
 * object is named by FID; the OST index is recovered from the FID
 * sequence bits. Root-only.
 */
1339 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1345 	if (!capable(CFS_CAP_SYS_ADMIN))
1348 	if (copy_from_user(&fid, (struct lu_fid __user *)arg, sizeof(fid)))
1351 	fid_to_ostid(&fid, &oi);
/* IDIF FIDs encode the OST index in bits 16..31 of the sequence. */
1352 	ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1353 	return ll_lov_recreate(inode, &oi, ost_idx);
/*
 * Set the striping (LOV EA) for a file by re-opening it with an
 * IT_OPEN intent carrying the lov_user_md. Fails early if the file
 * already has a layout: striping can only be set on a new file.
 */
1356 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1357 			     int flags, struct lov_user_md *lum, int lum_size)
1359 	struct lov_stripe_md *lsm = NULL;
1360 	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* An existing lsm means the stripe EA is already set — cannot change. */
1363 	lsm = ccc_inode_lsm_get(inode);
1365 		ccc_inode_lsm_put(inode, lsm);
1366 		CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1372 	ll_inode_size_lock(inode);
1373 	rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1376 	rc = oit.d.lustre.it_status;
/* Close the open handle obtained by the intent; we only wanted the EA. */
1380 	ll_release_openhandle(inode, &oit);
1383 	ll_inode_size_unlock(inode);
1384 	ll_intent_release(&oit);
1385 	ccc_inode_lsm_put(inode, lsm);
1389 	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping info) of @filename (child of @inode) from
 * the MDS via md_getattr_name(). On success *lmmp points into the reply
 * buffer of *request, which the caller must release with
 * ptlrpc_req_finished(). The EA is byte-swapped to host endianness for
 * userspace consumption when needed.
 */
1393 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1394 			     struct lov_mds_md **lmmp, int *lmm_size,
1395 			     struct ptlrpc_request **request)
1397 	struct ll_sb_info *sbi = ll_i2sbi(inode);
1398 	struct mdt_body *body;
1399 	struct lov_mds_md *lmm = NULL;
1400 	struct ptlrpc_request *req = NULL;
1401 	struct md_op_data *op_data;
1404 	rc = ll_get_default_mdsize(sbi, &lmmsize);
1408 	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1409 				     strlen(filename), lmmsize,
1410 				     LUSTRE_OPC_ANY, NULL);
1411 	if (IS_ERR(op_data))
1412 		return PTR_ERR(op_data);
1414 	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1415 	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1416 	ll_finish_md_op_data(op_data);
1418 		CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1423 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1425 	lmmsize = body->eadatasize;
/* No EA in the reply (or zero-sized) means the file has no striping. */
1427 	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1433 	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
/* Only LOV v1/v3 layouts are understood here. */
1435 	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1436 	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1442 	 * This is coming from the MDS, so is probably in
1443 	 * little endian. We convert it to host endian before
1444 	 * passing it to userspace.
/* Swab only needed on big-endian hosts (LOV_MAGIC differs from LE form). */
1446 	if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) {
1449 		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1450 		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1453 		/* if function called for directory - we should
1454 		 * avoid swab not existent lsm objects
1456 		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1457 			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1458 			if (S_ISREG(body->mode))
1459 				lustre_swab_lov_user_md_objects(
1460 				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1462 		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1463 			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1464 			if (S_ISREG(body->mode))
1465 				lustre_swab_lov_user_md_objects(
1466 				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1473 	*lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: set a raw LOV EA (including object ids) on
 * the file — root-only, since MDS_OPEN_HAS_OBJS asserts the objects
 * already exist. Copies the lov_user_md from userspace into a kernel
 * buffer and hands it to ll_lov_setstripe_ea_info().
 */
1478 static int ll_lov_setea(struct inode *inode, struct file *file,
1481 	int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1482 	struct lov_user_md *lump;
/* Buffer sized for one lov_user_md plus a single ost_data entry. */
1483 	int lum_size = sizeof(struct lov_user_md) +
1484 		       sizeof(struct lov_user_ost_data);
1487 	if (!capable(CFS_CAP_SYS_ADMIN))
1490 	lump = libcfs_kvzalloc(lum_size, GFP_NOFS);
1494 	if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1499 	rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
/* Clear the O_LOV_DELAY_CREATE flag now that the layout has been set. */
1501 	cl_lov_delay_create_clear(&file->f_flags);
/*
 * LL_IOC_LOV_SETSTRIPE handler: set the striping for a file from a
 * userspace lov_user_md (v1 or v3). On success the layout is refreshed
 * and the resulting stripe info is copied back to the caller via the
 * GETSTRIPE obd_iocontrol path.
 */
1507 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1510 	struct lov_user_md_v3 lumv3;
/* lumv1 aliases the start of lumv3 — v1 is a prefix of v3. */
1511 	struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1512 	struct lov_user_md_v1 __user *lumv1p = (void __user *)arg;
1513 	struct lov_user_md_v3 __user *lumv3p = (void __user *)arg;
1515 	int flags = FMODE_WRITE;
1517 	/* first try with v1 which is smaller than v3 */
1518 	lum_size = sizeof(struct lov_user_md_v1);
1519 	if (copy_from_user(lumv1, lumv1p, lum_size))
/* The magic tells us it was really a v3 request; re-copy the full size. */
1522 	if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1523 		lum_size = sizeof(struct lov_user_md_v3);
1524 		if (copy_from_user(&lumv3, lumv3p, lum_size))
1528 	rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1530 	cl_lov_delay_create_clear(&file->f_flags);
1532 		struct lov_stripe_md *lsm;
/* Return value deliberately ignored: best-effort report to userspace. */
1535 		put_user(0, &lumv1p->lmm_stripe_count);
1537 		ll_layout_refresh(inode, &gen);
1538 		lsm = ccc_inode_lsm_get(inode);
1539 		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1540 				   0, lsm, (void __user *)arg);
1541 		ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: copy the file's striping information to
 * the userspace buffer at @arg via the LOV obd_iocontrol path.
 */
1546 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1548 	struct lov_stripe_md *lsm;
1551 	lsm = ccc_inode_lsm_get(inode);
1553 		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1554 				   lsm, (void __user *)arg);
1555 	ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group (gid) extent lock
 * on the whole file. Only one group lock per file descriptor; lli_lock
 * guards the fd flag/grouplock pair against concurrent lockers.
 */
1560 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1562 	struct ll_inode_info *lli = ll_i2info(inode);
1563 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1564 	struct ccc_grouplock grouplock;
/* gid 0 is reserved/invalid for group locks. */
1568 		CWARN("group id for group lock must not be 0\n");
1572 	if (ll_file_nolock(file))
1575 	spin_lock(&lli->lli_lock);
1576 	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1577 		CWARN("group lock already existed with gid %lu\n",
1578 		      fd->fd_grouplock.cg_gid);
1579 		spin_unlock(&lli->lli_lock);
1582 	LASSERT(!fd->fd_grouplock.cg_lock);
1583 	spin_unlock(&lli->lli_lock);
/* Drop the spinlock across the (potentially blocking) DLM enqueue. */
1585 	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1586 			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: another thread may have won the race. */
1590 	spin_lock(&lli->lli_lock);
1591 	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1592 		spin_unlock(&lli->lli_lock);
1593 		CERROR("another thread just won the race\n");
1594 		cl_put_grouplock(&grouplock);
1598 	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1599 	fd->fd_grouplock = grouplock;
1600 	spin_unlock(&lli->lli_lock);
1602 	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock previously taken
 * with ll_get_grouplock(). The supplied gid must match the one held;
 * state is cleared under lli_lock before the DLM lock is dropped.
 */
1606 static int ll_put_grouplock(struct inode *inode, struct file *file,
1609 	struct ll_inode_info *lli = ll_i2info(inode);
1610 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1611 	struct ccc_grouplock grouplock;
1613 	spin_lock(&lli->lli_lock);
1614 	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1615 		spin_unlock(&lli->lli_lock);
1616 		CWARN("no group lock held\n");
1619 	LASSERT(fd->fd_grouplock.cg_lock);
1621 	if (fd->fd_grouplock.cg_gid != arg) {
1622 		CWARN("group lock %lu doesn't match current id %lu\n",
1623 		      arg, fd->fd_grouplock.cg_gid);
1624 		spin_unlock(&lli->lli_lock);
/* Detach the grouplock from the fd before releasing, outside the lock. */
1628 	grouplock = fd->fd_grouplock;
1629 	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1630 	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1631 	spin_unlock(&lli->lli_lock);
1633 	cl_put_grouplock(&grouplock);
1634 	CDEBUG(D_INFO, "group lock %lu released\n", arg);
1639 * Close inode open handle
1641 * \param inode [in] inode in question
1642 * \param it [in,out] intent which contains open info and result
1645 * \retval <0 failure
/*
 * Close the MDS open handle carried by @it (obtained via an open
 * intent) when the caller does not intend to keep the file open.
 * Allocates a temporary obd_client_handle, fills it from the intent,
 * and issues the close RPC.
 */
1647 int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
1649 	struct obd_client_handle *och;
1654 	/* Root ? Do nothing. */
1655 	if (is_root_inode(inode))
1658 	/* No open handle to close? Move away */
1659 	if (!it_disposition(it, DISP_OPEN_OPEN))
1662 	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1664 	och = kzalloc(sizeof(*och), GFP_NOFS);
1670 	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1672 	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1675 	/* this one is in place of ll_file_open */
1676 	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1677 		ptlrpc_req_finished(it->d.lustre.it_data);
1678 		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1684 * Get size for inode for which FIEMAP mapping is requested.
1685 * Make the FIEMAP get_info call and returns the result.
/*
 * Core FIEMAP implementation: validate flags, optionally flush dirty
 * pages (FIEMAP_FLAG_SYNC), then ask the LOV/OSC layer for the extent
 * mapping via obd_get_info(KEY_FIEMAP). @num_bytes bounds the whole
 * fiemap buffer including the extent array.
 */
1687 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1690 	struct obd_export *exp = ll_i2dtexp(inode);
1691 	struct lov_stripe_md *lsm = NULL;
1692 	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1693 	__u32 vallen = num_bytes;
1696 	/* Checks for fiemap flags */
/* Reject unknown flags; report back which ones we do not support. */
1697 	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1698 		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1702 	/* Check for FIEMAP_FLAG_SYNC */
1703 	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1704 		rc = filemap_fdatawrite(inode->i_mapping);
1709 	lsm = ccc_inode_lsm_get(inode);
1713 	/* If the stripe_count > 1 and the application does not understand
1714 	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1716 	if (lsm->lsm_stripe_count > 1 &&
1717 	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1722 	fm_key.oa.o_oi = lsm->lsm_oi;
1723 	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
/* Make sure we have an up-to-date size before deciding on emptiness. */
1725 	if (i_size_read(inode) == 0) {
1726 		rc = ll_glimpse_size(inode);
1731 	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1732 	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1733 	/* If filesize is 0, then there would be no objects for mapping */
1734 	if (fm_key.oa.o_size == 0) {
1735 		fiemap->fm_mapped_extents = 0;
1740 	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1742 	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1745 		CERROR("obd_get_info failed: rc = %d\n", rc);
1748 	ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a pathname on the MDS.
 * Gated on CFS_CAP_DAC_READ_SEARCH unless the mount allows user
 * fid2path. The output buffer is sized from the user-supplied pathlen.
 */
1752 int ll_fid2path(struct inode *inode, void __user *arg)
1754 	struct obd_export *exp = ll_i2mdexp(inode);
1755 	const struct getinfo_fid2path __user *gfin = arg;
1756 	struct getinfo_fid2path *gfout;
1761 	if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1762 	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1765 	/* Only need to get the buflen */
1766 	if (get_user(pathlen, &gfin->gf_pathlen))
/* Cap the allocation to a sane PATH_MAX-sized request. */
1769 	if (pathlen > PATH_MAX)
1772 	outsize = sizeof(*gfout) + pathlen;
1774 	gfout = kzalloc(outsize, GFP_NOFS);
1778 	if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1783 	/* Call mdc_iocontrol */
1784 	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1788 	if (copy_to_user(arg, gfout, outsize))
/*
 * FSFILT_IOC_FIEMAP ioctl entry: size and allocate a kernel fiemap
 * buffer from the user-requested extent count, copy in the request
 * (and the continuation extent if present), run ll_do_fiemap(), and
 * copy the mapped extents back out.
 */
1796 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1798 	struct ll_user_fiemap *fiemap_s;
1799 	size_t num_bytes, ret_bytes;
1800 	unsigned int extent_count;
1803 	/* Get the extent count so we can calculate the size of
1804 	 * required fiemap buffer
1806 	if (get_user(extent_count,
1807 		     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* Overflow check before computing num_bytes below. */
1811 	    (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1813 	num_bytes = sizeof(*fiemap_s) + (extent_count *
1814 					 sizeof(struct ll_fiemap_extent));
1816 	fiemap_s = libcfs_kvzalloc(num_bytes, GFP_NOFS);
1820 	/* get the fiemap value */
1821 	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1822 			   sizeof(*fiemap_s))) {
1827 	/* If fm_extent_count is non-zero, read the first extent since
1828 	 * it is used to calculate end_offset and device from previous
1832 		if (copy_from_user(&fiemap_s->fm_extents[0],
1833 				   (char __user *)arg + sizeof(*fiemap_s),
1834 				   sizeof(struct ll_fiemap_extent))) {
1840 	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back the header plus only the extents actually mapped. */
1844 	ret_bytes = sizeof(struct ll_user_fiemap);
1846 	if (extent_count != 0)
1847 		ret_bytes += (fiemap_s->fm_mapped_extents *
1848 				 sizeof(struct ll_fiemap_extent));
1850 	if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
1859 * Read the data_version for inode.
1861 * This value is computed using stripe object version on OST.
1862 * Version is computed using server side locking.
1864 * @param extent_lock Take extent lock. Not needed if a process is already
1865 * holding the OST object group locks.
/*
 * Read the file's data_version from the OSTs via ll_lsm_getattr().
 * Files with no striping objects report version 0. @extent_lock asks
 * the server to take extent locks while computing the version (not
 * needed if the caller already holds group locks on the objects).
 */
1867 int ll_data_version(struct inode *inode, __u64 *data_version,
1870 	struct lov_stripe_md *lsm = NULL;
1871 	struct ll_sb_info *sbi = ll_i2sbi(inode);
1872 	struct obdo *obdo = NULL;
1875 	/* If no stripe, we consider version is 0. */
1876 	lsm = ccc_inode_lsm_get(inode);
1877 	if (!lsm_has_objects(lsm)) {
1879 		CDEBUG(D_INODE, "No object for inode\n");
1884 	obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1890 	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, obdo, 0, extent_lock);
/* The OST must confirm it actually returned a data version. */
1892 		if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1895 			*data_version = obdo->o_data_version;
1900 	ccc_inode_lsm_put(inode, lsm);
1905 * Trigger a HSM release request for the provided inode.
/*
 * Trigger an HSM release for @inode: take a write lease with
 * MDS_OPEN_RELEASE, sample the latest data_version, merge size/time
 * attributes, then close the handle so the MDT can release the file's
 * OST objects (the data remains archived in HSM).
 */
1909 	struct cl_env_nest nest;
1911 	struct obd_client_handle *och = NULL;
1912 	__u64 data_version = 0;
1915 	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1916 	       ll_get_fsname(inode->i_sb, NULL, 0),
1917 	       PFID(&ll_i2info(inode)->lli_fid));
1919 	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1925 	/* Grab latest data_version and [am]time values */
1926 	rc = ll_data_version(inode, &data_version, 1);
1930 	env = cl_env_nested_get(&nest);
1936 	ll_merge_lvb(env, inode);
1937 	cl_env_nested_put(&nest, env);
1939 	/* Release the file.
1940 	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1941 	 * we still need it to pack l_remote_handle to MDT.
1943 	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
/* On the error path the lease (if obtained) must still be closed. */
1948 	if (och && !IS_ERR(och)) /* close the file */
1949 		ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): saved [am]time attrs, the two
 * inodes involved, and whether each side's data_version must be
 * verified before the swap.
 */
1954 struct ll_swap_stack {
1955 	struct iattr	 ia1, ia2;
1957 	struct inode	*inode1, *inode2;
1958 	bool		 check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of two regular files on the same filesystem via an MDT
 * request. Optionally flushes dirty data under a group lock, verifies
 * data_versions, and restores mtime/atime afterwards if requested.
 */
1961 static int ll_swap_layouts(struct file *file1, struct file *file2,
1962 			   struct lustre_swap_layouts *lsl)
1964 	struct mdc_swap_layouts	 msl;
1965 	struct md_op_data	*op_data;
1968 	struct ll_swap_stack	*llss = NULL;
1971 	llss = kzalloc(sizeof(*llss), GFP_NOFS);
1975 	llss->inode1 = file_inode(file1);
1976 	llss->inode2 = file_inode(file2);
1978 	if (!S_ISREG(llss->inode2->i_mode)) {
/* Both files must be writable by the caller. */
1983 	if (inode_permission(llss->inode1, MAY_WRITE) ||
1984 	    inode_permission(llss->inode2, MAY_WRITE)) {
1989 	if (llss->inode2->i_sb != llss->inode1->i_sb) {
1994 	/* we use 2 bool because it is easier to swap than 2 bits */
1995 	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1996 		llss->check_dv1 = true;
1998 	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1999 		llss->check_dv2 = true;
2001 	/* we cannot use lsl->sl_dvX directly because we may swap them */
2002 	llss->dv1 = lsl->sl_dv1;
2003 	llss->dv2 = lsl->sl_dv2;
2005 	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2006 	if (rc == 0) /* same file, done! */ {
/* Order the pair by FID so lock acquisition is deadlock-free. */
2011 	if (rc < 0) { /* sequentialize it */
2012 		swap(llss->inode1, llss->inode2);
2014 		swap(llss->dv1, llss->dv2);
2015 		swap(llss->check_dv1, llss->check_dv2);
2019 	if (gid != 0) { /* application asks to flush dirty cache */
2020 		rc = ll_get_grouplock(llss->inode1, file1, gid);
2024 		rc = ll_get_grouplock(llss->inode2, file2, gid);
2026 			ll_put_grouplock(llss->inode1, file1, gid);
2031 	/* to be able to restore mtime and atime after swap
2032 	 * we need to first save them
2035 	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2036 		llss->ia1.ia_mtime = llss->inode1->i_mtime;
2037 		llss->ia1.ia_atime = llss->inode1->i_atime;
2038 		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2039 		llss->ia2.ia_mtime = llss->inode2->i_mtime;
2040 		llss->ia2.ia_atime = llss->inode2->i_atime;
2041 		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2044 	/* ultimate check, before swapping the layouts we check if
2045 	 * dataversion has changed (if requested)
2047 	if (llss->check_dv1) {
2048 		rc = ll_data_version(llss->inode1, &dv, 0);
2051 		if (dv != llss->dv1) {
2057 	if (llss->check_dv2) {
2058 		rc = ll_data_version(llss->inode2, &dv, 0);
2061 		if (dv != llss->dv2) {
2067 	/* struct md_op_data is used to send the swap args to the mdt
2068 	 * only flags is missing, so we use struct mdc_swap_layouts
2069 	 * through the md_op_data->op_data
2071 	/* flags from user space have to be converted before they are send to
2072 	 * server, no flag is sent today, they are only used on the client
2076 	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2077 				     0, LUSTRE_OPC_ANY, &msl);
2078 	if (IS_ERR(op_data)) {
2079 		rc = PTR_ERR(op_data);
2083 	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2084 			   sizeof(*op_data), op_data, NULL);
2085 	ll_finish_md_op_data(op_data);
/* Release group locks in reverse acquisition order. */
2089 		ll_put_grouplock(llss->inode2, file2, gid);
2090 		ll_put_grouplock(llss->inode1, file1, gid);
2093 	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2097 	/* clear useless flags */
2098 	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2099 		llss->ia1.ia_valid &= ~ATTR_MTIME;
2100 		llss->ia2.ia_valid &= ~ATTR_MTIME;
2103 	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2104 		llss->ia1.ia_valid &= ~ATTR_ATIME;
2105 		llss->ia2.ia_valid &= ~ATTR_ATIME;
2108 	/* update time if requested */
/* ia2 now belongs to inode1 (and vice versa) after the swap above. */
2110 	if (llss->ia2.ia_valid != 0) {
2111 		inode_lock(llss->inode1);
2112 		rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
2113 		inode_unlock(llss->inode1);
2116 	if (llss->ia1.ia_valid != 0) {
2119 		inode_lock(llss->inode2);
2120 		rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
2121 		inode_unlock(llss->inode2);
/*
 * Set/clear HSM state flags on @inode via an MDT iocontrol. Validates
 * the requested masks (flag range, privileged-only bits, archive id
 * range) before sending.
 */
2132 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2134 	struct md_op_data	*op_data;
2137 	/* Detect out-of range masks */
2138 	if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2141 	/* Non-root users are forbidden to set or clear flags which are
2142 	 * NOT defined in HSM_USER_MASK.
2144 	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2145 	    !capable(CFS_CAP_SYS_ADMIN))
2148 	/* Detect out-of range archive id */
2149 	if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2150 	    (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2153 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2154 				     LUSTRE_OPC_ANY, hss);
2155 	if (IS_ERR(op_data))
2156 		return PTR_ERR(op_data);
2158 	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2159 			   sizeof(*op_data), op_data, NULL);
2161 	ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT helper: mark a freshly created file as an HSM
 * "released" copy of archived data (ARCHIVED|EXISTS|RELEASED), then
 * force its mode/owner/size/times to the values recorded in the
 * hsm_user_import request via ll_setattr_raw().
 */
2166 static int ll_hsm_import(struct inode *inode, struct file *file,
2167 			 struct hsm_user_import *hui)
2169 	struct hsm_state_set	*hss = NULL;
2170 	struct iattr		*attr = NULL;
2173 	if (!S_ISREG(inode->i_mode))
2177 	hss = kzalloc(sizeof(*hss), GFP_NOFS);
/* Step 1: set the HSM flags describing an imported/released file. */
2181 	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2182 	hss->hss_archive_id = hui->hui_archive_id;
2183 	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2184 	rc = ll_hsm_state_set(inode, hss);
2188 	attr = kzalloc(sizeof(*attr), GFP_NOFS);
/* Step 2: restore the archived file's attributes on the stub inode. */
2194 	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2195 	attr->ia_mode |= S_IFREG;
2196 	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2197 	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2198 	attr->ia_size = hui->hui_size;
2199 	attr->ia_mtime.tv_sec = hui->hui_mtime;
2200 	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2201 	attr->ia_atime.tv_sec = hui->hui_atime;
2202 	attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2204 	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2205 			 ATTR_UID | ATTR_GID |
2206 			 ATTR_MTIME | ATTR_MTIME_SET |
2207 			 ATTR_ATIME | ATTR_ATIME_SET;
2211 	rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2215 	inode_unlock(inode);
/*
 * Main ioctl dispatcher for regular files: per-fd flag manipulation,
 * striping get/set, layout swap, object recreation, FIEMAP, group
 * locks, FID/path translation, data_version, HSM operations and lease
 * management. Unknown commands fall through to the registered
 * ll_iocontrol_call() hooks and finally to the data export.
 */
2224 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2226 	struct inode		*inode = file_inode(file);
2227 	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
2230 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2231 	       inode->i_generation, inode, cmd);
2232 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2234 	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2235 	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2239 	case LL_IOC_GETFLAGS:
2240 		/* Get the current value of the file flags */
2241 		return put_user(fd->fd_flags, (int __user *)arg);
2242 	case LL_IOC_SETFLAGS:
2243 	case LL_IOC_CLRFLAGS:
2244 		/* Set or clear specific file flags */
2245 		/* XXX This probably needs checks to ensure the flags are
2246 		 * not abused, and to handle any flag side effects.
2248 		if (get_user(flags, (int __user *)arg))
/* Ignoring locks is only safe when the page cache is bypassed. */
2251 		if (cmd == LL_IOC_SETFLAGS) {
2252 			if ((flags & LL_FILE_IGNORE_LOCK) &&
2253 			    !(file->f_flags & O_DIRECT)) {
2254 				CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2259 			fd->fd_flags |= flags;
2261 			fd->fd_flags &= ~flags;
2264 	case LL_IOC_LOV_SETSTRIPE:
2265 		return ll_lov_setstripe(inode, file, arg);
2266 	case LL_IOC_LOV_SETEA:
2267 		return ll_lov_setea(inode, file, arg);
2268 	case LL_IOC_LOV_SWAP_LAYOUTS: {
2270 		struct lustre_swap_layouts lsl;
2272 		if (copy_from_user(&lsl, (char __user *)arg,
2273 				   sizeof(struct lustre_swap_layouts)))
/* Both file descriptors must be open for write to swap layouts. */
2276 		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2279 		file2 = fget(lsl.sl_fd);
2284 		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2285 			rc = ll_swap_layouts(file, file2, &lsl);
2289 	case LL_IOC_LOV_GETSTRIPE:
2290 		return ll_lov_getstripe(inode, arg);
2291 	case LL_IOC_RECREATE_OBJ:
2292 		return ll_lov_recreate_obj(inode, arg);
2293 	case LL_IOC_RECREATE_FID:
2294 		return ll_lov_recreate_fid(inode, arg);
2295 	case FSFILT_IOC_FIEMAP:
2296 		return ll_ioctl_fiemap(inode, arg);
2297 	case FSFILT_IOC_GETFLAGS:
2298 	case FSFILT_IOC_SETFLAGS:
2299 		return ll_iocontrol(inode, file, cmd, arg);
2300 	case FSFILT_IOC_GETVERSION_OLD:
2301 	case FSFILT_IOC_GETVERSION:
2302 		return put_user(inode->i_generation, (int __user *)arg);
2303 	case LL_IOC_GROUP_LOCK:
2304 		return ll_get_grouplock(inode, file, arg);
2305 	case LL_IOC_GROUP_UNLOCK:
2306 		return ll_put_grouplock(inode, file, arg);
2307 	case IOC_OBD_STATFS:
2308 		return ll_obd_statfs(inode, (void __user *)arg);
2310 	/* We need to special case any other ioctls we want to handle,
2311 	 * to send them to the MDS/OST as appropriate and to properly
2312 	 * network encode the arg field.
2313 	case FSFILT_IOC_SETVERSION_OLD:
2314 	case FSFILT_IOC_SETVERSION:
2316 	case LL_IOC_FLUSHCTX:
2317 		return ll_flush_ctx(inode);
2318 	case LL_IOC_PATH2FID: {
2319 		if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2320 				 sizeof(struct lu_fid)))
2325 	case OBD_IOC_FID2PATH:
2326 		return ll_fid2path(inode, (void __user *)arg);
2327 	case LL_IOC_DATA_VERSION: {
2328 		struct ioc_data_version	idv;
2331 		if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
2334 		rc = ll_data_version(inode, &idv.idv_version,
2335 				     !(idv.idv_flags & LL_DV_NOFLUSH));
2337 		if (rc == 0 && copy_to_user((char __user *)arg, &idv,
2344 	case LL_IOC_GET_MDTIDX: {
2347 		mdtidx = ll_get_mdt_idx(inode);
2351 		if (put_user(mdtidx, (int __user *)arg))
2356 	case OBD_IOC_GETDTNAME:
2357 	case OBD_IOC_GETMDNAME:
2358 		return ll_get_obd_name(inode, cmd, arg);
2359 	case LL_IOC_HSM_STATE_GET: {
2360 		struct md_op_data	*op_data;
2361 		struct hsm_user_state	*hus;
2364 		hus = kzalloc(sizeof(*hus), GFP_NOFS);
2368 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2369 					     LUSTRE_OPC_ANY, hus);
2370 		if (IS_ERR(op_data)) {
2372 			return PTR_ERR(op_data);
2375 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2378 		if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2381 		ll_finish_md_op_data(op_data);
2385 	case LL_IOC_HSM_STATE_SET: {
2386 		struct hsm_state_set	*hss;
2389 		hss = memdup_user((char __user *)arg, sizeof(*hss));
2391 			return PTR_ERR(hss);
2393 		rc = ll_hsm_state_set(inode, hss);
2398 	case LL_IOC_HSM_ACTION: {
2399 		struct md_op_data		*op_data;
2400 		struct hsm_current_action	*hca;
2403 		hca = kzalloc(sizeof(*hca), GFP_NOFS);
2407 		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2408 					     LUSTRE_OPC_ANY, hca);
2409 		if (IS_ERR(op_data)) {
2411 			return PTR_ERR(op_data);
2414 		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2417 		if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2420 		ll_finish_md_op_data(op_data);
2424 	case LL_IOC_SET_LEASE: {
2425 		struct ll_inode_info *lli = ll_i2info(inode);
2426 		struct obd_client_handle *och = NULL;
/* Lease mode must be compatible with how the file was opened. */
2432 			if (!(file->f_mode & FMODE_WRITE))
2437 			if (!(file->f_mode & FMODE_READ))
/* Unlock path: detach any existing lease from the fd first. */
2442 			mutex_lock(&lli->lli_och_mutex);
2443 			if (fd->fd_lease_och) {
2444 				och = fd->fd_lease_och;
2445 				fd->fd_lease_och = NULL;
2447 			mutex_unlock(&lli->lli_och_mutex);
2450 			mode = och->och_flags &
2451 			       (FMODE_READ|FMODE_WRITE);
2452 			rc = ll_lease_close(och, inode, &lease_broken);
2453 			if (rc == 0 && lease_broken)
2459 			/* return the type of lease or error */
2460 			return rc < 0 ? rc : (int)mode;
2465 		CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2467 		/* apply for lease */
2468 		och = ll_lease_open(inode, file, mode, 0);
2470 			return PTR_ERR(och);
2473 		mutex_lock(&lli->lli_och_mutex);
2474 		if (!fd->fd_lease_och) {
2475 			fd->fd_lease_och = och;
2478 		mutex_unlock(&lli->lli_och_mutex);
2480 			/* impossible now that only excl is supported for now */
2481 			ll_lease_close(och, inode, &lease_broken);
2486 	case LL_IOC_GET_LEASE: {
2487 		struct ll_inode_info *lli = ll_i2info(inode);
2488 		struct ldlm_lock *lock = NULL;
2491 		mutex_lock(&lli->lli_och_mutex);
2492 		if (fd->fd_lease_och) {
2493 			struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease mode only if the DLM lock is still valid. */
2495 			lock = ldlm_handle2lock(&och->och_lease_handle);
2497 				lock_res_and_lock(lock);
2498 				if (!ldlm_is_cancel(lock))
2499 					rc = och->och_flags &
2500 					     (FMODE_READ | FMODE_WRITE);
2501 				unlock_res_and_lock(lock);
2502 				ldlm_lock_put(lock);
2505 		mutex_unlock(&lli->lli_och_mutex);
2508 	case LL_IOC_HSM_IMPORT: {
2509 		struct hsm_user_import *hui;
2511 		hui = memdup_user((void __user *)arg, sizeof(*hui));
2513 			return PTR_ERR(hui);
2515 		rc = ll_hsm_import(inode, file, hui);
/* Fall back to pluggable handlers, then to the data-device export. */
2523 		if (ll_iocontrol_call(inode, file, cmd, arg, &err) ==
2527 		return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2528 				     (void __user *)arg);
/*
 * VFS ->llseek handler. For SEEK_END/SEEK_HOLE/SEEK_DATA the real file
 * size must be fetched from the OSTs (glimpse) before delegating to
 * generic_file_llseek_size().
 */
2533 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2535 	struct inode *inode = file_inode(file);
2536 	loff_t retval, eof = 0;
2538 	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2539 			   (origin == SEEK_CUR) ? file->f_pos : 0);
2540 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2541 	       inode->i_ino, inode->i_generation, inode, retval, retval,
2543 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins need an authoritative i_size from the servers. */
2545 	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2546 		retval = ll_glimpse_size(inode);
2549 		eof = i_size_read(inode);
2552 	retval = generic_file_llseek_size(file, offset, origin,
2553 					  ll_file_maxbytes(inode), eof);
/*
 * VFS ->flush handler (called on close()): surface any asynchronous
 * writeback errors recorded against this inode, unless the application
 * has already been told about the failure on this fd.
 */
2557 static int ll_flush(struct file *file, fl_owner_t id)
2559 	struct inode *inode = file_inode(file);
2560 	struct ll_inode_info *lli = ll_i2info(inode);
2561 	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2564 	LASSERT(!S_ISDIR(inode->i_mode));
2566 	/* catch async errors that were recorded back when async writeback
2567 	 * failed for pages in this mapping.
/* Reading lli_async_rc also clears it — errors are reported once. */
2569 	rc = lli->lli_async_rc;
2570 	lli->lli_async_rc = 0;
2571 	err = lov_read_and_clear_async_rc(lli->lli_clob);
2575 	/* The application has been told about write failure already.
2576 	 * Do not report failure again.
2578 	if (fd->fd_write_failed)
2580 	return rc ? -EIO : 0;
2584 * Called to make sure a portion of file has been written out.
2585 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2587 * Return how many pages have been written.
/*
 * Write out [start, end] of @inode through a CIT_FSYNC cl_io. Unless
 * @mode is CL_FSYNC_LOCAL this also sends OST_SYNC RPCs so the data is
 * stable on the servers. Returns the number of pages written on
 * success, negative errno on failure.
 */
2589 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2590 		       enum cl_fsync_mode mode, int ignore_layout)
2592 	struct cl_env_nest nest;
2595 	struct cl_fsync_io *fio;
2598 	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2599 	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2602 	env = cl_env_nested_get(&nest);
2604 		return PTR_ERR(env);
2606 	io = ccc_env_thread_io(env);
2607 	io->ci_obj = cl_i2info(inode)->lli_clob;
2608 	io->ci_ignore_layout = ignore_layout;
2610 	/* initialize parameters for sync */
2611 	fio = &io->u.ci_fsync;
2612 	fio->fi_start = start;
2614 	fio->fi_fid = ll_inode2fid(inode);
2615 	fio->fi_mode = mode;
2616 	fio->fi_nr_written = 0;
2618 	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2619 		result = cl_io_loop(env, io);
2621 		result = io->ci_result;
/* Success: report how many pages were written by the fsync io. */
2623 		result = fio->fi_nr_written;
2624 	cl_io_fini(env, io);
2625 	cl_env_nested_put(&nest, env);
/*
 * VFS ->fsync handler: flush the page-cache range, pick up recorded
 * async writeback errors, sync metadata via md_sync() to the MDS, and
 * for regular files force data to the OSTs with CL_FSYNC_ALL. Updates
 * fd_write_failed so ll_flush() does not double-report.
 */
2630 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2632 	struct inode *inode = file_inode(file);
2633 	struct ll_inode_info *lli = ll_i2info(inode);
2634 	struct ptlrpc_request *req;
2637 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2638 	       inode->i_generation, inode);
2639 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2641 	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2644 	/* catch async errors that were recorded back when async writeback
2645 	 * failed for pages in this mapping.
2647 	if (!S_ISDIR(inode->i_mode)) {
2648 		err = lli->lli_async_rc;
2649 		lli->lli_async_rc = 0;
2652 		err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Metadata sync to the MDS; the reply request is simply released. */
2657 	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2661 		ptlrpc_req_finished(req);
2663 	if (S_ISREG(inode->i_mode)) {
2664 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2666 		err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2667 		if (rc == 0 && err < 0)
/* Remember the outcome so close() reporting stays consistent. */
2670 			fd->fd_write_failed = true;
2672 			fd->fd_write_failed = false;
2675 	inode_unlock(inode);
/*
 * VFS ->lock/->flock handler: translate a POSIX/BSD file_lock into an
 * LDLM_FLOCK enqueue to the MDS. F_UNLCK is modelled as an LCK_NL
 * enqueue rather than a cancel; on local bookkeeping failure the server
 * lock is rolled back with a compensating NL enqueue.
 */
2680 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2682 	struct inode *inode = file_inode(file);
2683 	struct ll_sb_info *sbi = ll_i2sbi(inode);
2684 	struct ldlm_enqueue_info einfo = {
2685 		.ei_type	= LDLM_FLOCK,
2686 		.ei_cb_cp	= ldlm_flock_completion_ast,
2687 		.ei_cbdata	= file_lock,
2689 	struct md_op_data *op_data;
2690 	struct lustre_handle lockh = {0};
2691 	ldlm_policy_data_t flock = { {0} };
2696 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2697 	       inode->i_ino, file_lock);
2699 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2701 	if (file_lock->fl_flags & FL_FLOCK)
2702 		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2703 	else if (!(file_lock->fl_flags & FL_POSIX))
2706 	flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2707 	flock.l_flock.pid = file_lock->fl_pid;
2708 	flock.l_flock.start = file_lock->fl_start;
2709 	flock.l_flock.end = file_lock->fl_end;
2711 	/* Somewhat ugly workaround for svc lockd.
2712 	 * lockd installs custom fl_lmops->lm_compare_owner that checks
2713 	 * for the fl_owner to be the same (which it always is on local node
2714 	 * I guess between lockd processes) and then compares pid.
2715 	 * As such we assign pid to the owner field to make it all work,
2716 	 * conflict with normal locks is unlikely since pid space and
2717 	 * pointer space for current->files are not intersecting
2719 	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2720 		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type onto an LDLM lock mode. */
2722 	switch (file_lock->fl_type) {
2724 		einfo.ei_mode = LCK_PR;
2727 		/* An unlock request may or may not have any relation to
2728 		 * existing locks so we may not be able to pass a lock handle
2729 		 * via a normal ldlm_lock_cancel() request. The request may even
2730 		 * unlock a byte range in the middle of an existing lock. In
2731 		 * order to process an unlock request we need all of the same
2732 		 * information that is given with a normal read or write record
2733 		 * lock request. To avoid creating another ldlm unlock (cancel)
2734 		 * message we'll treat a LCK_NL flock request as an unlock.
2736 		einfo.ei_mode = LCK_NL;
2739 		einfo.ei_mode = LCK_PW;
2742 		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2743 		       file_lock->fl_type);
/* Non-blocking set: let the server fail fast instead of waiting. */
2758 		flags = LDLM_FL_BLOCK_NOWAIT;
2764 		flags = LDLM_FL_TEST_LOCK;
2765 		/* Save the old mode so that if the mode in the lock changes we
2766 		 * can decrement the appropriate reader or writer refcount.
2768 		file_lock->fl_type = einfo.ei_mode;
2771 		CERROR("unknown fcntl lock command: %d\n", cmd);
2775 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2776 				     LUSTRE_OPC_ANY, NULL);
2777 	if (IS_ERR(op_data))
2778 		return PTR_ERR(op_data);
2780 	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2781 	       inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2782 	       flock.l_flock.start, flock.l_flock.end);
2784 	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2785 			op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* Mirror the server-side result in the local VFS lock bookkeeping. */
2787 	if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
2788 	    !(flags & LDLM_FL_TEST_LOCK))
2789 		rc2  = locks_lock_file_wait(file, file_lock);
2791 	if (rc2 && file_lock->fl_type != F_UNLCK) {
/* Local bookkeeping failed: undo the server lock with an NL enqueue. */
2792 		einfo.ei_mode = LCK_NL;
2793 		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2794 			   op_data, &lockh, &flock, 0, NULL /* req */, flags);
2798 	ll_finish_md_op_data(op_data);
/*
 * Stub lock handler installed as .flock/.lock for "-o noflock" mounts
 * (see ll_file_operations_noflock below).  Body is elided from this view;
 * presumably it returns -ENOSYS per the comment above the noflock table —
 * TODO confirm against the full source.
 */
2804 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2810 * test if some locks matching bits and l_req_mode are acquired
2811 * - bits can be in different locks
2812 * - if found clear the common lock bits in *bits
2813 * - the bits not found, are kept in *bits
2815 * \param bits [IN] searched lock bits
2816 * \param l_req_mode [IN] searched lock mode
2817 * \retval boolean, true iff all bits are found
2819 int ll_have_md_lock(struct inode *inode, __u64 *bits,
2820 enum ldlm_mode l_req_mode)
2822 struct lustre_handle lockh;
2823 ldlm_policy_data_t policy;
/* LCK_MINMODE means "match any mode": expand to the union of all modes. */
2824 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
2825 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2833 fid = &ll_i2info(inode)->lli_fid;
2834 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2835 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference; no decref needed on hit. */
2837 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each inodebit individually until all requested bits are found. */
2838 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
/* NOTE(review): "1 << i" is int-width; safe only while
 * MDS_INODELOCK_MAXSHIFT < 31 — confirm. */
2839 policy.l_inodebits.bits = *bits & (1 << i);
2840 if (policy.l_inodebits.bits == 0)
2843 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2844 &policy, mode, &lockh)) {
2845 struct ldlm_lock *lock;
2847 lock = ldlm_handle2lock(&lockh);
/* Clear every bit covered by the matched lock, not just the probed one. */
2850 ~(lock->l_policy_data.l_inodebits.bits);
2851 LDLM_LOCK_PUT(lock);
2853 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and hold) an MDS inodebits lock covering @bits in one of
 * the modes in @mode.  Unlike ll_have_md_lock() this takes a reference on
 * the matched lock (handle returned via @lockh); the caller must decref.
 * Returns the matched mode, or 0 on no match (per md_lock_match()).
 */
2860 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
2861 struct lustre_handle *lockh, __u64 flags,
2862 enum ldlm_mode mode)
2864 ldlm_policy_data_t policy = { .l_inodebits = {bits} };
2868 fid = &ll_i2info(inode)->lli_fid;
2869 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2871 rc = md_lock_match(ll_i2mdexp(inode), flags | LDLM_FL_BLOCK_GRANTED,
2872 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Common tail for revalidation: translate the getattr/intent result @rc.
 * -ENOENT on an already-unlinked inode is not an error for the caller;
 * other failures are logged (quietly for -EACCES/-EIDRM) and passed up.
 */
2877 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2879 /* Already unlinked. Just update nlink and return success */
2880 if (rc == -ENOENT) {
2882 /* This path cannot be hit for regular files unless in
2883 * case of obscure races, so no need to validate size.
2885 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2887 } else if (rc != 0) {
2888 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2889 "%s: revalidate FID "DFID" error: rc = %d\n",
2890 ll_get_fsname(inode->i_sb, NULL, 0),
2891 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh inode metadata from the MDS if the client does not already hold
 * matching ibits locks.  Two paths:
 *  - ATTRFID-capable server: getattr-by-FID via an intent lock, which also
 *    revalidates the dentry (may invalidate it if the file was unlinked);
 *  - otherwise: plain md_getattr() guarded by ll_have_md_lock(), requesting
 *    EA/layout data too for regular files.
 * Returns 0 on success or a negative errno.
 */
2897 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2899 struct inode *inode = d_inode(dentry);
2900 struct ptlrpc_request *req = NULL;
2901 struct obd_export *exp;
2904 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
2905 inode->i_ino, inode->i_generation, inode, dentry);
2907 exp = ll_i2mdexp(inode);
2909 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2910 * But under CMD case, it caused some lock issues, should be fixed
2911 * with new CMD ibits lock. See bug 12718
2913 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2914 struct lookup_intent oit = { .it_op = IT_GETATTR };
2915 struct md_op_data *op_data;
/* Pure LOOKUP revalidation needs only the cheaper IT_LOOKUP intent. */
2917 if (ibits == MDS_INODELOCK_LOOKUP)
2918 oit.it_op = IT_LOOKUP;
2920 /* Call getattr by fid, so do not provide name at all. */
2921 op_data = ll_prep_md_op_data(NULL, inode,
2923 LUSTRE_OPC_ANY, NULL);
2924 if (IS_ERR(op_data))
2925 return PTR_ERR(op_data);
2927 oit.it_create_mode |= M_CHECK_STALE;
2928 rc = md_intent_lock(exp, op_data, NULL, 0,
2929 /* we are not interested in name
2933 ll_md_blocking_ast, 0);
2934 ll_finish_md_op_data(op_data);
2935 oit.it_create_mode &= ~M_CHECK_STALE;
2937 rc = ll_inode_revalidate_fini(inode, rc);
2941 rc = ll_revalidate_it_finish(req, &oit, inode);
2943 ll_intent_release(&oit);
2947 /* Unlinked? Unhash dentry, so it is not picked up later by
2948 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2949 * here to preserve get_cwd functionality on 2.6.
2952 if (!d_inode(dentry)->i_nlink)
2953 d_lustre_invalidate(dentry, 0);
2955 ll_lookup_finish_locks(&oit, inode);
2956 } else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) {
2957 struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry));
2958 u64 valid = OBD_MD_FLGETATTR;
2959 struct md_op_data *op_data;
/* Regular files also need striping EA; size the reply buffer for it. */
2962 if (S_ISREG(inode->i_mode)) {
2963 rc = ll_get_default_mdsize(sbi, &ealen);
2966 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2969 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2970 0, ealen, LUSTRE_OPC_ANY,
2972 if (IS_ERR(op_data))
2973 return PTR_ERR(op_data);
2975 op_data->op_valid = valid;
2976 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2977 ll_finish_md_op_data(op_data);
2979 rc = ll_inode_revalidate_fini(inode, rc);
/* Merge the MDS reply attributes into the in-core inode. */
2983 rc = ll_prep_inode(&inode, req, NULL, NULL);
2986 ptlrpc_req_finished(req);
/*
 * Revalidate metadata, then bring size/timestamps up to date: non-regular
 * files take times straight from the LVB; regular files need a glimpse of
 * the OSTs for size, except while HSM restore is running (MDT already sent
 * the right size, and the glimpse would block on the layout lock).
 */
2990 static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2992 struct inode *inode = d_inode(dentry);
2995 rc = __ll_inode_revalidate(dentry, ibits);
2999 /* if object isn't regular file, don't validate size */
3000 if (!S_ISREG(inode->i_mode)) {
3001 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3002 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3003 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3005 /* In case of restore, the MDT has the right size and has
3006 * already send it back without granting the layout lock,
3007 * inode is up-to-date so glimpse is useless.
3008 * Also to glimpse we need the layout, in case of a running
3009 * restore the MDT holds the layout lock so the glimpse will
3010 * block up to the end of restore (getattr will block)
3012 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3013 rc = ll_glimpse_size(inode);
/*
 * ->getattr() for Lustre files: revalidate UPDATE|LOOKUP ibits with the
 * MDS, then fill *stat from the (now fresh) in-core inode.  The inode
 * number is synthesized from the FID when the caller needs a 32-bit API.
 */
3018 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3020 struct inode *inode = d_inode(de);
3021 struct ll_sb_info *sbi = ll_i2sbi(inode);
3022 struct ll_inode_info *lli = ll_i2info(inode);
3025 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3026 MDS_INODELOCK_LOOKUP);
3027 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3032 stat->dev = inode->i_sb->s_dev;
3033 if (ll_need_32bit_api(sbi))
3034 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3036 stat->ino = inode->i_ino;
3037 stat->mode = inode->i_mode;
3038 stat->nlink = inode->i_nlink;
3039 stat->uid = inode->i_uid;
3040 stat->gid = inode->i_gid;
3041 stat->rdev = inode->i_rdev;
3042 stat->atime = inode->i_atime;
3043 stat->mtime = inode->i_mtime;
3044 stat->ctime = inode->i_ctime;
3045 stat->blksize = 1 << inode->i_blkbits;
3047 stat->size = i_size_read(inode);
3048 stat->blocks = inode->i_blocks;
/*
 * ->fiemap() handler: marshal the VFS fiemap_extent_info into a Lustre
 * ll_user_fiemap, run ll_do_fiemap(), then copy mapped extents back to
 * the user buffer.  Allocation is kvzalloc because extent_count is
 * user-controlled and may be large.
 */
3053 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3054 __u64 start, __u64 len)
3058 struct ll_user_fiemap *fiemap;
3059 unsigned int extent_count = fieinfo->fi_extents_max;
/* NOTE(review): extent_count * sizeof(extent) could overflow for huge
 * user-supplied fi_extents_max — confirm an upper bound is enforced by
 * the (elided) caller/ioctl path. */
3061 num_bytes = sizeof(*fiemap) + (extent_count *
3062 sizeof(struct ll_fiemap_extent));
3063 fiemap = libcfs_kvzalloc(num_bytes, GFP_NOFS);
3068 fiemap->fm_flags = fieinfo->fi_flags;
3069 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3070 fiemap->fm_start = start;
3071 fiemap->fm_length = len;
/* Only the first extent is copied in: it may carry the continuation
 * cookie (fe_device/offset) for a resumed FIEMAP call. */
3072 if (extent_count > 0 &&
3073 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3074 sizeof(struct ll_fiemap_extent)) != 0) {
3079 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3081 fieinfo->fi_flags = fiemap->fm_flags;
3082 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3083 if (extent_count > 0 &&
3084 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3085 fiemap->fm_mapped_extents *
3086 sizeof(struct ll_fiemap_extent)) != 0) {
/*
 * ->get_acl(): return a referenced copy of the cached POSIX ACL.
 * lli_lock guards lli_posix_acl against concurrent update by the
 * metadata refresh path.
 */
3096 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3098 struct ll_inode_info *lli = ll_i2info(inode);
3099 struct posix_acl *acl = NULL;
3101 spin_lock(&lli->lli_lock);
3102 /* VFS' acl_permission_check->check_acl will release the refcount */
3103 acl = posix_acl_dup(lli->lli_posix_acl);
3104 spin_unlock(&lli->lli_lock);
/*
 * ->permission(): refuse RCU-walk (MAY_NOT_BLOCK) since we may need RPCs
 * (presumably returns -ECHILD there — elided, confirm), revalidate the
 * root inode on first touch, then defer to remote-permission checking or
 * generic_permission().
 */
3109 int ll_inode_permission(struct inode *inode, int mask)
3113 if (mask & MAY_NOT_BLOCK)
3116 /* as root inode are NOT getting validated in lookup operation,
3117 * need to do it before permission check.
3120 if (is_root_inode(inode)) {
3121 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3122 MDS_INODELOCK_LOOKUP);
3127 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3128 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* Remote-client mounts enforce permissions on the MDS, not locally. */
3130 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3131 return lustre_check_remote_perm(inode, mask);
3133 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3134 rc = generic_permission(inode, mask);
/* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock, so the kernel falls back to
 * node-local POSIX/flock semantics (no cluster-wide locking). */
3140 struct file_operations ll_file_operations = {
3141 .read_iter = ll_file_read_iter,
3142 .write_iter = ll_file_write_iter,
3143 .unlocked_ioctl = ll_file_ioctl,
3144 .open = ll_file_open,
3145 .release = ll_file_release,
3146 .mmap = ll_file_mmap,
3147 .llseek = ll_file_seek,
3148 .splice_read = ll_file_splice_read,
/* Variant used when cluster-coherent locking is enabled: flock() and
 * fcntl() locks are both routed through ll_file_flock -> LDLM. */
3153 struct file_operations ll_file_operations_flock = {
3154 .read_iter = ll_file_read_iter,
3155 .write_iter = ll_file_write_iter,
3156 .unlocked_ioctl = ll_file_ioctl,
3157 .open = ll_file_open,
3158 .release = ll_file_release,
3159 .mmap = ll_file_mmap,
3160 .llseek = ll_file_seek,
3161 .splice_read = ll_file_splice_read,
3164 .flock = ll_file_flock,
3165 .lock = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
3169 struct file_operations ll_file_operations_noflock = {
3170 .read_iter = ll_file_read_iter,
3171 .write_iter = ll_file_write_iter,
3172 .unlocked_ioctl = ll_file_ioctl,
3173 .open = ll_file_open,
3174 .release = ll_file_release,
3175 .mmap = ll_file_mmap,
3176 .llseek = ll_file_seek,
3177 .splice_read = ll_file_splice_read,
/* Both lock entry points deliberately fail via ll_file_noflock. */
3180 .flock = ll_file_noflock,
3181 .lock = ll_file_noflock
/* inode_operations shared by all three file_operations variants above. */
3184 const struct inode_operations ll_file_inode_operations = {
3185 .setattr = ll_setattr,
3186 .getattr = ll_getattr,
3187 .permission = ll_inode_permission,
3188 .setxattr = ll_setxattr,
3189 .getxattr = ll_getxattr,
3190 .listxattr = ll_listxattr,
3191 .removexattr = ll_removexattr,
3192 .fiemap = ll_fiemap,
3193 .get_acl = ll_get_acl,
3196 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers; readers take
 * ioc_sem shared, (un)registration takes it exclusive. */
3197 static struct llioc_ctl_data {
3198 struct rw_semaphore ioc_sem;
3199 struct list_head ioc_head;
3201 __RWSEM_INITIALIZER(llioc.ioc_sem),
3202 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration record: callback plus its claimed ioctl numbers.
 * iocd_cmd is a trailing variable-length array (iocd_count entries). */
3206 struct list_head iocd_list;
3207 unsigned int iocd_size;
3208 llioc_callback_t iocd_cb;
3209 unsigned int iocd_count;
3210 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for the @count ioctl numbers in
 * @cmd.  Returns an opaque cookie (the allocation itself) for later
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure (failure return elided from this view).
 */
3213 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3216 struct llioc_data *in_data = NULL;
3218 if (!cb || !cmd || count > LLIOC_MAX_CMD || count < 0)
3221 size = sizeof(*in_data) + count * sizeof(unsigned int);
3222 in_data = kzalloc(size, GFP_NOFS);
/* NOTE(review): kzalloc already zeroed the buffer — this memset is
 * redundant and can be dropped. */
3226 memset(in_data, 0, sizeof(*in_data));
3227 in_data->iocd_size = size;
3228 in_data->iocd_cb = cb;
3229 in_data->iocd_count = count;
3230 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3232 down_write(&llioc.ioc_sem);
3233 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3234 up_write(&llioc.ioc_sem);
3238 EXPORT_SYMBOL(ll_iocontrol_register);
/*
 * Remove and free the registration identified by @magic (the cookie
 * returned by ll_iocontrol_register).  Warns if the cookie is unknown.
 * The match test against @magic is elided from this view.
 */
3240 void ll_iocontrol_unregister(void *magic)
3242 struct llioc_data *tmp;
3247 down_write(&llioc.ioc_sem);
3248 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3250 list_del(&tmp->iocd_list);
3251 up_write(&llioc.ioc_sem);
3257 up_write(&llioc.ioc_sem);
3259 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3261 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl @cmd to registered dynamic handlers,
 * stopping at the first one that returns LLIOC_STOP.  The handler's
 * result code is passed back through *rcp (write elided from this view).
 */
3263 static enum llioc_iter
3264 ll_iocontrol_call(struct inode *inode, struct file *file,
3265 unsigned int cmd, unsigned long arg, int *rcp)
3267 enum llioc_iter ret = LLIOC_CONT;
3268 struct llioc_data *data;
3269 int rc = -EINVAL, i;
3271 down_read(&llioc.ioc_sem);
3272 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3273 for (i = 0; i < data->iocd_count; i++) {
3274 if (cmd != data->iocd_cmd[i])
3277 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3281 if (ret == LLIOC_STOP)
3284 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration change into the cl_object stack for this
 * inode.  For OBJECT_CONF_SET, matching on the layout lock is allowed
 * only after the layout has been applied, so stale layouts are never
 * observed by new matches.
 */
3291 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3293 struct ll_inode_info *lli = ll_i2info(inode);
3294 struct cl_env_nest nest;
3301 env = cl_env_nested_get(&nest);
3303 return PTR_ERR(env);
3305 result = cl_conf_set(env, lli->lli_clob, conf);
3306 cl_env_nested_put(&nest, env);
3308 if (conf->coc_opc == OBJECT_CONF_SET) {
3309 struct ldlm_lock *lock = conf->coc_lock;
3312 LASSERT(ldlm_has_layout(lock));
3314 /* it can only be allowed to match after layout is
3315 * applied to inode otherwise false layout would be
3316 * seen. Applying layout should happen before dropping
3319 ldlm_lock_allow_match(lock);
3325 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3326 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3329 struct ll_sb_info *sbi = ll_i2sbi(inode);
3330 struct ptlrpc_request *req;
3331 struct mdt_body *body;
3337 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3338 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3339 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already delivered in the lock's LVB: nothing to fetch. */
3341 if (lock->l_lvb_data && (lock->l_flags & LDLM_FL_LVB_READY))
3344 /* if layout lock was granted right away, the layout is returned
3345 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3346 * blocked and then granted via completion ast, we have to fetch
3347 * layout here. Please note that we can't use the LVB buffer in
3348 * completion AST because it doesn't have a large enough buffer
3350 rc = ll_get_default_mdsize(sbi, &lmmsize);
3352 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3353 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3358 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3364 lmmsize = body->eadatasize;
3365 if (lmmsize == 0) /* empty layout */ {
3370 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
/* Copy the LOV EA out of the RPC reply; it must outlive the request. */
3376 lvbdata = libcfs_kvzalloc(lmmsize, GFP_NOFS);
3382 memcpy(lvbdata, lmm, lmmsize);
/* Swap the fetched layout into the lock's LVB under the resource lock. */
3383 lock_res_and_lock(lock);
3384 if (lock->l_lvb_data)
3385 kvfree(lock->l_lvb_data);
3387 lock->l_lvb_data = lvbdata;
3388 lock->l_lvb_len = lmmsize;
3389 unlock_res_and_lock(lock);
3392 ptlrpc_req_finished(req);
3397 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Unpack the layout carried in the lock's LVB, apply it to the inode's
 * cl_object (ll_layout_conf), and report the resulting layout generation
 * via *gen.  Always drops the caller's reference on @lockh.  If the
 * layout is busy (-EBUSY) the function waits for in-flight IO via an
 * OBJECT_CONF_WAIT before returning.
 */
3400 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
3401 struct inode *inode, __u32 *gen, bool reconf)
3403 struct ll_inode_info *lli = ll_i2info(inode);
3404 struct ll_sb_info *sbi = ll_i2sbi(inode);
3405 struct ldlm_lock *lock;
3406 struct lustre_md md = { NULL };
3407 struct cl_object_conf conf;
3410 bool wait_layout = false;
3412 LASSERT(lustre_handle_is_used(lockh));
3414 lock = ldlm_handle2lock(lockh);
3416 LASSERT(ldlm_has_layout(lock));
3418 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3419 inode, PFID(&lli->lli_fid), reconf);
3421 /* in case this is a caching lock and reinstate with new inode */
3422 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3424 lock_res_and_lock(lock);
3425 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3426 unlock_res_and_lock(lock);
3427 /* checking lvb_ready is racy but this is okay. The worst case is
3428 * that multi processes may configure the file on the same time.
3430 if (lvb_ready || !reconf) {
3433 /* layout_gen must be valid if layout lock is not
3434 * cancelled and stripe has already set
3436 *gen = ll_layout_version_get(lli);
3442 rc = ll_layout_fetch(inode, lock);
3446 /* for layout lock, lmm is returned in lock's lvb.
3447 * lvb_data is immutable if the lock is held so it's safe to access it
3448 * without res lock. See the description in ldlm_lock_decref_internal()
3449 * for the condition to free lvb_data of layout lock
3451 if (lock->l_lvb_data) {
3452 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3453 lock->l_lvb_data, lock->l_lvb_len);
/* No stripe metadata unpacked => file has an empty layout. */
3455 *gen = LL_LAYOUT_GEN_EMPTY;
3457 *gen = md.lsm->lsm_layout_gen;
3460 CERROR("%s: file " DFID " unpackmd error: %d\n",
3461 ll_get_fsname(inode->i_sb, NULL, 0),
3462 PFID(&lli->lli_fid), rc);
3468 /* set layout to file. Unlikely this will fail as old layout was
3471 memset(&conf, 0, sizeof(conf));
3472 conf.coc_opc = OBJECT_CONF_SET;
3473 conf.coc_inode = inode;
3474 conf.coc_lock = lock;
3475 conf.u.coc_md = &md;
3476 rc = ll_layout_conf(inode, &conf);
3479 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3481 /* refresh layout failed, need to wait */
3482 wait_layout = rc == -EBUSY;
3485 LDLM_LOCK_PUT(lock);
3486 ldlm_lock_decref(lockh, mode);
3488 /* wait for IO to complete if it's still being used. */
3490 CDEBUG(D_INODE, "%s: %p/" DFID " wait for layout reconf.\n",
3491 ll_get_fsname(inode->i_sb, NULL, 0),
3492 inode, PFID(&lli->lli_fid));
3494 memset(&conf, 0, sizeof(conf));
3495 conf.coc_opc = OBJECT_CONF_WAIT;
3496 conf.coc_inode = inode;
3497 rc = ll_layout_conf(inode, &conf);
3501 CDEBUG(D_INODE, "file: " DFID " waiting layout return: %d.\n",
3502 PFID(&lli->lli_fid), rc);
3508 * This function checks if there exists a LAYOUT lock on the client side,
3509 * or enqueues it if it doesn't have one in cache.
3511 * This function will not hold layout lock so it may be revoked any time after
3512 * this function returns. Any operations depend on layout should be redone
3515 * This function should be called before lov_io_init() to get an uptodate
3516 * layout version, the caller should save the version number and after IO
3517 * is finished, this function should be called again to verify that layout
3518 * is not changed during IO time.
3520 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3522 struct ll_inode_info *lli = ll_i2info(inode);
3523 struct ll_sb_info *sbi = ll_i2sbi(inode);
3524 struct md_op_data *op_data;
3525 struct lookup_intent it;
3526 struct lustre_handle lockh;
3527 enum ldlm_mode mode;
3528 struct ldlm_enqueue_info einfo = {
3529 .ei_type = LDLM_IBITS,
3531 .ei_cb_bl = ll_md_blocking_ast,
3532 .ei_cb_cp = ldlm_completion_ast,
/* Fast path: current generation is valid or layout locks are disabled. */
3536 *gen = ll_layout_version_get(lli);
3537 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3541 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3542 LASSERT(S_ISREG(inode->i_mode));
3544 /* take layout lock mutex to enqueue layout lock exclusively. */
3545 mutex_lock(&lli->lli_layout_mutex);
3548 /* mostly layout lock is caching on the local side, so try to match
3549 * it before grabbing layout lock mutex.
3551 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3552 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3553 if (mode != 0) { /* hit cached lock */
/* ll_layout_lock_set drops the reference taken by ll_take_md_lock. */
3554 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3558 mutex_unlock(&lli->lli_layout_mutex);
3562 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3563 0, 0, LUSTRE_OPC_ANY, NULL);
3564 if (IS_ERR(op_data)) {
3565 mutex_unlock(&lli->lli_layout_mutex);
3566 return PTR_ERR(op_data);
3569 /* have to enqueue one */
3570 memset(&it, 0, sizeof(it));
3571 it.it_op = IT_LAYOUT;
3572 lockh.cookie = 0ULL;
3574 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/" DFID ".\n",
3575 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3576 PFID(&lli->lli_fid));
3578 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* The intent's request holds the reply; release it once enqueued. */
3580 ptlrpc_req_finished(it.d.lustre.it_data);
3581 it.d.lustre.it_data = NULL;
3583 ll_finish_md_op_data(op_data);
/* Transfer lock-mode ownership from the intent to this function. */
3585 mode = it.d.lustre.it_lock_mode;
3586 it.d.lustre.it_lock_mode = 0;
3587 ll_intent_drop_lock(&it);
3590 /* set lock data in case this is a new lock */
3591 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3592 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3596 mutex_unlock(&lli->lli_layout_mutex);
3602 * This function sends a restore request to the MDT
3604 int ll_layout_restore(struct inode *inode)
3606 struct hsm_user_request *hur;
3609 len = sizeof(struct hsm_user_request) +
3610 sizeof(struct hsm_user_item);
3611 hur = kzalloc(len, GFP_NOFS);
3615 hur->hur_request.hr_action = HUA_RESTORE;
3616 hur->hur_request.hr_archive_id = 0;
3617 hur->hur_request.hr_flags = 0;
3618 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3619 sizeof(hur->hur_user_item[0].hui_fid));
3620 hur->hur_user_item[0].hui_extent.length = -1;
3621 hur->hur_request.hr_itemcount = 1;
3622 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,