4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/* Allocate a per-open ll_file_data structure from the ll_file_data_slab
 * cache (GFP with __GFP_IO) and clear its write-failure flag.
 * NOTE(review): this excerpt is truncated — the allocation-failure check
 * and the "return fd;" line are missing from the visible text. */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
60 fd->fd_write_failed = false;
/* Return a ll_file_data previously obtained from ll_file_data_get() to
 * its slab cache.  NOTE(review): excerpt truncated — any NULL guard
 * between the signature and the free is not visible here. */
64 static void ll_file_data_put(struct ll_file_data *fd)
67 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Copy the inode's current attributes (mode, a/m/ctime, size, blocks,
 * flags), its FID, I/O epoch, MDS capability and the given open handle
 * into @op_data so they can be sent to the MDS (e.g. on close).
 * If the inode has pending data modifications (LLIF_DATA_MODIFIED),
 * also set the MDS_DATA_MODIFIED bias on the request. */
70 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
71 struct lustre_handle *fh)
73 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
74 op_data->op_attr.ia_mode = inode->i_mode;
75 op_data->op_attr.ia_atime = inode->i_atime;
76 op_data->op_attr.ia_mtime = inode->i_mtime;
77 op_data->op_attr.ia_ctime = inode->i_ctime;
78 op_data->op_attr.ia_size = i_size_read(inode);
79 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr wrapper, hence the cast. */
80 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
81 ll_inode_to_ext_flags(inode->i_flags);
82 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
84 op_data->op_handle = *fh;
85 op_data->op_capa1 = ll_mdscapa_get(inode);
87 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
88 op_data->op_bias |= MDS_DATA_MODIFIED;
92 * Closes the IO epoch and packs all the attributes into @op_data for
/* Prepare @op_data for an MDS close request: mark which attributes are
 * valid, include size/blocks for writable opens (unless SOM handles it),
 * close the I/O epoch, then pack the inode attributes and open handle.
 * NOTE(review): excerpt truncated — early-exit/label lines between the
 * FMODE_WRITE check and the epoch close are not visible here. */
95 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
96 struct obd_client_handle *och)
98 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
99 ATTR_MTIME | ATTR_MTIME_SET |
100 ATTR_CTIME | ATTR_CTIME_SET;
102 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client must
 * send size/blocks itself on close. */
105 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
106 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
108 ll_ioepoch_close(inode, op_data, &och, 0);
111 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
112 ll_prep_md_op_data(op_data, inode, NULL, NULL,
113 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send a close request to the MDS for the open handle @och of @inode.
 * If @data_version is non-NULL this close is an HSM release (the data
 * version is passed so the MDT can verify the file was not modified).
 * Handles Size-on-MDS update when the MDS requests it, clears the
 * DATA_MODIFIED flag on success, destroys OST objects if instructed,
 * and finally clears replay data and poisons the handle cookie.
 * NOTE(review): excerpt truncated — several condition/brace/return lines
 * are missing from the visible text; comments below describe only what
 * the visible lines show. */
116 static int ll_close_inode_openhandle(struct obd_export *md_exp,
118 struct obd_client_handle *och,
119 const __u64 *data_version)
121 struct obd_export *exp = ll_i2mdexp(inode);
122 struct md_op_data *op_data;
123 struct ptlrpc_request *req = NULL;
124 struct obd_device *obd = class_exp2obd(exp);
130 * XXX: in case of LMV, is this correct to access
133 CERROR("Invalid MDC connection handle "LPX64"\n",
134 ll_i2mdexp(inode)->exp_handle.h_cookie);
138 OBD_ALLOC_PTR(op_data);
140 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
142 ll_prepare_close(inode, op_data, och);
143 if (data_version != NULL) {
144 /* Pass in data_version implies release. */
145 op_data->op_bias |= MDS_HSM_RELEASE;
146 op_data->op_data_version = *data_version;
147 op_data->op_lease_handle = och->och_lease_handle;
148 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
150 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
151 rc = md_close(md_exp, op_data, och->och_mod, &req);
153 /* This close must have the epoch closed. */
154 LASSERT(epoch_close);
155 /* MDS has instructed us to obtain Size-on-MDS attribute from
156 * OSTs and send setattr to back to MDS. */
157 rc = ll_som_update(inode, op_data);
159 CERROR("inode %lu mdc Size-on-MDS update failed: "
160 "rc = %d\n", inode->i_ino, rc);
164 CERROR("inode %lu mdc close failed: rc = %d\n",
168 /* DATA_MODIFIED flag was successfully sent on close, cancel data
169 * modification flag. */
170 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
171 struct ll_inode_info *lli = ll_i2info(inode);
173 spin_lock(&lli->lli_lock);
174 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
175 spin_unlock(&lli->lli_lock);
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* For HSM release, check the server confirmed the release via the
 * OBD_MD_FLRELEASED bit in the reply body. */
184 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
185 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->valid & OBD_MD_FLRELEASED))
191 ll_finish_md_op_data(op_data);
/* With SOM enabled and the epoch still open on a written regular file,
 * a DONE_WRITING request must follow. */
194 if (exp_connect_som(exp) && !epoch_close &&
195 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
196 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
198 md_clear_open_replay_data(md_exp, och);
199 /* Free @och if it is not waiting for DONE_WRITING. */
200 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
203 if (req) /* This is close request */
204 ptlrpc_req_finished(req);
/* Close the MDS open handle of @inode that matches @fmode (write, exec,
 * or read), but only when no other local users of that handle remain.
 * Selects the per-mode handle pointer and use count under lli_och_mutex.
 * NOTE(review): excerpt truncated — brace/else/return lines and the
 * handle-swap under the mutex are not fully visible here. */
208 int ll_md_real_close(struct inode *inode, fmode_t fmode)
210 struct ll_inode_info *lli = ll_i2info(inode);
211 struct obd_client_handle **och_p;
212 struct obd_client_handle *och;
216 if (fmode & FMODE_WRITE) {
217 och_p = &lli->lli_mds_write_och;
218 och_usecount = &lli->lli_open_fd_write_count;
219 } else if (fmode & FMODE_EXEC) {
220 och_p = &lli->lli_mds_exec_och;
221 och_usecount = &lli->lli_open_fd_exec_count;
223 LASSERT(fmode & FMODE_READ);
224 och_p = &lli->lli_mds_read_och;
225 och_usecount = &lli->lli_open_fd_read_count;
228 mutex_lock(&lli->lli_och_mutex);
229 if (*och_usecount > 0) {
230 /* There are still users of this handle, so skip
232 mutex_unlock(&lli->lli_och_mutex);
238 mutex_unlock(&lli->lli_och_mutex);
241 /* There might be a race and this handle may already
243 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close path: drop group lock and lease if held,
 * close a private open handle (fd_och) if present, decrement the
 * per-mode open counters, and — unless a cached OPEN DLM lock lets us
 * skip it — send the real close to the MDS via ll_md_real_close().
 * Finally detach and free the ll_file_data and release the capability.
 * NOTE(review): excerpt truncated — brace/else/return lines are missing
 * from the visible text. */
250 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
253 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
254 struct ll_inode_info *lli = ll_i2info(inode);
257 /* clear group lock, if present */
258 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
259 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
261 if (fd->fd_lease_och != NULL) {
264 /* Usually the lease is not released when the
265 * application crashed, we need to release here. */
266 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
267 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
268 PFID(&lli->lli_fid), rc, lease_broken);
270 fd->fd_lease_och = NULL;
273 if (fd->fd_och != NULL) {
274 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
279 /* Let's see if we have good enough OPEN lock on the file and if
280 we can skip talking to MDS */
281 if (file->f_dentry->d_inode) { /* Can this ever be false? */
283 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
284 struct lustre_handle lockh;
285 struct inode *inode = file->f_dentry->d_inode;
286 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
288 mutex_lock(&lli->lli_och_mutex);
289 if (fd->fd_omode & FMODE_WRITE) {
291 LASSERT(lli->lli_open_fd_write_count);
292 lli->lli_open_fd_write_count--;
293 } else if (fd->fd_omode & FMODE_EXEC) {
295 LASSERT(lli->lli_open_fd_exec_count);
296 lli->lli_open_fd_exec_count--;
299 LASSERT(lli->lli_open_fd_read_count);
300 lli->lli_open_fd_read_count--;
302 mutex_unlock(&lli->lli_och_mutex);
/* Only talk to the MDS if no matching cached OPEN ibits lock exists. */
304 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
305 LDLM_IBITS, &policy, lockmode,
307 rc = ll_md_real_close(file->f_dentry->d_inode,
311 CERROR("Releasing a file %p with negative dentry %p. Name %s",
312 file, file->f_dentry, file->f_dentry->d_name.name);
316 LUSTRE_FPRIVATE(file) = NULL;
317 ll_file_data_put(fd);
318 ll_capa_close(inode);
323 /* While this returns an error code, fput() the caller does not, so we need
324 * to make every effort to clean up all of our state here. Also, applications
325 * rarely check close errors and even if an error is returned they will not
326 * re-try the close call.
/* VFS ->release() entry point: tear down remote-client ACL state for the
 * root inode, stop a statahead thread this fd started, short-circuit for
 * the root dentry, clear async write errors on regular files, and hand
 * the real work to ll_md_close().  NOTE(review): excerpt truncated —
 * brace/return lines are missing from the visible text. */
328 int ll_file_release(struct inode *inode, struct file *file)
330 struct ll_file_data *fd;
331 struct ll_sb_info *sbi = ll_i2sbi(inode);
332 struct ll_inode_info *lli = ll_i2info(inode);
335 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
336 inode->i_generation, inode);
338 #ifdef CONFIG_FS_POSIX_ACL
339 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
340 inode == inode->i_sb->s_root->d_inode) {
341 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
344 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
345 fd->fd_flags &= ~LL_FILE_RMTACL;
346 rct_del(&sbi->ll_rct, current_pid());
347 et_search_free(&sbi->ll_et, current_pid());
352 if (inode->i_sb->s_root != file->f_dentry)
353 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
354 fd = LUSTRE_FPRIVATE(file);
357 /* The last ref on @file, maybe not the the owner pid of statahead.
358 * Different processes can open the same dir, "ll_opendir_key" means:
359 * it is me that should stop the statahead thread. */
360 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
361 lli->lli_opendir_pid != 0)
362 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root dentry has no MDS open handle to close; just free fd. */
364 if (inode->i_sb->s_root == file->f_dentry) {
365 LUSTRE_FPRIVATE(file) = NULL;
366 ll_file_data_put(fd);
370 if (!S_ISDIR(inode->i_mode)) {
371 lov_read_and_clear_async_rc(lli->lli_clob);
372 lli->lli_async_rc = 0;
375 rc = ll_md_close(sbi->ll_md_exp, inode, file);
377 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
378 libcfs_debug_dumplog();
/* Perform an intent-based open against the MDS for @file, optionally
 * carrying striping info (@lmm/@lmmsize) when setting stripe parameters.
 * Requests an OPEN lock only when no striping data is being sent, then
 * fills the inode from the reply and records lock data on success.
 * NOTE(review): excerpt truncated — IS_ERR checks, GOTO labels and
 * return lines are missing from the visible text. */
383 static int ll_intent_file_open(struct file *file, void *lmm,
384 int lmmsize, struct lookup_intent *itp)
386 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
387 struct dentry *parent = file->f_dentry->d_parent;
388 const char *name = file->f_dentry->d_name.name;
389 const int len = file->f_dentry->d_name.len;
390 struct md_op_data *op_data;
391 struct ptlrpc_request *req;
392 __u32 opc = LUSTRE_OPC_ANY;
398 /* Usually we come here only for NFSD, and we want open lock.
399 But we can also get here with pre 2.6.15 patchless kernels, and in
400 that case that lock is also ok */
401 /* We can also get here if there was cached open handle in revalidate_it
402 * but it disappeared while we were getting from there to ll_file_open.
403 * But this means this file was closed and immediately opened which
404 * makes a good candidate for using OPEN lock */
405 /* If lmmsize & lmm are not 0, we are just setting stripe info
406 * parameters. No need for the open lock */
407 if (lmm == NULL && lmmsize == 0) {
408 itp->it_flags |= MDS_OPEN_LOCK;
409 if (itp->it_flags & FMODE_WRITE)
410 opc = LUSTRE_OPC_CREATE;
413 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
414 file->f_dentry->d_inode, name, len,
417 return PTR_ERR(op_data);
419 itp->it_flags |= MDS_OPEN_BY_FID;
420 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
421 0 /*unused */, &req, ll_md_blocking_ast, 0);
422 ll_finish_md_op_data(op_data);
424 /* reason for keep own exit path - don`t flood log
425 * with messages with -ESTALE errors.
427 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
428 it_open_error(DISP_OPEN_OPEN, itp))
430 ll_release_openhandle(file->f_dentry, itp);
434 if (it_disposition(itp, DISP_LOOKUP_NEG))
435 GOTO(out, rc = -ENOENT);
437 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
438 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
439 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
443 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
444 if (!rc && itp->d.lustre.it_lock_mode)
445 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
449 ptlrpc_req_finished(req);
450 ll_intent_drop_lock(itp);
456 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
457 * not believe attributes if a few ioepoch holders exist. Attributes for
458 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record a newly obtained (non-zero, different) I/O epoch on the inode. */
460 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
462 if (ioepoch && lli->lli_ioepoch != ioepoch) {
463 lli->lli_ioepoch = ioepoch;
464 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
465 ioepoch, PFID(&lli->lli_fid));
/* Populate an obd_client_handle from the MDT reply carried by the intent
 * (open handle, FID, lease lock cookie, flags) and register it for
 * open replay with the MD layer. */
469 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
470 struct obd_client_handle *och)
472 struct ptlrpc_request *req = it->d.lustre.it_data;
473 struct mdt_body *body;
475 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
476 och->och_fh = body->handle;
477 och->och_fid = body->fid1;
478 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
479 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
480 och->och_flags = it->it_flags;
482 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-local part of an open: if a new MDS handle (@och)
 * was obtained, fill it from the intent and record the I/O epoch; then
 * attach @fd as the file's private data, initialise readahead state and
 * remember the open mode.  NOTE(review): excerpt truncated — the
 * "if (och)" guard and error/return lines are missing from the visible
 * text. */
485 int ll_local_open(struct file *file, struct lookup_intent *it,
486 struct ll_file_data *fd, struct obd_client_handle *och)
488 struct inode *inode = file->f_dentry->d_inode;
489 struct ll_inode_info *lli = ll_i2info(inode);
491 LASSERT(!LUSTRE_FPRIVATE(file));
496 struct ptlrpc_request *req = it->d.lustre.it_data;
497 struct mdt_body *body;
500 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
504 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
505 ll_ioepoch_open(lli, body->ioepoch);
508 LUSTRE_FPRIVATE(file) = fd;
509 ll_readahead_init(inode, &fd->fd_ras);
510 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
514 /* Open a file, and (for the very first open) create objects on the OSTs at
515 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
516 * creation or open until ll_lov_setstripe() ioctl is called.
518 * If we already have the stripe MD locally then we don't request it in
519 * md_open(), by passing a lmm_size = 0.
521 * It is up to the application to ensure no other processes open this file
522 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
523 * used. We might be able to avoid races of that sort by getting lli_open_sem
524 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
525 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  Reuses an existing per-mode MDS open handle
 * when present; otherwise performs an intent open (possibly retrying via
 * ll_intent_file_open), allocates the handle, and completes the local
 * open.  NOTE(review): this excerpt is heavily truncated — many brace,
 * goto-label, retry ("restart:") and return lines are missing; comments
 * below describe only the visible lines. */
527 int ll_file_open(struct inode *inode, struct file *file)
529 struct ll_inode_info *lli = ll_i2info(inode);
530 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
531 .it_flags = file->f_flags };
532 struct obd_client_handle **och_p = NULL;
533 __u64 *och_usecount = NULL;
534 struct ll_file_data *fd;
535 int rc = 0, opendir_set = 0;
537 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
538 inode->i_generation, inode, file->f_flags);
540 it = file->private_data; /* XXX: compat macro */
541 file->private_data = NULL; /* prevent ll_local_open assertion */
543 fd = ll_file_data_get();
545 GOTO(out_openerr, rc = -ENOMEM);
/* For directories, try to register this fd as the statahead owner. */
548 if (S_ISDIR(inode->i_mode)) {
549 spin_lock(&lli->lli_sa_lock);
550 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
551 lli->lli_opendir_pid == 0) {
552 lli->lli_opendir_key = fd;
553 lli->lli_opendir_pid = current_pid();
556 spin_unlock(&lli->lli_sa_lock);
/* Root dentry: no MDS open handle needed, just attach fd. */
559 if (inode->i_sb->s_root == file->f_dentry) {
560 LUSTRE_FPRIVATE(file) = fd;
564 if (!it || !it->d.lustre.it_disposition) {
565 /* Convert f_flags into access mode. We cannot use file->f_mode,
566 * because everything but O_ACCMODE mask was stripped from
568 if ((oit.it_flags + 1) & O_ACCMODE)
570 if (file->f_flags & O_TRUNC)
571 oit.it_flags |= FMODE_WRITE;
573 /* kernel only call f_op->open in dentry_open. filp_open calls
574 * dentry_open after call to open_namei that checks permissions.
575 * Only nfsd_open call dentry_open directly without checking
576 * permissions and because of that this code below is safe. */
577 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
578 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
580 /* We do not want O_EXCL here, presumably we opened the file
581 * already? XXX - NFS implications? */
582 oit.it_flags &= ~O_EXCL;
584 /* bug20584, if "it_flags" contains O_CREAT, the file will be
585 * created if necessary, then "IT_CREAT" should be set to keep
586 * consistent with it */
587 if (oit.it_flags & O_CREAT)
588 oit.it_op |= IT_CREAT;
594 /* Let's see if we have file open on MDS already. */
595 if (it->it_flags & FMODE_WRITE) {
596 och_p = &lli->lli_mds_write_och;
597 och_usecount = &lli->lli_open_fd_write_count;
598 } else if (it->it_flags & FMODE_EXEC) {
599 och_p = &lli->lli_mds_exec_och;
600 och_usecount = &lli->lli_open_fd_exec_count;
602 och_p = &lli->lli_mds_read_och;
603 och_usecount = &lli->lli_open_fd_read_count;
606 mutex_lock(&lli->lli_och_mutex);
607 if (*och_p) { /* Open handle is present */
608 if (it_disposition(it, DISP_OPEN_OPEN)) {
609 /* Well, there's extra open request that we do not need,
610 let's close it somehow. This will decref request. */
611 rc = it_open_error(DISP_OPEN_OPEN, it);
613 mutex_unlock(&lli->lli_och_mutex);
614 GOTO(out_openerr, rc);
617 ll_release_openhandle(file->f_dentry, it);
621 rc = ll_local_open(file, it, fd, NULL);
624 mutex_unlock(&lli->lli_och_mutex);
625 GOTO(out_openerr, rc);
628 LASSERT(*och_usecount == 0);
629 if (!it->d.lustre.it_disposition) {
630 /* We cannot just request lock handle now, new ELC code
631 means that one of other OPEN locks for this file
632 could be cancelled, and since blocking ast handler
633 would attempt to grab och_mutex as well, that would
634 result in a deadlock */
635 mutex_unlock(&lli->lli_och_mutex);
636 it->it_create_mode |= M_CHECK_STALE;
637 rc = ll_intent_file_open(file, NULL, 0, it);
638 it->it_create_mode &= ~M_CHECK_STALE;
640 GOTO(out_openerr, rc);
644 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
646 GOTO(out_och_free, rc = -ENOMEM);
650 /* md_intent_lock() didn't get a request ref if there was an
651 * open error, so don't do cleanup on the request here
653 /* XXX (green): Should not we bail out on any error here, not
654 * just open error? */
655 rc = it_open_error(DISP_OPEN_OPEN, it);
657 GOTO(out_och_free, rc);
659 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
661 rc = ll_local_open(file, it, fd, *och_p);
663 GOTO(out_och_free, rc);
665 mutex_unlock(&lli->lli_och_mutex);
668 /* Must do this outside lli_och_mutex lock to prevent deadlock where
669 different kind of OPEN lock for this same inode gets cancelled
670 by ldlm_cancel_lru */
671 if (!S_ISREG(inode->i_mode))
672 GOTO(out_och_free, rc);
676 if (!lli->lli_has_smd &&
677 (cl_is_lov_delay_create(file->f_flags) ||
678 (file->f_mode & FMODE_WRITE) == 0)) {
679 CDEBUG(D_INODE, "object creation was delayed\n");
680 GOTO(out_och_free, rc);
682 cl_lov_delay_create_clear(&file->f_flags);
683 GOTO(out_och_free, rc);
/* Error/cleanup path: free an allocated but unused handle. */
687 if (och_p && *och_p) {
688 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
689 *och_p = NULL; /* OBD_FREE writes some magic there */
692 mutex_unlock(&lli->lli_och_mutex);
695 if (opendir_set != 0)
696 ll_stop_statahead(inode, lli->lli_opendir_key);
698 ll_file_data_put(fd);
700 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
703 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
704 ptlrpc_req_finished(it->d.lustre.it_data);
705 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a blocking callback, cancel the lock
 * asynchronously; the CANCELING case is visible below but its body is
 * not shown in this truncated excerpt. */
711 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
712 struct ldlm_lock_desc *desc, void *data, int flag)
715 struct lustre_handle lockh;
718 case LDLM_CB_BLOCKING:
719 ldlm_lock2handle(lock, &lockh);
720 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
722 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
726 case LDLM_CB_CANCELING:
734 * Acquire a lease and open the file.
/* Acquire a file lease (read or write only) from the MDT.  If @file is
 * given, the existing open handle is reused (only when this process is
 * the sole opener) and passed as op_handle so the MDT can match owners.
 * On success returns the new obd_client_handle holding the lease lock;
 * on failure the open handle is closed and/or the intent released.
 * NOTE(review): excerpt truncated — several brace/return/label lines
 * (including the och allocation and "return och;") are missing from the
 * visible text. */
736 struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
737 fmode_t fmode, __u64 open_flags)
739 struct lookup_intent it = { .it_op = IT_OPEN };
740 struct ll_sb_info *sbi = ll_i2sbi(inode);
741 struct md_op_data *op_data;
742 struct ptlrpc_request *req;
743 struct lustre_handle old_handle = { 0 };
744 struct obd_client_handle *och = NULL;
748 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
749 return ERR_PTR(-EINVAL);
752 struct ll_inode_info *lli = ll_i2info(inode);
753 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
754 struct obd_client_handle **och_p;
757 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
758 return ERR_PTR(-EPERM);
760 /* Get the openhandle of the file */
762 mutex_lock(&lli->lli_och_mutex);
763 if (fd->fd_lease_och != NULL) {
764 mutex_unlock(&lli->lli_och_mutex);
768 if (fd->fd_och == NULL) {
769 if (file->f_mode & FMODE_WRITE) {
770 LASSERT(lli->lli_mds_write_och != NULL);
771 och_p = &lli->lli_mds_write_och;
772 och_usecount = &lli->lli_open_fd_write_count;
774 LASSERT(lli->lli_mds_read_och != NULL);
775 och_p = &lli->lli_mds_read_och;
776 och_usecount = &lli->lli_open_fd_read_count;
778 if (*och_usecount == 1) {
785 mutex_unlock(&lli->lli_och_mutex);
786 if (rc < 0) /* more than 1 opener */
789 LASSERT(fd->fd_och != NULL);
790 old_handle = fd->fd_och->och_fh;
795 return ERR_PTR(-ENOMEM);
797 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
798 LUSTRE_OPC_ANY, NULL);
800 GOTO(out, rc = PTR_ERR(op_data));
802 /* To tell the MDT this openhandle is from the same owner */
803 op_data->op_handle = old_handle;
805 it.it_flags = fmode | open_flags;
806 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
807 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
808 ll_md_blocking_lease_ast,
809 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
810 * it can be cancelled which may mislead applications that the lease is
812 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
813 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
814 * doesn't deal with openhandle, so normal openhandle will be leaked. */
815 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
816 ll_finish_md_op_data(op_data);
817 ptlrpc_req_finished(req);
819 GOTO(out_release_it, rc);
821 if (it_disposition(&it, DISP_LOOKUP_NEG))
822 GOTO(out_release_it, rc = -ENOENT);
824 rc = it_open_error(DISP_OPEN_OPEN, &it);
826 GOTO(out_release_it, rc);
828 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
829 ll_och_fill(sbi->ll_md_exp, &it, och);
831 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
832 GOTO(out_close, rc = -EOPNOTSUPP);
834 /* already get lease, handle lease lock */
835 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
836 if (it.d.lustre.it_lock_mode == 0 ||
837 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
838 /* open lock must return for lease */
839 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
840 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
841 it.d.lustre.it_lock_bits);
842 GOTO(out_close, rc = -EPROTO);
845 ll_intent_release(&it);
/* Cleanup path: close the open handle and cancel the lease lock. */
849 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
851 CERROR("Close openhandle returned %d\n", rc2);
853 /* cancel open lock */
854 if (it.d.lustre.it_lock_mode != 0) {
855 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
856 it.d.lustre.it_lock_mode);
857 it.d.lustre.it_lock_mode = 0;
860 ll_intent_release(&it);
865 EXPORT_SYMBOL(ll_lease_open);
868 * Release lease and close the file.
869 * It will check if the lease has ever broken.
/* Drop the lease: detect whether the lease lock was already cancelled
 * (broken), cancel it otherwise, report the broken state through
 * @lease_broken, then close the associated open handle on the MDS.
 * NOTE(review): excerpt truncated — the lock NULL check, LDLM_LOCK_PUT
 * and return lines are missing from the visible text. */
871 int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
874 struct ldlm_lock *lock;
875 bool cancelled = true;
878 lock = ldlm_handle2lock(&och->och_lease_handle);
880 lock_res_and_lock(lock);
881 cancelled = ldlm_is_cancel(lock);
882 unlock_res_and_lock(lock);
886 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
887 PFID(&ll_i2info(inode)->lli_fid), cancelled);
890 ldlm_cli_cancel(&och->och_lease_handle, 0);
891 if (lease_broken != NULL)
892 *lease_broken = cancelled;
894 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
898 EXPORT_SYMBOL(ll_lease_close);
900 /* Fills the obdo with the attributes for the lsm */
/* Issue an async getattr to the OSTs for every stripe in @lsm and merge
 * the results into @obdo.  @ioepoch tags the request; @sync requests the
 * getattr be done under a server-side lock (OBD_FL_SRVLOCK).  On success
 * the o_valid mask is trimmed to the OST-authoritative fields.
 * NOTE(review): excerpt truncated — oinfo.oi_md/oi_oa setup, the sync
 * guard and return lines are missing from the visible text. */
901 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
902 struct obd_capa *capa, struct obdo *obdo,
903 __u64 ioepoch, int sync)
905 struct ptlrpc_request_set *set;
906 struct obd_info oinfo = { { { 0 } } };
909 LASSERT(lsm != NULL);
913 oinfo.oi_oa->o_oi = lsm->lsm_oi;
914 oinfo.oi_oa->o_mode = S_IFREG;
915 oinfo.oi_oa->o_ioepoch = ioepoch;
916 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
917 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
918 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
919 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
920 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
921 OBD_MD_FLDATAVERSION;
922 oinfo.oi_capa = capa;
924 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
925 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
928 set = ptlrpc_prep_set();
930 CERROR("can't allocate ptlrpc set\n");
933 rc = obd_getattr_async(exp, &oinfo, set);
935 rc = ptlrpc_set_wait(set);
936 ptlrpc_set_destroy(set);
939 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
940 OBD_MD_FLATIME | OBD_MD_FLMTIME |
941 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
942 OBD_MD_FLDATAVERSION);
947 * Performs the getattr on the inode and updates its fields.
948 * If @sync != 0, perform the getattr under the server-side lock.
/* Fetch OST attributes for @inode's layout via ll_lsm_getattr() and, on
 * success, refresh the inode fields from the returned obdo.
 * NOTE(review): excerpt truncated — capa_put, lsm NULL handling and
 * return lines are missing from the visible text. */
950 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
951 __u64 ioepoch, int sync)
953 struct obd_capa *capa = ll_mdscapa_get(inode);
954 struct lov_stripe_md *lsm;
957 lsm = ccc_inode_lsm_get(inode);
958 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
959 capa, obdo, ioepoch, sync);
962 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
964 obdo_refresh_inode(inode, obdo, obdo->o_valid);
965 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
966 " blksize %lu\n", POSTID(oi), i_size_read(inode),
967 (unsigned long long)inode->i_blocks,
968 (unsigned long)ll_inode_blksize(inode));
970 ccc_inode_lsm_put(inode, lsm);
/* Merge MDS-provided timestamps (cached in lli_lvb) with OST attributes
 * obtained from the cl_object, taking the most recent of each timestamp,
 * and update the inode's size and blocks under the inode size lock.
 * NOTE(review): excerpt truncated — the "struct ost_lvb lvb" declaration,
 * rc handling braces and the return line are missing from the visible
 * text. */
974 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
976 struct ll_inode_info *lli = ll_i2info(inode);
977 struct cl_object *obj = lli->lli_clob;
978 struct cl_attr *attr = ccc_env_thread_attr(env);
982 ll_inode_size_lock(inode);
983 /* merge timestamps the most recently obtained from mds with
984 timestamps obtained from osts */
985 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
986 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
987 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
988 inode_init_lvb(inode, &lvb);
990 cl_object_attr_lock(obj);
991 rc = cl_object_attr_get(env, obj, attr);
992 cl_object_attr_unlock(obj);
995 if (lvb.lvb_atime < attr->cat_atime)
996 lvb.lvb_atime = attr->cat_atime;
997 if (lvb.lvb_ctime < attr->cat_ctime)
998 lvb.lvb_ctime = attr->cat_ctime;
999 if (lvb.lvb_mtime < attr->cat_mtime)
1000 lvb.lvb_mtime = attr->cat_mtime;
1002 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1003 PFID(&lli->lli_fid), attr->cat_size);
1004 cl_isize_write_nolock(inode, attr->cat_size);
1006 inode->i_blocks = attr->cat_blocks;
1008 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1009 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1010 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1012 ll_inode_size_unlock(inode);
/* Glimpse (getattr from OSTs) for an ioctl caller: fetch attributes for
 * @lsm into a temporary obdo and copy size/blocks/times into @st.
 * NOTE(review): excerpt truncated — the rc check brace and return line
 * are missing from the visible text. */
1017 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1020 struct obdo obdo = { 0 };
1023 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1025 st->st_size = obdo.o_size;
1026 st->st_blocks = obdo.o_blocks;
1027 st->st_mtime = obdo.o_mtime;
1028 st->st_atime = obdo.o_atime;
1029 st->st_ctime = obdo.o_ctime;
/* Decide whether atime updates should be suppressed for @file, mirroring
 * the kernel's file_accessed()/touch_atime() checks: O_NOATIME on the
 * file, S_NOATIME on the inode, noatime mount options, read-only mounts,
 * and nodiratime for directories.  NOTE(review): excerpt truncated — the
 * "return true;"/"return false;" lines after each check are missing from
 * the visible text. */
1036 const struct vfsmount *mnt = file->f_path.mnt;
1037 const struct inode *inode = file->f_path.dentry->d_inode;
1039 /* Adapted from file_accessed() and touch_atime().*/
1040 if (file->f_flags & O_NOATIME)
1043 if (inode->i_flags & S_NOATIME)
1046 if (IS_NOATIME(inode))
1049 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1052 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1055 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialise a cl_io from the file's open flags: nonblocking, append and
 * sync semantics for writes, the cl_object backing the inode, the lock
 * requirement (never for nolock files, mandatory for O_APPEND, maybe
 * otherwise), and the noatime decision.  NOTE(review): excerpt truncated
 * — the "if (write)" guard and part of the wr_sync expression are
 * missing from the visible text. */
1061 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1063 struct inode *inode = file->f_dentry->d_inode;
1065 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1067 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1068 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1069 file->f_flags & O_DIRECT ||
1072 io->ci_obj = ll_i2info(inode)->lli_clob;
1073 io->ci_lockreq = CILR_MAYBE;
1074 if (ll_file_nolock(file)) {
1075 io->ci_lockreq = CILR_NEVER;
1076 io->ci_no_srvlock = 1;
1077 } else if (file->f_flags & O_APPEND) {
1078 io->ci_lockreq = CILR_MANDATORY;
1081 io->ci_noatime = file_is_noatime(file);
/* Common read/write engine used by the aio/splice entry points.  Sets up
 * a cl_io for the requested operation, takes lli_write_mutex for
 * non-grouplock writes and lli_trunc_sem for reads, runs the cl_io loop,
 * restarts on short transfer when ci_need_restart is set, and tallies
 * read/write byte statistics (tracking fd_write_failed for writes).
 * NOTE(review): excerpt truncated — the return-type line, "restart:"
 * label, several case labels/braces and the final return are missing
 * from the visible text. */
1085 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1086 struct file *file, enum cl_io_type iot,
1087 loff_t *ppos, size_t count)
1089 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1090 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1095 io = ccc_env_thread_io(env);
1096 ll_io_init(io, file, iot == CIT_WRITE);
1098 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1099 struct vvp_io *vio = vvp_env_io(env);
1100 struct ccc_io *cio = ccc_env_io(env);
1101 int write_mutex_locked = 0;
1103 cio->cui_fd = LUSTRE_FPRIVATE(file);
1104 vio->cui_io_subtype = args->via_io_subtype;
1106 switch (vio->cui_io_subtype) {
1108 cio->cui_iov = args->u.normal.via_iov;
1109 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1110 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1111 cio->cui_iocb = args->u.normal.via_iocb;
1112 if ((iot == CIT_WRITE) &&
1113 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1114 if (mutex_lock_interruptible(&lli->
1116 GOTO(out, result = -ERESTARTSYS);
1117 write_mutex_locked = 1;
1118 } else if (iot == CIT_READ) {
1119 down_read(&lli->lli_trunc_sem);
1123 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1124 vio->u.splice.cui_flags = args->u.splice.via_flags;
1127 CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1130 result = cl_io_loop(env, io);
1131 if (write_mutex_locked)
1132 mutex_unlock(&lli->lli_write_mutex);
1133 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1134 up_read(&lli->lli_trunc_sem);
1136 /* cl_io_rw_init() handled IO */
1137 result = io->ci_result;
1140 if (io->ci_nob > 0) {
1141 result = io->ci_nob;
1142 *ppos = io->u.ci_wr.wr.crw_pos;
1146 cl_io_fini(env, io);
1147 /* If any bit been read/written (result != 0), we just return
1148 * short read/write instead of restart io. */
1149 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1150 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1151 iot == CIT_READ ? "read" : "write",
1152 file->f_dentry->d_name.name, *ppos, count);
1153 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1157 if (iot == CIT_READ) {
1159 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1160 LPROC_LL_READ_BYTES, result);
1161 } else if (iot == CIT_WRITE) {
1163 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1164 LPROC_LL_WRITE_BYTES, result);
1165 fd->fd_write_failed = false;
1166 } else if (result != -ERESTARTSYS) {
1167 fd->fd_write_failed = true;
/* ->aio_read handler: validate the iovec, obtain a cl environment, fill
 * the vvp_io_args for a normal (iovec) read and delegate to
 * ll_file_io_generic() with CIT_READ.  NOTE(review): excerpt truncated —
 * env/result declarations, error checks and the return line are missing
 * from the visible text. */
1174 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1175 unsigned long nr_segs, loff_t pos)
1178 struct vvp_io_args *args;
1183 result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1187 env = cl_env_get(&refcheck);
1189 return PTR_ERR(env);
1191 args = vvp_env_args(env, IO_NORMAL);
1192 args->u.normal.via_iov = (struct iovec *)iov;
1193 args->u.normal.via_nrsegs = nr_segs;
1194 args->u.normal.via_iocb = iocb;
1196 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1197 &iocb->ki_pos, count);
1198 cl_env_put(env, &refcheck);
/* Synchronous ->read wrapper: build a single-segment iovec and a sync
 * kiocb in per-env scratch space, call ll_file_aio_read(), and propagate
 * the updated position back to *ppos.  NOTE(review): excerpt truncated —
 * env/result declarations and the return line are missing from the
 * visible text. */
1202 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1206 struct iovec *local_iov;
1207 struct kiocb *kiocb;
1211 env = cl_env_get(&refcheck);
1213 return PTR_ERR(env);
1215 local_iov = &vvp_env_info(env)->vti_local_iov;
1216 kiocb = &vvp_env_info(env)->vti_kiocb;
1217 local_iov->iov_base = (void __user *)buf;
1218 local_iov->iov_len = count;
1219 init_sync_kiocb(kiocb, file);
1220 kiocb->ki_pos = *ppos;
1221 kiocb->ki_nbytes = count;
1223 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1224 *ppos = kiocb->ki_pos;
1226 cl_env_put(env, &refcheck);
1231 * Write to a file (through the page cache).
/*
 * Async write entry point: mirror image of ll_file_aio_read() but validated
 * with VERIFY_READ (write reads from user buffers) and run as CIT_WRITE.
 */
1233 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1234 unsigned long nr_segs, loff_t pos)
1237 struct vvp_io_args *args;
1242 result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
1246 env = cl_env_get(&refcheck);
1248 return PTR_ERR(env);
1250 args = vvp_env_args(env, IO_NORMAL);
1251 args->u.normal.via_iov = (struct iovec *)iov;
1252 args->u.normal.via_nrsegs = nr_segs;
1253 args->u.normal.via_iocb = iocb;
1255 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1256 &iocb->ki_pos, count);
1257 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry: same scaffolding as ll_file_read() — build a
 * one-segment iovec plus sync kiocb from env-local storage and forward to
 * ll_file_aio_write(), then copy the updated position back into *ppos.
 */
1261 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1265 struct iovec *local_iov;
1266 struct kiocb *kiocb;
1270 env = cl_env_get(&refcheck);
1272 return PTR_ERR(env);
1274 local_iov = &vvp_env_info(env)->vti_local_iov;
1275 kiocb = &vvp_env_info(env)->vti_kiocb;
1276 local_iov->iov_base = (void __user *)buf;
1277 local_iov->iov_len = count;
1278 init_sync_kiocb(kiocb, file);
1279 kiocb->ki_pos = *ppos;
1280 kiocb->ki_nbytes = count;
1282 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1283 *ppos = kiocb->ki_pos;
1285 cl_env_put(env, &refcheck);
1292 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry: run a CIT_READ through the generic IO path but with
 * IO_SPLICE args, so the data is delivered into the given pipe instead of a
 * user iovec.
 */
1294 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1295 struct pipe_inode_info *pipe, size_t count,
1299 struct vvp_io_args *args;
1303 env = cl_env_get(&refcheck);
1305 return PTR_ERR(env);
1307 args = vvp_env_args(env, IO_SPLICE);
1308 args->u.splice.via_pipe = pipe;
1309 args->u.splice.via_flags = flags;
1311 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1312 cl_env_put(env, &refcheck);
/*
 * Re-create the OST objects backing an inode on a given OST index.
 * Copies the current stripe layout (lsm) into a scratch lsm2, marks the
 * obdo with OBD_FL_RECREATE_OBJS and re-issues obd_create() under the
 * inode size lock.  Fails with -ENOENT if the file has no objects.
 */
1316 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1319 struct obd_export *exp = ll_i2dtexp(inode);
1320 struct obd_trans_info oti = { 0 };
1321 struct obdo *oa = NULL;
1324 struct lov_stripe_md *lsm = NULL, *lsm2;
1330 lsm = ccc_inode_lsm_get(inode);
1331 if (!lsm_has_objects(lsm))
1332 GOTO(out, rc = -ENOENT);
/* lsm is a header plus one lov_oinfo per stripe. */
1334 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1335 (lsm->lsm_stripe_count));
1337 OBD_ALLOC_LARGE(lsm2, lsm_size);
1339 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request. */
1342 oa->o_nlink = ost_idx;
1343 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1344 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1345 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1346 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1347 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1348 memcpy(lsm2, lsm, lsm_size);
/* Size lock serializes against truncate/extend while objects change. */
1349 ll_inode_size_lock(inode);
1350 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1351 ll_inode_size_unlock(inode);
1353 OBD_FREE_LARGE(lsm2, lsm_size);
1356 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: copy a ll_recreate_obj request from
 * user space and recreate the object by id on the requested OST index.
 * Restricted to CAP_SYS_ADMIN.
 */
1361 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1363 struct ll_recreate_obj ucreat;
1366 if (!capable(CFS_CAP_SYS_ADMIN))
1369 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
/* Build an mdt0 ost_id from the user-supplied object id. */
1373 ostid_set_seq_mdt0(&oi);
1374 ostid_set_id(&oi, ucreat.lrc_id);
1375 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
/*
 * LL_IOC_RECREATE_FID ioctl handler: like ll_lov_recreate_obj() but the
 * request is a lu_fid; the ost_id and OST index are derived from the FID.
 * Restricted to CAP_SYS_ADMIN.
 */
1378 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1384 if (!capable(CFS_CAP_SYS_ADMIN))
1387 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1390 fid_to_ostid(&fid, &oi);
/* OST index is encoded in bits 16..31 of the FID sequence. */
1391 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1392 return ll_lov_recreate(inode, &oi, ost_idx);
/*
 * Set the striping EA for an inode by (re)opening it with an IT_OPEN intent
 * carrying the lov_user_md.  Fails with -EEXIST if the file is already
 * striped.  On success the transient open handle is released again; the
 * delayed-create flag is cleared from the file on the way out.
 */
1395 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1396 int flags, struct lov_user_md *lum, int lum_size)
1398 struct lov_stripe_md *lsm = NULL;
1399 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* A non-NULL lsm means striping already exists: cannot be changed. */
1402 lsm = ccc_inode_lsm_get(inode);
1404 ccc_inode_lsm_put(inode, lsm);
1405 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1407 GOTO(out, rc = -EEXIST);
1410 ll_inode_size_lock(inode);
1411 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1413 GOTO(out_unlock, rc);
1414 rc = oit.d.lustre.it_status;
1416 GOTO(out_req_free, rc);
/* Open was only a vehicle for the setstripe; close the handle now. */
1418 ll_release_openhandle(file->f_dentry, &oit);
1421 ll_inode_size_unlock(inode);
1422 ll_intent_release(&oit);
1423 ccc_inode_lsm_put(inode, lsm);
1425 cl_lov_delay_create_clear(&file->f_flags);
1428 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping information) for @filename from the MDS via
 * md_getattr_name().  On success *lmmp points into the reply buffer (the
 * caller keeps *request alive to hold it) and *lmm_size is set.  Data from
 * the MDS is little-endian; it is swabbed to host order for userspace on
 * big-endian machines.
 */
1432 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1433 struct lov_mds_md **lmmp, int *lmm_size,
1434 struct ptlrpc_request **request)
1436 struct ll_sb_info *sbi = ll_i2sbi(inode);
1437 struct mdt_body *body;
1438 struct lov_mds_md *lmm = NULL;
1439 struct ptlrpc_request *req = NULL;
1440 struct md_op_data *op_data;
1443 rc = ll_get_max_mdsize(sbi, &lmmsize);
1447 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1448 strlen(filename), lmmsize,
1449 LUSTRE_OPC_ANY, NULL);
1450 if (IS_ERR(op_data))
1451 return PTR_ERR(op_data);
1453 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1454 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1455 ll_finish_md_op_data(op_data);
1457 CDEBUG(D_INFO, "md_getattr_name failed "
1458 "on %s: rc %d\n", filename, rc);
1462 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1463 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1465 lmmsize = body->eadatasize;
/* No EA returned (or zero size): the file/dir has no striping data. */
1467 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1469 GOTO(out, rc = -ENODATA);
1472 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1473 LASSERT(lmm != NULL);
/* Only V1/V3 LOV magics are understood here. */
1475 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1476 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1477 GOTO(out, rc = -EPROTO);
1481 * This is coming from the MDS, so is probably in
1482 * little endian. We convert it to host endian before
1483 * passing it to userspace.
/* Host is big-endian if LOV_MAGIC differs from its LE encoding. */
1485 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1488 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1489 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1492 /* if function called for directory - we should
1493 * avoid swab not existent lsm objects */
1494 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1495 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1496 if (S_ISREG(body->mode))
1497 lustre_swab_lov_user_md_objects(
1498 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1500 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1501 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1502 if (S_ISREG(body->mode))
1503 lustre_swab_lov_user_md_objects(
1504 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1511 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (plus one ost_data entry)
 * from user space and apply it via ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS, i.e. the EA references pre-existing objects.
 * Restricted to CAP_SYS_ADMIN.
 */
1516 static int ll_lov_setea(struct inode *inode, struct file *file,
1519 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1520 struct lov_user_md *lump;
1521 int lum_size = sizeof(struct lov_user_md) +
1522 sizeof(struct lov_user_ost_data);
1525 if (!capable(CFS_CAP_SYS_ADMIN))
1528 OBD_ALLOC_LARGE(lump, lum_size);
1532 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1533 OBD_FREE_LARGE(lump, lum_size);
1537 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1539 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: read a lov_user_md (V1 first, then re-read
 * as V3 if the magic says so) from user space and set the stripe EA.  On
 * success the in-kernel layout generation is refreshed and the resulting
 * layout is copied back to user space via obd_iocontrol(GETSTRIPE).
 */
1543 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1546 struct lov_user_md_v3 lumv3;
1547 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1548 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1549 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1551 int flags = FMODE_WRITE;
1553 /* first try with v1 which is smaller than v3 */
1554 lum_size = sizeof(struct lov_user_md_v1);
1555 if (copy_from_user(lumv1, lumv1p, lum_size))
1558 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1559 lum_size = sizeof(struct lov_user_md_v3);
1560 if (copy_from_user(&lumv3, lumv3p, lum_size))
1564 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1566 struct lov_stripe_md *lsm;
/* Pre-clear user's stripe_count; GETSTRIPE below fills real data. */
1569 put_user(0, &lumv1p->lmm_stripe_count);
1571 ll_layout_refresh(inode, &gen);
1572 lsm = ccc_inode_lsm_get(inode);
1573 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1574 0, lsm, (void *)arg);
1575 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: copy the inode's current striping layout to
 * user space by delegating to obd_iocontrol() on the data export.
 */
1580 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1582 struct lov_stripe_md *lsm;
1585 lsm = ccc_inode_lsm_get(inode);
1587 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1589 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK: take a cluster-wide group lock with gid @arg on the
 * file.  Only one group lock per file descriptor: the flag check is done
 * twice under lli_lock (before and after cl_get_grouplock()) to close the
 * race against a concurrent locker on the same fd.
 */
1593 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1595 struct ll_inode_info *lli = ll_i2info(inode);
1596 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1597 struct ccc_grouplock grouplock;
1600 if (ll_file_nolock(file))
1603 spin_lock(&lli->lli_lock);
1604 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1605 CWARN("group lock already existed with gid %lu\n",
1606 fd->fd_grouplock.cg_gid);
1607 spin_unlock(&lli->lli_lock);
1610 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1611 spin_unlock(&lli->lli_lock);
/* Enqueue the DLM group lock; may block unless O_NONBLOCK. */
1613 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1614 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: someone may have won the race meanwhile. */
1618 spin_lock(&lli->lli_lock);
1619 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1620 spin_unlock(&lli->lli_lock);
1621 CERROR("another thread just won the race\n");
1622 cl_put_grouplock(&grouplock);
1626 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1627 fd->fd_grouplock = grouplock;
1628 spin_unlock(&lli->lli_lock);
1630 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock with gid @arg held on this
 * file descriptor.  The fd state is detached under lli_lock, then the DLM
 * lock is dropped outside the spinlock.  Warns (and fails) if no lock is
 * held or the gid does not match.
 */
1634 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1636 struct ll_inode_info *lli = ll_i2info(inode);
1637 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1638 struct ccc_grouplock grouplock;
1640 spin_lock(&lli->lli_lock);
1641 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1642 spin_unlock(&lli->lli_lock);
1643 CWARN("no group lock held\n");
1646 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1648 if (fd->fd_grouplock.cg_gid != arg) {
1649 CWARN("group lock %lu doesn't match current id %lu\n",
1650 arg, fd->fd_grouplock.cg_gid);
1651 spin_unlock(&lli->lli_lock);
/* Take a local copy so the DLM put can happen without the spinlock. */
1655 grouplock = fd->fd_grouplock;
1656 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1657 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1658 spin_unlock(&lli->lli_lock);
1660 cl_put_grouplock(&grouplock);
1661 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1666 * Close inode open handle
1668 * \param dentry [in] dentry which contains the inode
1669 * \param it [in,out] intent which contains open info and result
1672 * \retval <0 failure
/*
 * Close the MDS open handle carried in an open intent: allocate a client
 * handle, fill it from the intent and close it, then drop the enqueue-open
 * reference if the intent still holds one.  No-op for the fs root or for
 * intents that did not perform an open.
 */
1674 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1676 struct inode *inode = dentry->d_inode;
1677 struct obd_client_handle *och;
1682 /* Root ? Do nothing. */
1683 if (dentry->d_inode->i_sb->s_root == dentry)
1686 /* No open handle to close? Move away */
1687 if (!it_disposition(it, DISP_OPEN_OPEN))
1690 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1692 OBD_ALLOC(och, sizeof(*och));
1694 GOTO(out, rc = -ENOMEM);
1696 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1698 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1701 /* this one is in place of ll_file_open */
1702 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1703 ptlrpc_req_finished(it->d.lustre.it_data);
1704 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1710 * Get size for inode for which FIEMAP mapping is requested.
1711 * Make the FIEMAP get_info call and returns the result.
/*
 * Perform a FIEMAP extent-mapping call against the data export.  Rejects
 * unsupported flags (reporting the supported set back in fm_flags), honours
 * FIEMAP_FLAG_SYNC by flushing dirty pages first, and short-circuits to
 * zero extents for an empty file.
 */
1713 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1716 struct obd_export *exp = ll_i2dtexp(inode);
1717 struct lov_stripe_md *lsm = NULL;
1718 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1719 int vallen = num_bytes;
1722 /* Checks for fiemap flags */
1723 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Tell userspace which flags ARE supported, per fiemap convention. */
1724 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1728 /* Check for FIEMAP_FLAG_SYNC */
1729 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1730 rc = filemap_fdatawrite(inode->i_mapping);
1735 lsm = ccc_inode_lsm_get(inode);
1739 /* If the stripe_count > 1 and the application does not understand
1740 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1742 if (lsm->lsm_stripe_count > 1 &&
1743 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1744 GOTO(out, rc = -EOPNOTSUPP);
1746 fm_key.oa.o_oi = lsm->lsm_oi;
1747 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1749 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1750 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1751 /* If filesize is 0, then there would be no objects for mapping */
1752 if (fm_key.oa.o_size == 0) {
1753 fiemap->fm_mapped_extents = 0;
1757 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1759 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1762 CERROR("obd_get_info failed: rc = %d\n", rc);
1765 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.  The
 * request (gfin) is copied in to learn gf_pathlen, a variably-sized reply
 * buffer (gfout) is allocated for the path, and the result is copied back.
 * Permitted for CAP_DAC_READ_SEARCH or when LL_SBI_USER_FID2PATH is set.
 */
1769 int ll_fid2path(struct inode *inode, void *arg)
1771 struct obd_export *exp = ll_i2mdexp(inode);
1772 struct getinfo_fid2path *gfout, *gfin;
1775 if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1776 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1779 /* Need to get the buflen */
1780 OBD_ALLOC_PTR(gfin);
1783 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* Reply buffer: fixed header plus user-requested path length. */
1788 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1789 OBD_ALLOC(gfout, outsize);
1790 if (gfout == NULL) {
1794 memcpy(gfout, gfin, sizeof(*gfout));
1797 /* Call mdc_iocontrol */
1798 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1802 if (copy_to_user(arg, gfout, outsize))
1806 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the user's
 * fm_extent_count, copy the request (and the first extent, used as the
 * continuation point) in, run ll_do_fiemap(), and copy header plus mapped
 * extents back out.
 */
1810 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1812 struct ll_user_fiemap *fiemap_s;
1813 size_t num_bytes, ret_bytes;
1814 unsigned int extent_count;
1817 /* Get the extent count so we can calculate the size of
1818 * required fiemap buffer */
1819 if (get_user(extent_count,
1820 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1822 num_bytes = sizeof(*fiemap_s) + (extent_count *
1823 sizeof(struct ll_fiemap_extent));
1825 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1826 if (fiemap_s == NULL)
1829 /* get the fiemap value */
1830 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1832 GOTO(error, rc = -EFAULT);
1834 /* If fm_extent_count is non-zero, read the first extent since
1835 * it is used to calculate end_offset and device from previous
1838 if (copy_from_user(&fiemap_s->fm_extents[0],
1839 (char __user *)arg + sizeof(*fiemap_s),
1840 sizeof(struct ll_fiemap_extent)))
1841 GOTO(error, rc = -EFAULT);
1844 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back the header plus however many extents were mapped. */
1848 ret_bytes = sizeof(struct ll_user_fiemap);
1850 if (extent_count != 0)
1851 ret_bytes += (fiemap_s->fm_mapped_extents *
1852 sizeof(struct ll_fiemap_extent));
1854 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1858 OBD_FREE_LARGE(fiemap_s, num_bytes);
1863 * Read the data_version for inode.
1865 * This value is computed using stripe object version on OST.
1866 * Version is computed using server side locking.
1868 * @param extent_lock Take extent lock. Not needed if a process is already
1869 * holding the OST object group locks.
/*
 * Fetch the file's data version through ll_lsm_getattr() on the data
 * export.  A file with no objects reports version 0; a reply without
 * OBD_MD_FLDATAVERSION set is treated as an error (see missing lines —
 * the exact rc is not visible in this extract).
 */
1871 int ll_data_version(struct inode *inode, __u64 *data_version,
1874 struct lov_stripe_md *lsm = NULL;
1875 struct ll_sb_info *sbi = ll_i2sbi(inode);
1876 struct obdo *obdo = NULL;
1879 /* If no stripe, we consider version is 0. */
1880 lsm = ccc_inode_lsm_get(inode);
1881 if (!lsm_has_objects(lsm)) {
1883 CDEBUG(D_INODE, "No object for inode\n");
1887 OBD_ALLOC_PTR(obdo);
1889 GOTO(out, rc = -ENOMEM);
1891 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1893 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1896 *data_version = obdo->o_data_version;
1901 ccc_inode_lsm_put(inode, lsm);
1906 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: open a write lease with MDS_OPEN_RELEASE, grab the latest
 * data version and merged attributes, then close the handle — the close
 * carries the release to the MDT (the lease lock handle is packed there,
 * see comment below).  The lease is closed explicitly on the error path.
 */
1908 int ll_hsm_release(struct inode *inode)
1910 struct cl_env_nest nest;
1912 struct obd_client_handle *och = NULL;
1913 __u64 data_version = 0;
1917 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1918 ll_get_fsname(inode->i_sb, NULL, 0),
1919 PFID(&ll_i2info(inode)->lli_fid));
1921 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1923 GOTO(out, rc = PTR_ERR(och));
1925 /* Grab latest data_version and [am]time values */
1926 rc = ll_data_version(inode, &data_version, 1);
1930 env = cl_env_nested_get(&nest);
1932 GOTO(out, rc = PTR_ERR(env));
/* Merge OST attributes (size/times) into the inode before release. */
1934 ll_merge_lvb(env, inode);
1935 cl_env_nested_put(&nest, env);
1937 /* Release the file.
1938 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1939 * we still need it to pack l_remote_handle to MDT. */
1940 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1946 if (och != NULL && !IS_ERR(och)) /* close the file */
1947 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped, their
 * saved [am]time attrs for optional restore, and per-side data-version
 * check flags.  Heap-allocated because it is too large for the stack.
 */
1952 struct ll_swap_stack {
1953 struct iattr ia1, ia2;
1955 struct inode *inode1, *inode2;
1956 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically exchange the layouts of two files.
 * Steps: validate (regular file, write permission, same fs), order the pair
 * by FID to avoid lock inversion, optionally take group locks to flush
 * dirty cache, save [am]times, verify data versions if requested, send the
 * swap to the MDT via obd_iocontrol(), then restore times as asked.
 */
1959 static int ll_swap_layouts(struct file *file1, struct file *file2,
1960 struct lustre_swap_layouts *lsl)
1962 struct mdc_swap_layouts msl;
1963 struct md_op_data *op_data;
1966 struct ll_swap_stack *llss = NULL;
1969 OBD_ALLOC_PTR(llss);
1973 llss->inode1 = file1->f_dentry->d_inode;
1974 llss->inode2 = file2->f_dentry->d_inode;
1976 if (!S_ISREG(llss->inode2->i_mode))
1977 GOTO(free, rc = -EINVAL);
1979 if (inode_permission(llss->inode1, MAY_WRITE) ||
1980 inode_permission(llss->inode2, MAY_WRITE))
1981 GOTO(free, rc = -EPERM);
1983 if (llss->inode2->i_sb != llss->inode1->i_sb)
1984 GOTO(free, rc = -EXDEV);
1986 /* we use 2 bool because it is easier to swap than 2 bits */
1987 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1988 llss->check_dv1 = true;
1990 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1991 llss->check_dv2 = true;
1993 /* we cannot use lsl->sl_dvX directly because we may swap them */
1994 llss->dv1 = lsl->sl_dv1;
1995 llss->dv2 = lsl->sl_dv2;
1997 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1998 if (rc == 0) /* same file, done! */
/* Canonical FID order prevents deadlock between concurrent swaps. */
2001 if (rc < 0) { /* sequentialize it */
2002 swap(llss->inode1, llss->inode2);
2004 swap(llss->dv1, llss->dv2);
2005 swap(llss->check_dv1, llss->check_dv2);
2009 if (gid != 0) { /* application asks to flush dirty cache */
2010 rc = ll_get_grouplock(llss->inode1, file1, gid);
2014 rc = ll_get_grouplock(llss->inode2, file2, gid);
2016 ll_put_grouplock(llss->inode1, file1, gid);
2021 /* to be able to restore mtime and atime after swap
2022 * we need to first save them */
2024 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2025 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2026 llss->ia1.ia_atime = llss->inode1->i_atime;
2027 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2028 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2029 llss->ia2.ia_atime = llss->inode2->i_atime;
2030 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2033 /* ultimate check, before swapping the layouts we check if
2034 * dataversion has changed (if requested) */
2035 if (llss->check_dv1) {
2036 rc = ll_data_version(llss->inode1, &dv, 0);
2039 if (dv != llss->dv1)
2040 GOTO(putgl, rc = -EAGAIN);
2043 if (llss->check_dv2) {
2044 rc = ll_data_version(llss->inode2, &dv, 0);
2047 if (dv != llss->dv2)
2048 GOTO(putgl, rc = -EAGAIN);
2051 /* struct md_op_data is used to send the swap args to the mdt
2052 * only flags is missing, so we use struct mdc_swap_layouts
2053 * through the md_op_data->op_data */
2054 /* flags from user space have to be converted before they are send to
2055 * server, no flag is sent today, they are only used on the client */
2058 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2059 0, LUSTRE_OPC_ANY, &msl);
2060 if (IS_ERR(op_data))
2061 GOTO(free, rc = PTR_ERR(op_data));
2063 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2064 sizeof(*op_data), op_data, NULL);
2065 ll_finish_md_op_data(op_data);
2069 ll_put_grouplock(llss->inode2, file2, gid);
2070 ll_put_grouplock(llss->inode1, file1, gid);
2073 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2077 /* clear useless flags */
2078 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2079 llss->ia1.ia_valid &= ~ATTR_MTIME;
2080 llss->ia2.ia_valid &= ~ATTR_MTIME;
2083 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2084 llss->ia1.ia_valid &= ~ATTR_ATIME;
2085 llss->ia2.ia_valid &= ~ATTR_ATIME;
2088 /* update time if requested */
/* Note the cross-assignment: ia2 (saved from inode2) goes to file1 and
 * vice versa, because the layouts — and hence identities — were swapped. */
2090 if (llss->ia2.ia_valid != 0) {
2091 mutex_lock(&llss->inode1->i_mutex);
2092 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2093 mutex_unlock(&llss->inode1->i_mutex);
2096 if (llss->ia1.ia_valid != 0) {
2099 mutex_lock(&llss->inode2->i_mutex);
2100 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2101 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Apply an HSM state change (set/clear masks) to an inode by sending
 * LL_IOC_HSM_STATE_SET to the MDT.  Flags outside HSM_USER_MASK require
 * CAP_SYS_ADMIN.
 */
2113 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2115 struct md_op_data *op_data;
2118 /* Non-root users are forbidden to set or clear flags which are
2119 * NOT defined in HSM_USER_MASK. */
2120 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2121 !capable(CFS_CAP_SYS_ADMIN))
2124 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2125 LUSTRE_OPC_ANY, hss);
2126 if (IS_ERR(op_data))
2127 return PTR_ERR(op_data);
2129 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2130 sizeof(*op_data), op_data, NULL);
2132 ll_finish_md_op_data(op_data);
/*
 * HSM import: register a file that already exists in the archive.  Marks
 * the inode ARCHIVED|EXISTS|RELEASED with the given archive id, then
 * forces the mode/uid/gid/size/times supplied by the copytool onto the
 * inode via ll_setattr_raw(..., true).  Regular files only.
 */
2137 static int ll_hsm_import(struct inode *inode, struct file *file,
2138 struct hsm_user_import *hui)
2140 struct hsm_state_set *hss = NULL;
2141 struct iattr *attr = NULL;
2145 if (!S_ISREG(inode->i_mode))
2151 GOTO(out, rc = -ENOMEM);
2153 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2154 hss->hss_archive_id = hui->hui_archive_id;
2155 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2156 rc = ll_hsm_state_set(inode, hss);
2160 OBD_ALLOC_PTR(attr);
2162 GOTO(out, rc = -ENOMEM);
/* Only permission bits are taken from the copytool; type is forced. */
2164 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2165 attr->ia_mode |= S_IFREG;
2166 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2167 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2168 attr->ia_size = hui->hui_size;
2169 attr->ia_mtime.tv_sec = hui->hui_mtime;
2170 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2171 attr->ia_atime.tv_sec = hui->hui_atime;
2172 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE bypasses permission checks for this trusted import. */
2174 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2175 ATTR_UID | ATTR_GID |
2176 ATTR_MTIME | ATTR_MTIME_SET |
2177 ATTR_ATIME | ATTR_ATIME_SET;
2179 rc = ll_setattr_raw(file->f_dentry, attr, true);
/*
 * Main ioctl dispatcher for regular Lustre files.  Handles fd-flag
 * manipulation, LOV striping get/set, layout swap, fiemap, group locks,
 * FID/path translation, data version, HSM operations and leases; anything
 * unrecognized falls through to ll_iocontrol_call()/obd_iocontrol() at the
 * bottom.
 */
2193 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2195 struct inode *inode = file->f_dentry->d_inode;
2196 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2199 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2200 inode->i_generation, inode, cmd);
2201 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2203 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2204 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2208 case LL_IOC_GETFLAGS:
2209 /* Get the current value of the file flags */
2210 return put_user(fd->fd_flags, (int *)arg);
2211 case LL_IOC_SETFLAGS:
2212 case LL_IOC_CLRFLAGS:
2213 /* Set or clear specific file flags */
2214 /* XXX This probably needs checks to ensure the flags are
2215 * not abused, and to handle any flag side effects.
2217 if (get_user(flags, (int *) arg))
2220 if (cmd == LL_IOC_SETFLAGS) {
2221 if ((flags & LL_FILE_IGNORE_LOCK) &&
2222 !(file->f_flags & O_DIRECT)) {
2223 CERROR("%s: unable to disable locking on "
2224 "non-O_DIRECT file\n", current->comm);
2228 fd->fd_flags |= flags;
2230 fd->fd_flags &= ~flags;
2233 case LL_IOC_LOV_SETSTRIPE:
2234 return ll_lov_setstripe(inode, file, arg);
2235 case LL_IOC_LOV_SETEA:
2236 return ll_lov_setea(inode, file, arg);
2237 case LL_IOC_LOV_SWAP_LAYOUTS: {
2239 struct lustre_swap_layouts lsl;
2241 if (copy_from_user(&lsl, (char *)arg,
2242 sizeof(struct lustre_swap_layouts)))
/* Both fds must be writable for a swap: this one and lsl.sl_fd. */
2245 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2248 file2 = fget(lsl.sl_fd);
2253 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2254 rc = ll_swap_layouts(file, file2, &lsl);
2258 case LL_IOC_LOV_GETSTRIPE:
2259 return ll_lov_getstripe(inode, arg);
2260 case LL_IOC_RECREATE_OBJ:
2261 return ll_lov_recreate_obj(inode, arg);
2262 case LL_IOC_RECREATE_FID:
2263 return ll_lov_recreate_fid(inode, arg);
2264 case FSFILT_IOC_FIEMAP:
2265 return ll_ioctl_fiemap(inode, arg);
2266 case FSFILT_IOC_GETFLAGS:
2267 case FSFILT_IOC_SETFLAGS:
2268 return ll_iocontrol(inode, file, cmd, arg);
2269 case FSFILT_IOC_GETVERSION_OLD:
2270 case FSFILT_IOC_GETVERSION:
2271 return put_user(inode->i_generation, (int *)arg);
2272 case LL_IOC_GROUP_LOCK:
2273 return ll_get_grouplock(inode, file, arg);
2274 case LL_IOC_GROUP_UNLOCK:
2275 return ll_put_grouplock(inode, file, arg);
2276 case IOC_OBD_STATFS:
2277 return ll_obd_statfs(inode, (void *)arg);
2279 /* We need to special case any other ioctls we want to handle,
2280 * to send them to the MDS/OST as appropriate and to properly
2281 * network encode the arg field.
2282 case FSFILT_IOC_SETVERSION_OLD:
2283 case FSFILT_IOC_SETVERSION:
2285 case LL_IOC_FLUSHCTX:
2286 return ll_flush_ctx(inode);
2287 case LL_IOC_PATH2FID: {
2288 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2289 sizeof(struct lu_fid)))
2294 case OBD_IOC_FID2PATH:
2295 return ll_fid2path(inode, (void *)arg);
2296 case LL_IOC_DATA_VERSION: {
2297 struct ioc_data_version idv;
2300 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* LL_DV_NOFLUSH lets the caller skip the extent-lock flush. */
2303 rc = ll_data_version(inode, &idv.idv_version,
2304 !(idv.idv_flags & LL_DV_NOFLUSH));
2306 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2312 case LL_IOC_GET_MDTIDX: {
2315 mdtidx = ll_get_mdt_idx(inode);
2319 if (put_user((int)mdtidx, (int*)arg))
2324 case OBD_IOC_GETDTNAME:
2325 case OBD_IOC_GETMDNAME:
2326 return ll_get_obd_name(inode, cmd, arg);
2327 case LL_IOC_HSM_STATE_GET: {
2328 struct md_op_data *op_data;
2329 struct hsm_user_state *hus;
2336 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2337 LUSTRE_OPC_ANY, hus);
2338 if (IS_ERR(op_data)) {
2340 return PTR_ERR(op_data);
2343 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2346 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2349 ll_finish_md_op_data(op_data);
2353 case LL_IOC_HSM_STATE_SET: {
2354 struct hsm_state_set *hss;
2361 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2366 rc = ll_hsm_state_set(inode, hss);
2371 case LL_IOC_HSM_ACTION: {
2372 struct md_op_data *op_data;
2373 struct hsm_current_action *hca;
2380 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2381 LUSTRE_OPC_ANY, hca);
2382 if (IS_ERR(op_data)) {
2384 return PTR_ERR(op_data);
2387 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2390 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2393 ll_finish_md_op_data(op_data);
2397 case LL_IOC_SET_LEASE: {
2398 struct ll_inode_info *lli = ll_i2info(inode);
2399 struct obd_client_handle *och = NULL;
/* Requested lease mode must be compatible with the fd's open mode. */
2405 if (!(file->f_mode & FMODE_WRITE))
2410 if (!(file->f_mode & FMODE_READ))
/* Unlock path: detach any lease held on this fd and close it. */
2415 mutex_lock(&lli->lli_och_mutex);
2416 if (fd->fd_lease_och != NULL) {
2417 och = fd->fd_lease_och;
2418 fd->fd_lease_och = NULL;
2420 mutex_unlock(&lli->lli_och_mutex);
2423 mode = och->och_flags &
2424 (FMODE_READ|FMODE_WRITE);
2425 rc = ll_lease_close(och, inode, &lease_broken);
2426 if (rc == 0 && lease_broken)
2432 /* return the type of lease or error */
2433 return rc < 0 ? rc : (int)mode;
2438 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2440 /* apply for lease */
2441 och = ll_lease_open(inode, file, mode, 0);
2443 return PTR_ERR(och);
2446 mutex_lock(&lli->lli_och_mutex);
2447 if (fd->fd_lease_och == NULL) {
2448 fd->fd_lease_och = och;
2451 mutex_unlock(&lli->lli_och_mutex);
2453 /* impossible now that only excl is supported for now */
2454 ll_lease_close(och, inode, &lease_broken);
2459 case LL_IOC_GET_LEASE: {
2460 struct ll_inode_info *lli = ll_i2info(inode);
2461 struct ldlm_lock *lock = NULL;
2464 mutex_lock(&lli->lli_och_mutex);
2465 if (fd->fd_lease_och != NULL) {
2466 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease mode only while the DLM lock is not cancelled. */
2468 lock = ldlm_handle2lock(&och->och_lease_handle);
2470 lock_res_and_lock(lock);
2471 if (!ldlm_is_cancel(lock))
2472 rc = och->och_flags &
2473 (FMODE_READ | FMODE_WRITE);
2474 unlock_res_and_lock(lock);
2475 ldlm_lock_put(lock);
2478 mutex_unlock(&lli->lli_och_mutex);
2481 case LL_IOC_HSM_IMPORT: {
2482 struct hsm_user_import *hui;
2488 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2493 rc = ll_hsm_import(inode, file, hui);
/* Fallback: give registered drivers, then the data export, a shot. */
2502 ll_iocontrol_call(inode, file, cmd, arg, &err))
2505 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek for Lustre files.  For SEEK_END/SEEK_HOLE/SEEK_DATA the current
 * size is glimpsed from the OSTs first so eof is accurate, then the work
 * is delegated to generic_file_llseek_size() bounded by the fs max bytes.
 */
2512 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2514 struct inode *inode = file->f_dentry->d_inode;
2515 loff_t retval, eof = 0;
/* Computed only for the trace message below; real work is generic. */
2517 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2518 (origin == SEEK_CUR) ? file->f_pos : 0);
2519 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2520 inode->i_ino, inode->i_generation, inode, retval, retval,
2522 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2524 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2525 retval = ll_glimpse_size(inode);
2528 eof = i_size_read(inode);
2531 retval = generic_file_llseek_size(file, offset, origin,
2532 ll_file_maxbytes(inode), eof);
/*
 * flush (close-time) handler: surface any async writeback errors recorded
 * on the inode/clio object as -EIO, but only once — if the application was
 * already told of a write failure (fd_write_failed), stay silent.
 */
2536 int ll_flush(struct file *file, fl_owner_t id)
2538 struct inode *inode = file->f_dentry->d_inode;
2539 struct ll_inode_info *lli = ll_i2info(inode);
2540 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2543 LASSERT(!S_ISDIR(inode->i_mode));
2545 /* catch async errors that were recorded back when async writeback
2546 * failed for pages in this mapping. */
2547 rc = lli->lli_async_rc;
2548 lli->lli_async_rc = 0;
2549 err = lov_read_and_clear_async_rc(lli->lli_clob);
2553 /* The application has been told write failure already.
2554 * Do not report failure again. */
2555 if (fd->fd_write_failed)
2557 return rc ? -EIO : 0;
2561 * Called to make sure a portion of file has been written out.
2562 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2564 * Return how many pages have been written.
/*
 * Run a CIT_FSYNC cl_io over [start, end] with the given fsync mode
 * (NONE/LOCAL/DISCARD/ALL).  On success returns fi_nr_written, the number
 * of pages written out.
 */
2566 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2567 enum cl_fsync_mode mode, int ignore_layout)
2569 struct cl_env_nest nest;
2572 struct obd_capa *capa = NULL;
2573 struct cl_fsync_io *fio;
2576 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2577 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2580 env = cl_env_nested_get(&nest);
2582 return PTR_ERR(env);
/* OSS write capability accompanies the sync RPCs. */
2584 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2586 io = ccc_env_thread_io(env);
2587 io->ci_obj = cl_i2info(inode)->lli_clob;
2588 io->ci_ignore_layout = ignore_layout;
2590 /* initialize parameters for sync */
2591 fio = &io->u.ci_fsync;
2592 fio->fi_capa = capa;
2593 fio->fi_start = start;
2595 fio->fi_fid = ll_inode2fid(inode);
2596 fio->fi_mode = mode;
2597 fio->fi_nr_written = 0;
2599 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2600 result = cl_io_loop(env, io);
2602 result = io->ci_result;
2604 result = fio->fi_nr_written;
2605 cl_io_fini(env, io);
2606 cl_env_nested_put(&nest, env);
2614 * When dentry is provided (the 'else' case), *file->f_dentry may be
2615 * null and dentry must be used directly rather than pulled from
2616 * *file->f_dentry as is done otherwise.
/*
 * fsync handler: flush the page cache for [start, end], collect recorded
 * async writeback errors, issue an MDS sync (md_sync), and for datasync on
 * regular files run an OST-side CL_FSYNC over the whole file, updating
 * fd_write_failed accordingly.  Runs under i_mutex.
 */
2619 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2621 struct dentry *dentry = file->f_dentry;
2622 struct inode *inode = dentry->d_inode;
2623 struct ll_inode_info *lli = ll_i2info(inode);
2624 struct ptlrpc_request *req;
2625 struct obd_capa *oc;
2628 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2629 inode->i_generation, inode);
2630 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2632 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2633 mutex_lock(&inode->i_mutex);
2635 /* catch async errors that were recorded back when async writeback
2636 * failed for pages in this mapping. */
2637 if (!S_ISDIR(inode->i_mode)) {
2638 err = lli->lli_async_rc;
2639 lli->lli_async_rc = 0;
2642 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Metadata sync to the MDS, authorized with an MDS capability. */
2647 oc = ll_mdscapa_get(inode);
2648 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2654 ptlrpc_req_finished(req);
2656 if (datasync && S_ISREG(inode->i_mode)) {
2657 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2659 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2661 if (rc == 0 && err < 0)
2664 fd->fd_write_failed = true;
2666 fd->fd_write_failed = false;
2669 mutex_unlock(&inode->i_mutex);
/*
 * VFS ->flock()/->lock() handler: translate a BSD flock or POSIX fcntl
 * lock request into an LDLM_FLOCK enqueue to the MDT, then mirror the
 * result into the kernel's local lock bookkeeping
 * (flock_lock_file_wait / posix_lock_file_wait).  If the local step
 * fails, the server-side lock is dropped again by re-enqueueing with
 * LCK_NL (unlock).
 * NOTE(review): lines are elided in this excerpt (e.g. the F_GETLK
 * cases setting LDLM_FL_TEST_LOCK, the final return).
 */
2673 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2675 struct inode *inode = file->f_dentry->d_inode;
2676 struct ll_sb_info *sbi = ll_i2sbi(inode);
2677 struct ldlm_enqueue_info einfo = {
2678 .ei_type = LDLM_FLOCK,
2679 .ei_cb_cp = ldlm_flock_completion_ast,
2680 .ei_cbdata = file_lock,
2682 struct md_op_data *op_data;
2683 struct lustre_handle lockh = {0};
2684 ldlm_policy_data_t flock = {{0}};
2689 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2690 inode->i_ino, file_lock);
2692 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* map the request into an ldlm inodebits/flock policy descriptor */
2694 if (file_lock->fl_flags & FL_FLOCK) {
2695 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2696 /* flocks are whole-file locks */
2697 flock.l_flock.end = OFFSET_MAX;
2698 /* For flocks owner is determined by the local file descriptor*/
2699 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2700 } else if (file_lock->fl_flags & FL_POSIX) {
2701 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2702 flock.l_flock.start = file_lock->fl_start;
2703 flock.l_flock.end = file_lock->fl_end;
2707 flock.l_flock.pid = file_lock->fl_pid;
2709 /* Somewhat ugly workaround for svc lockd.
2710 * lockd installs custom fl_lmops->lm_compare_owner that checks
2711 * for the fl_owner to be the same (which it always is on local node
2712 * I guess between lockd processes) and then compares pid.
2713 * As such we assign pid to the owner field to make it all work,
2714 * conflict with normal locks is unlikely since pid space and
2715 * pointer space for current->files are not intersecting */
2716 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2717 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* choose the ldlm lock mode from the fcntl lock type:
 * F_RDLCK -> LCK_PR, F_WRLCK -> LCK_PW, F_UNLCK -> LCK_NL */
2719 switch (file_lock->fl_type) {
2721 einfo.ei_mode = LCK_PR;
2724 /* An unlock request may or may not have any relation to
2725 * existing locks so we may not be able to pass a lock handle
2726 * via a normal ldlm_lock_cancel() request. The request may even
2727 * unlock a byte range in the middle of an existing lock. In
2728 * order to process an unlock request we need all of the same
2729 * information that is given with a normal read or write record
2730 * lock request. To avoid creating another ldlm unlock (cancel)
2731 * message we'll treat a LCK_NL flock request as an unlock. */
2732 einfo.ei_mode = LCK_NL;
2735 einfo.ei_mode = LCK_PW;
2738 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2739 file_lock->fl_type);
/* non-blocking variants request LDLM_FL_BLOCK_NOWAIT; F_GETLK*
 * variants only test via LDLM_FL_TEST_LOCK */
2754 flags = LDLM_FL_BLOCK_NOWAIT;
2760 flags = LDLM_FL_TEST_LOCK;
2761 /* Save the old mode so that if the mode in the lock changes we
2762 * can decrement the appropriate reader or writer refcount. */
2763 file_lock->fl_type = einfo.ei_mode;
2766 CERROR("unknown fcntl lock command: %d\n", cmd);
2770 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2771 LUSTRE_OPC_ANY, NULL);
2772 if (IS_ERR(op_data))
2773 return PTR_ERR(op_data);
2775 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2776 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2777 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock on the MDT */
2779 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2780 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* mirror a successful (or unlock) result into the local VFS lock
 * tables; test-only requests skip the POSIX mirror */
2782 if ((file_lock->fl_flags & FL_FLOCK) &&
2783 (rc == 0 || file_lock->fl_type == F_UNLCK))
2784 rc2 = flock_lock_file_wait(file, file_lock);
2785 if ((file_lock->fl_flags & FL_POSIX) &&
2786 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2787 !(flags & LDLM_FL_TEST_LOCK))
2788 rc2 = posix_lock_file_wait(file, file_lock);
2790 if (rc2 && file_lock->fl_type != F_UNLCK) {
/* local bookkeeping failed: undo the server lock by
 * re-enqueueing it as LCK_NL (treated as unlock above) */
2791 einfo.ei_mode = LCK_NL;
2792 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2793 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2797 ll_finish_md_op_data(op_data);
/* ->flock()/->lock() stub for the "-o noflock" mount option; body is
 * elided in this excerpt (per the comment at the noflock fops table it
 * returns ENOSYS to the caller). */
2802 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2808 * test if some locks matching bits and l_req_mode are acquired
2809 * - bits can be in different locks
2810 * - if found clear the common lock bits in *bits
2811 * - the bits not found, are kept in *bits
2813 * \param bits [IN] searched lock bits
2814 * \param l_req_mode [IN] searched lock mode
2815 * \retval boolean, true iff all bits are found
/* See the \param/\retval block above: tests (without taking) whether
 * MD ibits locks covering *bits are cached locally, clearing found
 * bits from *bits.  NOTE(review): lines are elided in this excerpt. */
2817 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2819 struct lustre_handle lockh;
2820 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match any of CR/CW/PR/PW */
2821 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2822 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2830 fid = &ll_i2info(inode)->lli_fid;
2831 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2832 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the lock */
2834 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit individually until all are found */
2835 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
/* NOTE(review): "1 << i" is an int shift against a __u64 bit mask;
 * safe only while MDS_INODELOCK_MAXSHIFT < 31 — confirm upstream */
2836 policy.l_inodebits.bits = *bits & (1 << i);
2837 if (policy.l_inodebits.bits == 0)
2840 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2841 &policy, mode, &lockh)) {
2842 struct ldlm_lock *lock;
2844 lock = ldlm_handle2lock(&lockh);
/* clear every bit the matched lock actually covers */
2847 ~(lock->l_policy_data.l_inodebits.bits);
2848 LDLM_LOCK_PUT(lock);
2850 *bits &= ~policy.l_inodebits.bits;
/* Match (and take a reference on) a cached MD ibits lock covering
 * @bits in one of the modes in @mode; the matched handle is returned
 * in @lockh.  Returns the matched mode, or 0 if none is cached. */
2857 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2858 struct lustre_handle *lockh, __u64 flags,
2861 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2865 fid = &ll_i2info(inode)->lli_fid;
2866 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2868 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2869 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process a revalidate RPC result: -ENOENT on an already-unlinked
 * inode is not an error (nlink is fixed up), other failures are logged. */
2874 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2876 /* Already unlinked. Just update nlink and return success */
2877 if (rc == -ENOENT) {
2879 /* This path cannot be hit for regular files unless in
2880 * case of obscure races, so no need to validate size.
2882 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2884 } else if (rc != 0) {
2885 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2886 ll_get_fsname(inode->i_sb, NULL, 0),
2887 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate the inode attributes covered by @ibits against the MDT.
 * Two paths: if the server supports OBD_CONNECT_ATTRFID, do an intent
 * getattr/lookup by FID (no name); otherwise, if no matching MD lock
 * is cached, issue a plain md_getattr and refresh the inode from the
 * reply.  NOTE(review): lines are elided in this excerpt.
 */
2893 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2896 struct inode *inode = dentry->d_inode;
2897 struct ptlrpc_request *req = NULL;
2898 struct obd_export *exp;
2901 LASSERT(inode != NULL);
2903 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2904 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2906 exp = ll_i2mdexp(inode);
2908 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2909 * But under CMD case, it caused some lock issues, should be fixed
2910 * with new CMD ibits lock. See bug 12718 */
2911 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2912 struct lookup_intent oit = { .it_op = IT_GETATTR };
2913 struct md_op_data *op_data;
/* a pure LOOKUP revalidate can use the cheaper IT_LOOKUP intent */
2915 if (ibits == MDS_INODELOCK_LOOKUP)
2916 oit.it_op = IT_LOOKUP;
2918 /* Call getattr by fid, so do not provide name at all. */
2919 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
2920 dentry->d_inode, NULL, 0, 0,
2921 LUSTRE_OPC_ANY, NULL);
2922 if (IS_ERR(op_data))
2923 return PTR_ERR(op_data);
/* M_CHECK_STALE flags this as a stale-check intent for the MDT */
2925 oit.it_create_mode |= M_CHECK_STALE;
2926 rc = md_intent_lock(exp, op_data, NULL, 0,
2927 /* we are not interested in name
2930 ll_md_blocking_ast, 0);
2931 ll_finish_md_op_data(op_data);
2932 oit.it_create_mode &= ~M_CHECK_STALE;
2934 rc = ll_inode_revalidate_fini(inode, rc);
2938 rc = ll_revalidate_it_finish(req, &oit, dentry);
2940 ll_intent_release(&oit);
2944 /* Unlinked? Unhash dentry, so it is not picked up later by
2945 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2946 here to preserve get_cwd functionality on 2.6.
2948 if (!dentry->d_inode->i_nlink)
2949 d_lustre_invalidate(dentry, 0);
2951 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID: only go to the wire if no MD lock covers @ibits */
2952 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2953 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2954 obd_valid valid = OBD_MD_FLGETATTR;
2955 struct md_op_data *op_data;
/* regular files also need striping EA back in the reply */
2958 if (S_ISREG(inode->i_mode)) {
2959 rc = ll_get_max_mdsize(sbi, &ealen);
2962 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2965 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2966 0, ealen, LUSTRE_OPC_ANY,
2968 if (IS_ERR(op_data))
2969 return PTR_ERR(op_data);
2971 op_data->op_valid = valid;
2972 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2973 * capa for this inode. Because we only keep capas of dirs
2975 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2976 ll_finish_md_op_data(op_data);
2978 rc = ll_inode_revalidate_fini(inode, rc);
/* refresh the in-core inode from the getattr reply */
2982 rc = ll_prep_inode(&inode, req, NULL, NULL);
2985 ptlrpc_req_finished(req);
/* Revalidate the inode and then refresh its size/times: non-regular
 * files take times straight from the lvb; regular files glimpse the
 * size from the OSTs unless an HSM restore is in progress (the MDT
 * already supplied the right size and getattr would block). */
2989 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2992 struct inode *inode = dentry->d_inode;
2995 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2999 /* if object isn't regular file, don't validate size */
3000 if (!S_ISREG(inode->i_mode)) {
3001 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3002 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3003 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3005 /* In case of restore, the MDT has the right size and has
3006 * already send it back without granting the layout lock,
3007 * inode is up-to-date so glimpse is useless.
3008 * Also to glimpse we need the layout, in case of a running
3009 * restore the MDT holds the layout lock so the glimpse will
3010 * block up to the end of restore (getattr will block)
3012 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3013 rc = ll_glimpse_size(inode);
/* Fill @stat from the (re)validated inode.  Revalidates UPDATE|LOOKUP
 * ibits first, then copies the generic attributes; under a 32-bit API
 * the inode number is rebuilt from the FID. */
3018 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3019 struct lookup_intent *it, struct kstat *stat)
3021 struct inode *inode = de->d_inode;
3022 struct ll_sb_info *sbi = ll_i2sbi(inode);
3023 struct ll_inode_info *lli = ll_i2info(inode);
3026 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3027 MDS_INODELOCK_LOOKUP);
3028 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3033 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland (or compat) gets a FID-derived 32-bit ino */
3034 if (ll_need_32bit_api(sbi))
3035 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3037 stat->ino = inode->i_ino;
3038 stat->mode = inode->i_mode;
3039 stat->nlink = inode->i_nlink;
3040 stat->uid = inode->i_uid;
3041 stat->gid = inode->i_gid;
3042 stat->rdev = inode->i_rdev;
3043 stat->atime = inode->i_atime;
3044 stat->mtime = inode->i_mtime;
3045 stat->ctime = inode->i_ctime;
3046 stat->blksize = 1 << inode->i_blkbits;
3048 stat->size = i_size_read(inode);
3049 stat->blocks = inode->i_blocks;
/* VFS ->getattr(): delegate to ll_getattr_it() with a fresh
 * IT_GETATTR intent. */
3053 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3055 struct lookup_intent it = { .it_op = IT_GETATTR };
3057 return ll_getattr_it(mnt, de, &it, stat);
/*
 * VFS ->fiemap(): marshal the kernel fiemap_extent_info into an
 * ll_user_fiemap buffer sized for fi_extents_max extents, run
 * ll_do_fiemap(), and copy flags/extents back to the caller.
 * NOTE(review): fi_extents_start copies assume a kernel-accessible
 * buffer here (memcpy, not copy_to_user) — confirm against caller.
 */
3060 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3061 __u64 start, __u64 len)
3065 struct ll_user_fiemap *fiemap;
3066 unsigned int extent_count = fieinfo->fi_extents_max;
3068 num_bytes = sizeof(*fiemap) + (extent_count *
3069 sizeof(struct ll_fiemap_extent));
3070 OBD_ALLOC_LARGE(fiemap, num_bytes);
3075 fiemap->fm_flags = fieinfo->fi_flags;
3076 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3077 fiemap->fm_start = start;
3078 fiemap->fm_length = len;
/* seed only the first extent from the caller's buffer */
3079 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3080 sizeof(struct ll_fiemap_extent));
3082 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* copy mapped extents and updated flags back out */
3084 fieinfo->fi_flags = fiemap->fm_flags;
3085 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3086 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3087 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3089 OBD_FREE_LARGE(fiemap, num_bytes);
/* VFS ->get_acl(): return a referenced copy of the cached POSIX ACL
 * under lli_lock; may be NULL if no ACL is cached. */
3093 struct posix_acl * ll_get_acl(struct inode *inode, int type)
3095 struct ll_inode_info *lli = ll_i2info(inode);
3096 struct posix_acl *acl = NULL;
3098 spin_lock(&lli->lli_lock);
3099 /* VFS' acl_permission_check->check_acl will release the refcount */
3100 acl = posix_acl_dup(lli->lli_posix_acl);
3101 spin_unlock(&lli->lli_lock);
/*
 * VFS ->permission(): revalidate the root inode on first touch (it is
 * not validated by lookup), route remote-client mounts through
 * lustre_check_remote_perm(), otherwise fall back to
 * generic_permission().  MAY_NOT_BLOCK requests bail out early on
 * kernels that define it (RCU-walk path cannot block).
 */
3107 int ll_inode_permission(struct inode *inode, int mask)
3111 #ifdef MAY_NOT_BLOCK
3112 if (mask & MAY_NOT_BLOCK)
3116 /* as root inode are NOT getting validated in lookup operation,
3117 * need to do it before permission check. */
3119 if (inode == inode->i_sb->s_root->d_inode) {
3120 struct lookup_intent it = { .it_op = IT_LOOKUP };
3122 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3123 MDS_INODELOCK_LOOKUP);
3128 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3129 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* remote-client mounts check permissions against the MDT */
3131 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3132 return lustre_check_remote_perm(inode, mask);
3134 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3135 rc = generic_permission(inode, mask);
3140 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock entries, so the VFS
 * falls back to purely local lock handling. */
3141 struct file_operations ll_file_operations = {
3142 .read = ll_file_read,
3143 .aio_read = ll_file_aio_read,
3144 .write = ll_file_write,
3145 .aio_write = ll_file_aio_write,
3146 .unlocked_ioctl = ll_file_ioctl,
3147 .open = ll_file_open,
3148 .release = ll_file_release,
3149 .mmap = ll_file_mmap,
3150 .llseek = ll_file_seek,
3151 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock": cluster-coherent flock/POSIX locks
 * via ll_file_flock(); otherwise identical to ll_file_operations. */
3156 struct file_operations ll_file_operations_flock = {
3157 .read = ll_file_read,
3158 .aio_read = ll_file_aio_read,
3159 .write = ll_file_write,
3160 .aio_write = ll_file_aio_write,
3161 .unlocked_ioctl = ll_file_ioctl,
3162 .open = ll_file_open,
3163 .release = ll_file_release,
3164 .mmap = ll_file_mmap,
3165 .llseek = ll_file_seek,
3166 .splice_read = ll_file_splice_read,
3169 .flock = ll_file_flock,
3170 .lock = ll_file_flock
3173 /* These are for -o noflock - to return ENOSYS on flock calls */
3174 struct file_operations ll_file_operations_noflock = {
3175 .read = ll_file_read,
3176 .aio_read = ll_file_aio_read,
3177 .write = ll_file_write,
3178 .aio_write = ll_file_aio_write,
3179 .unlocked_ioctl = ll_file_ioctl,
3180 .open = ll_file_open,
3181 .release = ll_file_release,
3182 .mmap = ll_file_mmap,
3183 .llseek = ll_file_seek,
3184 .splice_read = ll_file_splice_read,
/* both lock entry points route to the ENOSYS stub */
3187 .flock = ll_file_noflock,
3188 .lock = ll_file_noflock
/* inode_operations for regular files: attribute, xattr, fiemap and
 * ACL entry points implemented in this file / llite. */
3191 struct inode_operations ll_file_inode_operations = {
3192 .setattr = ll_setattr,
3193 .getattr = ll_getattr,
3194 .permission = ll_inode_permission,
3195 .setxattr = ll_setxattr,
3196 .getxattr = ll_getxattr,
3197 .listxattr = ll_listxattr,
3198 .removexattr = ll_removexattr,
3199 .fiemap = ll_fiemap,
3200 .get_acl = ll_get_acl,
3203 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by an rwsem. */
3204 static struct llioc_ctl_data {
3205 struct rw_semaphore ioc_sem;
3206 struct list_head ioc_head;
3208 __RWSEM_INITIALIZER(llioc.ioc_sem),
3209 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl commands it claims;
 * iocd_cmd[] is a flexible trailing array sized by iocd_count. */
3214 struct list_head iocd_list;
3215 unsigned int iocd_size;
3216 llioc_callback_t iocd_cb;
3217 unsigned int iocd_count;
3218 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for @count commands in @cmd[].
 * Returns an opaque cookie (used later by ll_iocontrol_unregister) or
 * NULL on bad arguments / allocation failure.
 */
3221 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3224 struct llioc_data *in_data = NULL;
3226 if (cb == NULL || cmd == NULL ||
3227 count > LLIOC_MAX_CMD || count < 0)
/* allocate header plus trailing cmd array in one chunk */
3230 size = sizeof(*in_data) + count * sizeof(unsigned int);
3231 OBD_ALLOC(in_data, size);
3232 if (in_data == NULL)
/* NOTE(review): memset only clears the header, not the cmd tail;
 * likely redundant if OBD_ALLOC zeroes — confirm */
3235 memset(in_data, 0, sizeof(*in_data));
3236 in_data->iocd_size = size;
3237 in_data->iocd_cb = cb;
3238 in_data->iocd_count = count;
3239 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write lock */
3241 down_write(&llioc.ioc_sem);
3242 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3243 up_write(&llioc.ioc_sem);
/* Unregister the handler identified by @magic (the cookie returned by
 * ll_iocontrol_register); warns if no matching registration exists. */
3248 void ll_iocontrol_unregister(void *magic)
3250 struct llioc_data *tmp;
3255 down_write(&llioc.ioc_sem);
3256 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3258 unsigned int size = tmp->iocd_size;
/* unlink and free the matching entry, dropping the lock first */
3260 list_del(&tmp->iocd_list);
3261 up_write(&llioc.ioc_sem);
3263 OBD_FREE(tmp, size);
3267 up_write(&llioc.ioc_sem);
3269 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3272 EXPORT_SYMBOL(ll_iocontrol_register);
3273 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl @cmd to registered dynamic handlers.
 * Walks the registry under the read lock; the first callback that
 * returns LLIOC_STOP ends the search and its rc is stored in *rcp.
 */
3275 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3276 unsigned int cmd, unsigned long arg, int *rcp)
3278 enum llioc_iter ret = LLIOC_CONT;
3279 struct llioc_data *data;
3280 int rc = -EINVAL, i;
3282 down_read(&llioc.ioc_sem);
3283 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
/* only invoke the callback if it registered this cmd */
3284 for (i = 0; i < data->iocd_count; i++) {
3285 if (cmd != data->iocd_cmd[i])
3288 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3292 if (ret == LLIOC_STOP)
3295 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration @conf into the inode's cl_object via
 * cl_conf_set().  For OBJECT_CONF_SET the layout lock is only allowed
 * to match after the layout is applied, so stale layouts are never
 * observed.
 */
3302 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3304 struct ll_inode_info *lli = ll_i2info(inode);
3305 struct cl_env_nest nest;
/* nothing to configure without a cl_object */
3309 if (lli->lli_clob == NULL)
3312 env = cl_env_nested_get(&nest);
3314 return PTR_ERR(env);
3316 result = cl_conf_set(env, lli->lli_clob, conf);
3317 cl_env_nested_put(&nest, env);
3319 if (conf->coc_opc == OBJECT_CONF_SET) {
3320 struct ldlm_lock *lock = conf->coc_lock;
3322 LASSERT(lock != NULL);
3323 LASSERT(ldlm_has_layout(lock));
3325 /* it can only be allowed to match after layout is
3326 * applied to inode otherwise false layout would be
3327 * seen. Applying layout should happen before dropping
3328 * the intent lock. */
3329 ldlm_lock_allow_match(lock);
3335 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3336 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3339 struct ll_sb_info *sbi = ll_i2sbi(inode);
3340 struct obd_capa *oc;
3341 struct ptlrpc_request *req;
3342 struct mdt_body *body;
3348 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3349 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3350 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated and ready: nothing to fetch */
3352 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3355 /* if layout lock was granted right away, the layout is returned
3356 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3357 * blocked and then granted via completion ast, we have to fetch
3358 * layout here. Please note that we can't use the LVB buffer in
3359 * completion AST because it doesn't have a large enough buffer */
3360 oc = ll_mdscapa_get(inode);
3361 rc = ll_get_max_mdsize(sbi, &lmmsize);
/* pull the LOV EA (the layout) from the MDT by FID */
3363 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3364 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3370 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* sanity check: reply EA must fit within the negotiated max size */
3371 if (body == NULL || body->eadatasize > lmmsize)
3372 GOTO(out, rc = -EPROTO);
3374 lmmsize = body->eadatasize;
3375 if (lmmsize == 0) /* empty layout */
3378 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3380 GOTO(out, rc = -EFAULT);
/* copy the EA into a private buffer and install it as the lock's
 * LVB under the resource lock, replacing any previous buffer */
3382 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3383 if (lvbdata == NULL)
3384 GOTO(out, rc = -ENOMEM);
3386 memcpy(lvbdata, lmm, lmmsize);
3387 lock_res_and_lock(lock);
3388 if (lock->l_lvb_data != NULL)
3389 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3391 lock->l_lvb_data = lvbdata;
3392 lock->l_lvb_len = lmmsize;
3393 unlock_res_and_lock(lock);
3396 ptlrpc_req_finished(req);
3401 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Unpack the layout carried in the lock's LVB, configure the inode's
 * cl_object with it and return the layout generation in *gen.  The
 * lock reference taken by the caller is dropped before returning; if
 * the cl_object is still busy (-EBUSY) the function waits for in-flight
 * IO via an OBJECT_CONF_WAIT.
 * NOTE(review): lines are elided in this excerpt (e.g. the LVB-ready
 * early-return body, final RETURN).
 */
3404 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3405 struct inode *inode, __u32 *gen, bool reconf)
3407 struct ll_inode_info *lli = ll_i2info(inode);
3408 struct ll_sb_info *sbi = ll_i2sbi(inode);
3409 struct ldlm_lock *lock;
3410 struct lustre_md md = { NULL };
3411 struct cl_object_conf conf;
3414 bool wait_layout = false;
3416 LASSERT(lustre_handle_is_used(lockh));
3418 lock = ldlm_handle2lock(lockh);
3419 LASSERT(lock != NULL);
3420 LASSERT(ldlm_has_layout(lock));
3422 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3423 inode, PFID(&lli->lli_fid), reconf);
3425 /* in case this is a caching lock and reinstate with new inode */
3426 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3428 lock_res_and_lock(lock);
3429 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3430 unlock_res_and_lock(lock);
3431 /* checking lvb_ready is racy but this is okay. The worst case is
3432 * that multi processes may configure the file on the same time. */
3433 if (lvb_ready || !reconf) {
3436 /* layout_gen must be valid if layout lock is not
3437 * cancelled and stripe has already set */
3438 *gen = lli->lli_layout_gen;
/* make sure the layout EA is present in the lock's LVB */
3444 rc = ll_layout_fetch(inode, lock);
3448 /* for layout lock, lmm is returned in lock's lvb.
3449 * lvb_data is immutable if the lock is held so it's safe to access it
3450 * without res lock. See the description in ldlm_lock_decref_internal()
3451 * for the condition to free lvb_data of layout lock */
3452 if (lock->l_lvb_data != NULL) {
3453 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3454 lock->l_lvb_data, lock->l_lvb_len);
/* no stripe md means an empty layout */
3456 *gen = LL_LAYOUT_GEN_EMPTY;
3458 *gen = md.lsm->lsm_layout_gen;
3461 CERROR("%s: file "DFID" unpackmd error: %d\n",
3462 ll_get_fsname(inode->i_sb, NULL, 0),
3463 PFID(&lli->lli_fid), rc);
3469 /* set layout to file. Unlikely this will fail as old layout was
3470 * surely eliminated */
3471 memset(&conf, 0, sizeof(conf));
3472 conf.coc_opc = OBJECT_CONF_SET;
3473 conf.coc_inode = inode;
3474 conf.coc_lock = lock;
3475 conf.u.coc_md = &md;
3476 rc = ll_layout_conf(inode, &conf);
/* the unpacked lsm was only needed for the conf set */
3479 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3481 /* refresh layout failed, need to wait */
3482 wait_layout = rc == -EBUSY;
/* drop the caller's lock reference before any waiting */
3485 LDLM_LOCK_PUT(lock);
3486 ldlm_lock_decref(lockh, mode);
3488 /* wait for IO to complete if it's still being used. */
3490 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3491 ll_get_fsname(inode->i_sb, NULL, 0),
3492 inode, PFID(&lli->lli_fid));
3494 memset(&conf, 0, sizeof(conf));
3495 conf.coc_opc = OBJECT_CONF_WAIT;
3496 conf.coc_inode = inode;
3497 rc = ll_layout_conf(inode, &conf);
3501 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3502 PFID(&lli->lli_fid), rc);
3508 * This function checks if there exists a LAYOUT lock on the client side,
3509 * or enqueues it if it doesn't have one in cache.
3511 * This function will not hold layout lock so it may be revoked any time after
3512 * this function returns. Any operations depend on layout should be redone
3515 * This function should be called before lov_io_init() to get an uptodate
3516 * layout version, the caller should save the version number and after IO
3517 * is finished, this function should be called again to verify that layout
3518 * is not changed during IO time.
/*
 * See the description above: ensure a LAYOUT lock is cached and return
 * the current layout generation in *gen.  Fast path matches a cached
 * lock; slow path serializes on lli_layout_mutex and enqueues an
 * IT_LAYOUT intent to the MDT.
 * NOTE(review): lines are elided in this excerpt (e.g. loop/retry
 * control flow and the md_enqueue trailing arguments).
 */
3520 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3522 struct ll_inode_info *lli = ll_i2info(inode);
3523 struct ll_sb_info *sbi = ll_i2sbi(inode);
3524 struct md_op_data *op_data;
3525 struct lookup_intent it;
3526 struct lustre_handle lockh;
3528 struct ldlm_enqueue_info einfo = {
3529 .ei_type = LDLM_IBITS,
3531 .ei_cb_bl = ll_md_blocking_ast,
3532 .ei_cb_cp = ldlm_completion_ast,
/* layout locks disabled: just report the current generation */
3536 *gen = lli->lli_layout_gen;
3537 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3541 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3542 LASSERT(S_ISREG(inode->i_mode));
3544 /* mostly layout lock is caching on the local side, so try to match
3545 * it before grabbing layout lock mutex. */
3546 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3547 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3548 if (mode != 0) { /* hit cached lock */
3549 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3553 /* better hold lli_layout_mutex to try again otherwise
3554 * it will have starvation problem. */
3557 /* take layout lock mutex to enqueue layout lock exclusively. */
3558 mutex_lock(&lli->lli_layout_mutex);
3561 /* try again. Maybe somebody else has done this. */
3562 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3563 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3564 if (mode != 0) { /* hit cached lock */
3565 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3569 mutex_unlock(&lli->lli_layout_mutex);
3573 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3574 0, 0, LUSTRE_OPC_ANY, NULL);
3575 if (IS_ERR(op_data)) {
3576 mutex_unlock(&lli->lli_layout_mutex);
3577 return PTR_ERR(op_data);
3580 /* have to enqueue one */
3581 memset(&it, 0, sizeof(it));
3582 it.it_op = IT_LAYOUT;
3583 lockh.cookie = 0ULL;
3585 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3586 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3587 PFID(&lli->lli_fid));
3589 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* the intent request is no longer needed once the lock is back */
3591 if (it.d.lustre.it_data != NULL)
3592 ptlrpc_req_finished(it.d.lustre.it_data);
3593 it.d.lustre.it_data = NULL;
3595 ll_finish_md_op_data(op_data);
/* take ownership of the granted mode out of the intent */
3597 mode = it.d.lustre.it_lock_mode;
3598 it.d.lustre.it_lock_mode = 0;
3599 ll_intent_drop_lock(&it);
3602 /* set lock data in case this is a new lock */
3603 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3604 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3608 mutex_unlock(&lli->lli_layout_mutex);
3614 * This function send a restore request to the MDT
3616 int ll_layout_restore(struct inode *inode)
3618 struct hsm_user_request *hur;
3621 len = sizeof(struct hsm_user_request) +
3622 sizeof(struct hsm_user_item);
3623 OBD_ALLOC(hur, len);
3627 hur->hur_request.hr_action = HUA_RESTORE;
3628 hur->hur_request.hr_archive_id = 0;
3629 hur->hur_request.hr_flags = 0;
3630 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3631 sizeof(hur->hur_user_item[0].hui_fid));
3632 hur->hur_user_item[0].hui_extent.length = -1;
3633 hur->hur_request.hr_itemcount = 1;
3634 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,