]> git.karo-electronics.de Git - karo-tx-linux.git/blob - fs/ext4/inode.c
Merge tag 'ktest-v3.6' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux...
[karo-tx-linux.git] / fs / ext4 / inode.c
1 /*
2  *  linux/fs/ext4/inode.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/inode.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  64-bit file support on 64-bit platforms by Jakub Jelinek
16  *      (jj@sunsite.ms.mff.cuni.cz)
17  *
18  *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
19  */
20
21 #include <linux/fs.h>
22 #include <linux/time.h>
23 #include <linux/jbd2.h>
24 #include <linux/highuid.h>
25 #include <linux/pagemap.h>
26 #include <linux/quotaops.h>
27 #include <linux/string.h>
28 #include <linux/buffer_head.h>
29 #include <linux/writeback.h>
30 #include <linux/pagevec.h>
31 #include <linux/mpage.h>
32 #include <linux/namei.h>
33 #include <linux/uio.h>
34 #include <linux/bio.h>
35 #include <linux/workqueue.h>
36 #include <linux/kernel.h>
37 #include <linux/printk.h>
38 #include <linux/slab.h>
39 #include <linux/ratelimit.h>
40
41 #include "ext4_jbd2.h"
42 #include "xattr.h"
43 #include "acl.h"
44 #include "truncate.h"
45
46 #include <trace/events/ext4.h>
47
48 #define MPAGE_DA_EXTENT_TAIL 0x01
49
50 static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
51                               struct ext4_inode_info *ei)
52 {
53         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
54         __u16 csum_lo;
55         __u16 csum_hi = 0;
56         __u32 csum;
57
58         csum_lo = raw->i_checksum_lo;
59         raw->i_checksum_lo = 0;
60         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
61             EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
62                 csum_hi = raw->i_checksum_hi;
63                 raw->i_checksum_hi = 0;
64         }
65
66         csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
67                            EXT4_INODE_SIZE(inode->i_sb));
68
69         raw->i_checksum_lo = csum_lo;
70         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
71             EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
72                 raw->i_checksum_hi = csum_hi;
73
74         return csum;
75 }
76
77 static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
78                                   struct ext4_inode_info *ei)
79 {
80         __u32 provided, calculated;
81
82         if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
83             cpu_to_le32(EXT4_OS_LINUX) ||
84             !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
85                 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
86                 return 1;
87
88         provided = le16_to_cpu(raw->i_checksum_lo);
89         calculated = ext4_inode_csum(inode, raw, ei);
90         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
91             EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
92                 provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
93         else
94                 calculated &= 0xFFFF;
95
96         return provided == calculated;
97 }
98
99 static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
100                                 struct ext4_inode_info *ei)
101 {
102         __u32 csum;
103
104         if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
105             cpu_to_le32(EXT4_OS_LINUX) ||
106             !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
107                 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
108                 return;
109
110         csum = ext4_inode_csum(inode, raw, ei);
111         raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
112         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
113             EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
114                 raw->i_checksum_hi = cpu_to_le16(csum >> 16);
115 }
116
117 static inline int ext4_begin_ordered_truncate(struct inode *inode,
118                                               loff_t new_size)
119 {
120         trace_ext4_begin_ordered_truncate(inode, new_size);
121         /*
122          * If jinode is zero, then we never opened the file for
123          * writing, so there's no need to call
124          * jbd2_journal_begin_ordered_truncate() since there's no
125          * outstanding writes we need to flush.
126          */
127         if (!EXT4_I(inode)->jinode)
128                 return 0;
129         return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
130                                                    EXT4_I(inode)->jinode,
131                                                    new_size);
132 }
133
/*
 * Forward declarations of file-local (static) helpers that are referenced
 * before their definitions.
 */
static void ext4_invalidatepage(struct page *page, unsigned long offset);
static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create);
static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
                struct inode *inode, struct page *page, loff_t from,
                loff_t length, int flags);
144
145 /*
146  * Test whether an inode is a fast symlink.
147  */
148 static int ext4_inode_is_fast_symlink(struct inode *inode)
149 {
150         int ea_blocks = EXT4_I(inode)->i_file_acl ?
151                 (inode->i_sb->s_blocksize >> 9) : 0;
152
153         return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
154 }
155
156 /*
157  * Restart the transaction associated with *handle.  This does a commit,
158  * so before we call here everything must be consistently dirtied against
159  * this transaction.
160  */
161 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
162                                  int nblocks)
163 {
164         int ret;
165
166         /*
167          * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
168          * moment, get_block can be called only for blocks inside i_size since
169          * page cache has been already dropped and writes are blocked by
170          * i_mutex. So we can safely drop the i_data_sem here.
171          */
172         BUG_ON(EXT4_JOURNAL(inode) == NULL);
173         jbd_debug(2, "restarting handle %p\n", handle);
174         up_write(&EXT4_I(inode)->i_data_sem);
175         ret = ext4_journal_restart(handle, nblocks);
176         down_write(&EXT4_I(inode)->i_data_sem);
177         ext4_discard_preallocations(inode);
178
179         return ret;
180 }
181
/*
 * Evict an inode.
 *
 * If the inode still has links (i_nlink != 0) only its page cache is
 * dropped and the in-core inode is cleared.  If it has no links, the file
 * is truncated to zero, removed from the orphan list, stamped with a
 * deletion time and freed on disk.  Every failure path falls back to
 * ext4_clear_inode() so the in-core inode is always cleared.
 */
void ext4_evict_inode(struct inode *inode)
{
        handle_t *handle;
        int err;

        trace_ext4_evict_inode(inode);

        ext4_ioend_wait(inode);

        if (inode->i_nlink) {
                /*
                 * When journalling data dirty buffers are tracked only in the
                 * journal. So although mm thinks everything is clean and
                 * ready for reaping the inode might still have some pages to
                 * write in the running transaction or waiting to be
                 * checkpointed. Thus calling jbd2_journal_invalidatepage()
                 * (via truncate_inode_pages()) to discard these buffers can
                 * cause data loss. Also even if we did not discard these
                 * buffers, we would have no way to find them after the inode
                 * is reaped and thus user could see stale data if he tries to
                 * read them before the transaction is checkpointed. So be
                 * careful and force everything to disk here... We use
                 * ei->i_datasync_tid to store the newest transaction
                 * containing inode's data.
                 *
                 * Note that directories do not have this problem because they
                 * don't use page cache.
                 */
                if (ext4_should_journal_data(inode) &&
                    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
                        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
                        tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;

                        jbd2_log_start_commit(journal, commit_tid);
                        jbd2_log_wait_commit(journal, commit_tid);
                        filemap_write_and_wait(&inode->i_data);
                }
                truncate_inode_pages(&inode->i_data, 0);
                goto no_delete;
        }

        /* From here on the inode has no links left. */
        if (!is_bad_inode(inode))
                dquot_initialize(inode);

        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages(&inode->i_data, 0);

        /* A bad inode gets its pages dropped but no on-disk cleanup. */
        if (is_bad_inode(inode))
                goto no_delete;

        handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
                 * If we're going to skip the normal cleanup, we still need to
                 * make sure that the in-core orphan linked list is properly
                 * cleaned up.
                 */
                ext4_orphan_del(NULL, inode);
                goto no_delete;
        }

        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
        inode->i_size = 0;
        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
                ext4_warning(inode->i_sb,
                             "couldn't mark inode dirty (err %d)", err);
                goto stop_handle;
        }
        if (inode->i_blocks)
                ext4_truncate(inode);

        /*
         * ext4_ext_truncate() doesn't reserve any slop when it
         * restarts journal transactions; therefore there may not be
         * enough credits left in the handle to remove the inode from
         * the orphan list and set the dtime field.
         */
        if (!ext4_handle_has_enough_credits(handle, 3)) {
                /* Try to extend first; a positive result means "restart". */
                err = ext4_journal_extend(handle, 3);
                if (err > 0)
                        err = ext4_journal_restart(handle, 3);
                if (err != 0) {
                        ext4_warning(inode->i_sb,
                                     "couldn't extend journal (err %d)", err);
                /*
                 * Shared error exit, also reached by goto from the
                 * mark-dirty failure above: stop the handle and drop the
                 * in-core orphan entry.
                 */
                stop_handle:
                        ext4_journal_stop(handle);
                        ext4_orphan_del(NULL, inode);
                        goto no_delete;
                }
        }

        /*
         * Kill off the orphan record which ext4_truncate created.
         * AKPM: I think this can be inside the above `if'.
         * Note that ext4_orphan_del() has to be able to cope with the
         * deletion of a non-existent orphan - this is because we don't
         * know if ext4_truncate() actually created an orphan record.
         * (Well, we could do this if we need to, but heck - it works)
         */
        ext4_orphan_del(handle, inode);
        EXT4_I(inode)->i_dtime  = get_seconds();

        /*
         * One subtle ordering requirement: if anything has gone wrong
         * (transaction abort, IO errors, whatever), then we can still
         * do these next steps (the fs will already have been marked as
         * having errors), but we can't free the inode if the mark_dirty
         * fails.
         */
        if (ext4_mark_inode_dirty(handle, inode))
                /* If that failed, just do the required in-core inode clear. */
                ext4_clear_inode(inode);
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
        return;
no_delete:
        ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
}
308
#ifdef CONFIG_QUOTA
/*
 * Return a pointer to the inode's i_reserved_quota counter.
 */
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        return &ei->i_reserved_quota;
}
#endif
315
316 /*
317  * Calculate the number of metadata blocks need to reserve
318  * to allocate a block located at @lblock
319  */
320 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
321 {
322         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
323                 return ext4_ext_calc_metadata_amount(inode, lblock);
324
325         return ext4_ind_calc_metadata_amount(inode, lblock);
326 }
327
328 /*
329  * Called with i_data_sem down, which is important since we can call
330  * ext4_discard_preallocations() from here.
331  */
332 void ext4_da_update_reserve_space(struct inode *inode,
333                                         int used, int quota_claim)
334 {
335         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
336         struct ext4_inode_info *ei = EXT4_I(inode);
337
338         spin_lock(&ei->i_block_reservation_lock);
339         trace_ext4_da_update_reserve_space(inode, used, quota_claim);
340         if (unlikely(used > ei->i_reserved_data_blocks)) {
341                 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
342                          "with only %d reserved data blocks",
343                          __func__, inode->i_ino, used,
344                          ei->i_reserved_data_blocks);
345                 WARN_ON(1);
346                 used = ei->i_reserved_data_blocks;
347         }
348
349         if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
350                 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d "
351                          "with only %d reserved metadata blocks\n", __func__,
352                          inode->i_ino, ei->i_allocated_meta_blocks,
353                          ei->i_reserved_meta_blocks);
354                 WARN_ON(1);
355                 ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
356         }
357
358         /* Update per-inode reservations */
359         ei->i_reserved_data_blocks -= used;
360         ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
361         percpu_counter_sub(&sbi->s_dirtyclusters_counter,
362                            used + ei->i_allocated_meta_blocks);
363         ei->i_allocated_meta_blocks = 0;
364
365         if (ei->i_reserved_data_blocks == 0) {
366                 /*
367                  * We can release all of the reserved metadata blocks
368                  * only when we have written all of the delayed
369                  * allocation blocks.
370                  */
371                 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
372                                    ei->i_reserved_meta_blocks);
373                 ei->i_reserved_meta_blocks = 0;
374                 ei->i_da_metadata_calc_len = 0;
375         }
376         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
377
378         /* Update quota subsystem for data blocks */
379         if (quota_claim)
380                 dquot_claim_block(inode, EXT4_C2B(sbi, used));
381         else {
382                 /*
383                  * We did fallocate with an offset that is already delayed
384                  * allocated. So on delayed allocated writeback we should
385                  * not re-claim the quota for fallocated blocks.
386                  */
387                 dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
388         }
389
390         /*
391          * If we have done all the pending block allocations and if
392          * there aren't any writers on the inode, we can discard the
393          * inode's preallocations.
394          */
395         if ((ei->i_reserved_data_blocks == 0) &&
396             (atomic_read(&inode->i_writecount) == 0))
397                 ext4_discard_preallocations(inode);
398 }
399
400 static int __check_block_validity(struct inode *inode, const char *func,
401                                 unsigned int line,
402                                 struct ext4_map_blocks *map)
403 {
404         if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
405                                    map->m_len)) {
406                 ext4_error_inode(inode, func, line, map->m_pblk,
407                                  "lblock %lu mapped to illegal pblock "
408                                  "(length %d)", (unsigned long) map->m_lblk,
409                                  map->m_len);
410                 return -EIO;
411         }
412         return 0;
413 }
414
415 #define check_block_validity(inode, map)        \
416         __check_block_validity((inode), __func__, __LINE__, (map))
417
/*
 * Return the number of contiguous dirty pages in a given inode
 * starting at page frame idx.
 *
 * At most @max_pages pages are counted.  The run ends at the first page
 * that is no longer in this mapping, is not dirty, is under writeback,
 * is not at the expected index, or has a buffer that is neither delayed
 * nor unwritten.
 */
static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
                                    unsigned int max_pages)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t index;
        struct pagevec pvec;
        pgoff_t num = 0;
        int i, nr_pages, done = 0;

        if (max_pages == 0)
                return 0;
        pagevec_init(&pvec, 0);
        while (!done) {
                /* Each lookup restarts at the next page we expect to see. */
                index = idx;
                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
                                              PAGECACHE_TAG_DIRTY,
                                              (pgoff_t)PAGEVEC_SIZE);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
                        struct buffer_head *bh, *head;

                        lock_page(page);
                        /*
                         * page->index != idx means there is a gap, so the
                         * contiguous run ends here.
                         */
                        if (unlikely(page->mapping != mapping) ||
                            !PageDirty(page) ||
                            PageWriteback(page) ||
                            page->index != idx) {
                                done = 1;
                                unlock_page(page);
                                break;
                        }
                        if (page_has_buffers(page)) {
                                bh = head = page_buffers(page);
                                /*
                                 * Stop at the first buffer that is neither
                                 * delayed nor unwritten; this page is then
                                 * not counted.
                                 */
                                do {
                                        if (!buffer_delay(bh) &&
                                            !buffer_unwritten(bh))
                                                done = 1;
                                        bh = bh->b_this_page;
                                } while (!done && (bh != head));
                        }
                        unlock_page(page);
                        if (done)
                                break;
                        idx++;
                        num++;
                        if (num >= max_pages) {
                                done = 1;
                                break;
                        }
                }
                /* Reached on every exit from the for loop above. */
                pagevec_release(&pvec);
        }
        return num;
}
477
/*
 * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
 *
 * The logical block range [m_lblk, m_lblk + m_len - 1] is converted to a
 * page index range and every buffer head of each page found is marked,
 * not just the buffers that fall inside the mapped range.
 */
static void set_buffers_da_mapped(struct inode *inode,
                                   struct ext4_map_blocks *map)
{
        struct address_space *mapping = inode->i_mapping;
        struct pagevec pvec;
        int i, nr_pages;
        pgoff_t index, end;

        /* Convert the first and last logical block to page cache indices. */
        index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
        end = (map->m_lblk + map->m_len - 1) >>
                (PAGE_CACHE_SHIFT - inode->i_blkbits);

        pagevec_init(&pvec, 0);
        while (index <= end) {
                nr_pages = pagevec_lookup(&pvec, mapping, index,
                                          min(end - index + 1,
                                              (pgoff_t)PAGEVEC_SIZE));
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
                        struct buffer_head *bh, *head;

                        /*
                         * NOTE(review): this break leaves `index' unchanged,
                         * so the outer loop looks up the same index again.
                         * Confirm this cannot spin if the page stays clean.
                         */
                        if (unlikely(page->mapping != mapping) ||
                            !PageDirty(page))
                                break;

                        if (page_has_buffers(page)) {
                                bh = head = page_buffers(page);
                                do {
                                        set_buffer_da_mapped(bh);
                                        bh = bh->b_this_page;
                                } while (bh != head);
                        }
                        /*
                         * NOTE(review): advancing by one assumes the pages
                         * returned are contiguous and within `end';
                         * page->index is never checked - TODO confirm.
                         */
                        index++;
                }
                pagevec_release(&pvec);
        }
}
520
/*
 * The ext4_map_blocks() function first tries to look up the requested
 * blocks under the read lock of i_data_sem (unless the caller passed
 * EXT4_GET_BLOCKS_NO_LOCK) and returns if they are already mapped or if
 * the caller did not ask for allocation.
 *
 * Otherwise it takes the write lock of i_data_sem, allocates blocks and
 * records the result in @map.
 *
 * If the file is extent based it calls ext4_ext_map_blocks(), otherwise
 * ext4_ind_map_blocks() to handle indirect-mapped files.
 *
 * On success, it returns the number of blocks mapped or allocated.
 * If create == 0 and the blocks are preallocated but uninitialized,
 * the result is left unmapped.  If create == 1, it makes sure the result
 * is mapped.
 *
 * It returns 0 if a plain lookup failed (blocks have not been allocated);
 * in that case the result is unmapped.
 *
 * It returns a negative error in case of allocation failure, or -EIO if
 * the mapped physical range fails check_block_validity().
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
                    struct ext4_map_blocks *map, int flags)
{
        int retval;

        map->m_flags = 0;
        ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
                  "logical block %lu\n", inode->i_ino, flags, map->m_len,
                  (unsigned long) map->m_lblk);
        /*
         * Try to see if we can get the block without requesting a new
         * file system block.  Only EXT4_GET_BLOCKS_KEEP_SIZE is passed
         * through to this lookup; all other flags are masked off.
         */
        if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
                down_read((&EXT4_I(inode)->i_data_sem));
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, flags &
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
        } else {
                retval = ext4_ind_map_blocks(handle, inode, map, flags &
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
        }
        if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
                up_read((&EXT4_I(inode)->i_data_sem));

        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                int ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }

        /* If it is only a block(s) look up */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
                return retval;

        /*
         * Return if the blocks have already been allocated.
         *
         * Note that if blocks have been preallocated
         * ext4_ext_get_block() returns the create = 0
         * with buffer head unmapped.
         */
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
                return retval;

        /*
         * When we call get_blocks without the create flag, the
         * BH_Unwritten flag could have gotten set if the blocks
         * requested were part of a uninitialized extent.  We need to
         * clear this flag now that we are committed to convert all or
         * part of the uninitialized extent to be an initialized
         * extent.  This is because we need to avoid the combination
         * of BH_Unwritten and BH_Mapped flags being simultaneously
         * set on the buffer_head.
         */
        map->m_flags &= ~EXT4_MAP_UNWRITTEN;

        /*
         * Allocating new blocks and/or writing to an uninitialized extent
         * will possibly result in updating i_data, so we take
         * the write lock of i_data_sem, and call get_blocks()
         * with create == 1 flag.
         */
        down_write((&EXT4_I(inode)->i_data_sem));

        /*
         * If the caller is from the delayed allocation writeout path
         * we have already reserved fs blocks for allocation;
         * let the underlying get_block() function know to
         * avoid double accounting.
         */
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
        /*
         * We need to check for EXT4 here because migrate
         * could have changed the inode type in between
         */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, flags);
        } else {
                retval = ext4_ind_map_blocks(handle, inode, map, flags);

                if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
                        /*
                         * We allocated new blocks which will result in
                         * i_data's format changing.  Force the migrate
                         * to fail by clearing migrate flags
                         */
                        ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
                }

                /*
                 * Update reserved blocks/metadata blocks after successful
                 * block allocation which had been deferred till now. We don't
                 * support fallocate for non extent files. So we can update
                 * reserve space here.
                 */
                if ((retval > 0) &&
                        (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
                        ext4_da_update_reserve_space(inode, retval, 1);
        }
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);

                /* If we have successfully mapped the delayed allocated blocks,
                 * set the BH_Da_Mapped bit on them. It's important to do this
                 * under the protection of i_data_sem.
                 */
                if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
                        set_buffers_da_mapped(inode, map);
        }

        up_write((&EXT4_I(inode)->i_data_sem));
        /* Re-validate: the allocation path may have produced a new mapping. */
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                int ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }
        return retval;
}
663
664 /* Maximum number of blocks we map for direct IO at once. */
665 #define DIO_MAX_BLOCKS 4096
666
667 static int _ext4_get_block(struct inode *inode, sector_t iblock,
668                            struct buffer_head *bh, int flags)
669 {
670         handle_t *handle = ext4_journal_current_handle();
671         struct ext4_map_blocks map;
672         int ret = 0, started = 0;
673         int dio_credits;
674
675         map.m_lblk = iblock;
676         map.m_len = bh->b_size >> inode->i_blkbits;
677
678         if (flags && !handle) {
679                 /* Direct IO write... */
680                 if (map.m_len > DIO_MAX_BLOCKS)
681                         map.m_len = DIO_MAX_BLOCKS;
682                 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
683                 handle = ext4_journal_start(inode, dio_credits);
684                 if (IS_ERR(handle)) {
685                         ret = PTR_ERR(handle);
686                         return ret;
687                 }
688                 started = 1;
689         }
690
691         ret = ext4_map_blocks(handle, inode, &map, flags);
692         if (ret > 0) {
693                 map_bh(bh, inode->i_sb, map.m_pblk);
694                 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
695                 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
696                 ret = 0;
697         }
698         if (started)
699                 ext4_journal_stop(handle);
700         return ret;
701 }
702
703 int ext4_get_block(struct inode *inode, sector_t iblock,
704                    struct buffer_head *bh, int create)
705 {
706         return _ext4_get_block(inode, iblock, bh,
707                                create ? EXT4_GET_BLOCKS_CREATE : 0);
708 }
709
710 /*
711  * `handle' can be NULL if create is zero
712  */
713 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
714                                 ext4_lblk_t block, int create, int *errp)
715 {
716         struct ext4_map_blocks map;
717         struct buffer_head *bh;
718         int fatal = 0, err;
719
720         J_ASSERT(handle != NULL || create == 0);
721
722         map.m_lblk = block;
723         map.m_len = 1;
724         err = ext4_map_blocks(handle, inode, &map,
725                               create ? EXT4_GET_BLOCKS_CREATE : 0);
726
727         if (err < 0)
728                 *errp = err;
729         if (err <= 0)
730                 return NULL;
731         *errp = 0;
732
733         bh = sb_getblk(inode->i_sb, map.m_pblk);
734         if (!bh) {
735                 *errp = -EIO;
736                 return NULL;
737         }
738         if (map.m_flags & EXT4_MAP_NEW) {
739                 J_ASSERT(create != 0);
740                 J_ASSERT(handle != NULL);
741
742                 /*
743                  * Now that we do not always journal data, we should
744                  * keep in mind whether this should always journal the
745                  * new buffer as metadata.  For now, regular file
746                  * writes use ext4_get_block instead, so it's not a
747                  * problem.
748                  */
749                 lock_buffer(bh);
750                 BUFFER_TRACE(bh, "call get_create_access");
751                 fatal = ext4_journal_get_create_access(handle, bh);
752                 if (!fatal && !buffer_uptodate(bh)) {
753                         memset(bh->b_data, 0, inode->i_sb->s_blocksize);
754                         set_buffer_uptodate(bh);
755                 }
756                 unlock_buffer(bh);
757                 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
758                 err = ext4_handle_dirty_metadata(handle, inode, bh);
759                 if (!fatal)
760                         fatal = err;
761         } else {
762                 BUFFER_TRACE(bh, "not a new buffer");
763         }
764         if (fatal) {
765                 *errp = fatal;
766                 brelse(bh);
767                 bh = NULL;
768         }
769         return bh;
770 }
771
772 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
773                                ext4_lblk_t block, int create, int *err)
774 {
775         struct buffer_head *bh;
776
777         bh = ext4_getblk(handle, inode, block, create, err);
778         if (!bh)
779                 return bh;
780         if (buffer_uptodate(bh))
781                 return bh;
782         ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
783         wait_on_buffer(bh);
784         if (buffer_uptodate(bh))
785                 return bh;
786         put_bh(bh);
787         *err = -EIO;
788         return NULL;
789 }
790
791 static int walk_page_buffers(handle_t *handle,
792                              struct buffer_head *head,
793                              unsigned from,
794                              unsigned to,
795                              int *partial,
796                              int (*fn)(handle_t *handle,
797                                        struct buffer_head *bh))
798 {
799         struct buffer_head *bh;
800         unsigned block_start, block_end;
801         unsigned blocksize = head->b_size;
802         int err, ret = 0;
803         struct buffer_head *next;
804
805         for (bh = head, block_start = 0;
806              ret == 0 && (bh != head || !block_start);
807              block_start = block_end, bh = next) {
808                 next = bh->b_this_page;
809                 block_end = block_start + blocksize;
810                 if (block_end <= from || block_start >= to) {
811                         if (partial && !buffer_uptodate(bh))
812                                 *partial = 1;
813                         continue;
814                 }
815                 err = (*fn)(handle, bh);
816                 if (!ret)
817                         ret = err;
818         }
819         return ret;
820 }
821
822 /*
823  * To preserve ordering, it is essential that the hole instantiation and
824  * the data write be encapsulated in a single transaction.  We cannot
825  * close off a transaction and start a new one between the ext4_get_block()
826  * and the commit_write().  So doing the jbd2_journal_start at the start of
827  * prepare_write() is the right place.
828  *
829  * Also, this function can nest inside ext4_writepage() ->
830  * block_write_full_page(). In that case, we *know* that ext4_writepage()
831  * has generated enough buffer credits to do the whole page.  So we won't
832  * block on the journal in that case, which is good, because the caller may
833  * be PF_MEMALLOC.
834  *
835  * By accident, ext4 can be reentered when a transaction is open via
836  * quota file writes.  If we were to commit the transaction while thus
837  * reentered, there can be a deadlock - we would be holding a quota
838  * lock, and the commit would never complete if another thread had a
839  * transaction open and was blocking on the quota lock - a ranking
840  * violation.
841  *
842  * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
843  * will _not_ run commit under these circumstances because handle->h_ref
844  * is elevated.  We'll still have enough credits for the tiny quotafile
845  * write.
846  */
847 static int do_journal_get_write_access(handle_t *handle,
848                                        struct buffer_head *bh)
849 {
850         int dirty = buffer_dirty(bh);
851         int ret;
852
853         if (!buffer_mapped(bh) || buffer_freed(bh))
854                 return 0;
855         /*
856          * __block_write_begin() could have dirtied some buffers. Clean
857          * the dirty bit as jbd2_journal_get_write_access() could complain
858          * otherwise about fs integrity issues. Setting of the dirty bit
859          * by __block_write_begin() isn't a real problem here as we clear
860          * the bit before releasing a page lock and thus writeback cannot
861          * ever write the buffer.
862          */
863         if (dirty)
864                 clear_buffer_dirty(bh);
865         ret = ext4_journal_get_write_access(handle, bh);
866         if (!ret && dirty)
867                 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
868         return ret;
869 }
870
871 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
872                    struct buffer_head *bh_result, int create);
873 static int ext4_write_begin(struct file *file, struct address_space *mapping,
874                             loff_t pos, unsigned len, unsigned flags,
875                             struct page **pagep, void **fsdata)
876 {
877         struct inode *inode = mapping->host;
878         int ret, needed_blocks;
879         handle_t *handle;
880         int retries = 0;
881         struct page *page;
882         pgoff_t index;
883         unsigned from, to;
884
885         trace_ext4_write_begin(inode, pos, len, flags);
886         /*
887          * Reserve one block more for addition to orphan list in case
888          * we allocate blocks but write fails for some reason
889          */
890         needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
891         index = pos >> PAGE_CACHE_SHIFT;
892         from = pos & (PAGE_CACHE_SIZE - 1);
893         to = from + len;
894
895 retry:
896         handle = ext4_journal_start(inode, needed_blocks);
897         if (IS_ERR(handle)) {
898                 ret = PTR_ERR(handle);
899                 goto out;
900         }
901
902         /* We cannot recurse into the filesystem as the transaction is already
903          * started */
904         flags |= AOP_FLAG_NOFS;
905
906         page = grab_cache_page_write_begin(mapping, index, flags);
907         if (!page) {
908                 ext4_journal_stop(handle);
909                 ret = -ENOMEM;
910                 goto out;
911         }
912         *pagep = page;
913
914         if (ext4_should_dioread_nolock(inode))
915                 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
916         else
917                 ret = __block_write_begin(page, pos, len, ext4_get_block);
918
919         if (!ret && ext4_should_journal_data(inode)) {
920                 ret = walk_page_buffers(handle, page_buffers(page),
921                                 from, to, NULL, do_journal_get_write_access);
922         }
923
924         if (ret) {
925                 unlock_page(page);
926                 page_cache_release(page);
927                 /*
928                  * __block_write_begin may have instantiated a few blocks
929                  * outside i_size.  Trim these off again. Don't need
930                  * i_size_read because we hold i_mutex.
931                  *
932                  * Add inode to orphan list in case we crash before
933                  * truncate finishes
934                  */
935                 if (pos + len > inode->i_size && ext4_can_truncate(inode))
936                         ext4_orphan_add(handle, inode);
937
938                 ext4_journal_stop(handle);
939                 if (pos + len > inode->i_size) {
940                         ext4_truncate_failed_write(inode);
941                         /*
942                          * If truncate failed early the inode might
943                          * still be on the orphan list; we need to
944                          * make sure the inode is removed from the
945                          * orphan list in that case.
946                          */
947                         if (inode->i_nlink)
948                                 ext4_orphan_del(NULL, inode);
949                 }
950         }
951
952         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
953                 goto retry;
954 out:
955         return ret;
956 }
957
958 /* For write_end() in data=journal mode */
959 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
960 {
961         if (!buffer_mapped(bh) || buffer_freed(bh))
962                 return 0;
963         set_buffer_uptodate(bh);
964         return ext4_handle_dirty_metadata(handle, NULL, bh);
965 }
966
967 static int ext4_generic_write_end(struct file *file,
968                                   struct address_space *mapping,
969                                   loff_t pos, unsigned len, unsigned copied,
970                                   struct page *page, void *fsdata)
971 {
972         int i_size_changed = 0;
973         struct inode *inode = mapping->host;
974         handle_t *handle = ext4_journal_current_handle();
975
976         copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
977
978         /*
979          * No need to use i_size_read() here, the i_size
980          * cannot change under us because we hold i_mutex.
981          *
982          * But it's important to update i_size while still holding page lock:
983          * page writeout could otherwise come in and zero beyond i_size.
984          */
985         if (pos + copied > inode->i_size) {
986                 i_size_write(inode, pos + copied);
987                 i_size_changed = 1;
988         }
989
990         if (pos + copied >  EXT4_I(inode)->i_disksize) {
991                 /* We need to mark inode dirty even if
992                  * new_i_size is less that inode->i_size
993                  * bu greater than i_disksize.(hint delalloc)
994                  */
995                 ext4_update_i_disksize(inode, (pos + copied));
996                 i_size_changed = 1;
997         }
998         unlock_page(page);
999         page_cache_release(page);
1000
1001         /*
1002          * Don't mark the inode dirty under page lock. First, it unnecessarily
1003          * makes the holding time of page lock longer. Second, it forces lock
1004          * ordering of page lock and transaction start for journaling
1005          * filesystems.
1006          */
1007         if (i_size_changed)
1008                 ext4_mark_inode_dirty(handle, inode);
1009
1010         return copied;
1011 }
1012
1013 /*
1014  * We need to pick up the new inode size which generic_commit_write gave us
1015  * `file' can be NULL - eg, when called from page_symlink().
1016  *
1017  * ext4 never places buffers on inode->i_mapping->private_list.  metadata
1018  * buffers are managed internally.
1019  */
1020 static int ext4_ordered_write_end(struct file *file,
1021                                   struct address_space *mapping,
1022                                   loff_t pos, unsigned len, unsigned copied,
1023                                   struct page *page, void *fsdata)
1024 {
1025         handle_t *handle = ext4_journal_current_handle();
1026         struct inode *inode = mapping->host;
1027         int ret = 0, ret2;
1028
1029         trace_ext4_ordered_write_end(inode, pos, len, copied);
1030         ret = ext4_jbd2_file_inode(handle, inode);
1031
1032         if (ret == 0) {
1033                 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1034                                                         page, fsdata);
1035                 copied = ret2;
1036                 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1037                         /* if we have allocated more blocks and copied
1038                          * less. We will have blocks allocated outside
1039                          * inode->i_size. So truncate them
1040                          */
1041                         ext4_orphan_add(handle, inode);
1042                 if (ret2 < 0)
1043                         ret = ret2;
1044         } else {
1045                 unlock_page(page);
1046                 page_cache_release(page);
1047         }
1048
1049         ret2 = ext4_journal_stop(handle);
1050         if (!ret)
1051                 ret = ret2;
1052
1053         if (pos + len > inode->i_size) {
1054                 ext4_truncate_failed_write(inode);
1055                 /*
1056                  * If truncate failed early the inode might still be
1057                  * on the orphan list; we need to make sure the inode
1058                  * is removed from the orphan list in that case.
1059                  */
1060                 if (inode->i_nlink)
1061                         ext4_orphan_del(NULL, inode);
1062         }
1063
1064
1065         return ret ? ret : copied;
1066 }
1067
1068 static int ext4_writeback_write_end(struct file *file,
1069                                     struct address_space *mapping,
1070                                     loff_t pos, unsigned len, unsigned copied,
1071                                     struct page *page, void *fsdata)
1072 {
1073         handle_t *handle = ext4_journal_current_handle();
1074         struct inode *inode = mapping->host;
1075         int ret = 0, ret2;
1076
1077         trace_ext4_writeback_write_end(inode, pos, len, copied);
1078         ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1079                                                         page, fsdata);
1080         copied = ret2;
1081         if (pos + len > inode->i_size && ext4_can_truncate(inode))
1082                 /* if we have allocated more blocks and copied
1083                  * less. We will have blocks allocated outside
1084                  * inode->i_size. So truncate them
1085                  */
1086                 ext4_orphan_add(handle, inode);
1087
1088         if (ret2 < 0)
1089                 ret = ret2;
1090
1091         ret2 = ext4_journal_stop(handle);
1092         if (!ret)
1093                 ret = ret2;
1094
1095         if (pos + len > inode->i_size) {
1096                 ext4_truncate_failed_write(inode);
1097                 /*
1098                  * If truncate failed early the inode might still be
1099                  * on the orphan list; we need to make sure the inode
1100                  * is removed from the orphan list in that case.
1101                  */
1102                 if (inode->i_nlink)
1103                         ext4_orphan_del(NULL, inode);
1104         }
1105
1106         return ret ? ret : copied;
1107 }
1108
1109 static int ext4_journalled_write_end(struct file *file,
1110                                      struct address_space *mapping,
1111                                      loff_t pos, unsigned len, unsigned copied,
1112                                      struct page *page, void *fsdata)
1113 {
1114         handle_t *handle = ext4_journal_current_handle();
1115         struct inode *inode = mapping->host;
1116         int ret = 0, ret2;
1117         int partial = 0;
1118         unsigned from, to;
1119         loff_t new_i_size;
1120
1121         trace_ext4_journalled_write_end(inode, pos, len, copied);
1122         from = pos & (PAGE_CACHE_SIZE - 1);
1123         to = from + len;
1124
1125         BUG_ON(!ext4_handle_valid(handle));
1126
1127         if (copied < len) {
1128                 if (!PageUptodate(page))
1129                         copied = 0;
1130                 page_zero_new_buffers(page, from+copied, to);
1131         }
1132
1133         ret = walk_page_buffers(handle, page_buffers(page), from,
1134                                 to, &partial, write_end_fn);
1135         if (!partial)
1136                 SetPageUptodate(page);
1137         new_i_size = pos + copied;
1138         if (new_i_size > inode->i_size)
1139                 i_size_write(inode, pos+copied);
1140         ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1141         EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1142         if (new_i_size > EXT4_I(inode)->i_disksize) {
1143                 ext4_update_i_disksize(inode, new_i_size);
1144                 ret2 = ext4_mark_inode_dirty(handle, inode);
1145                 if (!ret)
1146                         ret = ret2;
1147         }
1148
1149         unlock_page(page);
1150         page_cache_release(page);
1151         if (pos + len > inode->i_size && ext4_can_truncate(inode))
1152                 /* if we have allocated more blocks and copied
1153                  * less. We will have blocks allocated outside
1154                  * inode->i_size. So truncate them
1155                  */
1156                 ext4_orphan_add(handle, inode);
1157
1158         ret2 = ext4_journal_stop(handle);
1159         if (!ret)
1160                 ret = ret2;
1161         if (pos + len > inode->i_size) {
1162                 ext4_truncate_failed_write(inode);
1163                 /*
1164                  * If truncate failed early the inode might still be
1165                  * on the orphan list; we need to make sure the inode
1166                  * is removed from the orphan list in that case.
1167                  */
1168                 if (inode->i_nlink)
1169                         ext4_orphan_del(NULL, inode);
1170         }
1171
1172         return ret ? ret : copied;
1173 }
1174
1175 /*
1176  * Reserve a single cluster located at lblock
1177  */
1178 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1179 {
1180         int retries = 0;
1181         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1182         struct ext4_inode_info *ei = EXT4_I(inode);
1183         unsigned int md_needed;
1184         int ret;
1185         ext4_lblk_t save_last_lblock;
1186         int save_len;
1187
1188         /*
1189          * We will charge metadata quota at writeout time; this saves
1190          * us from metadata over-estimation, though we may go over by
1191          * a small amount in the end.  Here we just reserve for data.
1192          */
1193         ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
1194         if (ret)
1195                 return ret;
1196
1197         /*
1198          * recalculate the amount of metadata blocks to reserve
1199          * in order to allocate nrblocks
1200          * worse case is one extent per block
1201          */
1202 repeat:
1203         spin_lock(&ei->i_block_reservation_lock);
1204         /*
1205          * ext4_calc_metadata_amount() has side effects, which we have
1206          * to be prepared undo if we fail to claim space.
1207          */
1208         save_len = ei->i_da_metadata_calc_len;
1209         save_last_lblock = ei->i_da_metadata_calc_last_lblock;
1210         md_needed = EXT4_NUM_B2C(sbi,
1211                                  ext4_calc_metadata_amount(inode, lblock));
1212         trace_ext4_da_reserve_space(inode, md_needed);
1213
1214         /*
1215          * We do still charge estimated metadata to the sb though;
1216          * we cannot afford to run out of free blocks.
1217          */
1218         if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
1219                 ei->i_da_metadata_calc_len = save_len;
1220                 ei->i_da_metadata_calc_last_lblock = save_last_lblock;
1221                 spin_unlock(&ei->i_block_reservation_lock);
1222                 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1223                         yield();
1224                         goto repeat;
1225                 }
1226                 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
1227                 return -ENOSPC;
1228         }
1229         ei->i_reserved_data_blocks++;
1230         ei->i_reserved_meta_blocks += md_needed;
1231         spin_unlock(&ei->i_block_reservation_lock);
1232
1233         return 0;       /* success */
1234 }
1235
1236 static void ext4_da_release_space(struct inode *inode, int to_free)
1237 {
1238         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1239         struct ext4_inode_info *ei = EXT4_I(inode);
1240
1241         if (!to_free)
1242                 return;         /* Nothing to release, exit */
1243
1244         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1245
1246         trace_ext4_da_release_space(inode, to_free);
1247         if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1248                 /*
1249                  * if there aren't enough reserved blocks, then the
1250                  * counter is messed up somewhere.  Since this
1251                  * function is called from invalidate page, it's
1252                  * harmless to return without any action.
1253                  */
1254                 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1255                          "ino %lu, to_free %d with only %d reserved "
1256                          "data blocks", inode->i_ino, to_free,
1257                          ei->i_reserved_data_blocks);
1258                 WARN_ON(1);
1259                 to_free = ei->i_reserved_data_blocks;
1260         }
1261         ei->i_reserved_data_blocks -= to_free;
1262
1263         if (ei->i_reserved_data_blocks == 0) {
1264                 /*
1265                  * We can release all of the reserved metadata blocks
1266                  * only when we have written all of the delayed
1267                  * allocation blocks.
1268                  * Note that in case of bigalloc, i_reserved_meta_blocks,
1269                  * i_reserved_data_blocks, etc. refer to number of clusters.
1270                  */
1271                 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
1272                                    ei->i_reserved_meta_blocks);
1273                 ei->i_reserved_meta_blocks = 0;
1274                 ei->i_da_metadata_calc_len = 0;
1275         }
1276
1277         /* update fs dirty data blocks counter */
1278         percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
1279
1280         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1281
1282         dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
1283 }
1284
1285 static void ext4_da_page_release_reservation(struct page *page,
1286                                              unsigned long offset)
1287 {
1288         int to_release = 0;
1289         struct buffer_head *head, *bh;
1290         unsigned int curr_off = 0;
1291         struct inode *inode = page->mapping->host;
1292         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1293         int num_clusters;
1294
1295         head = page_buffers(page);
1296         bh = head;
1297         do {
1298                 unsigned int next_off = curr_off + bh->b_size;
1299
1300                 if ((offset <= curr_off) && (buffer_delay(bh))) {
1301                         to_release++;
1302                         clear_buffer_delay(bh);
1303                         clear_buffer_da_mapped(bh);
1304                 }
1305                 curr_off = next_off;
1306         } while ((bh = bh->b_this_page) != head);
1307
1308         /* If we have released all the blocks belonging to a cluster, then we
1309          * need to release the reserved space for that cluster. */
1310         num_clusters = EXT4_NUM_B2C(sbi, to_release);
1311         while (num_clusters > 0) {
1312                 ext4_fsblk_t lblk;
1313                 lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
1314                         ((num_clusters - 1) << sbi->s_cluster_bits);
1315                 if (sbi->s_cluster_ratio == 1 ||
1316                     !ext4_find_delalloc_cluster(inode, lblk, 1))
1317                         ext4_da_release_space(inode, 1);
1318
1319                 num_clusters--;
1320         }
1321 }
1322
1323 /*
1324  * Delayed allocation stuff
1325  */
1326
1327 /*
1328  * mpage_da_submit_io - walks through extent of pages and try to write
1329  * them with writepage() call back
1330  *
1331  * @mpd->inode: inode
1332  * @mpd->first_page: first page of the extent
1333  * @mpd->next_page: page after the last page of the extent
1334  *
1335  * By the time mpage_da_submit_io() is called we expect all blocks
1336  * to be allocated. this may be wrong if allocation failed.
1337  *
1338  * As pages are already locked by write_cache_pages(), we can't use it
1339  */
1340 static int mpage_da_submit_io(struct mpage_da_data *mpd,
1341                               struct ext4_map_blocks *map)
1342 {
1343         struct pagevec pvec;
1344         unsigned long index, end;
1345         int ret = 0, err, nr_pages, i;
1346         struct inode *inode = mpd->inode;
1347         struct address_space *mapping = inode->i_mapping;
1348         loff_t size = i_size_read(inode);
1349         unsigned int len, block_start;
1350         struct buffer_head *bh, *page_bufs = NULL;
1351         int journal_data = ext4_should_journal_data(inode);
1352         sector_t pblock = 0, cur_logical = 0;
1353         struct ext4_io_submit io_submit;
1354
1355         BUG_ON(mpd->next_page <= mpd->first_page);
1356         memset(&io_submit, 0, sizeof(io_submit));
1357         /*
1358          * We need to start from the first_page to the next_page - 1
1359          * to make sure we also write the mapped dirty buffer_heads.
1360          * If we look at mpd->b_blocknr we would only be looking
1361          * at the currently mapped buffer_heads.
1362          */
1363         index = mpd->first_page;
1364         end = mpd->next_page - 1;
1365
1366         pagevec_init(&pvec, 0);
1367         while (index <= end) {
1368                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1369                 if (nr_pages == 0)
1370                         break;
1371                 for (i = 0; i < nr_pages; i++) {
1372                         int commit_write = 0, skip_page = 0;
1373                         struct page *page = pvec.pages[i];
1374
1375                         index = page->index;
1376                         if (index > end)
1377                                 break;
1378
1379                         if (index == size >> PAGE_CACHE_SHIFT)
1380                                 len = size & ~PAGE_CACHE_MASK;
1381                         else
1382                                 len = PAGE_CACHE_SIZE;
1383                         if (map) {
1384                                 cur_logical = index << (PAGE_CACHE_SHIFT -
1385                                                         inode->i_blkbits);
1386                                 pblock = map->m_pblk + (cur_logical -
1387                                                         map->m_lblk);
1388                         }
1389                         index++;
1390
1391                         BUG_ON(!PageLocked(page));
1392                         BUG_ON(PageWriteback(page));
1393
1394                         /*
1395                          * If the page does not have buffers (for
1396                          * whatever reason), try to create them using
1397                          * __block_write_begin.  If this fails,
1398                          * skip the page and move on.
1399                          */
1400                         if (!page_has_buffers(page)) {
1401                                 if (__block_write_begin(page, 0, len,
1402                                                 noalloc_get_block_write)) {
1403                                 skip_page:
1404                                         unlock_page(page);
1405                                         continue;
1406                                 }
1407                                 commit_write = 1;
1408                         }
1409
1410                         bh = page_bufs = page_buffers(page);
1411                         block_start = 0;
1412                         do {
1413                                 if (!bh)
1414                                         goto skip_page;
1415                                 if (map && (cur_logical >= map->m_lblk) &&
1416                                     (cur_logical <= (map->m_lblk +
1417                                                      (map->m_len - 1)))) {
1418                                         if (buffer_delay(bh)) {
1419                                                 clear_buffer_delay(bh);
1420                                                 bh->b_blocknr = pblock;
1421                                         }
1422                                         if (buffer_da_mapped(bh))
1423                                                 clear_buffer_da_mapped(bh);
1424                                         if (buffer_unwritten(bh) ||
1425                                             buffer_mapped(bh))
1426                                                 BUG_ON(bh->b_blocknr != pblock);
1427                                         if (map->m_flags & EXT4_MAP_UNINIT)
1428                                                 set_buffer_uninit(bh);
1429                                         clear_buffer_unwritten(bh);
1430                                 }
1431
1432                                 /*
1433                                  * skip page if block allocation undone and
1434                                  * block is dirty
1435                                  */
1436                                 if (ext4_bh_delay_or_unwritten(NULL, bh))
1437                                         skip_page = 1;
1438                                 bh = bh->b_this_page;
1439                                 block_start += bh->b_size;
1440                                 cur_logical++;
1441                                 pblock++;
1442                         } while (bh != page_bufs);
1443
1444                         if (skip_page)
1445                                 goto skip_page;
1446
1447                         if (commit_write)
1448                                 /* mark the buffer_heads as dirty & uptodate */
1449                                 block_commit_write(page, 0, len);
1450
1451                         clear_page_dirty_for_io(page);
1452                         /*
1453                          * Delalloc doesn't support data journalling,
1454                          * but eventually maybe we'll lift this
1455                          * restriction.
1456                          */
1457                         if (unlikely(journal_data && PageChecked(page)))
1458                                 err = __ext4_journalled_writepage(page, len);
1459                         else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
1460                                 err = ext4_bio_write_page(&io_submit, page,
1461                                                           len, mpd->wbc);
1462                         else if (buffer_uninit(page_bufs)) {
1463                                 ext4_set_bh_endio(page_bufs, inode);
1464                                 err = block_write_full_page_endio(page,
1465                                         noalloc_get_block_write,
1466                                         mpd->wbc, ext4_end_io_buffer_write);
1467                         } else
1468                                 err = block_write_full_page(page,
1469                                         noalloc_get_block_write, mpd->wbc);
1470
1471                         if (!err)
1472                                 mpd->pages_written++;
1473                         /*
1474                          * In error case, we have to continue because
1475                          * remaining pages are still locked
1476                          */
1477                         if (ret == 0)
1478                                 ret = err;
1479                 }
1480                 pagevec_release(&pvec);
1481         }
1482         ext4_io_submit(&io_submit);
1483         return ret;
1484 }
1485
1486 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1487 {
1488         int nr_pages, i;
1489         pgoff_t index, end;
1490         struct pagevec pvec;
1491         struct inode *inode = mpd->inode;
1492         struct address_space *mapping = inode->i_mapping;
1493
1494         index = mpd->first_page;
1495         end   = mpd->next_page - 1;
1496         while (index <= end) {
1497                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1498                 if (nr_pages == 0)
1499                         break;
1500                 for (i = 0; i < nr_pages; i++) {
1501                         struct page *page = pvec.pages[i];
1502                         if (page->index > end)
1503                                 break;
1504                         BUG_ON(!PageLocked(page));
1505                         BUG_ON(PageWriteback(page));
1506                         block_invalidatepage(page, 0);
1507                         ClearPageUptodate(page);
1508                         unlock_page(page);
1509                 }
1510                 index = pvec.pages[nr_pages - 1]->index + 1;
1511                 pagevec_release(&pvec);
1512         }
1513         return;
1514 }
1515
1516 static void ext4_print_free_blocks(struct inode *inode)
1517 {
1518         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1519         struct super_block *sb = inode->i_sb;
1520
1521         ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
1522                EXT4_C2B(EXT4_SB(inode->i_sb),
1523                         ext4_count_free_clusters(inode->i_sb)));
1524         ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
1525         ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
1526                (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1527                 percpu_counter_sum(&sbi->s_freeclusters_counter)));
1528         ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
1529                (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1530                 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1531         ext4_msg(sb, KERN_CRIT, "Block reservation details");
1532         ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
1533                  EXT4_I(inode)->i_reserved_data_blocks);
1534         ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
1535                EXT4_I(inode)->i_reserved_meta_blocks);
1536         return;
1537 }
1538
1539 /*
1540  * mpage_da_map_and_submit - go through given space, map them
1541  *       if necessary, and then submit them for I/O
1542  *
1543  * @mpd - bh describing space
1544  *
1545  * The function skips space we know is already mapped to disk blocks.
1546  *
1547  */
1548 static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1549 {
1550         int err, blks, get_blocks_flags;
1551         struct ext4_map_blocks map, *mapp = NULL;
1552         sector_t next = mpd->b_blocknr;
1553         unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
1554         loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
1555         handle_t *handle = NULL;
1556
1557         /*
1558          * If the blocks are mapped already, or we couldn't accumulate
1559          * any blocks, then proceed immediately to the submission stage.
1560          */
1561         if ((mpd->b_size == 0) ||
1562             ((mpd->b_state  & (1 << BH_Mapped)) &&
1563              !(mpd->b_state & (1 << BH_Delay)) &&
1564              !(mpd->b_state & (1 << BH_Unwritten))))
1565                 goto submit_io;
1566
1567         handle = ext4_journal_current_handle();
1568         BUG_ON(!handle);
1569
1570         /*
1571          * Call ext4_map_blocks() to allocate any delayed allocation
1572          * blocks, or to convert an uninitialized extent to be
1573          * initialized (in the case where we have written into
1574          * one or more preallocated blocks).
1575          *
1576          * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
1577          * indicate that we are on the delayed allocation path.  This
1578          * affects functions in many different parts of the allocation
1579          * call path.  This flag exists primarily because we don't
1580          * want to change *many* call functions, so ext4_map_blocks()
1581          * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
1582          * inode's allocation semaphore is taken.
1583          *
1584          * If the blocks in questions were delalloc blocks, set
1585          * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
1586          * variables are updated after the blocks have been allocated.
1587          */
1588         map.m_lblk = next;
1589         map.m_len = max_blocks;
1590         get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
1591         if (ext4_should_dioread_nolock(mpd->inode))
1592                 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
1593         if (mpd->b_state & (1 << BH_Delay))
1594                 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
1595
1596         blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
1597         if (blks < 0) {
1598                 struct super_block *sb = mpd->inode->i_sb;
1599
1600                 err = blks;
1601                 /*
1602                  * If get block returns EAGAIN or ENOSPC and there
1603                  * appears to be free blocks we will just let
1604                  * mpage_da_submit_io() unlock all of the pages.
1605                  */
1606                 if (err == -EAGAIN)
1607                         goto submit_io;
1608
1609                 if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
1610                         mpd->retval = err;
1611                         goto submit_io;
1612                 }
1613
1614                 /*
1615                  * get block failure will cause us to loop in
1616                  * writepages, because a_ops->writepage won't be able
1617                  * to make progress. The page will be redirtied by
1618                  * writepage and writepages will again try to write
1619                  * the same.
1620                  */
1621                 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
1622                         ext4_msg(sb, KERN_CRIT,
1623                                  "delayed block allocation failed for inode %lu "
1624                                  "at logical offset %llu with max blocks %zd "
1625                                  "with error %d", mpd->inode->i_ino,
1626                                  (unsigned long long) next,
1627                                  mpd->b_size >> mpd->inode->i_blkbits, err);
1628                         ext4_msg(sb, KERN_CRIT,
1629                                 "This should not happen!! Data will be lost\n");
1630                         if (err == -ENOSPC)
1631                                 ext4_print_free_blocks(mpd->inode);
1632                 }
1633                 /* invalidate all the pages */
1634                 ext4_da_block_invalidatepages(mpd);
1635
1636                 /* Mark this page range as having been completed */
1637                 mpd->io_done = 1;
1638                 return;
1639         }
1640         BUG_ON(blks == 0);
1641
1642         mapp = &map;
1643         if (map.m_flags & EXT4_MAP_NEW) {
1644                 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
1645                 int i;
1646
1647                 for (i = 0; i < map.m_len; i++)
1648                         unmap_underlying_metadata(bdev, map.m_pblk + i);
1649
1650                 if (ext4_should_order_data(mpd->inode)) {
1651                         err = ext4_jbd2_file_inode(handle, mpd->inode);
1652                         if (err) {
1653                                 /* Only if the journal is aborted */
1654                                 mpd->retval = err;
1655                                 goto submit_io;
1656                         }
1657                 }
1658         }
1659
1660         /*
1661          * Update on-disk size along with block allocation.
1662          */
1663         disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
1664         if (disksize > i_size_read(mpd->inode))
1665                 disksize = i_size_read(mpd->inode);
1666         if (disksize > EXT4_I(mpd->inode)->i_disksize) {
1667                 ext4_update_i_disksize(mpd->inode, disksize);
1668                 err = ext4_mark_inode_dirty(handle, mpd->inode);
1669                 if (err)
1670                         ext4_error(mpd->inode->i_sb,
1671                                    "Failed to mark inode %lu dirty",
1672                                    mpd->inode->i_ino);
1673         }
1674
1675 submit_io:
1676         mpage_da_submit_io(mpd, mapp);
1677         mpd->io_done = 1;
1678 }
1679
1680 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
1681                 (1 << BH_Delay) | (1 << BH_Unwritten))
1682
1683 /*
1684  * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1685  *
1686  * @mpd->lbh - extent of blocks
1687  * @logical - logical number of the block in the file
1688  * @bh - bh of the block (used to access block's state)
1689  *
1690  * the function is used to collect contig. blocks in same state
1691  */
1692 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1693                                    sector_t logical, size_t b_size,
1694                                    unsigned long b_state)
1695 {
1696         sector_t next;
1697         int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
1698
1699         /*
1700          * XXX Don't go larger than mballoc is willing to allocate
1701          * This is a stopgap solution.  We eventually need to fold
1702          * mpage_da_submit_io() into this function and then call
1703          * ext4_map_blocks() multiple times in a loop
1704          */
1705         if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
1706                 goto flush_it;
1707
1708         /* check if thereserved journal credits might overflow */
1709         if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
1710                 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
1711                         /*
1712                          * With non-extent format we are limited by the journal
1713                          * credit available.  Total credit needed to insert
1714                          * nrblocks contiguous blocks is dependent on the
1715                          * nrblocks.  So limit nrblocks.
1716                          */
1717                         goto flush_it;
1718                 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
1719                                 EXT4_MAX_TRANS_DATA) {
1720                         /*
1721                          * Adding the new buffer_head would make it cross the
1722                          * allowed limit for which we have journal credit
1723                          * reserved. So limit the new bh->b_size
1724                          */
1725                         b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
1726                                                 mpd->inode->i_blkbits;
1727                         /* we will do mpage_da_submit_io in the next loop */
1728                 }
1729         }
1730         /*
1731          * First block in the extent
1732          */
1733         if (mpd->b_size == 0) {
1734                 mpd->b_blocknr = logical;
1735                 mpd->b_size = b_size;
1736                 mpd->b_state = b_state & BH_FLAGS;
1737                 return;
1738         }
1739
1740         next = mpd->b_blocknr + nrblocks;
1741         /*
1742          * Can we merge the block to our big extent?
1743          */
1744         if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
1745                 mpd->b_size += b_size;
1746                 return;
1747         }
1748
1749 flush_it:
1750         /*
1751          * We couldn't merge the block to our extent, so we
1752          * need to flush current  extent and start new one
1753          */
1754         mpage_da_map_and_submit(mpd);
1755         return;
1756 }
1757
1758 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1759 {
1760         return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
1761 }
1762
1763 /*
1764  * This function is grabs code from the very beginning of
1765  * ext4_map_blocks, but assumes that the caller is from delayed write
1766  * time. This function looks up the requested blocks and sets the
1767  * buffer delay bit under the protection of i_data_sem.
1768  */
1769 static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1770                               struct ext4_map_blocks *map,
1771                               struct buffer_head *bh)
1772 {
1773         int retval;
1774         sector_t invalid_block = ~((sector_t) 0xffff);
1775
1776         if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1777                 invalid_block = ~0;
1778
1779         map->m_flags = 0;
1780         ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
1781                   "logical block %lu\n", inode->i_ino, map->m_len,
1782                   (unsigned long) map->m_lblk);
1783         /*
1784          * Try to see if we can get the block without requesting a new
1785          * file system block.
1786          */
1787         down_read((&EXT4_I(inode)->i_data_sem));
1788         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1789                 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1790         else
1791                 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
1792
1793         if (retval == 0) {
1794                 /*
1795                  * XXX: __block_prepare_write() unmaps passed block,
1796                  * is it OK?
1797                  */
1798                 /* If the block was allocated from previously allocated cluster,
1799                  * then we dont need to reserve it again. */
1800                 if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
1801                         retval = ext4_da_reserve_space(inode, iblock);
1802                         if (retval)
1803                                 /* not enough space to reserve */
1804                                 goto out_unlock;
1805                 }
1806
1807                 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
1808                  * and it should not appear on the bh->b_state.
1809                  */
1810                 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
1811
1812                 map_bh(bh, inode->i_sb, invalid_block);
1813                 set_buffer_new(bh);
1814                 set_buffer_delay(bh);
1815         }
1816
1817 out_unlock:
1818         up_read((&EXT4_I(inode)->i_data_sem));
1819
1820         return retval;
1821 }
1822
1823 /*
1824  * This is a special get_blocks_t callback which is used by
1825  * ext4_da_write_begin().  It will either return mapped block or
1826  * reserve space for a single block.
1827  *
1828  * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
1829  * We also have b_blocknr = -1 and b_bdev initialized properly
1830  *
1831  * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
1832  * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
1833  * initialized properly.
1834  */
1835 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1836                                   struct buffer_head *bh, int create)
1837 {
1838         struct ext4_map_blocks map;
1839         int ret = 0;
1840
1841         BUG_ON(create == 0);
1842         BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
1843
1844         map.m_lblk = iblock;
1845         map.m_len = 1;
1846
1847         /*
1848          * first, we need to know whether the block is allocated already
1849          * preallocated blocks are unmapped but should treated
1850          * the same as allocated blocks.
1851          */
1852         ret = ext4_da_map_blocks(inode, iblock, &map, bh);
1853         if (ret <= 0)
1854                 return ret;
1855
1856         map_bh(bh, inode->i_sb, map.m_pblk);
1857         bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1858
1859         if (buffer_unwritten(bh)) {
1860                 /* A delayed write to unwritten bh should be marked
1861                  * new and mapped.  Mapped ensures that we don't do
1862                  * get_block multiple times when we write to the same
1863                  * offset and new ensures that we do proper zero out
1864                  * for partial write.
1865                  */
1866                 set_buffer_new(bh);
1867                 set_buffer_mapped(bh);
1868         }
1869         return 0;
1870 }
1871
1872 /*
1873  * This function is used as a standard get_block_t calback function
1874  * when there is no desire to allocate any blocks.  It is used as a
1875  * callback function for block_write_begin() and block_write_full_page().
1876  * These functions should only try to map a single block at a time.
1877  *
1878  * Since this function doesn't do block allocations even if the caller
1879  * requests it by passing in create=1, it is critically important that
1880  * any caller checks to make sure that any buffer heads are returned
1881  * by this function are either all already mapped or marked for
1882  * delayed allocation before calling  block_write_full_page().  Otherwise,
1883  * b_blocknr could be left unitialized, and the page write functions will
1884  * be taken by surprise.
1885  */
1886 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
1887                                    struct buffer_head *bh_result, int create)
1888 {
1889         BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
1890         return _ext4_get_block(inode, iblock, bh_result, 0);
1891 }
1892
1893 static int bget_one(handle_t *handle, struct buffer_head *bh)
1894 {
1895         get_bh(bh);
1896         return 0;
1897 }
1898
1899 static int bput_one(handle_t *handle, struct buffer_head *bh)
1900 {
1901         put_bh(bh);
1902         return 0;
1903 }
1904
1905 static int __ext4_journalled_writepage(struct page *page,
1906                                        unsigned int len)
1907 {
1908         struct address_space *mapping = page->mapping;
1909         struct inode *inode = mapping->host;
1910         struct buffer_head *page_bufs;
1911         handle_t *handle = NULL;
1912         int ret = 0;
1913         int err;
1914
1915         ClearPageChecked(page);
1916         page_bufs = page_buffers(page);
1917         BUG_ON(!page_bufs);
1918         walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
1919         /* As soon as we unlock the page, it can go away, but we have
1920          * references to buffers so we are safe */
1921         unlock_page(page);
1922
1923         handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1924         if (IS_ERR(handle)) {
1925                 ret = PTR_ERR(handle);
1926                 goto out;
1927         }
1928
1929         BUG_ON(!ext4_handle_valid(handle));
1930
1931         ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
1932                                 do_journal_get_write_access);
1933
1934         err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
1935                                 write_end_fn);
1936         if (ret == 0)
1937                 ret = err;
1938         EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1939         err = ext4_journal_stop(handle);
1940         if (!ret)
1941                 ret = err;
1942
1943         walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
1944         ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1945 out:
1946         return ret;
1947 }
1948
1949 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
1950 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
1951
1952 /*
1953  * Note that we don't need to start a transaction unless we're journaling data
1954  * because we should have holes filled from ext4_page_mkwrite(). We even don't
1955  * need to file the inode to the transaction's list in ordered mode because if
1956  * we are writing back data added by write(), the inode is already there and if
1957  * we are writing back data modified via mmap(), no one guarantees in which
1958  * transaction the data will hit the disk. In case we are journaling data, we
1959  * cannot start transaction directly because transaction start ranks above page
1960  * lock so we have to do some magic.
1961  *
1962  * This function can get called via...
1963  *   - ext4_da_writepages after taking page lock (have journal handle)
1964  *   - journal_submit_inode_data_buffers (no journal handle)
1965  *   - shrink_page_list via pdflush (no journal handle)
1966  *   - grab_page_cache when doing write_begin (have journal handle)
1967  *
1968  * We don't do any block allocation in this function. If we have page with
1969  * multiple blocks we need to write those buffer_heads that are mapped. This
1970  * is important for mmaped based write. So if we do with blocksize 1K
1971  * truncate(f, 1024);
1972  * a = mmap(f, 0, 4096);
1973  * a[0] = 'a';
1974  * truncate(f, 4096);
1975  * we have in the page first buffer_head mapped via page_mkwrite call back
1976  * but other buffer_heads would be unmapped but dirty (dirty done via the
1977  * do_wp_page). So writepage should write the first block. If we modify
1978  * the mmap area beyond 1024 we will again get a page_fault and the
1979  * page_mkwrite callback will do the block allocation and mark the
1980  * buffer_heads mapped.
1981  *
1982  * We redirty the page if we have any buffer_heads that is either delay or
1983  * unwritten in the page.
1984  *
1985  * We can get recursively called as show below.
1986  *
1987  *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1988  *              ext4_writepage()
1989  *
1990  * But since we don't do any block allocation we should not deadlock.
1991  * Page also have the dirty flag cleared so we don't get recurive page_lock.
1992  */
1993 static int ext4_writepage(struct page *page,
1994                           struct writeback_control *wbc)
1995 {
1996         int ret = 0, commit_write = 0;
1997         loff_t size;
1998         unsigned int len;
1999         struct buffer_head *page_bufs = NULL;
2000         struct inode *inode = page->mapping->host;
2001
2002         trace_ext4_writepage(page);
2003         size = i_size_read(inode);
2004         if (page->index == size >> PAGE_CACHE_SHIFT)
2005                 len = size & ~PAGE_CACHE_MASK;
2006         else
2007                 len = PAGE_CACHE_SIZE;
2008
2009         /*
2010          * If the page does not have buffers (for whatever reason),
2011          * try to create them using __block_write_begin.  If this
2012          * fails, redirty the page and move on.
2013          */
2014         if (!page_has_buffers(page)) {
2015                 if (__block_write_begin(page, 0, len,
2016                                         noalloc_get_block_write)) {
2017                 redirty_page:
2018                         redirty_page_for_writepage(wbc, page);
2019                         unlock_page(page);
2020                         return 0;
2021                 }
2022                 commit_write = 1;
2023         }
2024         page_bufs = page_buffers(page);
2025         if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2026                               ext4_bh_delay_or_unwritten)) {
2027                 /*
2028                  * We don't want to do block allocation, so redirty
2029                  * the page and return.  We may reach here when we do
2030                  * a journal commit via journal_submit_inode_data_buffers.
2031                  * We can also reach here via shrink_page_list but it
2032                  * should never be for direct reclaim so warn if that
2033                  * happens
2034                  */
2035                 WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
2036                                                                 PF_MEMALLOC);
2037                 goto redirty_page;
2038         }
2039         if (commit_write)
2040                 /* now mark the buffer_heads as dirty and uptodate */
2041                 block_commit_write(page, 0, len);
2042
2043         if (PageChecked(page) && ext4_should_journal_data(inode))
2044                 /*
2045                  * It's mmapped pagecache.  Add buffers and journal it.  There
2046                  * doesn't seem much point in redirtying the page here.
2047                  */
2048                 return __ext4_journalled_writepage(page, len);
2049
2050         if (buffer_uninit(page_bufs)) {
2051                 ext4_set_bh_endio(page_bufs, inode);
2052                 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2053                                             wbc, ext4_end_io_buffer_write);
2054         } else
2055                 ret = block_write_full_page(page, noalloc_get_block_write,
2056                                             wbc);
2057
2058         return ret;
2059 }
2060
2061 /*
2062  * This is called via ext4_da_writepages() to
2063  * calculate the total number of credits to reserve to fit
2064  * a single extent allocation into a single transaction,
2065  * ext4_da_writpeages() will loop calling this before
2066  * the block allocation.
2067  */
2068
2069 static int ext4_da_writepages_trans_blocks(struct inode *inode)
2070 {
2071         int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
2072
2073         /*
2074          * With non-extent format the journal credit needed to
2075          * insert nrblocks contiguous block is dependent on
2076          * number of contiguous block. So we will limit
2077          * number of contiguous block to a sane value
2078          */
2079         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2080             (max_blocks > EXT4_MAX_TRANS_DATA))
2081                 max_blocks = EXT4_MAX_TRANS_DATA;
2082
2083         return ext4_chunk_trans_blocks(inode, max_blocks);
2084 }
2085
2086 /*
2087  * write_cache_pages_da - walk the list of dirty pages of the given
2088  * address space and accumulate pages that need writing, and call
2089  * mpage_da_map_and_submit to map a single contiguous memory region
2090  * and then write them.
2091  */
2092 static int write_cache_pages_da(struct address_space *mapping,
2093                                 struct writeback_control *wbc,
2094                                 struct mpage_da_data *mpd,
2095                                 pgoff_t *done_index)
2096 {
2097         struct buffer_head      *bh, *head;
2098         struct inode            *inode = mapping->host;
2099         struct pagevec          pvec;
2100         unsigned int            nr_pages;
2101         sector_t                logical;
2102         pgoff_t                 index, end;
2103         long                    nr_to_write = wbc->nr_to_write;
2104         int                     i, tag, ret = 0;
2105
2106         memset(mpd, 0, sizeof(struct mpage_da_data));
2107         mpd->wbc = wbc;
2108         mpd->inode = inode;
2109         pagevec_init(&pvec, 0);
2110         index = wbc->range_start >> PAGE_CACHE_SHIFT;
2111         end = wbc->range_end >> PAGE_CACHE_SHIFT;
2112
2113         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2114                 tag = PAGECACHE_TAG_TOWRITE;
2115         else
2116                 tag = PAGECACHE_TAG_DIRTY;
2117
2118         *done_index = index;
2119         while (index <= end) {
2120                 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2121                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2122                 if (nr_pages == 0)
2123                         return 0;
2124
2125                 for (i = 0; i < nr_pages; i++) {
2126                         struct page *page = pvec.pages[i];
2127
2128                         /*
2129                          * At this point, the page may be truncated or
2130                          * invalidated (changing page->mapping to NULL), or
2131                          * even swizzled back from swapper_space to tmpfs file
2132                          * mapping. However, page->index will not change
2133                          * because we have a reference on the page.
2134                          */
2135                         if (page->index > end)
2136                                 goto out;
2137
2138                         *done_index = page->index + 1;
2139
2140                         /*
2141                          * If we can't merge this page, and we have
2142                          * accumulated an contiguous region, write it
2143                          */
2144                         if ((mpd->next_page != page->index) &&
2145                             (mpd->next_page != mpd->first_page)) {
2146                                 mpage_da_map_and_submit(mpd);
2147                                 goto ret_extent_tail;
2148                         }
2149
2150                         lock_page(page);
2151
2152                         /*
2153                          * If the page is no longer dirty, or its
2154                          * mapping no longer corresponds to inode we
2155                          * are writing (which means it has been
2156                          * truncated or invalidated), or the page is
2157                          * already under writeback and we are not
2158                          * doing a data integrity writeback, skip the page
2159                          */
2160                         if (!PageDirty(page) ||
2161                             (PageWriteback(page) &&
2162                              (wbc->sync_mode == WB_SYNC_NONE)) ||
2163                             unlikely(page->mapping != mapping)) {
2164                                 unlock_page(page);
2165                                 continue;
2166                         }
2167
2168                         wait_on_page_writeback(page);
2169                         BUG_ON(PageWriteback(page));
2170
2171                         if (mpd->next_page != page->index)
2172                                 mpd->first_page = page->index;
2173                         mpd->next_page = page->index + 1;
2174                         logical = (sector_t) page->index <<
2175                                 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2176
2177                         if (!page_has_buffers(page)) {
2178                                 mpage_add_bh_to_extent(mpd, logical,
2179                                                        PAGE_CACHE_SIZE,
2180                                                        (1 << BH_Dirty) | (1 << BH_Uptodate));
2181                                 if (mpd->io_done)
2182                                         goto ret_extent_tail;
2183                         } else {
2184                                 /*
2185                                  * Page with regular buffer heads,
2186                                  * just add all dirty ones
2187                                  */
2188                                 head = page_buffers(page);
2189                                 bh = head;
2190                                 do {
2191                                         BUG_ON(buffer_locked(bh));
2192                                         /*
2193                                          * We need to try to allocate
2194                                          * unmapped blocks in the same page.
2195                                          * Otherwise we won't make progress
2196                                          * with the page in ext4_writepage
2197                                          */
2198                                         if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2199                                                 mpage_add_bh_to_extent(mpd, logical,
2200                                                                        bh->b_size,
2201                                                                        bh->b_state);
2202                                                 if (mpd->io_done)
2203                                                         goto ret_extent_tail;
2204                                         } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2205                                                 /*
2206                                                  * mapped dirty buffer. We need
2207                                                  * to update the b_state
2208                                                  * because we look at b_state
2209                                                  * in mpage_da_map_blocks.  We
2210                                                  * don't update b_size because
2211                                                  * if we find an unmapped
2212                                                  * buffer_head later we need to
2213                                                  * use the b_state flag of that
2214                                                  * buffer_head.
2215                                                  */
2216                                                 if (mpd->b_size == 0)
2217                                                         mpd->b_state = bh->b_state & BH_FLAGS;
2218                                         }
2219                                         logical++;
2220                                 } while ((bh = bh->b_this_page) != head);
2221                         }
2222
2223                         if (nr_to_write > 0) {
2224                                 nr_to_write--;
2225                                 if (nr_to_write == 0 &&
2226                                     wbc->sync_mode == WB_SYNC_NONE)
2227                                         /*
2228                                          * We stop writing back only if we are
2229                                          * not doing integrity sync. In case of
2230                                          * integrity sync we have to keep going
2231                                          * because someone may be concurrently
2232                                          * dirtying pages, and we might have
2233                                          * synced a lot of newly appeared dirty
2234                                          * pages, but have not synced all of the
2235                                          * old dirty pages.
2236                                          */
2237                                         goto out;
2238                         }
2239                 }
2240                 pagevec_release(&pvec);
2241                 cond_resched();
2242         }
2243         return 0;
2244 ret_extent_tail:
2245         ret = MPAGE_DA_EXTENT_TAIL;
2246 out:
2247         pagevec_release(&pvec);
2248         cond_resched();
2249         return ret;
2250 }
2251
2252
2253 static int ext4_da_writepages(struct address_space *mapping,
2254                               struct writeback_control *wbc)
2255 {
2256         pgoff_t index;
2257         int range_whole = 0;
2258         handle_t *handle = NULL;
2259         struct mpage_da_data mpd;
2260         struct inode *inode = mapping->host;
2261         int pages_written = 0;
2262         unsigned int max_pages;
2263         int range_cyclic, cycled = 1, io_done = 0;
2264         int needed_blocks, ret = 0;
2265         long desired_nr_to_write, nr_to_writebump = 0;
2266         loff_t range_start = wbc->range_start;
2267         struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2268         pgoff_t done_index = 0;
2269         pgoff_t end;
2270         struct blk_plug plug;
2271
2272         trace_ext4_da_writepages(inode, wbc);
2273
2274         /*
2275          * No pages to write? This is mainly a kludge to avoid starting
2276          * a transaction for special inodes like journal inode on last iput()
2277          * because that could violate lock ordering on umount
2278          */
2279         if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2280                 return 0;
2281
2282         /*
2283          * If the filesystem has aborted, it is read-only, so return
2284          * right away instead of dumping stack traces later on that
2285          * will obscure the real source of the problem.  We test
2286          * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2287          * the latter could be true if the filesystem is mounted
2288          * read-only, and in that case, ext4_da_writepages should
2289          * *never* be called, so if that ever happens, we would want
2290          * the stack trace.
2291          */
2292         if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2293                 return -EROFS;
2294
2295         if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2296                 range_whole = 1;
2297
2298         range_cyclic = wbc->range_cyclic;
2299         if (wbc->range_cyclic) {
2300                 index = mapping->writeback_index;
2301                 if (index)
2302                         cycled = 0;
2303                 wbc->range_start = index << PAGE_CACHE_SHIFT;
2304                 wbc->range_end  = LLONG_MAX;
2305                 wbc->range_cyclic = 0;
2306                 end = -1;
2307         } else {
2308                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2309                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2310         }
2311
2312         /*
2313          * This works around two forms of stupidity.  The first is in
2314          * the writeback code, which caps the maximum number of pages
2315          * written to be 1024 pages.  This is wrong on multiple
2316          * levels; different architectues have a different page size,
2317          * which changes the maximum amount of data which gets
2318          * written.  Secondly, 4 megabytes is way too small.  XFS
2319          * forces this value to be 16 megabytes by multiplying
2320          * nr_to_write parameter by four, and then relies on its
2321          * allocator to allocate larger extents to make them
2322          * contiguous.  Unfortunately this brings us to the second
2323          * stupidity, which is that ext4's mballoc code only allocates
2324          * at most 2048 blocks.  So we force contiguous writes up to
2325          * the number of dirty blocks in the inode, or
2326          * sbi->max_writeback_mb_bump whichever is smaller.
2327          */
2328         max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2329         if (!range_cyclic && range_whole) {
2330                 if (wbc->nr_to_write == LONG_MAX)
2331                         desired_nr_to_write = wbc->nr_to_write;
2332                 else
2333                         desired_nr_to_write = wbc->nr_to_write * 8;
2334         } else
2335                 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2336                                                            max_pages);
2337         if (desired_nr_to_write > max_pages)
2338                 desired_nr_to_write = max_pages;
2339
2340         if (wbc->nr_to_write < desired_nr_to_write) {
2341                 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2342                 wbc->nr_to_write = desired_nr_to_write;
2343         }
2344
2345 retry:
2346         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2347                 tag_pages_for_writeback(mapping, index, end);
2348
2349         blk_start_plug(&plug);
2350         while (!ret && wbc->nr_to_write > 0) {
2351
2352                 /*
2353                  * we  insert one extent at a time. So we need
2354                  * credit needed for single extent allocation.
2355                  * journalled mode is currently not supported
2356                  * by delalloc
2357                  */
2358                 BUG_ON(ext4_should_journal_data(inode));
2359                 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2360
2361                 /* start a new transaction*/
2362                 handle = ext4_journal_start(inode, needed_blocks);
2363                 if (IS_ERR(handle)) {
2364                         ret = PTR_ERR(handle);
2365                         ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2366                                "%ld pages, ino %lu; err %d", __func__,
2367                                 wbc->nr_to_write, inode->i_ino, ret);
2368                         blk_finish_plug(&plug);
2369                         goto out_writepages;
2370                 }
2371
2372                 /*
2373                  * Now call write_cache_pages_da() to find the next
2374                  * contiguous region of logical blocks that need
2375                  * blocks to be allocated by ext4 and submit them.
2376                  */
2377                 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
2378                 /*
2379                  * If we have a contiguous extent of pages and we
2380                  * haven't done the I/O yet, map the blocks and submit
2381                  * them for I/O.
2382                  */
2383                 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
2384                         mpage_da_map_and_submit(&mpd);
2385                         ret = MPAGE_DA_EXTENT_TAIL;
2386                 }
2387                 trace_ext4_da_write_pages(inode, &mpd);
2388                 wbc->nr_to_write -= mpd.pages_written;
2389
2390                 ext4_journal_stop(handle);
2391
2392                 if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
2393                         /* commit the transaction which would
2394                          * free blocks released in the transaction
2395                          * and try again
2396                          */
2397                         jbd2_journal_force_commit_nested(sbi->s_journal);
2398                         ret = 0;
2399                 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
2400                         /*
2401                          * Got one extent now try with rest of the pages.
2402                          * If mpd.retval is set -EIO, journal is aborted.
2403                          * So we don't need to write any more.
2404                          */
2405                         pages_written += mpd.pages_written;
2406                         ret = mpd.retval;
2407                         io_done = 1;
2408                 } else if (wbc->nr_to_write)
2409                         /*
2410                          * There is no more writeout needed
2411                          * or we requested for a noblocking writeout
2412                          * and we found the device congested
2413                          */
2414                         break;
2415         }
2416         blk_finish_plug(&plug);
2417         if (!io_done && !cycled) {
2418                 cycled = 1;
2419                 index = 0;
2420                 wbc->range_start = index << PAGE_CACHE_SHIFT;
2421                 wbc->range_end  = mapping->writeback_index - 1;
2422                 goto retry;
2423         }
2424
2425         /* Update index */
2426         wbc->range_cyclic = range_cyclic;
2427         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2428                 /*
2429                  * set the writeback_index so that range_cyclic
2430                  * mode will write it back later
2431                  */
2432                 mapping->writeback_index = done_index;
2433
2434 out_writepages:
2435         wbc->nr_to_write -= nr_to_writebump;
2436         wbc->range_start = range_start;
2437         trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2438         return ret;
2439 }
2440
2441 #define FALL_BACK_TO_NONDELALLOC 1
2442 static int ext4_nonda_switch(struct super_block *sb)
2443 {
2444         s64 free_blocks, dirty_blocks;
2445         struct ext4_sb_info *sbi = EXT4_SB(sb);
2446
2447         /*
2448          * switch to non delalloc mode if we are running low
2449          * on free block. The free block accounting via percpu
2450          * counters can get slightly wrong with percpu_counter_batch getting
2451          * accumulated on each CPU without updating global counters
2452          * Delalloc need an accurate free block accounting. So switch
2453          * to non delalloc when we are near to error range.
2454          */
2455         free_blocks  = EXT4_C2B(sbi,
2456                 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
2457         dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
2458         if (2 * free_blocks < 3 * dirty_blocks ||
2459                 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
2460                 /*
2461                  * free block count is less than 150% of dirty blocks
2462                  * or free blocks is less than watermark
2463                  */
2464                 return 1;
2465         }
2466         /*
2467          * Even if we don't switch but are nearing capacity,
2468          * start pushing delalloc when 1/2 of free blocks are dirty.
2469          */
2470         if (free_blocks < 2 * dirty_blocks)
2471                 writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
2472
2473         return 0;
2474 }
2475
2476 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2477                                loff_t pos, unsigned len, unsigned flags,
2478                                struct page **pagep, void **fsdata)
2479 {
2480         int ret, retries = 0;
2481         struct page *page;
2482         pgoff_t index;
2483         struct inode *inode = mapping->host;
2484         handle_t *handle;
2485
2486         index = pos >> PAGE_CACHE_SHIFT;
2487
2488         if (ext4_nonda_switch(inode->i_sb)) {
2489                 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2490                 return ext4_write_begin(file, mapping, pos,
2491                                         len, flags, pagep, fsdata);
2492         }
2493         *fsdata = (void *)0;
2494         trace_ext4_da_write_begin(inode, pos, len, flags);
2495 retry:
2496         /*
2497          * With delayed allocation, we don't log the i_disksize update
2498          * if there is delayed block allocation. But we still need
2499          * to journalling the i_disksize update if writes to the end
2500          * of file which has an already mapped buffer.
2501          */
2502         handle = ext4_journal_start(inode, 1);
2503         if (IS_ERR(handle)) {
2504                 ret = PTR_ERR(handle);
2505                 goto out;
2506         }
2507         /* We cannot recurse into the filesystem as the transaction is already
2508          * started */
2509         flags |= AOP_FLAG_NOFS;
2510
2511         page = grab_cache_page_write_begin(mapping, index, flags);
2512         if (!page) {
2513                 ext4_journal_stop(handle);
2514                 ret = -ENOMEM;
2515                 goto out;
2516         }
2517         *pagep = page;
2518
2519         ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
2520         if (ret < 0) {
2521                 unlock_page(page);
2522                 ext4_journal_stop(handle);
2523                 page_cache_release(page);
2524                 /*
2525                  * block_write_begin may have instantiated a few blocks
2526                  * outside i_size.  Trim these off again. Don't need
2527                  * i_size_read because we hold i_mutex.
2528                  */
2529                 if (pos + len > inode->i_size)
2530                         ext4_truncate_failed_write(inode);
2531         }
2532
2533         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
2534                 goto retry;
2535 out:
2536         return ret;
2537 }
2538
2539 /*
2540  * Check if we should update i_disksize
2541  * when write to the end of file but not require block allocation
2542  */
2543 static int ext4_da_should_update_i_disksize(struct page *page,
2544                                             unsigned long offset)
2545 {
2546         struct buffer_head *bh;
2547         struct inode *inode = page->mapping->host;
2548         unsigned int idx;
2549         int i;
2550
2551         bh = page_buffers(page);
2552         idx = offset >> inode->i_blkbits;
2553
2554         for (i = 0; i < idx; i++)
2555                 bh = bh->b_this_page;
2556
2557         if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
2558                 return 0;
2559         return 1;
2560 }
2561
2562 static int ext4_da_write_end(struct file *file,
2563                              struct address_space *mapping,
2564                              loff_t pos, unsigned len, unsigned copied,
2565                              struct page *page, void *fsdata)
2566 {
2567         struct inode *inode = mapping->host;
2568         int ret = 0, ret2;
2569         handle_t *handle = ext4_journal_current_handle();
2570         loff_t new_i_size;
2571         unsigned long start, end;
2572         int write_mode = (int)(unsigned long)fsdata;
2573
2574         if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2575                 switch (ext4_inode_journal_mode(inode)) {
2576                 case EXT4_INODE_ORDERED_DATA_MODE:
2577                         return ext4_ordered_write_end(file, mapping, pos,
2578                                         len, copied, page, fsdata);
2579                 case EXT4_INODE_WRITEBACK_DATA_MODE:
2580                         return ext4_writeback_write_end(file, mapping, pos,
2581                                         len, copied, page, fsdata);
2582                 default:
2583                         BUG();
2584                 }
2585         }
2586
2587         trace_ext4_da_write_end(inode, pos, len, copied);
2588         start = pos & (PAGE_CACHE_SIZE - 1);
2589         end = start + copied - 1;
2590
2591         /*
2592          * generic_write_end() will run mark_inode_dirty() if i_size
2593          * changes.  So let's piggyback the i_disksize mark_inode_dirty
2594          * into that.
2595          */
2596
2597         new_i_size = pos + copied;
2598         if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
2599                 if (ext4_da_should_update_i_disksize(page, end)) {
2600                         down_write(&EXT4_I(inode)->i_data_sem);
2601                         if (new_i_size > EXT4_I(inode)->i_disksize) {
2602                                 /*
2603                                  * Updating i_disksize when extending file
2604                                  * without needing block allocation
2605                                  */
2606                                 if (ext4_should_order_data(inode))
2607                                         ret = ext4_jbd2_file_inode(handle,
2608                                                                    inode);
2609
2610                                 EXT4_I(inode)->i_disksize = new_i_size;
2611                         }
2612                         up_write(&EXT4_I(inode)->i_data_sem);
2613                         /* We need to mark inode dirty even if
2614                          * new_i_size is less that inode->i_size
2615                          * bu greater than i_disksize.(hint delalloc)
2616                          */
2617                         ext4_mark_inode_dirty(handle, inode);
2618                 }
2619         }
2620         ret2 = generic_write_end(file, mapping, pos, len, copied,
2621                                                         page, fsdata);
2622         copied = ret2;
2623         if (ret2 < 0)
2624                 ret = ret2;
2625         ret2 = ext4_journal_stop(handle);
2626         if (!ret)
2627                 ret = ret2;
2628
2629         return ret ? ret : copied;
2630 }
2631
/*
 * Invalidate (part of) a page of a delalloc mapping: give back the block
 * reservations held by its buffers, if it has any, then perform the normal
 * ext4 invalidation.  The page must be locked.
 */
static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
{
	BUG_ON(!PageLocked(page));

	/* only pages with buffers can hold reservations to drop */
	if (page_has_buffers(page))
		ext4_da_page_release_reservation(page, offset);

	ext4_invalidatepage(page, offset);
}
2648
2649 /*
2650  * Force all delayed allocation blocks to be allocated for a given inode.
2651  */
2652 int ext4_alloc_da_blocks(struct inode *inode)
2653 {
2654         trace_ext4_alloc_da_blocks(inode);
2655
2656         if (!EXT4_I(inode)->i_reserved_data_blocks &&
2657             !EXT4_I(inode)->i_reserved_meta_blocks)
2658                 return 0;
2659
2660         /*
2661          * We do something simple for now.  The filemap_flush() will
2662          * also start triggering a write of the data blocks, which is
2663          * not strictly speaking necessary (and for users of
2664          * laptop_mode, not even desirable).  However, to do otherwise
2665          * would require replicating code paths in:
2666          *
2667          * ext4_da_writepages() ->
2668          *    write_cache_pages() ---> (via passed in callback function)
2669          *        __mpage_da_writepage() -->
2670          *           mpage_add_bh_to_extent()
2671          *           mpage_da_map_blocks()
2672          *
2673          * The problem is that write_cache_pages(), located in
2674          * mm/page-writeback.c, marks pages clean in preparation for
2675          * doing I/O, which is not desirable if we're not planning on
2676          * doing I/O at all.
2677          *
2678          * We could call write_cache_pages(), and then redirty all of
2679          * the pages by calling redirty_page_for_writepage() but that
2680          * would be ugly in the extreme.  So instead we would need to
2681          * replicate parts of the code in the above functions,
2682          * simplifying them because we wouldn't actually intend to
2683          * write out the pages, but rather only collect contiguous
2684          * logical block extents, call the multi-block allocator, and
2685          * then update the buffer heads with the block allocations.
2686          *
2687          * For now, though, we'll cheat by calling filemap_flush(),
2688          * which will map the blocks, and start the I/O, but not
2689          * actually wait for the I/O to complete.
2690          */
2691         return filemap_flush(inode->i_mapping);
2692 }
2693
2694 /*
2695  * bmap() is special.  It gets used by applications such as lilo and by
2696  * the swapper to find the on-disk block of a specific piece of data.
2697  *
2698  * Naturally, this is dangerous if the block concerned is still in the
2699  * journal.  If somebody makes a swapfile on an ext4 data-journaling
2700  * filesystem and enables swap, then they may get a nasty shock when the
2701  * data getting swapped to that swapfile suddenly gets overwritten by
2702  * the original zero's written out previously to the journal and
2703  * awaiting writeback in the kernel's buffer cache.
2704  *
2705  * So, if we see any bmap calls here on a modified, data-journaled file,
2706  * take extra steps to flush any blocks which might be in the cache.
2707  */
2708 static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2709 {
2710         struct inode *inode = mapping->host;
2711         journal_t *journal;
2712         int err;
2713
2714         if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
2715                         test_opt(inode->i_sb, DELALLOC)) {
2716                 /*
2717                  * With delalloc we want to sync the file
2718                  * so that we can make sure we allocate
2719                  * blocks for file
2720                  */
2721                 filemap_write_and_wait(mapping);
2722         }
2723
2724         if (EXT4_JOURNAL(inode) &&
2725             ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
2726                 /*
2727                  * This is a REALLY heavyweight approach, but the use of
2728                  * bmap on dirty files is expected to be extremely rare:
2729                  * only if we run lilo or swapon on a freshly made file
2730                  * do we expect this to happen.
2731                  *
2732                  * (bmap requires CAP_SYS_RAWIO so this does not
2733                  * represent an unprivileged user DOS attack --- we'd be
2734                  * in trouble if mortal users could trigger this path at
2735                  * will.)
2736                  *
2737                  * NB. EXT4_STATE_JDATA is not set on files other than
2738                  * regular files.  If somebody wants to bmap a directory
2739                  * or symlink and gets confused because the buffer
2740                  * hasn't yet been flushed to disk, they deserve
2741                  * everything they get.
2742                  */
2743
2744                 ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
2745                 journal = EXT4_JOURNAL(inode);
2746                 jbd2_journal_lock_updates(journal);
2747                 err = jbd2_journal_flush(journal);
2748                 jbd2_journal_unlock_updates(journal);
2749
2750                 if (err)
2751                         return 0;
2752         }
2753
2754         return generic_block_bmap(mapping, block, ext4_get_block);
2755 }
2756
2757 static int ext4_readpage(struct file *file, struct page *page)
2758 {
2759         trace_ext4_readpage(page);
2760         return mpage_readpage(page, ext4_get_block);
2761 }
2762
2763 static int
2764 ext4_readpages(struct file *file, struct address_space *mapping,
2765                 struct list_head *pages, unsigned nr_pages)
2766 {
2767         return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2768 }
2769
2770 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
2771 {
2772         struct buffer_head *head, *bh;
2773         unsigned int curr_off = 0;
2774
2775         if (!page_has_buffers(page))
2776                 return;
2777         head = bh = page_buffers(page);
2778         do {
2779                 if (offset <= curr_off && test_clear_buffer_uninit(bh)
2780                                         && bh->b_private) {
2781                         ext4_free_io_end(bh->b_private);
2782                         bh->b_private = NULL;
2783                         bh->b_end_io = NULL;
2784                 }
2785                 curr_off = curr_off + bh->b_size;
2786                 bh = bh->b_this_page;
2787         } while (bh != head);
2788 }
2789
2790 static void ext4_invalidatepage(struct page *page, unsigned long offset)
2791 {
2792         journal_t *journal = EXT4_JOURNAL(page->mapping->host);
2793
2794         trace_ext4_invalidatepage(page, offset);
2795
2796         /*
2797          * free any io_end structure allocated for buffers to be discarded
2798          */
2799         if (ext4_should_dioread_nolock(page->mapping->host))
2800                 ext4_invalidatepage_free_endio(page, offset);
2801         /*
2802          * If it's a full truncate we just forget about the pending dirtying
2803          */
2804         if (offset == 0)
2805                 ClearPageChecked(page);
2806
2807         if (journal)
2808                 jbd2_journal_invalidatepage(journal, page, offset);
2809         else
2810                 block_invalidatepage(page, offset);
2811 }
2812
2813 static int ext4_releasepage(struct page *page, gfp_t wait)
2814 {
2815         journal_t *journal = EXT4_JOURNAL(page->mapping->host);
2816
2817         trace_ext4_releasepage(page);
2818
2819         WARN_ON(PageChecked(page));
2820         if (!page_has_buffers(page))
2821                 return 0;
2822         if (journal)
2823                 return jbd2_journal_try_to_free_buffers(journal, page, wait);
2824         else
2825                 return try_to_free_buffers(page);
2826 }
2827
2828 /*
2829  * ext4_get_block used when preparing for a DIO write or buffer write.
2830  * We allocate an uinitialized extent if blocks haven't been allocated.
2831  * The extent will be converted to initialized after the IO is complete.
2832  */
2833 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
2834                    struct buffer_head *bh_result, int create)
2835 {
2836         ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
2837                    inode->i_ino, create);
2838         return _ext4_get_block(inode, iblock, bh_result,
2839                                EXT4_GET_BLOCKS_IO_CREATE_EXT);
2840 }
2841
2842 static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
2843                    struct buffer_head *bh_result, int flags)
2844 {
2845         handle_t *handle = ext4_journal_current_handle();
2846         struct ext4_map_blocks map;
2847         int ret = 0;
2848
2849         ext4_debug("ext4_get_block_write_nolock: inode %lu, flag %d\n",
2850                    inode->i_ino, flags);
2851
2852         flags = EXT4_GET_BLOCKS_NO_LOCK;
2853
2854         map.m_lblk = iblock;
2855         map.m_len = bh_result->b_size >> inode->i_blkbits;
2856
2857         ret = ext4_map_blocks(handle, inode, &map, flags);
2858         if (ret > 0) {
2859                 map_bh(bh_result, inode->i_sb, map.m_pblk);
2860                 bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
2861                                         map.m_flags;
2862                 bh_result->b_size = inode->i_sb->s_blocksize * map.m_len;
2863                 ret = 0;
2864         }
2865         return ret;
2866 }
2867
2868 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2869                             ssize_t size, void *private, int ret,
2870                             bool is_async)
2871 {
2872         struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
2873         ext4_io_end_t *io_end = iocb->private;
2874         struct workqueue_struct *wq;
2875         unsigned long flags;
2876         struct ext4_inode_info *ei;
2877
2878         /* if not async direct IO or dio with 0 bytes write, just return */
2879         if (!io_end || !size)
2880                 goto out;
2881
2882         ext_debug("ext4_end_io_dio(): io_end 0x%p "
2883                   "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
2884                   iocb->private, io_end->inode->i_ino, iocb, offset,
2885                   size);
2886
2887         iocb->private = NULL;
2888
2889         /* if not aio dio with unwritten extents, just free io and return */
2890         if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
2891                 ext4_free_io_end(io_end);
2892 out:
2893                 if (is_async)
2894                         aio_complete(iocb, ret, 0);
2895                 inode_dio_done(inode);
2896                 return;
2897         }
2898
2899         io_end->offset = offset;
2900         io_end->size = size;
2901         if (is_async) {
2902                 io_end->iocb = iocb;
2903                 io_end->result = ret;
2904         }
2905         wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
2906
2907         /* Add the io_end to per-inode completed aio dio list*/
2908         ei = EXT4_I(io_end->inode);
2909         spin_lock_irqsave(&ei->i_completed_io_lock, flags);
2910         list_add_tail(&io_end->list, &ei->i_completed_io_list);
2911         spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
2912
2913         /* queue the work to convert unwritten extents to written */
2914         queue_work(wq, &io_end->work);
2915 }
2916
2917 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2918 {
2919         ext4_io_end_t *io_end = bh->b_private;
2920         struct workqueue_struct *wq;
2921         struct inode *inode;
2922         unsigned long flags;
2923
2924         if (!test_clear_buffer_uninit(bh) || !io_end)
2925                 goto out;
2926
2927         if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
2928                 ext4_msg(io_end->inode->i_sb, KERN_INFO,
2929                          "sb umounted, discard end_io request for inode %lu",
2930                          io_end->inode->i_ino);
2931                 ext4_free_io_end(io_end);
2932                 goto out;
2933         }
2934
2935         /*
2936          * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
2937          * but being more careful is always safe for the future change.
2938          */
2939         inode = io_end->inode;
2940         ext4_set_io_unwritten_flag(inode, io_end);
2941
2942         /* Add the io_end to per-inode completed io list*/
2943         spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
2944         list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
2945         spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
2946
2947         wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
2948         /* queue the work to convert unwritten extents to written */
2949         queue_work(wq, &io_end->work);
2950 out:
2951         bh->b_private = NULL;
2952         bh->b_end_io = NULL;
2953         clear_buffer_uninit(bh);
2954         end_buffer_async_write(bh, uptodate);
2955 }
2956
2957 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
2958 {
2959         ext4_io_end_t *io_end;
2960         struct page *page = bh->b_page;
2961         loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
2962         size_t size = bh->b_size;
2963
2964 retry:
2965         io_end = ext4_init_io_end(inode, GFP_ATOMIC);
2966         if (!io_end) {
2967                 pr_warn_ratelimited("%s: allocation fail\n", __func__);
2968                 schedule();
2969                 goto retry;
2970         }
2971         io_end->offset = offset;
2972         io_end->size = size;
2973         /*
2974          * We need to hold a reference to the page to make sure it
2975          * doesn't get evicted before ext4_end_io_work() has a chance
2976          * to convert the extent from written to unwritten.
2977          */
2978         io_end->page = page;
2979         get_page(io_end->page);
2980
2981         bh->b_private = io_end;
2982         bh->b_end_io = ext4_end_io_buffer_write;
2983         return 0;
2984 }
2985
2986 /*
2987  * For ext4 extent files, ext4 can do direct-IO writes to holes and
2988  * preallocated extents without falling back to buffered IO.  Only
2989  * writes that end at or before i_size take this path (see the end).
2990  *
2991  * For holes, we fallocate those blocks and mark them as uninitialized.
2992  * If those blocks were preallocated, we make sure they are split, but
2993  * still keep the range to write as uninitialized.
2994  *
2995  * The unwritten extents will be converted to written when DIO is completed.
2996  * For async direct IO, since the IO may still be pending on return, we
2997  * set up an end_io callback function, which will do the conversion
2998  * when the async direct IO has completed.
2999  *
3000  * If the O_DIRECT write will extend the file then add this inode to the
3001  * orphan list.  So recovery will truncate it back to the original size
3002  * if the machine crashes during the write.
      * NOTE(review): no orphan-list handling is visible in this function;
      * extending writes are handed to ext4_ind_direct_IO() below, so it
      * presumably happens there -- confirm.
3003  *
      * Returns whatever __blockdev_direct_IO() / ext4_ind_direct_IO() return,
      * -ENOMEM if the io_end for an async request cannot be allocated, or
      * the error from ext4_convert_unwritten_extents().
3004  */
3005 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3006                               const struct iovec *iov, loff_t offset,
3007                               unsigned long nr_segs)
3008 {
3009         struct file *file = iocb->ki_filp;
3010         struct inode *inode = file->f_mapping->host;
3011         ssize_t ret;
3012         size_t count = iov_length(iov, nr_segs);
3013
3014         loff_t final_size = offset + count;
3015         if (rw == WRITE && final_size <= inode->i_size) {
3016                 int overwrite = 0;
3017
                 /* The caller hands over a pointer to its overwrite flag here */
3018                 BUG_ON(iocb->private == NULL);
3019
3020                 /* If we do an overwrite dio, i_mutex locking can be released */
3021                 overwrite = *((int *)iocb->private);
3022
3023                 if (overwrite) {
3024                         down_read(&EXT4_I(inode)->i_data_sem);
3025                         mutex_unlock(&inode->i_mutex);
3026                 }
3027
3028                 /*
3029                  * We could direct write to holes and fallocate.
3030                  *
3031                  * Allocated blocks to fill the hole are marked as uninitialized
3032                  * to prevent a parallel buffered read from exposing stale data
3033                  * before DIO completes the data IO.
3034                  *
3035                  * As to previously fallocated extents, ext4 get_block
3036                  * will just simply mark the buffer mapped but still
3037                  * keep the extents uninitialized.
3038                  *
3039                  * for the non-AIO case, we will convert those unwritten extents
3040                  * to written after returning from blockdev_direct_IO.
3041                  *
3042                  * for async DIO, the conversion needs to be deferred until
3043                  * the IO is completed. The ext4 end_io callback function
3044                  * will be called to take care of the conversion work.
3045                  * Here for the async case, we allocate an io_end structure to
3046                  * hook to the iocb.
3047                  */
3048                 iocb->private = NULL;
3049                 EXT4_I(inode)->cur_aio_dio = NULL;
3050                 if (!is_sync_kiocb(iocb)) {
3051                         ext4_io_end_t *io_end =
3052                                 ext4_init_io_end(inode, GFP_NOFS);
3053                         if (!io_end) {
3054                                 ret = -ENOMEM;
3055                                 goto retake_lock;
3056                         }
3057                         io_end->flag |= EXT4_IO_END_DIRECT;
3058                         iocb->private = io_end;
3059                         /*
3060                          * we save the io structure for the current async
3061                          * direct IO, so that later ext4_map_blocks()
3062                          * can flag the io structure if there are
3063                          * unwritten extents that need to be converted
3064                          * when IO is completed.
3065                          */
3066                         EXT4_I(inode)->cur_aio_dio = iocb->private;
3067                 }
3068
                 /*
                  * The overwrite path runs without DIO_LOCKING and with the
                  * _nolock get_block variant; i_mutex was dropped above.
                  */
3069                 if (overwrite)
3070                         ret = __blockdev_direct_IO(rw, iocb, inode,
3071                                                  inode->i_sb->s_bdev, iov,
3072                                                  offset, nr_segs,
3073                                                  ext4_get_block_write_nolock,
3074                                                  ext4_end_io_dio,
3075                                                  NULL,
3076                                                  0);
3077                 else
3078                         ret = __blockdev_direct_IO(rw, iocb, inode,
3079                                                  inode->i_sb->s_bdev, iov,
3080                                                  offset, nr_segs,
3081                                                  ext4_get_block_write,
3082                                                  ext4_end_io_dio,
3083                                                  NULL,
3084                                                  DIO_LOCKING);
3085                 if (iocb->private)
3086                         EXT4_I(inode)->cur_aio_dio = NULL;
3087                 /*
3088                  * The io_end structure takes a reference to the inode,
3089                  * that structure needs to be destroyed and the
3090                  * reference to the inode needs to be dropped when IO is
3091                  * complete, even with a 0 byte write, or on failure.
3092                  *
3093                  * In the successful AIO DIO case, the io_end structure will be
3094                  * destroyed and the reference to the inode will be dropped
3095                  * after the end_io callback function is called.
3096                  *
3097                  * In the case of a 0 byte write, or the error case, since
3098                  * VFS direct IO won't invoke the end_io callback function,
3099                  * we need to free the end_io structure here.
3100                  */
3101                 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3102                         ext4_free_io_end(iocb->private);
3103                         iocb->private = NULL;
3104                 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3105                                                 EXT4_STATE_DIO_UNWRITTEN)) {
3106                         int err;
3107                         /*
3108                          * for the non-AIO case, since the IO is already
3109                          * completed, we can do the conversion right here
3110                          */
3111                         err = ext4_convert_unwritten_extents(inode,
3112                                                              offset, ret);
3113                         if (err < 0)
3114                                 ret = err;
3115                         ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3116                 }
3117
3118         retake_lock:
3119                 /* take i_mutex locking again if we do an overwrite dio */
3120                 if (overwrite) {
3121                         up_read(&EXT4_I(inode)->i_data_sem);
3122                         mutex_lock(&inode->i_mutex);
3123                 }
3124
3125                 return ret;
3126         }
3127
3128         /* reads, and writes that end beyond i_size, fall back to the old way */
3129         return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3130 }
3131
3132 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3133                               const struct iovec *iov, loff_t offset,
3134                               unsigned long nr_segs)
3135 {
3136         struct file *file = iocb->ki_filp;
3137         struct inode *inode = file->f_mapping->host;
3138         ssize_t ret;
3139
3140         /*
3141          * If we are doing data journalling we don't support O_DIRECT
3142          */
3143         if (ext4_should_journal_data(inode))
3144                 return 0;
3145
3146         trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
3147         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3148                 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3149         else
3150                 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3151         trace_ext4_direct_IO_exit(inode, offset,
3152                                 iov_length(iov, nr_segs), rw, ret);
3153         return ret;
3154 }
3155
/*
 * ->set_page_dirty for data-journalled inodes.
 *
 * A page can become dirty at any moment, independently of ext4's journalling
 * activity (filemap_sync_pte(), try_to_unmap_one(), ...).  Very little can be
 * done at that point: ->set_page_dirty runs under VFS locks and the page is
 * not necessarily locked.
 *
 * Dirtying only the page while leaving its buffers clean is not an option,
 * since the buffers' dirty state is "definitive"; marking the buffers dirty
 * or jbddirty is not an option either, because the journalling code would
 * explode.
 *
 * Instead the page is tagged "pending dirty" (PageChecked) and the next
 * writepage call propagates that into the buffers as appropriate.
 */
static int ext4_journalled_set_page_dirty(struct page *page)
{
        int dirtied;

        SetPageChecked(page);
        dirtied = __set_page_dirty_nobuffers(page);
        return dirtied;
}
3174
/*
 * Address-space operations installed by ext4_set_aops() for inodes in
 * ordered data mode when the DELALLOC mount option is not set.
 */
3175 static const struct address_space_operations ext4_ordered_aops = {
3176         .readpage               = ext4_readpage,
3177         .readpages              = ext4_readpages,
3178         .writepage              = ext4_writepage,
3179         .write_begin            = ext4_write_begin,
3180         .write_end              = ext4_ordered_write_end,
3181         .bmap                   = ext4_bmap,
3182         .invalidatepage         = ext4_invalidatepage,
3183         .releasepage            = ext4_releasepage,
3184         .direct_IO              = ext4_direct_IO,
3185         .migratepage            = buffer_migrate_page,
3186         .is_partially_uptodate  = block_is_partially_uptodate,
3187         .error_remove_page      = generic_error_remove_page,
3188 };
3189
/*
 * Address-space operations installed by ext4_set_aops() for inodes in
 * writeback data mode when the DELALLOC mount option is not set.  Differs
 * from the ordered table only in its ->write_end handler.
 */
3190 static const struct address_space_operations ext4_writeback_aops = {
3191         .readpage               = ext4_readpage,
3192         .readpages              = ext4_readpages,
3193         .writepage              = ext4_writepage,
3194         .write_begin            = ext4_write_begin,
3195         .write_end              = ext4_writeback_write_end,
3196         .bmap                   = ext4_bmap,
3197         .invalidatepage         = ext4_invalidatepage,
3198         .releasepage            = ext4_releasepage,
3199         .direct_IO              = ext4_direct_IO,
3200         .migratepage            = buffer_migrate_page,
3201         .is_partially_uptodate  = block_is_partially_uptodate,
3202         .error_remove_page      = generic_error_remove_page,
3203 };
3204
/*
 * Address-space operations installed by ext4_set_aops() for inodes in
 * journalled data mode.  This is the only table that overrides
 * ->set_page_dirty, and it provides no ->migratepage handler.
 */
3205 static const struct address_space_operations ext4_journalled_aops = {
3206         .readpage               = ext4_readpage,
3207         .readpages              = ext4_readpages,
3208         .writepage              = ext4_writepage,
3209         .write_begin            = ext4_write_begin,
3210         .write_end              = ext4_journalled_write_end,
3211         .set_page_dirty         = ext4_journalled_set_page_dirty,
3212         .bmap                   = ext4_bmap,
3213         .invalidatepage         = ext4_invalidatepage,
3214         .releasepage            = ext4_releasepage,
3215         .direct_IO              = ext4_direct_IO,
3216         .is_partially_uptodate  = block_is_partially_uptodate,
3217         .error_remove_page      = generic_error_remove_page,
3218 };
3219
/*
 * Address-space operations installed by ext4_set_aops() for inodes in
 * ordered or writeback data mode when the DELALLOC mount option is set.
 * This is the only table providing ->writepages, and it uses the
 * delayed-allocation variants of write_begin, write_end and invalidatepage.
 */
3220 static const struct address_space_operations ext4_da_aops = {
3221         .readpage               = ext4_readpage,
3222         .readpages              = ext4_readpages,
3223         .writepage              = ext4_writepage,
3224         .writepages             = ext4_da_writepages,
3225         .write_begin            = ext4_da_write_begin,
3226         .write_end              = ext4_da_write_end,
3227         .bmap                   = ext4_bmap,
3228         .invalidatepage         = ext4_da_invalidatepage,
3229         .releasepage            = ext4_releasepage,
3230         .direct_IO              = ext4_direct_IO,
3231         .migratepage            = buffer_migrate_page,
3232         .is_partially_uptodate  = block_is_partially_uptodate,
3233         .error_remove_page      = generic_error_remove_page,
3234 };
3235
3236 void ext4_set_aops(struct inode *inode)
3237 {
3238         switch (ext4_inode_journal_mode(inode)) {
3239         case EXT4_INODE_ORDERED_DATA_MODE:
3240                 if (test_opt(inode->i_sb, DELALLOC))
3241                         inode->i_mapping->a_ops = &ext4_da_aops;
3242                 else
3243                         inode->i_mapping->a_ops = &ext4_ordered_aops;
3244                 break;
3245         case EXT4_INODE_WRITEBACK_DATA_MODE:
3246                 if (test_opt(inode->i_sb, DELALLOC))
3247                         inode->i_mapping->a_ops = &ext4_da_aops;
3248                 else
3249                         inode->i_mapping->a_ops = &ext4_writeback_aops;
3250                 break;
3251         case EXT4_INODE_JOURNAL_DATA_MODE:
3252                 inode->i_mapping->a_ops = &ext4_journalled_aops;
3253                 break;
3254         default:
3255                 BUG();
3256         }
3257 }
3258
3259
3260 /*
3261  * ext4_discard_partial_page_buffers()
3262  * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
3263  * This function finds and locks the page containing the offset
3264  * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
3265  * Calling functions that already have the page locked should call
3266  * ext4_discard_partial_page_buffers_no_lock directly.
3267  */
3268 int ext4_discard_partial_page_buffers(handle_t *handle,
3269                 struct address_space *mapping, loff_t from,
3270                 loff_t length, int flags)
3271 {
3272         struct inode *inode = mapping->host;
3273         struct page *page;
3274         int err = 0;
3275
3276         page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3277                                    mapping_gfp_mask(mapping) & ~__GFP_FS);
3278         if (!page)
3279                 return -ENOMEM;
3280
3281         err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
3282                 from, length, flags);
3283
3284         unlock_page(page);
3285         page_cache_release(page);
3286         return err;
3287 }
3288
3289 /*
3290  * ext4_discard_partial_page_buffers_no_lock()
3291  * Zeros a page range of length 'length' starting from offset 'from'.
3292  * Buffer heads that correspond to the block aligned regions of the
3293  * zeroed range will be unmapped.  Non-block-aligned regions
3294  * will have the corresponding buffer head mapped if needed so that
3295  * that region of the page can be updated with the partial zero out.
3296  *
3297  * This function assumes that the page has already been locked.
3298  * The range to be discarded must be contained within the given page.
3299  * If the specified range exceeds the end of the page it will be shortened
3300  * to the end of the page that corresponds to 'from'.  This function is
3301  * appropriate for updating a page and its buffer heads to be unmapped and
3302  * zeroed for blocks that have been either released, or are going to be
3303  * released.
3304  *
3305  * handle: The journal handle
3306  * inode:  The file's inode
3307  * page:   A locked page that contains the offset "from"
3308  * from:   The starting byte offset (from the beginning of the file)
3309  *         to begin discarding
3310  * length: The number of bytes to discard
3311  * flags:  Optional flags that may be used:
3312  *
3313  *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
3314  *         Only zero the regions of the page whose buffer heads
3315  *         have already been unmapped.  This flag is appropriate
3316  *         for updating the contents of a page whose blocks may
3317  *         have already been released, and we only want to zero
3318  *         out the regions that correspond to those released blocks.
3319  *
3320  * Returns zero on success or negative on failure; -EINVAL if 'from' does
      * not fall inside 'page'.
      *
      * NOTE(review): err is reset to 0 at the top of every loop iteration,
      * so the value returned reflects only the last buffer processed; an
      * -EIO or journal error on an earlier buffer is dropped if a later
      * buffer succeeds.  Confirm whether that is intended.
3321  */
3322 static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3323                 struct inode *inode, struct page *page, loff_t from,
3324                 loff_t length, int flags)
3325 {
3326         ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3327         unsigned int offset = from & (PAGE_CACHE_SIZE-1);
3328         unsigned int blocksize, max, pos;
3329         ext4_lblk_t iblock;
3330         struct buffer_head *bh;
3331         int err = 0;
3332
3333         blocksize = inode->i_sb->s_blocksize;
         /* Number of bytes from 'from' to the end of the page */
3334         max = PAGE_CACHE_SIZE - offset;
3335
3336         if (index != page->index)
3337                 return -EINVAL;
3338
3339         /*
3340          * correct length if it does not fall between
3341          * 'from' and the end of the page
3342          */
3343         if (length > max || length < 0)
3344                 length = max;
3345
         /* Logical block number of the first block in this page */
3346         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3347
3348         if (!page_has_buffers(page))
3349                 create_empty_buffers(page, blocksize, 0);
3350
3351         /* Find the buffer that contains "offset" */
3352         bh = page_buffers(page);
3353         pos = blocksize;
3354         while (offset >= pos) {
3355                 bh = bh->b_this_page;
3356                 iblock++;
3357                 pos += blocksize;
3358         }
3359
         /* Walk the range one buffer head at a time */
3360         pos = offset;
3361         while (pos < offset + length) {
3362                 unsigned int end_of_block, range_to_discard;
3363
3364                 err = 0;
3365
3366                 /* The length of space left to zero and unmap */
3367                 range_to_discard = offset + length - pos;
3368
3369                 /* The length of space until the end of the block */
3370                 end_of_block = blocksize - (pos & (blocksize-1));
3371
3372                 /*
3373                  * Do not unmap or zero past end of block
3374                  * for this buffer head
3375                  */
3376                 if (range_to_discard > end_of_block)
3377                         range_to_discard = end_of_block;
3378
3379
3380                 /*
3381                  * Skip this buffer head if we are only zeroing unmapped
3382                  * regions of the page
3383                  */
3384                 if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
3385                         buffer_mapped(bh))
3386                                 goto next;
3387
3388                 /* If the range is block aligned, unmap */
3389                 if (range_to_discard == blocksize) {
3390                         clear_buffer_dirty(bh);
3391                         bh->b_bdev = NULL;
3392                         clear_buffer_mapped(bh);
3393                         clear_buffer_req(bh);
3394                         clear_buffer_new(bh);
3395                         clear_buffer_delay(bh);
3396                         clear_buffer_unwritten(bh);
3397                         clear_buffer_uptodate(bh);
3398                         zero_user(page, pos, range_to_discard);
3399                         BUFFER_TRACE(bh, "Buffer discarded");
3400                         goto next;
3401                 }
3402
3403                 /*
3404                  * If this block is not completely contained in the range
3405                  * to be discarded, then it is not going to be released. Because
3406                  * we need to keep this block, we need to make sure this part
3407                  * of the page is uptodate before we modify it by writing
3408                  * partial zeros on it.
3409                  */
3410                 if (!buffer_mapped(bh)) {
3411                         /*
3412                          * Buffer head must be mapped before we can read
3413                          * from the block
                          *
                          * The return value of ext4_get_block() is not
                          * checked; only the resulting mapped state is.
3414                          */
3415                         BUFFER_TRACE(bh, "unmapped");
3416                         ext4_get_block(inode, iblock, bh, 0);
3417                         /* unmapped? It's a hole - nothing to do */
3418                         if (!buffer_mapped(bh)) {
3419                                 BUFFER_TRACE(bh, "still unmapped");
3420                                 goto next;
3421                         }
3422                 }
3423
3424                 /* Ok, it's mapped. Make sure it's up-to-date */
3425                 if (PageUptodate(page))
3426                         set_buffer_uptodate(bh);
3427
3428                 if (!buffer_uptodate(bh)) {
                         /* Assume failure until the read proves otherwise */
3429                         err = -EIO;
3430                         ll_rw_block(READ, 1, &bh);
3431                         wait_on_buffer(bh);
3432                         /* Uhhuh. Read error. Complain and punt.*/
3433                         if (!buffer_uptodate(bh))
3434                                 goto next;
3435                 }
3436
3437                 if (ext4_should_journal_data(inode)) {
3438                         BUFFER_TRACE(bh, "get write access");
3439                         err = ext4_journal_get_write_access(handle, bh);
3440                         if (err)
3441                                 goto next;
3442                 }
3443
3444                 zero_user(page, pos, range_to_discard);
3445
                 /* Clears the provisional -EIO set before a successful read */
3446                 err = 0;
3447                 if (ext4_should_journal_data(inode)) {
3448                         err = ext4_handle_dirty_metadata(handle, inode, bh);
3449                 } else
3450                         mark_buffer_dirty(bh);
3451
3452                 BUFFER_TRACE(bh, "Partial buffer zeroed");
3453 next:
3454                 bh = bh->b_this_page;
3455                 iblock++;
3456                 pos += range_to_discard;
3457         }
3458
3459         return err;
3460 }
3461
3462 int ext4_can_truncate(struct inode *inode)
3463 {
3464         if (S_ISREG(inode->i_mode))
3465                 return 1;
3466         if (S_ISDIR(inode->i_mode))
3467                 return 1;
3468         if (S_ISLNK(inode->i_mode))
3469                 return !ext4_inode_is_fast_symlink(inode);
3470         return 0;
3471 }
3472
3473 /*
3474  * ext4_punch_hole: punches a hole in a file by releaseing the blocks
3475  * associated with the given offset and length
3476  *
3477  * @inode:  File inode
3478  * @offset: The offset where the hole will begin
3479  * @len:    The length of the hole
3480  *
3481  * Returns: 0 on sucess or negative on failure
3482  */
3483
3484 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3485 {
3486         struct inode *inode = file->f_path.dentry->d_inode;
3487         if (!S_ISREG(inode->i_mode))
3488                 return -EOPNOTSUPP;
3489
3490         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3491                 /* TODO: Add support for non extent hole punching */
3492                 return -EOPNOTSUPP;
3493         }
3494
3495         if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
3496                 /* TODO: Add support for bigalloc file systems */
3497                 return -EOPNOTSUPP;
3498         }
3499
3500         return ext4_ext_punch_hole(file, offset, length);
3501 }
3502
3503 /*
3504  * ext4_truncate()
3505  *
3506  * We block out ext4_get_block() block instantiations across the entire
3507  * transaction, and VFS/VM ensures that ext4_truncate() cannot run
3508  * simultaneously on behalf of the same inode.
3509  *
3510  * As we work through the truncate and commit bits of it to the journal there
3511  * is one core, guiding principle: the file's tree must always be consistent on
3512  * disk.  We must be able to restart the truncate after a crash.
3513  *
3514  * The file's tree may be transiently inconsistent in memory (although it
3515  * probably isn't), but whenever we close off and commit a journal transaction,
3516  * the contents of (the filesystem + the journal) must be consistent and
3517  * restartable.  It's pretty simple, really: bottom up, right to left (although
3518  * left-to-right works OK too).
3519  *
3520  * Note that at recovery time, journal replay occurs *before* the restart of
3521  * truncate against the orphan inode list.
3522  *
3523  * The committed inode has the new, desired i_size (which is the same as
3524  * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
3525  * that this inode's truncate did not complete and it will again call
3526  * ext4_truncate() to have another go.  So there will be instantiated blocks
3527  * to the right of the truncation point in a crashed ext4 filesystem.  But
3528  * that's fine - as long as they are linked from the inode, the post-crash
3529  * ext4_truncate() run will find them and release them.
3530  */
3531 void ext4_truncate(struct inode *inode)
3532 {
3533         trace_ext4_truncate_enter(inode);
3534
3535         if (!ext4_can_truncate(inode))
3536                 return;
3537
3538         ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3539
3540         if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3541                 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
3542
3543         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3544                 ext4_ext_truncate(inode);
3545         else
3546                 ext4_ind_truncate(inode);
3547
3548         trace_ext4_truncate_exit(inode);
3549 }
3550
3551 /*
3552  * ext4_get_inode_loc returns with an extra refcount against the inode's
3553  * underlying buffer_head on success. If 'in_mem' is true, we have all
3554  * data in memory that is needed to recreate the on-disk version of this
3555  * inode.
3556  */
3557 static int __ext4_get_inode_loc(struct inode *inode,
3558                                 struct ext4_iloc *iloc, int in_mem)
3559 {
3560         struct ext4_group_desc  *gdp;
3561         struct buffer_head      *bh;
3562         struct super_block      *sb = inode->i_sb;
3563         ext4_fsblk_t            block;
3564         int                     inodes_per_block, inode_offset;
3565
3566         iloc->bh = NULL;
3567         if (!ext4_valid_inum(sb, inode->i_ino))
3568                 return -EIO;
3569
3570         iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
3571         gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
3572         if (!gdp)
3573                 return -EIO;
3574
3575         /*
3576          * Figure out the offset within the block group inode table
3577          */
3578         inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
3579         inode_offset = ((inode->i_ino - 1) %
3580                         EXT4_INODES_PER_GROUP(sb));
3581         block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
3582         iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
3583
3584         bh = sb_getblk(sb, block);
3585         if (!bh) {
3586                 EXT4_ERROR_INODE_BLOCK(inode, block,
3587                                        "unable to read itable block");
3588                 return -EIO;
3589         }
3590         if (!buffer_uptodate(bh)) {
3591                 lock_buffer(bh);
3592
3593                 /*
3594                  * If the buffer has the write error flag, we have failed
3595                  * to write out another inode in the same block.  In this
3596                  * case, we don't have to read the block because we may
3597                  * read the old inode data successfully.
3598                  */
3599                 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
3600                         set_buffer_uptodate(bh);
3601
3602                 if (buffer_uptodate(bh)) {
3603                         /* someone brought it uptodate while we waited */
3604                         unlock_buffer(bh);
3605                         goto has_buffer;
3606                 }
3607
3608                 /*
3609                  * If we have all information of the inode in memory and this
3610                  * is the only valid inode in the block, we need not read the
3611                  * block.
3612                  */
3613                 if (in_mem) {
3614                         struct buffer_head *bitmap_bh;
3615                         int i, start;
3616
3617                         start = inode_offset & ~(inodes_per_block - 1);
3618
3619                         /* Is the inode bitmap in cache? */
3620                         bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
3621                         if (!bitmap_bh)
3622                                 goto make_io;
3623
3624                         /*
3625                          * If the inode bitmap isn't in cache then the
3626                          * optimisation may end up performing two reads instead
3627                          * of one, so skip it.
3628                          */
3629                         if (!buffer_uptodate(bitmap_bh)) {
3630                                 brelse(bitmap_bh);
3631                                 goto make_io;
3632                         }
3633                         for (i = start; i < start + inodes_per_block; i++) {
3634                                 if (i == inode_offset)
3635                                         continue;
3636                                 if (ext4_test_bit(i, bitmap_bh->b_data))
3637                                         break;
3638                         }
3639                         brelse(bitmap_bh);
3640                         if (i == start + inodes_per_block) {
3641                                 /* all other inodes are free, so skip I/O */
3642                                 memset(bh->b_data, 0, bh->b_size);
3643                                 set_buffer_uptodate(bh);
3644                                 unlock_buffer(bh);
3645                                 goto has_buffer;
3646                         }
3647                 }
3648
3649 make_io:
3650                 /*
3651                  * If we need to do any I/O, try to pre-readahead extra
3652                  * blocks from the inode table.
3653                  */
3654                 if (EXT4_SB(sb)->s_inode_readahead_blks) {
3655                         ext4_fsblk_t b, end, table;
3656                         unsigned num;
3657
3658                         table = ext4_inode_table(sb, gdp);
3659                         /* s_inode_readahead_blks is always a power of 2 */
3660                         b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
3661                         if (table > b)
3662                                 b = table;
3663                         end = b + EXT4_SB(sb)->s_inode_readahead_blks;
3664                         num = EXT4_INODES_PER_GROUP(sb);
3665                         if (ext4_has_group_desc_csum(sb))
3666                                 num -= ext4_itable_unused_count(sb, gdp);
3667                         table += num / inodes_per_block;
3668                         if (end > table)
3669                                 end = table;
3670                         while (b <= end)
3671                                 sb_breadahead(sb, b++);
3672                 }
3673
3674                 /*
3675                  * There are other valid inodes in the buffer, this inode
3676                  * has in-inode xattrs, or we don't have this inode in memory.
3677                  * Read the block from disk.
3678                  */
3679                 trace_ext4_load_inode(inode);
3680                 get_bh(bh);
3681                 bh->b_end_io = end_buffer_read_sync;
3682                 submit_bh(READ | REQ_META | REQ_PRIO, bh);
3683                 wait_on_buffer(bh);
3684                 if (!buffer_uptodate(bh)) {
3685                         EXT4_ERROR_INODE_BLOCK(inode, block,
3686                                                "unable to read itable block");
3687                         brelse(bh);
3688                         return -EIO;
3689                 }
3690         }
3691 has_buffer:
3692         iloc->bh = bh;
3693         return 0;
3694 }
3695
3696 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
3697 {
3698         /* We have all inode data except xattrs in memory here. */
3699         return __ext4_get_inode_loc(inode, iloc,
3700                 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
3701 }
3702
3703 void ext4_set_inode_flags(struct inode *inode)
3704 {
3705         unsigned int flags = EXT4_I(inode)->i_flags;
3706
3707         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
3708         if (flags & EXT4_SYNC_FL)
3709                 inode->i_flags |= S_SYNC;
3710         if (flags & EXT4_APPEND_FL)
3711                 inode->i_flags |= S_APPEND;
3712         if (flags & EXT4_IMMUTABLE_FL)
3713                 inode->i_flags |= S_IMMUTABLE;
3714         if (flags & EXT4_NOATIME_FL)
3715                 inode->i_flags |= S_NOATIME;
3716         if (flags & EXT4_DIRSYNC_FL)
3717                 inode->i_flags |= S_DIRSYNC;
3718 }
3719
3720 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
3721 void ext4_get_inode_flags(struct ext4_inode_info *ei)
3722 {
3723         unsigned int vfs_fl;
3724         unsigned long old_fl, new_fl;
3725
3726         do {
3727                 vfs_fl = ei->vfs_inode.i_flags;
3728                 old_fl = ei->i_flags;
3729                 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
3730                                 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
3731                                 EXT4_DIRSYNC_FL);
3732                 if (vfs_fl & S_SYNC)
3733                         new_fl |= EXT4_SYNC_FL;
3734                 if (vfs_fl & S_APPEND)
3735                         new_fl |= EXT4_APPEND_FL;
3736                 if (vfs_fl & S_IMMUTABLE)
3737                         new_fl |= EXT4_IMMUTABLE_FL;
3738                 if (vfs_fl & S_NOATIME)
3739                         new_fl |= EXT4_NOATIME_FL;
3740                 if (vfs_fl & S_DIRSYNC)
3741                         new_fl |= EXT4_DIRSYNC_FL;
3742         } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
3743 }
3744
3745 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
3746                                   struct ext4_inode_info *ei)
3747 {
3748         blkcnt_t i_blocks ;
3749         struct inode *inode = &(ei->vfs_inode);
3750         struct super_block *sb = inode->i_sb;
3751
3752         if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3753                                 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
3754                 /* we are using combined 48 bit field */
3755                 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
3756                                         le32_to_cpu(raw_inode->i_blocks_lo);
3757                 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
3758                         /* i_blocks represent file system block size */
3759                         return i_blocks  << (inode->i_blkbits - 9);
3760                 } else {
3761                         return i_blocks;
3762                 }
3763         } else {
3764                 return le32_to_cpu(raw_inode->i_blocks_lo);
3765         }
3766 }
3767
3768 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3769 {
3770         struct ext4_iloc iloc;
3771         struct ext4_inode *raw_inode;
3772         struct ext4_inode_info *ei;
3773         struct inode *inode;
3774         journal_t *journal = EXT4_SB(sb)->s_journal;
3775         long ret;
3776         int block;
3777         uid_t i_uid;
3778         gid_t i_gid;
3779
3780         inode = iget_locked(sb, ino);
3781         if (!inode)
3782                 return ERR_PTR(-ENOMEM);
3783         if (!(inode->i_state & I_NEW))
3784                 return inode;
3785
3786         ei = EXT4_I(inode);
3787         iloc.bh = NULL;
3788
3789         ret = __ext4_get_inode_loc(inode, &iloc, 0);
3790         if (ret < 0)
3791                 goto bad_inode;
3792         raw_inode = ext4_raw_inode(&iloc);
3793
3794         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
3795                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
3796                 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
3797                     EXT4_INODE_SIZE(inode->i_sb)) {
3798                         EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
3799                                 EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
3800                                 EXT4_INODE_SIZE(inode->i_sb));
3801                         ret = -EIO;
3802                         goto bad_inode;
3803                 }
3804         } else
3805                 ei->i_extra_isize = 0;
3806
3807         /* Precompute checksum seed for inode metadata */
3808         if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3809                         EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
3810                 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3811                 __u32 csum;
3812                 __le32 inum = cpu_to_le32(inode->i_ino);
3813                 __le32 gen = raw_inode->i_generation;
3814                 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
3815                                    sizeof(inum));
3816                 ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
3817                                               sizeof(gen));
3818         }
3819
3820         if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
3821                 EXT4_ERROR_INODE(inode, "checksum invalid");
3822                 ret = -EIO;
3823                 goto bad_inode;
3824         }
3825
3826         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
3827         i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
3828         i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
3829         if (!(test_opt(inode->i_sb, NO_UID32))) {
3830                 i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
3831                 i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
3832         }
3833         i_uid_write(inode, i_uid);
3834         i_gid_write(inode, i_gid);
3835         set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
3836
3837         ext4_clear_state_flags(ei);     /* Only relevant on 32-bit archs */
3838         ei->i_dir_start_lookup = 0;
3839         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
3840         /* We now have enough fields to check if the inode was active or not.
3841          * This is needed because nfsd might try to access dead inodes
3842          * the test is that same one that e2fsck uses
3843          * NeilBrown 1999oct15
3844          */
3845         if (inode->i_nlink == 0) {
3846                 if (inode->i_mode == 0 ||
3847                     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
3848                         /* this inode is deleted */
3849                         ret = -ESTALE;
3850                         goto bad_inode;
3851                 }
3852                 /* The only unlinked inodes we let through here have
3853                  * valid i_mode and are being read by the orphan
3854                  * recovery code: that's fine, we're about to complete
3855                  * the process of deleting those. */
3856         }
3857         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
3858         inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
3859         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
3860         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
3861                 ei->i_file_acl |=
3862                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
3863         inode->i_size = ext4_isize(raw_inode);
3864         ei->i_disksize = inode->i_size;
3865 #ifdef CONFIG_QUOTA
3866         ei->i_reserved_quota = 0;
3867 #endif
3868         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
3869         ei->i_block_group = iloc.block_group;
3870         ei->i_last_alloc_group = ~0;
3871         /*
3872          * NOTE! The in-memory inode i_data array is in little-endian order
3873          * even on big-endian machines: we do NOT byteswap the block numbers!
3874          */
3875         for (block = 0; block < EXT4_N_BLOCKS; block++)
3876                 ei->i_data[block] = raw_inode->i_block[block];
3877         INIT_LIST_HEAD(&ei->i_orphan);
3878
3879         /*
3880          * Set transaction id's of transactions that have to be committed
3881          * to finish f[data]sync. We set them to currently running transaction
3882          * as we cannot be sure that the inode or some of its metadata isn't
3883          * part of the transaction - the inode could have been reclaimed and
3884          * now it is reread from disk.
3885          */
3886         if (journal) {
3887                 transaction_t *transaction;
3888                 tid_t tid;
3889
3890                 read_lock(&journal->j_state_lock);
3891                 if (journal->j_running_transaction)
3892                         transaction = journal->j_running_transaction;
3893                 else
3894                         transaction = journal->j_committing_transaction;
3895                 if (transaction)
3896                         tid = transaction->t_tid;
3897                 else
3898                         tid = journal->j_commit_sequence;
3899                 read_unlock(&journal->j_state_lock);
3900                 ei->i_sync_tid = tid;
3901                 ei->i_datasync_tid = tid;
3902         }
3903
3904         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
3905                 if (ei->i_extra_isize == 0) {
3906                         /* The extra space is currently unused. Use it. */
3907                         ei->i_extra_isize = sizeof(struct ext4_inode) -
3908                                             EXT4_GOOD_OLD_INODE_SIZE;
3909                 } else {
3910                         __le32 *magic = (void *)raw_inode +
3911                                         EXT4_GOOD_OLD_INODE_SIZE +
3912                                         ei->i_extra_isize;
3913                         if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
3914                                 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
3915                 }
3916         }
3917
3918         EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
3919         EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
3920         EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
3921         EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
3922
3923         inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
3924         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
3925                 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
3926                         inode->i_version |=
3927                         (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
3928         }
3929
3930         ret = 0;
3931         if (ei->i_file_acl &&
3932             !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
3933                 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
3934                                  ei->i_file_acl);
3935                 ret = -EIO;
3936                 goto bad_inode;
3937         } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3938                 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
3939                     (S_ISLNK(inode->i_mode) &&
3940                      !ext4_inode_is_fast_symlink(inode)))
3941                         /* Validate extent which is part of inode */
3942                         ret = ext4_ext_check_inode(inode);
3943         } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
3944                    (S_ISLNK(inode->i_mode) &&
3945                     !ext4_inode_is_fast_symlink(inode))) {
3946                 /* Validate block references which are part of inode */
3947                 ret = ext4_ind_check_inode(inode);
3948         }
3949         if (ret)
3950                 goto bad_inode;
3951
3952         if (S_ISREG(inode->i_mode)) {
3953                 inode->i_op = &ext4_file_inode_operations;
3954                 inode->i_fop = &ext4_file_operations;
3955                 ext4_set_aops(inode);
3956         } else if (S_ISDIR(inode->i_mode)) {
3957                 inode->i_op = &ext4_dir_inode_operations;
3958                 inode->i_fop = &ext4_dir_operations;
3959         } else if (S_ISLNK(inode->i_mode)) {
3960                 if (ext4_inode_is_fast_symlink(inode)) {
3961                         inode->i_op = &ext4_fast_symlink_inode_operations;
3962                         nd_terminate_link(ei->i_data, inode->i_size,
3963                                 sizeof(ei->i_data) - 1);
3964                 } else {
3965                         inode->i_op = &ext4_symlink_inode_operations;
3966                         ext4_set_aops(inode);
3967                 }
3968         } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
3969               S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
3970                 inode->i_op = &ext4_special_inode_operations;
3971                 if (raw_inode->i_block[0])
3972                         init_special_inode(inode, inode->i_mode,
3973                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
3974                 else
3975                         init_special_inode(inode, inode->i_mode,
3976                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
3977         } else {
3978                 ret = -EIO;
3979                 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
3980                 goto bad_inode;
3981         }
3982         brelse(iloc.bh);
3983         ext4_set_inode_flags(inode);
3984         unlock_new_inode(inode);
3985         return inode;
3986
3987 bad_inode:
3988         brelse(iloc.bh);
3989         iget_failed(inode);
3990         return ERR_PTR(ret);
3991 }
3992
3993 static int ext4_inode_blocks_set(handle_t *handle,
3994                                 struct ext4_inode *raw_inode,
3995                                 struct ext4_inode_info *ei)
3996 {
3997         struct inode *inode = &(ei->vfs_inode);
3998         u64 i_blocks = inode->i_blocks;
3999         struct super_block *sb = inode->i_sb;
4000
4001         if (i_blocks <= ~0U) {
4002                 /*
4003                  * i_blocks can be represnted in a 32 bit variable
4004                  * as multiple of 512 bytes
4005                  */
4006                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
4007                 raw_inode->i_blocks_high = 0;
4008                 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
4009                 return 0;
4010         }
4011         if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
4012                 return -EFBIG;
4013
4014         if (i_blocks <= 0xffffffffffffULL) {
4015                 /*
4016                  * i_blocks can be represented in a 48 bit variable
4017                  * as multiple of 512 bytes
4018                  */
4019                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
4020                 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
4021                 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
4022         } else {
4023                 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
4024                 /* i_block is stored in file system block size */
4025                 i_blocks = i_blocks >> (inode->i_blkbits - 9);
4026                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
4027                 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
4028         }
4029         return 0;
4030 }
4031
4032 /*
4033  * Post the struct inode info into an on-disk inode location in the
4034  * buffer-cache.  This gobbles the caller's reference to the
4035  * buffer_head in the inode location struct.
4036  *
4037  * The caller must have write access to iloc->bh.
4038  */
4039 static int ext4_do_update_inode(handle_t *handle,
4040                                 struct inode *inode,
4041                                 struct ext4_iloc *iloc)
4042 {
4043         struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4044         struct ext4_inode_info *ei = EXT4_I(inode);
4045         struct buffer_head *bh = iloc->bh;
4046         int err = 0, rc, block;
4047         uid_t i_uid;
4048         gid_t i_gid;
4049
4050         /* For fields not not tracking in the in-memory inode,
4051          * initialise them to zero for new inodes. */
4052         if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
4053                 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
4054
4055         ext4_get_inode_flags(ei);
4056         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
4057         i_uid = i_uid_read(inode);
4058         i_gid = i_gid_read(inode);
4059         if (!(test_opt(inode->i_sb, NO_UID32))) {
4060                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
4061                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
4062 /*
4063  * Fix up interoperability with old kernels. Otherwise, old inodes get
4064  * re-used with the upper 16 bits of the uid/gid intact
4065  */
4066                 if (!ei->i_dtime) {
4067                         raw_inode->i_uid_high =
4068                                 cpu_to_le16(high_16_bits(i_uid));
4069                         raw_inode->i_gid_high =
4070                                 cpu_to_le16(high_16_bits(i_gid));
4071                 } else {
4072                         raw_inode->i_uid_high = 0;
4073                         raw_inode->i_gid_high = 0;
4074                 }
4075         } else {
4076                 raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
4077                 raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
4078                 raw_inode->i_uid_high = 0;
4079                 raw_inode->i_gid_high = 0;
4080         }
4081         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
4082
4083         EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
4084         EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
4085         EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
4086         EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
4087
4088         if (ext4_inode_blocks_set(handle, raw_inode, ei))
4089                 goto out_brelse;
4090         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4091         raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
4092         if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
4093             cpu_to_le32(EXT4_OS_HURD))
4094                 raw_inode->i_file_acl_high =
4095                         cpu_to_le16(ei->i_file_acl >> 32);
4096         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
4097         ext4_isize_set(raw_inode, ei->i_disksize);
4098         if (ei->i_disksize > 0x7fffffffULL) {
4099                 struct super_block *sb = inode->i_sb;
4100                 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
4101                                 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
4102                                 EXT4_SB(sb)->s_es->s_rev_level ==
4103                                 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
4104                         /* If this is the first large file
4105                          * created, add a flag to the superblock.
4106                          */
4107                         err = ext4_journal_get_write_access(handle,
4108                                         EXT4_SB(sb)->s_sbh);
4109                         if (err)
4110                                 goto out_brelse;
4111                         ext4_update_dynamic_rev(sb);
4112                         EXT4_SET_RO_COMPAT_FEATURE(sb,
4113                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
4114                         ext4_handle_sync(handle);
4115                         err = ext4_handle_dirty_super(handle, sb);
4116                 }
4117         }
4118         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
4119         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
4120                 if (old_valid_dev(inode->i_rdev)) {
4121                         raw_inode->i_block[0] =
4122                                 cpu_to_le32(old_encode_dev(inode->i_rdev));
4123                         raw_inode->i_block[1] = 0;
4124                 } else {
4125                         raw_inode->i_block[0] = 0;
4126                         raw_inode->i_block[1] =
4127                                 cpu_to_le32(new_encode_dev(inode->i_rdev));
4128                         raw_inode->i_block[2] = 0;
4129                 }
4130         } else
4131                 for (block = 0; block < EXT4_N_BLOCKS; block++)
4132                         raw_inode->i_block[block] = ei->i_data[block];
4133
4134         raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
4135         if (ei->i_extra_isize) {
4136                 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4137                         raw_inode->i_version_hi =
4138                         cpu_to_le32(inode->i_version >> 32);
4139                 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4140         }
4141
4142         ext4_inode_csum_set(inode, raw_inode, ei);
4143
4144         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4145         rc = ext4_handle_dirty_metadata(handle, NULL, bh);
4146         if (!err)
4147                 err = rc;
4148         ext4_clear_inode_state(inode, EXT4_STATE_NEW);
4149
4150         ext4_update_inode_fsync_trans(handle, inode, 0);
4151 out_brelse:
4152         brelse(bh);
4153         ext4_std_error(inode->i_sb, err);
4154         return err;
4155 }
4156
4157 /*
4158  * ext4_write_inode()
4159  *
4160  * We are called from a few places:
4161  *
4162  * - Within generic_file_write() for O_SYNC files.
4163  *   Here, there will be no transaction running. We wait for any running
4164  *   trasnaction to commit.
4165  *
4166  * - Within sys_sync(), kupdate and such.
4167  *   We wait on commit, if tol to.
4168  *
4169  * - Within prune_icache() (PF_MEMALLOC == true)
4170  *   Here we simply return.  We can't afford to block kswapd on the
4171  *   journal commit.
4172  *
4173  * In all cases it is actually safe for us to return without doing anything,
4174  * because the inode has been copied into a raw inode buffer in
4175  * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
4176  * knfsd.
4177  *
4178  * Note that we are absolutely dependent upon all inode dirtiers doing the
4179  * right thing: they *must* call mark_inode_dirty() after dirtying info in
4180  * which we are interested.
4181  *
4182  * It would be a bug for them to not do this.  The code:
4183  *
4184  *      mark_inode_dirty(inode)
4185  *      stuff();
4186  *      inode->i_size = expr;
4187  *
4188  * is in error because a kswapd-driven write_inode() could occur while
4189  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
4190  * will no longer be on the superblock's dirty inode list.
4191  */
4192 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
4193 {
4194         int err;
4195
4196         if (current->flags & PF_MEMALLOC)
4197                 return 0;
4198
4199         if (EXT4_SB(inode->i_sb)->s_journal) {
4200                 if (ext4_journal_current_handle()) {
4201                         jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
4202                         dump_stack();
4203                         return -EIO;
4204                 }
4205
4206                 if (wbc->sync_mode != WB_SYNC_ALL)
4207                         return 0;
4208
4209                 err = ext4_force_commit(inode->i_sb);
4210         } else {
4211                 struct ext4_iloc iloc;
4212
4213                 err = __ext4_get_inode_loc(inode, &iloc, 0);
4214                 if (err)
4215                         return err;
4216                 if (wbc->sync_mode == WB_SYNC_ALL)
4217                         sync_dirty_buffer(iloc.bh);
4218                 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
4219                         EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
4220                                          "IO error syncing inode");
4221                         err = -EIO;
4222                 }
4223                 brelse(iloc.bh);
4224         }
4225         return err;
4226 }
4227
4228 /*
4229  * ext4_setattr()
4230  *
4231  * Called from notify_change.
4232  *
4233  * We want to trap VFS attempts to truncate the file as soon as
4234  * possible.  In particular, we want to make sure that when the VFS
4235  * shrinks i_size, we put the inode on the orphan list and modify
4236  * i_disksize immediately, so that during the subsequent flushing of
4237  * dirty pages and freeing of disk blocks, we can guarantee that any
4238  * commit will leave the blocks being flushed in an unused state on
4239  * disk.  (On recovery, the inode will get truncated and the blocks will
4240  * be freed, so we have a strong guarantee that no future commit will
4241  * leave these blocks visible to the user.)
4242  *
4243  * Another thing we have to assure is that if we are in ordered mode
4244  * and inode is still attached to the committing transaction, we must
4245  * start writeout of all the dirty pages which are being truncated.
4246  * This way we are sure that all the data written in the previous
4247  * transaction are already on disk (truncate waits for pages under
4248  * writeback).
4249  *
4250  * Called with inode->i_mutex down.
4251  */
4252 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4253 {
4254         struct inode *inode = dentry->d_inode;
4255         int error, rc = 0;
4256         int orphan = 0;
4257         const unsigned int ia_valid = attr->ia_valid;
4258
4259         error = inode_change_ok(inode, attr);
4260         if (error)
4261                 return error;
4262
4263         if (is_quota_modification(inode, attr))
4264                 dquot_initialize(inode);
4265         if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
4266             (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
4267                 handle_t *handle;
4268
4269                 /* (user+group)*(old+new) structure, inode write (sb,
4270                  * inode block, ? - but truncate inode update has it) */
4271                 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
4272                                         EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
4273                 if (IS_ERR(handle)) {
4274                         error = PTR_ERR(handle);
4275                         goto err_out;
4276                 }
4277                 error = dquot_transfer(inode, attr);
4278                 if (error) {
4279                         ext4_journal_stop(handle);
4280                         return error;
4281                 }
4282                 /* Update corresponding info in inode so that everything is in
4283                  * one transaction */
4284                 if (attr->ia_valid & ATTR_UID)
4285                         inode->i_uid = attr->ia_uid;
4286                 if (attr->ia_valid & ATTR_GID)
4287                         inode->i_gid = attr->ia_gid;
4288                 error = ext4_mark_inode_dirty(handle, inode);
4289                 ext4_journal_stop(handle);
4290         }
4291
4292         if (attr->ia_valid & ATTR_SIZE) {
4293                 inode_dio_wait(inode);
4294
4295                 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4296                         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4297
4298                         if (attr->ia_size > sbi->s_bitmap_maxbytes)
4299                                 return -EFBIG;
4300                 }
4301         }
4302
4303         if (S_ISREG(inode->i_mode) &&
4304             attr->ia_valid & ATTR_SIZE &&
4305             (attr->ia_size < inode->i_size)) {
4306                 handle_t *handle;
4307
4308                 handle = ext4_journal_start(inode, 3);
4309                 if (IS_ERR(handle)) {
4310                         error = PTR_ERR(handle);
4311                         goto err_out;
4312                 }
4313                 if (ext4_handle_valid(handle)) {
4314                         error = ext4_orphan_add(handle, inode);
4315                         orphan = 1;
4316                 }
4317                 EXT4_I(inode)->i_disksize = attr->ia_size;
4318                 rc = ext4_mark_inode_dirty(handle, inode);
4319                 if (!error)
4320                         error = rc;
4321                 ext4_journal_stop(handle);
4322
4323                 if (ext4_should_order_data(inode)) {
4324                         error = ext4_begin_ordered_truncate(inode,
4325                                                             attr->ia_size);
4326                         if (error) {
4327                                 /* Do as much error cleanup as possible */
4328                                 handle = ext4_journal_start(inode, 3);
4329                                 if (IS_ERR(handle)) {
4330                                         ext4_orphan_del(NULL, inode);
4331                                         goto err_out;
4332                                 }
4333                                 ext4_orphan_del(handle, inode);
4334                                 orphan = 0;
4335                                 ext4_journal_stop(handle);
4336                                 goto err_out;
4337                         }
4338                 }
4339         }
4340
4341         if (attr->ia_valid & ATTR_SIZE) {
4342                 if (attr->ia_size != i_size_read(inode))
4343                         truncate_setsize(inode, attr->ia_size);
4344                 ext4_truncate(inode);
4345         }
4346
4347         if (!rc) {
4348                 setattr_copy(inode, attr);
4349                 mark_inode_dirty(inode);
4350         }
4351
4352         /*
4353          * If the call to ext4_truncate failed to get a transaction handle at
4354          * all, we need to clean up the in-core orphan list manually.
4355          */
4356         if (orphan && inode->i_nlink)
4357                 ext4_orphan_del(NULL, inode);
4358
4359         if (!rc && (ia_valid & ATTR_MODE))
4360                 rc = ext4_acl_chmod(inode);
4361
4362 err_out:
4363         ext4_std_error(inode->i_sb, error);
4364         if (!error)
4365                 error = rc;
4366         return error;
4367 }
4368
4369 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4370                  struct kstat *stat)
4371 {
4372         struct inode *inode;
4373         unsigned long delalloc_blocks;
4374
4375         inode = dentry->d_inode;
4376         generic_fillattr(inode, stat);
4377
4378         /*
4379          * We can't update i_blocks if the block allocation is delayed
4380          * otherwise in the case of system crash before the real block
4381          * allocation is done, we will have i_blocks inconsistent with
4382          * on-disk file blocks.
4383          * We always keep i_blocks updated together with real
4384          * allocation. But to not confuse with user, stat
4385          * will return the blocks that include the delayed allocation
4386          * blocks for this file.
4387          */
4388         delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
4389                                 EXT4_I(inode)->i_reserved_data_blocks);
4390
4391         stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
4392         return 0;
4393 }
4394
4395 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4396 {
4397         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4398                 return ext4_ind_trans_blocks(inode, nrblocks, chunk);
4399         return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
4400 }
4401
4402 /*
4403  * Account for index blocks, block groups bitmaps and block group
4404  * descriptor blocks if modify datablocks and index blocks
4405  * worse case, the indexs blocks spread over different block groups
4406  *
4407  * If datablocks are discontiguous, they are possible to spread over
4408  * different block groups too. If they are contiuguous, with flexbg,
4409  * they could still across block group boundary.
4410  *
4411  * Also account for superblock, inode, quota and xattr blocks
4412  */
4413 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4414 {
4415         ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
4416         int gdpblocks;
4417         int idxblocks;
4418         int ret = 0;
4419
4420         /*
4421          * How many index blocks need to touch to modify nrblocks?
4422          * The "Chunk" flag indicating whether the nrblocks is
4423          * physically contiguous on disk
4424          *
4425          * For Direct IO and fallocate, they calls get_block to allocate
4426          * one single extent at a time, so they could set the "Chunk" flag
4427          */
4428         idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
4429
4430         ret = idxblocks;
4431
4432         /*
4433          * Now let's see how many group bitmaps and group descriptors need
4434          * to account
4435          */
4436         groups = idxblocks;
4437         if (chunk)
4438                 groups += 1;
4439         else
4440                 groups += nrblocks;
4441
4442         gdpblocks = groups;
4443         if (groups > ngroups)
4444                 groups = ngroups;
4445         if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4446                 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4447
4448         /* bitmaps and block group descriptor blocks */
4449         ret += groups + gdpblocks;
4450
4451         /* Blocks for super block, inode, quota and xattr blocks */
4452         ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
4453
4454         return ret;
4455 }
4456
/*
 * Compute the credits to reserve so that modifying a single page fits in
 * one transaction, even if that takes several chunks of block allocation.
 *
 * This may be reached via ext4_write_begin().
 *
 * The worst case is assumed: every block of the page needs its own extent,
 * so the blocks are treated as non-contiguous.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
	int blocks_per_page = ext4_journal_blocks_per_page(inode);
	int credits = ext4_meta_trans_blocks(inode, blocks_per_page, 0);

	/* In data-journalling mode the data blocks are journalled as well. */
	if (ext4_should_journal_data(inode))
		credits += blocks_per_page;

	return credits;
}
4479
/*
 * Compute the journal credits for modifying one chunk of data.
 *
 * Used by DIO, fallocate and any other caller of ext4_map_blocks() that
 * maps or allocates a contiguous run of @nrblocks disk blocks.
 *
 * Journal buffers for the data blocks themselves are not counted, because
 * DIO and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
	int credits;

	/* chunk == 1: the blocks are physically contiguous */
	credits = ext4_meta_trans_blocks(inode, nrblocks, 1);
	return credits;
}
4493
4494 /*
4495  * The caller must have previously called ext4_reserve_inode_write().
4496  * Give this, we know that the caller already has write access to iloc->bh.
4497  */
4498 int ext4_mark_iloc_dirty(handle_t *handle,
4499                          struct inode *inode, struct ext4_iloc *iloc)
4500 {
4501         int err = 0;
4502
4503         if (IS_I_VERSION(inode))
4504                 inode_inc_iversion(inode);
4505
4506         /* the do_update_inode consumes one bh->b_count */
4507         get_bh(iloc->bh);
4508
4509         /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
4510         err = ext4_do_update_inode(handle, inode, iloc);
4511         put_bh(iloc->bh);
4512         return err;
4513 }
4514
4515 /*
4516  * On success, We end up with an outstanding reference count against
4517  * iloc->bh.  This _must_ be cleaned up later.
4518  */
4519
4520 int
4521 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
4522                          struct ext4_iloc *iloc)
4523 {
4524         int err;
4525
4526         err = ext4_get_inode_loc(inode, iloc);
4527         if (!err) {
4528                 BUFFER_TRACE(iloc->bh, "get_write_access");
4529                 err = ext4_journal_get_write_access(handle, iloc->bh);
4530                 if (err) {
4531                         brelse(iloc->bh);
4532                         iloc->bh = NULL;
4533                 }
4534         }
4535         ext4_std_error(inode->i_sb, err);
4536         return err;
4537 }
4538
4539 /*
4540  * Expand an inode by new_extra_isize bytes.
4541  * Returns 0 on success or negative error number on failure.
4542  */
4543 static int ext4_expand_extra_isize(struct inode *inode,
4544                                    unsigned int new_extra_isize,
4545                                    struct ext4_iloc iloc,
4546                                    handle_t *handle)
4547 {
4548         struct ext4_inode *raw_inode;
4549         struct ext4_xattr_ibody_header *header;
4550
4551         if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
4552                 return 0;
4553
4554         raw_inode = ext4_raw_inode(&iloc);
4555
4556         header = IHDR(inode, raw_inode);
4557
4558         /* No extended attributes present */
4559         if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
4560             header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
4561                 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
4562                         new_extra_isize);
4563                 EXT4_I(inode)->i_extra_isize = new_extra_isize;
4564                 return 0;
4565         }
4566
4567         /* try to expand with EAs present */
4568         return ext4_expand_extra_isize_ea(inode, new_extra_isize,
4569                                           raw_inode, handle);
4570 }
4571
4572 /*
4573  * What we do here is to mark the in-core inode as clean with respect to inode
4574  * dirtiness (it may still be data-dirty).
4575  * This means that the in-core inode may be reaped by prune_icache
4576  * without having to perform any I/O.  This is a very good thing,
4577  * because *any* task may call prune_icache - even ones which
4578  * have a transaction open against a different journal.
4579  *
4580  * Is this cheating?  Not really.  Sure, we haven't written the
4581  * inode out, but prune_icache isn't a user-visible syncing function.
4582  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
4583  * we start and wait on commits.
4584  *
4585  * Is this efficient/effective?  Well, we're being nice to the system
4586  * by cleaning up our inodes proactively so they can be reaped
4587  * without I/O.  But we are potentially leaving up to five seconds'
4588  * worth of inodes floating about which prune_icache wants us to
4589  * write out.  One way to fix that would be to get prune_icache()
4590  * to do a write_super() to free up some memory.  It has the desired
4591  * effect.
4592  */
4593 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4594 {
4595         struct ext4_iloc iloc;
4596         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4597         static unsigned int mnt_count;
4598         int err, ret;
4599
4600         might_sleep();
4601         trace_ext4_mark_inode_dirty(inode, _RET_IP_);
4602         err = ext4_reserve_inode_write(handle, inode, &iloc);
4603         if (ext4_handle_valid(handle) &&
4604             EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
4605             !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
4606                 /*
4607                  * We need extra buffer credits since we may write into EA block
4608                  * with this same handle. If journal_extend fails, then it will
4609                  * only result in a minor loss of functionality for that inode.
4610                  * If this is felt to be critical, then e2fsck should be run to
4611                  * force a large enough s_min_extra_isize.
4612                  */
4613                 if ((jbd2_journal_extend(handle,
4614                              EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
4615                         ret = ext4_expand_extra_isize(inode,
4616                                                       sbi->s_want_extra_isize,
4617                                                       iloc, handle);
4618                         if (ret) {
4619                                 ext4_set_inode_state(inode,
4620                                                      EXT4_STATE_NO_EXPAND);
4621                                 if (mnt_count !=
4622                                         le16_to_cpu(sbi->s_es->s_mnt_count)) {
4623                                         ext4_warning(inode->i_sb,
4624                                         "Unable to expand inode %lu. Delete"
4625                                         " some EAs or run e2fsck.",
4626                                         inode->i_ino);
4627                                         mnt_count =
4628                                           le16_to_cpu(sbi->s_es->s_mnt_count);
4629                                 }
4630                         }
4631                 }
4632         }
4633         if (!err)
4634                 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
4635         return err;
4636 }
4637
4638 /*
4639  * ext4_dirty_inode() is called from __mark_inode_dirty()
4640  *
4641  * We're really interested in the case where a file is being extended.
4642  * i_size has been changed by generic_commit_write() and we thus need
4643  * to include the updated inode in the current transaction.
4644  *
4645  * Also, dquot_alloc_block() will always dirty the inode when blocks
4646  * are allocated to the file.
4647  *
4648  * If the inode is marked synchronous, we don't honour that here - doing
4649  * so would cause a commit on atime updates, which we don't bother doing.
4650  * We handle synchronous inodes at the highest possible level.
4651  */
4652 void ext4_dirty_inode(struct inode *inode, int flags)
4653 {
4654         handle_t *handle;
4655
4656         handle = ext4_journal_start(inode, 2);
4657         if (IS_ERR(handle))
4658                 goto out;
4659
4660         ext4_mark_inode_dirty(handle, inode);
4661
4662         ext4_journal_stop(handle);
4663 out:
4664         return;
4665 }
4666
#if 0
/*
 * Bind an inode's backing buffer_head into this transaction so that it is
 * not flushed to disk early.
 *
 * In contrast to ext4_reserve_inode_write(), no bh reference is left
 * behind and no iloc structure is returned, so the caller has to repeat
 * the iloc lookup when it later marks the inode dirty.
 *
 * A NULL @handle makes this a no-op that returns 0.
 */
static int ext4_pin_inode(handle_t *handle, struct inode *inode)
{
	struct ext4_iloc iloc;
	int err = 0;

	if (!handle)
		goto report;

	err = ext4_get_inode_loc(inode, &iloc);
	if (err)
		goto report;

	BUFFER_TRACE(iloc.bh, "get_write_access");
	err = jbd2_journal_get_write_access(handle, iloc.bh);
	if (!err)
		err = ext4_handle_dirty_metadata(handle, NULL, iloc.bh);
	brelse(iloc.bh);

report:
	ext4_std_error(inode->i_sb, err);
	return err;
}
#endif
4696
4697 int ext4_change_inode_journal_flag(struct inode *inode, int val)
4698 {
4699         journal_t *journal;
4700         handle_t *handle;
4701         int err;
4702
4703         /*
4704          * We have to be very careful here: changing a data block's
4705          * journaling status dynamically is dangerous.  If we write a
4706          * data block to the journal, change the status and then delete
4707          * that block, we risk forgetting to revoke the old log record
4708          * from the journal and so a subsequent replay can corrupt data.
4709          * So, first we make sure that the journal is empty and that
4710          * nobody is changing anything.
4711          */
4712
4713         journal = EXT4_JOURNAL(inode);
4714         if (!journal)
4715                 return 0;
4716         if (is_journal_aborted(journal))
4717                 return -EROFS;
4718         /* We have to allocate physical blocks for delalloc blocks
4719          * before flushing journal. otherwise delalloc blocks can not
4720          * be allocated any more. even more truncate on delalloc blocks
4721          * could trigger BUG by flushing delalloc blocks in journal.
4722          * There is no delalloc block in non-journal data mode.
4723          */
4724         if (val && test_opt(inode->i_sb, DELALLOC)) {
4725                 err = ext4_alloc_da_blocks(inode);
4726                 if (err < 0)
4727                         return err;
4728         }
4729
4730         jbd2_journal_lock_updates(journal);
4731
4732         /*
4733          * OK, there are no updates running now, and all cached data is
4734          * synced to disk.  We are now in a completely consistent state
4735          * which doesn't have anything in the journal, and we know that
4736          * no filesystem updates are running, so it is safe to modify
4737          * the inode's in-core data-journaling state flag now.
4738          */
4739
4740         if (val)
4741                 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
4742         else {
4743                 jbd2_journal_flush(journal);
4744                 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
4745         }
4746         ext4_set_aops(inode);
4747
4748         jbd2_journal_unlock_updates(journal);
4749
4750         /* Finally we can mark the inode as dirty. */
4751
4752         handle = ext4_journal_start(inode, 1);
4753         if (IS_ERR(handle))
4754                 return PTR_ERR(handle);
4755
4756         err = ext4_mark_inode_dirty(handle, inode);
4757         ext4_handle_sync(handle);
4758         ext4_journal_stop(handle);
4759         ext4_std_error(inode->i_sb, err);
4760
4761         return err;
4762 }
4763
4764 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
4765 {
4766         return !buffer_mapped(bh);
4767 }
4768
4769 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4770 {
4771         struct page *page = vmf->page;
4772         loff_t size;
4773         unsigned long len;
4774         int ret;
4775         struct file *file = vma->vm_file;
4776         struct inode *inode = file->f_path.dentry->d_inode;
4777         struct address_space *mapping = inode->i_mapping;
4778         handle_t *handle;
4779         get_block_t *get_block;
4780         int retries = 0;
4781
4782         /*
4783          * This check is racy but catches the common case. We rely on
4784          * __block_page_mkwrite() to do a reliable check.
4785          */
4786         vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
4787         /* Delalloc case is easy... */
4788         if (test_opt(inode->i_sb, DELALLOC) &&
4789             !ext4_should_journal_data(inode) &&
4790             !ext4_nonda_switch(inode->i_sb)) {
4791                 do {
4792                         ret = __block_page_mkwrite(vma, vmf,
4793                                                    ext4_da_get_block_prep);
4794                 } while (ret == -ENOSPC &&
4795                        ext4_should_retry_alloc(inode->i_sb, &retries));
4796                 goto out_ret;
4797         }
4798
4799         lock_page(page);
4800         size = i_size_read(inode);
4801         /* Page got truncated from under us? */
4802         if (page->mapping != mapping || page_offset(page) > size) {
4803                 unlock_page(page);
4804                 ret = VM_FAULT_NOPAGE;
4805                 goto out;
4806         }
4807
4808         if (page->index == size >> PAGE_CACHE_SHIFT)
4809                 len = size & ~PAGE_CACHE_MASK;
4810         else
4811                 len = PAGE_CACHE_SIZE;
4812         /*
4813          * Return if we have all the buffers mapped. This avoids the need to do
4814          * journal_start/journal_stop which can block and take a long time
4815          */
4816         if (page_has_buffers(page)) {
4817                 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4818                                         ext4_bh_unmapped)) {
4819                         /* Wait so that we don't change page under IO */
4820                         wait_on_page_writeback(page);
4821                         ret = VM_FAULT_LOCKED;
4822                         goto out;
4823                 }
4824         }
4825         unlock_page(page);
4826         /* OK, we need to fill the hole... */
4827         if (ext4_should_dioread_nolock(inode))
4828                 get_block = ext4_get_block_write;
4829         else
4830                 get_block = ext4_get_block;
4831 retry_alloc:
4832         handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
4833         if (IS_ERR(handle)) {
4834                 ret = VM_FAULT_SIGBUS;
4835                 goto out;
4836         }
4837         ret = __block_page_mkwrite(vma, vmf, get_block);
4838         if (!ret && ext4_should_journal_data(inode)) {
4839                 if (walk_page_buffers(handle, page_buffers(page), 0,
4840                           PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
4841                         unlock_page(page);
4842                         ret = VM_FAULT_SIGBUS;
4843                         ext4_journal_stop(handle);
4844                         goto out;
4845                 }
4846                 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
4847         }
4848         ext4_journal_stop(handle);
4849         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
4850                 goto retry_alloc;
4851 out_ret:
4852         ret = block_page_mkwrite_return(ret);
4853 out:
4854         return ret;
4855 }