X-Git-Url: https://git.karo-electronics.de/?a=blobdiff_plain;ds=sidebyside;f=mm%2Fpage-writeback.c;h=029dfad5a235753fab2b7f81dda1950e770e9f2d;hb=c40769fee13c1ab43e1fb10320d6fbc29f582d8e;hp=488b7088557c549840b7cf6cf9daba1efc38bd44;hpb=db1a19b38f3a85f475b4ad716c71be133d8ca48e;p=mv-sheeva.git diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 488b7088557..029dfad5a23 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1,5 +1,5 @@ /* - * mm/page-writeback.c. + * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. * @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,8 @@ #include #include #include +#include +#include /* * The maximum number of pages to writeout in a single bdflush/kupdate @@ -64,12 +67,12 @@ static inline long sync_writeback_pages(void) /* * Start background writeback (via pdflush) at this percentage */ -int dirty_background_ratio = 10; +int dirty_background_ratio = 5; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 40; +int vm_dirty_ratio = 10; /* * The interval between `kupdate'-style writebacks, in jiffies @@ -116,6 +119,44 @@ static void background_writeout(unsigned long _min_pages); * We make sure that the background writeout level is below the adjusted * clamping level. */ + +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + + for_each_online_node(node) { + struct zone *z = + &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + + x += zone_page_state(z, NR_FREE_PAGES) + + zone_page_state(z, NR_INACTIVE) + + zone_page_state(z, NR_ACTIVE); + } + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. + */ + return min(x, total); +#else + return 0; +#endif +} + +static unsigned long determine_dirtyable_memory(void) +{ + unsigned long x; + + x = global_page_state(NR_FREE_PAGES) + + global_page_state(NR_INACTIVE) + + global_page_state(NR_ACTIVE); + x -= highmem_dirtyable_memory(x); + return x + 1; /* Ensure that we never return 0 */ +} + static void get_dirty_limits(long *pbackground, long *pdirty, struct address_space *mapping) @@ -125,22 +166,12 @@ get_dirty_limits(long *pbackground, long *pdirty, int unmapped_ratio; long background; long dirty; - unsigned long available_memory = vm_total_pages; + unsigned long available_memory = determine_dirtyable_memory(); struct task_struct *tsk; -#ifdef CONFIG_HIGHMEM - /* - * If this mapping can only allocate from low memory, - * we exclude high memory from our count. - */ - if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM)) - available_memory -= totalhigh_pages; -#endif - - unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + global_page_state(NR_ANON_PAGES)) * 100) / - vm_total_pages; + available_memory; dirty_ratio = vm_dirty_ratio; if (dirty_ratio > unmapped_ratio / 2) @@ -220,7 +251,7 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ } - blk_congestion_wait(WRITE, HZ/10); + congestion_wait(WRITE, HZ/10); } if (nr_reclaimable + global_page_state(NR_WRITEBACK) @@ -295,11 +326,21 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, } EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); -void throttle_vm_writeout(void) +void throttle_vm_writeout(gfp_t gfp_mask) { long background_thresh; long dirty_thresh; + if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) { + /* + * The caller might hold locks which can prevent IO completion + * or progress in the filesystem. So we cannot just sit here + * waiting for IO to complete. + */ + congestion_wait(WRITE, HZ/10); + return; + } + for ( ; ; ) { get_dirty_limits(&background_thresh, &dirty_thresh, NULL); @@ -312,11 +353,10 @@ void throttle_vm_writeout(void) if (global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK) <= dirty_thresh) break; - blk_congestion_wait(WRITE, HZ/10); + congestion_wait(WRITE, HZ/10); } } - /* * writeback at least _min_pages, and keep writing until the amount of dirty * memory is less than the background threshold, or until we're all clean. @@ -349,7 +389,7 @@ static void background_writeout(unsigned long _min_pages) min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { /* Wrote less than expected */ - blk_congestion_wait(WRITE, HZ/10); + congestion_wait(WRITE, HZ/10); if (!wbc.encountered_congestion) break; } @@ -420,7 +460,7 @@ static void wb_kupdate(unsigned long arg) writeback_inodes(&wbc); if (wbc.nr_to_write > 0) { if (wbc.encountered_congestion) - blk_congestion_wait(WRITE, HZ/10); + congestion_wait(WRITE, HZ/10); else break; /* All the old data is written */ } @@ -514,7 +554,7 @@ static int __cpuinit ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) { writeback_set_ratelimit(); - return 0; + return NOTIFY_DONE; } static struct notifier_block __cpuinitdata ratelimit_nb = { @@ -523,33 +563,161 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { }; /* - * If the machine has a large highmem:lowmem ratio then scale back the default - * dirty memory thresholds: allowing too much dirty highmem pins an excessive - * number of buffer_heads. + * Called early on to tune the page writeback dirty limits. + * + * We used to scale dirty pages according to how total memory + * related to pages that could be allocated for buffers (by + * comparing nr_free_buffer_pages() to vm_total_pages. + * + * However, that was when we used "dirty_ratio" to scale with + * all memory, and we don't do that any more. "dirty_ratio" + * is now applied to total non-HIGHPAGE memory (by subtracting + * totalhigh_pages from vm_total_pages), and as such we can't + * get into the old insane situation any more where we had + * large amounts of dirty pages compared to a small amount of + * non-HIGHMEM memory. + * + * But we might still want to scale the dirty_ratio by how + * much memory the box has.. */ void __init page_writeback_init(void) { - long buffer_pages = nr_free_buffer_pages(); - long correction; + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + writeback_set_ratelimit(); + register_cpu_notifier(&ratelimit_nb); +} + +/** + * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * + * This is a library function, which implements the writepages() + * address_space_operation. + * + * If a page is already under I/O, generic_writepages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + * + * Derived from mpage_writepages() - if you fix this you should check that + * also! + */ +int generic_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + int (*writepage)(struct page *page, struct writeback_control *wbc); + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int range_whole = 0; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } - correction = (100 * 4 * buffer_pages) / vm_total_pages; + writepage = mapping->a_ops->writepage; - if (correction < 100) { - dirty_background_ratio *= correction; - dirty_background_ratio /= 100; - vm_dirty_ratio *= correction; - vm_dirty_ratio /= 100; + /* deal with chardevs and other special file */ + if (!writepage) + return 0; - if (dirty_background_ratio <= 0) - dirty_background_ratio = 1; - if (vm_dirty_ratio <= 0) - vm_dirty_ratio = 1; + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; } - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); - writeback_set_ratelimit(); - register_cpu_notifier(&ratelimit_nb); +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc); + if (ret) { + if (ret == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else + set_bit(AS_EIO, &mapping->flags); + } + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) + unlock_page(page); + if (ret || (--(wbc->nr_to_write) <= 0)) + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + return ret; } +EXPORT_SYMBOL(generic_writepages); + int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; @@ -567,7 +735,6 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) /** * write_one_page - write out a single page and optionally wait on I/O - * * @page: the page to write * @wait: if true, wait on writeout * @@ -605,6 +772,16 @@ int write_one_page(struct page *page, int wait) } EXPORT_SYMBOL(write_one_page); +/* + * For address_spaces which do not use buffers nor write back. + */ +int __set_page_dirty_no_writeback(struct page *page) +{ + if (!PageDirty(page)) + SetPageDirty(page); + return 0; +} + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -626,23 +803,24 @@ int __set_page_dirty_nobuffers(struct page *page) struct address_space *mapping = page_mapping(page); struct address_space *mapping2; - if (mapping) { - write_lock_irq(&mapping->tree_lock); - mapping2 = page_mapping(page); - if (mapping2) { /* Race with truncate? */ - BUG_ON(mapping2 != mapping); - if (mapping_cap_account_dirty(mapping)) - __inc_zone_page_state(page, - NR_FILE_DIRTY); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } - write_unlock_irq(&mapping->tree_lock); - if (mapping->host) { - /* !PageAnon && !swapper_space */ - __mark_inode_dirty(mapping->host, - I_DIRTY_PAGES); + if (!mapping) + return 1; + + write_lock_irq(&mapping->tree_lock); + mapping2 = page_mapping(page); + if (mapping2) { /* Race with truncate? */ + BUG_ON(mapping2 != mapping); + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + task_io_account_write(PAGE_CACHE_SIZE); } + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + write_unlock_irq(&mapping->tree_lock); + if (mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return 1; } @@ -672,9 +850,11 @@ int fastcall set_page_dirty(struct page *page) if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; - if (spd) - return (*spd)(page); - return __set_page_dirty_buffers(page); +#ifdef CONFIG_BLOCK + if (!spd) + spd = __set_page_dirty_buffers; +#endif + return (*spd)(page); } if (!PageDirty(page)) { if (!TestSetPageDirty(page)) @@ -705,39 +885,6 @@ int set_page_dirty_lock(struct page *page) } EXPORT_SYMBOL(set_page_dirty_lock); -/* - * Clear a page's dirty flag, while caring for dirty memory accounting. - * Returns true if the page was previously dirty. - */ -int test_clear_page_dirty(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - unsigned long flags; - - if (mapping) { - write_lock_irqsave(&mapping->tree_lock, flags); - if (TestClearPageDirty(page)) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); - /* - * We can continue to use `mapping' here because the - * page is locked, which pins the address_space - */ - if (mapping_cap_account_dirty(mapping)) { - page_mkclean(page); - dec_zone_page_state(page, NR_FILE_DIRTY); - } - return 1; - } - write_unlock_irqrestore(&mapping->tree_lock, flags); - return 0; - } - return TestClearPageDirty(page); -} -EXPORT_SYMBOL(test_clear_page_dirty); - /* * Clear a page's dirty flag, while caring for dirty memory accounting. * Returns true if the page was previously dirty. @@ -756,12 +903,41 @@ int clear_page_dirty_for_io(struct page *page) { struct address_space *mapping = page_mapping(page); - if (mapping) { + if (mapping && mapping_cap_account_dirty(mapping)) { + /* + * Yes, Virginia, this is indeed insane. + * + * We use this sequence to make sure that + * (a) we account for dirty stats properly + * (b) we tell the low-level filesystem to + * mark the whole page dirty if it was + * dirty in a pagetable. Only to then + * (c) clean the page again and return 1 to + * cause the writeback. + * + * This way we avoid all nasty races with the + * dirty bit in multiple places and clearing + * them concurrently from different threads. + * + * Note! Normally the "set_page_dirty(page)" + * has no effect on the actual dirty bit - since + * that will already usually be set. But we + * need the side effects, and it can help us + * avoid races. + * + * We basically use the page "master dirty bit" + * as a serialization point for all the different + * threads doing their things. + * + * FIXME! We still have a race here: if somebody + * adds the page back to the page tables in + * between the "page_mkclean()" and the "TestClearPageDirty()", + * we might have it mapped without the dirty bit set. + */ + if (page_mkclean(page)) + set_page_dirty(page); if (TestClearPageDirty(page)) { - if (mapping_cap_account_dirty(mapping)) { - page_mkclean(page); - dec_zone_page_state(page, NR_FILE_DIRTY); - } + dec_zone_page_state(page, NR_FILE_DIRTY); return 1; } return 0; @@ -818,15 +994,6 @@ int test_set_page_writeback(struct page *page) } EXPORT_SYMBOL(test_set_page_writeback); -/* - * Wakes up tasks that are being throttled due to writeback congestion - */ -void writeback_congestion_end(void) -{ - blk_congestion_end(WRITE); -} -EXPORT_SYMBOL(writeback_congestion_end); - /* * Return true if any of the pages in the mapping are marged with the * passed tag.