Don't call ->writepage from the VM scanner when a page is encountered for the first time during a scan. A new page flag, PG_skipped, is used for this. The flag is test-and-set just before calling ->writepage and is cleared when the page enters the inactive list. One can see this as a "second chance" algorithm for the dirty pages on the inactive list. BSD does the same: src/sys/vm/vm_pageout.c:vm_pageout_scan(), PG_WINATCFLS flag. The reason behind this is that ->writepages() will perform more efficient writeout than ->writepage(). Skipping of a page can be made conditional on zone->pressure. On the other hand, avoiding ->writepage() increases the amount of scanning performed by kswapd. (Possible drawback: executable text pages are evicted earlier.) Signed-off-by: Nikita Danilov include/linux/page-flags.h | 8 ++++ mm/swap.c | 1 mm/truncate.c | 2 + mm/vmscan.c | 80 +++++++++++++++++++++++++++++++-------------- 4 files changed, 67 insertions(+), 24 deletions(-) diff -puN mm/vmscan.c~skip-writepage mm/vmscan.c --- git-linux/mm/vmscan.c~skip-writepage 2005-10-24 13:46:18.000000000 +0400 +++ git-linux-nikita/mm/vmscan.c 2005-10-24 13:46:18.000000000 +0400 @@ -300,26 +300,24 @@ static void handle_write_error(struct ad } /* - * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). + * Called by shrink_list() for each dirty page. Calls ->writepage(). */ static pageout_t pageout(struct page *page, struct address_space *mapping) { /* - * If the page is dirty, only perform writeback if that write - * will be non-blocking. To prevent this allocation from being - * stalled by pagecache activity. But note that there may be - * stalls if we need to run get_block(). We could test - * PagePrivate for that. + * If the page is dirty, only perform writeback if that write will be + * non-blocking. To prevent this allocation from being stalled by + * pagecache activity. But note that there may be stalls if we need + * to run get_block(). We could test PagePrivate for that. 
* - * If this process is currently in generic_file_write() against - * this page's queue, we can perform writeback even if that - * will block. + * If this process is currently in generic_file_write() against this + * page's queue, we can perform writeback even if that will block. * - * If the page is swapcache, write it back even if that would - * block, for some throttling. This happens by accident, because - * swap_backing_dev_info is bust: it doesn't reflect the - * congestion state of the swapdevs. Easy to fix, if needed. - * See swapfile.c:page_queue_congested(). + * If the page is swapcache, write it back even if that would block, + * for some throttling. This happens by accident, because + * swap_backing_dev_info is bust: it doesn't reflect the congestion + * state of the swapdevs. Easy to fix, if needed. See + * swapfile.c:page_queue_congested(). */ if (!is_page_cache_freeable(page)) return PAGE_KEEP; @@ -341,18 +339,50 @@ static pageout_t pageout(struct page *pa return PAGE_ACTIVATE; if (!may_write_to_queue(mapping->backing_dev_info)) return PAGE_KEEP; - + /* + * Don't call ->writepage when page is met for the first time during + * scanning. Reasons: + * + * 1. if memory pressure is not too high, skipping ->writepage() + * may avoid writing out page that will be re-dirtied (should not + * be too important, because scanning starts from the tail of + * inactive list, where pages are _supposed_ to be rarely used, + * but when under constant memory pressure, inactive list is + * rotated and so is more FIFO than LRU). + * + * 2. ->writepages() writes data more efficiently than + * ->writepage(). + */ + if (!TestSetPageSkipped(page)) + return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { int res; + struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = SWAP_CLUSTER_MAX, - .nonblocking = 1, - .for_reclaim = 1, + /* + * synchronous page reclamation should be non blocking + * for the reasons outlined in the comment above. 
But + * in the kswapd blocking is ok. + * + * NOTE: + * + * 1. .nonblocking is not analyzed by existing + * in-tree implementations of ->writepage(). + * + * 2. may be if page zone is under considerable + * memory pressure (zone->prev_priority is low), + * .nonblocking should be set anyway. + */ + .nonblocking = !current_is_kswapd(), + .for_reclaim = 1 /* XXX not used */ }; + ClearPageSkipped(page); SetPageReclaim(page); res = mapping->a_ops->writepage(page, &wbc); + if (res < 0) handle_write_error(mapping, page, res); if (res == WRITEPAGE_ACTIVATE) { @@ -363,10 +393,8 @@ static pageout_t pageout(struct page *pa /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } - return PAGE_SUCCESS; } - return PAGE_CLEAN; } @@ -483,7 +511,7 @@ static int shrink_list(struct list_head * possible for a page to have PageDirty set, but it is actually * clean (all its buffers are clean). This happens if the * buffers were written out directly, with submit_bh(). ext3 - * will do this, as well as the blockdev mapping. + * will do this, as well as the blockdev mapping. * try_to_release_page() will discover that cleanness and will * drop the buffers and mark the page clean - it can be freed. * @@ -658,10 +686,13 @@ static void shrink_cache(struct zone *zo if (TestSetPageLRU(page)) BUG(); list_del(&page->lru); - if (PageActive(page)) + if (PageActive(page)) { + if (PageSkipped(page)) + ClearPageSkipped(page); add_page_to_active_list(zone, page); - else + } else { add_page_to_inactive_list(zone, page); + } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); @@ -772,6 +803,7 @@ refill_inactive_zone(struct zone *zone, BUG(); if (!TestClearPageActive(page)) BUG(); + ClearPageSkipped(page); list_move(&page->lru, &zone->inactive_list); pgmoved++; if (!pagevec_add(&pvec, page)) { @@ -912,7 +944,7 @@ shrink_caches(struct zone **zones, struc shrink_zone(zone, sc); } } - + /* * This is the main entry point to direct page reclaim. 
* @@ -1178,7 +1210,7 @@ out: /* * The background pageout daemon, started as a kernel thread - * from the init process. + * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity diff -puN include/linux/mm_inline.h~skip-writepage include/linux/mm_inline.h diff -puN include/linux/page-flags.h~skip-writepage include/linux/page-flags.h --- git-linux/include/linux/page-flags.h~skip-writepage 2005-10-24 13:46:18.000000000 +0400 +++ git-linux-nikita/include/linux/page-flags.h 2005-10-24 13:46:18.000000000 +0400 @@ -76,6 +76,8 @@ #define PG_nosave_free 18 /* Free, should not be written */ #define PG_uncached 19 /* Page has been mapped as uncached */ +#define PG_skipped 20 /* ->writepage() was skipped */ + /* * Global page accounting. One instance per CPU. Only unsigned longs are * allowed. @@ -162,6 +164,12 @@ extern void __mod_page_state(unsigned lo __mod_page_state(offset, (delta)); \ } while (0) +#define PageSkipped(page) test_bit(PG_skipped, &(page)->flags) +#define SetPageSkipped(page) set_bit(PG_skipped, &(page)->flags) +#define TestSetPageSkipped(page) test_and_set_bit(PG_skipped, &(page)->flags) +#define ClearPageSkipped(page) clear_bit(PG_skipped, &(page)->flags) +#define TestClearPageSkipped(page) test_and_clear_bit(PG_skipped, &(page)->flags) + /* * Manipulation of page state flags */ diff -puN mm/truncate.c~skip-writepage mm/truncate.c --- git-linux/mm/truncate.c~skip-writepage 2005-10-24 13:46:18.000000000 +0400 +++ git-linux-nikita/mm/truncate.c 2005-10-24 13:46:18.000000000 +0400 @@ -54,6 +54,7 @@ truncate_complete_page(struct address_sp clear_page_dirty(page); ClearPageUptodate(page); ClearPageMappedToDisk(page); + ClearPageSkipped(page); remove_from_page_cache(page); page_cache_release(page); /* pagecache ref */ } @@ -86,6 +87,7 @@ invalidate_complete_page(struct address_ __remove_from_page_cache(page); write_unlock_irq(&mapping->tree_lock); ClearPageUptodate(page); + 
ClearPageSkipped(page); page_cache_release(page); /* pagecache ref */ return 1; } diff -puN mm/swap.c~skip-writepage mm/swap.c --- git-linux/mm/swap.c~skip-writepage 2005-10-24 13:46:18.000000000 +0400 +++ git-linux-nikita/mm/swap.c 2005-10-24 13:46:18.000000000 +0400 @@ -303,6 +303,7 @@ void __pagevec_lru_add(struct pagevec *p } if (TestSetPageLRU(page)) BUG(); + ClearPageSkipped(page); add_page_to_inactive_list(zone, page); } if (zone) _