diff -up linux-2.6.25.noarch/include/linux/swap.h.splitlru linux-2.6.25.noarch/include/linux/swap.h --- linux-2.6.25.noarch/include/linux/swap.h.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/include/linux/swap.h 2008-04-30 19:14:26.000000000 -0400 @@ -171,8 +171,8 @@ extern unsigned int nr_free_pagecache_pa /* linux/mm/swap.c */ -extern void lru_cache_add(struct page *); -extern void lru_cache_add_active(struct page *); +extern void __lru_cache_add(struct page *, enum lru_list lru); +extern void lru_cache_add_lru(struct page *, enum lru_list lru); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); @@ -180,12 +180,48 @@ extern int lru_add_drain_all(void); extern int rotate_reclaimable_page(struct page *page); extern void swap_setup(void); +/** + * lru_cache_add_anon: add a page to the inactive anon LRU list + * @page: the page to add + */ +static inline void lru_cache_add_anon(struct page *page) +{ + __lru_cache_add(page, LRU_INACTIVE_ANON); +} + +static inline void lru_cache_add_active_anon(struct page *page) +{ + __lru_cache_add(page, LRU_ACTIVE_ANON); +} + +static inline void lru_cache_add_file(struct page *page) +{ + __lru_cache_add(page, LRU_INACTIVE_FILE); +} + +static inline void lru_cache_add_active_file(struct page *page) +{ + __lru_cache_add(page, LRU_ACTIVE_FILE); +} + +#ifdef CONFIG_NORECLAIM_LRU +static inline void lru_cache_add_noreclaim(struct page *page) +{ + __lru_cache_add(page, LRU_NORECLAIM); +} +#else +static inline void lru_cache_add_noreclaim(struct page *page) +{ + BUG(); +} +#endif + /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, gfp_t gfp_mask); -extern int __isolate_lru_page(struct page *page, int mode); +extern int __isolate_lru_page(struct page *page, int mode, int file); extern unsigned long shrink_all_memory(unsigned long 
nr_pages); extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); @@ -204,6 +240,20 @@ static inline int zone_reclaim(struct zo } #endif +#ifdef CONFIG_NORECLAIM_LRU +extern int page_reclaimable(struct page *page, struct vm_area_struct *vma); +extern void scan_mapping_noreclaim_pages(struct address_space *); +#else +static inline int page_reclaimable(struct page *page, + struct vm_area_struct *vma) +{ + return 1; +} +static inline void scan_mapping_noreclaim_pages(struct address_space *mapping) +{ +} +#endif + extern int kswapd_run(int nid); #ifdef CONFIG_MMU diff -up linux-2.6.25.noarch/include/linux/page-flags.h.splitlru linux-2.6.25.noarch/include/linux/page-flags.h --- linux-2.6.25.noarch/include/linux/page-flags.h.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/include/linux/page-flags.h 2008-05-07 20:45:30.000000000 -0400 @@ -89,6 +89,7 @@ #define PG_mappedtodisk 16 /* Has blocks allocated on-disk */ #define PG_reclaim 17 /* To be reclaimed asap */ #define PG_buddy 19 /* Page is free, on buddy lists */ +#define PG_swapbacked 20 /* Page is backed by RAM/swap */ /* PG_readahead is only used for file reads; PG_reclaim is only for writes */ #define PG_readahead PG_reclaim /* Reminder to do async read-ahead */ @@ -105,6 +106,9 @@ * 64 bit | FIELDS | ?????? 
FLAGS | * 63 32 0 */ +#ifdef CONFIG_NORECLAIM_LRU +#define PG_noreclaim 30 /* Page is "non-reclaimable" */ +#endif #define PG_uncached 31 /* Page has been mapped as uncached */ #endif @@ -195,6 +199,7 @@ static inline void SetPageUptodate(struc #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) #define __ClearPageActive(page) __clear_bit(PG_active, &(page)->flags) +#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) #define __SetPageSlab(page) __set_bit(PG_slab, &(page)->flags) @@ -252,6 +257,10 @@ static inline void SetPageUptodate(struc #define ClearPageReclaim(page) clear_bit(PG_reclaim, &(page)->flags) #define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags) +#define PageSwapBacked(page) test_bit(PG_swapbacked, &(page)->flags) +#define SetPageSwapBacked(page) set_bit(PG_swapbacked, &(page)->flags) +#define __ClearPageSwapBacked(page) __clear_bit(PG_swapbacked, &(page)->flags) + #define PageCompound(page) test_bit(PG_compound, &(page)->flags) #define __SetPageCompound(page) __set_bit(PG_compound, &(page)->flags) #define __ClearPageCompound(page) __clear_bit(PG_compound, &(page)->flags) @@ -292,6 +301,21 @@ static inline void __ClearPageTail(struc #define PageSwapCache(page) 0 #endif +#ifdef CONFIG_NORECLAIM_LRU +#define PageNoreclaim(page) test_bit(PG_noreclaim, &(page)->flags) +#define SetPageNoreclaim(page) set_bit(PG_noreclaim, &(page)->flags) +#define ClearPageNoreclaim(page) clear_bit(PG_noreclaim, &(page)->flags) +#define __ClearPageNoreclaim(page) __clear_bit(PG_noreclaim, &(page)->flags) +#define TestClearPageNoreclaim(page) \ + test_and_clear_bit(PG_noreclaim, &(page)->flags) +#else +#define PageNoreclaim(page) 0 +#define SetPageNoreclaim(page) +#define ClearPageNoreclaim(page) +#define __ClearPageNoreclaim(page) +#define TestClearPageNoreclaim(page) 0 +#endif + 
#define PageUncached(page) test_bit(PG_uncached, &(page)->flags) #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) diff -up linux-2.6.25.noarch/include/linux/pagemap.h.splitlru linux-2.6.25.noarch/include/linux/pagemap.h --- linux-2.6.25.noarch/include/linux/pagemap.h.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/include/linux/pagemap.h 2008-04-30 19:10:58.000000000 -0400 @@ -30,6 +30,34 @@ static inline void mapping_set_error(str } } +#ifdef CONFIG_NORECLAIM_LRU +#define AS_NORECLAIM (__GFP_BITS_SHIFT + 2) /* e.g., ramdisk, SHM_LOCK */ + +static inline void mapping_set_noreclaim(struct address_space *mapping) +{ + set_bit(AS_NORECLAIM, &mapping->flags); +} + +static inline void mapping_clear_noreclaim(struct address_space *mapping) +{ + clear_bit(AS_NORECLAIM, &mapping->flags); +} + +static inline int mapping_non_reclaimable(struct address_space *mapping) +{ + if (mapping) + return test_bit(AS_NORECLAIM, &mapping->flags); + return 0; +} +#else +static inline void mapping_set_noreclaim(struct address_space *mapping) { } +static inline void mapping_clear_noreclaim(struct address_space *mapping) { } +static inline int mapping_non_reclaimable(struct address_space *mapping) +{ + return 0; +} +#endif + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; diff -up linux-2.6.25.noarch/include/linux/pagevec.h.splitlru linux-2.6.25.noarch/include/linux/pagevec.h --- linux-2.6.25.noarch/include/linux/pagevec.h.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/include/linux/pagevec.h 2008-04-30 19:10:58.000000000 -0400 @@ -23,9 +23,9 @@ struct pagevec { void __pagevec_release(struct pagevec *pvec); void __pagevec_release_nonlru(struct pagevec *pvec); void __pagevec_free(struct pagevec *pvec); -void __pagevec_lru_add(struct pagevec *pvec); -void __pagevec_lru_add_active(struct 
pagevec *pvec); +void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); void pagevec_strip(struct pagevec *pvec); +void pagevec_swap_free(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, @@ -81,10 +81,43 @@ static inline void pagevec_free(struct p __pagevec_free(pvec); } -static inline void pagevec_lru_add(struct pagevec *pvec) +static inline void __pagevec_lru_add_anon(struct pagevec *pvec) +{ + ____pagevec_lru_add(pvec, LRU_INACTIVE_ANON); +} + +static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec) +{ + ____pagevec_lru_add(pvec, LRU_ACTIVE_ANON); +} + +static inline void __pagevec_lru_add_file(struct pagevec *pvec) +{ + ____pagevec_lru_add(pvec, LRU_INACTIVE_FILE); +} + +static inline void __pagevec_lru_add_active_file(struct pagevec *pvec) +{ + ____pagevec_lru_add(pvec, LRU_ACTIVE_FILE); +} + +#ifdef CONFIG_NORECLAIM_LRU +static inline void __pagevec_lru_add_noreclaim(struct pagevec *pvec) +{ + ____pagevec_lru_add(pvec, LRU_NORECLAIM); +} +#endif + +static inline void pagevec_lru_add_file(struct pagevec *pvec) +{ + if (pagevec_count(pvec)) + __pagevec_lru_add_file(pvec); +} + +static inline void pagevec_lru_add_anon(struct pagevec *pvec) { if (pagevec_count(pvec)) - __pagevec_lru_add(pvec); + __pagevec_lru_add_anon(pvec); } #endif /* _LINUX_PAGEVEC_H */ diff -up linux-2.6.25.noarch/include/linux/memcontrol.h.splitlru linux-2.6.25.noarch/include/linux/memcontrol.h --- linux-2.6.25.noarch/include/linux/memcontrol.h.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/include/linux/memcontrol.h 2008-04-30 19:10:58.000000000 -0400 @@ -44,7 +44,7 @@ extern unsigned long mem_cgroup_isolate_ unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active); + int active, int file); extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t 
gfp_mask); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); @@ -67,10 +67,8 @@ extern void mem_cgroup_note_reclaim_prio extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority); -extern long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, - struct zone *zone, int priority); -extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, - struct zone *zone, int priority); +extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, + int priority, enum lru_list lru); #else /* CONFIG_CGROUP_MEM_RES_CTLR */ static inline void mm_init_cgroup(struct mm_struct *mm, @@ -161,14 +159,9 @@ static inline void mem_cgroup_record_rec { } -static inline long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, - struct zone *zone, int priority) -{ - return 0; -} - -static inline long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, - struct zone *zone, int priority) +static inline long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, + struct zone *zone, int priority, + enum lru_list lru) { return 0; } diff -up linux-2.6.25.noarch/include/linux/vmstat.h.splitlru linux-2.6.25.noarch/include/linux/vmstat.h --- linux-2.6.25.noarch/include/linux/vmstat.h.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/include/linux/vmstat.h 2008-04-30 19:10:58.000000000 -0400 @@ -149,6 +149,16 @@ static inline unsigned long zone_page_st return x; } +extern unsigned long global_lru_pages(void); + +static inline unsigned long zone_lru_pages(struct zone *zone) +{ + return (zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_FILE)); +} + #ifdef CONFIG_NUMA /* * Determine the per node value of a stat item. 
This function diff -up linux-2.6.25.noarch/include/linux/mmzone.h.splitlru linux-2.6.25.noarch/include/linux/mmzone.h --- linux-2.6.25.noarch/include/linux/mmzone.h.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/include/linux/mmzone.h 2008-04-30 21:03:35.000000000 -0400 @@ -80,20 +80,27 @@ struct zone_padding { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, - NR_INACTIVE, - NR_ACTIVE, + NR_INACTIVE_ANON, /* must match order of LRU_[IN]ACTIVE_* */ + NR_ACTIVE_ANON, /* " " " " " */ + NR_INACTIVE_FILE, /* " " " " " */ + NR_ACTIVE_FILE, /* " " " " " */ +#ifdef CONFIG_NORECLAIM_LRU + NR_NORECLAIM, /* " " " " " */ +#else + NR_NORECLAIM = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */ +#endif NR_ANON_PAGES, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, - /* Second 128 byte cacheline */ NR_SLAB_RECLAIMABLE, NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_BOUNCE, + /* Second 128 byte cacheline */ NR_VMSCAN_WRITE, #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ @@ -105,6 +112,54 @@ enum zone_stat_item { #endif NR_VM_ZONE_STAT_ITEMS }; +/* + * We do arithmetic on the LRU lists in various places in the code, + * so it is important to keep the active lists LRU_ACTIVE higher in + * the array than the corresponding inactive lists, and to keep + * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. 
+ */ +#define LRU_BASE 0 +#define LRU_ACTIVE 1 +#define LRU_FILE 2 + +enum lru_list { + LRU_INACTIVE_ANON = LRU_BASE, + LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, + LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, + LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, +#ifdef CONFIG_NORECLAIM_LRU + LRU_NORECLAIM, +#else + LRU_NORECLAIM = LRU_ACTIVE_FILE, /* avoid compiler errors in dead code */ +#endif + NR_LRU_LISTS +}; + +#define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++) + +#define for_each_reclaimable_lru(l) for (l = 0; l <= LRU_ACTIVE_FILE; l++) + +static inline int is_file_lru(enum lru_list l) +{ + return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE); +} + +static inline int is_active_lru(enum lru_list l) +{ + return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE); +} + +static inline int is_noreclaim_lru(enum lru_list l) +{ +#ifdef CONFIG_NORECLAIM_LRU + return (l == LRU_NORECLAIM); +#else + return 0; +#endif +} + +enum lru_list page_lru(struct page *page); + struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ @@ -258,10 +313,14 @@ struct zone { /* Fields commonly accessed by the page reclaim scanner */ spinlock_t lru_lock; - struct list_head active_list; - struct list_head inactive_list; - unsigned long nr_scan_active; - unsigned long nr_scan_inactive; + struct list_head list[NR_LRU_LISTS]; + unsigned long nr_scan[NR_LRU_LISTS]; + + unsigned long recent_rotated_anon; + unsigned long recent_rotated_file; + unsigned long recent_scanned_anon; + unsigned long recent_scanned_file; + unsigned long pages_scanned; /* since last reclaim */ unsigned long flags; /* zone flags, see below */ @@ -283,6 +342,11 @@ struct zone { */ int prev_priority; + /* + * The ratio of active to inactive pages. 
+ */ + unsigned int inactive_ratio; + ZONE_PADDING(_pad2_) /* Rarely used or read-mostly fields */ diff -up linux-2.6.25.noarch/include/linux/mm_inline.h.splitlru linux-2.6.25.noarch/include/linux/mm_inline.h --- linux-2.6.25.noarch/include/linux/mm_inline.h.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/include/linux/mm_inline.h 2008-04-30 19:10:58.000000000 -0400 @@ -1,40 +1,137 @@ -static inline void -add_page_to_active_list(struct zone *zone, struct page *page) +#ifndef LINUX_MM_INLINE_H +#define LINUX_MM_INLINE_H + +/** + * page_file_cache - should the page be on a file LRU or anon LRU? + * @page: the page to test + * + * Returns LRU_FILE if @page is page cache page backed by a regular filesystem, + * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed. + * + * We would like to get this info without a page flag, but the state + * needs to survive until the page is last deleted from the LRU, which + * could be as far down as __page_cache_release. + */ +static inline int page_file_cache(struct page *page) { - list_add(&page->lru, &zone->active_list); - __inc_zone_state(zone, NR_ACTIVE); + if (PageSwapBacked(page)) + return 0; + + /* The page is page cache backed by a normal filesystem. 
*/ + return LRU_FILE; } static inline void -add_page_to_inactive_list(struct zone *zone, struct page *page) +add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) { - list_add(&page->lru, &zone->inactive_list); - __inc_zone_state(zone, NR_INACTIVE); + list_add(&page->lru, &zone->list[l]); + __inc_zone_state(zone, NR_INACTIVE_ANON + l); } static inline void -del_page_from_active_list(struct zone *zone, struct page *page) +del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_del(&page->lru); - __dec_zone_state(zone, NR_ACTIVE); + __dec_zone_state(zone, NR_INACTIVE_ANON + l); } static inline void -del_page_from_inactive_list(struct zone *zone, struct page *page) +add_page_to_inactive_anon_list(struct zone *zone, struct page *page) { - list_del(&page->lru); - __dec_zone_state(zone, NR_INACTIVE); + add_page_to_lru_list(zone, page, LRU_INACTIVE_ANON); } static inline void +add_page_to_active_anon_list(struct zone *zone, struct page *page) +{ + add_page_to_lru_list(zone, page, LRU_ACTIVE_ANON); +} + +static inline void +add_page_to_inactive_file_list(struct zone *zone, struct page *page) +{ + add_page_to_lru_list(zone, page, LRU_INACTIVE_FILE); +} + +static inline void +add_page_to_active_file_list(struct zone *zone, struct page *page) +{ + add_page_to_lru_list(zone, page, LRU_ACTIVE_FILE); +} + +static inline void +del_page_from_inactive_anon_list(struct zone *zone, struct page *page) +{ + del_page_from_lru_list(zone, page, LRU_INACTIVE_ANON); +} + +static inline void +del_page_from_active_anon_list(struct zone *zone, struct page *page) +{ + del_page_from_lru_list(zone, page, LRU_ACTIVE_ANON); +} + +static inline void +del_page_from_inactive_file_list(struct zone *zone, struct page *page) +{ + del_page_from_lru_list(zone, page, LRU_INACTIVE_FILE); +} + +static inline void +del_page_from_active_file_list(struct zone *zone, struct page *page) +{ + del_page_from_lru_list(zone, page, LRU_ACTIVE_FILE); +} + +#ifdef 
CONFIG_NORECLAIM_LRU +static inline void +add_page_to_noreclaim_list(struct zone *zone, struct page *page) +{ + add_page_to_lru_list(zone, page, LRU_NORECLAIM); +} + +static inline void +del_page_from_noreclaim_list(struct zone *zone, struct page *page) +{ + del_page_from_lru_list(zone, page, LRU_NORECLAIM); +} +#else +static inline void +add_page_to_noreclaim_list(struct zone *zone, struct page *page) { } + +static inline void +del_page_from_noreclaim_list(struct zone *zone, struct page *page) { } +#endif + +static inline void del_page_from_lru(struct zone *zone, struct page *page) { + enum lru_list l = LRU_INACTIVE_ANON; + list_del(&page->lru); - if (PageActive(page)) { - __ClearPageActive(page); - __dec_zone_state(zone, NR_ACTIVE); + if (PageNoreclaim(page)) { + __ClearPageNoreclaim(page); + l = LRU_NORECLAIM; } else { - __dec_zone_state(zone, NR_INACTIVE); + if (PageActive(page)) { + __ClearPageActive(page); + l += LRU_ACTIVE; + } + l += page_file_cache(page); } + __dec_zone_state(zone, NR_INACTIVE_ANON + l); } +static inline int inactive_anon_low(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_ANON); + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + + if (inactive * zone->inactive_ratio < active) + return 1; + + return 0; +} +#endif diff -up linux-2.6.25.noarch/include/linux/migrate.h.splitlru linux-2.6.25.noarch/include/linux/migrate.h --- linux-2.6.25.noarch/include/linux/migrate.h.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/include/linux/migrate.h 2008-04-30 19:10:58.000000000 -0400 @@ -25,7 +25,6 @@ static inline int vma_migratable(struct return 1; } -extern int isolate_lru_page(struct page *p, struct list_head *pagelist); extern int putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); @@ -42,8 +41,6 @@ extern int migrate_vmas(struct mm_struct static inline int vma_migratable(struct vm_area_struct *vma) { 
return 0; } -static inline int isolate_lru_page(struct page *p, struct list_head *list) - { return -ENOSYS; } static inline int putback_lru_pages(struct list_head *l) { return 0; } static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private) { return -ENOSYS; } diff -up linux-2.6.25.noarch/drivers/block/brd.c.splitlru linux-2.6.25.noarch/drivers/block/brd.c --- linux-2.6.25.noarch/drivers/block/brd.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/drivers/block/brd.c 2008-04-30 19:10:58.000000000 -0400 @@ -373,8 +373,21 @@ static int brd_ioctl(struct inode *inode return error; } +/* + * brd_open(): + * Just mark the mapping as containing non-reclaimable pages + */ +static int brd_open(struct inode *inode, struct file *filp) +{ + struct address_space *mapping = inode->i_mapping; + + mapping_set_noreclaim(mapping); + return 0; +} + static struct block_device_operations brd_fops = { .owner = THIS_MODULE, + .open = brd_open, .ioctl = brd_ioctl, #ifdef CONFIG_BLK_DEV_XIP .direct_access = brd_direct_access, diff -up linux-2.6.25.noarch/drivers/base/node.c.splitlru linux-2.6.25.noarch/drivers/base/node.c --- linux-2.6.25.noarch/drivers/base/node.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/drivers/base/node.c 2008-04-30 21:25:57.000000000 -0400 @@ -45,33 +45,49 @@ static ssize_t node_read_meminfo(struct si_meminfo_node(&i, nid); n = sprintf(buf, "\n" - "Node %d MemTotal: %8lu kB\n" - "Node %d MemFree: %8lu kB\n" - "Node %d MemUsed: %8lu kB\n" - "Node %d Active: %8lu kB\n" - "Node %d Inactive: %8lu kB\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d Active: %8lu kB\n" + "Node %d Inactive: %8lu kB\n" + "Node %d Active(anon): %8lu kB\n" + "Node %d Inactive(anon): %8lu kB\n" + "Node %d Active(file): %8lu kB\n" + "Node %d Inactive(file): %8lu kB\n" +#ifdef CONFIG_NORECLAIM_LRU + "Node %d Noreclaim: %8lu kB\n" +#endif #ifdef CONFIG_HIGHMEM - 
"Node %d HighTotal: %8lu kB\n" - "Node %d HighFree: %8lu kB\n" - "Node %d LowTotal: %8lu kB\n" - "Node %d LowFree: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n" #endif - "Node %d Dirty: %8lu kB\n" - "Node %d Writeback: %8lu kB\n" - "Node %d FilePages: %8lu kB\n" - "Node %d Mapped: %8lu kB\n" - "Node %d AnonPages: %8lu kB\n" - "Node %d PageTables: %8lu kB\n" - "Node %d NFS_Unstable: %8lu kB\n" - "Node %d Bounce: %8lu kB\n" - "Node %d Slab: %8lu kB\n" - "Node %d SReclaimable: %8lu kB\n" - "Node %d SUnreclaim: %8lu kB\n", + "Node %d Dirty: %8lu kB\n" + "Node %d Writeback: %8lu kB\n" + "Node %d FilePages: %8lu kB\n" + "Node %d Mapped: %8lu kB\n" + "Node %d AnonPages: %8lu kB\n" + "Node %d PageTables: %8lu kB\n" + "Node %d NFS_Unstable: %8lu kB\n" + "Node %d Bounce: %8lu kB\n" + "Node %d Slab: %8lu kB\n" + "Node %d SReclaimable: %8lu kB\n" + "Node %d SUnreclaim: %8lu kB\n", nid, K(i.totalram), nid, K(i.freeram), nid, K(i.totalram - i.freeram), - nid, node_page_state(nid, NR_ACTIVE), - nid, node_page_state(nid, NR_INACTIVE), + nid, K(node_page_state(nid, NR_ACTIVE_ANON) + + node_page_state(nid, NR_ACTIVE_FILE)), + nid, K(node_page_state(nid, NR_INACTIVE_ANON) + + node_page_state(nid, NR_INACTIVE_FILE)), + nid, K(node_page_state(nid, NR_ACTIVE_ANON)), + nid, K(node_page_state(nid, NR_INACTIVE_ANON)), + nid, K(node_page_state(nid, NR_ACTIVE_FILE)), + nid, K(node_page_state(nid, NR_INACTIVE_FILE)), +#ifdef CONFIG_NORECLAIM_LRU + nid, K(node_page_state(nid, NR_NORECLAIM)), +#endif #ifdef CONFIG_HIGHMEM nid, K(i.totalhigh), nid, K(i.freehigh), diff -up linux-2.6.25.noarch/mm/memory.c.splitlru linux-2.6.25.noarch/mm/memory.c --- linux-2.6.25.noarch/mm/memory.c.splitlru 2008-04-28 17:17:11.000000000 -0400 +++ linux-2.6.25.noarch/mm/memory.c 2008-04-30 19:10:58.000000000 -0400 @@ -1728,7 +1728,8 @@ gotten: ptep_clear_flush(vma, address, page_table); set_pte_at(mm, address, page_table, entry); 
update_mmu_cache(vma, address, entry); - lru_cache_add_active(new_page); + SetPageSwapBacked(new_page); + lru_cache_add_active_anon(new_page); page_add_new_anon_rmap(new_page, vma, address); /* Free the old page.. */ @@ -2196,7 +2197,8 @@ static int do_anonymous_page(struct mm_s if (!pte_none(*page_table)) goto release; inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); + SetPageSwapBacked(page); + lru_cache_add_active_anon(page); page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); @@ -2350,7 +2352,8 @@ static int __do_fault(struct mm_struct * set_pte_at(mm, address, page_table, entry); if (anon) { inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); + SetPageSwapBacked(page); + lru_cache_add_active_anon(page); page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); diff -up linux-2.6.25.noarch/mm/shmem.c.splitlru linux-2.6.25.noarch/mm/shmem.c --- linux-2.6.25.noarch/mm/shmem.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/shmem.c 2008-04-30 19:10:58.000000000 -0400 @@ -1434,6 +1434,7 @@ repeat: goto failed; } + SetPageSwapBacked(filepage); spin_lock(&info->lock); entry = shmem_swp_alloc(info, idx, sgp); if (IS_ERR(entry)) @@ -1524,10 +1525,13 @@ int shmem_lock(struct file *file, int lo if (!user_shm_lock(inode->i_size, user)) goto out_nomem; info->flags |= VM_LOCKED; + mapping_set_noreclaim(file->f_mapping); } if (!lock && (info->flags & VM_LOCKED) && user) { user_shm_unlock(inode->i_size, user); info->flags &= ~VM_LOCKED; + mapping_clear_noreclaim(file->f_mapping); + scan_mapping_noreclaim_pages(file->f_mapping); } retval = 0; out_nomem: diff -up linux-2.6.25.noarch/mm/memcontrol.c.splitlru linux-2.6.25.noarch/mm/memcontrol.c --- linux-2.6.25.noarch/mm/memcontrol.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/memcontrol.c 2008-04-30 22:04:08.000000000 -0400 @@ -30,6 +30,7 @@ #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/seq_file.h> +#include <linux/mm_inline.h> #include <asm/uaccess.h> @@ -80,22 
+81,13 @@ static s64 mem_cgroup_read_stat(struct m /* * per-zone information in memory controller. */ - -enum mem_cgroup_zstat_index { - MEM_CGROUP_ZSTAT_ACTIVE, - MEM_CGROUP_ZSTAT_INACTIVE, - - NR_MEM_CGROUP_ZSTAT, -}; - struct mem_cgroup_per_zone { /* * spin_lock to protect the per cgroup LRU */ spinlock_t lru_lock; - struct list_head active_list; - struct list_head inactive_list; - unsigned long count[NR_MEM_CGROUP_ZSTAT]; + struct list_head lists[NR_LRU_LISTS]; + unsigned long count[NR_LRU_LISTS]; }; /* Macro for accessing counter */ #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) @@ -166,6 +158,7 @@ struct page_cgroup { }; #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ +#define PAGE_CGROUP_FLAG_FILE (0x4) /* page is file system backed */ static int page_cgroup_nid(struct page_cgroup *pc) { @@ -215,7 +208,7 @@ page_cgroup_zoneinfo(struct page_cgroup } static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, - enum mem_cgroup_zstat_index idx) + enum lru_list idx) { int nid, zid; struct mem_cgroup_per_zone *mz; @@ -289,13 +282,15 @@ static void unlock_page_cgroup(struct pa static void __mem_cgroup_remove_list(struct page_cgroup *pc) { - int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); + int lru = LRU_BASE; - if (from) - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; - else - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; + if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE) + lru += LRU_ACTIVE; + if (pc->flags & PAGE_CGROUP_FLAG_FILE) + lru += LRU_FILE; + + MEM_CGROUP_ZSTAT(mz, lru) -= 1; mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); list_del_init(&pc->lru); @@ -303,38 +298,38 @@ static void __mem_cgroup_remove_list(str static void __mem_cgroup_add_list(struct page_cgroup *pc) { - int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; struct mem_cgroup_per_zone *mz = 
page_cgroup_zoneinfo(pc); + int lru = LRU_BASE; + + if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE) + lru += LRU_ACTIVE; + if (pc->flags & PAGE_CGROUP_FLAG_FILE) + lru += LRU_FILE; + + MEM_CGROUP_ZSTAT(mz, lru) += 1; + list_add(&pc->lru, &mz->lists[lru]); - if (!to) { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; - list_add(&pc->lru, &mz->inactive_list); - } else { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; - list_add(&pc->lru, &mz->active_list); - } mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); } static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) { - int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); + int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; + int file = pc->flags & PAGE_CGROUP_FLAG_FILE; + int lru = LRU_FILE * !!file + !!from; - if (from) - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; - else - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; + MEM_CGROUP_ZSTAT(mz, lru) -= 1; - if (active) { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; + if (active) pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; - list_move(&pc->lru, &mz->active_list); - } else { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; + else pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; - list_move(&pc->lru, &mz->inactive_list); - } + + lru = LRU_FILE * !!file + !!active; + MEM_CGROUP_ZSTAT(mz, lru) += 1; + list_move(&pc->lru, &mz->lists[lru]); + } int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) @@ -394,21 +389,6 @@ int mem_cgroup_calc_mapped_ratio(struct } /* - * This function is called from vmscan.c. In page reclaiming loop. balance - * between active and inactive list is calculated. For memory controller - * page reclaiming, we should use using mem_cgroup's imbalance rather than - * zone's global lru imbalance. 
- */ -long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) -{ - unsigned long active, inactive; - /* active and inactive are the number of pages. 'long' is ok.*/ - active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE); - inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE); - return (long) (active / (inactive + 1)); -} - -/* * prev_priority control...this will be used in memory reclaim path. */ int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) @@ -435,28 +415,17 @@ void mem_cgroup_record_reclaim_priority( * (see include/linux/mmzone.h) */ -long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, - struct zone *zone, int priority) +long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, + int priority, enum lru_list lru) { - long nr_active; + long nr_pages; int nid = zone->zone_pgdat->node_id; int zid = zone_idx(zone); struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); - nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); - return (nr_active >> priority); -} - -long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, - struct zone *zone, int priority) -{ - long nr_inactive; - int nid = zone->zone_pgdat->node_id; - int zid = zone_idx(zone); - struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); + nr_pages = MEM_CGROUP_ZSTAT(mz, lru); - nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); - return (nr_inactive >> priority); + return (nr_pages >> priority); } unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, @@ -464,7 +433,7 @@ unsigned long mem_cgroup_isolate_pages(u unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active) + int active, int file) { unsigned long nr_taken = 0; struct page *page; @@ -475,13 +444,10 @@ unsigned long mem_cgroup_isolate_pages(u int nid = z->zone_pgdat->node_id; int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; + int lru = LRU_FILE * !!file + !!active; mz 
= mem_cgroup_zoneinfo(mem_cont, nid, zid); - if (active) - src = &mz->active_list; - else - src = &mz->inactive_list; - + src = &mz->lists[lru]; spin_lock(&mz->lru_lock); scan = 0; @@ -493,6 +459,9 @@ unsigned long mem_cgroup_isolate_pages(u if (unlikely(!PageLRU(page))) continue; + /* + * TODO: play better with lumpy reclaim, grabbing anything. + */ if (PageActive(page) && !active) { __mem_cgroup_move_lists(pc, true); continue; @@ -505,7 +474,7 @@ unsigned long mem_cgroup_isolate_pages(u scan++; list_move(&pc->lru, &pc_list); - if (__isolate_lru_page(page, mode) == 0) { + if (__isolate_lru_page(page, mode, file) == 0) { list_move(&page->lru, dst); nr_taken++; } @@ -611,6 +580,8 @@ retry: pc->flags = PAGE_CGROUP_FLAG_ACTIVE; if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) pc->flags |= PAGE_CGROUP_FLAG_CACHE; + if (page_file_cache(page)) + pc->flags |= PAGE_CGROUP_FLAG_FILE; lock_page_cgroup(page); if (page_get_page_cgroup(page)) { @@ -773,7 +744,7 @@ void mem_cgroup_page_migration(struct pa #define FORCE_UNCHARGE_BATCH (128) static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, struct mem_cgroup_per_zone *mz, - int active) + enum lru_list lru) { struct page_cgroup *pc; struct page *page; @@ -781,10 +752,7 @@ static void mem_cgroup_force_empty_list( unsigned long flags; struct list_head *list; - if (active) - list = &mz->active_list; - else - list = &mz->inactive_list; + list = &mz->lists[lru]; spin_lock_irqsave(&mz->lru_lock, flags); while (!list_empty(list)) { @@ -827,11 +795,10 @@ static int mem_cgroup_force_empty(struct for_each_node_state(node, N_POSSIBLE) for (zid = 0; zid < MAX_NR_ZONES; zid++) { struct mem_cgroup_per_zone *mz; + enum lru_list l; mz = mem_cgroup_zoneinfo(mem, node, zid); - /* drop all page_cgroup in active_list */ - mem_cgroup_force_empty_list(mem, mz, 1); - /* drop all page_cgroup in inactive_list */ - mem_cgroup_force_empty_list(mem, mz, 0); + for_each_lru(l) + mem_cgroup_force_empty_list(mem, mz, l); } } ret = 0; @@ -919,14 +886,21 @@ 
static int mem_control_stat_show(struct } /* showing # of active pages */ { - unsigned long active, inactive; + unsigned long active_anon, inactive_anon; + unsigned long active_file, inactive_file; - inactive = mem_cgroup_get_all_zonestat(mem_cont, - MEM_CGROUP_ZSTAT_INACTIVE); - active = mem_cgroup_get_all_zonestat(mem_cont, - MEM_CGROUP_ZSTAT_ACTIVE); - seq_printf(m, "active %ld\n", (active) * PAGE_SIZE); - seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE); + inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, + LRU_INACTIVE_ANON); + active_anon = mem_cgroup_get_all_zonestat(mem_cont, + LRU_ACTIVE_ANON); + inactive_file = mem_cgroup_get_all_zonestat(mem_cont, + LRU_INACTIVE_FILE); + active_file = mem_cgroup_get_all_zonestat(mem_cont, + LRU_ACTIVE_FILE); + seq_printf(m, "active_anon %ld\n", (active_anon) * PAGE_SIZE); + seq_printf(m, "inactive_anon %ld\n", (inactive_anon) * PAGE_SIZE); + seq_printf(m, "active_file %ld\n", (active_file) * PAGE_SIZE); + seq_printf(m, "inactive_file %ld\n", (inactive_file) * PAGE_SIZE); } return 0; } @@ -978,6 +952,7 @@ static int alloc_mem_cgroup_per_zone_inf { struct mem_cgroup_per_node *pn; struct mem_cgroup_per_zone *mz; + enum lru_list l; int zone, tmp = node; /* * This routine is called against possible nodes. 
@@ -998,9 +973,9 @@ static int alloc_mem_cgroup_per_zone_inf for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; - INIT_LIST_HEAD(&mz->active_list); - INIT_LIST_HEAD(&mz->inactive_list); spin_lock_init(&mz->lru_lock); + for_each_lru(l) + INIT_LIST_HEAD(&mz->lists[l]); } return 0; } diff -up linux-2.6.25.noarch/mm/internal.h.splitlru linux-2.6.25.noarch/mm/internal.h --- linux-2.6.25.noarch/mm/internal.h.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/internal.h 2008-04-30 22:04:41.000000000 -0400 @@ -34,6 +34,8 @@ static inline void __put_page(struct pag atomic_dec(&page->_count); } +extern int isolate_lru_page(struct page *page); + extern void __init __free_pages_bootmem(struct page *page, unsigned int order); diff -up linux-2.6.25.noarch/mm/migrate.c.splitlru linux-2.6.25.noarch/mm/migrate.c --- linux-2.6.25.noarch/mm/migrate.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/migrate.c 2008-04-30 19:10:58.000000000 -0400 @@ -36,36 +36,6 @@ #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) /* - * Isolate one page from the LRU lists. If successful put it onto - * the indicated list with elevated page count. - * - * Result: - * -EBUSY: page not on LRU list - * 0: page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page, struct list_head *pagelist) -{ - int ret = -EBUSY; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && get_page_unless_zero(page)) { - ret = 0; - ClearPageLRU(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - list_add_tail(&page->lru, pagelist); - } - spin_unlock_irq(&zone->lru_lock); - } - return ret; -} - -/* * migrate_prep() needs to be called before we start compiling a list of pages * to be migrated using isolate_lru_page(). 
*/ @@ -84,16 +54,7 @@ int migrate_prep(void) static inline void move_to_lru(struct page *page) { - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } + lru_cache_add_lru(page, page_lru(page)); put_page(page); } @@ -374,8 +335,11 @@ static void migrate_page_copy(struct pag SetPageReferenced(newpage); if (PageUptodate(page)) SetPageUptodate(newpage); - if (PageActive(page)) + if (TestClearPageActive(page)) { + VM_BUG_ON(PageNoreclaim(page)); SetPageActive(newpage); + } else if (TestClearPageNoreclaim(page)) + SetPageNoreclaim(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -389,7 +353,6 @@ static void migrate_page_copy(struct pag #ifdef CONFIG_SWAP ClearPageSwapCache(page); #endif - ClearPageActive(page); ClearPagePrivate(page); set_page_private(page, 0); page->mapping = NULL; @@ -585,6 +548,8 @@ static int move_to_new_page(struct page /* Prepare mapping for the new page.*/ newpage->index = page->index; newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); mapping = page_mapping(page); if (!mapping) @@ -879,7 +844,9 @@ static int do_move_pages(struct mm_struc !migrate_all) goto put_and_set; - err = isolate_lru_page(page, &pagelist); + err = isolate_lru_page(page); + if (!err) + list_add_tail(&page->lru, &pagelist); put_and_set: /* * Either remove the duplicate refcount from diff -up linux-2.6.25.noarch/mm/Kconfig.splitlru linux-2.6.25.noarch/mm/Kconfig --- linux-2.6.25.noarch/mm/Kconfig.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/Kconfig 2008-04-30 19:10:58.000000000 -0400 @@ -193,3 +193,13 @@ config NR_QUICK config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS + +config NORECLAIM_LRU + bool "Add LRU list to track non-reclaimable pages (EXPERIMENTAL, 64BIT only)" + depends on EXPERIMENTAL && 64BIT + help + 
Supports tracking of non-reclaimable pages off the [in]active lists + to avoid excessive reclaim overhead on large memory systems. Pages + may be non-reclaimable because: they are locked into memory, they + are anonymous pages for which no swap space exists, or they are anon + pages that are expensive to unmap [long anon_vma "related vma" list.] diff -up linux-2.6.25.noarch/mm/page_alloc.c.splitlru linux-2.6.25.noarch/mm/page_alloc.c --- linux-2.6.25.noarch/mm/page_alloc.c.splitlru 2008-04-28 17:17:10.000000000 -0400 +++ linux-2.6.25.noarch/mm/page_alloc.c 2008-04-30 19:10:58.000000000 -0400 @@ -240,11 +240,15 @@ static void bad_page(struct page *page) 1 << PG_private | 1 << PG_locked | 1 << PG_active | +#ifdef CONFIG_NORECLAIM_LRU + 1 << PG_noreclaim | +#endif 1 << PG_dirty | 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | + 1 << PG_swapbacked | 1 << PG_buddy ); set_page_count(page, 0); reset_page_mapcount(page); @@ -471,10 +475,15 @@ static inline int free_pages_check(struc 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved | +#ifdef CONFIG_NORECLAIM_LRU + 1 << PG_noreclaim | +#endif 1 << PG_buddy )))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); + if (PageSwapBacked(page)) + __ClearPageSwapBacked(page); /* * For now, we report if PG_reserved was found set, but do not * clear it, and do not free the page. 
But we shall soon need @@ -617,11 +626,15 @@ static int prep_new_page(struct page *pa 1 << PG_private | 1 << PG_locked | 1 << PG_active | +#ifdef CONFIG_NORECLAIM_LRU + 1 << PG_noreclaim | +#endif 1 << PG_dirty | 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved | + 1 << PG_swapbacked | 1 << PG_buddy )))) bad_page(page); @@ -1815,10 +1828,21 @@ void show_free_areas(void) } } - printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" + printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" + " inactive_file:%lu" +//TODO: check/adjust line lengths +#ifdef CONFIG_NORECLAIM_LRU + " noreclaim:%lu" +#endif + " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", - global_page_state(NR_ACTIVE), - global_page_state(NR_INACTIVE), + global_page_state(NR_ACTIVE_ANON), + global_page_state(NR_ACTIVE_FILE), + global_page_state(NR_INACTIVE_ANON), + global_page_state(NR_INACTIVE_FILE), +#ifdef CONFIG_NORECLAIM_LRU + global_page_state(NR_NORECLAIM), +#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -1841,8 +1865,13 @@ void show_free_areas(void) " min:%lukB" " low:%lukB" " high:%lukB" - " active:%lukB" - " inactive:%lukB" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" +#ifdef CONFIG_NORECLAIM_LRU + " noreclaim:%lukB" +#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? 
%s" @@ -1852,8 +1881,13 @@ void show_free_areas(void) K(zone->pages_min), K(zone->pages_low), K(zone->pages_high), - K(zone_page_state(zone, NR_ACTIVE)), - K(zone_page_state(zone, NR_INACTIVE)), + K(zone_page_state(zone, NR_ACTIVE_ANON)), + K(zone_page_state(zone, NR_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ACTIVE_FILE)), + K(zone_page_state(zone, NR_INACTIVE_FILE)), +#ifdef CONFIG_NORECLAIM_LRU + K(zone_page_state(zone, NR_NORECLAIM)), +#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") @@ -3337,6 +3371,7 @@ static void __paginginit free_area_init_ for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; + enum lru_list l; size = zone_spanned_pages_in_node(nid, j, zones_size); realsize = size - zone_absent_pages_in_node(nid, j, @@ -3386,10 +3421,14 @@ static void __paginginit free_area_init_ zone->prev_priority = DEF_PRIORITY; zone_pcp_init(zone); - INIT_LIST_HEAD(&zone->active_list); - INIT_LIST_HEAD(&zone->inactive_list); - zone->nr_scan_active = 0; - zone->nr_scan_inactive = 0; + for_each_lru(l) { + INIT_LIST_HEAD(&zone->list[l]); + zone->nr_scan[l] = 0; + } + zone->recent_rotated_anon = 0; + zone->recent_rotated_file = 0; + zone->recent_scanned_anon = 0; + zone->recent_scanned_file = 0; zap_zone_vm_stats(zone); zone->flags = 0; if (!size) @@ -4137,6 +4176,45 @@ void setup_per_zone_pages_min(void) calculate_totalreserve_pages(); } +/** + * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. + * + * The inactive anon list should be small enough that the VM never has to + * do too much work, but large enough that each inactive page has a chance + * to be referenced again before it is swapped out. + * + * The inactive_anon ratio is the ratio of active to inactive anonymous + * pages. Ie. a ratio of 3 means 3:1 or 25% of the anonymous pages are + * on the inactive list. 
+ * + * total return max + * memory value inactive anon + * ------------------------------------- + * 10MB 1 5MB + * 100MB 1 50MB + * 1GB 3 250MB + * 10GB 10 0.9GB + * 100GB 31 3GB + * 1TB 101 10GB + * 10TB 320 32GB + */ +void setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) { + unsigned int gb, ratio; + + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + ratio = int_sqrt(10 * gb); + if (!ratio) + ratio = 1; + + zone->inactive_ratio = ratio; + } +} + /* * Initialise min_free_kbytes. * @@ -4174,6 +4252,7 @@ static int __init init_per_zone_pages_mi min_free_kbytes = 65536; setup_per_zone_pages_min(); setup_per_zone_lowmem_reserve(); + setup_per_zone_inactive_ratio(); return 0; } module_init(init_per_zone_pages_min) diff -up linux-2.6.25.noarch/mm/vmstat.c.splitlru linux-2.6.25.noarch/mm/vmstat.c --- linux-2.6.25.noarch/mm/vmstat.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/vmstat.c 2008-04-30 19:10:58.000000000 -0400 @@ -594,8 +594,13 @@ const struct seq_operations pagetypeinfo static const char * const vmstat_text[] = { /* Zoned VM counters */ "nr_free_pages", - "nr_inactive", - "nr_active", + "nr_inactive_anon", + "nr_active_anon", + "nr_inactive_file", + "nr_active_file", +#ifdef CONFIG_NORECLAIM_LRU + "nr_noreclaim", +#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -607,6 +612,7 @@ static const char * const vmstat_text[] "nr_unstable", "nr_bounce", "nr_vmscan_write", + "nr_writeback_temp", #ifdef CONFIG_NUMA "numa_hit", @@ -658,7 +664,7 @@ static void zoneinfo_show_print(struct s "\n min %lu" "\n low %lu" "\n high %lu" - "\n scanned %lu (a: %lu i: %lu)" + "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), @@ -666,7 +672,10 @@ static void zoneinfo_show_print(struct s zone->pages_low, zone->pages_high, zone->pages_scanned, - zone->nr_scan_active, zone->nr_scan_inactive, + 
zone->nr_scan[LRU_ACTIVE_ANON], + zone->nr_scan[LRU_INACTIVE_ANON], + zone->nr_scan[LRU_ACTIVE_FILE], + zone->nr_scan[LRU_INACTIVE_FILE], zone->spanned_pages, zone->present_pages); @@ -703,10 +712,12 @@ static void zoneinfo_show_print(struct s seq_printf(m, "\n all_unreclaimable: %u" "\n prev_priority: %i" - "\n start_pfn: %lu", + "\n start_pfn: %lu" + "\n inactive_ratio: %u", zone_is_all_unreclaimable(zone), zone->prev_priority, - zone->zone_start_pfn); + zone->zone_start_pfn, + zone->inactive_ratio); seq_putc(m, '\n'); } diff -up linux-2.6.25.noarch/mm/filemap.c.splitlru linux-2.6.25.noarch/mm/filemap.c --- linux-2.6.25.noarch/mm/filemap.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/filemap.c 2008-04-30 19:10:58.000000000 -0400 @@ -33,6 +33,7 @@ #include #include /* for BUG_ON(!in_atomic()) only */ #include +#include /* for page_file_cache() */ #include "internal.h" /* @@ -491,8 +492,12 @@ int add_to_page_cache_lru(struct page *p pgoff_t offset, gfp_t gfp_mask) { int ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) - lru_cache_add(page); + if (ret == 0) { + if (page_file_cache(page)) + lru_cache_add_file(page); + else + lru_cache_add_active_anon(page); + } return ret; } diff -up linux-2.6.25.noarch/mm/swap_state.c.splitlru linux-2.6.25.noarch/mm/swap_state.c --- linux-2.6.25.noarch/mm/swap_state.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/swap_state.c 2008-04-30 19:10:58.000000000 -0400 @@ -82,6 +82,7 @@ int add_to_swap_cache(struct page *page, if (!error) { page_cache_get(page); SetPageSwapCache(page); + SetPageSwapBacked(page); set_page_private(page, entry.val); total_swapcache_pages++; __inc_zone_page_state(page, NR_FILE_PAGES); @@ -300,7 +301,7 @@ struct page *read_swap_cache_async(swp_e /* * Initiate read into locked page and return. 
*/ - lru_cache_add_active(new_page); + lru_cache_add_anon(new_page); swap_readpage(NULL, new_page); return new_page; } diff -up linux-2.6.25.noarch/mm/vmscan.c.splitlru linux-2.6.25.noarch/mm/vmscan.c --- linux-2.6.25.noarch/mm/vmscan.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/vmscan.c 2008-05-07 20:32:31.000000000 -0400 @@ -77,7 +77,7 @@ struct scan_control { unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active); + int active, int file); }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -229,27 +229,6 @@ unsigned long shrink_slab(unsigned long return ret; } -/* Called without lock on whether page is mapped, so answer is unstable */ -static inline int page_mapping_inuse(struct page *page) -{ - struct address_space *mapping; - - /* Page is in somebody's page tables. */ - if (page_mapped(page)) - return 1; - - /* Be more reluctant to reclaim swapcache than pagecache */ - if (PageSwapCache(page)) - return 1; - - mapping = page_mapping(page); - if (!mapping) - return 0; - - /* File is mmap'd by somebody? */ - return mapping_mapped(mapping); -} - static inline int is_page_cache_freeable(struct page *page) { return page_count(page) - !!PagePrivate(page) == 2; @@ -484,6 +463,11 @@ static unsigned long shrink_page_list(st sc->nr_scanned++; + if (!page_reclaimable(page, NULL)) { + SetPageNoreclaim(page); + goto keep_locked; + } + if (!sc->may_swap && page_mapped(page)) goto keep_locked; @@ -511,8 +495,7 @@ static unsigned long shrink_page_list(st referenced = page_referenced(page, 1, sc->mem_cgroup); /* In active use or really unfreeable? Activate it. 
*/ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && - referenced && page_mapping_inuse(page)) + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) goto activate_locked; #ifdef CONFIG_SWAP @@ -543,8 +526,6 @@ static unsigned long shrink_page_list(st } if (PageDirty(page)) { - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) - goto keep_locked; if (!may_enter_fs) goto keep_locked; if (!sc->may_writepage) @@ -583,7 +564,7 @@ static unsigned long shrink_page_list(st * possible for a page to have PageDirty set, but it is actually * clean (all its buffers are clean). This happens if the * buffers were written out directly, with submit_bh(). ext3 - * will do this, as well as the blockdev mapping. + * will do this, as well as the blockdev mapping. * try_to_release_page() will discover that cleanness and will * drop the buffers and mark the page clean - it can be freed. * @@ -612,6 +593,10 @@ free_it: continue; activate_locked: + /* Not a candidate for swapping, so reclaim swap space. */ + if (PageSwapCache(page) && vm_swap_full()) + remove_exclusive_swap_page(page); + VM_BUG_ON(PageActive(page)); SetPageActive(page); pgactivate++; keep_locked: @@ -642,7 +627,7 @@ keep: * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, int mode) +int __isolate_lru_page(struct page *page, int mode, int file) { int ret = -EINVAL; @@ -658,6 +643,17 @@ int __isolate_lru_page(struct page *page if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) return ret; + if (mode != ISOLATE_BOTH && (!page_file_cache(page) != !file)) + return ret; + + /* + * Non-reclaimable pages shouldn't make it onto either the active + * nor the inactive list. However, when doing lumpy reclaim of + * higher order pages we can still run into them. + */ + if (PageNoreclaim(page)) + return ret; + ret = -EBUSY; if (likely(get_page_unless_zero(page))) { /* @@ -688,12 +684,13 @@ int __isolate_lru_page(struct page *page * @scanned: The number of pages that were scanned. 
* @order: The caller's attempted allocation order * @mode: One of the LRU isolation modes + * @file: True [1] if isolating file [!anon] pages * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, int mode) + unsigned long *scanned, int order, int mode, int file) { unsigned long nr_taken = 0; unsigned long scan; @@ -710,7 +707,7 @@ static unsigned long isolate_lru_pages(u VM_BUG_ON(!PageLRU(page)); - switch (__isolate_lru_page(page, mode)) { + switch (__isolate_lru_page(page, mode, file)) { case 0: list_move(&page->lru, dst); nr_taken++; @@ -753,10 +750,11 @@ static unsigned long isolate_lru_pages(u break; cursor_page = pfn_to_page(pfn); + /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; - switch (__isolate_lru_page(cursor_page, mode)) { + switch (__isolate_lru_page(cursor_page, mode, file)) { case 0: list_move(&cursor_page->lru, dst); nr_taken++; @@ -767,7 +765,7 @@ static unsigned long isolate_lru_pages(u /* else it is being freed elsewhere */ list_move(&cursor_page->lru, src); default: - break; + break; /* ! on LRU or wrong list */ } } } @@ -781,40 +779,100 @@ static unsigned long isolate_pages_globa unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active) + int active, int file) { + int lru = LRU_BASE; if (active) - return isolate_lru_pages(nr, &z->active_list, dst, - scanned, order, mode); - else - return isolate_lru_pages(nr, &z->inactive_list, dst, - scanned, order, mode); + lru += LRU_ACTIVE; + if (file) + lru += LRU_FILE; + return isolate_lru_pages(nr, &z->list[lru], dst, scanned, order, + mode, !!file); } /* * clear_active_flags() is a helper for shrink_active_list(), clearing * any active bits from the pages in the list. 
*/ -static unsigned long clear_active_flags(struct list_head *page_list) +static unsigned long clear_active_flags(struct list_head *page_list, + unsigned int *count) { int nr_active = 0; + int lru; struct page *page; - list_for_each_entry(page, page_list, lru) + list_for_each_entry(page, page_list, lru) { + lru = page_file_cache(page); if (PageActive(page)) { + lru += LRU_ACTIVE; ClearPageActive(page); nr_active++; } + count[lru]++; + } return nr_active; } +/** + * isolate_lru_page - tries to isolate a page from its LRU list + * @page: page to isolate from its LRU list + * + * Isolates a @page from an LRU list, clears PageLRU and adjusts the + * vmstat statistic corresponding to whatever LRU list the page was on. + * + * Returns 0 if the page was removed from an LRU list. + * Returns -EBUSY if the page was not on an LRU list. + * + * The returned page will have PageLRU() cleared. If it was found on + * the active list, it will have PageActive set. If it was found on + * the noreclaim list, it will have the PageNoreclaim bit set. That flag + * may need to be cleared by the caller before letting the page go. + * + * The vmstat statistic corresponding to the list on which the page was + * found will be decremented. + * + * Restrictions: + * (1) Must be called with an elevated refcount on the page. This is a + * fundamental difference from isolate_lru_pages (which is called + * without a stable reference). + * (2) the lru_lock must not be held. + * (3) interrupts must be enabled. + */ +int isolate_lru_page(struct page *page) +{ + int ret = -EBUSY; + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page) && get_page_unless_zero(page)) { + int lru = LRU_BASE; + ret = 0; + ClearPageLRU(page); + + /* Calculate the LRU list for normal pages ... */ + lru += page_file_cache(page) + !!PageActive(page); + + /* ... except NoReclaim, which has its own list. 
*/ + if (PageNoreclaim(page)) + lru = LRU_NORECLAIM; + + del_page_from_lru_list(zone, page, lru); + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} + /* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ static unsigned long shrink_inactive_list(unsigned long max_scan, - struct zone *zone, struct scan_control *sc) + struct zone *zone, struct scan_control *sc, + int priority, int file) { LIST_HEAD(page_list); struct pagevec pvec; @@ -831,20 +889,43 @@ static unsigned long shrink_inactive_lis unsigned long nr_scan; unsigned long nr_freed; unsigned long nr_active; + unsigned int count[NR_LRU_LISTS] = { 0, }; + int mode = ISOLATE_INACTIVE; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + * + * We use the same threshold as pageout congestion_wait below. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + mode = ISOLATE_BOTH; + else if (sc->order && priority < DEF_PRIORITY - 2) + mode = ISOLATE_BOTH; nr_taken = sc->isolate_pages(sc->swap_cluster_max, - &page_list, &nr_scan, sc->order, - (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 
- ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, sc->mem_cgroup, 0); - nr_active = clear_active_flags(&page_list); + &page_list, &nr_scan, sc->order, mode, + zone, sc->mem_cgroup, 0, file); + nr_active = clear_active_flags(&page_list, count); __count_vm_events(PGDEACTIVATE, nr_active); - __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); - __mod_zone_page_state(zone, NR_INACTIVE, - -(nr_taken - nr_active)); - if (scan_global_lru(sc)) + __mod_zone_page_state(zone, NR_ACTIVE_FILE, + -count[LRU_ACTIVE_FILE]); + __mod_zone_page_state(zone, NR_INACTIVE_FILE, + -count[LRU_INACTIVE_FILE]); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, + -count[LRU_ACTIVE_ANON]); + __mod_zone_page_state(zone, NR_INACTIVE_ANON, + -count[LRU_INACTIVE_ANON]); + + if (scan_global_lru(sc)) { zone->pages_scanned += nr_scan; + zone->recent_scanned_anon += count[LRU_ACTIVE_ANON] + + count[LRU_INACTIVE_ANON]; + zone->recent_scanned_file += count[LRU_ACTIVE_FILE] + + count[LRU_INACTIVE_FILE]; + } spin_unlock_irq(&zone->lru_lock); nr_scanned += nr_scan; @@ -864,7 +945,7 @@ static unsigned long shrink_inactive_lis * The attempt at page out may have made some * of the pages active, mark them inactive again. */ - nr_active = clear_active_flags(&page_list); + nr_active = clear_active_flags(&page_list, count); count_vm_events(PGDEACTIVATE, nr_active); nr_freed += shrink_page_list(&page_list, sc, @@ -889,14 +970,27 @@ static unsigned long shrink_inactive_lis * Put back any unfreeable pages. 
*/ while (!list_empty(&page_list)) { + int lru = LRU_BASE; page = lru_to_page(&page_list); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); list_del(&page->lru); - if (PageActive(page)) - add_page_to_active_list(zone, page); - else - add_page_to_inactive_list(zone, page); + if (PageNoreclaim(page)) { + VM_BUG_ON(PageActive(page)); + lru = LRU_NORECLAIM; + } else { + if (page_file_cache(page)) + lru += LRU_FILE; + if (scan_global_lru(sc)) { + if (page_file_cache(page)) + zone->recent_rotated_file++; + else + zone->recent_rotated_anon++; + } + if (PageActive(page)) + lru += LRU_ACTIVE; + } + add_page_to_lru_list(zone, page, lru); if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); @@ -927,115 +1021,7 @@ static inline void note_zone_scanning_pr static inline int zone_is_near_oom(struct zone *zone) { - return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE))*3; -} - -/* - * Determine we should try to reclaim mapped pages. - * This is called only when sc->mem_cgroup is NULL. - */ -static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, - int priority) -{ - long mapped_ratio; - long distress; - long swap_tendency; - long imbalance; - int reclaim_mapped = 0; - int prev_priority; - - if (scan_global_lru(sc) && zone_is_near_oom(zone)) - return 1; - /* - * `distress' is a measure of how much trouble we're having - * reclaiming pages. 0 -> no problems. 100 -> great trouble. - */ - if (scan_global_lru(sc)) - prev_priority = zone->prev_priority; - else - prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup); - - distress = 100 >> min(prev_priority, priority); - - /* - * The point of this algorithm is to decide when to start - * reclaiming mapped memory instead of just pagecache. Work out - * how much memory - * is mapped. 
- */ - if (scan_global_lru(sc)) - mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + - global_page_state(NR_ANON_PAGES)) * 100) / - vm_total_pages; - else - mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup); - - /* - * Now decide how much we really want to unmap some pages. The - * mapped ratio is downgraded - just because there's a lot of - * mapped memory doesn't necessarily mean that page reclaim - * isn't succeeding. - * - * The distress ratio is important - we don't want to start - * going oom. - * - * A 100% value of vm_swappiness overrides this algorithm - * altogether. - */ - swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; - - /* - * If there's huge imbalance between active and inactive - * (think active 100 times larger than inactive) we should - * become more permissive, or the system will take too much - * cpu before it start swapping during memory pressure. - * Distress is about avoiding early-oom, this is about - * making swappiness graceful despite setting it to low - * values. - * - * Avoid div by zero with nr_inactive+1, and max resulting - * value is vm_total_pages. - */ - if (scan_global_lru(sc)) { - imbalance = zone_page_state(zone, NR_ACTIVE); - imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; - } else - imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup); - - /* - * Reduce the effect of imbalance if swappiness is low, - * this means for a swappiness very low, the imbalance - * must be much higher than 100 for this logic to make - * the difference. - * - * Max temporary value is vm_total_pages*100. - */ - imbalance *= (vm_swappiness + 1); - imbalance /= 100; - - /* - * If not much of the ram is mapped, makes the imbalance - * less relevant, it's high priority we refill the inactive - * list with mapped pages only in presence of high ratio of - * mapped pages. - * - * Max temporary value is vm_total_pages*100. 
- */ - imbalance *= mapped_ratio; - imbalance /= 100; - - /* apply imbalance feedback to swap_tendency */ - swap_tendency += imbalance; - - /* - * Now use this metric to decide whether to start moving mapped - * memory onto the inactive list. - */ - if (swap_tendency >= 100) - reclaim_mapped = 1; - - return reclaim_mapped; + return zone->pages_scanned >= (zone_lru_pages(zone) * 3); } /* @@ -1058,53 +1044,78 @@ static int calc_reclaim_mapped(struct sc static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc, int priority) + struct scan_control *sc, int priority, int file) { - unsigned long pgmoved; + unsigned long pgmoved = 0; int pgdeactivate = 0; unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ - LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ - LIST_HEAD(l_active); /* Pages to go onto the active_list */ + LIST_HEAD(l_inactive); + LIST_HEAD(l_active); + LIST_HEAD(l_noreclaim); struct page *page; struct pagevec pvec; - int reclaim_mapped = 0; - - if (sc->may_swap) - reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); + enum lru_list lru; lru_add_drain(); spin_lock_irq(&zone->lru_lock); pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE, zone, - sc->mem_cgroup, 1); + sc->mem_cgroup, 1, file); /* * zone->pages_scanned is used for detect zone's oom * mem_cgroup remembers nr_scan by itself. 
*/ - if (scan_global_lru(sc)) + if (scan_global_lru(sc)) { zone->pages_scanned += pgscanned; + if (file) + zone->recent_scanned_file += pgscanned; + else + zone->recent_scanned_anon += pgscanned; + } - __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); + if (file) + __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); + else + __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); spin_unlock_irq(&zone->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); list_del(&page->lru); - if (page_mapped(page)) { - if (!reclaim_mapped || - (total_swap_pages == 0 && PageAnon(page)) || - page_referenced(page, 0, sc->mem_cgroup)) { - list_add(&page->lru, &l_active); - continue; - } + + if (!page_reclaimable(page, NULL)) { + /* Non-reclaimable pages go onto their own list. */ + list_add(&page->lru, &l_noreclaim); + continue; + } + + if (page_referenced(page, 0, sc->mem_cgroup) && file) { + /* Referenced file pages stay active. */ + list_add(&page->lru, &l_active); + } else { + list_add(&page->lru, &l_inactive); + if (!file) + /* Anonymous pages always get deactivated. */ + pgmoved++; } - list_add(&page->lru, &l_inactive); } + /* + * Count the referenced anon pages as rotated, to balance pageout + * scan pressure between file and anonymous pages in get_scan_ratio. + */ + if (!file) + zone->recent_rotated_anon += pgmoved; + + /* + * Now put the pages back on the appropriate [file or anon] inactive + * and active lists. 
+ */ pagevec_init(&pvec, 1); pgmoved = 0; + lru = LRU_BASE + file * LRU_FILE; spin_lock_irq(&zone->lru_lock); while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); @@ -1114,11 +1125,12 @@ static void shrink_active_list(unsigned VM_BUG_ON(!PageActive(page)); ClearPageActive(page); - list_move(&page->lru, &zone->inactive_list); + list_move(&page->lru, &zone->list[lru]); mem_cgroup_move_lists(page, false); pgmoved++; if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_INACTIVE_ANON + lru, + pgmoved); spin_unlock_irq(&zone->lru_lock); pgdeactivate += pgmoved; pgmoved = 0; @@ -1128,7 +1140,7 @@ static void shrink_active_list(unsigned spin_lock_irq(&zone->lru_lock); } } - __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_INACTIVE_ANON + lru, pgmoved); pgdeactivate += pgmoved; if (buffer_heads_over_limit) { spin_unlock_irq(&zone->lru_lock); @@ -1137,6 +1149,7 @@ static void shrink_active_list(unsigned } pgmoved = 0; + lru = LRU_ACTIVE + file * LRU_FILE; while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); @@ -1144,88 +1157,220 @@ static void shrink_active_list(unsigned SetPageLRU(page); VM_BUG_ON(!PageActive(page)); - list_move(&page->lru, &zone->active_list); + list_move(&page->lru, &zone->list[lru]); mem_cgroup_move_lists(page, true); pgmoved++; if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_INACTIVE_ANON + lru, + pgmoved); + pgmoved = 0; + spin_unlock_irq(&zone->lru_lock); + if (vm_swap_full()) + pagevec_swap_free(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_INACTIVE_ANON + lru, pgmoved); + if (file) { + zone->recent_rotated_file += pgmoved; + } else { + zone->recent_rotated_anon += pgmoved; + } + +#ifdef CONFIG_NORECLAIM_LRU + pgmoved = 0; + while 
(!list_empty(&l_noreclaim)) { + page = lru_to_page(&l_noreclaim); + prefetchw_prev_lru_page(page, &l_noreclaim, flags); + + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + VM_BUG_ON(!PageActive(page)); + ClearPageActive(page); + VM_BUG_ON(PageNoreclaim(page)); + SetPageNoreclaim(page); + + list_move(&page->lru, &zone->list[LRU_NORECLAIM]); + pgmoved++; + if (!pagevec_add(&pvec, page)) { + __mod_zone_page_state(zone, NR_NORECLAIM, pgmoved); pgmoved = 0; spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); spin_lock_irq(&zone->lru_lock); } } - __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_NORECLAIM, pgmoved); +#endif __count_zone_vm_events(PGREFILL, zone, pgscanned); __count_vm_events(PGDEACTIVATE, pgdeactivate); spin_unlock_irq(&zone->lru_lock); + if (vm_swap_full()) + pagevec_swap_free(&pvec); pagevec_release(&pvec); } +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct zone *zone, struct scan_control *sc, int priority) +{ + int file = is_file_lru(lru); + + if (lru == LRU_ACTIVE_FILE) { + shrink_active_list(nr_to_scan, zone, sc, priority, file); + return 0; + } + if (lru == LRU_ACTIVE_ANON && inactive_anon_low(zone)) { + shrink_active_list(nr_to_scan, zone, sc, priority, file); + return 0; + } + return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); +} + +/* + * The utility of the anon and file memory corresponds to the fraction + * of pages that were recently referenced in each category. Pageout + * pressure is distributed according to the size of each set, the fraction + * of recently referenced pages (except used-once file pages) and the + * swappiness parameter. + * + * We return the relative pressures as percentages so shrink_zone can + * easily use them. 
+ */ +static void get_scan_ratio(struct zone *zone, struct scan_control * sc, + unsigned long *percent) +{ + unsigned long anon, file, free; + unsigned long anon_prio, file_prio; + unsigned long ap, fp; + + anon = zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + file = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + /* Keep a floating average of RECENT references. */ + if (unlikely(zone->recent_scanned_anon > anon / zone->inactive_ratio)) { + spin_lock_irq(&zone->lru_lock); + zone->recent_scanned_anon /= 2; + zone->recent_rotated_anon /= 2; + spin_unlock_irq(&zone->lru_lock); + } + + if (unlikely(zone->recent_scanned_file > file / 4)) { + spin_lock_irq(&zone->lru_lock); + zone->recent_scanned_file /= 2; + zone->recent_rotated_file /= 2; + spin_unlock_irq(&zone->lru_lock); + } + + /* + * With swappiness at 100, anonymous and file have the same priority. + * This scanning priority is essentially the inverse of IO cost. + */ + anon_prio = sc->swappiness; + file_prio = 200 - sc->swappiness; + + /* + * anon recent_rotated_anon + * %anon = 100 * ----------- / ------------------- * IO cost + * anon + file recent_scanned_anon + */ + ap = (anon_prio + 1) * (zone->recent_scanned_anon + 1); + ap /= zone->recent_rotated_anon + 1; + + fp = (file_prio + 1) * (zone->recent_scanned_file + 1); + fp /= zone->recent_rotated_file + 1; + + /* Normalize to percentages */ + percent[0] = 100 * ap / (ap + fp + 1); + percent[1] = 100 - percent[0]; + + free = zone_page_state(zone, NR_FREE_PAGES); + + /* + * If we have no swap space, do not bother scanning anon pages. + */ + if (nr_swap_pages <= 0) { + percent[0] = 0; + percent[1] = 100; + } + /* + * If we already freed most file pages, scan the anon pages + * regardless of the page access ratios or swappiness setting. + */ + else if (file + free <= zone->pages_high) + percent[0] = 100; +} + + /* * This is a basic per-zone page freer. 
Used by both kswapd and direct reclaim. */ static unsigned long shrink_zone(int priority, struct zone *zone, struct scan_control *sc) { - unsigned long nr_active; - unsigned long nr_inactive; + unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; unsigned long nr_reclaimed = 0; + unsigned long percent[2]; /* anon @ 0; file @ 1 */ + enum lru_list l; - if (scan_global_lru(sc)) { - /* - * Add one to nr_to_scan just to make sure that the kernel - * will slowly sift through the active list. - */ - zone->nr_scan_active += - (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; - nr_active = zone->nr_scan_active; - zone->nr_scan_inactive += - (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; - nr_inactive = zone->nr_scan_inactive; - if (nr_inactive >= sc->swap_cluster_max) - zone->nr_scan_inactive = 0; - else - nr_inactive = 0; - - if (nr_active >= sc->swap_cluster_max) - zone->nr_scan_active = 0; - else - nr_active = 0; - } else { - /* - * This reclaim occurs not because zone memory shortage but - * because memory controller hits its limit. - * Then, don't modify zone reclaim related data. - */ - nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup, - zone, priority); + get_scan_ratio(zone, sc, percent); - nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup, - zone, priority); + for_each_reclaimable_lru(l) { + if (scan_global_lru(sc)) { + int file = is_file_lru(l); + int scan; + /* + * Add one to nr_to_scan just to make sure that the + * kernel will slowly sift through each list. + */ + scan = zone_page_state(zone, NR_INACTIVE_ANON + l); + scan >>= priority; + scan = (scan * percent[file]) / 100; + + zone->nr_scan[l] += scan + 1; + nr[l] = zone->nr_scan[l]; + if (nr[l] >= sc->swap_cluster_max) + zone->nr_scan[l] = 0; + else + nr[l] = 0; + } else { + /* + * This reclaim occurs not because zone memory shortage + * but because memory controller hits its limit. + * Then, don't modify zone reclaim related data. 
+ */ + nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, + priority, l); + } } - - while (nr_active || nr_inactive) { - if (nr_active) { - nr_to_scan = min(nr_active, + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + for_each_reclaimable_lru(l) { + if (nr[l]) { + nr_to_scan = min(nr[l], (unsigned long)sc->swap_cluster_max); - nr_active -= nr_to_scan; - shrink_active_list(nr_to_scan, zone, sc, priority); - } + nr[l] -= nr_to_scan; - if (nr_inactive) { - nr_to_scan = min(nr_inactive, - (unsigned long)sc->swap_cluster_max); - nr_inactive -= nr_to_scan; - nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, - sc); + nr_reclaimed += shrink_list(l, nr_to_scan, + zone, sc, priority); + } } } + /* + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. + */ + if (scan_global_lru(sc) && inactive_anon_low(zone)) + shrink_list(LRU_ACTIVE_ANON, SWAP_CLUSTER_MAX, zone, sc, + priority); + throttle_vm_writeout(sc->gfp_mask); return nr_reclaimed; } @@ -1287,7 +1432,7 @@ static unsigned long shrink_zones(int pr return nr_reclaimed; } - + /* * This is the main entry point to direct page reclaim. * @@ -1324,8 +1469,7 @@ static unsigned long do_try_to_free_page if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); + lru_pages += zone_lru_pages(zone); } } @@ -1515,6 +1659,14 @@ loop_again: priority != DEF_PRIORITY) continue; + /* + * Do some background aging of the anon list, to give + * pages a chance to be referenced before reclaiming. 
+ */ + if (inactive_anon_low(zone)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, + &sc, priority, 0); + if (!zone_watermark_ok(zone, order, zone->pages_high, 0, 0)) { end_zone = i; @@ -1527,8 +1679,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); + lru_pages += zone_lru_pages(zone); } /* @@ -1572,8 +1723,7 @@ loop_again: if (zone_is_all_unreclaimable(zone)) continue; if (nr_slab == 0 && zone->pages_scanned >= - (zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE)) * 6) + (zone_lru_pages(zone) * 6)) zone_set_flag(zone, ZONE_ALL_UNRECLAIMABLE); /* @@ -1627,7 +1777,7 @@ out: /* * The background pageout daemon, started as a kernel thread - * from the init process. + * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity @@ -1722,6 +1872,14 @@ void wakeup_kswapd(struct zone *zone, in wake_up_interruptible(&pgdat->kswapd_wait); } +unsigned long global_lru_pages(void) +{ + return global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_ANON) + + global_page_state(NR_INACTIVE_FILE); +} + #ifdef CONFIG_PM /* * Helper function for shrink_all_memory(). 
Tries to reclaim 'nr_pages' pages @@ -1735,6 +1893,7 @@ static unsigned long shrink_all_zones(un { struct zone *zone; unsigned long nr_to_scan, ret = 0; + enum lru_list l; for_each_zone(zone) { @@ -1744,38 +1903,31 @@ static unsigned long shrink_all_zones(un if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; - /* For pass = 0 we don't shrink the active list */ - if (pass > 0) { - zone->nr_scan_active += - (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; - if (zone->nr_scan_active >= nr_pages || pass > 3) { - zone->nr_scan_active = 0; + for_each_reclaimable_lru(l) { + /* For pass = 0, we don't shrink the active list */ + if (pass == 0 && + (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE)) + continue; + + zone->nr_scan[l] += + (zone_page_state(zone, NR_INACTIVE_ANON + l) + >> prio) + 1; + if (zone->nr_scan[l] >= nr_pages || pass > 3) { + zone->nr_scan[l] = 0; nr_to_scan = min(nr_pages, - zone_page_state(zone, NR_ACTIVE)); - shrink_active_list(nr_to_scan, zone, sc, prio); + zone_page_state(zone, + NR_INACTIVE_ANON + l)); + ret += shrink_list(l, nr_to_scan, zone, + sc, prio); + if (ret >= nr_pages) + return ret; } } - - zone->nr_scan_inactive += - (zone_page_state(zone, NR_INACTIVE) >> prio) + 1; - if (zone->nr_scan_inactive >= nr_pages || pass > 3) { - zone->nr_scan_inactive = 0; - nr_to_scan = min(nr_pages, - zone_page_state(zone, NR_INACTIVE)); - ret += shrink_inactive_list(nr_to_scan, zone, sc); - if (ret >= nr_pages) - return ret; - } } return ret; } -static unsigned long count_lru_pages(void) -{ - return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE); -} - /* * Try to free `nr_pages' of memory, system-wide, and return the number of * freed pages. 
@@ -1801,7 +1953,7 @@ unsigned long shrink_all_memory(unsigned current->reclaim_state = &reclaim_state; - lru_pages = count_lru_pages(); + lru_pages = global_lru_pages(); nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { @@ -1844,7 +1996,7 @@ unsigned long shrink_all_memory(unsigned reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, - count_lru_pages()); + global_lru_pages()); ret += reclaim_state.reclaimed_slab; if (ret >= nr_pages) goto out; @@ -1861,7 +2013,7 @@ unsigned long shrink_all_memory(unsigned if (!ret) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); + shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); ret += reclaim_state.reclaimed_slab; } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); } @@ -2090,3 +2242,116 @@ int zone_reclaim(struct zone *zone, gfp_ return ret; } #endif + +#ifdef CONFIG_NORECLAIM_LRU +/* + * page_reclaimable - test whether a page is reclaimable + * @page: the page to test + * @vma: the VMA in which the page is or will be mapped, may be NULL + * + * Test whether page is reclaimable--i.e., should be placed on active/inactive + * lists vs noreclaim list. + * + * Reasons page might not be reclaimable: + * (1) page's mapping marked non-reclaimable + * + * TODO - later patches + */ +int page_reclaimable(struct page *page, struct vm_area_struct *vma) +{ + + VM_BUG_ON(PageNoreclaim(page)); + + if (mapping_non_reclaimable(page_mapping(page))) + return 0; + + /* TODO: test page [!]reclaimable conditions */ + + return 1; +} + +/** + * check_move_noreclaim_page - check page for reclaimability and move to appropriate zone lru list + * @page: page to check reclaimability and move to appropriate lru list + * @zone: zone page is in + * + * Checks a page for reclaimability and moves the page to the appropriate + * zone lru list. 
+ * + * Restrictions: zone->lru_lock must be held, page must be on LRU and must + * have PageNoreclaim set. + */ +static void check_move_noreclaim_page(struct page *page, struct zone *zone) +{ + + ClearPageNoreclaim(page); /* for page_reclaimable() */ + if (page_reclaimable(page, NULL)) { + enum lru_list l = LRU_INACTIVE_ANON + page_file_cache(page); + __dec_zone_state(zone, NR_NORECLAIM); + list_move(&page->lru, &zone->list[l]); + __inc_zone_state(zone, NR_INACTIVE_ANON + l); + } else { + /* + * rotate noreclaim list + */ + SetPageNoreclaim(page); + list_move(&page->lru, &zone->list[LRU_NORECLAIM]); + } +} + +/** + * scan_mapping_noreclaim_pages - scan an address space for reclaimable pages + * @mapping: struct address_space to scan for reclaimable pages + * + * Scan all pages in mapping. Check non-reclaimable pages for + * reclaimability and move them to the appropriate zone lru list. + */ +void scan_mapping_noreclaim_pages(struct address_space *mapping) +{ + pgoff_t next = 0; + pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + struct zone *zone; + struct pagevec pvec; + + if (mapping->nrpages == 0) + return; + + pagevec_init(&pvec, 0); + while (next < end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + int i; + + zone = NULL; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + struct zone *pagezone = page_zone(page); + + if (page_index > next) + next = page_index; + next++; + + if (TestSetPageLocked(page)) + continue; + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + + if (PageLRU(page) && PageNoreclaim(page)) + check_move_noreclaim_page(page, zone); + + unlock_page(page); + + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + pagevec_release(&pvec); + } + +} +#endif diff -up linux-2.6.25.noarch/mm/swap.c.splitlru linux-2.6.25.noarch/mm/swap.c --- linux-2.6.25.noarch/mm/swap.c.splitlru 2008-04-16 
22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/swap.c 2008-04-30 21:26:32.000000000 -0400 @@ -34,8 +34,7 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; -static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; +static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs) = { {0,}, }; static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; /* @@ -96,6 +95,28 @@ void put_pages_list(struct list_head *pa } EXPORT_SYMBOL(put_pages_list); +/** + * page_lru - which LRU list should a page be on? + * @page: the page to test + * + * Returns the LRU list a page should be on, as an index + * into the array of LRU lists. + */ +enum lru_list page_lru(struct page *page) +{ + enum lru_list lru = LRU_BASE; + + if (PageNoreclaim(page)) + lru = LRU_NORECLAIM; + else { + if (PageActive(page)) + lru += LRU_ACTIVE; + lru += page_file_cache(page); + } + + return lru; +} + /* * pagevec_move_tail() must be called with IRQ disabled. * Otherwise this may cause nasty races. 
@@ -116,8 +137,10 @@ static void pagevec_move_tail(struct pag zone = pagezone; spin_lock(&zone->lru_lock); } - if (PageLRU(page) && !PageActive(page)) { - list_move_tail(&page->lru, &zone->inactive_list); + if (PageLRU(page) && !PageActive(page) && + !PageNoreclaim(page)) { + int lru = page_file_cache(page); + list_move_tail(&page->lru, &zone->list[lru]); pgmoved++; } } @@ -146,6 +169,8 @@ int rotate_reclaimable_page(struct page return 1; if (PageActive(page)) return 1; + if (PageNoreclaim(page)) + return 1; if (!PageLRU(page)) return 1; @@ -170,12 +195,25 @@ void activate_page(struct page *page) struct zone *zone = page_zone(page); spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(zone, page); + if (PageLRU(page) && !PageActive(page) && !PageNoreclaim(page)) { + int file = page_file_cache(page); + int lru = LRU_BASE + file; + del_page_from_lru_list(zone, page, lru); + SetPageActive(page); - add_page_to_active_list(zone, page); + lru += LRU_ACTIVE; + add_page_to_lru_list(zone, page, lru); __count_vm_event(PGACTIVATE); mem_cgroup_move_lists(page, true); + + if (file) { + zone->recent_scanned_file++; + zone->recent_rotated_file++; + } else { + /* Can this happen? Maybe through tmpfs... 
*/ + zone->recent_scanned_anon++; + zone->recent_rotated_anon++; + } } spin_unlock_irq(&zone->lru_lock); } @@ -189,7 +227,8 @@ void activate_page(struct page *page) */ void mark_page_accessed(struct page *page) { - if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { + if (!PageActive(page) && !PageNoreclaim(page) && + PageReferenced(page) && PageLRU(page)) { activate_page(page); ClearPageReferenced(page); } else if (!PageReferenced(page)) { @@ -199,28 +238,33 @@ void mark_page_accessed(struct page *pag EXPORT_SYMBOL(mark_page_accessed); -/** - * lru_cache_add: add a page to the page lists - * @page: the page to add - */ -void lru_cache_add(struct page *page) +void __lru_cache_add(struct page *page, enum lru_list lru) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; page_cache_get(page); if (!pagevec_add(pvec, page)) - __pagevec_lru_add(pvec); + ____pagevec_lru_add(pvec, lru); put_cpu_var(lru_add_pvecs); } -void lru_cache_add_active(struct page *page) -{ - struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); +/** + * lru_cache_add_lru - add a page to a page list + * @page: the page to be added to the LRU. + * @lru: the LRU list to which the page is added. 
+ */ +void lru_cache_add_lru(struct page *page, enum lru_list lru) +{ + if (PageActive(page)) { + VM_BUG_ON(PageNoreclaim(page)); + ClearPageActive(page); + } else if (PageNoreclaim(page)) { + VM_BUG_ON(PageActive(page)); + ClearPageNoreclaim(page); + } - page_cache_get(page); - if (!pagevec_add(pvec, page)) - __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_active_pvecs); + VM_BUG_ON(PageLRU(page) || PageActive(page) || PageNoreclaim(page)); + __lru_cache_add(page, lru); } /* @@ -230,15 +274,15 @@ void lru_cache_add_active(struct page *p */ static void drain_cpu_pagevecs(int cpu) { + struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); struct pagevec *pvec; + int lru; - pvec = &per_cpu(lru_add_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec); - - pvec = &per_cpu(lru_add_active_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); + for_each_lru(lru) { + pvec = &pvecs[lru - LRU_BASE]; + if (pagevec_count(pvec)) + ____pagevec_lru_add(pvec, lru); + } pvec = &per_cpu(lru_rotate_pvecs, cpu); if (pagevec_count(pvec)) { @@ -320,6 +364,7 @@ void release_pages(struct page **pages, if (PageLRU(page)) { struct zone *pagezone = page_zone(page); + if (pagezone != zone) { if (zone) spin_unlock_irqrestore(&zone->lru_lock, @@ -392,7 +437,7 @@ void __pagevec_release_nonlru(struct pag * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. 
*/ -void __pagevec_lru_add(struct pagevec *pvec) +void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { int i; struct zone *zone = NULL; @@ -407,9 +452,14 @@ void __pagevec_lru_add(struct pagevec *p zone = pagezone; spin_lock_irq(&zone->lru_lock); } + VM_BUG_ON(PageActive(page) || PageNoreclaim(page)); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - add_page_to_inactive_list(zone, page); + if (is_active_lru(lru)) + SetPageActive(page); + else if (is_noreclaim_lru(lru)) + SetPageNoreclaim(page); + add_page_to_lru_list(zone, page, lru); } if (zone) spin_unlock_irq(&zone->lru_lock); @@ -417,48 +467,39 @@ void __pagevec_lru_add(struct pagevec *p pagevec_reinit(pvec); } -EXPORT_SYMBOL(__pagevec_lru_add); +EXPORT_SYMBOL(____pagevec_lru_add); -void __pagevec_lru_add_active(struct pagevec *pvec) +/* + * Try to drop buffers from the pages in a pagevec + */ +void pagevec_strip(struct pagevec *pvec) { int i; - struct zone *zone = NULL; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); + if (PagePrivate(page) && !TestSetPageLocked(page)) { + if (PagePrivate(page)) + try_to_release_page(page, 0); + unlock_page(page); } - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(PageActive(page)); - SetPageActive(page); - add_page_to_active_list(zone, page); } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); } /* - * Try to drop buffers from the pages in a pagevec + * Try to free swap space from the pages in a pagevec */ -void pagevec_strip(struct pagevec *pvec) +void pagevec_swap_free(struct pagevec *pvec) { int i; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (PagePrivate(page) && !TestSetPageLocked(page)) { - if (PagePrivate(page)) - 
try_to_release_page(page, 0); + if (PageSwapCache(page) && !TestSetPageLocked(page)) { + if (PageSwapCache(page)) + remove_exclusive_swap_page(page); unlock_page(page); } } diff -up linux-2.6.25.noarch/mm/page-writeback.c.splitlru linux-2.6.25.noarch/mm/page-writeback.c --- linux-2.6.25.noarch/mm/page-writeback.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/page-writeback.c 2008-04-30 19:10:58.000000000 -0400 @@ -270,9 +270,7 @@ static unsigned long highmem_dirtyable_m struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - x += zone_page_state(z, NR_FREE_PAGES) - + zone_page_state(z, NR_INACTIVE) - + zone_page_state(z, NR_ACTIVE); + x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); } /* * Make sure that the number of highmem pages is never larger @@ -290,9 +288,7 @@ static unsigned long determine_dirtyable { unsigned long x; - x = global_page_state(NR_FREE_PAGES) - + global_page_state(NR_INACTIVE) - + global_page_state(NR_ACTIVE); + x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); diff -up linux-2.6.25.noarch/mm/mempolicy.c.splitlru linux-2.6.25.noarch/mm/mempolicy.c --- linux-2.6.25.noarch/mm/mempolicy.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/mempolicy.c 2008-04-30 19:10:58.000000000 -0400 @@ -93,6 +93,8 @@ #include #include +#include "internal.h" + /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ @@ -618,8 +620,11 @@ static void migrate_page_add(struct page /* * Avoid migrating a page that is shared with others. 
*/ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) - isolate_lru_page(page, pagelist); + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { + if (!isolate_lru_page(page)) { + list_add_tail(&page->lru, pagelist); + } + } } static struct page *new_node_page(struct page *page, unsigned long node, int **x) @@ -1922,7 +1927,7 @@ static void gather_stats(struct page *pa if (PageSwapCache(page)) md->swapcache++; - if (PageActive(page)) + if (PageActive(page) || PageNoreclaim(page)) md->active++; if (PageWriteback(page)) diff -up linux-2.6.25.noarch/mm/readahead.c.splitlru linux-2.6.25.noarch/mm/readahead.c --- linux-2.6.25.noarch/mm/readahead.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/mm/readahead.c 2008-04-30 19:10:58.000000000 -0400 @@ -229,7 +229,7 @@ int do_page_cache_readahead(struct addre */ unsigned long max_sane_readahead(unsigned long nr) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) + return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); } diff -up linux-2.6.25.noarch/fs/ramfs/inode.c.splitlru linux-2.6.25.noarch/fs/ramfs/inode.c --- linux-2.6.25.noarch/fs/ramfs/inode.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/fs/ramfs/inode.c 2008-04-30 19:10:58.000000000 -0400 @@ -61,6 +61,7 @@ struct inode *ramfs_get_inode(struct sup inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_noreclaim(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { default: diff -up linux-2.6.25.noarch/fs/ramfs/file-nommu.c.splitlru linux-2.6.25.noarch/fs/ramfs/file-nommu.c --- linux-2.6.25.noarch/fs/ramfs/file-nommu.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/fs/ramfs/file-nommu.c 2008-04-30 19:10:58.000000000 -0400 @@ -111,12 
+111,12 @@ static int ramfs_nommu_expand_for_mappin goto add_error; if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); + __pagevec_lru_add_file(&lru_pvec); unlock_page(page); } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); return 0; fsize_exceeded: diff -up linux-2.6.25.noarch/fs/nfs/dir.c.splitlru linux-2.6.25.noarch/fs/nfs/dir.c --- linux-2.6.25.noarch/fs/nfs/dir.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/fs/nfs/dir.c 2008-04-30 19:10:58.000000000 -0400 @@ -1523,7 +1523,7 @@ static int nfs_symlink(struct inode *dir if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, GFP_KERNEL)) { pagevec_add(&lru_pvec, page); - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); SetPageUptodate(page); unlock_page(page); } else diff -up linux-2.6.25.noarch/fs/cifs/file.c.splitlru linux-2.6.25.noarch/fs/cifs/file.c --- linux-2.6.25.noarch/fs/cifs/file.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/fs/cifs/file.c 2008-04-30 19:10:58.000000000 -0400 @@ -1778,7 +1778,7 @@ static void cifs_copy_cache_pages(struct SetPageUptodate(page); unlock_page(page); if (!pagevec_add(plru_pvec, page)) - __pagevec_lru_add(plru_pvec); + __pagevec_lru_add_file(plru_pvec); data += PAGE_CACHE_SIZE; } return; @@ -1912,7 +1912,7 @@ static int cifs_readpages(struct file *f bytes_read = 0; } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); /* need to free smb_read_data buf before exit */ if (smb_read_data) { diff -up linux-2.6.25.noarch/fs/ntfs/file.c.splitlru linux-2.6.25.noarch/fs/ntfs/file.c --- linux-2.6.25.noarch/fs/ntfs/file.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/fs/ntfs/file.c 2008-04-30 19:10:58.000000000 -0400 @@ -439,7 +439,7 @@ static inline int __ntfs_grab_cache_page pages[nr] = *cached_page; page_cache_get(*cached_page); if (unlikely(!pagevec_add(lru_pvec, *cached_page))) - __pagevec_lru_add(lru_pvec); + __pagevec_lru_add_file(lru_pvec); 
*cached_page = NULL; } index++; @@ -2084,7 +2084,7 @@ err_out: OSYNC_METADATA|OSYNC_DATA); } } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? "written" : "status", (unsigned long)written, (long)status); diff -up linux-2.6.25.noarch/fs/proc/proc_misc.c.splitlru linux-2.6.25.noarch/fs/proc/proc_misc.c --- linux-2.6.25.noarch/fs/proc/proc_misc.c.splitlru 2008-04-16 22:49:44.000000000 -0400 +++ linux-2.6.25.noarch/fs/proc/proc_misc.c 2008-05-07 20:38:08.000000000 -0400 @@ -132,6 +132,10 @@ static int meminfo_read_proc(char *page, unsigned long allowed; struct vmalloc_info vmi; long cached; + unsigned long inactive_anon; + unsigned long active_anon; + unsigned long inactive_file; + unsigned long active_file; /* * display in kilobytes. @@ -150,47 +154,66 @@ static int meminfo_read_proc(char *page, get_vmalloc_info(&vmi); + inactive_anon = global_page_state(NR_INACTIVE_ANON); + active_anon = global_page_state(NR_ACTIVE_ANON); + inactive_file = global_page_state(NR_INACTIVE_FILE); + active_file = global_page_state(NR_ACTIVE_FILE); + /* * Tagged format, for easy grepping and expansion. 
*/ len = sprintf(page, - "MemTotal: %8lu kB\n" - "MemFree: %8lu kB\n" - "Buffers: %8lu kB\n" - "Cached: %8lu kB\n" - "SwapCached: %8lu kB\n" - "Active: %8lu kB\n" - "Inactive: %8lu kB\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "Buffers: %8lu kB\n" + "Cached: %8lu kB\n" + "SwapCached: %8lu kB\n" + "Active: %8lu kB\n" + "Inactive: %8lu kB\n" + "Active(anon): %8lu kB\n" + "Inactive(anon): %8lu kB\n" + "Active(file): %8lu kB\n" + "Inactive(file): %8lu kB\n" +#ifdef CONFIG_NORECLAIM_LRU + "Noreclaim: %8lu kB\n" +#endif #ifdef CONFIG_HIGHMEM - "HighTotal: %8lu kB\n" - "HighFree: %8lu kB\n" - "LowTotal: %8lu kB\n" - "LowFree: %8lu kB\n" -#endif - "SwapTotal: %8lu kB\n" - "SwapFree: %8lu kB\n" - "Dirty: %8lu kB\n" - "Writeback: %8lu kB\n" - "AnonPages: %8lu kB\n" - "Mapped: %8lu kB\n" - "Slab: %8lu kB\n" - "SReclaimable: %8lu kB\n" - "SUnreclaim: %8lu kB\n" - "PageTables: %8lu kB\n" - "NFS_Unstable: %8lu kB\n" - "Bounce: %8lu kB\n" - "CommitLimit: %8lu kB\n" - "Committed_AS: %8lu kB\n" - "VmallocTotal: %8lu kB\n" - "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n", + "HighTotal: %8lu kB\n" + "HighFree: %8lu kB\n" + "LowTotal: %8lu kB\n" + "LowFree: %8lu kB\n" +#endif + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n" + "Dirty: %8lu kB\n" + "Writeback: %8lu kB\n" + "AnonPages: %8lu kB\n" + "Mapped: %8lu kB\n" + "Slab: %8lu kB\n" + "SReclaimable: %8lu kB\n" + "SUnreclaim: %8lu kB\n" + "PageTables: %8lu kB\n" + "NFS_Unstable: %8lu kB\n" + "Bounce: %8lu kB\n" + "CommitLimit: %8lu kB\n" + "Committed_AS: %8lu kB\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", K(i.totalram), K(i.freeram), K(i.bufferram), K(cached), K(total_swapcache_pages), - K(global_page_state(NR_ACTIVE)), - K(global_page_state(NR_INACTIVE)), + K(active_anon + active_file), + K(inactive_anon + inactive_file), + K(active_anon), + K(inactive_anon), + K(active_file), + K(inactive_file), +#ifdef CONFIG_NORECLAIM_LRU + K(global_page_state(NR_NORECLAIM)), +#endif 
#ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh),