Archive for November, 2015

kernel: mm: about migration in v3.15

November 30, 2015

This post discusses page migration in Linux v3.15.

what is migration
Migration moves the contents of used pages into free pages. For example, an anonymous page can be migrated by copying the relevant struct page fields, copying the physical page contents, and updating the corresponding vma's page table entries.

who are the users of migrate_pages()
The CMA allocator and the compaction mechanism.

how move_to_new_page() moves pages
In general, move_to_new_page() migrates a page by calling page->mapping->a_ops->migratepage(). If a filesystem provides a migratepage callback, its pages can be migrated by move_to_new_page().

736 /*
737  * Move a page to a newly allocated page
738  * The page is locked and all ptes have been successfully removed.
739  *
740  * The new page will have replaced the old page if this function
741  * is successful.
742  *
743  * Return value:
744  *   < 0 - error code
745  *  MIGRATEPAGE_SUCCESS - success
746  */
747 static int move_to_new_page(struct page *newpage, struct page *page,
748                                 int remap_swapcache, enum migrate_mode mode)
749 {
750         struct address_space *mapping;
751         int rc;
752 
753         /*
754          * Block others from accessing the page when we get around to
755          * establishing additional references. We are the only one
756          * holding a reference to the new page at this point.
757          */
758         if (!trylock_page(newpage))
759                 BUG();
760 
761         /* Prepare mapping for the new page.*/
762         newpage->index = page->index;
763         newpage->mapping = page->mapping;
764         if (PageSwapBacked(page))
765                 SetPageSwapBacked(newpage);
766 
767         mapping = page_mapping(page);
768         if (!mapping)
769                 rc = migrate_page(mapping, newpage, page, mode);
770         else if (mapping->a_ops->migratepage)
771                 /*
772                  * Most pages have a mapping and most filesystems provide a
773                  * migratepage callback. Anonymous pages are part of swap
774                  * space which also has its own migratepage callback. This
775                  * is the most common path for page migration.
776                  */
777                 rc = mapping->a_ops->migratepage(mapping,
778                                                 newpage, page, mode);
779         else
780                 rc = fallback_migrate_page(mapping, newpage, page, mode);
781 
782         if (rc != MIGRATEPAGE_SUCCESS) {
783                 newpage->mapping = NULL;
784         } else {
785                 if (remap_swapcache)
786                         remove_migration_ptes(page, newpage);
787                 page->mapping = NULL;
788         }
789 
790         unlock_page(newpage);
791 
792         return rc;
793 }

migratepage callback of different filesystems

  • The nfs filesystem uses nfs_migrate_page() as migratepage() callback.
  • 553 const struct address_space_operations nfs_file_aops = {
    554         .readpage = nfs_readpage,
    555         .readpages = nfs_readpages,
    556         .set_page_dirty = __set_page_dirty_nobuffers,
    557         .writepage = nfs_writepage,
    558         .writepages = nfs_writepages,
    559         .write_begin = nfs_write_begin,
    560         .write_end = nfs_write_end,
    561         .invalidatepage = nfs_invalidate_page,
    562         .releasepage = nfs_release_page,
    563         .direct_IO = nfs_direct_IO,
    564         .migratepage = nfs_migrate_page,
    565         .launder_page = nfs_launder_page,
    566         .is_dirty_writeback = nfs_check_dirty_writeback,
    567         .error_remove_page = generic_error_remove_page,
    568 #ifdef CONFIG_NFS_SWAP
    569         .swap_activate = nfs_swap_activate,
    570         .swap_deactivate = nfs_swap_deactivate,
    571 #endif
    572 };
    
  • The ext4 filesystem uses buffer_migrate_page() as migratepage() callback.
  • 3271 static const struct address_space_operations ext4_aops = {
    3272         .readpage               = ext4_readpage,
    3273         .readpages              = ext4_readpages,
    3274         .writepage              = ext4_writepage,
    3275         .writepages             = ext4_writepages,
    3276         .write_begin            = ext4_write_begin,
    3277         .write_end              = ext4_write_end,
    3278         .bmap                   = ext4_bmap,
    3279         .invalidatepage         = ext4_invalidatepage,
    3280         .releasepage            = ext4_releasepage,
    3281         .direct_IO              = ext4_direct_IO,
    3282         .migratepage            = buffer_migrate_page,
    3283         .is_partially_uptodate  = block_is_partially_uptodate,
    3284         .error_remove_page      = generic_error_remove_page,
    3285 };
    

    migration API
    The most convenient migration API is migrate_pages(). The from argument is a list of isolated pages. The get_new_page callback is used to allocate new pages as migration targets.

    The return value of migrate_pages()

  • A negative error code on failure.
  • Otherwise, the number of pages remaining in the from list. A return value of zero means all pages were migrated and the from list is now empty (a toy walkthrough of these semantics follows the code below).
  • 1082 /*
    1083  * migrate_pages - migrate the pages specified in a list, to the free pages
    1084  *                 supplied as the target for the page migration
    1085  *
    1086  * @from:               The list of pages to be migrated.
    1087  * @get_new_page:       The function used to allocate free pages to be used
    1088  *                      as the target of the page migration.
    1089  * @private:            Private data to be passed on to get_new_page()
    1090  * @mode:               The migration mode that specifies the constraints for
    1091  *                      page migration, if any.
    1092  * @reason:             The reason for page migration.
    1093  *
    1094  * The function returns after 10 attempts or if no pages are movable any more
    1095  * because the list has become empty or no retryable pages exist any more.
    1096  * The caller should call putback_lru_pages() to return pages to the LRU
    1097  * or free list only if ret != 0.
    1098  *
    1099  * Returns the number of pages that were not migrated, or an error code.
    1100  */
    1101 int migrate_pages(struct list_head *from, new_page_t get_new_page,
    1102                 unsigned long private, enum migrate_mode mode, int reason)
    1103 {
    1104         int retry = 1;
    1105         int nr_failed = 0;
    1106         int nr_succeeded = 0;
    1107         int pass = 0;
    1108         struct page *page;
    1109         struct page *page2;
    1110         int swapwrite = current->flags & PF_SWAPWRITE;
    1111         int rc;
    1112 
    1113         if (!swapwrite)
    1114                 current->flags |= PF_SWAPWRITE;
    1115 
    1116         for(pass = 0; pass < 10 && retry; pass++) {
    1117                 retry = 0;
    1118 
    1119                 list_for_each_entry_safe(page, page2, from, lru) {
    1120                         cond_resched();
    1121 
    1122                         if (PageHuge(page))
    1123                                 rc = unmap_and_move_huge_page(get_new_page,
    1124                                                 private, page, pass > 2, mode);
    1125                         else
    1126                                 rc = unmap_and_move(get_new_page, private,
    1127                                                 page, pass > 2, mode);
    1128 
    1129                         switch(rc) {
    1130                         case -ENOMEM:
    1131                                 goto out;
    1132                         case -EAGAIN:
    1133                                 retry++;
    1134                                 break;
    1135                         case MIGRATEPAGE_SUCCESS:
    1136                                 nr_succeeded++;
    1137                                 break;
    1138                         default:
    1139                                 /*
    1140                                  * Permanent failure (-EBUSY, -ENOSYS, etc.):
    1141                                  * unlike -EAGAIN case, the failed page is
    1142                                  * removed from migration page list and not
    1143                                  * retried in the next outer loop.
    1144                                  */
    1145                                 nr_failed++;
    1146                                 break;
    1147                         }
    1148                 }
    1149         }
    1150         rc = nr_failed + retry;
    1151 out:
    1152         if (nr_succeeded)
    1153                 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
    1154         if (nr_failed)
    1155                 count_vm_events(PGMIGRATE_FAIL, nr_failed);
    1156         trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
    1157 
    1158         if (!swapwrite)
    1159                 current->flags &= ~PF_SWAPWRITE;
    1160 
    1161         return rc;
    1162 }
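
    To make the return-value semantics above concrete, here is a small user-space C sketch (not kernel code; the -ENOMEM early exit is omitted and all names are illustrative) that replays the retry loop of migrate_pages(): pages reporting -EAGAIN are retried for up to 10 passes, permanent failures are counted once, and the function returns nr_failed + retry.

    #include <stdio.h>

    #define EAGAIN 11
    #define MIGRATEPAGE_SUCCESS 0

    /* Simulated per-page outcome: number of passes needed before success,
     * or -1 for a permanent failure (e.g. -EBUSY in the real kernel). */
    struct fake_page {
        int passes_needed;
        int done;
    };

    /* Mirrors the loop structure of migrate_pages() quoted above. */
    static int fake_migrate_pages(struct fake_page *pages, int n)
    {
        int retry = 1, nr_failed = 0, nr_succeeded = 0, pass;

        for (pass = 0; pass < 10 && retry; pass++) {
            retry = 0;
            for (int i = 0; i < n; i++) {
                int rc;

                if (pages[i].done)
                    continue;
                if (pages[i].passes_needed < 0)
                    rc = -1;                    /* permanent failure */
                else if (pass < pages[i].passes_needed)
                    rc = -EAGAIN;               /* retry in the next pass */
                else
                    rc = MIGRATEPAGE_SUCCESS;

                switch (rc) {
                case -EAGAIN:
                    retry++;
                    break;
                case MIGRATEPAGE_SUCCESS:
                    nr_succeeded++;
                    pages[i].done = 1;
                    break;
                default:
                    nr_failed++;                /* dropped from the list */
                    pages[i].done = 1;
                    break;
                }
            }
        }
        printf("succeeded=%d failed=%d still-retrying=%d\n",
               nr_succeeded, nr_failed, retry);
        /* 0 means every page was migrated; > 0 is the number left behind */
        return nr_failed + retry;
    }

    int main(void)
    {
        struct fake_page pages[] = {
            { 0, 0 },       /* migrates on the first pass */
            { 3, 0 },       /* needs a few -EAGAIN retries */
            { -1, 0 },      /* permanent failure */
        };

        printf("migrate_pages() would return %d\n",
               fake_migrate_pages(pages, 3));
        return 0;
    }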
    

    conclusion
    This post discusses migration in v3.15. It also describes move_to_new_page() and migrate_pages().

    patch discussion: mm, compaction: terminate async compaction when rescheduling

    November 30, 2015

    This post discusses the patch mm, compaction: terminate async compaction when rescheduling.

    merge time
    v3.16

    symptom
    In isolate_migratepages_range(), cond_resched() might reschedule, which clears the pending reschedule and makes need_resched() in should_release_lock() return false afterwards. As a result, some async compactions do not abort as expected by mm: compaction: minimise the time IRQs are disabled while isolating pages for migration.

    456 unsigned long
    457 isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
    458                 unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
    459 {
    460         unsigned long last_pageblock_nr = 0, pageblock_nr;
    461         unsigned long nr_scanned = 0, nr_isolated = 0;
    462         struct list_head *migratelist = &cc->migratepages;
    463         struct lruvec *lruvec;
    464         unsigned long flags;
    465         bool locked = false;
    466         struct page *page = NULL, *valid_page = NULL;
    467         bool skipped_async_unsuitable = false;
    468         const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
    469                                     (unevictable ? ISOLATE_UNEVICTABLE : 0);
    470 
    471         /*
    472          * Ensure that there are not too many pages isolated from the LRU
    473          * list by either parallel reclaimers or compaction. If there are,
    474          * delay for some time until fewer pages are isolated
    475          */
    476         while (unlikely(too_many_isolated(zone))) {
    477                 /* async migration should just abort */
    478                 if (!cc->sync)
    479                         return 0;
    480 
    481                 congestion_wait(BLK_RW_ASYNC, HZ/10);
    482 
    483                 if (fatal_signal_pending(current))
    484                         return 0;
    485         }
    486 
    487         /* Time to isolate some pages for migration */
    488         cond_resched();
    489         for (; low_pfn < end_pfn; low_pfn++) {
    490                 /* give a chance to irqs before checking need_resched() */
    491                 if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
    492                         if (should_release_lock(&zone->lru_lock)) {
    493                                 spin_unlock_irqrestore(&zone->lru_lock, flags);
    494                                 locked = false;
    495                         }
    496                 }
    
    174 static inline bool should_release_lock(spinlock_t *lock)
    175 {
    176         return need_resched() || spin_is_contended(lock);
    177 }
    

    effects of this patch

  • If cond_resched() reschedules and the compaction is asynchronous, isolate_migratepages_range() returns 0, which isolate_migratepages() reports as ISOLATE_ABORT.
  • compact_zone() will return COMPACT_PARTIAL = 2.
  • compact_zone_order() will return COMPACT_PARTIAL = 2.
  • try_to_compact_pages() will return COMPACT_PARTIAL = 2, unless another zone in the zonelist returns a higher value such as COMPACT_COMPLETE = 3 (a sketch of this propagation follows the diff below).
  • __alloc_pages_direct_compact() sets *did_some_progress to 2. If there is still no page in the freelist, it does not call defer_compaction(), since this compaction is asynchronous.
  • Since *did_some_progress > 0, the oom-killer will not be triggered in this round of rebalance.
  • 
    diff --git a/mm/compaction.c b/mm/compaction.c
    index 217a6ad..56331f5 100644
    --- a/mm/compaction.c
    +++ b/mm/compaction.c
    @@ -494,8 +494,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
     			return 0;
     	}
     
    +	if (cond_resched()) {
    +		/* Async terminates prematurely on need_resched() */
    +		if (cc->mode == MIGRATE_ASYNC)
    +			return 0;
    +	}
    +
     	/* Time to isolate some pages for migration */
    -	cond_resched();
     	for (; low_pfn < end_pfn; low_pfn++) {
     		/* give a chance to irqs before checking need_resched() */
     		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
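
    The propagation listed above can be replayed with a tiny stand-alone program. The enum values follow the ones cited in the list (COMPACT_PARTIAL = 2, COMPACT_COMPLETE = 3); the point is only that try_to_compact_pages() keeps the highest status over the zonelist.

    #include <stdio.h>

    /* Status ordering as cited in the bullet list above. */
    enum compact_result {
        COMPACT_SKIPPED,        /* 0 */
        COMPACT_CONTINUE,       /* 1 */
        COMPACT_PARTIAL,        /* 2 */
        COMPACT_COMPLETE,       /* 3 */
    };

    /* try_to_compact_pages() keeps the maximum status over all zones. */
    static enum compact_result propagate(const enum compact_result *zone_rc, int n)
    {
        enum compact_result rc = COMPACT_SKIPPED;

        for (int i = 0; i < n; i++)
            if (zone_rc[i] > rc)
                rc = zone_rc[i];
        return rc;
    }

    int main(void)
    {
        /* the aborted async compaction reports COMPACT_PARTIAL for its zone */
        enum compact_result a[] = { COMPACT_PARTIAL, COMPACT_SKIPPED };
        enum compact_result b[] = { COMPACT_PARTIAL, COMPACT_COMPLETE };

        printf("only the aborted zone  -> %d\n", propagate(a, 2));  /* 2 */
        printf("another zone completed -> %d\n", propagate(b, 2));  /* 3 */
        return 0;
    }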
    

    conclusion
    This post discusses how mm, compaction: terminate async compaction when rescheduling fixes the condition in which asynchronous compaction does not abort as expected. It also shows how the allocation slowpath behaves when such an asynchronous compaction aborts.

    patch discussion: mm, thp: avoid excessive compaction latency during fault

    November 30, 2015

    This post discusses the patch mm, thp: avoid excessive compaction latency during fault.

    merge time
    v3.16

    effects of this patch

    page allocations from a kernel thread

  • the first compaction in the allocation slowpath is MIGRATE_ASYNC
  • later compactions in the allocation slowpath are MIGRATE_SYNC_LIGHT

    normal page allocations from a user space thread

  • the first compaction in the allocation slowpath is MIGRATE_ASYNC
  • later compactions in the allocation slowpath are MIGRATE_SYNC_LIGHT

    transparent huge page allocations from a user space thread

  • every compaction in the allocation slowpath is MIGRATE_ASYNC (the decision logic is sketched after the diff below)
  • diff --git a/mm/page_alloc.c b/mm/page_alloc.c
    index afb29da..d88d675 100644
    --- a/mm/page_alloc.c
    +++ b/mm/page_alloc.c
    @@ -2575,7 +2575,14 @@ rebalance:
     					&did_some_progress);
     	if (page)
     		goto got_pg;
    -	migration_mode = MIGRATE_SYNC_LIGHT;
    +
    +	/*
    +	 * It can become very expensive to allocate transparent hugepages at
    +	 * fault, so use asynchronous memory compaction for THP unless it is
    +	 * khugepaged trying to collapse.
    +	 */
    +	if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
    +		migration_mode = MIGRATE_SYNC_LIGHT;
     
     	/*
     	 * If compaction is deferred for high-order allocations, it is because
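
    The decision added by this diff can be read as a small predicate: later compaction rounds stay MIGRATE_ASYNC only when the allocation looks like a THP fault, i.e. __GFP_NO_KSWAPD is set and the caller is not a kernel thread. Below is a user-space sketch of that predicate; the flag bit values are placeholders, only the test mirrors the patch.

    #include <stdio.h>

    /* Placeholder bit values -- only the test in next_mode() mirrors the patch. */
    #define __GFP_NO_KSWAPD (1u << 0)
    #define PF_KTHREAD      (1u << 1)

    enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

    /*
     * After the first (always async) compaction attempt fails, decide which
     * mode the remaining attempts in the slowpath should use.
     */
    static enum migrate_mode next_mode(unsigned int gfp_mask, unsigned int task_flags)
    {
        if (!(gfp_mask & __GFP_NO_KSWAPD) || (task_flags & PF_KTHREAD))
            return MIGRATE_SYNC_LIGHT;  /* regular allocation or khugepaged */
        return MIGRATE_ASYNC;           /* THP fault: stay asynchronous */
    }

    int main(void)
    {
        printf("user THP fault      -> %d (MIGRATE_ASYNC)\n",
               next_mode(__GFP_NO_KSWAPD, 0));
        printf("khugepaged collapse -> %d (MIGRATE_SYNC_LIGHT)\n",
               next_mode(__GFP_NO_KSWAPD, PF_KTHREAD));
        printf("normal allocation   -> %d (MIGRATE_SYNC_LIGHT)\n",
               next_mode(0, 0));
        return 0;
    }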
    

    __GFP_NO_KSWAPD and huge page allocation
    get_huge_zero_page(), which returns a huge zero page, calls alloc_pages() with GFP_TRANSHUGE in its gfp_mask. GFP_TRANSHUGE includes __GFP_NO_KSWAPD, so the patch tests __GFP_NO_KSWAPD to detect that an allocation is likely a transparent huge page allocation.

    kernel: mm: gfp_mask and ion system heap allocation and kernel: mm: gfp_mask and kgsl allocator show that the ion system heap and kgsl allocators allocate high-order pages with __GFP_NO_KSWAPD set in gfp_mask. But since __GFP_NORETRY is also set, these allocations never reach the second compaction attempt in the slowpath, so the patch has no effect on them.

    This patch might affect other allocations that also set __GFP_NO_KSWAPD in gfp_mask.
    Since __GFP_NO_KSWAPD implies that the caller does not want to disturb the system by waking up kswapd, it is also reasonable not to compact the system synchronously for such allocations.

    123 #define GFP_TRANSHUGE   (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
    124                          __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
    125                          __GFP_NO_KSWAPD)
    
    176 struct page *get_huge_zero_page(void)
    177 {
    178         struct page *zero_page;
    179 retry:
    180         if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
    181                 return READ_ONCE(huge_zero_page);
    182 
    183         zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
    184                         HPAGE_PMD_ORDER);
    185         if (!zero_page) {
    186                 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
    187                 return NULL;
    188         }
    189         count_vm_event(THP_ZERO_PAGE_ALLOC);
    190         preempt_disable();
    191         if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
    192                 preempt_enable();
    193                 __free_pages(zero_page, compound_order(zero_page));
    194                 goto retry;
    195         }
    196 
    197         /* We take additional reference here. It will be put back by shrinker */
    198         atomic_set(&huge_zero_refcount, 2);
    199         preempt_enable();
    200         return READ_ONCE(huge_zero_page);
    201 }
    

    conclusion
    This post discusses how mm, thp: avoid excessive compaction latency during fault reduces compaction effort when __GFP_NO_KSWAPD is set.

    patch discussion: mm, compaction: embed migration mode in compact_control

    November 30, 2015

    This post discusses the patch mm, compaction: embed migration mode in compact_control.

    merge time
    v3.16

    effects of this patch
    compact_control's sync field is replaced by enum migrate_mode mode.

    how callers set up compaction control’s migrate_mode

  • compaction through /proc/sys/vm/compact_memory sets mode to MIGRATE_SYNC
  • compaction from kswapd sets mode to MIGRATE_ASYNC
  • the first compaction from the allocation slowpath sets mode to MIGRATE_ASYNC
  • later compactions from the allocation slowpath set mode to MIGRATE_SYNC_LIGHT

    zone's cached migrate pfn and compaction control's migrate_mode

  • zone->compact_cached_migrate_pfn[0] is for MIGRATE_ASYNC.
  • zone->compact_cached_migrate_pfn[1] is for MIGRATE_SYNC_LIGHT and MIGRATE_SYNC (a sketch of this indexing follows the diff below).
  • diff --git a/mm/internal.h b/mm/internal.h
    index 6ee580d..a25424a 100644
    --- a/mm/internal.h
    +++ b/mm/internal.h
    @@ -134,7 +134,7 @@ struct compact_control {
     	unsigned long nr_migratepages;	/* Number of pages to migrate */
     	unsigned long free_pfn;		/* isolate_freepages search base */
     	unsigned long migrate_pfn;	/* isolate_migratepages search base */
    -	bool sync;			/* Synchronous migration */
    +	enum migrate_mode mode;		/* Async or sync migration mode */
     	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
     	bool finished_update_free;	/* True when the zone cached pfns are
     					 * no longer being updated
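
    A minimal sketch of how the two cached migrate pfns line up with the new mode field: index 0 serves MIGRATE_ASYNC and index 1 serves the sync modes. The helper name and zone layout are made up for illustration; only the indexing rule comes from the bullets above.

    #include <stdio.h>

    enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

    struct fake_zone {
        /* [0]: async scanner start, [1]: sync/sync-light scanner start */
        unsigned long compact_cached_migrate_pfn[2];
    };

    /* Hypothetical helper: pick the cached starting pfn for a given mode. */
    static unsigned long cached_migrate_start(const struct fake_zone *zone,
                                              enum migrate_mode mode)
    {
        return zone->compact_cached_migrate_pfn[mode == MIGRATE_ASYNC ? 0 : 1];
    }

    int main(void)
    {
        struct fake_zone zone = { .compact_cached_migrate_pfn = { 0x1000, 0x0800 } };

        printf("async scanner starts at pfn %#lx\n",
               cached_migrate_start(&zone, MIGRATE_ASYNC));
        printf("sync-light scanner starts at pfn %#lx\n",
               cached_migrate_start(&zone, MIGRATE_SYNC_LIGHT));
        return 0;
    }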
    

    conclusion
    This post discusses the effects of mm, compaction: embed migration mode in compact_control.

    patch discussion: mm, compaction: add per-zone migration pfn cache for async compaction

    November 30, 2015

    This post discusses the patch mm, compaction: add per-zone migration pfn cache for async compaction.

    merge time
    v3.16

    symptom in v3.15
    If the migrate scanner of an async compaction finds a pageblock whose migrate type is neither CMA nor Movable, it skips the pageblock. But it does not update the pageblock skip flag or the zone's compact_cached_migrate_pfn, so the next async compaction scans and skips these pageblocks all over again. This wastes CPU, especially on a system with a lot of memory.

    528                 /* If isolation recently failed, do not retry */
    529                 pageblock_nr = low_pfn >> pageblock_order;
    530                 if (last_pageblock_nr != pageblock_nr) {
    531                         int mt;
    532 
    533                         last_pageblock_nr = pageblock_nr;
    534                         if (!isolation_suitable(cc, page))
    535                                 goto next_pageblock;
    536 
    537                         /*
    538                          * For async migration, also only scan in MOVABLE
    539                          * blocks. Async migration is optimistic to see if
    540                          * the minimum amount of work satisfies the allocation
    541                          */
    542                         mt = get_pageblock_migratetype(page);
    543                         if (!cc->sync && !migrate_async_suitable(mt)) {
    544                                 cc->finished_update_migrate = true;
    545                                 skipped_async_unsuitable = true;
    546                                 goto next_pageblock;
    547                         }
    548                 }
    

    how does the patch improve this
    Each zone now has two cached migrate pfns: one for async compaction and one for sync compaction.

  • If the async compaction's migrate scanner skips a pageblock whose migrate type is neither CMA nor Movable, it updates zone->compact_cached_migrate_pfn[0], but it still does not set the pageblock's skip flag.
  • The following async compaction takes advantage of the updated zone->compact_cached_migrate_pfn[0] so its migrate scanner does not scan these pages again (a toy simulation follows the diff below).
  • I am not sure why this patch does not set the pageblock's skip flag for async compaction. In v4.3, both the zone's cached async migrate pfn and the pageblock's skip flag are updated.
  • diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
    index ae693e1..10a96ee 100644
    --- a/include/linux/mmzone.h
    +++ b/include/linux/mmzone.h
    @@ -360,9 +360,10 @@ struct zone {
     	/* Set to true when the PG_migrate_skip bits should be cleared */
     	bool			compact_blockskip_flush;
     
    -	/* pfns where compaction scanners should start */
    +	/* pfn where compaction free scanner should start */
     	unsigned long		compact_cached_free_pfn;
    -	unsigned long		compact_cached_migrate_pfn;
    +	/* pfn where async and sync compaction migration scanner should start */
    +	unsigned long		compact_cached_migrate_pfn[2];
     #endif
     #ifdef CONFIG_MEMORY_HOTPLUG
     	/* see spanned/present_pages for more description */
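
    To show the gain, here is a toy simulation (made-up zone layout and names) of an async migrate scanner: after skipping an unsuitable pageblock it records the next pageblock boundary in the async slot of the cache, so a follow-up async compaction starts past the skipped blocks instead of rescanning them; the sync slot is untouched.

    #include <stdio.h>
    #include <stdbool.h>

    #define PAGEBLOCK_PAGES 1024UL

    enum blocktype { UNMOVABLE, RECLAIMABLE, MOVABLE, CMA };

    struct fake_zone {
        /* [0]: async compaction, [1]: sync compaction */
        unsigned long compact_cached_migrate_pfn[2];
        enum blocktype type[8];         /* migrate type per pageblock */
    };

    static bool async_suitable(enum blocktype t)
    {
        return t == MOVABLE || t == CMA;
    }

    /* One async scan pass: skip unsuitable pageblocks and advance the cache. */
    static void async_scan(struct fake_zone *zone)
    {
        unsigned long pfn = zone->compact_cached_migrate_pfn[0];
        unsigned long scanned = 0;

        for (; pfn < 8 * PAGEBLOCK_PAGES; pfn += PAGEBLOCK_PAGES) {
            scanned++;
            if (!async_suitable(zone->type[pfn / PAGEBLOCK_PAGES])) {
                /* remember the skip so the next async pass starts later */
                zone->compact_cached_migrate_pfn[0] = pfn + PAGEBLOCK_PAGES;
                continue;
            }
            break;      /* found a pageblock worth isolating from */
        }
        printf("scanned %lu pageblock(s), async cache now at pfn %lu\n",
               scanned, zone->compact_cached_migrate_pfn[0]);
    }

    int main(void)
    {
        struct fake_zone zone = {
            .type = { UNMOVABLE, UNMOVABLE, RECLAIMABLE, MOVABLE,
                      MOVABLE, UNMOVABLE, MOVABLE, MOVABLE },
        };

        async_scan(&zone);      /* walks over the three unsuitable blocks once */
        async_scan(&zone);      /* starts directly at the first MOVABLE block */
        return 0;
    }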
    

    conclusion
    This post discusses how mm, compaction: add per-zone migration pfn cache for async compaction improves async compaction's efficiency by giving each zone a cached migrate pfn dedicated to async compaction.

    patch discussion: net: use __GFP_NORETRY for high order allocations

    November 30, 2015

    This post discusses the patch net: use __GFP_NORETRY for high order allocations.

    skb_page_frag_refill() in v3.13
    In v3.13, skb_page_frag_refill() allocates pages in the following fallback order.

  • alloc order-3 pages with gfp_mask = 0x0042d0 = (__GFP_COMP | __GFP_NOWARN | GFP_KERNEL)
  • alloc order-2 pages with gfp_mask = 0x0042d0 = (__GFP_COMP | __GFP_NOWARN | GFP_KERNEL)
  • alloc order-1 pages with gfp_mask = 0x0042d0 = (__GFP_COMP | __GFP_NOWARN | GFP_KERNEL)
  • alloc order-0 pages with gfp_mask = 0x0000d0 = (GFP_KERNEL)
  • 1842 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
    1843 
    1844 /**
    1845  * skb_page_frag_refill - check that a page_frag contains enough room
    1846  * @sz: minimum size of the fragment we want to get
    1847  * @pfrag: pointer to page_frag
    1848  * @prio: priority for memory allocation
    1849  *
    1850  * Note: While this allocator tries to use high order pages, there is
    1851  * no guarantee that allocations succeed. Therefore, @sz MUST be
    1852  * less or equal than PAGE_SIZE.
    1853  */
    1854 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
    1855 {
    1856         int order;
    1857 
    1858         if (pfrag->page) {
    1859                 if (atomic_read(&pfrag->page->_count) == 1) {
    1860                         pfrag->offset = 0;
    1861                         return true;
    1862                 }
    1863                 if (pfrag->offset + sz <= pfrag->size)
    1864                         return true;
    1865                 put_page(pfrag->page);
    1866         }
    1867 
    1868         /* We restrict high order allocations to users that can afford to wait */
    1869         order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
    1870 
    1871         do {
    1872                 gfp_t gfp = prio;
    1873 
    1874                 if (order)
    1875                         gfp |= __GFP_COMP | __GFP_NOWARN;
    1876                 pfrag->page = alloc_pages(gfp, order);
    1877                 if (likely(pfrag->page)) {
    1878                         pfrag->offset = 0;
    1879                         pfrag->size = PAGE_SIZE << order;
    1880                         return true;
    1881                 }
    1882         } while (--order >= 0);
    1883 
    1884         return false;
    1885 }
    1886 EXPORT_SYMBOL(skb_page_frag_refill);
    

    skb_page_frag_refill() in v3.14 with this patch
    In v3.14, skb_page_frag_refill() allocates pages in the following fallback order (the masks are verified in a small program after the diff below).

  • alloc order-3 pages with gfp_mask = 0x0052d0 = (__GFP_COMP | __GFP_NORETRY | __GFP_NOWARN | GFP_KERNEL)
  • alloc order-2 pages with gfp_mask = 0x0052d0 = (__GFP_COMP | __GFP_NORETRY | __GFP_NOWARN | GFP_KERNEL)
  • alloc order-1 pages with gfp_mask = 0x0052d0 = (__GFP_COMP | __GFP_NORETRY | __GFP_NOWARN | GFP_KERNEL)
  • alloc order-0 pages with gfp_mask = 0x0000d0 = (GFP_KERNEL)
  • @@ -1845,7 +1847,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
     		gfp_t gfp = prio;
     
     		if (order)
    -			gfp |= __GFP_COMP | __GFP_NOWARN;
    +			gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
     		pfrag->page = alloc_pages(gfp, order);
     		if (likely(pfrag->page)) {
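
    The gfp masks quoted in both lists can be checked with a quick stand-alone computation. The flag values below follow the v3.13/v3.14-era kernel headers (GFP_KERNEL = __GFP_WAIT | __GFP_IO | __GFP_FS = 0xd0); only the printout is new.

    #include <stdio.h>

    /* gfp flag values as in the v3.13/v3.14 kernel headers */
    #define __GFP_WAIT      0x10u
    #define __GFP_IO        0x40u
    #define __GFP_FS        0x80u
    #define __GFP_NOWARN    0x200u
    #define __GFP_NORETRY   0x1000u
    #define __GFP_COMP      0x4000u
    #define GFP_KERNEL      (__GFP_WAIT | __GFP_IO | __GFP_FS)

    int main(void)
    {
        unsigned int high_order_v3_13 = GFP_KERNEL | __GFP_COMP | __GFP_NOWARN;
        unsigned int high_order_v3_14 = high_order_v3_13 | __GFP_NORETRY;

        printf("high-order gfp in v3.13: %#08x\n", high_order_v3_13); /* 0x0042d0 */
        printf("high-order gfp in v3.14: %#08x\n", high_order_v3_14); /* 0x0052d0 */
        printf("order-0 fallback:        %#08x\n", GFP_KERNEL);       /* 0x0000d0 */
        return 0;
    }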
    

    behaviors of the order-3 page allocation slowpath for allocations from skb_page_frag_refill() in v3.14 with this patch

  • __GFP_NO_KSWAPD is not set: wake up kswapd
  • try get_page_from_freelist() before entering rebalance
  • ALLOC_NO_WATERMARKS is not set: skip __alloc_pages_high_priority(), which would return a page on success
  • wait is true: enter rebalance, which includes compaction and direct reclaim
  • Try async compaction, which returns a page on success.
  • Try direct reclaim, which returns a page on success.
  • If neither compaction nor direct reclaim makes progress, the oom-killer is not triggered because __GFP_NORETRY is set (see oom_gfp_allowed() below).
  • should_alloc_retry() returns false because __GFP_NORETRY is set, so the slowpath does not loop back to rebalance and the allocation fails.
  • wait = gfp_mask & __GFP_WAIT = __GFP_WAIT
    alloc_flags = gfp_to_alloc_flags(gfp_mask) = 0x00000040 = (ALLOC_WMARK_MIN | ALLOC_CPUSET)
    
     85 static inline bool oom_gfp_allowed(gfp_t gfp_mask)
     86 {
     87         return (gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY);
     88 }
    

    behaviors of should_alloc_retry() for the order-3 page allocation from skb_page_frag_refill()

  • In v3.13, should_alloc_retry() always returns true for this order-3 allocation (order 3 <= PAGE_ALLOC_COSTLY_ORDER), so the slowpath keeps retrying the allocation.
  • In v3.14, it always returns false because __GFP_NORETRY is set, so the slowpath stops retrying the allocation (a small walkthrough follows the code below).
    2154 static inline int
    2155 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
    2156                                 unsigned long did_some_progress,
    2157                                 unsigned long pages_reclaimed)
    2158 {
    2159         /* Do not loop if specifically requested */
    2160         if (gfp_mask & __GFP_NORETRY)
    2161                 return 0;
    2162 
    2163         /* Always retry if specifically requested */
    2164         if (gfp_mask & __GFP_NOFAIL)
    2165                 return 1;
    2166 
    2167         /*
    2168          * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
    2169          * making forward progress without invoking OOM. Suspend also disables
    2170          * storage devices so kswapd will not help. Bail if we are suspending.
    2171          */
    2172         if (!did_some_progress && pm_suspended_storage())
    2173                 return 0;
    2174 
    2175         /*
    2176          * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
    2177          * means __GFP_NOFAIL, but that may not be true in other
    2178          * implementations.
    2179          */
    2180         if (order <= PAGE_ALLOC_COSTLY_ORDER)
    2181                 return 1;
    2182 
    2183         /*
    2184          * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
    2185          * specified, then we retry until we no longer reclaim any pages
    2186          * (above), or we've reclaimed an order of pages at least as
    2187          * large as the allocation's order. In both cases, if the
    2188          * allocation still fails, we stop retrying.
    2189          */
    2190         if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
    2191                 return 1;
    2192 
    2193         return 0;
    2194 }
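
    Tracing should_alloc_retry() for the order-3 case gives the two behaviors listed above. The reduced replica below keeps only the branches relevant here (no __GFP_NOFAIL, no __GFP_REPEAT, suspend ignored) and assumes PAGE_ALLOC_COSTLY_ORDER = 3 and the same era-specific flag values as in the earlier mask check.

    #include <stdio.h>
    #include <stdbool.h>

    #define __GFP_WAIT      0x10u
    #define __GFP_IO        0x40u
    #define __GFP_FS        0x80u
    #define __GFP_NOWARN    0x200u
    #define __GFP_NORETRY   0x1000u
    #define __GFP_COMP      0x4000u
    #define GFP_KERNEL      (__GFP_WAIT | __GFP_IO | __GFP_FS)

    #define PAGE_ALLOC_COSTLY_ORDER 3

    /* Reduced replica of should_alloc_retry() for an order-3 request. */
    static bool retry_order3(unsigned int gfp_mask)
    {
        if (gfp_mask & __GFP_NORETRY)
            return false;                       /* v3.14 path */
        if (3 <= PAGE_ALLOC_COSTLY_ORDER)
            return true;                        /* v3.13 path */
        return false;
    }

    int main(void)
    {
        unsigned int v3_13 = GFP_KERNEL | __GFP_COMP | __GFP_NOWARN;
        unsigned int v3_14 = v3_13 | __GFP_NORETRY;

        printf("v3.13 mask %#x -> retry = %d\n", v3_13, retry_order3(v3_13));
        printf("v3.14 mask %#x -> retry = %d\n", v3_14, retry_order3(v3_14));
        return 0;
    }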
    

    effect of this patch

  • Without this patch (v3.13): order-3 page allocations from skb_page_frag_refill() might get stuck in the allocation slowpath and even trigger the oom-killer while memory is under high pressure and fragmented.
  • With this patch (v3.14): order-3 page allocations from skb_page_frag_refill() try async compaction and direct reclaim once, avoid triggering the oom-killer, and return without retrying.
  • For skb_page_frag_refill(), the order-3 allocation is only an efficiency optimization. If it fails, the function falls back to order-2, order-1, and finally order-0 pages. However, when memory is under high pressure, an order-3 allocation without __GFP_NORETRY set degrades system performance.

    conclusion
    This post discusses how the patch net: use __GFP_NORETRY for high order allocations avoids triggering the oom-killer by setting __GFP_NORETRY in the high-order allocations from skb_page_frag_refill().

    patch discussion: mm/compaction: disallow high-order page for migration target

    November 29, 2015

    This post discusses the patch mm/compaction: disallow high-order page for migration target.

    merge time
    v3.15

    effect of this patch
    After this patch, the free scanner only isolates free pages from CMA or Movable pageblocks, and it no longer treats a large free buddy page (order >= pageblock_order) as a suitable migration target.

    diff --git a/mm/compaction.c b/mm/compaction.c
    index b6ab771..9a03fdb 100644
    --- a/mm/compaction.c
    +++ b/mm/compaction.c
    @@ -217,21 +217,12 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
     /* Returns true if the page is within a block suitable for migration to */
     static bool suitable_migration_target(struct page *page)
     {
    -	int migratetype = get_pageblock_migratetype(page);
    -
    -	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
    -	if (migratetype == MIGRATE_RESERVE)
    -		return false;
    -
    -	if (is_migrate_isolate(migratetype))
    -		return false;
    -
    -	/* If the page is a large free page, then allow migration */
    +	/* If the page is a large free page, then disallow migration */
     	if (PageBuddy(page) && page_order(page) >= pageblock_order)
    -		return true;
    +		return false;
     
     	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
    -	if (migrate_async_suitable(migratetype))
    +	if (migrate_async_suitable(get_pageblock_migratetype(page)))
     		return true;
     
     	/* Otherwise skip the block */
    

    compaction and migration type
    From v3.15 to v4.3, compaction has no effect when all free pages sit in pageblocks of the Unmovable and Reclaimable migrate types, as in the /proc/pagetypeinfo snapshot below.

    ------ PAGETYPEINFO (/proc/pagetypeinfo) ------
    Page block order: 10
    Pages per block:  1024  
    
    Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10    
    Node    0, zone      DMA, type    Unmovable   1533   3972    896      0      0      0      0      0      0      0      0     
    Node    0, zone      DMA, type  Reclaimable    618   3006      6      0      0      0      0      0      0      0      0     
    Node    0, zone      DMA, type      Movable      0      0      0      0      0      0      0      0      0      0      0     
    Node    0, zone      DMA, type      Reserve      0      0      3      5      0      0      0      0      0      0      0     
    Node    0, zone      DMA, type          CMA      0      0      0      0      0      0      0      0      0      0      0     
    Node    0, zone      DMA, type      Isolate      0      0      0      0      0      0      0      0      0      0      0     
    
    Number of blocks type     Unmovable  Reclaimable      Movable      Reserve          CMA      Isolate 
    Node 0, zone      DMA          215            8          396            2          106            0
    

    conclusion
    This post discusses the effect of this patch and a compaction limitation, related to migrate types, that exists from v3.15 to v4.3.

    patch discussion: mm, compaction: return failed migration target pages back to freelist

    November 29, 2015

    This post discusses the patch mm, compaction: return failed migration target pages back to freelist.

    merge time
    v3.16

    migrate_pages() and unmap_and_move() in v3.15
    In v3.15, migrate_pages() and unmap_and_move() take a get_new_page callback that lets the free scanner supply isolated free pages as migration targets. If a page migration fails, the isolated target page is returned to the buddy system by putback_lru_page(newpage). This wastes work, since the page could have been reused by subsequent migrations.

    941 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
    942                         struct page *page, int force, enum migrate_mode mode)
    943 {
    944         int rc = 0;
    945         int *result = NULL;
    946         struct page *newpage = get_new_page(page, private, &result);
    947 
    948         if (!newpage)
    949                 return -ENOMEM;
    950 
    951         if (page_count(page) == 1) {
    952                 /* page was freed from under us. So we are done. */
    953                 goto out;
    954         }
    955 
    956         if (unlikely(PageTransHuge(page)))
    957                 if (unlikely(split_huge_page(page)))
    958                         goto out;
    959 
    960         rc = __unmap_and_move(page, newpage, force, mode);
    961 
    962         if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
    963                 /*
    964                  * A ballooned page has been migrated already.
    965                  * Now, it's the time to wrap-up counters,
    966                  * handle the page back to Buddy and return.
    967                  */
    968                 dec_zone_page_state(page, NR_ISOLATED_ANON +
    969                                     page_is_file_cache(page));
    970                 balloon_page_free(page);
    971                 return MIGRATEPAGE_SUCCESS;
    972         }
    973 out:
    974         if (rc != -EAGAIN) {
    975                 /*
    976                  * A page that has been migrated has all references
    977                  * removed and will be freed. A page that has not been
    978                  * migrated will have kepts its references and be
    979                  * restored.
    980                  */
    981                 list_del(&page->lru);
    982                 dec_zone_page_state(page, NR_ISOLATED_ANON +
    983                                 page_is_file_cache(page));
    984                 putback_lru_page(page);
    985         }
    986         /*
    987          * Move the new page to the LRU. If migration was not successful
    988          * then this will free the page.
    989          */
    990         putback_lru_page(newpage);
    991         if (result) {
    992                 if (rc)
    993                         *result = rc;
    994                 else
    995                         *result = page_to_nid(newpage);
    996         }
    997         return rc;
    998 }
    

    migrate_pages() and unmap_and_move() in v3.16
    In v3.16, migrate_pages() and unmap_and_move() take a put_new_page callback in addition to get_new_page. If migration fails, unmap_and_move() calls the put_new_page callback so the caller can reclaim the unused target page.

    This extension is done by mm, migration: add destination page freeing callback.

    939 static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
    940                         unsigned long private, struct page *page, int force,
    941                         enum migrate_mode mode)
    942 {
    943         int rc = 0;
    944         int *result = NULL;
    945         struct page *newpage = get_new_page(page, private, &result);
    946 
    947         if (!newpage)
    948                 return -ENOMEM;
    949 
    950         if (page_count(page) == 1) {
    951                 /* page was freed from under us. So we are done. */
    952                 goto out;
    953         }
    954 
    955         if (unlikely(PageTransHuge(page)))
    956                 if (unlikely(split_huge_page(page)))
    957                         goto out;
    958 
    959         rc = __unmap_and_move(page, newpage, force, mode);
    960 
    961         if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
    962                 /*
    963                  * A ballooned page has been migrated already.
    964                  * Now, it's the time to wrap-up counters,
    965                  * handle the page back to Buddy and return.
    966                  */
    967                 dec_zone_page_state(page, NR_ISOLATED_ANON +
    968                                     page_is_file_cache(page));
    969                 balloon_page_free(page);
    970                 return MIGRATEPAGE_SUCCESS;
    971         }
    972 out:
    973         if (rc != -EAGAIN) {
    974                 /*
    975                  * A page that has been migrated has all references
    976                  * removed and will be freed. A page that has not been
    977                  * migrated will have kepts its references and be
    978                  * restored.
    979                  */
    980                 list_del(&page->lru);
    981                 dec_zone_page_state(page, NR_ISOLATED_ANON +
    982                                 page_is_file_cache(page));
    983                 putback_lru_page(page);
    984         }
    985 
    986         /*
    987          * If migration was not successful and there's a freeing callback, use
    988          * it.  Otherwise, putback_lru_page() will drop the reference grabbed
    989          * during isolation.
    990          */
    991         if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
    992                 ClearPageSwapBacked(newpage);
    993                 put_new_page(newpage, private);
    994         } else
    995                 putback_lru_page(newpage);
    996 
    997         if (result) {
    998                 if (rc)
    999                         *result = rc;
    1000                 else
    1001                         *result = page_to_nid(newpage);
    1002         }
    1003         return rc;
    1004 }
    

    effect of this patch
    This patch adds the compaction_free() callback. compact_zone() calls migrate_pages() with compaction_free() as the put_new_page argument, which puts unused isolated free pages back into the compaction control's freelist and increases compaction efficiency (a sketch of this pattern follows the diff below).

    
    diff --git a/mm/compaction.c b/mm/compaction.c
    --- a/mm/compaction.c
    +++ b/mm/compaction.c
    +static void compaction_free(struct page *page, unsigned long data)
    +{
    +	struct compact_control *cc = (struct compact_control *)data;
    +
    +	list_add(&page->lru, &cc->freepages);
    +	cc->nr_freepages++;
    +}
    
     /* possible outcome of isolate_migratepages */
    @@ -1016,8 +1025,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
     		}
     
     		nr_migrate = cc->nr_migratepages;
    -		err = migrate_pages(&cc->migratepages, compaction_alloc, NULL,
    -				(unsigned long)cc,
    +		err = migrate_pages(&cc->migratepages, compaction_alloc,
    +				compaction_free, (unsigned long)cc,
     				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
     				MR_COMPACTION);
     		update_nr_listpages(cc);
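
    Here is a user-space caricature (illustrative types, not kernel code) of the pattern this patch introduces: the caller hands migration a free-page callback, and a failed migration routes the unused target page back into the caller's freelist instead of freeing it to the buddy allocator.

    #include <stdio.h>

    struct fake_page { int id; struct fake_page *next; };

    struct fake_cc {
        struct fake_page *freepages;    /* compaction's private freelist */
        int nr_freepages;
    };

    /* put_new_page-style callback: recycle an unused target page. */
    static void fake_compaction_free(struct fake_page *page, void *data)
    {
        struct fake_cc *cc = data;

        page->next = cc->freepages;
        cc->freepages = page;
        cc->nr_freepages++;
    }

    /* Caricature of unmap_and_move(): on failure, give the target page back. */
    static int fake_unmap_and_move(struct fake_page *newpage, int migrate_ok,
                                   void (*put_new_page)(struct fake_page *, void *),
                                   void *private)
    {
        if (migrate_ok)
            return 0;                           /* MIGRATEPAGE_SUCCESS */
        if (put_new_page)
            put_new_page(newpage, private);     /* recycled, not freed */
        return -1;                              /* e.g. -EAGAIN or -EBUSY */
    }

    int main(void)
    {
        struct fake_cc cc = { .freepages = NULL, .nr_freepages = 0 };
        struct fake_page target = { .id = 42 };

        fake_unmap_and_move(&target, 0, fake_compaction_free, &cc);
        printf("freelist holds %d page(s); head id=%d\n",
               cc.nr_freepages, cc.freepages->id);
        return 0;
    }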
    

    conclusion
    This post shows how migrate_pages() and unmap_and_move() evolved from v3.15 to v3.16. It also discusses how this patch helps increase compaction efficiency.

    patch discussion: mm, compaction: ignore pageblock skip when manually invoking compaction

    November 28, 2015

    This post discusses the kernel patch mm, compaction: ignore pageblock skip when manually invoking compaction.

    merge time
    v3.15

    what the patch does
    This patch sets ignore_skip_hint of the compaction control when compaction is triggered through /proc/sys/vm/compact_memory. This forces the compaction to scan every pageblock instead of honoring the pageblock skip flags.

    diff --git a/mm/compaction.c b/mm/compaction.c
    index 9185775..37b3799 100644
    --- a/mm/compaction.c
    +++ b/mm/compaction.c
    @@ -1186,6 +1186,7 @@ static void compact_node(int nid)
     	struct compact_control cc = {
     		.order = -1,
     		.sync = true,
    +		.ignore_skip_hint = true,
     	};
     
     	__compact_pgdat(NODE_DATA(nid), &cc);
    

    what is the effect of compaction.ignore_skip_hint

  • compact_zone() calls isolate_migratepages() to isolate used pages starting at cc->migrate_pfn
  • isolate_migratepages_range() scans pages one by one. When it reaches a page in a new pageblock, it calls isolation_suitable() to decide whether the whole pageblock should be skipped.
  • isolation_suitable() returns false if the skip flag of the pageblock is set. cc->ignore_skip_hint forces it to return true, so no pageblock is skipped on that account (a small simulation follows the code below).
  • isolate_freepages() also consults cc->ignore_skip_hint when deciding whether to skip a pageblock.
  • 456 unsigned long
    457 isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
    458                 unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
    459 {
    460         unsigned long last_pageblock_nr = 0, pageblock_nr;
    461         unsigned long nr_scanned = 0, nr_isolated = 0;
    462         struct list_head *migratelist = &cc->migratepages;
    463         struct lruvec *lruvec;
    464         unsigned long flags;
    465         bool locked = false;
    466         struct page *page = NULL, *valid_page = NULL;
    467         bool skipped_async_unsuitable = false;
    468         const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
    469                                     (unevictable ? ISOLATE_UNEVICTABLE : 0);
    470 
    471         /*
    472          * Ensure that there are not too many pages isolated from the LRU
    473          * list by either parallel reclaimers or compaction. If there are,
    474          * delay for some time until fewer pages are isolated
    475          */
    476         while (unlikely(too_many_isolated(zone))) {
    477                 /* async migration should just abort */
    478                 if (!cc->sync)
    479                         return 0;
    480 
    481                 congestion_wait(BLK_RW_ASYNC, HZ/10);
    482 
    483                 if (fatal_signal_pending(current))
    484                         return 0;
    485         }
    486 
    487         /* Time to isolate some pages for migration */
    488         cond_resched();
    489         for (; low_pfn < end_pfn; low_pfn++) {
    490                 /* give a chance to irqs before checking need_resched() */
    491                 if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
    492                         if (should_release_lock(&zone->lru_lock)) {
    493                                 spin_unlock_irqrestore(&zone->lru_lock, flags);
    494                                 locked = false;
    495                         }
    496                 }
    497 
    498                 /*
    499                  * migrate_pfn does not necessarily start aligned to a
    500                  * pageblock. Ensure that pfn_valid is called when moving
    501                  * into a new MAX_ORDER_NR_PAGES range in case of large
    502                  * memory holes within the zone
    503                  */
    504                 if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
    505                         if (!pfn_valid(low_pfn)) {
    506                                 low_pfn += MAX_ORDER_NR_PAGES - 1;
    507                                 continue;
    508                         }
    509                 }
    510 
    511                 if (!pfn_valid_within(low_pfn))
    512                         continue;
    513                 nr_scanned++;
    514 
    515                 /*
    516                  * Get the page and ensure the page is within the same zone.
    517                  * See the comment in isolate_freepages about overlapping
    518                  * nodes. It is deliberate that the new zone lock is not taken
    519                  * as memory compaction should not move pages between nodes.
    520                  */
    521                 page = pfn_to_page(low_pfn);
    522                 if (page_zone(page) != zone)
    523                         continue;
    524 
    525                 if (!valid_page)
    526                         valid_page = page;
    527 
    528                 /* If isolation recently failed, do not retry */
    529                 pageblock_nr = low_pfn >> pageblock_order;
    530                 if (last_pageblock_nr != pageblock_nr) {
    531                         int mt;
    532 
    533                         last_pageblock_nr = pageblock_nr;
    534                         if (!isolation_suitable(cc, page))
    535                                 goto next_pageblock;
    536 
    537                         /*
    538                          * For async migration, also only scan in MOVABLE
    539                          * blocks. Async migration is optimistic to see if
    540                          * the minimum amount of work satisfies the allocation
    541                          */
    542                         mt = get_pageblock_migratetype(page);
    543                         if (!cc->sync && !migrate_async_suitable(mt)) {
    544                                 cc->finished_update_migrate = true;
    545                                 skipped_async_unsuitable = true;
    546                                 goto next_pageblock;
    547                         }
    548                 }
    549 
    550                 /*
    551                  * Skip if free. page_order cannot be used without zone->lock
    552                  * as nothing prevents parallel allocations or buddy merging.
    553                  */
    554                 if (PageBuddy(page))
    555                         continue;
    556 
    557                 /*
    558                  * Check may be lockless but that's ok as we recheck later.
    559                  * It's possible to migrate LRU pages and balloon pages
    560                  * Skip any other type of page
    561                  */
    562                 if (!PageLRU(page)) {
    563                         if (unlikely(balloon_page_movable(page))) {
    564                                 if (locked && balloon_page_isolate(page)) {
    565                                         /* Successfully isolated */
    566                                         goto isolate_success;
    567                                 }
    568                         }
    569                         continue;
    570                 }
    571 
    572                 /*
    573                  * PageLRU is set. lru_lock normally excludes isolation
    574                  * splitting and collapsing (collapsing has already happened
    575                  * if PageLRU is set) but the lock is not necessarily taken
    576                  * here and it is wasteful to take it just to check transhuge.
    577                  * Check TransHuge without lock and skip the whole pageblock if
    578                  * it's either a transhuge or hugetlbfs page, as calling
    579                  * compound_order() without preventing THP from splitting the
    580                  * page underneath us may return surprising results.
    581                  */
    582                 if (PageTransHuge(page)) {
    583                         if (!locked)
    584                                 goto next_pageblock;
    585                         low_pfn += (1 << compound_order(page)) - 1;
    586                         continue;
    587                 }
    588 
    589                 /*
    590                  * Migration will fail if an anonymous page is pinned in memory,
    591                  * so avoid taking lru_lock and isolating it unnecessarily in an
    592                  * admittedly racy check.
    593                  */
    594                 if (!page_mapping(page) &&
    595                     page_count(page) > page_mapcount(page))
    596                         continue;
    597 
    598                 /* Check if it is ok to still hold the lock */
    599                 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
    600                                                                 locked, cc);
    601                 if (!locked || fatal_signal_pending(current))
    602                         break;
    603 
    604                 /* Recheck PageLRU and PageTransHuge under lock */
    605                 if (!PageLRU(page))
    606                         continue;
    607                 if (PageTransHuge(page)) {
    608                         low_pfn += (1 << compound_order(page)) - 1;
    609                         continue;
    610                 }
    611 
    612                 lruvec = mem_cgroup_page_lruvec(page, zone);
    613 
    614                 /* Try isolate the page */
    615                 if (__isolate_lru_page(page, mode) != 0)
    616                         continue;
    617 
    618                 VM_BUG_ON_PAGE(PageTransCompound(page), page);
    619 
    620                 /* Successfully isolated */
    621                 del_page_from_lru_list(page, lruvec, page_lru(page));
    622 
    623 isolate_success:
    624                 cc->finished_update_migrate = true;
    625                 list_add(&page->lru, migratelist);
    626                 cc->nr_migratepages++;
    627                 nr_isolated++;
    628 
    629                 /* Avoid isolating too much */
    630                 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
    631                         ++low_pfn;
    632                         break;
    633                 }
    634 
    635                 continue;
    636 
    637 next_pageblock:
    638                 low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
    639         }
    640 
    641         acct_isolated(zone, locked, cc);
    642 
    643         if (locked)
    644                 spin_unlock_irqrestore(&zone->lru_lock, flags);
    645 
    646         /*
    647          * Update the pageblock-skip information and cached scanner pfn,
    648          * if the whole pageblock was scanned without isolating any page.
    649          * This is not done when pageblock was skipped due to being unsuitable
    650          * for async compaction, so that eventual sync compaction can try.
    651          */
    652         if (low_pfn == end_pfn && !skipped_async_unsuitable)
    653                 update_pageblock_skip(cc, valid_page, nr_isolated, true);
    654 
    655         trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
    656 
    657         count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
    658         if (nr_isolated)
    659                 count_compact_events(COMPACTISOLATED, nr_isolated);
    660 
    661         return low_pfn;
    662 }
    
     72 static inline bool isolation_suitable(struct compact_control *cc,
     73                                         struct page *page)
     74 {
     75         if (cc->ignore_skip_hint)
     76                 return true;
     77 
     78         return !get_pageblock_skip(page);
     79 }
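
    A short simulation (made-up pageblock layout) of what the flag changes: with ignore_skip_hint the scanner visits every pageblock, without it the pageblocks whose skip bit is set are jumped over.

    #include <stdio.h>
    #include <stdbool.h>

    struct fake_cc { bool ignore_skip_hint; };

    /* Mirrors isolation_suitable() above, over an array of skip bits. */
    static bool suitable(const struct fake_cc *cc, bool skip_bit)
    {
        if (cc->ignore_skip_hint)
            return true;
        return !skip_bit;
    }

    static int count_scanned(const struct fake_cc *cc, const bool *skip, int n)
    {
        int scanned = 0;

        for (int i = 0; i < n; i++)
            if (suitable(cc, skip[i]))
                scanned++;
        return scanned;
    }

    int main(void)
    {
        bool skip[6] = { true, true, false, true, false, true };
        struct fake_cc normal = { .ignore_skip_hint = false };
        struct fake_cc manual = { .ignore_skip_hint = true };

        printf("normal compaction scans %d of 6 pageblocks\n",
               count_scanned(&normal, skip, 6));        /* 2 */
        printf("/proc/sys/vm/compact_memory scans %d of 6\n",
               count_scanned(&manual, skip, 6));        /* 6 */
        return 0;
    }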
    

    who else sets ignore_skip_hint
    The CMA allocator calls alloc_contig_range() to allocate contiguous pages. It also sets ignore_skip_hint of the compaction control to true, because CMA must not skip scanning any pageblock in the requested range.

    6247 int alloc_contig_range(unsigned long start, unsigned long end,
    6248                        unsigned migratetype)
    6249 {
    6250         unsigned long outer_start, outer_end;
    6251         int ret = 0, order;
    6252 
    6253         struct compact_control cc = {
    6254                 .nr_migratepages = 0,
    6255                 .order = -1,
    6256                 .zone = page_zone(pfn_to_page(start)),
    6257                 .sync = true,
    6258                 .ignore_skip_hint = true,
    6259         };
    

    conclusion
    ignore_skip_hint of the compaction control is set to true in two cases.

  • compaction through /proc/sys/vm/compact_memory
  • the CMA allocator calling alloc_contig_range()

    kernel: mm: compact_zone

    November 28, 2015

    This post discusses compact_zone().

    reference code base
    LA.BF64.1.1-06510-8x94.0 with Android 5.0.0_r2(LRX21M) and Linux kernel 3.10.49.

    reference kernel config

    # CONFIG_NUMA is not set
    CONFIG_ZONE_DMA=y
    # CONFIG_MEMCG is not set
    # CONFIG_TRANSPARENT_HUGEPAGE is not set
    CONFIG_MEMORY_ISOLATION=y
    CONFIG_CMA=y
    # CONFIG_ALLOC_BUFFERS_IN_4K_CHUNKS is not set
    CONFIG_COMPACTION=y
    CONFIG_MIGRATION=y
    

    compact_zone_order and compact_zone

  • If a compaction is for an order greater than zero and is not deferred, the allocation slowpath can enter try_to_compact_pages().
  • If the order is greater than zero and the allocation may perform fs and io operations, try_to_compact_pages() iterates over the zones in the zonelist and compacts each of them with compact_zone_order().
  • compact_zone_order() instantiates a compact_control and calls compact_zone() to compact the zone.
  • static unsigned long compact_zone_order(struct zone *zone,
    				 int order, gfp_t gfp_mask,
    				 bool sync, bool *contended)
    {
    	unsigned long ret;
    	struct compact_control cc = {
    		.nr_freepages = 0,
    		.nr_migratepages = 0,
    		.order = order,
    		.migratetype = allocflags_to_migratetype(gfp_mask),
    		.zone = zone,
    		.sync = sync,
    	};
    	INIT_LIST_HEAD(&cc.freepages);
    	INIT_LIST_HEAD(&cc.migratepages);
    
    	ret = compact_zone(zone, &cc);
    
    	VM_BUG_ON(!list_empty(&cc.freepages));
    	VM_BUG_ON(!list_empty(&cc.migratepages));
    
    	*contended = cc.contended;
    	return ret;
    }
    

    compact_zone
    compact_zone() calls compaction_suitable() to check whether compaction is needed. If order is -1, the compaction comes from /proc/sys/vm/compact_memory and compaction_suitable() returns COMPACT_CONTINUE to continue the compaction. If the low watermark is not satisfied, it returns COMPACT_SKIPPED to skip the compaction. Otherwise it checks the fragmentation index to decide whether compaction would help the allocation succeed.

    compaction_restarting() returns true when compaction cache data should be reset.

  • If order is 4 and (compact_considered, 1 << compact_defer_shift, compact_order_failed) = (64, 64, 4), then it returns true.
  • __reset_isolation_suitable() resets cached compaction data such as compact_cached_migrate_pfn, compact_cached_free_pfn, and compact_blockskip_flush. It also clears the skip flag of every pageblock.
  • Then compact_zone() initializes cc->migrate_pfn and cc->free_pfn. Compaction works by migrating pages at the bottom of a zone into free pages at the top of the zone. Initially, cc->migrate_pfn points at the bottom of the zone and cc->free_pfn points at the top.
    Below is a simplified idea of compaction (a toy simulation follows the list):

  • A while loop whose condition is that cc->migrate_pfn is less than cc->free_pfn
  • If the page at cc->migrate_pfn cannot be migrated, increase cc->migrate_pfn by 1.
  • If the page at cc->migrate_pfn can be migrated but the page at cc->free_pfn is not free, decrease cc->free_pfn by 1.
  • If the page at cc->migrate_pfn can be migrated and the page at cc->free_pfn is free, migrate the cc->migrate_pfn page into the cc->free_pfn page, then increase cc->migrate_pfn by 1 and decrease cc->free_pfn by 1.
  • The actual implementation migrates a batch of pages at a time. isolate_migratepages() isolates used pages starting at cc->migrate_pfn into cc->migratepages and advances cc->migrate_pfn. compact_zone() then calls migrate_pages(), whose compaction_alloc() callback isolates free pages backwards from cc->free_pfn into cc->freepages and decreases cc->free_pfn. If both isolations succeed, the pages in cc->migratepages are migrated into cc->freepages. Finally, pages that were not migrated are put back on their LRU lists and unused free pages are released.
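
    The simplified idea above can be played out in a short toy simulation; nothing here is kernel code. 'M' marks a page that can be migrated, 'F' a free page, '.' a page that is neither, and the two scanners walk toward each other until they meet.

    #include <stdio.h>
    #include <string.h>

    /*
     * Toy model of compact_zone()'s two scanners: migrate_pfn walks up from
     * the bottom of the zone, free_pfn walks down from the top, and movable
     * pages are "migrated" into free slots until the scanners meet.
     */
    static void toy_compact(char *zone)
    {
        int migrate_pfn = 0;
        int free_pfn = (int)strlen(zone) - 1;

        while (migrate_pfn < free_pfn) {
            if (zone[migrate_pfn] != 'M') {
                migrate_pfn++;                  /* nothing to migrate here */
            } else if (zone[free_pfn] != 'F') {
                free_pfn--;                     /* no free target here */
            } else {
                zone[free_pfn] = 'M';           /* migrate the page ... */
                zone[migrate_pfn] = 'F';        /* ... and free the old slot */
                migrate_pfn++;
                free_pfn--;
            }
        }
    }

    int main(void)
    {
        char zone[] = "M.FM.F.MFF";

        printf("before: %s\n", zone);
        toy_compact(zone);
        printf("after:  %s\n", zone);   /* movable pages end up at the top */
        return 0;
    }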

    The loop is controlled by compact_finished(). The loop continues if it returns COMPACT_CONTINUE.

  • if cc->migrate_pfn >= cc->free_pfn (the two scanners have met), then it returns COMPACT_COMPLETE.
  • if cc->order == -1, then it returns COMPACT_CONTINUE
  • if low watermark is not met, then it returns COMPACT_CONTINUE.
  • if page allocation could succeed, then it returns COMPACT_PARTIAL
  • otherwise, it returns COMPACT_CONTINUE.
  •  
    static int compact_zone(struct zone *zone, struct compact_control *cc)
    {
    	int ret;
    	unsigned long start_pfn = zone->zone_start_pfn;
    	unsigned long end_pfn = zone_end_pfn(zone);
    
    	ret = compaction_suitable(zone, cc->order);
    	switch (ret) {
    	case COMPACT_PARTIAL:
    	case COMPACT_SKIPPED:
    		/* Compaction is likely to fail */
    		return ret;
    	case COMPACT_CONTINUE:
    		/* Fall through to compaction */
    		;
    	}
    
    	/*
    	 * Clear pageblock skip if there were failures recently and compaction
    	 * is about to be retried after being deferred. kswapd does not do
    	 * this reset as it'll reset the cached information when going to sleep.
    	 */
    	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
    		__reset_isolation_suitable(zone);
    
    	/*
    	 * Setup to move all movable pages to the end of the zone. Used cached
    	 * information on where the scanners should start but check that it
    	 * is initialised by ensuring the values are within zone boundaries.
    	 */
    	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
    	cc->free_pfn = zone->compact_cached_free_pfn;
    	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
    		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
    		zone->compact_cached_free_pfn = cc->free_pfn;
    	}
    	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
    		cc->migrate_pfn = start_pfn;
    		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
    	}
    
    	migrate_prep_local();
    
    	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
    		unsigned long nr_migrate, nr_remaining;
    		int err;
    
    		switch (isolate_migratepages(zone, cc)) {
    		case ISOLATE_ABORT:
    			ret = COMPACT_PARTIAL;
    			putback_movable_pages(&cc->migratepages);
    			cc->nr_migratepages = 0;
    			goto out;
    		case ISOLATE_NONE:
    			continue;
    		case ISOLATE_SUCCESS:
    			;
    		}
    
    		nr_migrate = cc->nr_migratepages;
    		err = migrate_pages(&cc->migratepages, compaction_alloc,
    				(unsigned long)cc,
    				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
    				MR_COMPACTION);
    		update_nr_listpages(cc);
    		nr_remaining = cc->nr_migratepages;
    
    		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
    						nr_remaining);
    
    		/* Release isolated pages not migrated */
    		if (err) {
    			putback_movable_pages(&cc->migratepages);
    			cc->nr_migratepages = 0;
    			/*
    			 * migrate_pages() may return -ENOMEM when scanners meet
    			 * and we want compact_finished() to detect it
    			 */
    			if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
    				ret = COMPACT_PARTIAL;
    				goto out;
    			}
    		}
    	}
    
    out:
    	/* Release free pages and check accounting */
    	cc->nr_freepages -= release_freepages(&cc->freepages);
    	VM_BUG_ON(cc->nr_freepages != 0);
    
    	return ret;
    }
    

    conclusion
    This post discusses compact_zone(). It explains its mechanism and implementation.

