开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> 系统运维 -> linux内核那些事之内存规整(memory compact) -> 正文阅读

[系统运维]linux内核那些事之内存规整(memory compact)

内存规整

内存规整是Mel Gormal开发防止内存碎片anti-fragmen pach补丁的第二个部分Avoiding fragmentation with page clustering v27 [LWN.net]，主要用于解决当系统长时间运行之后，造成比较碎化内存时，通过内存规整将处于可以移动MOVE的类型内存，重新进行页迁移整合出较大连续物理内存:

内存规整机制原因比较简单：

利用类似快慢指针技巧，当需要对一个zone进行内存规整时，使用migrate_pfn和free_pfn两个遍历；
migrate_pfn 为从zone 头部开始进行扫描，依次扫描出已经被分配出去但是可以进行页迁移的页面
free_pfn：为从zone 尾部开始扫描，依次扫描出空闲page。
当每次扫描结束后，将migrate_pfn扫描出的可迁移页面依次迁移到free_pfn 空闲page中。
当free_pfn和migrate_pfn 两个遇见相等时说明内存规整完毕。
这样规整之后，zone前半部分可以需要迁移的页面被迁移到zone后半部分空闲page中，这样前半部分会空闲出大块连续物理内存，供下次申请内存使用。

内存规整技术是页迁移技术的一个比较重要的使用场景，帮助系统整理出连续物理内存。

触发时机

内存规整触发时机主要有以下三种：

通过/proc/sys/vm/compact_memory 由用户根据手动触发,如果是NUMA系统则还可以通过/sys/devices/system/node/node<id>/compact 触发
kcompatd线程类似与kswapd线程，内存水位不够时，会触发kcompactd线程进行异步内存规整。
慢速申请内存通道，说明内存压力过大，__alloc_pages_slowpath会通过__alloc_pages_direct_compact 进行同步内存规整。

compact_zone()

compact_zone()函数是实施内存规整的核心函数，不管是哪种触发方式最终都会通过compact_zone 规整指定的zone进行内存规整，

调用关系

按照compact_zone触发关系调用关系图如下：

上述四种触发方式，最终都是靠compact_zone实现页迁移功能

compact_zone源码


static enum compact_result
compact_zone(struct compact_control *cc, struct capture_control *capc)
{
	enum compact_result ret;
	unsigned long start_pfn = cc->zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(cc->zone);
	unsigned long last_migrated_pfn;
	const bool sync = cc->mode != MIGRATE_ASYNC;
	bool update_cached;

	/*
	 * These counters track activities during zone compaction.  Initialize
	 * them before compacting a new zone.
	 */
	cc->total_migrate_scanned = 0;
	cc->total_free_scanned = 0;
	cc->nr_migratepages = 0;
	cc->nr_freepages = 0;
	INIT_LIST_HEAD(&cc->freepages);
	INIT_LIST_HEAD(&cc->migratepages);

	cc->migratetype = gfp_migratetype(cc->gfp_mask);
	ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
							cc->highest_zoneidx);
	/* Compaction is likely to fail */
	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
		return ret;

	/* huh, compaction_suitable is returning something unexpected */
	VM_BUG_ON(ret != COMPACT_CONTINUE);

	/*
	 * Clear pageblock skip if there were failures recently and compaction
	 * is about to be retried after being deferred.
	 */
	if (compaction_restarting(cc->zone, cc->order))
		__reset_isolation_suitable(cc->zone);

	/*
	 * Setup to move all movable pages to the end of the zone. Used cached
	 * information on where the scanners should start (unless we explicitly
	 * want to compact the whole zone), but check that it is initialised
	 * by ensuring the values are within zone boundaries.
	 */
	cc->fast_start_pfn = 0;
	if (cc->whole_zone) {
		cc->migrate_pfn = start_pfn;
		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
	} else {
		cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
		cc->free_pfn = cc->zone->compact_cached_free_pfn;
		if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
			cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
			cc->zone->compact_cached_free_pfn = cc->free_pfn;
		}
		if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
			cc->migrate_pfn = start_pfn;
			cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
			cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
		}

		if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
			cc->whole_zone = true;
	}

	last_migrated_pfn = 0;

	/*
	 * Migrate has separate cached PFNs for ASYNC and SYNC* migration on
	 * the basis that some migrations will fail in ASYNC mode. However,
	 * if the cached PFNs match and pageblocks are skipped due to having
	 * no isolation candidates, then the sync state does not matter.
	 * Until a pageblock with isolation candidates is found, keep the
	 * cached PFNs in sync to avoid revisiting the same blocks.
	 */
	update_cached = !sync &&
		cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];

	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync);

	migrate_prep_local();

	while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
		int err;
		unsigned long start_pfn = cc->migrate_pfn;

		/*
		 * Avoid multiple rescans which can happen if a page cannot be
		 * isolated (dirty/writeback in async mode) or if the migrated
		 * pages are being allocated before the pageblock is cleared.
		 * The first rescan will capture the entire pageblock for
		 * migration. If it fails, it'll be marked skip and scanning
		 * will proceed as normal.
		 */
		cc->rescan = false;
		if (pageblock_start_pfn(last_migrated_pfn) ==
		    pageblock_start_pfn(start_pfn)) {
			cc->rescan = true;
		}

		switch (isolate_migratepages(cc)) {
		case ISOLATE_ABORT:
			ret = COMPACT_CONTENDED;
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			goto out;
		case ISOLATE_NONE:
			if (update_cached) {
				cc->zone->compact_cached_migrate_pfn[1] =
					cc->zone->compact_cached_migrate_pfn[0];
			}

			/*
			 * We haven't isolated and migrated anything, but
			 * there might still be unflushed migrations from
			 * previous cc->order aligned block.
			 */
			goto check_drain;
		case ISOLATE_SUCCESS:
			update_cached = false;
			last_migrated_pfn = start_pfn;
			;
		}

		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,
				MR_COMPACTION);

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);

		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
		if (err) {
			putback_movable_pages(&cc->migratepages);
			/*
			 * migrate_pages() may return -ENOMEM when scanners meet
			 * and we want compact_finished() to detect it
			 */
			if (err == -ENOMEM && !compact_scanners_met(cc)) {
				ret = COMPACT_CONTENDED;
				goto out;
			}
			/*
			 * We failed to migrate at least one page in the current
			 * order-aligned block, so skip the rest of it.
			 */
			if (cc->direct_compaction &&
						(cc->mode == MIGRATE_ASYNC)) {
				cc->migrate_pfn = block_end_pfn(
						cc->migrate_pfn - 1, cc->order);
				/* Draining pcplists is useless in this case */
				last_migrated_pfn = 0;
			}
		}

check_drain:
		/*
		 * Has the migration scanner moved away from the previous
		 * cc->order aligned block where we migrated from? If yes,
		 * flush the pages that were freed, so that they can merge and
		 * compact_finished() can detect immediately if allocation
		 * would succeed.
		 */
		if (cc->order > 0 && last_migrated_pfn) {
			unsigned long current_block_start =
				block_start_pfn(cc->migrate_pfn, cc->order);

			if (last_migrated_pfn < current_block_start) {
				lru_add_drain_cpu_zone(cc->zone);
				/* No more flushing until we migrate again */
				last_migrated_pfn = 0;
			}
		}

		/* Stop if a page has been captured */
		if (capc && capc->page) {
			ret = COMPACT_SUCCESS;
			break;
		}
	}

out:
	/*
	 * Release free pages and update where the free scanner should restart,
	 * so we don't leave any returned pages behind in the next attempt.
	 */
	if (cc->nr_freepages > 0) {
		unsigned long free_pfn = release_freepages(&cc->freepages);

		cc->nr_freepages = 0;
		VM_BUG_ON(free_pfn == 0);
		/* The cached pfn is always the first in a pageblock */
		free_pfn = pageblock_start_pfn(free_pfn);
		/*
		 * Only go back, not forward. The cached pfn might have been
		 * already reset to zone end in compact_finished()
		 */
		if (free_pfn > cc->zone->compact_cached_free_pfn)
			cc->zone->compact_cached_free_pfn = free_pfn;
	}

	count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
	count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);

	trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync, ret);

	return ret;
}

compact_zone流程

该函数整理处理思路相对比较清晰：

?按照指定的zone进行内存规整
对compact_control结构中的一些值进行初始化
compaction_suitable：根据实际内存水位情况判断是否有必要做内存规整，因为内存规整操作比较耗时，如果空闲内存处于较高情况，则没有必要触发内存规整，最终是通过__compaction_suitable实现对内存水位判断
compact_finished：用于处理当前zone是否扫描完毕，zone扫描时使用了处理上的技巧。内存规整时分别从zone头部扫描migrate_pfn和从zone尾部扫描free_pfn，当migrate_pfn与free_pfn相遇时，则认为扫描完毕。最终是调用__compact_finished函数
isolate_migratepages：将扫描出的要迁移的页进行孤立isolate，防止在迁移过程中，该页面被释放或者swap out等，也是做页迁移之前必须准备工作，可以详见《linux那些事之页迁移(page migratiom)》
migrate_pages:将孤立出来的页面进行页迁移。
当zone扫描内存规整完毕之后，需要free_pfn 等信息保存到zone中用于记录本次扫描位置，空闲页等信息，以便用于下一次内存规整时使用。
compact_result 为内存规整结果

??????????????关键几个函数

__compaction_suitable

__compaction_suitable用于判断当前zone是否可以做内存规整：


/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 */
static enum compact_result __compaction_suitable(struct zone *zone, int order,
					unsigned int alloc_flags,
					int highest_zoneidx,
					unsigned long wmark_target)
{
	unsigned long watermark;

	if (is_via_compact_memory(order))
		return COMPACT_CONTINUE;

	watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
	/*
	 * If watermarks for high-order allocation are already met, there
	 * should be no need for compaction at all.
	 */
	if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
								alloc_flags))
		return COMPACT_SUCCESS;

	/*
	 * Watermarks for order-0 must be met for compaction to be able to
	 * isolate free pages for migration targets. This means that the
	 * watermark and alloc_flags have to match, or be more pessimistic than
	 * the check in __isolate_free_page(). We don't use the direct
	 * compactor's alloc_flags, as they are not relevant for freepage
	 * isolation. We however do use the direct compactor's highest_zoneidx
	 * to skip over zones where lowmem reserves would prevent allocation
	 * even if compaction succeeds.
	 * For costly orders, we require low watermark instead of min for
	 * compaction to proceed to increase its chances.
	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
	 * suitable migration targets
	 */
	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
				low_wmark_pages(zone) : min_wmark_pages(zone);
	watermark += compact_gap(order);
	if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
						ALLOC_CMA, wmark_target))
		return COMPACT_SKIPPED;

	return COMPACT_CONTINUE;
}

主要有以下几种情况：

is_via_compact_memory：是否通过/proc/sys/vm/compact_memory 手动强制进行内存规整，如果强制内存规整，则直接返回COMPACT_CONTINUE，继续后续步骤进行内存规整
zone_watermark_ok：当作内存water mark满足order，则没有必要做内存规整，能够申请oder 内存成功，返回COMPACT_SUCCESS
当内存不满足时，则查看是否可以通过内存压缩满足内存申请，如果能够通过内存规整满足，则返回COMPACT_SUCCESS进行内存规整，如果判断出当前内存不能够通过内存压缩满足分配要求，则说明内存碎片化不严重，没有必要进行内存压缩发跳过此次内存规整操作，返回?COMPACT_SKIPPED。

compact_result

compact_result为此次内存规整之后的结果：


/* Return values for compact_zone() and try_to_compact_pages() */
/* When adding new states, please adjust include/trace/events/compaction.h */
enum compact_result {
	/* For more detailed tracepoint output - internal to compaction */
	COMPACT_NOT_SUITABLE_ZONE,
	/*
	 * compaction didn't start as it was not possible or direct reclaim
	 * was more suitable
	 */
	COMPACT_SKIPPED,
	/* compaction didn't start as it was deferred due to past failures */
	COMPACT_DEFERRED,

	/* compaction not active last round */
	COMPACT_INACTIVE = COMPACT_DEFERRED,

	/* For more detailed tracepoint output - internal to compaction */
	COMPACT_NO_SUITABLE_PAGE,
	/* compaction should continue to another pageblock */
	COMPACT_CONTINUE,

	/*
	 * The full zone was compacted scanned but wasn't successfull to compact
	 * suitable pages.
	 */
	COMPACT_COMPLETE,
	/*
	 * direct compaction has scanned part of the zone but wasn't successfull
	 * to compact suitable pages.
	 */
	COMPACT_PARTIAL_SKIPPED,

	/* compaction terminated prematurely due to lock contentions */
	COMPACT_CONTENDED,

	/*
	 * direct compaction terminated after concluding that the allocation
	 * should now succeed
	 */
	COMPACT_SUCCESS,
};

?COMPACT_NOT_SUITABLE_ZONE：该zone不适合做内存规整
COMPACT_SKIPPED：内存规整要求不满足，跳过该zone
COMPACT_DEFERRED:由于之前的一些错误导致内存规整退出
COMPACT_INACTIVE：上次内存规整未激活
COMPACT_NO_SUITABLE_PAGE：没有合适的物理页做内存规整
COMPACT_CONTINUE：下一个页面块pageblock继续做内存规整
COMPACT_COMPLETE：zone都以及做内存规整扫描完毕，但是没有合适页面做内存规整
COMPACT_PARTIAL_SKIPPED：直接内存规整已经扫描了部分zone页面，但是仍然没有合适页面做内存规整
COMPACT_CONTENDED：由于某些锁竞争导致内存规整退出
COMPACT_SUCCESS：当前zone内存规整满足页面分配要求，可以退出

__compact_finished

__compact_finished用于判断当前内存压缩扫描是否完成：


static enum compact_result __compact_finished(struct compact_control *cc)
{
	unsigned int order;
	const int migratetype = cc->migratetype;
	int ret;

	/* Compaction run completes if the migrate and free scanner meet */
	if (compact_scanners_met(cc)) {
		/* Let the next compaction start anew. */
		reset_cached_positions(cc->zone);

		/*
		 * Mark that the PG_migrate_skip information should be cleared
		 * by kswapd when it goes to sleep. kcompactd does not set the
		 * flag itself as the decision to be clear should be directly
		 * based on an allocation request.
		 */
		if (cc->direct_compaction)
			cc->zone->compact_blockskip_flush = true;

		if (cc->whole_zone)
			return COMPACT_COMPLETE;
		else
			return COMPACT_PARTIAL_SKIPPED;
	}

	if (is_via_compact_memory(cc->order))
		return COMPACT_CONTINUE;

	/*
	 * Always finish scanning a pageblock to reduce the possibility of
	 * fallbacks in the future. This is particularly important when
	 * migration source is unmovable/reclaimable but it's not worth
	 * special casing.
	 */
	if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
		return COMPACT_CONTINUE;

	/* Direct compactor: Is a suitable page free? */
	ret = COMPACT_NO_SUITABLE_PAGE;
	for (order = cc->order; order < MAX_ORDER; order++) {
		struct free_area *area = &cc->zone->free_area[order];
		bool can_steal;

		/* Job done if page is free of the right migratetype */
		if (!free_area_empty(area, migratetype))
			return COMPACT_SUCCESS;

#ifdef CONFIG_CMA
		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
		if (migratetype == MIGRATE_MOVABLE &&
			!free_area_empty(area, MIGRATE_CMA))
			return COMPACT_SUCCESS;
#endif
		/*
		 * Job done if allocation would steal freepages from
		 * other migratetype buddy lists.
		 */
		if (find_suitable_fallback(area, order, migratetype,
						true, &can_steal) != -1) {

			/* movable pages are OK in any pageblock */
			if (migratetype == MIGRATE_MOVABLE)
				return COMPACT_SUCCESS;

			/*
			 * We are stealing for a non-movable allocation. Make
			 * sure we finish compacting the current pageblock
			 * first so it is as free as possible and we won't
			 * have to steal another one soon. This only applies
			 * to sync compaction, as async compaction operates
			 * on pageblocks of the same migratetype.
			 */
			if (cc->mode == MIGRATE_ASYNC ||
					IS_ALIGNED(cc->migrate_pfn,
							pageblock_nr_pages)) {
				return COMPACT_SUCCESS;
			}

			ret = COMPACT_CONTINUE;
			break;
		}
	}

	if (cc->contended || fatal_signal_pending(current))
		ret = COMPACT_CONTENDED;

	return ret;
}

compact_scanners_met： cc->free_pfn和cc->migrate_pfn是否相遇，如果说明已经扫描完毕，返回true
compact_scanners_met 返回true，如果cc->whole_zone为true说明需要扫描整个zone，并且已经扫描完毕了，返回COMPACT_COMPLETE。如果ccf->whole_zone为false，说明不需要扫描整个zone，但是此时已经扫描完整个zone，说明扫描完整个zone还没有满足后续内存分配要求，返回COMPACT_PARTIAL_SKIPPED。
compact_scanners_met 返回false, is_via_compact_memory()为ture说明是/proc/sys/vm/compact_memory手动触发，需要继续扫描zone下一个pageblock，返回COMPACT_CONTINUE。
如果cc->migrate_pfn 不是pageblock对齐，则需要继续扫描
接下来一个比较长的循环出来，主要是需要判断后续扫描是否还有空闲page 用来做迁移目的页，如果有则可以继续做迁移，如果没有则返回COMPACT_NO_SUITABLE_PAGE

isolate_migratepages

isolate_migratepages函数处理比较长，整体处理思路就是继续扫描，将扫描出合适的物理页做迁移，并将需要做迁移的页孤立出来，加入到cc->migratepages中：

static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
{
	unsigned long block_start_pfn;
	unsigned long block_end_pfn;
	unsigned long low_pfn;
	struct page *page;
	const isolate_mode_t isolate_mode =
		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
		(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
	bool fast_find_block;

    //通过快速通道找到所要迁移扫描的起始pfn
	low_pfn = fast_find_migrateblock(cc);
    //计算出扫描结束page pfn，要求按照pageblock对齐
	block_start_pfn = pageblock_start_pfn(low_pfn);
	if (block_start_pfn < cc->zone->zone_start_pfn)
		block_start_pfn = cc->zone->zone_start_pfn;

    //快速找到的pfn是否成功
	fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;

	/* Only scan within a pageblock boundary */
	block_end_pfn = pageblock_end_pfn(low_pfn);


    //对low_pfn起始的pageblock内物理页面做扫描，将合适的做迁移的页面孤立出来
	for (; block_end_pfn <= cc->free_pfn;
			fast_find_block = false,
			low_pfn = block_end_pfn,
			block_start_pfn = block_end_pfn,
			block_end_pfn += pageblock_nr_pages) {

        //如果长时间循环，需要进行放权，给其他进程得到调度机会
		if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
			cond_resched();

        //检查是否在同一个pageblock内
		page = pageblock_pfn_to_page(block_start_pfn,
						block_end_pfn, cc->zone);
		if (!page)
			continue;
	
        //如果最近孤立页面失败，则不进行再次尝试，仅仅做检查
		if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
		    !fast_find_block && !isolation_suitable(cc, page))
			continue;

        //如果是异步规整，则仅规整move页面且非huge page
		if (!suitable_migration_source(cc, page)) {
			update_cached_migrate(cc, block_end_pfn);
			continue;
		}

		/* Perform the isolation */
        //将[low_pfn,block_end_pfn)中的符合内存规整要求的页面孤立出来，将孤立出来的页面加入到cc->migratepages中
		low_pfn = isolate_migratepages_block(cc, low_pfn,
						block_end_pfn, isolate_mode);

		if (!low_pfn)
			return ISOLATE_ABORT;

        //孤立页面成功或者失败都不在继续
		break;
	}

    //记录下次重新扫描的做页迁移的起始pfn
	cc->migrate_pfn = low_pfn;

	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}

migrate_pages

将上述合适做页迁移的页面，且已经孤立到cc->migratepages中页面做页迁移：

err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,MR_COMPACTION)

关于页迁移详细信息可以参考《linux那些事之页迁移(page migratiom)》

创作打卡挑战赛

赢取流量/现金/CSDN周边激励大奖

系统运维最新文章

配置小型公司网络WLAN基本业务（AC通过三层

如何用DWDM射频光纤技术实现200公里外的站点

国内顺畅下载k8s.gcr.io的镜像

自动化测试appium

ctfshow ssrf

Linux操作系统学习之实用指令（Centos7/8均

加:2022-05-10 12:16:32 更:2022-05-10 12:16:39

360图书馆购物三丰科技阅读网日历万年历 2025年7日历

-2025/7/2 19:41:00-

图片自动播放器
↓图片自动播放器↓

TxT小说阅读器
↓语音阅读,小说下载,古典文学↓

一键清除垃圾
↓轻轻一点,清除系统垃圾↓

图片批量下载器
↓批量下载图片,美女图库↓

网站联系: qq:121756557 email:121756557@qq.com IT数码