[系统运维] linux控制组: cpuset解析

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> 系统运维 -> linux控制组: cpuset解析 -> 正文阅读

[系统运维]linux控制组: cpuset解析

? cpuset提供了一种机制，用于将一组CPU和内存节点分配给一组任务。在这里，“内存节点”是指包含内存的在线节点。

? CPU集将任务的CPU和内存放置限制为仅任务当前cpuset中的资源。它们形成虚拟文件系统中可见的嵌套层次结构。这些是管理大型系统上动态作业放置所需的基本钩子，超出了已经存在的钩子。

? cpuset使用控制组中描述的通用cgroup子系统。

? 任务通过sched_setaffinity系统调用把CPU加入其CPU亲和性掩码的请求，以及通过mbind和set_mempolicy系统调用把内存节点加入其内存策略的请求，都会经过该任务所在cpuset的过滤，不在该cpuset中的CPU或内存节点会被滤除。调度程序不会把任务调度到其cpus_allowed向量不允许的CPU上，内核页面分配器也不会在请求任务的mems_allowed向量不允许的节点上分配页面。

? 用户级代码可以在cgroup虚拟文件系统中按名称创建和销毁cpuset，管理这些cpuset的属性和权限以及分配给每个cpuset的CPU和内存节点，指定和查询任务分配给哪个cpuset，并列出分配给cpuset的任务pid。

? cgroup在这里作为一组内存区域空间，通过obj_cgroup数组与它建立存储内存的访问关系。如通过alloc_percpu分配内存，得到的是obj_cgroup对象所对应的块：分配时先在已有的pcpu块中查找大小合适、并且按指定值对齐的空闲区域(只有请求预留分配时，才会从内核静态percpu区域的预留块中查找)，如果没有找到，再创建新的pcpu块区域，从顶部分配虚拟区间。

? cpuset支持CPU/内存的热插拔事件(注册通知链)，它通过cpuset_track_online_nodes_nb函数检测cpu集的跟踪节点在线状态，当mems_allowed跟踪node_states[N_MEMORY]发生变化时，调度工作队列cpuset_hotplug_work，处理cpuset的CPU/内存热插拔等相关的变化。

内容

1. 函数分析

1.1 cpuset_init

? 初始化percpu读写信号量、分配顶部cpu集的部分参数

BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); // 定义、初始化percpu读写信号量
        
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); // 分配顶部cpu集的cpus_allowed指针，(可以在哪些CPU上调度，用户配置的CPU和内存节点允许执行任务）
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); // 分配effective_cpus指针，(CPU 集中的CPU列表副本，有效的CPU和内存节点允许执行任务)
BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); // 分配subparts_cpus指针，(分配给子分区的CPU）

cpumask_setall(top_cpuset.cpus_allowed); // 设置(填充)cpumask中的所有cpu（<nr_cpu_id）
nodes_setall(top_cpuset.mems_allowed); // 设置(填充)nodemask的所有内存节点
cpumask_setall(top_cpuset.effective_cpus);
nodes_setall(top_cpuset.effective_mems);

fmeter_init(&top_cpuset.fmeter); // 初始化内存压力过滤器，频率计
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); // 设置顶部cpu集的标志
top_cpuset.relax_domain_level = -1; // relax_domain_level 迁移任务时的搜索范围，-1 没有要求

BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); // 分配cpus_attach指针

return 0;
}

DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
percpu_init_rwsem
top_cpuset

1.2 cpuset_init_smp

void __init cpuset_init_smp(void)
{
        /*
         * cpus_allowed/mems_allowed在初始的cpuset_bind()调用中被设置为v2值，
         * 当挂载v1 cpuset时，另一次cpuset_bind()调用会把它们重置为v1值
         */
        top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; // 记录nodemask的所有内存节点

        cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); // 拷贝cpu可用于迁移时的掩码
        top_cpuset.effective_mems = node_states[N_MEMORY]; // 拷贝cpu的内存节点列表

		register_hotmemory_notifier(&cpuset_track_online_nodes_nb); // 设置cpu集跟踪节点在线状态
		// 根据通知块的优先级插入到通知块链表对应的位置，如果申请的是唯一优先级，不能存在相同优先级的节点

cpuset_track_online_nodes_nb
register_hotmemory_notifier

2. 源码结构

? cpuset_rwsem 一种新的读写信号量设计，针对读取锁定进行了优化

static struct percpu_rw_semaphore cpuset_rwsem = {                           
        .rss = __RCU_SYNC_INITIALIZER(cpuset_rwsem,.rss),                        
        .read_count = &__percpu_rwsem_rc_cpuset_rwsem,                        
        .writer = __RCUWAIT_INITIALIZER(cpuset_rwsem.writer),                   
        .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(cpuset_rwsem.waiters),         
        .block = ATOMIC_INIT(0),                                       
        __PERCPU_RWSEM_DEP_MAP_INIT(cpuset_rwsem)                             
};

? cpuset_track_online_nodes_nb cpu集跟踪节点在线状态

/*
 * Notifier block whose callback is cpuset_track_online_nodes().
 * It is registered with priority 10.
 */
static struct notifier_block cpuset_track_online_nodes_nb = {
        .notifier_call = cpuset_track_online_nodes,
        .priority = 10,         /* ??! */
};

cpuset_track_online_nodes

? __RWSEM_INITIALIZER 块通知链表头，展开形式

#define __RWSEM_INITIALIZER(memory_chain)	
||
\/
struct blocking_notifier_head memory_chain = { 
		.count =  { (0) },
	  	.owner =  { (0) },		
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
	  	.osq = { (0) },		
#endif	
	  	.wait_lock = {	
					.raw_lock = 1,
#ifdef CONFIG_DEBUG_SPINLOCK
					.magic = 0xdead4ead,		
					.owner_cpu = -1,		
					.owner = ((void *)-1L),
#endif
					.dep_map = {					
							.name = "wait_lock",	
							.wait_type_inner = 2,	
					}
	  	},
	  	.wait_list = LIST_HEAD_INIT((memory_chain).wait_list),
#ifdef CONFIG_DEBUG_RWSEMS
		.magic = &memory_chain,
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
		.dep_map = {				
				.name = "memory_chain",		
				.wait_type_inner = LD_WAIT_SLEEP,
	},
#endif
}

3. 部分结构定义

? percpu_stats 块分配统计

/* Allocation statistics counters (allocations, chunks and size bounds). */
struct percpu_stats {
	u64 nr_alloc;		/* lifetime # of allocations */
	u64 nr_dealloc;		/* lifetime # of deallocations */
	u64 nr_cur_alloc;	/* current # of allocations */
	u64 nr_max_alloc;	/* max # of allocations */
	u32 nr_chunks;		/* current # of live chunks */
	u32 nr_max_chunks;	/* max # of live chunks */
	size_t min_alloc_size;	/* smallest allocation size */
	size_t max_alloc_size;	/* largest allocation size */
};

? top_cpuset 顶部cpu集

/*
 * The top-level cpuset. It is statically initialized with the CS_ONLINE,
 * CS_CPU_EXCLUSIVE and CS_MEM_EXCLUSIVE flag bits set and with its
 * partition state set to PRS_ROOT.
 */
static struct cpuset top_cpuset = {
        .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
                  (1 << CS_MEM_EXCLUSIVE)),
        .partition_root_state = PRS_ROOT,
};

4. 扩展函数/变量

? DEFINE_STATIC_PERCPU_RWSEM 定义新类型读写信号量结构对象cpuset_rwsem

DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); // 展开内容参考下方跳转
||
\/
__DEFINE_PERCPU_RWSEM(cpuset_rwsem, static)
||
\/
#define __DEFINE_PERCPU_RWSEM(name, is_static)                          \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);          \
is_static struct percpu_rw_semaphore name = {                           \
        .rss = __RCU_SYNC_INITIALIZER(name.rss),                        \
        .read_count = &__percpu_rwsem_rc_##name,                        \
        .writer = __RCUWAIT_INITIALIZER(name.writer),                   \
        .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),         \
        .block = ATOMIC_INIT(0),                                        \
        __PERCPU_RWSEM_DEP_MAP_INIT(name)                               \
}

cpuset_rwsem
DEFINE_PER_CPU

? DEFINE_PER_CPU

static unsigned int __percpu_rwsem_rc_cpuset_rwsem;  
// __percpu_rwsem_rc_cpuset_rwsem定义在.data..percpu 段中，属于cpu局部的内存空间
// 该变量在整个vmlinux的.data..percpu区里的位置，然后通过某个cpu的percpu内存块的起始地址，
// 就可以计算出该cpu对应的该变量的运行时内存地址

/*
 * linux内核在启动时，会先把vmlinux文件加载到内存中，然后根据cpu的个数，
 * 为每个cpu都分配一块用于存放percpu变量的内存区域，
 * 之后把vmlinux中的.data..percpu section里的内容，
 * 拷贝到各个cpu的percpu内存块的static区域里，
 * 最后将各percpu内存块的起始地址放到对应cpu的gs寄存器里
 *
 * 当我们在访问percpu变量时，只需要将gs寄存器里的地址，
 * 加上我们想要访问的percpu变量的地址，就能得到在该cpu上，该percpu变量真实的内存地址
 *  https://zhuanlan.zhihu.com/p/340985476
 */

static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);
||
\/
DEFINE_PER_CPU_SECTION(unsigned int, __percpu_rwsem_rc_cpuset_rwsem, "")
||
\/
__PCPU_ATTRS("") __typeof__(unsigned int) __percpu_rwsem_rc_cpuset_rwsem
||
\/
#define __PCPU_ATTRS("")                                               \
        __percpu __attribute__((section(PER_CPU_BASE_SECTION "")))     \
        PER_CPU_ATTRIBUTES   // 通用架构(包括x86系列、arm系列等)为空
||
\/
# define __percpu       __attribute__((noderef, address_space(__percpu))) // __percpu在sparse中定义为#define __percpu __attribute__((address_space(3))) ，表示指针不能被解引用(*ptr访问)，3表示cpu局部的内存空间

#define PER_CPU_BASE_SECTION ".data..percpu"

? percpu_init_rwsem 定义、初始化percpu读写信号量

/*
 * percpu_init_rwsem - initialize a percpu rw-semaphore at run time
 * @sem: pointer to the semaphore to initialize
 *
 * Each expansion site gets its own static lock_class_key, and the
 * stringified argument (#sem) is passed as the name.  The statement
 * expression evaluates to the value returned by __percpu_init_rwsem().
 */
#define percpu_init_rwsem(sem)                                  \
({                                                              \
        static struct lock_class_key rwsem_key;                 \
        __percpu_init_rwsem(sem, #sem, &rwsem_key);             \
})
||
\/
 static struct lock_class_key rwsem_key;
  __percpu_init_rwsem(&cpuset_rwsem, "&cpuset_rwsem", &rwsem_key);
  ||
  \/
  int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
                        const char *name, struct lock_class_key *key)
{
        sem->read_count = alloc_percpu(int); // 分配pcpu块区域，从顶部分配虚拟区间， 记录到cgroup数组中，更新cgroup内存统计信息
        if (unlikely(!sem->read_count))
                return -ENOMEM;

alloc_percpu

		rcu_sync_init(&sem->rss); // 初始化rcu_sync结构，及gp_wait等待队列
        rcuwait_init(&sem->writer); // w->task = NULL;
        init_waitqueue_head(&sem->waiters); // 初始化写者等待队列
        atomic_set(&sem->block, 0); 
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        debug_check_no_locks_freed((void *)sem, sizeof(*sem));
        lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
        return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);

? alloc_percpu 分配pcpu块区域，从顶部分配虚拟区间，记录到cgroup数组中，更新cgroup内存统计信息

#define alloc_percpu(type)                                              \
        (typeof(type) __percpu *)__alloc_percpu(sizeof(type),           \
                                                __alignof__(type))
||
\/
(int __percpu *)__alloc_percpu(sizeof(int),  __alignof__(int))
||
\/
/*
 * __alloc_percpu - allocate a dynamic percpu area
 * @size: size of the area to allocate
 * @align: alignment of the area
 *
 * Forwards to pcpu_alloc() with reserved == false and GFP_KERNEL and
 * returns whatever pcpu_alloc() returns.
 */
void __percpu *__alloc_percpu(size_t size, size_t align)  // allocate a dynamic percpu area
{
        return pcpu_alloc(size, align, false, GFP_KERNEL);
}       
EXPORT_SYMBOL_GPL(__alloc_percpu);
||
\/
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
                                 gfp_t gfp)
{
        gfp_t pcpu_gfp;
        bool is_atomic;
        bool do_warn;
        struct obj_cgroup *objcg = NULL;
        static int warn_limit = 10;
        struct pcpu_chunk *chunk, *next;
        const char *err;
        int slot, off, cpu, ret;
        unsigned long flags;
        void __percpu *ptr;
        size_t bits, bit_align;

        gfp = current_gfp_context(gfp);
        // 将每个任务的gfp上下文应用到给定的分配标志
        // 指定PF_MEMALLOC_NOIO，将移除__GFP_IO 和 __GFP_FS标志
        // 指定PF_MEMALLOC_NOFS，将移除__GFP_FS标志
		// 指定PF_MEMALLOC_PIN，将移除__GFP_MOVABLE标志

		/* 可传递给后台分配器的白名单标志 */
        pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
        is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
        do_warn = !(gfp & __GFP_NOWARN);

		/*
		 * 现在的最小分配大小为PCPU_MIN_ALLOC_SIZE，因此对齐值至少也要有这么多字节
		 * 由于向上取整，一次分配可能产生最多PCPU_MIN_ALLOC_SIZE - 1字节的内部碎片
		 */
		 if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
                align = PCPU_MIN_ALLOC_SIZE;

        size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); // 按PCPU_MIN_ALLOC_SIZE对齐分配，如分配1字节 对齐为4字节，不足4字节将分配4字节，如果为5，将分配8字节
        bits = size >> PCPU_MIN_ALLOC_SHIFT; 
        bit_align = align >> PCPU_MIN_ALLOC_SHIFT; 

		if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
                     !is_power_of_2(align))) { // 最大分配不超过32K，对齐小于页大小，并且为2的幂次数(如二进制10, 100, 1000， n & (n -1) == 0)
                WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
                     size, align); 
                return NULL;
        }

		if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg))) // 从每cpu中的mem_cgroup中获取指定字节(内存)的对象
                return NULL;

pcpu_memcg_pre_alloc_hook

if (!is_atomic) {
                /*
                 * pcpu_balance_workfn()在这个互斥锁下分配内存，它可能会等待内存回收
                 * 允许当前任务成为OOM受害者，以防内存压力
                 */
                if (gfp & __GFP_NOFAIL) { // 内存分配不会失败的标志(堵塞等待，它一直到分配成功为止)
                        mutex_lock(&pcpu_alloc_mutex);
                } else if (mutex_lock_killable(&pcpu_alloc_mutex)) { // 获取互斥锁，可被致命信号中断
                        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
                        return NULL;
                }
        }

		...
		/* 如果可用，提供来自预留块的预留分配 */
        if (reserved && pcpu_reserved_chunk) { // 内核静态percpu区域的预留块
                chunk = pcpu_reserved_chunk;

                off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); // 查找要开始搜索的块索引
                if (off < 0) {
                        err = "alloc from reserved chunk failed";
                        goto fail_unlock;
                }

                off = pcpu_alloc_area(chunk, bits, bit_align, off); // pcpu_chunk分配区域，此函数从指定偏移量开始搜索，找到(具有)指定大小的对齐区域
                if (off >= 0)
                        goto area_found;

                err = "alloc from reserved chunk failed";
                goto fail_unlock;
        }

restart:
		pcpu块链表中搜索区域

		...
		/*
		 * 没有剩余空间，创建新区块
		 * 我们不希望多个任务同时创建块
		 * 仅当获取互斥锁之后仍然没有空块时，才串行化地创建新块
		 */
		if (is_atomic) {
                err = "atomic alloc failed, no space left";
                goto fail;
        }

        if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
                chunk = pcpu_create_chunk(pcpu_gfp); // 分配pcpu块区域，从顶部分配虚拟区间
                if (!chunk) {
                        err = "failed to allocate new chunk";
                        goto fail;
                }

                spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_chunk_relocate(chunk, -1); // 将新建的块放入与其空闲空间对应的插槽链表(-1表示它此前不在任何插槽中)
        } else {
                spin_lock_irqsave(&pcpu_lock, flags);
        }

        goto restart;

area_found:

pcpu_create_chunk

pcpu_stats_area_alloc(chunk, size); // 加入块统计

percpu_stats

/* 如果不是所有页面都已存在，则填充 */
 if (!is_atomic) {
                unsigned int page_end, rs, re;

                rs = PFN_DOWN(off); // 返回对应的物理页号 (off >> PAGE_SHIFT)
                page_end = PFN_UP(off + size); // 返回对应+1的物理页号((off + size) 不能整除PAGE_SHIFT的情况)，如(off + size)余数不足一页或超出一页，按一页补齐，可以整除则返回对应的物理页号

                for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) {
                        WARN_ON(chunk->immutable);

                        ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); // 填充和映射pcpu_chunk的一个区域

                        spin_lock_irqsave(&pcpu_lock, flags); // 获取自旋锁，禁止抢占
                        if (ret) {
                                pcpu_free_area(chunk, off);
                                err = "failed to populate";
                                goto fail_unlock;
                        }
                        pcpu_chunk_populated(chunk, rs, re); // 填充后记账
                        spin_unlock_irqrestore(&pcpu_lock, flags); // 归还自旋锁
                }
	mutex_unlock(&pcpu_alloc_mutex);
}

	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
                pcpu_schedule_balance_work(); // 调度工作队列 pcpu_balance_work， 管理空闲块和已填充页面的数量

	/* 清除区域并返回相对于基址的地址 */
	for_each_possible_cpu(cpu)
                memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

	/* 默认addr<->pcpu_ptr映射 */
	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
        kmemleak_alloc_percpu(ptr, size, gfp); // 注册新分配的__percpu对象

        trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
                                  chunk->base_addr, off, ptr,
                                  pcpu_obj_full_size(size), gfp); // 增加trace调试输出

		pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); // 记录到cgroup数组中，更新cgroup内存统计信息

        return ptr;
        ...
}

? pcpu_memcg_pre_alloc_hook 从每cpu中的mem_cgroup中获取指定字节(内存)的对象

/*
 * pcpu_memcg_pre_alloc_hook - charge a percpu allocation before it is made
 * @size: size of the requested allocation
 * @gfp: allocation flags; accounting is only done when __GFP_ACCOUNT is set
 * @objcgp: out parameter, set to the charged obj_cgroup on success;
 *          left untouched when no accounting takes place
 *
 * Returns true when the allocation may proceed (either nothing had to be
 * charged or the charge succeeded) and false when the charge failed.
 */
static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
                                      struct obj_cgroup **objcgp)
{
        struct obj_cgroup *objcg;

        /* Nothing to account: kmem accounting is off or was not requested. */
        if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
                return true;

        objcg = get_obj_cgroup_from_current();
        /*
         * The current task may have no obj_cgroup to charge; in that case
         * the allocation is simply not accounted.  Without this check a
         * NULL pointer would be handed to obj_cgroup_charge().
         */
        if (!objcg)
                return true;

        /* Charge the full per-object size; drop our reference on failure. */
        if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) {
                obj_cgroup_put(objcg);
                return false;
        }

        *objcgp = objcg;
        return true;
}

memcg_kmem_enabled

? memcg_kmem_enabled 根据静态分支预测对象memcg_kmem_enabled_key，生成跳转路径，如memcg_kmem_enabled_key为默认值的情况下，直接跳到下面执行代码(跳过true分支)，而这些通过加载时已经确定执行路径(几乎相当于没有进行判断)，用于减少判断带来的开销及提高准确性

/*
 * Report whether the memcg_kmem_enabled_key static key is enabled.
 * NOTE(review): per the surrounding article the branch direction is
 * patched when the key is toggled with static_key_enable()/
 * static_key_disable(), rather than tested on every call - confirm
 * against the jump-label documentation.
 */
static inline bool memcg_kmem_enabled(void)
{
        return static_branch_likely(&memcg_kmem_enabled_key); // static-key branch on memcg_kmem_enabled_key
}  
||
\/
_________________________________________________________________________________
/*
 * 许多对缓存分配函数的调用都期望由编译器内联
 * 由于对memcg_slab_pre_alloc_hook()的调用取决于这个静态分支，
 * 所以我们必须让执行kmem_cache_alloc等操作的模块也能看到这个符号
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); //  定义静态分支预测 memcg_kmem_enabled_key，主要可作为动态修改预测值，FALSE (enabled)定义默认为0，执行false分支(预测结果)，而TRUE定义默认为1
// static_key_enable函数可以用于修改enabled值为1，执行true分支(预测结果)
// static_key_disable函数可以用于修改enabled值为0，执行false分支(预测结果)

EXPORT_SYMBOL(memcg_kmem_enabled_key); // 导出全局符号，使内核模块也可以引用它
// 如果这个符号所在的文件被编译成了.o并链接入了vmlinux，则这个符号在内核范围内都可以使用
// 通过 extern ...，外部声明这个符号即可使用(在需使用的文件中extern定义，符号可以是函数或变量)

#define DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key)   \ 
        struct static_key_false memcg_kmem_enabled_key = 
        		(struct static_key_false) { .key =  { .enabled = { 0 },                                     \
          											{ .type = 0UL } } , }
_________________________________________________________________________________

? pcpu_create_chunk 分配pcpu块区域，从顶部分配虚拟区间

/*
 * pcpu_create_chunk - create a new percpu chunk backed by vm areas
 * @gfp: allocation flags passed to pcpu_alloc_chunk()
 *
 * Allocates the chunk descriptor, then obtains the vm areas for it.
 * Returns the new chunk, or NULL if either step fails.
 */
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
        struct pcpu_chunk *chunk;
        struct vm_struct **vms;

        chunk = pcpu_alloc_chunk(gfp);
        if (!chunk)
                return NULL;

        /*
         * The percpu allocator wants congruent vm areas so that the
         * offsets between percpu areas stay the same.  To avoid
         * interacting with regular vmalloc, the areas are taken from
         * the top of the address range.
         */
        vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
                                pcpu_nr_groups, pcpu_atom_size);
        if (!vms) {
                /* Do not dereference a failed lookup; release the chunk. */
                pcpu_free_chunk(chunk);
                return NULL;
        }

        chunk->data = vms;
        chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];

        pcpu_stats_chunk_alloc();                       /* update chunk statistics */
        trace_percpu_create_chunk(chunk->base_addr);    /* emit trace event */

        return chunk;
}

? register_hotmemory_notifier 根据通知块的优先级插入到通知块链表对应的位置，如果申请的是唯一优先级，不能存在相同优先级的节点

#define register_hotmemory_notifier(nb)         register_memory_notifier(nb)
||
\/
/*
 * register_memory_notifier - add @nb to the memory_chain notifier chain
 * @nb: notifier block to register
 *
 * Returns the result of blocking_notifier_chain_register().
 */
int register_memory_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);
||
\/
/*
 * blocking_notifier_chain_register - add a notifier to a blocking chain
 * @nh: head of the blocking notifier chain
 * @n: notifier block to add
 *
 * Registers @n without requiring a unique priority (unique_priority is
 * passed as false) and returns the result of the underlying helper.
 */
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
        return __blocking_notifier_chain_register(nh, n, false);
}       
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
||
\/
/*
 * Insert @n into the blocking notifier chain headed by @nh.
 * @nh: head of the blocking notifier chain
 * @n: notifier block to insert
 * @unique_priority: forwarded unchanged to notifier_chain_register()
 *
 * While the system is still booting the chain is modified directly;
 * otherwise the insertion is done with nh->rwsem held for writing.
 * Returns the result of notifier_chain_register().
 */
static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                                              struct notifier_block *n,
                                              bool unique_priority)
{
        int ret;

        /*
         * This code gets used during boot-up, when task switching is
         * not yet working and interrupts must remain disabled.  At
         * such times we must not call down_write().
         */
        if (unlikely(system_state == SYSTEM_BOOTING))
                return notifier_chain_register(&nh->head, n, unique_priority); // lock-free insertion during boot

        down_write(&nh->rwsem);
        ret = notifier_chain_register(&nh->head, n, unique_priority);
        up_write(&nh->rwsem);
        return ret;
}

memory_chain
notifier_chain_register

? memory_chain 块通知链头定义

#define BLOCKING_NOTIFIER_HEAD(memory_chain)				\
	struct blocking_notifier_head memory_chain =			\
		BLOCKING_NOTIFIER_INIT(memory_chain)
||
\/
#define BLOCKING_NOTIFIER_INIT(memory_chain) {				\
		.rwsem = __RWSEM_INITIALIZER((memory_chain).rwsem),	\
		.head = NULL }
||
\/
#define __RWSEM_INITIALIZER(memory_chain)				\
	{ __RWSEM_COUNT_INIT(memory_chain),				\
	  .owner = ATOMIC_LONG_INIT(0),				\
	  __RWSEM_OPT_INIT(memory_chain)				\
	  .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(memory_chain.wait_lock),\
	  .wait_list = LIST_HEAD_INIT((memory_chain).wait_list),	\
	  __RWSEM_DEBUG_INIT(memory_chain)				\
	  __RWSEM_DEP_MAP_INIT(memory_chain) }

__RWSEM_INITIALIZER

? notifier_chain_register 根据通知块的优先级插入到通知块链表对应的位置，如果申请的是唯一优先级，不能存在相同优先级的节点

/*
 * notifier_chain_register - insert a notifier block into a chain by priority
 * @nl: address of the chain's head pointer
 * @n: notifier block to insert
 * @unique_priority: when true, refuse to insert if a block with the same
 *                   priority is already on the chain
 *
 * The chain is walked until a block with a lower priority than @n is
 * found (or the end is reached) and @n is linked in at that point.
 *
 * Returns 0 on success, -EEXIST if @n is already on the chain, or
 * -EBUSY if @unique_priority is set and the priority is already taken.
 */
static int notifier_chain_register(struct notifier_block **nl,
                                   struct notifier_block *n,
                                   bool unique_priority)
{
        while ((*nl) != NULL) {
                if (unlikely((*nl) == n)) {
                        WARN(1, "notifier callback %ps already registered",
                             n->notifier_call);
                        return -EEXIST;
                }
                if (n->priority > (*nl)->priority) // new block outranks the current one: insert before it
                        break;
                if (n->priority == (*nl)->priority && unique_priority) // same priority while uniqueness was requested
                        return -EBUSY; // refuse the registration
                nl = &((*nl)->next); // advance to the next link of the chain
        }
        n->next = *nl; // link the rest of the chain behind the new block first
        rcu_assign_pointer(*nl, n); // then publish the new block through the RCU-protected pointer
        return 0;
}

? cpuset_track_online_nodes 通知链执行函数

/*
 * 保持top_cpuset
 * mems_allowed跟踪node_states[N_MEMORY]
 * 在node_states[N_MEMORY]更改后随时调用此例程
 * 有关CPU热插拔处理，请参见cpuset_update_active_cpus()
 * /
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	schedule_work(&cpuset_hotplug_work); // 调度工作队列cpuset_hotplug_work
	return NOTIFY_OK;
}
||
\/
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

cpuset_hotplug_workfn

? cpuset_hotplug_workfn 处理cpuset的CPU/内存热插拔

/*
 * cpuset_hotplug_workfn - handle CPU/memory hotplug for cpusets
 * @work: the work item being executed (not inspected)
 *
 * This function is called after either the CPU or the memory
 * configuration has changed and updates the cpusets accordingly.
 * top_cpuset is always kept synchronized to cpu_active_mask and
 * N_MEMORY, which is necessary to make cpusets transparent (of no
 * effect) on systems that actively use CPU hotplug but make no active
 * use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or
 * memory nodes have been taken down, cpuset_hotplug_update_tasks() is
 * invoked on all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */
static void cpuset_hotplug_workfn(struct work_struct *work)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated, mems_updated;
	bool on_dfl = is_in_v2_mode();
	struct tmpmasks tmp, *ptmp = NULL;

	/* Temporary masks are only set up in v2 mode and only if allocation succeeds. */
	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
		ptmp = &tmp;

	percpu_down_write(&cpuset_rwsem);

	/* Fetch the available cpus/mems and find out which changed how. */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

	/*
	 * If subparts_cpus is populated, the check below is likely to
	 * report cpus_updated even when the cpu list has not changed.
	 * That is extra work, but it is better to be safe.
	 */
	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

	/*
	 * In the rare case that hotplug removes all the cpus in
	 * subparts_cpus, we assume that the cpus are updated.
	 */
	if (!cpus_updated && top_cpuset.nr_subparts_cpus)
		cpus_updated = true;

	/* Synchronize cpus_allowed to cpu_active_mask. */
	if (cpus_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		/*
		 * Make sure that CPUs allocated to child partitions do not
		 * show up in effective_cpus.  If no CPU is left, we clear
		 * subparts_cpus and let the child partitions fight for the
		 * CPUs again.
		 */
		if (top_cpuset.nr_subparts_cpus) {
			if (cpumask_subset(&new_cpus,
					   top_cpuset.subparts_cpus)) {
				top_cpuset.nr_subparts_cpus = 0;
				cpumask_clear(top_cpuset.subparts_cpus);
			} else {
				cpumask_andnot(&new_cpus, &new_cpus,
					       top_cpuset.subparts_cpus);
			}
		}
		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
		spin_unlock_irq(&callback_lock);
		/* We don't mess with the cpumasks of tasks in top_cpuset. */
	}

	/* Synchronize mems_allowed to N_MEMORY. */
	if (mems_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			top_cpuset.mems_allowed = new_mems;
		top_cpuset.effective_mems = new_mems;
		spin_unlock_irq(&callback_lock);
		update_tasks_nodemask(&top_cpuset);
	}

	percpu_up_write(&cpuset_rwsem);

	/* If cpus or mems changed, we need to propagate to descendants. */
	if (cpus_updated || mems_updated) {
		struct cpuset *cs;
		struct cgroup_subsys_state *pos_css;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
				continue;
			/*
			 * The css reference taken above is held while the
			 * RCU read lock is dropped around the update call.
			 */
			rcu_read_unlock();

			cpuset_hotplug_update_tasks(cs, ptmp);

			rcu_read_lock();
			css_put(&cs->css);
		}
		rcu_read_unlock();
	}

	/* Rebuild sched domains if cpus_allowed has changed. */
	if (cpus_updated || force_rebuild) {
		force_rebuild = false;
		rebuild_sched_domains();
	}

	free_cpumasks(NULL, ptmp);
}