MMAP
mmap declaration
The mmap/munmap functions are declared as follows:
#include <sys/mman.h>
void *mmap(void *addr, size_t length, int prot, int flags,
           int fd, off_t offset);
int munmap(void *addr, size_t length);
- addr: if not NULL, the kernel tries to create the mapping at this address (a hint unless MAP_FIXED is set); otherwise the kernel picks a suitable virtual address itself.
- length: the size of the mapping in the process address space.
- prot: the read/write/execute protection of the memory region.
- flags: attributes of the mapping: shared, private, anonymous, file-backed, and so on.
- fd: for a file mapping, the descriptor of the opened file.
- offset: for a file mapping, the offset from the start of the file; the returned address corresponds to this offset.
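Before walking through the flag values, here is a minimal usage sketch (not from any source analyzed here; "example.txt" is a placeholder file name) that maps a file read-only and reads it through the mapping:
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

int main(void)
{
    int fd = open("example.txt", O_RDONLY);   /* placeholder path */
    struct stat st;
    if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
        return 1;
    /* offset 0: the returned address corresponds to the file head */
    char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (p == MAP_FAILED)
        return 1;
    fwrite(p, 1, st.st_size, stdout);         /* read through the mapping */
    munmap(p, st.st_size);
    close(fd);
    return 0;
}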
/* possible values of prot */
#define PROT_READ 0x1 /* page can be read */
#define PROT_WRITE 0x2 /* page can be written */
#define PROT_EXEC 0x4 /* page can be executed */
#define PROT_SEM 0x8 /* page may be used for atomic ops */
#define PROT_NONE 0x0 /* page can not be accessed */
#define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */
#define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */
/* possible values of flags */
#define MAP_SHARED 0x01 /* Share changes: create a shared mapping; several processes can map the same file, each sees the others' modifications, and changes are written back to the file on disk. */
#define MAP_PRIVATE 0x02 /* Changes are private: create a private copy-on-write mapping; other processes do not see the changes and nothing is written back to disk. */
#define MAP_TYPE 0x0f /* Mask for type of mapping */
#define MAP_FIXED 0x10 /* Interpret addr exactly: map at the given start address; if the region described by addr and len overlaps existing mappings, the overlapping parts are discarded. If the address cannot be used the call fails, and the start address must be page-aligned. */
#define MAP_ANONYMOUS 0x20 /* don't use a file: anonymous mapping, not backed by any file; fd should be -1. */
#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
# define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be uninitialized */
#else
# define MAP_UNINITIALIZED 0x0 /* Don't support this flag */
#endif
#define MAP_GROWSDOWN 0x0100 /* stack-like segment: tells the kernel VM system the region may grow downward. */
#define MAP_DENYWRITE 0x0800 /* ETXTBSY */
#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */
#define MAP_LOCKED 0x2000 /* pages are locked: lock the mapped pages so they cannot be swapped out. */
#define MAP_NORESERVE 0x4000 /* don't check for reservations */
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables: for a file mapping, read the file contents into the mapped region ahead of time; only supported for private mappings. */
#define MAP_NONBLOCK 0x10000 /* do not block on IO: only meaningful together with MAP_POPULATE; skip the read-ahead and only create page-table entries for pages already in memory. */
#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
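As a quick illustration of these flags, a sketch of an anonymous private mapping; with MAP_ANONYMOUS, fd must be -1 and the offset 0:
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096;                 /* assumes 4 KiB pages */
    unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return 1;
    memset(p, 0xAB, len);              /* the first write faults each page in */
    return munmap(p, len);
}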
The MMAP macro in glibc
To make this easier to follow, here is the MMAP code:
#define MMAP(addr, size, prot, flags) \
__mmap((addr), (size), (prot), (flags)|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0)
#define INTERNAL_SYSCALL_MAIN_6(name, err, arg1, arg2, arg3, \
arg4, arg5, arg6) \
struct libc_do_syscall_args _xv = \
{ \
(int) (0), \
(int) (-1), \
(int) (0) \
}; \
asm volatile ( \
"movl %1, %%eax\n\t" \
"call __libc_do_syscall" \
: "=a" (resultvar) \
: "i" (__NR_mmap2), "c" (size), "d" (PROT_READ | PROT_WRITE), "S" (MAP_ANONYMOUS|MAP_PRIVATE), "D" (&_xv) \
: "memory", "cc")
MMAP is defined in libc as a macro combining assembly and C code.
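To see what the macro amounts to, here is a sketch that invokes the raw system call directly; on x86-64 SYS_mmap takes a byte offset, whereas the 32-bit path shown above uses __NR_mmap2 with a page offset:
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

int main(void)
{
    /* same effect as MMAP(NULL, 4096, PROT_READ|PROT_WRITE, 0) above */
    void *p = (void *)syscall(SYS_mmap, NULL, (unsigned long)4096,
                              (unsigned long)(PROT_READ | PROT_WRITE),
                              (unsigned long)(MAP_PRIVATE | MAP_ANONYMOUS),
                              (long)-1, (unsigned long)0);
    printf("mapped at %p\n", p);
    return 0;
}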
System call interface
__libc_do_syscall enters the kernel's system call path (kernel version 5.15.0); arch/x86/entry/syscalls/syscall_64.tbl contains the entry:
9 common mmap sys_mmap
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off)
{
if (offset_in_page(off) != 0)
return -EINVAL;
return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}
Since mmap takes six arguments, its kernel entry point is SYSCALL_DEFINE6, found in arch/x86/kernel/sys_x86_64.c.
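The offset_in_page(off) check above is easy to observe from user space; a sketch (assuming /etc/hostname exists and is readable):
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    int fd = open("/etc/hostname", O_RDONLY);   /* any readable file works */
    if (fd < 0)
        return 1;
    /* offset 1 is not page-aligned, so the syscall fails before
       ksys_mmap_pgoff is ever reached */
    void *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 1);
    if (p == MAP_FAILED && errno == EINVAL)
        puts("unaligned offset rejected with EINVAL");
    return 0;
}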
ksys_mmap_pgoff
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
.....
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
.....
return retval;
}
ksys_mmap_pgoff comes next; with the parts that validate the page/file state omitted, the main work is done by vm_mmap_pgoff.
vm_mmap_pgoff
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
unsigned long ret;
struct mm_struct *mm = current->mm;
unsigned long populate;
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
&uf);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(ret, populate);
}
return ret;
}
First, mm is set to the current process's mm_struct, the structure that describes the process address space and holds the virtual-to-physical mapping state. security_mmap_file is security-related and provides an LSM hook at the mmap entry point. The real work then happens in do_mmap, under the mm write lock; if do_mmap requests prefaulting (populate is non-zero), mm_populate faults the pages in immediately.
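The populate path is user-visible: MAP_POPULATE (or MAP_LOCKED, which sets VM_LOCKED) makes vm_mmap_pgoff call mm_populate so the pages are faulted in up front. A sketch:
#define _GNU_SOURCE
#include <sys/mman.h>

int main(void)
{
    size_t len = 16 * 4096;            /* assumes 4 KiB pages */
    /* MAP_POPULATE: do_mmap sets *populate = len, and vm_mmap_pgoff
       then prefaults the whole range via mm_populate() */
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
    if (p == MAP_FAILED)
        return 1;
    p[0] = 1;                          /* no page fault here: already populated */
    return munmap(p, len);
}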
do_mmap
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff,
unsigned long *populate, struct list_head *uf)
{
.......
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
return addr;
if (flags & MAP_FIXED_NOREPLACE) {
if (find_vma_intersection(mm, addr, addr + len))
return -EEXIST;
}
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(mm);
if (pkey < 0)
pkey = 0;
}
vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
// file-backed and anonymous mappings take different paths here
.......
addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
(flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
*populate = len;
return addr;
}
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                  unsigned long pgoff, unsigned long flags)
{
unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
get_area = current->mm->get_unmapped_area;
.....
addr = get_area(file, addr, len, pgoff, flags);
.....
return addr;
}
First, get_unmapped_area looks for a free range in the process's user address space. The get_area function pointer is either arch_get_unmapped_area_topdown or arch_get_unmapped_area, depending on which direction the mmap area grows; arch_get_unmapped_area searches from low addresses toward high ones.
Once an address has been obtained, the MAP_FIXED_NOREPLACE case uses find_vma_intersection to verify the range does not overlap an existing VMA. calc_vm_prot_bits and calc_vm_flag_bits translate the prot and flags bits into VM_* flags; for example, PROT_READ in prot becomes VM_READ, and MAP_GROWSDOWN in flags becomes VM_GROWSDOWN. With the prot and flags values used earlier, vm_flags ends up as VM_READ|VM_WRITE|mm->def_flags|VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC. Finally, mmap_region builds a VMA to describe the virtual memory just obtained.
The mmap flag bits are an int while vm_flags is an unsigned long, so the bits have to be translated; this also smooths over the differences between 32-bit and 64-bit. mmap_region itself is fairly involved and has been analyzed at length elsewhere, e.g. the article "linux内核那些事之mmap_region流程梳理".
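A simplified sketch of that translation (not the kernel's exact implementation, which goes through the _calc_vm_trans() helper; the VM_* values below mirror the kernel's):
#define _GNU_SOURCE
#include <sys/mman.h>

#define VM_READ      0x00000001UL
#define VM_WRITE     0x00000002UL
#define VM_EXEC      0x00000004UL
#define VM_GROWSDOWN 0x00000100UL
#define VM_LOCKED    0x00002000UL

/* translate int prot bits into unsigned long VM_* bits */
static unsigned long calc_vm_prot_bits_sketch(unsigned long prot)
{
    return ((prot & PROT_READ)  ? VM_READ  : 0) |
           ((prot & PROT_WRITE) ? VM_WRITE : 0) |
           ((prot & PROT_EXEC)  ? VM_EXEC  : 0);
}

/* translate int map flags into unsigned long VM_* bits */
static unsigned long calc_vm_flag_bits_sketch(unsigned long flags)
{
    return ((flags & MAP_GROWSDOWN) ? VM_GROWSDOWN : 0) |
           ((flags & MAP_LOCKED)    ? VM_LOCKED    : 0);
}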
arch_get_unmapped_area
#ifndef HAVE_ARCH_UNMAPPED_AREA
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
struct vm_unmapped_area_info info;
const unsigned long mmap_end = arch_get_mmap_end(addr);
if (len > mmap_end - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
return addr;
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma_prev(mm, addr, &prev);
if (mmap_end - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vm_start_gap(vma)) &&
(!prev || addr >= vm_end_gap(prev)))
return addr;
}
// set up the search window for the unmapped-area lookup
info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
info.high_limit = mmap_end;
info.align_mask = 0;
info.align_offset = 0;
return vm_unmapped_area(&info);
}
#endif
If this is a fixed-address mapping, addr is returned directly. Otherwise, if a start address was hinted, it is page-aligned, and find_vma_prev looks up the VMA at or after addr along with its predecessor; if [addr, addr+len) fits below mmap_end, lies at or above mmap_min_addr, and does not collide with the neighbouring VMAs (vm_start_gap and vm_end_gap also account for the guard gap of stack-like VM_GROWSDOWN/VM_GROWSUP VMAs), the hint is returned as-is. Failing that, vm_unmapped_area searches the window between mm->mmap_base and mmap_end.
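From user space the hint behaviour is easy to observe; a sketch (the hint value is an arbitrary assumption):
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    void *hint = (void *)0x600000000000UL;   /* arbitrary, probably unused address */
    void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    /* without MAP_FIXED the kernel may place the mapping elsewhere
       if the hinted range is unavailable */
    printf("hint %p -> mapped at %p\n", hint, p);
    return 0;
}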
find_vma_prev
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma;
vma = find_vma(mm, addr);
if (vma) {
*pprev = vma->vm_prev;
} else {
struct rb_node *rb_node = rb_last(&mm->mm_rb);
*pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
}
return vma;
}
Here the red-black tree comes in: the goal is to find, in the process's tree of allocated regions, the first VMA whose end address is greater than addr. The tree stays balanced, so no subtree becomes much deeper than its sibling and lookups stay fast; a per-task VMA cache accelerates repeated lookups further. find_vma walks the tree and returns that first matching VMA.
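The lookup invariant is easy to state in miniature. A sketch over a sorted array standing in for the kernel's red-black tree: return the first VMA whose vm_end is greater than addr (addr may lie before, not inside, the returned VMA):
#include <stddef.h>

struct vma_sketch { unsigned long vm_start, vm_end; };

/* binary search standing in for the rbtree walk in find_vma() */
static struct vma_sketch *find_vma_sketch(struct vma_sketch *v, size_t n,
                                          unsigned long addr)
{
    size_t lo = 0, hi = n;
    while (lo < hi) {
        size_t mid = lo + (hi - lo) / 2;
        if (v[mid].vm_end > addr)
            hi = mid;        /* candidate: keep searching to the left */
        else
            lo = mid + 1;
    }
    return lo < n ? &v[lo] : NULL;
}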
MUNMAP
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
addr = untagged_addr(addr);
profile_munmap(addr);
return __vm_munmap(addr, len, true);
}
profile_munmap invokes the munmap profiling notifier chain; it is an atomic notifier chain, so its callbacks run without sleeping. Next, __vm_munmap:
static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
{
int ret;
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);
if (mmap_write_lock_killable(mm))
return -EINTR;
ret = __do_munmap(mm, start, len, &uf, downgrade);
/*
* Returning 1 indicates mmap_lock is downgraded.
* But 1 is not legal return value of vm_munmap() and munmap(), reset
* it to 0 before return.
*/
if (ret == 1) {
mmap_read_unlock(mm);
ret = 0;
} else
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
return ret;
}
mmap_write_lock_killable first takes the mm write lock (returning -EINTR if a fatal signal arrives while waiting), then __do_munmap releases the mapping.
do_munmap
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len){
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
len = PAGE_ALIGN(len);
vma = find_vma(mm, start);
if (!vma)
return 0;
prev = vma->vm_prev;
end = start + len;
if (vma->vm_start >= end)
return 0;
if (start > vma->vm_start) {
int error;
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
error = __split_vma(mm, vma, start, 0);
if (error)
return error;
prev = vma;
}
last = find_vma(mm, end);
if (last && end > last->vm_start) {
int error = __split_vma(mm, last, end, 1);
if (error)
return error;
}
vma = prev ? prev->vm_next : mm->mmap;
detach_vmas_to_be_unmapped(mm, vma, prev, end);
unmap_region(mm, vma, prev, start, end);
arch_unmap(mm, vma, start, end);
remove_vma_list(mm, vma);
return 0;
}
The arguments are checked first: the start address start must be page-aligned, and the range must stay within user space (start and len are bounded by TASK_SIZE). find_vma then searches the red-black tree for the first VMA whose end address is greater than start. If vma->vm_start >= end, the memory to be freed was never mapped, so the function returns without doing anything. If start > vma->vm_start, the VMA found contains the start of the range, and __split_vma splits it at start; because splitting adds a VMA, the code first checks that the VMA count will not exceed sysctl_max_map_count.
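Both __split_vma cases can be triggered from user space by unmapping the middle of a mapping; a sketch (page size assumed to be 4 KiB):
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    long pg = 4096;                    /* assumed page size */
    char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return 1;
    /* unmapping the middle page splits the VMA, leaving two VMAs,
       visible as two lines in /proc/self/maps */
    munmap(p + pg, pg);
    printf("kept [%p,%p) and [%p,%p)\n",
           (void *)p, (void *)(p + pg),
           (void *)(p + 2 * pg), (void *)(p + 3 * pg));
    return 0;
}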
__split_vma
static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, int new_below){
struct vm_area_struct *new;
int err;
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
return -ENOMEM;
*new = *vma;
if (new_below)
new->vm_end = addr;
else {
new->vm_start = addr;
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
if (new_below)
err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
((addr - new->vm_start) >> PAGE_SHIFT), new);
else
err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
if (!err)
return 0;
/* error: free the unused new vma (cleanup details elided) */
kmem_cache_free(vm_area_cachep, new);
return err;
}
First a new vm_area_struct, new, is allocated and the contents of vma are copied into it. new_below decides which half the original vma keeps after being split at addr: the low-address part or the high-address part. The first time do_munmap calls __split_vma, new_below is 0, so vma keeps the low part; vma_adjust then fixes the low part up, chiefly by setting its end to addr, and inserts the high part into the process's VMA tree.
Back in do_munmap, find_vma(mm, end) finds the last affected VMA, last. If last overlaps the end of the range being freed, it too is split, this time with new_below set to 1, so last keeps the high-address part. On return, vma points at the low-address part.
Putting this together, just before detach_vmas_to_be_unmapped runs, the original region has been split into
| prev | vma | ... | vma | last |
mm->mmap is assigned inside vma_adjust and points at the low-address piece of the split. detach_vmas_to_be_unmapped then removes every VMA intersecting the range being freed from the red-black tree and links them into a list headed by vma; in terms of the picture, it takes everything except prev and last, so
| prev | vma | ... | vma | last |
becomes
| prev | last |    and    | vma | ... | vma |
What remains is to free that second, detached part.
static void detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, unsigned long end){
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
vma_rb_erase(vma, &mm->mm_rb);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;
if (vma) {
vma->vm_prev = prev;
vma_gap_update(vma);
} else
mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
vmacache_invalidate(mm);
}
Back in do_munmap, unmap_region does the actual freeing:
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end){
struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
struct mmu_gather tlb;
lru_add_drain();
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, start, end);
}
lru_add_drain moves each page held in the per-CPU pagevec back to the LRU list of its zone, because the region is about to be unmapped and those cached page entries may change. tlb_gather_mmu constructs and initializes an mmu_gather. unmap_vmas then performs the unmapping, i.e. it releases the virtual memory that has physical pages mapped behind it.
void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr){
struct mm_struct *mm = vma->vm_mm;
mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}
This walks the vma list, calling unmap_single_vma on each VMA to release it:
static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details){
unsigned long start = max(vma->vm_start, start_addr);
unsigned long end;
if (start >= vma->vm_end)
return;
end = min(vma->vm_end, end_addr);
if (end <= vma->vm_start)
return;
if (vma->vm_file)
uprobe_munmap(vma, start, end);
if (unlikely(vma->vm_flags & VM_PFNMAP))
untrack_pfn(vma, 0, 0);
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
if (vma->vm_file) {
i_mmap_lock_write(vma->vm_file->f_mapping);
__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
i_mmap_unlock_write(vma->vm_file->f_mapping);
}
} else
unmap_page_range(tlb, vma, start, end, details);
}
}
The bulk of the work happens in unmap_page_range. Going further would pull in too much of the kernel's memory-management machinery, so we stop here; in the end the page-table entries (pte) for the virtual addresses are located, the mappings to the physical pages are torn down, and the page structures are updated. After unmap_vmas, some page tables no longer reference any physical pages, and free_pgtables releases those page tables.
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long floor, unsigned long ceiling){
while (vma) {
struct vm_area_struct *next = vma->vm_next;
unsigned long addr = vma->vm_start;
unlink_anon_vmas(vma);
unlink_file_vma(vma);
if (is_vm_hugetlb_page(vma)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
} else {
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
}
vma = next;
}
}
The main call here is free_pgd_range. Within it, let vma be the region being freed, prev the VMA before it and last the one after: if, with vma gone, the span from prev->vm_end to last->vm_start exceeds the range covered by one pgd entry (4 MB on 32-bit systems), all the page tables under that pgd entry are freed; if it is smaller, nothing is done.
Back in do_munmap, arch_unmap is architecture-specific cleanup, which we skip. remove_vma_list returns each VMA's vm_area_struct to the slab allocator.
static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma){
unsigned long nr_accounted = 0;
update_hiwater_vm(mm);
do {
long nrpages = vma_pages(vma);
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
vma = remove_vma(vma);
} while (vma);
vm_unacct_memory(nr_accounted);
validate_mm(mm);
}
The key function is remove_vma, which frees the vma via kmem_cache_free and returns the next VMA on the list.
static struct vm_area_struct *remove_vma(struct vm_area_struct *vma){
struct vm_area_struct *next = vma->vm_next;
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
return next;
}