内核对vm_area_struct 数据结构相关操作进行了一系列封装,方面进行后续操作,vma的操作实现大部分位于mm\mmap.c文件中
find_vma()
find_vma()是内核中经常需要查找某个给定的虚拟地址是否已经分配,源代码如下:
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct rb_node *rb_node;
struct vm_area_struct *vma;
/* Check the cache first. */
vma = vmacache_find(mm, addr);
if (likely(vma))
return vma;
rb_node = mm->mm_rb.rb_node;
while (rb_node) {
struct vm_area_struct *tmp;
tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
if (tmp->vm_end > addr) {
vma = tmp;
if (tmp->vm_start <= addr)
break;
rb_node = rb_node->rb_left;
} else
rb_node = rb_node->rb_right;
}
if (vma)
vmacache_update(addr, vma);
return vma;
}
参数:
- struct mm_struct *mm:为查询的虚拟地址空间是属于哪个进程的空间管理。
- unsigned long addr:要查询的地址。
函数源代码主要流程:
- vmacache_find(mm, addr):首先从 vmcache中查询是否存在该地址对应的vma,关于vmcache可以看下《inux内核那些事之用户空间管理》.
- 如果vmcache里面没有,则从mm中所管理的所有vma进行遍历查询即遍历mm中mm_rb红黑树,寻找合适的vma(合适vma定义即离addr最近的vma即addr一定小于vma->end)。
- 如果查询到vma,则刷新vmacahe?vmacache_update(addr, vma), 以加快速度查找。
find_vma函数返回的vma,有两种情况:
- 要查询的addr地址位于VMA空间范围之内,即位于[vma->vm_start, vma->vm_end) 之内
- 要查询的addr 不位于vma内,即addr 小于vm->vm->vm_start
返回的vma 的vm->end地址一定大于addr,但是不能够保证addr位于vma内,但是能够保证返回的vma一定位于addr最近的已经分配的空间。
find_vma_prev
find_vma_prev和find_vma功能类似,不过该函数不仅能够返回查找到的vma,还返回了vma前一个prev节点:
/*
* Same as find_vma, but also return a pointer to the previous VMA in *pprev.
*/
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma;
vma = find_vma(mm, addr);
if (vma) {
*pprev = vma->vm_prev;
} else {
struct rb_node *rb_node = rb_last(&mm->mm_rb);
*pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
}
return vma;
}
代码主要处理流程:
- 调用find_vma查找 一个vma
- 如果vma不为空,则将vma之前的节点返回*pprev=vma->vm_prev.
- 如果vma为空,则返回当前进程内最后申请的vma rb_last(&mm->mm_rb), 如果rb_node为空,则说明该进程空间还没有申请过vma。
find_vma_links()
find_vma_link 主要经常用于创建一个新的vma过程中使用,该函数不仅能够返回查询到的vma以及pprev, 还返回rb_link即为新的要插入的vma返回一个合适的位置进行插入:
static int find_vma_links(struct mm_struct *mm, unsigned long addr,
unsigned long end, struct vm_area_struct **pprev,
struct rb_node ***rb_link, struct rb_node **rb_parent)
{
struct rb_node **__rb_link, *__rb_parent, *rb_prev;
__rb_link = &mm->mm_rb.rb_node;
rb_prev = __rb_parent = NULL;
while (*__rb_link) {
struct vm_area_struct *vma_tmp;
__rb_parent = *__rb_link;
vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
if (vma_tmp->vm_end > addr) {
/* Fail if an existing vma overlaps the area */
if (vma_tmp->vm_start < end)
return -ENOMEM;
__rb_link = &__rb_parent->rb_left;
} else {
rb_prev = __rb_parent;
__rb_link = &__rb_parent->rb_right;
}
}
*pprev = NULL;
if (rb_prev)
*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
*rb_link = __rb_link;
*rb_parent = __rb_parent;
return 0;
}
该函数会遍历rb tree直到叶子节点没有子节点为止。
- 遍历过程中如果addr 大于vma->vm_end,则将__rb_link = &__rb_parent->rb_rightrb_prev=__rb_parent,往右节点遍历
- 如果addr 小于vma->vm_end, 且end>vma->vm_start, 则返回错误-ENOMEM
- 如果addr 小于vma->vm_end? 且且end<=?vma->vm_start,则往左节点遍历
- 上述如此反复,直到叶子节点没有子节点为止。
- 并将*rb_link = __rb_link及?*rb_parent = __rb_parent, 并且返回*pprev.
vm_area_cachep
每次申请一个新的vm_area_struct 结构是,都是从vm_area_cachep中获取,vm_area_cachep为基于slab 算法管理着所有的申请的vm_area_struct结构,例如vm_area_alloc 申请一个新的VMA时,从vm_area_cachep中申请一个新的VMA结构:
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
struct vm_area_struct *vma;
vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (vma)
vma_init(vma, mm);
return vma;
}
从?vm_area_cachep中申请一个vma,?vm_area_cache为基于slab分配算法(后续再详细描述)在buffy算法之上 管理小于page的内存分配算法,proc_caches_init()初始化时会创建一个?vm_area_cache:
void __init proc_caches_init(void)
{
... ...
mm_cachep = kmem_cache_create_usercopy("mm_struct",
mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
offsetof(struct mm_struct, saved_auxv),
sizeof_field(struct mm_struct, saved_auxv),
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
}
可以通过cat /proc/slabinfo 命令查看 到vm_are_struct结构分配情况:
?vma_init
初始化vma函数:
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
static const struct vm_operations_struct dummy_vm_ops = {};
memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
}
?这里需要特别注意的就是vm_ops设置,此时vm_ops里面各自操作全部为空,即使用系统默认的VMA操作。
VMA 匿名空间
在系统中很多情况包括申请一个内存,以及mmap系统调用申请一个新的匿名映射申请的vma都属于匿名空间,vma_set_anonymous()设置VMA为匿名空间:
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
vma->vm_ops = NULL;
}
可以看懂当vma为匿名空间时,vm_ops为NULL。
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
return !vma->vm_ops;
}
判断一个vma是否为匿名,如果vm_ops为空则为匿名。
vma_is_foreign
判断vma属于当前进程空间内:
static inline bool vma_is_foreign(struct vm_area_struct *vma)
{
if (!current->mm)
return true;
if (current->mm != vma->vm_mm)
return true;
return false;
}
如果不属于当前运行进程内空间则返回true,否则返回false?
insert_vm_struct()
把新创建的VMA插入到进程的mm 红黑树中:
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
* then i_mmap_rwsem is taken here.
*/
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
struct vm_area_struct *prev;
struct rb_node **rb_link, *rb_parent;
if (find_vma_links(mm, vma->vm_start, vma->vm_end,
&prev, &rb_link, &rb_parent))
return -ENOMEM;
if ((vma->vm_flags & VM_ACCOUNT) &&
security_vm_enough_memory_mm(mm, vma_pages(vma)))
return -ENOMEM;
/*
* The vm_pgoff of a purely anonymous vma should be irrelevant
* until its first write fault, when page's anon_vma and index
* are set. But now set the vm_pgoff it will almost certainly
* end up with (unless mremap moves it elsewhere before that
* first wfault), so /proc/pid/maps tells a consistent story.
*
* By setting it to reflect the virtual start address of the
* vma, merges and splits can happen in a seamless way, just
* using the existing file pgoff checks and manipulations.
* Similarly in do_mmap_pgoff and in do_brk.
*/
if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
vma_link(mm, vma, prev, rb_link, rb_parent);
return 0;
}
- ?调用find_vma_links(),查找vma要插入的红黑树中节点位置。
- 如何flag 设置了VM_ACCOUNT,且security_vm_enough_memory_mm()没有足够内存 返回失败,否则继续往下走。
- vma_is_anonymous(),如果是匿名映射 则把vma->vm_pgoff 设置成?vma->vm_start >> PAGE_SHIFT,即偏移位置为vm的start 虚拟地址 vfn
- vma_link将vma 添加到相应节点中。
vma_link
插入vma到相应的节点上去,由于vma 即支持双向链表又支持红黑树方式,处理起来比较繁琐:
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
struct address_space *mapping = NULL;
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
}
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
if (mapping)
i_mmap_unlock_write(mapping);
mm->map_count++;
validate_mm(mm);
}
- 如果是文件映射,则获取文件映射mapping = vma->vm_file->f_mapping 空间
- __vma_link:将vma插入到链表和红黑树中。
- __vma_link_file:如果是文件映射,则还需要将vma添加到文件映射空间address_space中的i_mmap中
- mm->map_count++: mm中map_count+1.
- validate_mm: vma的后续处理。
vma_merge
vma虽然是一个虚拟空间,但是再反复申请或释放过程中会不断产生一些碎片,为了解决这些碎片问题需要不断与周围vma进行合并。合并的另外一个好处就是可以节省vma数目,减少内存占用空间。因此在新申请一个节点之前,首先需要根据需要申请的addr以及大小判断是否和之前或者之后的vma需要合并,如何能够合并就可以减少vma结构。
struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
int err;
/*
* We later require that vma->vm_flags == vm_flags,
* so this tests vma->vm_flags & VM_SPECIAL, too.
*/
if (vm_flags & VM_SPECIAL)
return NULL;
if (prev)
next = prev->vm_next;
else
next = mm->mmap;
area = next;
if (area && area->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
/* verify some invariant that must be enforced by the caller */
VM_WARN_ON(prev && addr <= prev->vm_start);
VM_WARN_ON(area && end > area->vm_end);
VM_WARN_ON(addr >= end);
/*
* Can it merge with the predecessor?
*/
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
vm_userfaultfd_ctx)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file,
pgoff+pglen,
vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
err = __vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL,
prev);
} else /* cases 2, 5, 7 */
err = __vma_adjust(prev, prev->vm_start,
end, prev->vm_pgoff, NULL, prev);
if (err)
return NULL;
khugepaged_enter_vma_merge(prev, vm_flags);
return prev;
}
/*
* Can this new request be merged in front of next?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
vm_userfaultfd_ctx)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
else { /* cases 3, 8 */
err = __vma_adjust(area, addr, next->vm_end,
next->vm_pgoff - pglen, NULL, next);
/*
* In case 3 area is already equal to next and
* this is a noop, but in case 8 "area" has
* been removed and next was expanded over it.
*/
area = next;
}
if (err)
return NULL;
khugepaged_enter_vma_merge(area, vm_flags);
return area;
}
return NULL;
}
该函数参数比较多:
mm: 所处的进程空间
prev: 新创建节点要插入的位置之前的节点
addr: 新申请的进程起始地址。
end:新申请的进程空间结束地址。
vm_flags:新申请的进程空间的vm_flags
anon_vma:匿名映射,后续需要详细将主要用于反向映射。
file:文件映射
pgoff: 文件映射中的偏移
policy:内存策略
vm_userfaultfd_ctx: context上下文。
该函数整个代码逻辑比较简单,就是判断是否能够和之前vma以及之后vma向合并:
与前一个vma合并的条件为:
- 前一个vma->end结束地址等于addr
- vma policy策略相同
- 不能和前一个vma?vm_flags相冲突
和后一个vma合并条件为:
- 后一个vma->start地址与end地址相等
- vma policy策略相同
- 不能和后一个vma?vm_flags相冲突
?如果需要申请的新的进程空间vma flag设置为VM_SPECIAL,则不能进行合并直接进行返回
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
因此一个新申请的进程空间可以合并情况如下:
- 新VMA的起始地址和前prev节点结束地址重叠,和前一个prev节点合并且满足合并需求,但是不能和后next节点合并
- 新VMA的结束地址和后next节点的起始地址重叠,和后一个next节点合并且满足合并需求,但是不能和prev节点合并
- 新的VMA正好能够将prev和next都能衔接上且都能满足合并需求,那么就可以和prev和next合并成一个大的vma
如果能够合并则调用__vma_adjust()进行调整。
__vma_adjust()
__vma_adjust为具体执行合并操作:
/*
* We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
* is already present in an i_mmap tree without adjusting the tree.
* The following helper function should be used when such adjustments
* are necessary. The "insert" vma (if any) is to be inserted
* before we drop the necessary locks.
*/
int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
struct vm_area_struct *expand)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
struct address_space *mapping = NULL;
struct rb_root_cached *root = NULL;
struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
bool start_changed = false, end_changed = false;
long adjust_next = 0;
int remove_next = 0;
if (next && !insert) {
struct vm_area_struct *exporter = NULL, *importer = NULL;
if (end >= next->vm_end) {
/*
* vma expands, overlapping all the next, and
* perhaps the one after too (mprotect case 6).
* The only other cases that gets here are
* case 1, case 7 and case 8.
*/
if (next == expand) {
/*
* The only case where we don't expand "vma"
* and we expand "next" instead is case 8.
*/
VM_WARN_ON(end != next->vm_end);
/*
* remove_next == 3 means we're
* removing "vma" and that to do so we
* swapped "vma" and "next".
*/
remove_next = 3;
VM_WARN_ON(file != next->vm_file);
swap(vma, next);
} else {
VM_WARN_ON(expand != vma);
/*
* case 1, 6, 7, remove_next == 2 is case 6,
* remove_next == 1 is case 1 or 7.
*/
remove_next = 1 + (end > next->vm_end);
VM_WARN_ON(remove_next == 2 &&
end != next->vm_next->vm_end);
/* trim end to next, for case 6 first pass */
end = next->vm_end;
}
exporter = next;
importer = vma;
/*
* If next doesn't have anon_vma, import from vma after
* next, if the vma overlaps with it.
*/
if (remove_next == 2 && !next->anon_vma)
exporter = next->vm_next;
} else if (end > next->vm_start) {
/*
* vma expands, overlapping part of the next:
* mprotect case 5 shifting the boundary up.
*/
adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
exporter = next;
importer = vma;
VM_WARN_ON(expand != importer);
} else if (end < vma->vm_end) {
/*
* vma shrinks, and !insert tells it's not
* split_vma inserting another: so it must be
* mprotect case 4 shifting the boundary down.
*/
adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
exporter = vma;
importer = next;
VM_WARN_ON(expand != importer);
}
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
if (exporter && exporter->anon_vma && !importer->anon_vma) {
int error;
importer->anon_vma = exporter->anon_vma;
error = anon_vma_clone(importer, exporter);
if (error)
return error;
}
}
again:
vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
if (file) {
mapping = file->f_mapping;
root = &mapping->i_mmap;
uprobe_munmap(vma, vma->vm_start, vma->vm_end);
if (adjust_next)
uprobe_munmap(next, next->vm_start, next->vm_end);
i_mmap_lock_write(mapping);
if (insert) {
/*
* Put into interval tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
* throughout; but we cannot insert into address
* space until vma start or end is updated.
*/
__vma_link_file(insert);
}
}
anon_vma = vma->anon_vma;
if (!anon_vma && adjust_next)
anon_vma = next->anon_vma;
if (anon_vma) {
VM_WARN_ON(adjust_next && next->anon_vma &&
anon_vma != next->anon_vma);
anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next)
anon_vma_interval_tree_pre_update_vma(next);
}
if (root) {
flush_dcache_mmap_lock(mapping);
vma_interval_tree_remove(vma, root);
if (adjust_next)
vma_interval_tree_remove(next, root);
}
if (start != vma->vm_start) {
vma->vm_start = start;
start_changed = true;
}
if (end != vma->vm_end) {
vma->vm_end = end;
end_changed = true;
}
vma->vm_pgoff = pgoff;
if (adjust_next) {
next->vm_start += adjust_next << PAGE_SHIFT;
next->vm_pgoff += adjust_next;
}
if (root) {
if (adjust_next)
vma_interval_tree_insert(next, root);
vma_interval_tree_insert(vma, root);
flush_dcache_mmap_unlock(mapping);
}
if (remove_next) {
/*
* vma_merge has merged next into vma, and needs
* us to remove next before dropping the locks.
*/
if (remove_next != 3)
__vma_unlink_common(mm, next, next);
else
/*
* vma is not before next if they've been
* swapped.
*
* pre-swap() next->vm_start was reduced so
* tell validate_mm_rb to ignore pre-swap()
* "next" (which is stored in post-swap()
* "vma").
*/
__vma_unlink_common(mm, next, vma);
if (file)
__remove_shared_vm_struct(next, file, mapping);
} else if (insert) {
/*
* split_vma has split insert from vma, and needs
* us to insert it before dropping the locks
* (it may either follow vma or precede it).
*/
__insert_vm_struct(mm, insert);
} else {
if (start_changed)
vma_gap_update(vma);
if (end_changed) {
if (!next)
mm->highest_vm_end = vm_end_gap(vma);
else if (!adjust_next)
vma_gap_update(next);
}
}
if (anon_vma) {
anon_vma_interval_tree_post_update_vma(vma);
if (adjust_next)
anon_vma_interval_tree_post_update_vma(next);
anon_vma_unlock_write(anon_vma);
}
if (mapping)
i_mmap_unlock_write(mapping);
if (root) {
uprobe_mmap(vma);
if (adjust_next)
uprobe_mmap(next);
}
if (remove_next) {
if (file) {
uprobe_munmap(next, next->vm_start, next->vm_end);
fput(file);
}
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
mpol_put(vma_policy(next));
vm_area_free(next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
* up the code too much to do both in one go.
*/
if (remove_next != 3) {
/*
* If "next" was removed and vma->vm_end was
* expanded (up) over it, in turn
* "next->vm_prev->vm_end" changed and the
* "vma->vm_next" gap must be updated.
*/
next = vma->vm_next;
} else {
/*
* For the scope of the comment "next" and
* "vma" considered pre-swap(): if "vma" was
* removed, next->vm_start was expanded (down)
* over it and the "next" gap must be updated.
* Because of the swap() the post-swap() "vma"
* actually points to pre-swap() "next"
* (post-swap() "next" as opposed is now a
* dangling pointer).
*/
next = vma;
}
if (remove_next == 2) {
remove_next = 1;
end = next->vm_end;
goto again;
}
else if (next)
vma_gap_update(next);
else {
/*
* If remove_next == 2 we obviously can't
* reach this path.
*
* If remove_next == 3 we can't reach this
* path because pre-swap() next is always not
* NULL. pre-swap() "next" is not being
* removed and its next->vm_end is not altered
* (and furthermore "end" already matches
* next->vm_end in remove_next == 3).
*
* We reach this only in the remove_next == 1
* case if the "next" vma that was removed was
* the highest vma of the mm. However in such
* case next->vm_end == "end" and the extended
* "vma" has vma->vm_end == next->vm_end so
* mm->highest_vm_end doesn't need any update
* in remove_next == 1 case.
*/
VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
}
}
if (insert && file)
uprobe_mmap(insert);
validate_mm(mm);
return 0;
}
参数:
vma:要合并调整的vma
start: 新合并的vma 起始地址
end:要合并的结束地址
pgoff: 文件映射则指向偏移
insert:要插入的节点位置
expand:合并后要扩展到的节点位置,如果要合并的vma只是向前或者向后一个vma,则expand和vma相等,如果是两个vma合并,则expand指向要扩展到的节点vma.
|