mmap原理
mmap完成的是将物理内存映射到用户态虚拟内存,中间不需要任何的内存拷贝,文件映射实质上就是在创建内核文件的时候,给文件挂上一个mmap钩子,下面将讲解mmap系统调用是如何调用到文件mmap钩子函数。
首先是系统调用。由于mmap是对物理内存的映射,因此需要遵从MMU在不同CPU架构上的差异,这里选择arm64架构的函数实现,系统调用函数入口在arch/arm64/kernel/sys.c中。不同架构的实现方式基本相同;不同的内核版本有不同的实现方式:对于4.X内核,系统调用内部调用的是一个隐藏系统调用函数sys_mmap_pgoff,实质上就是SYSCALL_DEFINE6(mmap_pgoff…),逻辑上没什么差异;新版本内核也有该函数,调用的也是ksys_mmap_pgoff()。下面主要以5.12.1版本的内核进行讲解:
/*
 * arm64 mmap() system call entry (arch/arm64/kernel/sys.c).
 * Validates that the byte offset is page aligned, then forwards to the
 * generic helper with the offset converted from bytes to pages.
 */
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off)
{
/* The file offset must be a multiple of PAGE_SIZE. */
if (offset_in_page(off) != 0)
return -EINVAL;
/* Hand off to the arch-independent path; off becomes a page index. */
return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}
/*
 * ksys_mmap_pgoff() - common mmap entry shared by all architectures.
 * Resolves the fd to a struct file (or sets up a hugetlb pseudo-file for
 * anonymous MAP_HUGETLB mappings), then delegates to vm_mmap_pgoff().
 */
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
struct file *file = NULL;
unsigned long retval;
/* File-backed mapping: take a reference on the fd. */
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
if (!file)
return -EBADF;
if (is_file_hugepages(file)) {
/* hugetlbfs file: round len up to its huge page size. */
len = ALIGN(len, huge_page_size(hstate_file(file)));
} else if (unlikely(flags & MAP_HUGETLB)) {
/* MAP_HUGETLB is only valid for anonymous or hugetlbfs files. */
retval = -EINVAL;
goto out_fput;
}
} else if (flags & MAP_HUGETLB) {
/* Anonymous huge mapping: back it with an internal hugetlbfs file. */
struct user_struct *user = NULL;
struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (!hs)
return -EINVAL;
len = ALIGN(len, huge_page_size(hs));
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
/* These two flags are handled internally; strip them from the request. */
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
/* Drop the reference taken by fget()/hugetlb_file_setup(). */
if (file)
fput(file);
return retval;
}
/*
 * vm_mmap_pgoff() - LSM check, mmap_lock acquisition, and the call into
 * do_mmap(). If do_mmap() asked for it (MAP_POPULATE/MAP_LOCKED), the new
 * range is faulted in after the lock is released.
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
unsigned long ret;
struct mm_struct *mm = current->mm;
unsigned long populate;
LIST_HEAD(uf);
/* Security-module hook: may veto the mapping. */
ret = security_mmap_file(file, prot, flag);
if (!ret) {
/* Killable write lock: a fatal signal aborts with -EINTR. */
if (mmap_write_lock_killable(mm))
return -EINTR;
ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
&uf);
mmap_write_unlock(mm);
/* Notify userfaultfd about any ranges unmapped on our behalf. */
userfaultfd_unmap_complete(mm, &uf);
/* Pre-fault the pages outside the lock if requested. */
if (populate)
mm_populate(ret, populate);
}
return ret;
}
/*
 * do_mmap() - validate the request, pick an address, and translate the
 * userspace prot/flags into vm_flags before calling mmap_region().
 * Caller holds mm->mmap_lock for writing.
 * NOTE(review): this is an article excerpt of kernel 5.12 source; the
 * "..." lines below elide code present in the real kernel.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff,
unsigned long *populate, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
vm_flags_t vm_flags;
int pkey = 0;
*populate = 0;
if (!len)
return -EINVAL;
/* Legacy personalities: readable implies executable (unless the file
 * lives on a noexec mount). */
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
if (!(file && path_noexec(&file->f_path)))
prot |= PROT_EXEC;
/* MAP_FIXED_NOREPLACE behaves like MAP_FIXED but fails on overlap. */
if (flags & MAP_FIXED_NOREPLACE)
flags |= MAP_FIXED;
/* A hint address may be rounded up to the mmap minimum. */
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
len = PAGE_ALIGN(len);
if (!len)
return -ENOMEM;
/* Reject pgoff + page count overflowing an unsigned long. */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
return -EOVERFLOW;
/* Per-process cap on the number of VMAs. */
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
/* Find a free virtual range (or validate the fixed one). */
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
return addr;
if (flags & MAP_FIXED_NOREPLACE) {
struct vm_area_struct *vma = find_vma(mm, addr);
/* Any existing VMA overlapping [addr, addr+len) is an error. */
if (vma && vma->vm_start < addr + len)
return -EEXIST;
}
/* Execute-only mappings may use a protection key where supported. */
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(mm);
if (pkey < 0)
pkey = 0;
}
/* Combine prot bits, flag bits, and the mm defaults into vm_flags. */
vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
/* Would the locked-memory rlimit be exceeded? */
if (mlock_future_check(mm, vm_flags, len))
return -EAGAIN;
if (file) {
struct inode *inode = file_inode(file);
unsigned long flags_mask;
/* pgoff + len must fit within the file's addressable range. */
if (!file_mmap_ok(file, inode, pgoff, len))
return -EOVERFLOW;
flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
switch (flags & MAP_TYPE) {
case MAP_SHARED:
/* Plain MAP_SHARED ignores flags outside the legacy mask. */
flags &= LEGACY_MAP_MASK;
fallthrough;
case MAP_SHARED_VALIDATE:
...
case MAP_PRIVATE:
...
break;
default:
return -EINVAL;
}
} else {
/* Anonymous mapping. */
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
/* Shared anonymous memory ignores the offset. */
pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_PRIVATE:
/* Private anonymous: pgoff is derived from the address. */
pgoff = addr >> PAGE_SHIFT;
break;
default:
return -EINVAL;
}
}
/* MAP_NORESERVE: skip swap-space accounting where policy allows. */
if (flags & MAP_NORESERVE) {
if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
vm_flags |= VM_NORESERVE;
if (file && is_file_hugepages(file))
vm_flags |= VM_NORESERVE;
}
/* Create (or merge) the VMA — this is where file->f_op->mmap runs. */
addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
/* Tell vm_mmap_pgoff() to fault the range in after unlock. */
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
(flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
*populate = len;
return addr;
}
/*
 * mmap_region() - build the VMA for [addr, addr+len): clear any old
 * mappings in the range, try to merge with a neighbour, otherwise
 * allocate a new vm_area_struct, call the file's ->mmap hook, and link
 * the VMA into the mm. Caller holds mm->mmap_lock for writing.
 * NOTE(review): article excerpt; "..."/".." lines elide kernel code.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev, *merge;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
/* Check the address-space rlimit (elided in this excerpt). */
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
...
}
/* Unmap whatever currently occupies the target range. */
if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
return -ENOMEM;
/* Private writable mappings are charged against the commit limit. */
if (accountable_mapping(file, vm_flags)) {
..
}
/* Fast path: extend an adjacent, compatible VMA instead of creating one. */
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
vma = vm_area_alloc(mm);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
if (file) {
/* MAP_DENYWRITE: block writers to the file while mapped. */
if (vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
}
if (vm_flags & VM_SHARED) {
error = mapping_map_writable(file->f_mapping);
if (error)
goto allow_write_and_free_vma;
}
vma->vm_file = get_file(file);
/* This invokes file->f_op->mmap() — the driver's mmap hook. */
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
WARN_ON_ONCE(addr != vma->vm_start);
addr = vma->vm_start;
/* The driver may have changed vm_flags; retry the merge. */
if (unlikely(vm_flags != vma->vm_flags && prev)) {
merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
if (merge) {
/* Merged into prev: discard the freshly built VMA. */
fput(vma->vm_file);
vm_area_free(vma);
vma = merge;
vm_flags = vma->vm_flags;
goto unmap_writable;
}
}
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
/* Shared anonymous memory is backed by an internal shmem file. */
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
} else {
vma_set_anonymous(vma);
}
/* Architecture veto on the final flag combination. */
if (!arch_validate_flags(vma->vm_flags)) {
error = -EINVAL;
if (file)
goto unmap_and_free_vma;
else
goto free_vma;
}
/* Insert into the mm's VMA list and rbtree. */
vma_link(mm, vma, prev, rb_link, rb_parent);
if (file) {
unmap_writable:
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
}
file = vma->vm_file;
out:
perf_event_mmap(vma);
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
/* Some VMAs cannot be mlocked; strip the lock bits for those. */
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
else
mm->locked_vm += (len >> PAGE_SHIFT);
}
if (file)
uprobe_mmap(vma);
/* New mappings start soft-dirty so checkpoint tools notice them. */
vma->vm_flags |= VM_SOFTDIRTY;
vma_set_page_prot(vma);
return addr;
/* Error unwinding, deepest acquisition first. */
unmap_and_free_vma:
fput(vma->vm_file);
vma->vm_file = NULL;
/* Undo any partial mapping the driver's ->mmap may have created. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
free_vma:
vm_area_free(vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
return error;
}
mmap用例
内核态代码
#include <cgel.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/init_task.h>
#include <linux/string.h>
#include <linux/io.h>
#include <asm/uaccess.h>
/* Number of task_info slots userspace asked for via the write() hook. */
static int task_num = 0;
/* Previous request size, used to decide whether to reallocate. */
static int old_task_num = 0;
/* Kernel buffer exported to userspace through taskinfo_mmap(). */
static struct task_info *g_taskinfo;
/*
 * get_bits() - number of digits needed to represent @num in base @base.
 * @num:  value to measure (expected non-negative).
 * @base: radix, e.g. 10 for decimal.
 *
 * Returns at least 1 (zero needs one digit).
 *
 * Fix: the original compared the signed quotient directly against the
 * unsigned @base, which promotes the signed side to unsigned and makes a
 * negative @num loop on a huge value. Casting @base keeps the comparison
 * signed.
 */
static int get_bits(int num, unsigned int base)
{
	int quotient = num;
	int count = 1;

	while (quotient >= (int)base) {
		quotient /= (int)base;
		count++;
	}
	return count;
}
/* Number of decimal digits needed to print the current task_num. */
#define NUM_TASKINFO get_bits(task_num, 10)
static int taskinfo_mmap(struct file *file, struct vm_area_struct *vma)
{
unsigned long taskinfo_pages;
unsigned long size;
ssize_t mmap_size;
if (!g_taskinfo)
return -EINVAL;
mmap_size = task_num * sizeof(struct task_info);
taskinfo_pages = virt_to_phys(g_taskinfo);
size = (unsigned long)(vma->vm_end - vma->vm_start);
if ((size >> PAGE_SHIFT) > mmap_size) {
pr_err("invalid mem range, size %ld, task size :%ld\n",
size, mmap_size);
return -ERANGE;
}
return remap_pfn_range(vma,
vma->vm_start,
taskinfo_pages >> PAGE_SHIFT,
size,
vma->vm_page_prot);
}
/*
 * get_taskinfo() - read() hook: snapshot every task's task_info into
 * g_taskinfo (up to task_num entries) and return the entry count to
 * userspace as a decimal string.
 *
 * Fixes over the original:
 *  - g_taskinfo is a struct task_info *, so pointer arithmetic already
 *    scales by the element size; "g_taskinfo + count * sizeof(...)" wrote
 *    far past the buffer.
 *  - copy_to_user() returns the number of bytes NOT copied (never
 *    negative); the "< 0" test could never detect a fault.
 *  - the variable-length array on the kernel stack is replaced by a small
 *    fixed buffer (16 bytes covers any int in decimal).
 *
 * NOTE(review): access_ok() lost its VERIFY_READ argument in kernel 5.0;
 * the article targets 5.12 — confirm which kernel this module builds for.
 * NOTE(review): iterating tasks without rcu_read_lock()/tasklist_lock
 * looks unsafe — confirm the locking expectations of for_each_process().
 */
static ssize_t get_taskinfo(struct file *fs,
		char __user *buf, size_t len, loff_t *lf)
{
	int count = 0;
	struct task_info *taskinfo;
	struct task_struct *task;
	char statistic[16];
	size_t out_len;

	if (!g_taskinfo)
		return -EINVAL;

	/* A kernel symbol must never satisfy access_ok() (user range). */
	if (unlikely(access_ok(VERIFY_READ, &init_task,
			       sizeof(init_task)))) {
		pr_err("init_task is in user space\n");
		return -ERANGE;
	}

	for_each_process(task) {
		if (unlikely(access_ok(VERIFY_READ, task,
				       sizeof(*task)))) {
			pr_err("task is in the user-mem space\n");
			continue;
		}
		taskinfo = &task->taskinfo;
		/* Element indexing — NOT scaled by sizeof again. */
		memcpy(g_taskinfo + count, taskinfo,
		       sizeof(struct task_info));
		count++;
		if (count == task_num)
			break;
	}

	out_len = snprintf(statistic, sizeof(statistic), "%d", count);
	if (out_len > len)
		out_len = len;
	if (copy_to_user(buf, statistic, out_len)) {
		pr_err("Failed to copy to user\n");
		return -EFAULT;
	}
	return out_len;
}
/*
 * set_taskinfo() - write() hook: parse a decimal task count from
 * userspace and (re)allocate g_taskinfo to hold that many task_info
 * entries.
 *
 * Fixes over the original:
 *  - the VLA "char num[len]" was sized by a user-controlled length
 *    (kernel stack exhaustion) and was never NUL-terminated before
 *    simple_strtoll() parsed it; use a bounded buffer and terminate it.
 *  - copy_from_user() returns bytes NOT copied (never negative), so the
 *    "< 0" check could never detect a fault.
 *  - on a resize the previous g_taskinfo allocation was leaked; free it
 *    before allocating the new one.
 */
static ssize_t set_taskinfo(struct file *fs,
		const char __user *buf, size_t len, loff_t *lf)
{
	char num[16];
	char *end;
	int new_num;
	size_t copy_len = len;

	if (!buf || !len)
		return -EINVAL;
	if (copy_len >= sizeof(num))
		copy_len = sizeof(num) - 1;

	if (copy_from_user(num, buf, copy_len))
		return -EFAULT;
	num[copy_len] = '\0';

	new_num = simple_strtoll(num, &end, 10);
	if (new_num <= 0)
		return -EINVAL;

	if (old_task_num != new_num) {
		kfree(g_taskinfo);   /* kfree(NULL) is a no-op */
		g_taskinfo = kzalloc(new_num * sizeof(struct task_info),
				     GFP_USER);
		if (!g_taskinfo) {
			task_num = 0;
			old_task_num = 0;
			return -ENOMEM;
		}
	}
	task_num = new_num;
	old_task_num = new_num;
	return len;
}
static int taskinfo_close(struct inode *node, struct file *fs)
{
if (g_taskinfo)
kfree(g_taskinfo);
return 0;
}
/*
 * File operations for the taskinfo device node. The .mmap member is the
 * hook that mmap_region()'s call_mmap() ends up invoking — the point this
 * article sets out to demonstrate.
 */
static struct file_operations opts = {
.owner = THIS_MODULE,
.open = simple_open,     /* stash inode->i_private in file->private_data */
.read = get_taskinfo,    /* returns the snapshot count as a string */
.write = set_taskinfo,   /* sets task_num and allocates g_taskinfo */
.mmap = taskinfo_mmap,   /* maps g_taskinfo into the caller */
.release = taskinfo_close,
};
用户态代码
编译Makefile
# Build the user-space taskinfo test client.
# CROSS_COMPILE/ARCH may be supplied for cross builds (e.g. aarch64-linux-gnu-).
CROSS_COMPILE ?=
ARCH ?=
CC = ${CROSS_COMPILE}gcc
CFLAGS += -g
SRC += taskinfo.c
OUT = taskinfo

# Fix: mark phony targets and make clean remove the built binary
# (the original only removed *.o, which this link-in-one-step build
# never produces).
.PHONY: all clean

all:
	$(CC) $(CFLAGS) $(SRC) -o $(OUT)

clean:
	rm -f *.o $(OUT)
源码实现如下
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
/*
 * Userspace view of one task entry read out of the mapped buffer.
 * NOTE(review): the layout must match the kernel module's struct
 * task_info field-for-field — confirm against the kernel-side header.
 */
struct taskinfo {
int pid;   /* process id */
int tgid;  /* thread-group id */
};
/*
 * get_bits() - number of digits needed to represent @num in base @base.
 * @num:  value to measure (expected non-negative).
 * @base: radix, e.g. 10 for decimal.
 *
 * Returns at least 1 (zero needs one digit).
 *
 * Fixes over the original:
 *  - the quotient was stored in an int, silently truncating large
 *    ssize_t values; keep the full width.
 *  - comparing a signed quotient with the unsigned @base promotes the
 *    signed side to unsigned, misbehaving for negative input; cast
 *    @base to keep the comparison signed.
 */
static int get_bits(ssize_t num, unsigned int base)
{
	ssize_t quotient = num;
	int count = 1;

	while (quotient >= (ssize_t)base) {
		quotient /= (ssize_t)base;
		count++;
	}
	return count;
}
/*
 * Test client: writes a requested task count to the device, reads back
 * how many entries the module captured, mmaps the shared buffer and
 * prints pid/tgid pairs.
 *
 * Fixes over the original:
 *  - the mmap() result was stored in 'task' but the UNINITIALIZED 'addr'
 *    was tested against MAP_FAILED (undefined behavior, and failures
 *    went undetected);
 *  - buf was malloc'd with exactly the digit count, so snprintf(buf,
 *    count, ...) dropped the last digit of the number being written;
 *  - read_buf was passed to atoi() without a NUL terminator;
 *  - the print loop advanced the pointer by count * sizeof(struct
 *    taskinfo) ELEMENTS (pointer arithmetic already scales), walking off
 *    the mapping after the first entry;
 *  - munmap() was called on the advanced pointer instead of the mapping
 *    base, and buf/read_buf were leaked.
 */
int main (int argc, char ** argv)
{
	struct taskinfo *tasks;
	char *file;
	int num;
	int fd;
	int i;
	int count;
	int task_num = 0;
	ssize_t ret;
	char *buf;
	char *read_buf;
	size_t map_len;

	if (argc > 2) {
		file = argv[1];
		num = atoi(argv[2]);
	}
	else {
		perror("invalid arg\n");
		return -1;
	}
	if (num <= 0) {
		perror("invalid arg\n");
		return -1;
	}

	fd = open(file, O_RDWR);
	if (fd < 0) {
		printf("can't open %s\n", file);
		return -2;
	}

	count = get_bits(num, 10);
	/* +1 for the NUL terminator in both buffers. */
	buf = malloc(count + 1);
	read_buf = malloc(count + 1);
	if (!buf || !read_buf) {
		perror("malloc failed\n");
		close(fd);
		return -1;
	}
	snprintf(buf, count + 1, "%d", num);

	if ((ret = write(fd, buf, count)) < 0) {
		perror("write failed\n");
		return ret;
	}
	sleep(1);
	if ((ret = read(fd, read_buf, count)) < 0) {
		perror("read failed\n");
		return ret;
	}
	read_buf[ret] = '\0';		/* atoi() needs a terminated string */
	task_num = atoi(read_buf);

	map_len = num * sizeof(struct taskinfo);
	tasks = (struct taskinfo *)mmap(NULL, map_len,
			PROT_READ, MAP_SHARED, fd, 0);
	if (tasks == MAP_FAILED) {
		perror("mmap failed\n");
		return -3;
	}

	printf("pid\t\ttgid\n");
	for (i = 0; i < num && i < task_num; i++)
		printf("%d\t\t%d\n", tasks[i].pid, tasks[i].tgid);

	sleep(1);
	printf("start munmap\n");
	munmap(tasks, map_len);		/* unmap from the original base */
	sleep(1);
	printf("start close\n");
	close(fd);
	free(buf);
	free(read_buf);
	sleep(1);
	return 0;
}