上一篇<<linux内核启动过程4:内核运行时>>分析到了内核进入运行时状态(不退出),本篇分析用户空间(用户层)的加载过程。
启动应用空间
? 进入kernel_init函数,在这里做了用户空间的初始化及启动(pid)1进程工作:
static int __ref kernel_init(void *unused)
{
int ret;
kernel_init_freeable();
? 进入kernel_init_freeable函数:
/*
* Wait until kthreadd is all set-up.
*/
wait_for_completion(&kthreadd_done);
/* Now the scheduler is fully set up and can do blocking allocations */
gfp_allowed_mask = __GFP_BITS_MASK;
// 调度器已经完全设置好,可以执行阻塞分配
/*
* init can allocate pages on any node
*/
set_mems_allowed(node_states[N_MEMORY]);
//设置init可以在任何拥有内存的节点(node)上分配页(常规、高端、可移动内存)
? 等待kthreadd(pid 2)进程设置完成后(执行到complete(&kthreadd_done)),继续向下执行,此时调度器已经进入工作状态
cad_pid = task_pid(current);
//保存init id,cad_pid用于在内核启动过程中执行ctrl-alt-del重新启动(默认值为yes)
smp_prepare_cpus(setup_max_cpus);
workqueue_init();
// 这是两阶段工作队列子系统初始化的后半部分,初始化numa队列(cpu可使用numa内存),初始化工作池等等
init_mm_internals();
...
? 赋值cad_pid,初始化工作队列等等
/*
* check if there is an early userspace init. If yes, let it do all
* the work
*/
if (init_eaccess(ramdisk_execute_command) != 0) {
ramdisk_execute_command = NULL;
prepare_namespace();
}
? 进入prepare_namespace函数:
/*
* wait for the known devices to complete their probing
*
* Note: this is a potential source of long boot delays.
* For example, it is not atypical to wait 5 seconds here
* for the touchpad of a laptop to initialize.
*/
wait_for_device_probe();
md_run_setup();
if (saved_root_name[0]) {
root_device_name = saved_root_name;
if (!strncmp(root_device_name, "mtd", 3) ||
!strncmp(root_device_name, "ubi", 3)) {
mount_block_root(root_device_name, root_mountflags);
goto out;
}
ROOT_DEV = name_to_dev_t(root_device_name);
if (strncmp(root_device_name, "/dev/", 5) == 0)
root_device_name += 5;
}
if (initrd_load())
goto out;
? 进入initrd_load函数:
#ifdef CONFIG_BLK_DEV_INITRD
bool __init initrd_load(void);
#else
static inline bool initrd_load(void) { return false; }
#endif
? 默认CONFIG_BLK_DEV_INITRD未开启,这里直接执行 return false; 即不加载initrd
mount_root();
||
\/
#ifdef CONFIG_BLOCK
{
int err = create_dev("/dev/root", ROOT_DEV);
if (err < 0)
pr_emerg("Failed to create /dev/root: %d\n", err);
mount_block_root("/dev/root", root_mountflags);
}
#endif
? 创建/dev/root(CONFIG_SECURITY_PATH未开启,只是创建路径)
out:
devtmpfs_mount();
// devtmpfs挂载到/dev目录
init_mount(".", "/", NULL, MS_MOVE, NULL);
// 挂载根分区到/目录(默认分区为/dev/mapper/cl-root,可以自定义分区)
init_chroot(".");
//设置系统根位置(环境)到/
? 挂载devtmpfs(/dev),系统根位置等等,回到kernel_init_freeable函数:
/*
* Ok, we have completed the initial bootup, and
* we're essentially up and running. Get rid of the
* initmem segments and start the user-mode stuff..
*
* rootfs is available now, try loading the public keys
* and default modules
*/
integrity_load_keys();
}
? 现在已经完成了初始启动,去掉initmem段并启动用户模式,回到kernel_init函数:
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
kprobe_free_init_mem();
ftrace_free_init_mem();
free_initmem();
? 释放部分内存,放回buddy系统(以供后续使用)
/*
* Kernel mappings are now finalized - update the userspace page-table
* to finalize PTI.
*/
pti_finalize();
// 更新(克隆)用户空间页表
system_state = SYSTEM_RUNNING;
numa_default_policy();
// 有好多空壳函数,一般不加注释
rcu_end_inkernel_boot();
do_sysctl_args();
if (ramdisk_execute_command) { // 没有执行
ret = run_init_process(ramdisk_execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d)\n",
ramdisk_execute_command, ret);
}
/*
* We try each of these until one succeeds.
*
* The Bourne shell can be used instead of init if we are
* trying to recover a really broken machine.
*/
if (execute_command) { // 没有执行
ret = run_init_process(execute_command);
if (!ret)
return 0;
panic("Requested init %s failed (error %d).",
execute_command, ret);
}
if (CONFIG_DEFAULT_INIT[0] != '\0') { // 没有执行,CONFIG_DEFAULT_INIT=""
ret = run_init_process(CONFIG_DEFAULT_INIT);
if (ret)
pr_err("Default init %s failed (error %d)\n",
CONFIG_DEFAULT_INIT, ret);
else
return 0;
}
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;
? 更新(克隆)用户空间页表后,执行init程序,这里执行到了/sbin/init(在systemd安装包中,查看src/core/main.c),查看main函数执行流程:
int main(int argc, char *argv[]) {
dual_timestamp initrd_timestamp = DUAL_TIMESTAMP_NULL, userspace_timestamp = DUAL_TIMESTAMP_NULL, kernel_timestamp = DUAL_TIMESTAMP_NULL,
security_start_timestamp = DUAL_TIMESTAMP_NULL, security_finish_timestamp = DUAL_TIMESTAMP_NULL;
struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0), saved_rlimit_memlock = RLIMIT_MAKE_CONST((rlim_t) -1);
bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false, reexecute = false;
char *switch_root_dir = NULL, *switch_root_init = NULL;
usec_t before_startup, after_startup;
static char systemd[] = "systemd";
char timespan[FORMAT_TIMESPAN_MAX];
const char *shutdown_verb = NULL, *error_message = NULL;
int r, retval = EXIT_FAILURE;
Manager *m = NULL;
FDSet *fds = NULL;
redirect_telinit(argc, argv);
//重定向telinit(以telinit名称被调用时,转交systemctl执行)
? 执行systemctl
/* Figure out whether we need to do initialize the system, or if we already did that because we are
* reexecuting */
skip_setup = early_skip_setup_check(argc, argv);
/* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent reexecution we
* are then called 'systemd'. That is confusing, hence let's call us systemd right-away. */
program_invocation_short_name = systemd;
(void) prctl(PR_SET_NAME, systemd);
/* Save the original command line */
saved_argv = argv;
saved_argc = argc;
? 检查是否因重新执行(reexecute)而可以跳过系统初始化,设置进程名称为systemd
if (getpid_cached() == 1) {
/* When we run as PID 1 force system mode */
arg_system = true;
? 以(pid)1运行时,强制使用系统模式(arg_system = true)
if (detect_container() <= 0) {
//detect_container函数检查是否已经设置容器(如lxc、lxc-libvirt、docker等),如果没有设置容器相关变量跳转到check_sched,检查/proc/1/sched文件(记录着thread相关参数)
? 重定向telinit,检查进程是否重新执行(switch-root;默认情况下为系统初始化启动或序列化),检查是否设置容器
/* Running outside of a container as PID 1 */
log_set_target(LOG_TARGET_KMSG);
log_open();
// 打开控制台日志,/dev/kmsg
if (in_initrd()) // 没有打开initrd参数(内核配置)
initrd_timestamp = userspace_timestamp;
if (!skip_setup) {
r = mount_setup_early();
if (r < 0) {
error_message = "Failed to mount early API filesystems";
goto finish;
}
/* Let's open the log backend a second time, in case the first time didn't
* work. Quite possibly we have mounted /dev just now, so /dev/kmsg became
* available, and it previously wasn't. */
log_open();
r = initialize_security(
&loaded_policy,
&security_start_timestamp,
&security_finish_timestamp,
&error_message);
// 初始化安全相关模式,如selinux、smack
if (r < 0)
goto finish;
}
...
initialize_coredump(skip_setup);
// 初始化核心转储
? 打开日志功能(如控制台日志,/dev/kmsg),初始化安全相关模块
r = fixup_environment();
if (r < 0) {
log_emergency_errno(r, "Failed to fix up PID 1 environment: %m");
error_message = "Failed to fix up PID1 environment";
goto finish;
}
? 检查pid、控制台等是否正确/启动
if (arg_system) {
/* Try to figure out if we can use colors with the console. No need to do that for user instances since
* they never log into the console. */
log_show_color(colors_enabled());
// 检查控制台是否支持彩色,tty为关闭彩色
r = make_null_stdio();
// 将stdin、stdout和stderr重定向(连接)到/dev/null
if (r < 0)
log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m");
}
? 检查控制台,设置stdin、stdout和stderr
/* Mount /proc, /sys and friends, so that /proc/cmdline and
* /proc/$PID/fd is available. */
if (getpid_cached() == 1) {
/* Load the kernel modules early. */
if (!skip_setup)
kmod_setup();
// 加载部分内核模块,如autofs4、ip_tables等等
r = mount_setup(loaded_policy);
// 创建一些目录,如/run/systemd、/run/systemd/inaccessible(mknod相关函数)
if (r < 0) {
error_message = "Failed to mount API filesystems";
goto finish;
}
}
? 加载部分内核模块,创建一些特殊文件
/* Reset all signal handlers. */
(void) reset_all_signal_handlers();
(void) ignore_signals(SIGNALS_IGNORE, -1);
r = load_configuration(argc, argv, &saved_rlimit_nofile, &saved_rlimit_memlock, &error_message);
// 解析传入参数
if (r < 0)
goto finish;
r = safety_checks();
// 检查环境变量,及根路径(/proc/1/root保存根内文件夹/文件名称)
if (r < 0)
goto finish;
? 解析传参,检查环境变量等
if (arg_action == ACTION_RUN) {
/* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
log_close();
/* Remember open file descriptors for later deserialization */
r = collect_fds(&fds, &error_message);
if (r < 0)
goto finish;
/* Give up any control of the console, but make sure its initialized. */
setup_console_terminal(skip_setup);
/* Open the logging devices, if possible and necessary */
log_open();
}
log_execution_mode(&first_boot);
r = initialize_runtime(skip_setup,
&saved_rlimit_nofile,
&saved_rlimit_memlock,
&error_message);
// 初始化cpu运行时间,及获取nofile和memlock限制数
if (r < 0)
goto finish;
? 设置fds,初始化cpu运行时间
r = manager_new(arg_system ? UNIT_FILE_SYSTEM : UNIT_FILE_USER,
arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0,
&m);
if (r < 0) {
log_emergency_errno(r, "Failed to allocate manager object: %m");
error_message = "Failed to allocate manager object";
goto finish;
}
? 设置管理器参数、环境等等
m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp;
m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp;
m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp;
m->timestamps[MANAGER_TIMESTAMP_SECURITY_START] = security_start_timestamp;
m->timestamps[MANAGER_TIMESTAMP_SECURITY_FINISH] = security_finish_timestamp;
set_manager_defaults(m);
set_manager_settings(m);
manager_set_first_boot(m, first_boot);
/* Remember whether we should queue the default job */
queue_default_job = !arg_serialization || arg_switched_root;
before_startup = now(CLOCK_MONOTONIC);
r = manager_startup(m, arg_serialization, fds);
// 管理器启动,设置cgroups_agent、设置事件io(sd_event_add_io)、事件源等
if (r < 0) {
log_error_errno(r, "Failed to fully start up daemon: %m");
error_message = "Failed to start up manager";
goto finish;
}
? 设置、启动管理器
after_startup = now(CLOCK_MONOTONIC);
...
(void) invoke_main_loop(m,
&saved_rlimit_nofile,
&saved_rlimit_memlock,
&reexecute,
&retval,
&shutdown_verb,
&fds,
&switch_root_dir,
&switch_root_init,
&error_message);
? 正常流程到这里用户空间加载完成,并循环检查调用(systemd在root用户中执行),后面的内容属于序列化或者其他异常操作(如非系统初始化执行systemd)
后续内容(如非系统初始化执行systemd)
finish:
pager_close();
if (m) {
arg_shutdown_watchdog = m->shutdown_watchdog;
m = manager_free(m);
}
reset_arguments();
mac_selinux_finish();
if (reexecute)
do_reexecute(argc, argv,
&saved_rlimit_nofile,
&saved_rlimit_memlock,
fds,
switch_root_dir,
switch_root_init,
&error_message); /* This only returns if reexecution failed */
...
目录预览
<<linux内核make menuconfig执行过程>> <<linux内核make执行过程>> <<linux内核启动过程>> <<linux内核压缩制作bzImage>> <<linux内核启动过程2:保护模式执行流程>> <<linux内核启动过程3:内核初始化阶段>> <<linux内核启动过程4:内核运行时>>
|