前言
进程是处于执行期的程序以及它所管理的资源(如打开的文件、挂起的信号、进程状态、地址空间等等)的总称。注意,程序并不是进程,实际上两个或多个进程不仅有可能执行同一程序,而且还有可能共享地址空间等资源。 Linux 里面,无论是进程,还是线程,到了内核里面,我们统一都叫任务(Task),由一个统一的结构 task_struct 进行管理。
一、task_struct的定义
task_struct的定义位于内核代码:include/linux/sched.h文件中。
/*
 * struct task_struct - the kernel's per-task descriptor.
 *
 * As the surrounding article explains, both processes and threads are
 * represented inside the kernel as a "task" managed through this one
 * structure. Many members exist only when the matching CONFIG_* option is
 * enabled, so the actual layout depends on the kernel configuration.
 */
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
struct thread_info thread_info;
#endif
/* Task run state; holds the TASK_* bit values listed later in the article. */
volatile long state;
void *stack;
atomic_t usage;
/* Task flags; holds the PF_* bit values listed later in the article. */
unsigned int flags;
unsigned int ptrace;
#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
#ifdef CONFIG_THREAD_INFO_IN_TASK
unsigned int cpu;
#endif
unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
struct task_struct *last_wakee;
int wake_cpu;
#endif
/*
 * Scheduling-related members (the article groups everything from on_rq
 * down to sched_info under "process scheduling").
 */
int on_rq;
int prio, static_prio, normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
#ifdef CONFIG_SCHED_WALT
struct ravg ravg;
u32 init_load_pct;
u64 last_sleep_ts;
#endif
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
struct sched_dl_entity dl;
#ifdef CONFIG_PREEMPT_NOTIFIERS
struct hlist_head preempt_notifiers;
#endif
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
/* Scheduling policy (SCHED_NORMAL, SCHED_FIFO, ... per the article). */
unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
union rcu_special rcu_read_unlock_special;
struct list_head rcu_node_entry;
struct rcu_node *rcu_blocked_node;
#endif
#ifdef CONFIG_TASKS_RCU
unsigned long rcu_tasks_nvcsw;
bool rcu_tasks_holdout;
struct list_head rcu_tasks_holdout_list;
int rcu_tasks_idle_cpu;
#endif
#ifdef CONFIG_SCHED_INFO
struct sched_info sched_info;
#endif
struct list_head tasks;
#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
#endif
struct mm_struct *mm, *active_mm;
u64 vmacache_seqnum;
struct vm_area_struct *vmacache[VMACACHE_SIZE];
#if defined(SPLIT_RSS_COUNTING)
struct task_rss_stat rss_stat;
#endif
/* Exit-related state; the article lists these alongside state and flags. */
int exit_state;
int exit_code, exit_signal;
int pdeath_signal;
unsigned long jobctl;
unsigned int personality;
/* One-bit flags; the zero-width field below starts a new storage unit. */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
unsigned sched_remote_wakeup:1;
#ifdef CONFIG_PSI
unsigned sched_psi_wake_requeue:1;
#endif
unsigned :0;
unsigned in_execve:1;
unsigned in_iowait:1;
#if !defined(TIF_RESTORE_SIGMASK)
unsigned restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
unsigned memcg_may_oom:1;
#ifndef CONFIG_SLOB
unsigned memcg_kmem_skip_account:1;
#endif
#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
unsigned no_cgroup_migration:1;
#endif
unsigned long atomic_flags;
struct restart_block restart_block;
/*
 * Task identifiers: pid is this task's own id, tgid is the thread group
 * id (the pid of the group's main thread, per the article).
 */
pid_t pid;
pid_t tgid;
#ifdef CONFIG_CC_STACKPROTECTOR
unsigned long stack_canary;
#endif
struct task_struct __rcu *real_parent;
struct task_struct __rcu *parent;
struct list_head children;
struct list_head sibling;
/* Points to the main thread of the thread group (itself if single-threaded). */
struct task_struct *group_leader;
struct list_head ptraced;
struct list_head ptrace_entry;
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;
struct list_head thread_node;
struct completion *vfork_done;
int __user *set_child_tid;
int __user *clear_child_tid;
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
#ifdef CONFIG_CPU_FREQ_TIMES
u64 *time_in_state;
unsigned int max_state;
#endif
struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqcount_t vtime_seqcount;
unsigned long long vtime_snap;
enum {
VTIME_INACTIVE = 0,
VTIME_USER,
VTIME_SYS,
} vtime_snap_whence;
#endif
#ifdef CONFIG_NO_HZ_FULL
atomic_t tick_dep_mask;
#endif
unsigned long nvcsw, nivcsw;
u64 start_time;
u64 real_start_time;
unsigned long min_flt, maj_flt;
struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
const struct cred __rcu *ptracer_cred;
const struct cred __rcu *real_cred;
const struct cred __rcu *cred;
char comm[TASK_COMM_LEN];
struct nameidata *nameidata;
#ifdef CONFIG_SYSVIPC
struct sysv_sem sysvsem;
struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
#endif
struct fs_struct *fs;
struct files_struct *files;
struct nsproxy *nsproxy;
/*
 * Signal handling: blocked holds signals that are blocked for now,
 * pending holds signals waiting to be handled, and sighand is used when
 * a signal is being handled by a handler function (per the article).
 */
struct signal_struct *signal;
struct sighand_struct *sighand;
sigset_t blocked, real_blocked;
sigset_t saved_sigmask;
struct sigpending pending;
/* The three sas_ss_* members describe a separate stack for signal handlers. */
unsigned long sas_ss_sp;
size_t sas_ss_size;
unsigned sas_ss_flags;
struct callback_head *task_works;
struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
kuid_t loginuid;
unsigned int sessionid;
#endif
struct seccomp seccomp;
u32 parent_exec_id;
u32 self_exec_id;
spinlock_t alloc_lock;
raw_spinlock_t pi_lock;
struct wake_q_node wake_q;
#ifdef CONFIG_RT_MUTEXES
struct rb_root pi_waiters;
struct rb_node *pi_waiters_leftmost;
struct rt_mutex_waiter *pi_blocked_on;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
int hardirqs_enabled;
int hardirq_context;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
unsigned int softirq_disable_event;
unsigned int softirq_enable_event;
int softirqs_enabled;
int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
/* Capacity of the held_locks[] array below. */
# define MAX_LOCK_DEPTH 48UL
u64 curr_chain_key;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
gfp_t lockdep_reclaim_gfp;
#endif
#ifdef CONFIG_UBSAN
unsigned int in_ubsan;
#endif
void *journal_info;
struct bio_list *bio_list;
#ifdef CONFIG_BLOCK
struct blk_plug *plug;
#endif
struct reclaim_state *reclaim_state;
struct backing_dev_info *backing_dev_info;
struct io_context *io_context;
unsigned long ptrace_message;
siginfo_t *last_siginfo;
struct task_io_accounting ioac;
#ifdef CONFIG_PSI
unsigned int psi_flags;
#endif
#if defined(CONFIG_TASK_XACCT)
u64 acct_rss_mem1;
u64 acct_vm_mem1;
cputime_t acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed;
seqcount_t mems_allowed_seq;
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
struct css_set __rcu *cgroups;
struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy;
short il_next;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
int numa_preferred_nid;
unsigned long numa_migrate_retry;
u64 node_stamp;
u64 last_task_numa_placement;
u64 last_sum_exec_runtime;
struct callback_head numa_work;
struct list_head numa_entry;
struct numa_group *numa_group;
unsigned long *numa_faults;
unsigned long total_numa_faults;
unsigned long numa_faults_locality[3];
unsigned long numa_pages_migrated;
#endif
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
struct tlbflush_unmap_batch tlb_ubc;
#endif
struct rcu_head rcu;
struct pipe_inode_info *splice_pipe;
struct page_frag task_frag;
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
int nr_dirtied;
int nr_dirtied_pause;
unsigned long dirty_paused_when;
#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
u64 timer_slack_ns;
u64 default_timer_slack_ns;
#ifdef CONFIG_KASAN
unsigned int kasan_depth;
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
int curr_ret_stack;
struct ftrace_ret_stack *ret_stack;
unsigned long long ftrace_timestamp;
atomic_t trace_overrun;
atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
unsigned long trace;
unsigned long trace_recursion;
#endif
#ifdef CONFIG_KCOV
enum kcov_mode kcov_mode;
unsigned kcov_size;
void *kcov_area;
struct kcov *kcov;
#endif
#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg_in_oom;
gfp_t memcg_oom_gfp_mask;
int memcg_oom_order;
unsigned int memcg_nr_pages_over_high;
#endif
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long task_state_change;
#endif
int pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
#endif
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
atomic_t stack_refcount;
#endif
struct thread_struct thread;
};
单从代码来看,结构体task_struct还是很复杂的,所以我们将其进行拆解,从几个维度上去进行分析和记忆。
二、 解析task_struct
1.任务ID
代码如下(示例):
pid_t pid;
pid_t tgid;
struct task_struct *group_leader;
pid是process id,tgid是thread group id,group_leader是一个指针,指向线程组中主线程的task_struct。 既然ID是唯一标识,为何要弄出pid,tgid和group_leader? 主要是解决两个问题: 一是任务展示,可以选择展示当前用户创建的任务,而不是列出所有的任务。 二是给任务下发指令,若使用kill给多线程中的1个线程发送信号,不能只退出这个线程,而是应该退出整个进程。
一个进程,如果只有主线程,pid是自己,tgid是自己,group_leader也是指向自己。 一个进程,如果创建了多个线程,除主线程外的其它线程会有自己的pid,tgid是进程的主线程pid,group_leader指向的就是进程的主线程。
2.信号处理
代码如下(示例):
struct signal_struct *signal;
struct sighand_struct *sighand;
sigset_t blocked, real_blocked;
sigset_t saved_sigmask;
struct sigpending pending;
unsigned long sas_ss_sp;
size_t sas_ss_size;
unsigned sas_ss_flags;
此处定义了信号不同的处理方式:阻塞暂不处理(blocked),等待处理(pending),正在通过信号处理函数进行处理(sighand)。 信号处理函数默认使用用户态的函数栈,除此之外,也可以开辟新的栈专门用于信号处理,sas_ss开头的三个变量就用于开辟新的栈。
3.任务状态
代码如下(示例):
volatile long state;
unsigned int flags;
int exit_state;
int exit_code, exit_signal;
state的取值定义在include/linux/sched.h中
/*
 * Task state values. Each non-zero value occupies a single bit, so states
 * can be combined with bitwise OR (see EXIT_TRACE below).
 */
/* Ready to run: either running now or waiting for its next time slice. */
#define TASK_RUNNING 0
/* Light sleep: an incoming signal still wakes the task. */
#define TASK_INTERRUPTIBLE 1
/* Deep sleep: signals do not wake the task. */
#define TASK_UNINTERRUPTIBLE 2
#define __TASK_STOPPED 4
#define __TASK_TRACED 8
/* Final state of a task. */
#define EXIT_DEAD 16
/* Terminated, but the parent has not yet collected its exit status via wait(). */
#define EXIT_ZOMBIE 32
#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
#define TASK_DEAD 64
#define TASK_WAKEKILL 128
#define TASK_WAKING 256
#define TASK_PARKED 512
#define TASK_NOLOAD 1024
#define TASK_NEW 2048
#define TASK_STATE_MAX 4096
state的值是通过bitset的方式设置的,state的每1个bit就代表一种状态。
TASK_RUNNING 并不是说进程正在运行,而是表示进程在时刻准备运行的状态。当处于这个状态的进程获得时间片的时候,就是在运行中;如果没有获得时间片,就说明它被其他进程抢占了,在等待再次分配时间片。
运行中的进程,在需要进行一些I/O操作时,需要等待I/O完毕,在等待的时候会释放CPU,进入睡眠状态。 在Linux中,有两种睡眠状态。TASK_INTERRUPTIBLE,可中断的睡眠状态。这是一种浅睡眠的状态,这个时候当有信号到来的时候,进程还是要被唤醒。唤醒后,进行信号处理。TASK_UNINTERRUPTIBLE,不可中断的睡眠状态。这是一种深度睡眠状态,不可被信号唤醒,只能等待I/O操作完成。当I/O操作因为特殊原因不能完成时,这个时候,设备不能叫醒这个进程。这是一种非常危险的情况。 针对这种情况,设计了一种新的进程睡眠状态,TASK_KILLABLE,可以终止的新睡眠状态。进程处于这种状态中,它的运行原理类似TASK_UNINTERRUPTIBLE,但是可以响应致命信号。这一点可以从TASK_KILLABLE的定义看出: TASK_KILLABLE = TASK_WAKEKILL | TASK_UNINTERRUPTIBLE
/* Uninterruptible sleep that can still respond to fatal signals. */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
TASK_STOPPED是在进程接收到SIGSTOP、SIGTTIN、SIGTSTP、SIGTTOU信号之后进入的状态。 TASK_TRACED表示进程被debugger等进程监视,进程的执行会被调试程序所停止。当一个进程被其它的进程监视时,每一个信号都会让进程进入该状态。 进程中5种互斥的状态: TASK_RUNNING、TASK_INTERRUPTIBLE、TASK_UNINTERRUPTIBLE、TASK_STOPPED、TASK_TRACED。 2个终止状态: EXIT_ZOMBIE,进程的执行被终止,但是父进程还没有使用wait()等系统调用来获取到它的终止信息,此时进程为僵尸进程。EXIT_DEAD,进程的最终状态。
4.任务标记
任务标记,反映进程状态的信息,但不是运行状态,用于内核识别进程的当前状态,以进行下一步操作。 flags的取值如下,定义在include/linux/sched.h中
/* Per-task flag bits stored in task_struct.flags; each flag is one bit. */
/* The task is in the middle of exiting. */
#define PF_EXITING 0x00000004
#define PF_EXITPIDONE 0x00000008
/* The task runs on a virtual CPU. */
#define PF_VCPU 0x00000010
#define PF_WQ_WORKER 0x00000020
/* fork has completed but exec has not been called yet. */
#define PF_FORKNOEXEC 0x00000040
#define PF_MCE_PROCESS 0x00000080
#define PF_SUPERPRIV 0x00000100
#define PF_DUMPCORE 0x00000200
#define PF_SIGNALED 0x00000400
#define PF_MEMALLOC 0x00000800
#define PF_NPROC_EXCEEDED 0x00001000
#define PF_USED_MATH 0x00002000
#define PF_USED_ASYNC 0x00004000
#define PF_NOFREEZE 0x00008000
#define PF_FROZEN 0x00010000
#define PF_FSTRANS 0x00020000
#define PF_KSWAPD 0x00040000
#define PF_MEMALLOC_NOIO 0x00080000
#define PF_LESS_THROTTLE 0x00100000
#define PF_KTHREAD 0x00200000
#define PF_RANDOMIZE 0x00400000
#define PF_SWAPWRITE 0x00800000
#define PF_MEMSTALL 0x01000000
#define PF_NO_SETAFFINITY 0x04000000
#define PF_MCE_EARLY 0x08000000
#define PF_MUTEX_TESTER 0x20000000
#define PF_FREEZER_SKIP 0x40000000
#define PF_SUSPEND_TASK 0x80000000
PF_EXITING 表示正在退出。当这个flag被设置时,函数find_alive_thread检测到这个flag就直接跳过,不认为它是一个活着的线程。 PF_FORKNOEXEC 表示fork结束,还没有exec。在_do_fork里调用copy_process时,会把flag设置成PF_FORKNOEXEC。在exec中调用了load_elf_binary时,会把这个flag去掉。 PF_VCPU表示进程运行在虚拟CPU上。
5.进程调度
进程的状态涉及调度,task_struct中和调度相关的字段
int on_rq;
int prio, static_prio, normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
#ifdef CONFIG_SCHED_WALT
struct ravg ravg;
u32 init_load_pct;
u64 last_sleep_ts;
#endif
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
struct sched_dl_entity dl;
#ifdef CONFIG_PREEMPT_NOTIFIERS
struct hlist_head preempt_notifiers;
#endif
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
union rcu_special rcu_read_unlock_special;
struct list_head rcu_node_entry;
struct rcu_node *rcu_blocked_node;
#endif
#ifdef CONFIG_TASKS_RCU
unsigned long rcu_tasks_nvcsw;
bool rcu_tasks_holdout;
struct list_head rcu_tasks_holdout_list;
int rcu_tasks_idle_cpu;
#endif
#ifdef CONFIG_SCHED_INFO
struct sched_info sched_info;
#endif
优先级范围 实时进程优先级范围是0到MAX_RT_PRIO-1(99),而普通进程优先级范围是MAX_RT_PRIO到MAX_PRIO-1(100到139)。值越小优先级越高。 调度策略 SCHED_NORMAL 用于普通进程,通过CFS调度器实现。 SCHED_BATCH 是SCHED_NORMAL策略的分化版本,采用分时策略,根据动态优先级,分配CPU资源。 SCHED_IDLE 在系统空闲时,才会跑这类进程。 SCHED_FIFO 先入先出调度算法,用于实时进程,相同优先级的进程,先到先得,高优先级的任务可以抢占低优先级的任务。 SCHED_RR 轮流调度算法,用于实时进程,采用时间片,相同优先级的任务当时间片用完时会放到队列尾部,高优先级的任务可以抢占低优先级的任务。 SCHED_DEADLINE 基于Earliest Deadline First(EDF)调度算法。
总结
|