目录
kernel启动流程-head.S的执行__8._primary_switch_HZero.chen的博客-CSDN博客
kernel启动流程-head.S的执行_9.__primary_switched_HZero.chen的博客-CSDN博客
linux 内核启动会有一个init_task 进程,为0号进程或idle、swapper 进程,当系统没有进程需要调度,就去执行idle 进程。它不是通过fork 动态创建的,而是在编译时静态定义的(见init/init_task.c),start_kernel 运行时直接使用。
kernel-4.19/init/init_task.c
kernel-4.19/include/linux/sched.h
struct task_struct init_task
#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
__init_task_data
#endif
= {
#ifdef CONFIG_THREAD_INFO_IN_TASK
.thread_info = INIT_THREAD_INFO(init_task),
.stack_refcount = ATOMIC_INIT(1),
#endif
.state = 0,
.stack = init_stack,
.policy = SCHED_NORMAL,
.mm = NULL,
.active_mm = &init_mm,
}
kernel-4.19/arch/arm/include/asm/thread_info.h
thread_info = INIT_THREAD_INFO(init_task),
这里用宏初始化struct thread_info,它必须是task_struct 的第一个域;
__init_task_data
kernel-4.19/include/linux/init_task.h
/* Attach to the init_task data structure for proper alignment */
#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
#define __init_task_data __attribute__((__section__(".data..init_task")))
#else
#define __init_task_data /**/
#endif
/* Attach to the thread_info data structure for proper alignment */
#define __init_thread_info __attribute__((__section__(".data..init_thread_info")))
#endif
__init_task_data 宏通过__section__ 属性把init_task 放到".data..init_task" 段中
kernel-4.19/include/asm-generic/vmlinux.lds.h
kernel-4.19/arch/arm64/kernel/vmlinux.lds.S
RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) 这里会定义section ".data..init_task"
__init_end = .;
_data = .;
_sdata = .;
RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
#define RW_DATA_SECTION(cacheline, pagealigned, inittask) \
. = ALIGN(PAGE_SIZE); \
.data : AT(ADDR(.data) - LOAD_OFFSET) { \
INIT_TASK_DATA(inittask) \
NOSAVE_DATA \
PAGE_ALIGNED_DATA(pagealigned) \
CACHELINE_ALIGNED_DATA(cacheline) \
READ_MOSTLY_DATA(cacheline) \
DATA_DATA \
CONSTRUCTORS \
} \
BUG_TABLE
#define INIT_TASK_DATA(align) \
. = ALIGN(align); \
__start_init_task = .; \
init_thread_union = .; \
init_stack = .; \
KEEP(*(.data..init_task)) \
KEEP(*(.data..init_thread_info)) \
. = __start_init_task + THREAD_SIZE; \
__end_init_task = .;
kernel-4.19/include/linux/sched/task.h
extern union thread_union init_thread_union;
kernel-4.19/include/linux/sched.h
union thread_union {
#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
struct task_struct task;
#endif
#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info thread_info;
#endif
unsigned long stack[THREAD_SIZE/sizeof(long)];
};
.stack = init_stack, #栈指向thread_info
kernel-4.19/arch/arm64/kernel/head.S
mov x29, #0
mov x30, #0
b start_kernel
ENDPROC(__primary_switched)
__primary_switched 初始化好task_struct 栈等信息,这个时候mmu 打开了,跳转到start_kernel 执行。
linux 通过fork、vfork、clone 等系统调用创建进程或线程,这些系统调用都是通过_do_fork() 实现。
kernel-4.19/kernel/fork.c
/*
2495 * Create a kernel thread.
2496 */
2497pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
2498{
2499 return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
2500 (unsigned long)arg, NULL, NULL, 0);
2501}
2502
2503#ifdef __ARCH_WANT_SYS_FORK
2504SYSCALL_DEFINE0(fork)
2505{
2506#ifdef CONFIG_MMU
2507 return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
2508#else
2509 /* can not support in nommu mode */
2510 return -EINVAL;
2511#endif
2512}
2513#endif
2514
2515#ifdef __ARCH_WANT_SYS_VFORK
2516SYSCALL_DEFINE0(vfork)
2517{
2518 return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
2519 0, NULL, NULL, 0);
2520}
2521#endif
fork ------ SIGCHLD,子进程终止时发送SIGCHLD 给父进程,创建时只复制父进程页表,不复制页面内容(写时复制,COW)
vfork ----- CLONE_VFORK | CLONE_VM | SIGCHLD ,CLONE_VFORK 表示父进程会通过wait_for_completion() 进入睡眠等待,直到子进程调用execve() 或exit() 释放虚拟内存;CLONE_VM 表示父子进程共享内存地址空间
clone -----创建线程,通常会创建新的栈地址
内核线程 ----- flags|CLONE_VM|CLONE_UNTRACED
do_fork(){
....
p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
2449 pid = get_task_pid(p, PIDTYPE_PID);
2450 nr = pid_vnr(pid);
2459 if (clone_flags & CLONE_VFORK) {
2460 p->vfork_done = &vfork;
2461 init_completion(&vfork);
2462 get_task_struct(p);
2463 }
#vfork 创建完子进程,需要保证子进程先运行,因此init_completion(&vfork); 父进程等待。
#在调用exec 或 exit 之前父子进程是共享数据的,调用之后父进程才能被调度。
wake_up_new_task(p);
#唤醒加入调度器
2471 if (clone_flags & CLONE_VFORK) {
2472 if (!wait_for_vfork_done(p, &vfork))
2473 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
2474 }
2475
2476 put_pid(pid);
2477 return nr;
#最后父进程返回用户空间,返回创建的进程pid;子进程返回 0
}
static __latent_entropy struct task_struct *copy_process {
/*
* Don't allow sharing the root directory with processes in a different
* namespace
*/
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
return ERR_PTR(-EINVAL);
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
*/
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
return ERR_PTR(-EINVAL);
/*
* Shared signal handlers imply shared VM. By way of the above,
* thread groups also imply shared VM. Blocking this case allows
* for various simplifications in other code.
*/
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL);
/*
* Siblings of global init remain as zombies on exit since they are
* not reaped by their parent (swapper). To solve this and to avoid
* multi-rooted process trees, prevent global and container-inits
* from creating siblings.
*/
if ((clone_flags & CLONE_PARENT) &&
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
CLONE_NEWNS: 父进程跟子进程不共享 mount namespace
CLONE_NEWUSER: 父进程跟子进程不共享user namespace ,即不同的userid 和 group id 映射。user namespace 相当于一个隔离容器,容器里第一个进程uid 是0,即root 用户,但并非系统特权root 用户
CLONE_SIGHAND:父子进程共享信号处理相关信息
CLONE_THREAD:父子进程在同一个线程组里;POSIX 规定一个进程内部多个线程共享一个PID,但是linux 对线程和进程平等对待分配PID.linux就多了一个线程组概念(thread group)。sys_getpid() 系统调用返回tgid(thread group id),sys_gettid 返回线程PID。
CLONE_PARENT:创建兄弟进程,也就是有共同的父进程;对linux 内核来说进程鼻祖是idle 或swapper 进程,但对用户空间是init 进程,所以用户空间进程由init 创建或派生,只有init 才能设置SIGNAL_UNKILLABLE。如果init 或容器init进程使用CLONE_PARENT 创建兄弟进程,在其退出时,将无法由init 回收,idle 也无能为力,就会成为僵尸进程。
p = dup_task_struct(current, node);
p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
#取消超级用户权限,告知系统不是worker 线程
p->flags |= PF_FORKNOEXEC;
#PF_FORKNOEXEC 标记进程已经fork 但尚未执行exec
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
#新进程的子进程和兄弟进程列表
#设置task_struct 时间、虚拟时间
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
#初始化task_struct 对应的sched_entity 调度实体,每个进程或线程有一个调度实体,另外也包含组调度(sched group);以及调度策略和调度类
retval = copy_files(clone_flags, p);
retval = copy_fs(clone_flags, p);
retval = copy_sighand(clone_flags, p);
retval = copy_signal(clone_flags, p);
retval = copy_mm(clone_flags, p);
retval = copy_namespaces(clone_flags, p);
retval = copy_io(clone_flags, p);
retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
}
}
#创建pid 结构体,init_struct_pid 是init_task 默认的
futex_init_task(p);
/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) { //父子进程共享线程组
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
p->group_leader = p;
p->tgid = p->pid;
}
设置线程组group_leader 和tgid 组id
p->start_time = ktime_get_ns();
p->real_start_time = ktime_get_boot_ns();
2192
2193 /* CLONE_PARENT re-uses the old parent */
2194 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { //CLONE_PARENT 兄弟进程
2195 p->real_parent = current->real_parent;
2196 p->parent_exec_id = current->parent_exec_id;
2197 if (clone_flags & CLONE_THREAD)
2198 p->exit_signal = -1;
2199 else
2200 p->exit_signal = current->group_leader->exit_signal;
2201 } else {
2202 p->real_parent = current;
2203 p->parent_exec_id = current->self_exec_id;
2204 p->exit_signal = (clone_flags & CSIGNAL);
2205 }
2206
2243 if (likely(p->pid)) {
2244 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
2245
2246 init_task_pid(p, PIDTYPE_PID, pid);
2247 if (thread_group_leader(p)) {
2248 init_task_pid(p, PIDTYPE_TGID, pid);
2249 init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
2250 init_task_pid(p, PIDTYPE_SID, task_session(current));
2251
2252 if (is_child_reaper(pid)) {
2253 ns_of_pid(pid)->child_reaper = p;
2254 p->signal->flags |= SIGNAL_UNKILLABLE;
2255 }
2256 p->signal->shared_pending.signal = delayed.signal;
2257 p->signal->tty = tty_kref_get(current->signal->tty);
.....
attach_pid(p, PIDTYPE_PID);
2288 nr_threads++;
2289 }
2290 total_forks++;
#nr_threads 和 total_forks 计数
}
dup_task_struct 创建task_struct 、thread_info 作为stack、vm;
arch_dup_task_struct 将父进程task_struct 复制到子进程;setup_thread_stack(tsk, orig)将父进程thread_info 复制到子进程;
clear_user_return_notifier(tsk) 清除TIF_USER_RETURN_NOTIFY 标志;clear_tsk_need_resched(tsk) 清除thread_info->flags 中的TIF_NEED_RESCHED,新进程刚诞生,不希望被调度。
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
858{
859 struct task_struct *tsk;
860 unsigned long *stack;
861 struct vm_struct *stack_vm_area;
862 int err;
863
864 if (node == NUMA_NO_NODE)
865 node = tsk_fork_get_node(orig);
866 tsk = alloc_task_struct_node(node);
867 if (!tsk)
868 return NULL;
869
870 stack = alloc_thread_stack_node(tsk, node);
871 if (!stack)
872 goto free_tsk;
873
874 stack_vm_area = task_stack_vm_area(tsk);
875
876 err = arch_dup_task_struct(tsk, orig);
878 /*
879 * arch_dup_task_struct() clobbers the stack-related fields. Make
880 * sure they're properly initialized before using any stack-related
881 * functions again.
882 */
883 tsk->stack = stack;
884#ifdef CONFIG_VMAP_STACK
885 tsk->stack_vm_area = stack_vm_area;
886#endif
887#ifdef CONFIG_THREAD_INFO_IN_TASK
888 atomic_set(&tsk->stack_refcount, 1);
889#endif
890
891 if (err)
892 goto free_stack;
893
894 err = scs_prepare(tsk, node);
895 if (err)
896 goto free_stack;
897
898#ifdef CONFIG_SECCOMP
899 /*
900 * We must handle setting up seccomp filters once we're under
901 * the sighand lock in case orig has changed between now and
902 * then. Until then, filter must be NULL to avoid messing up
903 * the usage counts on the error path calling free_task.
904 */
905 tsk->seccomp.filter = NULL;
906#endif
907
908 setup_thread_stack(tsk, orig);
909 clear_user_return_notifier(tsk);
910 clear_tsk_need_resched(tsk);
911 set_task_stack_end_magic(tsk);
912
913#ifdef CONFIG_STACKPROTECTOR
914 tsk->stack_canary = get_random_canary();
915#endif
...
}
kernel-4.19/kernel/sched/core.c
3094/*
3095 * fork()/clone()-time setup:
3096 */
3097int sched_fork(unsigned long clone_flags, struct task_struct *p)
3098{
3099 unsigned long flags;
3100 bool reset;
3101
3102 __sched_fork(clone_flags, p);
3103 /*
3104 * We mark the process as NEW here. This guarantees that
3105 * nobody will actually run it, and a signal or other external
3106 * event cannot wake it up and insert it on the runqueue either.
3107 */
3108 p->state = TASK_NEW;
3109
3110 /*
3111 * Make sure we do not leak PI boosting priority to the child.
3112 */
3113 p->prio = current->normal_prio;
#设置task 状态,优先级
3177 __set_task_cpu(p, smp_processor_id());
3178 if (p->sched_class->task_fork)
3179 p->sched_class->task_fork(p);
#把进程所在的cpu 设置为当前cpu,使用了wmb 写屏障;再调用调度类的task_fork 进行初始化
3182#ifdef CONFIG_SCHED_INFO
3183 if (likely(sched_info_on()))
3184 memset(&p->sched_info, 0, sizeof(p->sched_info));
3185#endif
3186#if defined(CONFIG_SMP)
3187 p->on_cpu = 0;
3188#endif
3189 init_task_preempt_count(p)
#设置内核抢占preempt
...
}
#设置内核抢占preempt
#kernel-4.19/include/linux/preempt.h
26 * PREEMPT_MASK: 0x000000ff
27 * SOFTIRQ_MASK: 0x0000ff00
28 * HARDIRQ_MASK: 0x000f0000
29 * NMI_MASK: 0x00100000
30 * PREEMPT_NEED_RESCHED: 0x80000000
preempt_disable(); #preempt ++
preempt_enable(); #preempt--
当thread_info->preempt_count 为0 时可以抢占;若同时thread_info->flags 中的TIF_NEED_RESCHED 置位,则调用schedule() 完成抢占调度。