Blame - kernel/exit.c - codeaurora/cp-linux

blob: ffba5df4abd54cd2dc146a09fa152ed84502e213 [file] [log] [blame]

Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame^]	1	/*
				2	* linux/kernel/exit.c
				3	*
				4	* Copyright (C) 1991, 1992 Linus Torvalds
				5	*/
				6
				7	#include <linux/mm.h>
				8	#include <linux/slab.h>
				9	#include <linux/interrupt.h>
				10	#include <linux/module.h>
				11	#include <linux/capability.h>
				12	#include <linux/completion.h>
				13	#include <linux/personality.h>
				14	#include <linux/tty.h>
				15	#include <linux/iocontext.h>
				16	#include <linux/key.h>
				17	#include <linux/security.h>
				18	#include <linux/cpu.h>
				19	#include <linux/acct.h>
				20	#include <linux/tsacct_kern.h>
				21	#include <linux/file.h>
				22	#include <linux/fdtable.h>
				23	#include <linux/freezer.h>
				24	#include <linux/binfmts.h>
				25	#include <linux/nsproxy.h>
				26	#include <linux/pid_namespace.h>
				27	#include <linux/ptrace.h>
				28	#include <linux/profile.h>
				29	#include <linux/mount.h>
				30	#include <linux/proc_fs.h>
				31	#include <linux/kthread.h>
				32	#include <linux/mempolicy.h>
				33	#include <linux/taskstats_kern.h>
				34	#include <linux/delayacct.h>
				35	#include <linux/cgroup.h>
				36	#include <linux/syscalls.h>
				37	#include <linux/signal.h>
				38	#include <linux/posix-timers.h>
				39	#include <linux/cn_proc.h>
				40	#include <linux/mutex.h>
				41	#include <linux/futex.h>
				42	#include <linux/pipe_fs_i.h>
				43	#include <linux/audit.h> /* for audit_free() */
				44	#include <linux/resource.h>
				45	#include <linux/blkdev.h>
				46	#include <linux/task_io_accounting_ops.h>
				47	#include <linux/tracehook.h>
				48	#include <linux/fs_struct.h>
				49	#include <linux/init_task.h>
				50	#include <linux/perf_event.h>
				51	#include <trace/events/sched.h>
				52	#include <linux/hw_breakpoint.h>
				53	#include <linux/oom.h>
				54	#include <linux/writeback.h>
				55	#include <linux/shm.h>
				56
				57	#include <asm/uaccess.h>
				58	#include <asm/unistd.h>
				59	#include <asm/pgtable.h>
				60	#include <asm/mmu_context.h>
				61
				62	static void exit_mm(struct task_struct *tsk);
				63
					/*
					* Unlink task @p from the PID hash and from the thread lists.
					*
					* Always decrements the global nr_threads count and detaches PIDTYPE_PID.
					* When @group_dead is true it additionally detaches PIDTYPE_PGID and
					* PIDTYPE_SID, removes @p from the ->tasks and ->sibling lists and
					* decrements the per-cpu process_counts counter.
					*/
				64	static void __unhash_process(struct task_struct *p, bool group_dead)
				65	{
				66	nr_threads--;
				67	detach_pid(p, PIDTYPE_PID);
				68	if (group_dead) {
				69	detach_pid(p, PIDTYPE_PGID);
				70	detach_pid(p, PIDTYPE_SID);
				71
				72	list_del_rcu(&p->tasks);
				73	list_del_init(&p->sibling);
				74	__this_cpu_dec(process_counts);
				75	}
					/* The per-thread links are removed regardless of @group_dead. */
				76	list_del_rcu(&p->thread_group);
				77	list_del_rcu(&p->thread_node);
				78	}
				79
				80	/*
					* Tear down the per-task signal state of the exiting task @tsk: stop its
					* posix CPU timers, fold its resource counters into the shared
					* signal_struct, unhash it and detach it from its sighand.
					*
				81	* This function expects the tasklist_lock write-locked.
				82	*/
				83	static void __exit_signal(struct task_struct *tsk)
				84	{
				85	struct signal_struct *sig = tsk->signal;
				86	bool group_dead = thread_group_leader(tsk);
				87	struct sighand_struct *sighand;
					/* tty is assigned and consumed only on the group_dead paths below. */
				88	struct tty_struct *uninitialized_var(tty);
				89	cputime_t utime, stime;
				90
				91	sighand = rcu_dereference_check(tsk->sighand,
				92	lockdep_tasklist_lock_is_held());
				93	spin_lock(&sighand->siglock);
				94
				95	posix_cpu_timers_exit(tsk);
				96	if (group_dead) {
				97	posix_cpu_timers_exit_group(tsk);
					/* Take over the tty pointer; the reference is dropped at the end. */
				98	tty = sig->tty;
				99	sig->tty = NULL;
				100	} else {
				101	/*
				102	* This can only happen if the caller is de_thread().
				103	* FIXME: this is the temporary hack, we should teach
				104	* posix-cpu-timers to handle this case correctly.
				105	*/
				106	if (unlikely(has_group_leader_pid(tsk)))
				107	posix_cpu_timers_exit_group(tsk);
				108
				109	/*
				110	* If there is any task waiting for the group exit
				111	* then notify it:
				112	*/
				113	if (sig->notify_count > 0 && !--sig->notify_count)
				114	wake_up_process(sig->group_exit_task);
				115
					/* Do not leave curr_target pointing at the departing thread. */
				116	if (tsk == sig->curr_target)
				117	sig->curr_target = next_thread(tsk);
				118	}
				119
				120	/*
				121	* Accumulate here the counters for all threads as they die. We could
				122	* skip the group leader because it is the last user of signal_struct,
				123	* but we want to avoid the race with thread_group_cputime() which can
				124	* see the empty ->thread_head list.
				125	*/
				126	task_cputime(tsk, &utime, &stime);
				127	write_seqlock(&sig->stats_lock);
				128	sig->utime += utime;
				129	sig->stime += stime;
				130	sig->gtime += task_gtime(tsk);
				131	sig->min_flt += tsk->min_flt;
				132	sig->maj_flt += tsk->maj_flt;
				133	sig->nvcsw += tsk->nvcsw;
				134	sig->nivcsw += tsk->nivcsw;
				135	sig->inblock += task_io_get_inblock(tsk);
				136	sig->oublock += task_io_get_oublock(tsk);
				137	task_io_accounting_add(&sig->ioac, &tsk->ioac);
				138	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
				139	sig->nr_threads--;
					/* The unhash happens inside the stats_lock write section. */
				140	__unhash_process(tsk, group_dead);
				141	write_sequnlock(&sig->stats_lock);
				142
				143	/*
				144	* Do this under ->siglock, we can race with another thread
				145	* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
				146	*/
				147	flush_sigqueue(&tsk->pending);
				148	tsk->sighand = NULL;
				149	spin_unlock(&sighand->siglock);
				150
					/* siglock is dropped and tsk->sighand already cleared at this point. */
				151	__cleanup_sighand(sighand);
				152	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
				153	if (group_dead) {
				154	flush_sigqueue(&sig->shared_pending);
				155	tty_kref_put(tty);
				156	}
				157	}
				158
					/*
					* RCU callback queued by release_task() through call_rcu().
					*
					* Recovers the task_struct that embeds @rhp, runs the perf and tracing
					* hooks for it and then drops one task_struct reference.
					*/
				159	static void delayed_put_task_struct(struct rcu_head *rhp)
				160	{
				161	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
				162
				163	perf_event_delayed_put(tsk);
				164	trace_sched_process_free(tsk);
				165	put_task_struct(tsk);
				166	}
				167
				168
					/*
					* Release the dead task @p.
					*
					* Drops the owning user's process count, calls proc_flush_task(), and
					* then, with tasklist_lock write-locked, releases the ptrace and signal
					* state of @p. The task_struct reference is dropped later from the RCU
					* callback delayed_put_task_struct().
					*
					* If @p was the last non-leader thread of its group and the zombie
					* leader gets auto-reaped (do_notify_parent() returns non-zero), the
					* leader is marked EXIT_DEAD and released by a second pass of the loop.
					*/
				169	void release_task(struct task_struct *p)
				170	{
				171	struct task_struct *leader;
				172	int zap_leader;
				173	repeat:
				174	/* don't need to get the RCU readlock here - the process is dead and
				175	* can't be modifying its own credentials. But shut RCU-lockdep up */
				176	rcu_read_lock();
				177	atomic_dec(&__task_cred(p)->user->processes);
				178	rcu_read_unlock();
				179
				180	proc_flush_task(p);
				181
				182	write_lock_irq(&tasklist_lock);
				183	ptrace_release_task(p);
				184	__exit_signal(p);
				185
				186	/*
				187	* If we are the last non-leader member of the thread
				188	* group, and the leader is zombie, then notify the
				189	* group leader's parent process. (if it wants notification.)
				190	*/
				191	zap_leader = 0;
				192	leader = p->group_leader;
				193	if (leader != p && thread_group_empty(leader)
				194	&& leader->exit_state == EXIT_ZOMBIE) {
				195	/*
				196	* If we were the last child thread and the leader has
				197	* exited already, and the leader's parent ignores SIGCHLD,
				198	* then we are the one who should release the leader.
				199	*/
				200	zap_leader = do_notify_parent(leader, leader->exit_signal);
				201	if (zap_leader)
				202	leader->exit_state = EXIT_DEAD;
				203	}
				204
				205	write_unlock_irq(&tasklist_lock);
				206	release_thread(p);
				207	call_rcu(&p->rcu, delayed_put_task_struct);
				208
					/* Only loops again when the leader must be released as well. */
				209	p = leader;
				210	if (unlikely(zap_leader))
				211	goto repeat;
				212	}
				213
				214	/*
				215	* Determine if a process group is "orphaned", according to the POSIX
				216	* definition in 2.2.2.52. Orphaned process groups are not to be affected
				217	* by terminal-generated stop signals. Newly orphaned process groups are
				218	* to receive a SIGHUP and a SIGCONT.
				219	*
					* @pgrp: process group to examine.
					* @ignored_task: member to leave out of the check, or NULL.
					*
					* Returns 0 as soon as a considered member has a real_parent that is in
					* a different process group but in the same session; returns 1 otherwise.
					*
				220	* "I ask you, have you ever known what it is to be an orphan?"
				221	*/
				222	static int will_become_orphaned_pgrp(struct pid *pgrp,
				223	struct task_struct *ignored_task)
				224	{
				225	struct task_struct *p;
				226
				227	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
					/*
					* Skip the ignored task, exited tasks whose thread group is
					* empty, and tasks whose real parent is the global init.
					*/
				228	if ((p == ignored_task) ||
				229	(p->exit_state && thread_group_empty(p)) ||
				230	is_global_init(p->real_parent))
				231	continue;
				232
				233	if (task_pgrp(p->real_parent) != pgrp &&
				234	task_session(p->real_parent) == task_session(p))
				235	return 0;
				236	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
				237
				238	return 1;
				239	}
				240
				241	int is_current_pgrp_orphaned(void)
				242	{
				243	int retval;
				244
				245	read_lock(&tasklist_lock);
				246	retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
				247	read_unlock(&tasklist_lock);
				248
				249	return retval;
				250	}
				251
				252	static bool has_stopped_jobs(struct pid *pgrp)
				253	{
				254	struct task_struct *p;
				255
				256	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
				257	if (p->signal->flags & SIGNAL_STOP_STOPPED)
				258	return true;
				259	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
				260
				261	return false;
				262	}
				263
				264	/*
				265	* Check to see if any process groups have become orphaned as
				266	* a result of our exiting, and if they have any stopped jobs,
				267	* send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
					*
					* @tsk: task whose process group is examined.
					* @parent: the parent to test against; when NULL, @tsk->real_parent is
					* used and @tsk itself is excluded from the orphan check.
				268	*/
				269	static void
				270	kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
				271	{
				272	struct pid *pgrp = task_pgrp(tsk);
				273	struct task_struct *ignored_task = tsk;
				274
				275	if (!parent)
				276	/* exit: our father is in a different pgrp than
				277	* we are and we were the only connection outside.
				278	*/
				279	parent = tsk->real_parent;
				280	else
				281	/* reparent: our child is in a different pgrp than
				282	* we are, and it was the only connection outside.
				283	*/
				284	ignored_task = NULL;
				285
					/*
					* Signal only if @parent sits in another process group of the same
					* session, the group becomes orphaned and it has stopped jobs.
					*/
				286	if (task_pgrp(parent) != pgrp &&
				287	task_session(parent) == task_session(tsk) &&
				288	will_become_orphaned_pgrp(pgrp, ignored_task) &&
				289	has_stopped_jobs(pgrp)) {
				290	__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
				291	__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
				292	}
				293	}
				294
				295	#ifdef CONFIG_MEMCG
				296	/*
				297	* A task is exiting. If it owned this mm, find a new owner for the mm.
					*
					* Candidates are searched among current's children, then among the
					* children of current's real parent, then among all threads of all
					* non-kernel-thread processes. mm->owner is set to NULL when mm_users
					* is at most 1 or when no candidate is found.
				298	*/
				299	void mm_update_next_owner(struct mm_struct *mm)
				300	{
				301	struct task_struct *c, *g, *p = current;
				302
				303	retry:
				304	/*
				305	* If the exiting or execing task is not the owner, it's
				306	* someone else's problem.
				307	*/
				308	if (mm->owner != p)
				309	return;
				310	/*
				311	* The current owner is exiting/execing and there are no other
				312	* candidates. Do not leave the mm pointing to a possibly
				313	* freed task structure.
				314	*/
				315	if (atomic_read(&mm->mm_users) <= 1) {
				316	mm->owner = NULL;
				317	return;
				318	}
				319
				320	read_lock(&tasklist_lock);
				321	/*
				322	* Search in the children
				323	*/
				324	list_for_each_entry(c, &p->children, sibling) {
				325	if (c->mm == mm)
				326	goto assign_new_owner;
				327	}
				328
				329	/*
				330	* Search in the siblings
				331	*/
				332	list_for_each_entry(c, &p->real_parent->children, sibling) {
				333	if (c->mm == mm)
				334	goto assign_new_owner;
				335	}
				336
				337	/*
				338	* Search through everything else, we should not get here often.
				339	*/
				340	for_each_process(g) {
				341	if (g->flags & PF_KTHREAD)
				342	continue;
				343	for_each_thread(g, c) {
				344	if (c->mm == mm)
				345	goto assign_new_owner;
					/* A thread with a different mm ends the scan of this process. */
				346	if (c->mm)
				347	break;
				348	}
				349	}
				350	read_unlock(&tasklist_lock);
				351	/*
				352	* We found no owner yet mm_users > 1: this implies that we are
				353	* most likely racing with swapoff (try_to_unuse()) or /proc or
				354	* ptrace or page migration (get_task_mm()). Mark owner as NULL.
				355	*/
				356	mm->owner = NULL;
				357	return;
				358
				359	assign_new_owner:
				360	BUG_ON(c == p);
				361	get_task_struct(c);
				362	/*
				363	* The task_lock protects c->mm from changing.
				364	* We always want mm->owner->mm == mm
				365	*/
				366	task_lock(c);
				367	/*
				368	* Delay read_unlock() till we have the task_lock()
				369	* to ensure that c does not slip away underneath us
				370	*/
				371	read_unlock(&tasklist_lock);
					/* The candidate changed its mm in the meantime: start over. */
				372	if (c->mm != mm) {
				373	task_unlock(c);
				374	put_task_struct(c);
				375	goto retry;
				376	}
				377	mm->owner = c;
				378	task_unlock(c);
				379	put_task_struct(c);
				380	}
				381	#endif /* CONFIG_MEMCG */
				382
				383	/*
				384	* Turn us into a lazy TLB process if we
				385	* aren't already..
					*
					* Detaches @tsk from its mm: tsk->mm is cleared under task_lock() while
					* mmap_sem is held for read, and the mm reference is dropped with
					* mmput(). Returns right after mm_release() when @tsk has no mm. If
					* mm->core_state is set, the task first queues itself on the dumper's
					* list and sleeps until self.task has been cleared.
				386	*/
				387	static void exit_mm(struct task_struct *tsk)
				388	{
				389	struct mm_struct *mm = tsk->mm;
				390	struct core_state *core_state;
				391
				392	mm_release(tsk, mm);
				393	if (!mm)
				394	return;
				395	sync_mm_rss(mm);
				396	/*
				397	* Serialize with any possible pending coredump.
				398	* We must hold mmap_sem around checking core_state
				399	* and clearing tsk->mm. The core-inducing thread
				400	* will increment ->nr_threads for each thread in the
				401	* group with ->mm != NULL.
				402	*/
				403	down_read(&mm->mmap_sem);
				404	core_state = mm->core_state;
				405	if (core_state) {
				406	struct core_thread self;
				407
				408	up_read(&mm->mmap_sem);
				409
					/* Push ourselves onto the list hanging off core_state->dumper. */
				410	self.task = tsk;
				411	self.next = xchg(&core_state->dumper.next, &self);
				412	/*
				413	* Implies mb(), the result of xchg() must be visible
				414	* to core_state->dumper.
				415	*/
				416	if (atomic_dec_and_test(&core_state->nr_threads))
				417	complete(&core_state->startup);
				418
				419	for (;;) {
				420	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
				421	if (!self.task) /* see coredump_finish() */
				422	break;
				423	freezable_schedule();
				424	}
				425	__set_task_state(tsk, TASK_RUNNING);
					/* Re-take mmap_sem so the common path below can release it. */
				426	down_read(&mm->mmap_sem);
				427	}
					/* Extra mm_count reference taken before tsk->mm is cleared. */
				428	atomic_inc(&mm->mm_count);
				429	BUG_ON(mm != tsk->active_mm);
				430	/* more a memory barrier than a real lock */
				431	task_lock(tsk);
				432	tsk->mm = NULL;
				433	up_read(&mm->mmap_sem);
				434	enter_lazy_tlb(mm, current);
				435	task_unlock(tsk);
				436	mm_update_next_owner(mm);
				437	mmput(mm);
				438	if (test_thread_flag(TIF_MEMDIE))
				439	exit_oom_victim();
				440	}
				441
				442	static struct task_struct find_alive_thread(struct task_struct p)
				443	{
				444	struct task_struct *t;
				445
				446	for_each_thread(p, t) {
				447	if (!(t->flags & PF_EXITING))
				448	return t;
				449	}
				450	return NULL;
				451	}
				452
				453	static struct task_struct find_child_reaper(struct task_struct father)
				454	__releases(&tasklist_lock)
				455	__acquires(&tasklist_lock)
				456	{
				457	struct pid_namespace *pid_ns = task_active_pid_ns(father);
				458	struct task_struct *reaper = pid_ns->child_reaper;
				459
				460	if (likely(reaper != father))
				461	return reaper;
				462
				463	reaper = find_alive_thread(father);
				464	if (reaper) {
				465	pid_ns->child_reaper = reaper;
				466	return reaper;
				467	}
				468
				469	write_unlock_irq(&tasklist_lock);
				470	if (unlikely(pid_ns == &init_pid_ns)) {
				471	panic("Attempted to kill init! exitcode=0x%08x\n",
				472	father->signal->group_exit_code ?: father->exit_code);
				473	}
				474	zap_pid_ns_processes(pid_ns);
				475	write_lock_irq(&tasklist_lock);
				476
				477	return father;
				478	}
				479
				480	/*
				481	* When we die, we re-parent all our children, and try to:
				482	* 1. give them to another thread in our thread group, if such a member exists
				483	* 2. give it to the first ancestor process which prctl'd itself as a
				484	* child_subreaper for its children (like a service manager)
				485	* 3. give it to the init process (PID 1) in our pid namespace
					*
					* @father: the exiting task whose children need a new parent.
					* @child_reaper: fallback reaper, returned when steps 1 and 2 find nobody.
				486	*/
				487	static struct task_struct *find_new_reaper(struct task_struct *father,
				488	struct task_struct *child_reaper)
				489	{
				490	struct task_struct *thread, *reaper;
				491
				492	thread = find_alive_thread(father);
				493	if (thread)
				494	return thread;
				495
				496	if (father->signal->has_child_subreaper) {
				497	/*
				498	* Find the first ->is_child_subreaper ancestor in our pid_ns.
				499	* We start from father to ensure we can not look into another
				500	* namespace, this is safe because all its threads are dead.
				501	*/
				502	for (reaper = father;
				503	!same_thread_group(reaper, child_reaper);
				504	reaper = reaper->real_parent) {
				505	/* call_usermodehelper() descendants need this check */
				506	if (reaper == &init_task)
				507	break;
				508	if (!reaper->signal->is_child_subreaper)
				509	continue;
				510	thread = find_alive_thread(reaper);
				511	if (thread)
				512	return thread;
				513	}
				514	}
				515
				516	return child_reaper;
				517	}
				518
				519	/*
					* Finish reparenting the child @p of the exiting task @father.
					*
					* Does nothing for a child already in EXIT_DEAD. Otherwise resets the
					* child's exit_signal to SIGCHLD, notifies the parent if @p is an
					* untraced zombie with an empty thread group, and finally runs the
					* orphaned-process-group check for @p against @father.
					*
				520	* Any that need to be release_task'd are put on the @dead list.
				521	*/
				522	static void reparent_leader(struct task_struct *father, struct task_struct *p,
				523	struct list_head *dead)
				524	{
				525	if (unlikely(p->exit_state == EXIT_DEAD))
				526	return;
				527
				528	/* We don't want people slaying init. */
				529	p->exit_signal = SIGCHLD;
				530
				531	/* If it has exited notify the new parent about this child's death. */
				532	if (!p->ptrace &&
				533	p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
				534	if (do_notify_parent(p, p->exit_signal)) {
				535	p->exit_state = EXIT_DEAD;
					/* ptrace_entry doubles as the link into the @dead list. */
				536	list_add(&p->ptrace_entry, dead);
				537	}
				538	}
				539
				540	kill_orphaned_pgrp(p, father);
				541	}
				542
				543	/*
				544	* This does two things:
				545	*
				546	* A. Make init inherit all the child processes
				547	* B. Check to see if any process groups have become orphaned
				548	* as a result of our exiting, and if they have any stopped
				549	* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
					*
					* @father: the exiting task.
					* @dead: list that collects tasks the caller must release_task().
				550	*/
				551	static void forget_original_parent(struct task_struct *father,
				552	struct list_head *dead)
				553	{
				554	struct task_struct *p, *t, *reaper;
				555
				556	if (unlikely(!list_empty(&father->ptraced)))
				557	exit_ptrace(father, dead);
				558
				559	/* Can drop and reacquire tasklist_lock */
				560	reaper = find_child_reaper(father);
				561	if (list_empty(&father->children))
				562	return;
				563
				564	reaper = find_new_reaper(father, reaper);
				565	list_for_each_entry(p, &father->children, sibling) {
				566	for_each_thread(p, t) {
				567	t->real_parent = reaper;
				568	BUG_ON((!t->ptrace) != (t->parent == father));
					/* A traced thread keeps its current ->parent. */
				569	if (likely(!t->ptrace))
				570	t->parent = t->real_parent;
				571	if (t->pdeath_signal)
				572	group_send_sig_info(t->pdeath_signal,
				573	SEND_SIG_NOINFO, t);
				574	}
				575	/*
				576	* If this is a threaded reparent there is no need to
				577	* notify anyone anything has happened.
				578	*/
				579	if (!same_thread_group(reaper, father))
				580	reparent_leader(father, p, dead);
				581	}
					/* Move the whole children list over to the new reaper in one step. */
				582	list_splice_tail_init(&father->children, &reaper->children);
				583	}
				584
				585	/*
				586	* Send signals to all our closest relatives so that they know
				587	* to properly mourn us..
					*
					* @tsk: the exiting task.
					* @group_dead: non-zero when the orphaned-pgrp check should run for
					* @tsk's group leader.
					*
					* Sets tsk->exit_state to EXIT_DEAD when the task is auto-reaped and to
					* EXIT_ZOMBIE otherwise. Tasks collected on the local dead list are
					* released after tasklist_lock has been dropped.
				588	*/
				589	static void exit_notify(struct task_struct *tsk, int group_dead)
				590	{
				591	bool autoreap;
				592	struct task_struct *p, *n;
				593	LIST_HEAD(dead);
				594
				595	write_lock_irq(&tasklist_lock);
				596	forget_original_parent(tsk, &dead);
				597
				598	if (group_dead)
				599	kill_orphaned_pgrp(tsk->group_leader, NULL);
				600
				601	if (unlikely(tsk->ptrace)) {
					/*
					* The real exit_signal is used only for a leader with an
					* empty thread group that is not ptrace-reparented.
					*/
				602	int sig = thread_group_leader(tsk) &&
				603	thread_group_empty(tsk) &&
				604	!ptrace_reparented(tsk) ?
				605	tsk->exit_signal : SIGCHLD;
				606	autoreap = do_notify_parent(tsk, sig);
				607	} else if (thread_group_leader(tsk)) {
				608	autoreap = thread_group_empty(tsk) &&
				609	do_notify_parent(tsk, tsk->exit_signal);
				610	} else {
					/* Untraced non-leader threads are always auto-reaped. */
				611	autoreap = true;
				612	}
				613
				614	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
				615	if (tsk->exit_state == EXIT_DEAD)
				616	list_add(&tsk->ptrace_entry, &dead);
				617
				618	/* mt-exec, de_thread() is waiting for group leader */
				619	if (unlikely(tsk->signal->notify_count < 0))
				620	wake_up_process(tsk->signal->group_exit_task);
				621	write_unlock_irq(&tasklist_lock);
				622
				623	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
				624	list_del_init(&p->ptrace_entry);
				625	release_task(p);
				626	}
				627	}
				628
				629	#ifdef CONFIG_DEBUG_STACK_USAGE
					/*
					* Track the smallest amount of unused stack seen so far and log a
					* warning each time the calling task sets a new minimum.
					*/
				630	static void check_stack_usage(void)
				631	{
				632	static DEFINE_SPINLOCK(low_water_lock);
				633	static int lowest_to_date = THREAD_SIZE;
				634	unsigned long free;
				635
				636	free = stack_not_used(current);
				637
					/* Unlocked fast path; the value is re-checked under the lock. */
				638	if (free >= lowest_to_date)
				639	return;
				640
				641	spin_lock(&low_water_lock);
				642	if (free < lowest_to_date) {
				643	pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
				644	current->comm, task_pid_nr(current), free);
				645	lowest_to_date = free;
				646	}
				647	spin_unlock(&low_water_lock);
				648	}
				649	#else
					/* No-op stub when CONFIG_DEBUG_STACK_USAGE is not set. */
				650	static inline void check_stack_usage(void) {}
				651	#endif
				652
				653	void do_exit(long code)
				654	{
				655	struct task_struct *tsk = current;
				656	int group_dead;
				657	TASKS_RCU(int tasks_rcu_i);
				658
				659	profile_task_exit(tsk);
				660
				661	WARN_ON(blk_needs_flush_plug(tsk));
				662
				663	if (unlikely(in_interrupt()))
				664	panic("Aiee, killing interrupt handler!");
				665	if (unlikely(!tsk->pid))
				666	panic("Attempted to kill the idle task!");
				667
				668	/*
				669	* If do_exit is called because this processes oopsed, it's possible
				670	* that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
				671	* continuing. Amongst other possible reasons, this is to prevent
				672	* mm_release()->clear_child_tid() from writing to a user-controlled
				673	* kernel address.
				674	*/
				675	set_fs(USER_DS);
				676
				677	ptrace_event(PTRACE_EVENT_EXIT, code);
				678
				679	validate_creds_for_do_exit(tsk);
				680
				681	/*
				682	* We're taking recursive faults here in do_exit. Safest is to just
				683	* leave this task alone and wait for reboot.
				684	*/
				685	if (unlikely(tsk->flags & PF_EXITING)) {
				686	pr_alert("Fixing recursive fault but reboot is needed!\n");
				687	/*
				688	* We can do this unlocked here. The futex code uses
				689	* this flag just to verify whether the pi state
				690	* cleanup has been done or not. In the worst case it
				691	* loops once more. We pretend that the cleanup was
				692	* done as there is no way to return. Either the
				693	* OWNER_DIED bit is set by now or we push the blocked
				694	* task into the wait for ever nirwana as well.
				695	*/
				696	tsk->flags \|= PF_EXITPIDONE;
				697	set_current_state(TASK_UNINTERRUPTIBLE);
				698	schedule();
				699	}
				700
				701	exit_signals(tsk); /* sets PF_EXITING */
				702	/*
				703	* tsk->flags are checked in the futex code to protect against
				704	* an exiting task cleaning up the robust pi futexes.
				705	*/
				706	smp_mb();
				707	raw_spin_unlock_wait(&tsk->pi_lock);
				708
				709	if (unlikely(in_atomic())) {
				710	pr_info("note: %s[%d] exited with preempt_count %d\n",
				711	current->comm, task_pid_nr(current),
				712	preempt_count());
				713	preempt_count_set(PREEMPT_ENABLED);
				714	}
				715
				716	/* sync mm's RSS info before statistics gathering */
				717	if (tsk->mm)
				718	sync_mm_rss(tsk->mm);
				719	acct_update_integrals(tsk);
				720	group_dead = atomic_dec_and_test(&tsk->signal->live);
				721	if (group_dead) {
				722	hrtimer_cancel(&tsk->signal->real_timer);
				723	exit_itimers(tsk->signal);
				724	if (tsk->mm)
				725	setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
				726	}
				727	acct_collect(code, group_dead);
				728	if (group_dead)
				729	tty_audit_exit();
				730	audit_free(tsk);
				731
				732	tsk->exit_code = code;
				733	taskstats_exit(tsk, group_dead);
				734
				735	exit_mm(tsk);
				736
				737	if (group_dead)
				738	acct_process();
				739	trace_sched_process_exit(tsk);
				740
				741	exit_sem(tsk);
				742	exit_shm(tsk);
				743	exit_files(tsk);
				744	exit_fs(tsk);
				745	if (group_dead)
				746	disassociate_ctty(1);
				747	exit_task_namespaces(tsk);
				748	exit_task_work(tsk);
				749	exit_thread();
				750
				751	/*
				752	* Flush inherited counters to the parent - before the parent
				753	* gets woken up by child-exit notifications.
				754	*
				755	* because of cgroup mode, must be called before cgroup_exit()
				756	*/
				757	perf_event_exit_task(tsk);
				758
				759	cgroup_exit(tsk);
				760
				761	/*
				762	* FIXME: do that only when needed, using sched_exit tracepoint
				763	*/
				764	flush_ptrace_hw_breakpoint(tsk);
				765
				766	TASKS_RCU(preempt_disable());
				767	TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
				768	TASKS_RCU(preempt_enable());
				769	exit_notify(tsk, group_dead);
				770	proc_exit_connector(tsk);
				771	#ifdef CONFIG_NUMA
				772	task_lock(tsk);
				773	mpol_put(tsk->mempolicy);
				774	tsk->mempolicy = NULL;
				775	task_unlock(tsk);
				776	#endif
				777	#ifdef CONFIG_FUTEX
				778	if (unlikely(current->pi_state_cache))
				779	kfree(current->pi_state_cache);
				780	#endif
				781	/*
				782	* Make sure we are holding no locks:
				783	*/
				784	debug_check_no_locks_held();
				785	/*
				786	* We can do this unlocked here. The futex code uses this flag
				787	* just to verify whether the pi state cleanup has been done
				788	* or not. In the worst case it loops once more.
				789	*/
				790	tsk->flags \|= PF_EXITPIDONE;
				791
				792	if (tsk->io_context)
				793	exit_io_context(tsk);
				794
				795	if (tsk->splice_pipe)
				796	free_pipe_info(tsk->splice_pipe);
				797
				798	if (tsk->task_frag.page)
				799	put_page(tsk->task_frag.page);
				800
				801	validate_creds_for_do_exit(tsk);
				802
				803	check_stack_usage();
				804	preempt_disable();
				805	if (tsk->nr_dirtied)
				806	__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
				807	exit_rcu();
				808	TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
				809
				810	/*
				811	* The setting of TASK_RUNNING by try_to_wake_up() may be delayed
				812	* when the following two conditions become true.
				813	* - There is race condition of mmap_sem (It is acquired by
				814	* exit_mm()), and
				815	* - SMI occurs before setting TASK_RUNINNG.
				816	* (or hypervisor of virtual machine switches to other guest)
				817	* As a result, we may become TASK_RUNNING after becoming TASK_DEAD
				818	*
				819	* To avoid it, we have to wait for releasing tsk->pi_lock which
				820	* is held by try_to_wake_up()
				821	*/
				822	smp_mb();
				823	raw_spin_unlock_wait(&tsk->pi_lock);
				824
				825	/* causes final put_task_struct in finish_task_switch(). */
				826	tsk->state = TASK_DEAD;
				827	tsk->flags \|= PF_NOFREEZE; /* tell freezer to ignore us */
				828	schedule();
				829	BUG();
				830	/* Avoid "noreturn function does return". */
				831	for (;;)
				832	cpu_relax(); /* For when BUG is null */
				833	}
				834	EXPORT_SYMBOL_GPL(do_exit);
				835
				836	void complete_and_exit(struct completion *comp, long code)
				837	{
				838	if (comp)
				839	complete(comp);
				840
				841	do_exit(code);
				842	}
				843	EXPORT_SYMBOL(complete_and_exit);
				844
					/*
					* exit() system call: terminate the calling task. Only the low 8 bits
					* of @error_code are kept; they are shifted into bits 8..15 of the
					* code handed to do_exit().
					*/
				845	SYSCALL_DEFINE1(exit, int, error_code)
				846	{
				847	do_exit((error_code&0xff)<<8);
				848	}
				849
				850	/*
				851	* Take down every thread in the group. This is called by fatal signals
				852	* as well as by sys_exit_group (below).
					*
					* If a group exit is already in progress, the exit code recorded in the
					* signal_struct replaces @exit_code. Otherwise, when other threads
					* exist, this thread records @exit_code, sets SIGNAL_GROUP_EXIT and
					* zaps the other threads under siglock. The calling task then exits
					* through do_exit().
				853	*/
				854	void
				855	do_group_exit(int exit_code)
				856	{
				857	struct signal_struct *sig = current->signal;
				858
				859	BUG_ON(exit_code & 0x80); /* core dumps don't get here */
				860
				861	if (signal_group_exit(sig))
				862	exit_code = sig->group_exit_code;
				863	else if (!thread_group_empty(current)) {
				864	struct sighand_struct *const sighand = current->sighand;
				865
				866	spin_lock_irq(&sighand->siglock);
					/* Re-check under siglock: the first test above was unlocked. */
				867	if (signal_group_exit(sig))
				868	/* Another thread got here before we took the lock. */
				869	exit_code = sig->group_exit_code;
				870	else {
				871	sig->group_exit_code = exit_code;
				872	sig->flags = SIGNAL_GROUP_EXIT;
				873	zap_other_threads(current);
				874	}
				875	spin_unlock_irq(&sighand->siglock);
				876	}
				877
				878	do_exit(exit_code);
				879	/* NOTREACHED */
				880	}
				881
				882	/*
				883	* this kills every thread in the thread group. Note that any externally
				884	* wait4()-ing process will get the correct exit code - even if this
				885	* thread is not the thread group leader.
					*
					* Only the low 8 bits of @error_code are kept; they are shifted into
					* bits 8..15 of the code handed to do_group_exit().
				886	*/
				887	SYSCALL_DEFINE1(exit_group, int, error_code)
				888	{
				889	do_group_exit((error_code & 0xff) << 8);
				890	/* NOTREACHED */
				891	return 0;
				892	}
				893
					/*
					* Options and user-space result pointers for one wait request, passed
					* between the wait helpers below.
					*/
				894	struct wait_opts {
					/* PID type to match; PIDTYPE_MAX makes every task eligible. */
				895	enum pid_type wo_type;
					/* Wait flags; tested against WEXITED, WNOWAIT, __WALL and __WCLONE. */
				896	int wo_flags;
					/* The pid that a candidate's pid of type wo_type is compared with. */
				897	struct pid *wo_pid;
				898
					/* Optional user siginfo, filled by wait_noreap_copyout(). */
				899	struct siginfo __user *wo_info;
					/* NOTE(review): presumably the user status word - not used in this chunk. */
				900	int __user *wo_stat;
					/* Optional user rusage buffer handed to getrusage(). */
				901	struct rusage __user *wo_rusage;
				902
					/* NOTE(review): wait-queue entry and error value - usage not visible here. */
				903	wait_queue_t child_wait;
				904	int notask_error;
				905	};
				906
				907	static inline
				908	struct pid task_pid_type(struct task_struct task, enum pid_type type)
				909	{
				910	if (type != PIDTYPE_PID)
				911	task = task->group_leader;
				912	return task->pids[type].pid;
				913	}
				914
				915	static int eligible_pid(struct wait_opts wo, struct task_struct p)
				916	{
				917	return wo->wo_type == PIDTYPE_MAX \|\|
				918	task_pid_type(p, wo->wo_type) == wo->wo_pid;
				919	}
				920
				921	static int
				922	eligible_child(struct wait_opts wo, bool ptrace, struct task_struct p)
				923	{
				924	if (!eligible_pid(wo, p))
				925	return 0;
				926
				927	/*
				928	* Wait for all children (clone and not) if __WALL is set or
				929	* if it is traced by us.
				930	*/
				931	if (ptrace \|\| (wo->wo_flags & __WALL))
				932	return 1;
				933
				934	/*
				935	* Otherwise, wait for clone children only if __WCLONE is set;
				936	* otherwise, wait for non-clone children only.
				937	*
				938	* Note: a "clone" child here is one that reports to its parent
				939	* using a signal other than SIGCHLD, or a non-leader thread which
				940	* we can only see if it is traced by us.
				941	*/
				942	if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
				943	return 0;
				944
				945	return 1;
				946	}
				947
				948	static int wait_noreap_copyout(struct wait_opts wo, struct task_struct p,
				949	pid_t pid, uid_t uid, int why, int status)
				950	{
				951	struct siginfo __user *infop;
				952	int retval = wo->wo_rusage
				953	? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
				954
				955	put_task_struct(p);
				956	infop = wo->wo_info;
				957	if (infop) {
				958	if (!retval)
				959	retval = put_user(SIGCHLD, &infop->si_signo);
				960	if (!retval)
				961	retval = put_user(0, &infop->si_errno);
				962	if (!retval)
				963	retval = put_user((short)why, &infop->si_code);
				964	if (!retval)
				965	retval = put_user(pid, &infop->si_pid);
				966	if (!retval)
				967	retval = put_user(uid, &infop->si_uid);
				968	if (!retval)
				969	retval = put_user(status, &infop->si_status);
				970	}
				971	if (!retval)
				972	retval = pid;
				973	return retval;
				974	}
				975
				976	/*
				977	* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
				978	* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
				979	* the lock and this task is uninteresting. If we return nonzero, we have
				980	* released the lock and the system call should return.
				981	*/
				982	static int wait_task_zombie(struct wait_opts wo, struct task_struct p)
				983	{
				984	int state, retval, status;
				985	pid_t pid = task_pid_vnr(p);
				986	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
				987	struct siginfo __user *infop;
				988
				989	if (!likely(wo->wo_flags & WEXITED))
				990	return 0;
				991
				992	if (unlikely(wo->wo_flags & WNOWAIT)) {
				993	int exit_code = p->exit_code;
				994	int why;
				995
				996	get_task_struct(p);
				997	read_unlock(&tasklist_lock);
				998	sched_annotate_sleep();
				999
				1000	if ((exit_code & 0x7f) == 0) {
				1001	why = CLD_EXITED;
				1002	status = exit_code >> 8;
				1003	} else {
				1004	why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
				1005	status = exit_code & 0x7f;
				1006	}
				1007	return wait_noreap_copyout(wo, p, pid, uid, why, status);
				1008	}
				1009	/*
				1010	* Move the task's state to DEAD/TRACE, only one thread can do this.
				1011	*/
				1012	state = (ptrace_reparented(p) && thread_group_leader(p)) ?
				1013	EXIT_TRACE : EXIT_DEAD;
				1014	if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
				1015	return 0;
				1016	/*
				1017	* We own this thread, nobody else can reap it.
				1018	*/
				1019	read_unlock(&tasklist_lock);
				1020	sched_annotate_sleep();
				1021
				1022	/*
				1023	* Check thread_group_leader() to exclude the traced sub-threads.
				1024	*/
				1025	if (state == EXIT_DEAD && thread_group_leader(p)) {
				1026	struct signal_struct *sig = p->signal;
				1027	struct signal_struct *psig = current->signal;
				1028	unsigned long maxrss;
				1029	cputime_t tgutime, tgstime;
				1030
				1031	/*
				1032	* The resource counters for the group leader are in its
				1033	* own task_struct. Those for dead threads in the group
				1034	* are in its signal_struct, as are those for the child
				1035	* processes it has previously reaped. All these
				1036	* accumulate in the parent's signal_struct c* fields.
				1037	*
				1038	* We don't bother to take a lock here to protect these
				1039	* p->signal fields because the whole thread group is dead
				1040	* and nobody can change them.
				1041	*
				1042	* psig->stats_lock also protects us from our sub-theads
				1043	* which can reap other children at the same time. Until
				1044	* we change k_getrusage()-like users to rely on this lock
				1045	* we have to take ->siglock as well.
				1046	*
				1047	* We use thread_group_cputime_adjusted() to get times for
				1048	* the thread group, which consolidates times for all threads
				1049	* in the group including the group leader.
				1050	*/
				1051	thread_group_cputime_adjusted(p, &tgutime, &tgstime);
				1052	spin_lock_irq(&current->sighand->siglock);
				1053	write_seqlock(&psig->stats_lock);
				1054	psig->cutime += tgutime + sig->cutime;
				1055	psig->cstime += tgstime + sig->cstime;
				1056	psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
				1057	psig->cmin_flt +=
				1058	p->min_flt + sig->min_flt + sig->cmin_flt;
				1059	psig->cmaj_flt +=
				1060	p->maj_flt + sig->maj_flt + sig->cmaj_flt;
				1061	psig->cnvcsw +=
				1062	p->nvcsw + sig->nvcsw + sig->cnvcsw;
				1063	psig->cnivcsw +=
				1064	p->nivcsw + sig->nivcsw + sig->cnivcsw;
				1065	psig->cinblock +=
				1066	task_io_get_inblock(p) +
				1067	sig->inblock + sig->cinblock;
				1068	psig->coublock +=
				1069	task_io_get_oublock(p) +
				1070	sig->oublock + sig->coublock;
				1071	maxrss = max(sig->maxrss, sig->cmaxrss);
				1072	if (psig->cmaxrss < maxrss)
				1073	psig->cmaxrss = maxrss;
				1074	task_io_accounting_add(&psig->ioac, &p->ioac);
				1075	task_io_accounting_add(&psig->ioac, &sig->ioac);
				1076	write_sequnlock(&psig->stats_lock);
				1077	spin_unlock_irq(&current->sighand->siglock);
				1078	}
				1079
				1080	retval = wo->wo_rusage
				1081	? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
				1082	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
				1083	? p->signal->group_exit_code : p->exit_code;
				1084	if (!retval && wo->wo_stat)
				1085	retval = put_user(status, wo->wo_stat);
				1086
				1087	infop = wo->wo_info;
				1088	if (!retval && infop)
				1089	retval = put_user(SIGCHLD, &infop->si_signo);
				1090	if (!retval && infop)
				1091	retval = put_user(0, &infop->si_errno);
				1092	if (!retval && infop) {
				1093	int why;
				1094
				1095	if ((status & 0x7f) == 0) {
				1096	why = CLD_EXITED;
				1097	status >>= 8;
				1098	} else {
				1099	why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
				1100	status &= 0x7f;
				1101	}
				1102	retval = put_user((short)why, &infop->si_code);
				1103	if (!retval)
				1104	retval = put_user(status, &infop->si_status);
				1105	}
				1106	if (!retval && infop)
				1107	retval = put_user(pid, &infop->si_pid);
				1108	if (!retval && infop)
				1109	retval = put_user(uid, &infop->si_uid);
				1110	if (!retval)
				1111	retval = pid;
				1112
				1113	if (state == EXIT_TRACE) {
				1114	write_lock_irq(&tasklist_lock);
				1115	/* We dropped tasklist, ptracer could die and untrace */
				1116	ptrace_unlink(p);
				1117
				1118	/* If parent wants a zombie, don't release it now */
				1119	state = EXIT_ZOMBIE;
				1120	if (do_notify_parent(p, p->exit_signal))
				1121	state = EXIT_DEAD;
				1122	p->exit_state = state;
				1123	write_unlock_irq(&tasklist_lock);
				1124	}
				1125	if (state == EXIT_DEAD)
				1126	release_task(p);
				1127
				1128	return retval;
				1129	}
				1130
				1131	static int task_stopped_code(struct task_struct p, bool ptrace)
				1132	{
				1133	if (ptrace) {
				1134	if (task_is_stopped_or_traced(p) &&
				1135	!(p->jobctl & JOBCTL_LISTENING))
				1136	return &p->exit_code;
				1137	} else {
				1138	if (p->signal->flags & SIGNAL_STOP_STOPPED)
				1139	return &p->signal->group_exit_code;
				1140	}
				1141	return NULL;
				1142	}
				1143
				1144	/**
				1145	* wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
				1146	* @wo: wait options
				1147	* @ptrace: is the wait for ptrace
				1148	* @p: task to wait for
				1149	*
				1150	* Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
				1151	*
				1152	* CONTEXT:
				1153	* read_lock(&tasklist_lock), which is released if return value is
				1154	* non-zero. Also, grabs and releases @p->sighand->siglock.
				1155	*
				1156	* RETURNS:
				1157	* 0 if wait condition didn't exist and search for other wait conditions
				1158	* should continue. Non-zero return, -errno on failure and @p's pid on
				1159	* success, implies that tasklist_lock is released and wait condition
				1160	* search should terminate.
				1161	*/
				1162	static int wait_task_stopped(struct wait_opts *wo,
				1163	int ptrace, struct task_struct *p)
				1164	{
				1165	struct siginfo __user *infop;
				1166	int retval, exit_code, *p_code, why;
				1167	uid_t uid = 0; /* unneeded, required by compiler */
				1168	pid_t pid;
				1169
				1170	/*
				1171	* Traditionally we see ptrace'd stopped tasks regardless of options.
				1172	*/
				1173	if (!ptrace && !(wo->wo_flags & WUNTRACED))
				1174	return 0;
				1175
				1176	if (!task_stopped_code(p, ptrace))
				1177	return 0;
				1178
				1179	exit_code = 0;
				1180	spin_lock_irq(&p->sighand->siglock);
				1181
				1182	p_code = task_stopped_code(p, ptrace);
				1183	if (unlikely(!p_code))
				1184	goto unlock_sig;
				1185
				1186	exit_code = *p_code;
				1187	if (!exit_code)
				1188	goto unlock_sig;
				1189
				1190	if (!unlikely(wo->wo_flags & WNOWAIT))
				1191	*p_code = 0;
				1192
				1193	uid = from_kuid_munged(current_user_ns(), task_uid(p));
				1194	unlock_sig:
				1195	spin_unlock_irq(&p->sighand->siglock);
				1196	if (!exit_code)
				1197	return 0;
				1198
				1199	/*
				1200	* Now we are pretty sure this task is interesting.
				1201	* Make sure it doesn't get reaped out from under us while we
				1202	* give up the lock and then examine it below. We don't want to
				1203	* keep holding onto the tasklist_lock while we call getrusage and
				1204	* possibly take page faults for user memory.
				1205	*/
				1206	get_task_struct(p);
				1207	pid = task_pid_vnr(p);
				1208	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
				1209	read_unlock(&tasklist_lock);
				1210	sched_annotate_sleep();
				1211
				1212	if (unlikely(wo->wo_flags & WNOWAIT))
				1213	return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
				1214
				1215	retval = wo->wo_rusage
				1216	? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
				1217	if (!retval && wo->wo_stat)
				1218	retval = put_user((exit_code << 8) \| 0x7f, wo->wo_stat);
				1219
				1220	infop = wo->wo_info;
				1221	if (!retval && infop)
				1222	retval = put_user(SIGCHLD, &infop->si_signo);
				1223	if (!retval && infop)
				1224	retval = put_user(0, &infop->si_errno);
				1225	if (!retval && infop)
				1226	retval = put_user((short)why, &infop->si_code);
				1227	if (!retval && infop)
				1228	retval = put_user(exit_code, &infop->si_status);
				1229	if (!retval && infop)
				1230	retval = put_user(pid, &infop->si_pid);
				1231	if (!retval && infop)
				1232	retval = put_user(uid, &infop->si_uid);
				1233	if (!retval)
				1234	retval = pid;
				1235	put_task_struct(p);
				1236
				1237	BUG_ON(!retval);
				1238	return retval;
				1239	}
				1240
				1241	/*
				1242	* Handle do_wait work for one task in a live, non-stopped state.
				1243	* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
				1244	* the lock and this task is uninteresting. If we return nonzero, we have
				1245	* released the lock and the system call should return.
				1246	*/
				1247	static int wait_task_continued(struct wait_opts wo, struct task_struct p)
				1248	{
				1249	int retval;
				1250	pid_t pid;
				1251	uid_t uid;
				1252
				1253	if (!unlikely(wo->wo_flags & WCONTINUED))
				1254	return 0;
				1255
				1256	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
				1257	return 0;
				1258
				1259	spin_lock_irq(&p->sighand->siglock);
				1260	/* Re-check with the lock held. */
				1261	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
				1262	spin_unlock_irq(&p->sighand->siglock);
				1263	return 0;
				1264	}
				1265	if (!unlikely(wo->wo_flags & WNOWAIT))
				1266	p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
				1267	uid = from_kuid_munged(current_user_ns(), task_uid(p));
				1268	spin_unlock_irq(&p->sighand->siglock);
				1269
				1270	pid = task_pid_vnr(p);
				1271	get_task_struct(p);
				1272	read_unlock(&tasklist_lock);
				1273	sched_annotate_sleep();
				1274
				1275	if (!wo->wo_info) {
				1276	retval = wo->wo_rusage
				1277	? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
				1278	put_task_struct(p);
				1279	if (!retval && wo->wo_stat)
				1280	retval = put_user(0xffff, wo->wo_stat);
				1281	if (!retval)
				1282	retval = pid;
				1283	} else {
				1284	retval = wait_noreap_copyout(wo, p, pid, uid,
				1285	CLD_CONTINUED, SIGCONT);
				1286	BUG_ON(retval == 0);
				1287	}
				1288
				1289	return retval;
				1290	}
				1291
				1292	/*
				1293	* Consider @p for a wait by @parent.
				1294	*
				1295	* -ECHILD should be in ->notask_error before the first call.
				1296	* Returns nonzero for a final return, when we have unlocked tasklist_lock.
				1297	* Returns zero if the search for a child should continue;
				1298	* then ->notask_error is 0 if @p is an eligible child,
				1299	* or another error from security_task_wait(), or still -ECHILD.
				1300	*/
				1301	static int wait_consider_task(struct wait_opts *wo, int ptrace,
				1302	struct task_struct *p)
				1303	{
				1304	/*
				1305	* We can race with wait_task_zombie() from another thread.
				1306	* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
				1307	* can't confuse the checks below.
				1308	*/
				1309	int exit_state = ACCESS_ONCE(p->exit_state);
				1310	int ret;
				1311
				1312	if (unlikely(exit_state == EXIT_DEAD))
				1313	return 0;
				1314
				1315	ret = eligible_child(wo, ptrace, p);
				1316	if (!ret)
				1317	return ret;
				1318
				1319	ret = security_task_wait(p);
				1320	if (unlikely(ret < 0)) {
				1321	/*
				1322	* If we have not yet seen any eligible child,
				1323	* then let this error code replace -ECHILD.
				1324	* A permission error will give the user a clue
				1325	* to look for security policy problems, rather
				1326	* than for mysterious wait bugs.
				1327	*/
				1328	if (wo->notask_error)
				1329	wo->notask_error = ret;
				1330	return 0;
				1331	}
				1332
				1333	if (unlikely(exit_state == EXIT_TRACE)) {
				1334	/*
				1335	* ptrace == 0 means we are the natural parent. In this case
				1336	* we should clear notask_error, debugger will notify us.
				1337	*/
				1338	if (likely(!ptrace))
				1339	wo->notask_error = 0;
				1340	return 0;
				1341	}
				1342
				1343	if (likely(!ptrace) && unlikely(p->ptrace)) {
				1344	/*
				1345	* If it is traced by its real parent's group, just pretend
				1346	* the caller is ptrace_do_wait() and reap this child if it
				1347	* is zombie.
				1348	*
				1349	* This also hides group stop state from real parent; otherwise
				1350	* a single stop can be reported twice as group and ptrace stop.
				1351	* If a ptracer wants to distinguish these two events for its
				1352	* own children it should create a separate process which takes
				1353	* the role of real parent.
				1354	*/
				1355	if (!ptrace_reparented(p))
				1356	ptrace = 1;
				1357	}
				1358
				1359	/* slay zombie? */
				1360	if (exit_state == EXIT_ZOMBIE) {
				1361	/* we don't reap group leaders with subthreads */
				1362	if (!delay_group_leader(p)) {
				1363	/*
				1364	* A zombie ptracee is only visible to its ptracer.
				1365	* Notification and reaping will be cascaded to the
				1366	* real parent when the ptracer detaches.
				1367	*/
				1368	if (unlikely(ptrace) \|\| likely(!p->ptrace))
				1369	return wait_task_zombie(wo, p);
				1370	}
				1371
				1372	/*
				1373	* Allow access to stopped/continued state via zombie by
				1374	* falling through. Clearing of notask_error is complex.
				1375	*
				1376	* When !@ptrace:
				1377	*
				1378	* If WEXITED is set, notask_error should naturally be
				1379	* cleared. If not, subset of WSTOPPED\|WCONTINUED is set,
				1380	* so, if there are live subthreads, there are events to
				1381	* wait for. If all subthreads are dead, it's still safe
				1382	* to clear - this function will be called again in finite
				1383	* amount time once all the subthreads are released and
				1384	* will then return without clearing.
				1385	*
				1386	* When @ptrace:
				1387	*
				1388	* Stopped state is per-task and thus can't change once the
				1389	* target task dies. Only continued and exited can happen.
				1390	* Clear notask_error if WCONTINUED \| WEXITED.
				1391	*/
				1392	if (likely(!ptrace) \|\| (wo->wo_flags & (WCONTINUED \| WEXITED)))
				1393	wo->notask_error = 0;
				1394	} else {
				1395	/*
				1396	* @p is alive and it's gonna stop, continue or exit, so
				1397	* there always is something to wait for.
				1398	*/
				1399	wo->notask_error = 0;
				1400	}
				1401
				1402	/*
				1403	* Wait for stopped. Depending on @ptrace, different stopped state
				1404	* is used and the two don't interact with each other.
				1405	*/
				1406	ret = wait_task_stopped(wo, ptrace, p);
				1407	if (ret)
				1408	return ret;
				1409
				1410	/*
				1411	* Wait for continued. There's only one continued state and the
				1412	* ptracer can consume it which can confuse the real parent. Don't
				1413	* use WCONTINUED from ptracer. You don't need or want it.
				1414	*/
				1415	return wait_task_continued(wo, p);
				1416	}
				1417
				1418	/*
				1419	* Do the work of do_wait() for one thread in the group, @tsk.
				1420	*
				1421	* -ECHILD should be in ->notask_error before the first call.
				1422	* Returns nonzero for a final return, when we have unlocked tasklist_lock.
				1423	* Returns zero if the search for a child should continue; then
				1424	* ->notask_error is 0 if there were any eligible children,
				1425	* or another error from security_task_wait(), or still -ECHILD.
				1426	*/
				1427	static int do_wait_thread(struct wait_opts wo, struct task_struct tsk)
				1428	{
				1429	struct task_struct *p;
				1430
				1431	list_for_each_entry(p, &tsk->children, sibling) {
				1432	int ret = wait_consider_task(wo, 0, p);
				1433
				1434	if (ret)
				1435	return ret;
				1436	}
				1437
				1438	return 0;
				1439	}
				1440
				1441	static int ptrace_do_wait(struct wait_opts wo, struct task_struct tsk)
				1442	{
				1443	struct task_struct *p;
				1444
				1445	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
				1446	int ret = wait_consider_task(wo, 1, p);
				1447
				1448	if (ret)
				1449	return ret;
				1450	}
				1451
				1452	return 0;
				1453	}
				1454
				1455	static int child_wait_callback(wait_queue_t *wait, unsigned mode,
				1456	int sync, void *key)
				1457	{
				1458	struct wait_opts *wo = container_of(wait, struct wait_opts,
				1459	child_wait);
				1460	struct task_struct *p = key;
				1461
				1462	if (!eligible_pid(wo, p))
				1463	return 0;
				1464
				1465	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
				1466	return 0;
				1467
				1468	return default_wake_function(wait, mode, sync, key);
				1469	}
				1470
				1471	void __wake_up_parent(struct task_struct p, struct task_struct parent)
				1472	{
				1473	__wake_up_sync_key(&parent->signal->wait_chldexit,
				1474	TASK_INTERRUPTIBLE, 1, p);
				1475	}
				1476
				1477	static long do_wait(struct wait_opts *wo)
				1478	{
				1479	struct task_struct *tsk;
				1480	int retval;
				1481
				1482	trace_sched_process_wait(wo->wo_pid);
				1483
				1484	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
				1485	wo->child_wait.private = current;
				1486	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
				1487	repeat:
				1488	/*
				1489	* If there is nothing that can match our criteria, just get out.
				1490	* We will clear ->notask_error to zero if we see any child that
				1491	* might later match our criteria, even if we are not able to reap
				1492	* it yet.
				1493	*/
				1494	wo->notask_error = -ECHILD;
				1495	if ((wo->wo_type < PIDTYPE_MAX) &&
				1496	(!wo->wo_pid \|\| hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
				1497	goto notask;
				1498
				1499	set_current_state(TASK_INTERRUPTIBLE);
				1500	read_lock(&tasklist_lock);
				1501	tsk = current;
				1502	do {
				1503	retval = do_wait_thread(wo, tsk);
				1504	if (retval)
				1505	goto end;
				1506
				1507	retval = ptrace_do_wait(wo, tsk);
				1508	if (retval)
				1509	goto end;
				1510
				1511	if (wo->wo_flags & __WNOTHREAD)
				1512	break;
				1513	} while_each_thread(current, tsk);
				1514	read_unlock(&tasklist_lock);
				1515
				1516	notask:
				1517	retval = wo->notask_error;
				1518	if (!retval && !(wo->wo_flags & WNOHANG)) {
				1519	retval = -ERESTARTSYS;
				1520	if (!signal_pending(current)) {
				1521	schedule();
				1522	goto repeat;
				1523	}
				1524	}
				1525	end:
				1526	__set_current_state(TASK_RUNNING);
				1527	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
				1528	return retval;
				1529	}
				1530
				1531	SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
				1532	infop, int, options, struct rusage __user *, ru)
				1533	{
				1534	struct wait_opts wo;
				1535	struct pid *pid = NULL;
				1536	enum pid_type type;
				1537	long ret;
				1538
				1539	if (options & ~(WNOHANG\|WNOWAIT\|WEXITED\|WSTOPPED\|WCONTINUED))
				1540	return -EINVAL;
				1541	if (!(options & (WEXITED\|WSTOPPED\|WCONTINUED)))
				1542	return -EINVAL;
				1543
				1544	switch (which) {
				1545	case P_ALL:
				1546	type = PIDTYPE_MAX;
				1547	break;
				1548	case P_PID:
				1549	type = PIDTYPE_PID;
				1550	if (upid <= 0)
				1551	return -EINVAL;
				1552	break;
				1553	case P_PGID:
				1554	type = PIDTYPE_PGID;
				1555	if (upid <= 0)
				1556	return -EINVAL;
				1557	break;
				1558	default:
				1559	return -EINVAL;
				1560	}
				1561
				1562	if (type < PIDTYPE_MAX)
				1563	pid = find_get_pid(upid);
				1564
				1565	wo.wo_type = type;
				1566	wo.wo_pid = pid;
				1567	wo.wo_flags = options;
				1568	wo.wo_info = infop;
				1569	wo.wo_stat = NULL;
				1570	wo.wo_rusage = ru;
				1571	ret = do_wait(&wo);
				1572
				1573	if (ret > 0) {
				1574	ret = 0;
				1575	} else if (infop) {
				1576	/*
				1577	* For a WNOHANG return, clear out all the fields
				1578	* we would set so the user can easily tell the
				1579	* difference.
				1580	*/
				1581	if (!ret)
				1582	ret = put_user(0, &infop->si_signo);
				1583	if (!ret)
				1584	ret = put_user(0, &infop->si_errno);
				1585	if (!ret)
				1586	ret = put_user(0, &infop->si_code);
				1587	if (!ret)
				1588	ret = put_user(0, &infop->si_pid);
				1589	if (!ret)
				1590	ret = put_user(0, &infop->si_uid);
				1591	if (!ret)
				1592	ret = put_user(0, &infop->si_status);
				1593	}
				1594
				1595	put_pid(pid);
				1596	return ret;
				1597	}
				1598
				1599	SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
				1600	int, options, struct rusage __user *, ru)
				1601	{
				1602	struct wait_opts wo;
				1603	struct pid *pid = NULL;
				1604	enum pid_type type;
				1605	long ret;
				1606
				1607	if (options & ~(WNOHANG\|WUNTRACED\|WCONTINUED\|
				1608	__WNOTHREAD\|__WCLONE\|__WALL))
				1609	return -EINVAL;
				1610
				1611	if (upid == -1)
				1612	type = PIDTYPE_MAX;
				1613	else if (upid < 0) {
				1614	type = PIDTYPE_PGID;
				1615	pid = find_get_pid(-upid);
				1616	} else if (upid == 0) {
				1617	type = PIDTYPE_PGID;
				1618	pid = get_task_pid(current, PIDTYPE_PGID);
				1619	} else /* upid > 0 */ {
				1620	type = PIDTYPE_PID;
				1621	pid = find_get_pid(upid);
				1622	}
				1623
				1624	wo.wo_type = type;
				1625	wo.wo_pid = pid;
				1626	wo.wo_flags = options \| WEXITED;
				1627	wo.wo_info = NULL;
				1628	wo.wo_stat = stat_addr;
				1629	wo.wo_rusage = ru;
				1630	ret = do_wait(&wo);
				1631	put_pid(pid);
				1632
				1633	return ret;
				1634	}
				1635
				1636	#ifdef __ARCH_WANT_SYS_WAITPID
				1637
				1638	/*
				1639	* sys_waitpid() remains for compatibility. waitpid() should be
				1640	* implemented by calling sys_wait4() from libc.a.
				1641	*/
				1642	SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
				1643	{
				1644	return sys_wait4(pid, stat_addr, options, NULL);
				1645	}
				1646
				1647	#endif