Blame - kernel/futex.c - codeaurora/cp-linux

blob: 3057dabf726f723d014b12e9cc99a612e4d972a4 [file] [log] [blame]

Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1	/*
				2	* Fast Userspace Mutexes (which I call "Futexes!").
				3	* (C) Rusty Russell, IBM 2002
				4	*
				5	* Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
				6	* (C) Copyright 2003 Red Hat Inc, All Rights Reserved
				7	*
				8	* Removed page pinning, fix privately mapped COW pages and other cleanups
				9	* (C) Copyright 2003, 2004 Jamie Lokier
				10	*
				11	* Robust futex support started by Ingo Molnar
				12	* (C) Copyright 2006 Red Hat Inc, All Rights Reserved
				13	* Thanks to Thomas Gleixner for suggestions, analysis and fixes.
				14	*
				15	* PI-futex support started by Ingo Molnar and Thomas Gleixner
				16	* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
				17	* Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
				18	*
				19	* PRIVATE futexes by Eric Dumazet
				20	* Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
				21	*
				22	* Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
				23	* Copyright (C) IBM Corporation, 2009
				24	* Thanks to Thomas Gleixner for conceptual design and careful reviews.
				25	*
				26	* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
				27	* enough at me, Linus for the original (flawed) idea, Matthew
				28	* Kirkwood for proof-of-concept implementation.
				29	*
				30	* "The futexes are also cursed."
				31	* "But they come in a choice of three flavours!"
				32	*
				33	* This program is free software; you can redistribute it and/or modify
				34	* it under the terms of the GNU General Public License as published by
				35	* the Free Software Foundation; either version 2 of the License, or
				36	* (at your option) any later version.
				37	*
				38	* This program is distributed in the hope that it will be useful,
				39	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				40	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				41	* GNU General Public License for more details.
				42	*
				43	* You should have received a copy of the GNU General Public License
				44	* along with this program; if not, write to the Free Software
				45	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
				46	*/
				47	#include <linux/slab.h>
				48	#include <linux/poll.h>
				49	#include <linux/fs.h>
				50	#include <linux/file.h>
				51	#include <linux/jhash.h>
				52	#include <linux/init.h>
				53	#include <linux/futex.h>
				54	#include <linux/mount.h>
				55	#include <linux/pagemap.h>
				56	#include <linux/syscalls.h>
				57	#include <linux/signal.h>
				58	#include <linux/export.h>
				59	#include <linux/magic.h>
				60	#include <linux/pid.h>
				61	#include <linux/nsproxy.h>
				62	#include <linux/ptrace.h>
				63	#include <linux/sched/rt.h>
				64	#include <linux/hugetlb.h>
				65	#include <linux/freezer.h>
				66	#include <linux/bootmem.h>
				67	#include <linux/fault-inject.h>
				68
				69	#include <asm/futex.h>
				70
				71	#include "locking/rtmutex_common.h"
				72
				73	/*
				74	* READ this before attempting to hack on futexes!
				75	*
				76	* Basic futex operation and ordering guarantees
				77	* =============================================
				78	*
				79	* The waiter reads the futex value in user space and calls
				80	* futex_wait(). This function computes the hash bucket and acquires
				81	* the hash bucket lock. After that it reads the futex user space value
				82	* again and verifies that the data has not changed. If it has not changed
				83	* it enqueues itself into the hash bucket, releases the hash bucket lock
				84	* and schedules.
				85	*
				86	* The waker side modifies the user space value of the futex and calls
				87	* futex_wake(). This function computes the hash bucket and acquires the
				88	* hash bucket lock. Then it looks for waiters on that futex in the hash
				89	* bucket and wakes them.
				90	*
				91	* In futex wake up scenarios where no tasks are blocked on a futex, taking
				92	* the hb spinlock can be avoided and simply return. In order for this
				93	* optimization to work, ordering guarantees must exist so that the waiter
				94	* being added to the list is acknowledged when the list is concurrently being
				95	* checked by the waker, avoiding scenarios like the following:
				96	*
				97	* CPU 0                               CPU 1
				98	* val = *futex;
				99	* sys_futex(WAIT, futex, val);
				100	*   futex_wait(futex, val);
				101	*   uval = *futex;
				102	*                                     *futex = newval;
				103	*                                     sys_futex(WAKE, futex);
				104	*                                       futex_wake(futex);
				105	*                                       if (queue_empty())
				106	*                                         return;
				107	*   if (uval == val)
				108	*      lock(hash_bucket(futex));
				109	*      queue();
				110	*      unlock(hash_bucket(futex));
				111	*      schedule();
				112	*
				113	* This would cause the waiter on CPU 0 to wait forever because it
				114	* missed the transition of the user space value from val to newval
				115	* and the waker did not find the waiter in the hash bucket queue.
				116	*
				117	* The correct serialization ensures that a waiter either observes
				118	* the changed user space value before blocking or is woken by a
				119	* concurrent waker:
				120	*
				121	* CPU 0                                   CPU 1
				122	* val = *futex;
				123	* sys_futex(WAIT, futex, val);
				124	*   futex_wait(futex, val);
				125	*
				126	*   waiters++; (a)
				127	*   mb(); (A) <-- paired with -.
				128	*                               |
				129	*   lock(hash_bucket(futex));   |
				130	*                               |
				131	*   uval = *futex;              |
				132	*                               |         *futex = newval;
				133	*                               |         sys_futex(WAKE, futex);
				134	*                               |           futex_wake(futex);
				135	*                               |
				136	*                               `------->   mb(); (B)
				137	*   if (uval == val)
				138	*     queue();
				139	*     unlock(hash_bucket(futex));
				140	*     schedule();                         if (waiters)
				141	*                                           lock(hash_bucket(futex));
				142	*   else                                    wake_waiters(futex);
				143	*     waiters--; (b)                        unlock(hash_bucket(futex));
				144	*
				145	* Where (A) orders the waiters increment and the futex value read through
				146	* atomic operations (see hb_waiters_inc) and where (B) orders the write
				147	* to futex and the waiters read -- this is done by the barriers for both
				148	* shared and private futexes in get_futex_key_refs().
				149	*
				150	* This yields the following case (where X:=waiters, Y:=futex):
				151	*
				152	* X = Y = 0
				153	*
				154	* w[X]=1 w[Y]=1
				155	* MB MB
				156	* r[Y]=y r[X]=x
				157	*
				158	* Which guarantees that x==0 && y==0 is impossible; which translates back into
				159	* the guarantee that we cannot both miss the futex variable change and the
				160	* enqueue.
				161	*
				162	* Note that a new waiter is accounted for in (a) even when it is possible that
				163	* the wait call can return error, in which case we backtrack from it in (b).
				164	* Refer to the comment in queue_lock().
				165	*
				166	* Similarly, in order to account for waiters being requeued on another
				167	* address we always increment the waiters for the destination bucket before
				168	* acquiring the lock. It then decrements them again after releasing it -
				169	* the code that actually moves the futex(es) between hash buckets (requeue_futex)
				170	* will do the additional required waiter count housekeeping. This is done for
				171	* double_lock_hb() and double_unlock_hb(), respectively.
				172	*/
				173
				174	#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
				175	int __read_mostly futex_cmpxchg_enabled;
				176	#endif
				177
				178	/*
				179	* Futex flags used to encode options to functions and preserve them across
				180	* restarts.
				181	*/
				182	#define FLAGS_SHARED 0x01
				183	#define FLAGS_CLOCKRT 0x02
				184	#define FLAGS_HAS_TIMEOUT 0x04
				185
				186	/*
				187	* Priority Inheritance state:
				188	*/
				189	struct futex_pi_state {
				190	/*
				191	* list of 'owned' pi_state instances - these have to be
				192	* cleaned up in do_exit() if the task exits prematurely:
				193	*/
				194	struct list_head list;
				195
				196	/*
				197	* The PI object:
				198	*/
				199	struct rt_mutex pi_mutex;
				200
				201	struct task_struct *owner;
				202	atomic_t refcount;
				203
				204	union futex_key key;
				205	};
				206
				207	/**
				208	* struct futex_q - The hashed futex queue entry, one per waiting task
				209	* @list: priority-sorted list of tasks waiting on this futex
				210	* @task: the task waiting on the futex
				211	* @lock_ptr: the hash bucket lock
				212	* @key: the key the futex is hashed on
				213	* @pi_state: optional priority inheritance state
				214	* @rt_waiter: rt_waiter storage for use with requeue_pi
				215	* @requeue_pi_key: the requeue_pi target futex key
				216	* @bitset: bitset for the optional bitmasked wakeup
				217	*
				218	* We use this hashed waitqueue, instead of a normal wait_queue_t, so
				219	* we can wake only the relevant ones (hashed queues may be shared).
				220	*
				221	* A futex_q has a woken state, just like tasks have TASK_RUNNING.
				222	* It is considered woken when plist_node_empty(&q->list) \|\| q->lock_ptr == 0.
				223	* The order of wakeup is always to make the first condition true, then
				224	* the second.
				225	*
				226	* PI futexes are typically woken before they are removed from the hash list via
				227	* the rt_mutex code. See unqueue_me_pi().
				228	*/
				229	struct futex_q {
				230	struct plist_node list;
				231
				232	struct task_struct *task;
				233	spinlock_t *lock_ptr;
				234	union futex_key key;
				235	struct futex_pi_state *pi_state;
				236	struct rt_mutex_waiter *rt_waiter;
				237	union futex_key *requeue_pi_key;
				238	u32 bitset;
				239	};
				240
				241	static const struct futex_q futex_q_init = {
				242	/* list gets initialized in queue_me()*/
				243	.key = FUTEX_KEY_INIT,
				244	.bitset = FUTEX_BITSET_MATCH_ANY
				245	};
				246
				247	/*
				248	* Hash buckets are shared by all the futex_keys that hash to the same
				249	* location. Each key may have multiple futex_q structures, one for each task
				250	* waiting on a futex.
				251	*/
				252	struct futex_hash_bucket {
				253	atomic_t waiters;
				254	spinlock_t lock;
				255	struct plist_head chain;
				256	} ____cacheline_aligned_in_smp;
				257
				258	/*
				259	* The base of the bucket array and its size are always used together
				260	* (after initialization only in hash_futex()), so ensure that they
				261	* reside in the same cacheline.
				262	*/
				263	static struct {
				264	struct futex_hash_bucket *queues;
				265	unsigned long hashsize;
				266	} __futex_data __read_mostly __aligned(2*sizeof(long));
				267	#define futex_queues (__futex_data.queues)
				268	#define futex_hashsize (__futex_data.hashsize)
				269
				270
				271	/*
				272	* Fault injections for futexes.
				273	*/
				274	#ifdef CONFIG_FAIL_FUTEX
				275
				276	static struct {
				277	struct fault_attr attr;
				278
				279	bool ignore_private;
				280	} fail_futex = {
				281	.attr = FAULT_ATTR_INITIALIZER,
				282	.ignore_private = false,
				283	};
				284
				285	static int __init setup_fail_futex(char *str)
				286	{
				287	return setup_fault_attr(&fail_futex.attr, str);
				288	}
				289	__setup("fail_futex=", setup_fail_futex);
				290
				291	static bool should_fail_futex(bool fshared)
				292	{
				293	if (fail_futex.ignore_private && !fshared)
				294	return false;
				295
				296	return should_fail(&fail_futex.attr, 1);
				297	}
				298
				299	#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
				300
				301	static int __init fail_futex_debugfs(void)
				302	{
				303	umode_t mode = S_IFREG \| S_IRUSR \| S_IWUSR;
				304	struct dentry *dir;
				305
				306	dir = fault_create_debugfs_attr("fail_futex", NULL,
				307	&fail_futex.attr);
				308	if (IS_ERR(dir))
				309	return PTR_ERR(dir);
				310
				311	if (!debugfs_create_bool("ignore-private", mode, dir,
				312	&fail_futex.ignore_private)) {
				313	debugfs_remove_recursive(dir);
				314	return -ENOMEM;
				315	}
				316
				317	return 0;
				318	}
				319
				320	late_initcall(fail_futex_debugfs);
				321
				322	#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
				323
				324	#else
				325	static inline bool should_fail_futex(bool fshared)
				326	{
				327	return false;
				328	}
				329	#endif /* CONFIG_FAIL_FUTEX */
				330
				331	static inline void futex_get_mm(union futex_key *key)
				332	{
				333	atomic_inc(&key->private.mm->mm_count);
				334	/*
				335	* Ensure futex_get_mm() implies a full barrier such that
				336	* get_futex_key() implies a full barrier. This is relied upon
				337	* as full barrier (B), see the ordering comment above.
				338	*/
				339	smp_mb__after_atomic();
				340	}
				341
				342	/*
				343	* Reflects a new waiter being added to the waitqueue.
				344	*/
				345	static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
				346	{
				347	#ifdef CONFIG_SMP
				348	atomic_inc(&hb->waiters);
				349	/*
				350	* Full barrier (A), see the ordering comment above.
				351	*/
				352	smp_mb__after_atomic();
				353	#endif
				354	}
				355
				356	/*
				357	* Reflects a waiter being removed from the waitqueue by wakeup
				358	* paths.
				359	*/
				360	static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
				361	{
				362	#ifdef CONFIG_SMP
				363	atomic_dec(&hb->waiters);
				364	#endif
				365	}
				366
				367	static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
				368	{
				369	#ifdef CONFIG_SMP
				370	return atomic_read(&hb->waiters);
				371	#else
				372	return 1;
				373	#endif
				374	}
				375
				376	/*
				377	* We hash on the keys returned from get_futex_key (see below).
				378	*/
				379	static struct futex_hash_bucket hash_futex(union futex_key key)
				380	{
				381	u32 hash = jhash2((u32*)&key->both.word,
				382	(sizeof(key->both.word)+sizeof(key->both.ptr))/4,
				383	key->both.offset);
				384	return &futex_queues[hash & (futex_hashsize - 1)];
				385	}
				386
				387	/*
				388	* Return 1 if two futex_keys are equal, 0 otherwise.
				389	*/
				390	static inline int match_futex(union futex_key key1, union futex_key key2)
				391	{
				392	return (key1 && key2
				393	&& key1->both.word == key2->both.word
				394	&& key1->both.ptr == key2->both.ptr
				395	&& key1->both.offset == key2->both.offset);
				396	}
				397
				398	/*
				399	* Take a reference to the resource addressed by a key.
				400	* Can be called while holding spinlocks.
				401	*
				402	*/
				403	static void get_futex_key_refs(union futex_key *key)
				404	{
				405	if (!key->both.ptr)
				406	return;
				407
				408	switch (key->both.offset & (FUT_OFF_INODE\|FUT_OFF_MMSHARED)) {
				409	case FUT_OFF_INODE:
				410	ihold(key->shared.inode); /* implies MB (B) */
				411	break;
				412	case FUT_OFF_MMSHARED:
				413	futex_get_mm(key); /* implies MB (B) */
				414	break;
				415	default:
				416	/*
				417	* Private futexes do not hold reference on an inode or
				418	* mm, therefore the only purpose of calling get_futex_key_refs
				419	* is because we need the barrier for the lockless waiter check.
				420	*/
				421	smp_mb(); /* explicit MB (B) */
				422	}
				423	}
				424
				425	/*
				426	* Drop a reference to the resource addressed by a key.
				427	* The hash bucket spinlock must not be held. This is
				428	* a no-op for private futexes, see comment in the get
				429	* counterpart.
				430	*/
				431	static void drop_futex_key_refs(union futex_key *key)
				432	{
				433	if (!key->both.ptr) {
				434	/* If we're here then we tried to put a key we failed to get */
				435	WARN_ON_ONCE(1);
				436	return;
				437	}
				438
				439	switch (key->both.offset & (FUT_OFF_INODE\|FUT_OFF_MMSHARED)) {
				440	case FUT_OFF_INODE:
				441	iput(key->shared.inode);
				442	break;
				443	case FUT_OFF_MMSHARED:
				444	mmdrop(key->private.mm);
				445	break;
				446	}
				447	}
				448
				449	/**
				450	* get_futex_key() - Get parameters which are the keys for a futex
				451	* @uaddr: virtual address of the futex
				452	* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
				453	* @key: address where result is stored.
				454	* @rw: mapping needs to be read/write (values: VERIFY_READ,
				455	* VERIFY_WRITE)
				456	*
				457	* Return: a negative error code or 0
				458	*
				459	* The key words are stored in *key on success.
				460	*
				461	* For shared mappings, it's (page->index, file_inode(vma->vm_file),
				462	* offset_within_page). For private mappings, it's (uaddr, current->mm).
				463	* We can usually work out the index without swapping in the page.
				464	*
				465	* lock_page() might sleep, the caller should not hold a spinlock.
				466	*/
				467	static int
				468	get_futex_key(u32 __user uaddr, int fshared, union futex_key key, int rw)
				469	{
				470	unsigned long address = (unsigned long)uaddr;
				471	struct mm_struct *mm = current->mm;
				472	struct page page, page_head;
				473	int err, ro = 0;
				474
				475	/*
				476	* The futex address must be "naturally" aligned.
				477	*/
				478	key->both.offset = address % PAGE_SIZE;
				479	if (unlikely((address % sizeof(u32)) != 0))
				480	return -EINVAL;
				481	address -= key->both.offset;
				482
				483	if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
				484	return -EFAULT;
				485
				486	if (unlikely(should_fail_futex(fshared)))
				487	return -EFAULT;
				488
				489	/*
				490	* PROCESS_PRIVATE futexes are fast.
				491	* As the mm cannot disappear under us and the 'key' only needs
				492	* virtual address, we dont even have to find the underlying vma.
				493	* Note : We do have to check 'uaddr' is a valid user address,
				494	* but access_ok() should be faster than find_vma()
				495	*/
				496	if (!fshared) {
				497	key->private.mm = mm;
				498	key->private.address = address;
				499	get_futex_key_refs(key); /* implies MB (B) */
				500	return 0;
				501	}
				502
				503	again:
				504	/* Ignore any VERIFY_READ mapping (futex common case) */
				505	if (unlikely(should_fail_futex(fshared)))
				506	return -EFAULT;
				507
				508	err = get_user_pages_fast(address, 1, 1, &page);
				509	/*
				510	* If write access is not required (eg. FUTEX_WAIT), try
				511	* and get read-only access.
				512	*/
				513	if (err == -EFAULT && rw == VERIFY_READ) {
				514	err = get_user_pages_fast(address, 1, 0, &page);
				515	ro = 1;
				516	}
				517	if (err < 0)
				518	return err;
				519	else
				520	err = 0;
				521
				522	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
				523	page_head = page;
				524	if (unlikely(PageTail(page))) {
				525	put_page(page);
				526	/* serialize against __split_huge_page_splitting() */
				527	local_irq_disable();
				528	if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
				529	page_head = compound_head(page);
				530	/*
				531	* page_head is valid pointer but we must pin
				532	* it before taking the PG_lock and/or
				533	* PG_compound_lock. The moment we re-enable
				534	* irqs __split_huge_page_splitting() can
				535	* return and the head page can be freed from
				536	* under us. We can't take the PG_lock and/or
				537	* PG_compound_lock on a page that could be
				538	* freed from under us.
				539	*/
				540	if (page != page_head) {
				541	get_page(page_head);
				542	put_page(page);
				543	}
				544	local_irq_enable();
				545	} else {
				546	local_irq_enable();
				547	goto again;
				548	}
				549	}
				550	#else
				551	page_head = compound_head(page);
				552	if (page != page_head) {
				553	get_page(page_head);
				554	put_page(page);
				555	}
				556	#endif
				557
				558	lock_page(page_head);
				559
				560	/*
				561	* If page_head->mapping is NULL, then it cannot be a PageAnon
				562	* page; but it might be the ZERO_PAGE or in the gate area or
				563	* in a special mapping (all cases which we are happy to fail);
				564	* or it may have been a good file page when get_user_pages_fast
				565	* found it, but truncated or holepunched or subjected to
				566	* invalidate_complete_page2 before we got the page lock (also
				567	* cases which we are happy to fail). And we hold a reference,
				568	* so refcount care in invalidate_complete_page's remove_mapping
				569	* prevents drop_caches from setting mapping to NULL beneath us.
				570	*
				571	* The case we do have to guard against is when memory pressure made
				572	* shmem_writepage move it from filecache to swapcache beneath us:
				573	* an unlikely race, but we do need to retry for page_head->mapping.
				574	*/
				575	if (!page_head->mapping) {
				576	int shmem_swizzled = PageSwapCache(page_head);
				577	unlock_page(page_head);
				578	put_page(page_head);
				579	if (shmem_swizzled)
				580	goto again;
				581	return -EFAULT;
				582	}
				583
				584	/*
				585	* Private mappings are handled in a simple way.
				586	*
				587	* NOTE: When userspace waits on a MAP_SHARED mapping, even if
				588	* it's a read-only handle, it's expected that futexes attach to
				589	* the object not the particular process.
				590	*/
				591	if (PageAnon(page_head)) {
				592	/*
				593	* A RO anonymous page will never change and thus doesn't make
				594	* sense for futex operations.
				595	*/
				596	if (unlikely(should_fail_futex(fshared)) \|\| ro) {
				597	err = -EFAULT;
				598	goto out;
				599	}
				600
				601	key->both.offset \|= FUT_OFF_MMSHARED; /* ref taken on mm */
				602	key->private.mm = mm;
				603	key->private.address = address;
				604	} else {
				605	key->both.offset \|= FUT_OFF_INODE; /* inode-based key */
				606	key->shared.inode = page_head->mapping->host;
				607	key->shared.pgoff = basepage_index(page);
				608	}
				609
				610	get_futex_key_refs(key); /* implies MB (B) */
				611
				612	out:
				613	unlock_page(page_head);
				614	put_page(page_head);
				615	return err;
				616	}
				617
				618	static inline void put_futex_key(union futex_key *key)
				619	{
				620	drop_futex_key_refs(key);
				621	}
				622
				623	/**
				624	* fault_in_user_writeable() - Fault in user address and verify RW access
				625	* @uaddr: pointer to faulting user space address
				626	*
				627	* Slow path to fixup the fault we just took in the atomic write
				628	* access to @uaddr.
				629	*
				630	* We have no generic implementation of a non-destructive write to the
				631	* user address. We know that we faulted in the atomic pagefault
				632	* disabled section so we can as well avoid the #PF overhead by
				633	* calling get_user_pages() right away.
				634	*/
				635	static int fault_in_user_writeable(u32 __user *uaddr)
				636	{
				637	struct mm_struct *mm = current->mm;
				638	int ret;
				639
				640	down_read(&mm->mmap_sem);
				641	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
				642	FAULT_FLAG_WRITE);
				643	up_read(&mm->mmap_sem);
				644
				645	return ret < 0 ? ret : 0;
				646	}
				647
				648	/**
				649	* futex_top_waiter() - Return the highest priority waiter on a futex
				650	* @hb: the hash bucket the futex_q's reside in
				651	* @key: the futex key (to distinguish it from other futex futex_q's)
				652	*
				653	* Must be called with the hb lock held.
				654	*/
				655	static struct futex_q futex_top_waiter(struct futex_hash_bucket hb,
				656	union futex_key *key)
				657	{
				658	struct futex_q *this;
				659
				660	plist_for_each_entry(this, &hb->chain, list) {
				661	if (match_futex(&this->key, key))
				662	return this;
				663	}
				664	return NULL;
				665	}
				666
				667	static int cmpxchg_futex_value_locked(u32 curval, u32 __user uaddr,
				668	u32 uval, u32 newval)
				669	{
				670	int ret;
				671
				672	pagefault_disable();
				673	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
				674	pagefault_enable();
				675
				676	return ret;
				677	}
				678
				679	static int get_futex_value_locked(u32 dest, u32 __user from)
				680	{
				681	int ret;
				682
				683	pagefault_disable();
				684	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
				685	pagefault_enable();
				686
				687	return ret ? -EFAULT : 0;
				688	}
				689
				690
				691	/*
				692	* PI code:
				693	*/
				694	static int refill_pi_state_cache(void)
				695	{
				696	struct futex_pi_state *pi_state;
				697
				698	if (likely(current->pi_state_cache))
				699	return 0;
				700
				701	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
				702
				703	if (!pi_state)
				704	return -ENOMEM;
				705
				706	INIT_LIST_HEAD(&pi_state->list);
				707	/* pi_mutex gets initialized later */
				708	pi_state->owner = NULL;
				709	atomic_set(&pi_state->refcount, 1);
				710	pi_state->key = FUTEX_KEY_INIT;
				711
				712	current->pi_state_cache = pi_state;
				713
				714	return 0;
				715	}
				716
				717	static struct futex_pi_state * alloc_pi_state(void)
				718	{
				719	struct futex_pi_state *pi_state = current->pi_state_cache;
				720
				721	WARN_ON(!pi_state);
				722	current->pi_state_cache = NULL;
				723
				724	return pi_state;
				725	}
				726
				727	/*
				728	* Must be called with the hb lock held.
				729	*/
				730	static void free_pi_state(struct futex_pi_state *pi_state)
				731	{
				732	if (!pi_state)
				733	return;
				734
				735	if (!atomic_dec_and_test(&pi_state->refcount))
				736	return;
				737
				738	/*
				739	* If pi_state->owner is NULL, the owner is most probably dying
				740	* and has cleaned up the pi_state already
				741	*/
				742	if (pi_state->owner) {
				743	raw_spin_lock_irq(&pi_state->owner->pi_lock);
				744	list_del_init(&pi_state->list);
				745	raw_spin_unlock_irq(&pi_state->owner->pi_lock);
				746
				747	rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
				748	}
				749
				750	if (current->pi_state_cache)
				751	kfree(pi_state);
				752	else {
				753	/*
				754	* pi_state->list is already empty.
				755	* clear pi_state->owner.
				756	* refcount is at 0 - put it back to 1.
				757	*/
				758	pi_state->owner = NULL;
				759	atomic_set(&pi_state->refcount, 1);
				760	current->pi_state_cache = pi_state;
				761	}
				762	}
				763
				764	/*
				765	* Look up the task based on what TID userspace gave us.
				766	* We dont trust it.
				767	*/
				768	static struct task_struct * futex_find_get_task(pid_t pid)
				769	{
				770	struct task_struct *p;
				771
				772	rcu_read_lock();
				773	p = find_task_by_vpid(pid);
				774	if (p)
				775	get_task_struct(p);
				776
				777	rcu_read_unlock();
				778
				779	return p;
				780	}
				781
				782	/*
				783	* This task is holding PI mutexes at exit time => bad.
				784	* Kernel cleans up PI-state, but userspace is likely hosed.
				785	* (Robust-futex cleanup is separate and might save the day for userspace.)
				786	*/
				787	void exit_pi_state_list(struct task_struct *curr)
				788	{
				789	struct list_head next, head = &curr->pi_state_list;
				790	struct futex_pi_state *pi_state;
				791	struct futex_hash_bucket *hb;
				792	union futex_key key = FUTEX_KEY_INIT;
				793
				794	if (!futex_cmpxchg_enabled)
				795	return;
				796	/*
				797	* We are a ZOMBIE and nobody can enqueue itself on
				798	* pi_state_list anymore, but we have to be careful
				799	* versus waiters unqueueing themselves:
				800	*/
				801	raw_spin_lock_irq(&curr->pi_lock);
				802	while (!list_empty(head)) {
				803
				804	next = head->next;
				805	pi_state = list_entry(next, struct futex_pi_state, list);
				806	key = pi_state->key;
				807	hb = hash_futex(&key);
				808	raw_spin_unlock_irq(&curr->pi_lock);
				809
				810	spin_lock(&hb->lock);
				811
				812	raw_spin_lock_irq(&curr->pi_lock);
				813	/*
				814	* We dropped the pi-lock, so re-check whether this
				815	* task still owns the PI-state:
				816	*/
				817	if (head->next != next) {
				818	spin_unlock(&hb->lock);
				819	continue;
				820	}
				821
				822	WARN_ON(pi_state->owner != curr);
				823	WARN_ON(list_empty(&pi_state->list));
				824	list_del_init(&pi_state->list);
				825	pi_state->owner = NULL;
				826	raw_spin_unlock_irq(&curr->pi_lock);
				827
				828	rt_mutex_unlock(&pi_state->pi_mutex);
				829
				830	spin_unlock(&hb->lock);
				831
				832	raw_spin_lock_irq(&curr->pi_lock);
				833	}
				834	raw_spin_unlock_irq(&curr->pi_lock);
				835	}
				836
				837	/*
				838	* We need to check the following states:
				839	*
				840	*      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
				841	*
				842	* [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
				843	* [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
				844	*
				845	* [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
				846	*
				847	* [4]  Found  | Found    | NULL      | 0         | 1      | Valid
				848	* [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
				849	*
				850	* [6]  Found  | Found    | task      | 0         | 1      | Valid
				851	*
				852	* [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
				853	*
				854	* [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
				855	* [9]  Found  | Found    | task      | 0         | 0      | Invalid
				856	* [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
				857	*
				858	* [1] Indicates that the kernel can acquire the futex atomically. We
				859	* came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
				860	*
				861	* [2] Valid, if TID does not belong to a kernel thread. If no matching
				862	* thread is found then it indicates that the owner TID has died.
				863	*
				864	* [3] Invalid. The waiter is queued on a non PI futex
				865	*
				866	* [4] Valid state after exit_robust_list(), which sets the user space
				867	* value to FUTEX_WAITERS \| FUTEX_OWNER_DIED.
				868	*
				869	* [5] The user space value got manipulated between exit_robust_list()
				870	* and exit_pi_state_list()
				871	*
				872	* [6] Valid state after exit_pi_state_list() which sets the new owner in
				873	* the pi_state but cannot access the user space value.
				874	*
				875	* [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
				876	*
				877	* [8] Owner and user space value match
				878	*
				879	* [9] There is no transient state which sets the user space TID to 0
				880	* except exit_robust_list(), but this is indicated by the
				881	* FUTEX_OWNER_DIED bit. See [4]
				882	*
				883	* [10] There is no transient state which leaves owner and user space
				884	* TID out of sync.
				885	*/
				886
				887	/*
				888	* Validate that the existing waiter has a pi_state and sanity check
				889	* the pi_state against the user space value. If correct, attach to
				890	* it.
				891	*/
				892	static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
				893	struct futex_pi_state **ps)
				894	{
				895	pid_t pid = uval & FUTEX_TID_MASK;
				896
				897	/*
				898	* Userspace might have messed up non-PI and PI futexes [3]
				899	*/
				900	if (unlikely(!pi_state))
				901	return -EINVAL;
				902
				903	WARN_ON(!atomic_read(&pi_state->refcount));
				904
				905	/*
				906	* Handle the owner died case:
				907	*/
				908	if (uval & FUTEX_OWNER_DIED) {
				909	/*
				910	* exit_pi_state_list sets owner to NULL and wakes the
				911	* topmost waiter. The task which acquires the
				912	* pi_state->rt_mutex will fixup owner.
				913	*/
				914	if (!pi_state->owner) {
				915	/*
				916	* No pi state owner, but the user space TID
				917	* is not 0. Inconsistent state. [5]
				918	*/
				919	if (pid)
				920	return -EINVAL;
				921	/*
				922	* Take a ref on the state and return success. [4]
				923	*/
				924	goto out_state;
				925	}
				926
				927	/*
				928	* If TID is 0, then either the dying owner has not
				929	* yet executed exit_pi_state_list() or some waiter
				930	* acquired the rtmutex in the pi state, but did not
				931	* yet fixup the TID in user space.
				932	*
				933	* Take a ref on the state and return success. [6]
				934	*/
				935	if (!pid)
				936	goto out_state;
				937	} else {
				938	/*
				939	* If the owner died bit is not set, then the pi_state
				940	* must have an owner. [7]
				941	*/
				942	if (!pi_state->owner)
				943	return -EINVAL;
				944	}
				945
				946	/*
				947	* Bail out if user space manipulated the futex value. If pi
				948	* state exists then the owner TID must be the same as the
				949	* user space TID. [9/10]
				950	*/
				951	if (pid != task_pid_vnr(pi_state->owner))
				952	return -EINVAL;
				953	out_state:
				954	atomic_inc(&pi_state->refcount);
				955	*ps = pi_state;
				956	return 0;
				957	}
				958
				959	/*
				960	* Lookup the task for the TID provided from user space and attach to
				961	* it after doing proper sanity checks.
				962	*/
				963	static int attach_to_pi_owner(u32 uval, union futex_key *key,
				964	struct futex_pi_state **ps)
				965	{
				966	pid_t pid = uval & FUTEX_TID_MASK;
				967	struct futex_pi_state *pi_state;
				968	struct task_struct *p;
				969
				970	/*
				971	* We are the first waiter - try to look up the real owner and attach
				972	* the new pi_state to it, but bail out when TID = 0 [1]
				973	*/
				974	if (!pid)
				975	return -ESRCH;
				976	p = futex_find_get_task(pid);
				977	if (!p)
				978	return -ESRCH;
				979
				980	if (unlikely(p->flags & PF_KTHREAD)) {
				981	put_task_struct(p);
				982	return -EPERM;
				983	}
				984
				985	/*
				986	* We need to look at the task state flags to figure out,
				987	* whether the task is exiting. To protect against the do_exit
				988	* change of the task flags, we do this protected by
				989	* p->pi_lock:
				990	*/
				991	raw_spin_lock_irq(&p->pi_lock);
				992	if (unlikely(p->flags & PF_EXITING)) {
				993	/*
				994	* The task is on the way out. When PF_EXITPIDONE is
				995	* set, we know that the task has finished the
				996	* cleanup:
				997	*/
				998	int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
				999
				1000	raw_spin_unlock_irq(&p->pi_lock);
				1001	put_task_struct(p);
				1002	return ret;
				1003	}
				1004
				1005	/*
				1006	* No existing pi state. First waiter. [2]
				1007	*/
				1008	pi_state = alloc_pi_state();
				1009
				1010	/*
				1011	* Initialize the pi_mutex in locked state and make @p
				1012	* the owner of it:
				1013	*/
				1014	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
				1015
				1016	/* Store the key for possible exit cleanups: */
				1017	pi_state->key = *key;
				1018
				1019	WARN_ON(!list_empty(&pi_state->list));
				1020	list_add(&pi_state->list, &p->pi_state_list);
				1021	pi_state->owner = p;
				1022	raw_spin_unlock_irq(&p->pi_lock);
				1023
				1024	put_task_struct(p);
				1025
				1026	*ps = pi_state;
				1027
				1028	return 0;
				1029	}
				1030
				1031	static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
				1032	union futex_key key, struct futex_pi_state *ps)
				1033	{
				1034	struct futex_q *match = futex_top_waiter(hb, key);
				1035
				1036	/*
				1037	* If there is a waiter on that futex, validate it and
				1038	* attach to the pi_state when the validation succeeds.
				1039	*/
				1040	if (match)
				1041	return attach_to_pi_state(uval, match->pi_state, ps);
				1042
				1043	/*
				1044	* We are the first waiter - try to look up the owner based on
				1045	* @uval and attach to it.
				1046	*/
				1047	return attach_to_pi_owner(uval, key, ps);
				1048	}
				1049
				1050	static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
				1051	{
				1052	u32 uninitialized_var(curval);
				1053
				1054	if (unlikely(should_fail_futex(true)))
				1055	return -EFAULT;
				1056
				1057	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
				1058	return -EFAULT;
				1059
				1060	/If user space value changed, let the caller retry /
				1061	return curval != uval ? -EAGAIN : 0;
				1062	}
				1063
				1064	/**
				1065	* futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
				1066	* @uaddr: the pi futex user address
				1067	* @hb: the pi futex hash bucket
				1068	* @key: the futex key associated with uaddr and hb
				1069	* @ps: the pi_state pointer where we store the result of the
				1070	* lookup
				1071	* @task: the task to perform the atomic lock work for. This will
				1072	* be "current" except in the case of requeue pi.
				1073	* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
				1074	*
				1075	* Return:
				1076	* 0 - ready to wait;
				1077	* 1 - acquired the lock;
				1078	* <0 - error
				1079	*
				1080	* The hb->lock and futex_key refs shall be held by the caller.
				1081	*/
				1082	static int futex_lock_pi_atomic(u32 __user uaddr, struct futex_hash_bucket hb,
				1083	union futex_key *key,
				1084	struct futex_pi_state **ps,
				1085	struct task_struct *task, int set_waiters)
				1086	{
				1087	u32 uval, newval, vpid = task_pid_vnr(task);
				1088	struct futex_q *match;
				1089	int ret;
				1090
				1091	/*
				1092	* Read the user space value first so we can validate a few
				1093	* things before proceeding further.
				1094	*/
				1095	if (get_futex_value_locked(&uval, uaddr))
				1096	return -EFAULT;
				1097
				1098	if (unlikely(should_fail_futex(true)))
				1099	return -EFAULT;
				1100
				1101	/*
				1102	* Detect deadlocks.
				1103	*/
				1104	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
				1105	return -EDEADLK;
				1106
				1107	if ((unlikely(should_fail_futex(true))))
				1108	return -EDEADLK;
				1109
				1110	/*
				1111	* Lookup existing state first. If it exists, try to attach to
				1112	* its pi_state.
				1113	*/
				1114	match = futex_top_waiter(hb, key);
				1115	if (match)
				1116	return attach_to_pi_state(uval, match->pi_state, ps);
				1117
				1118	/*
				1119	* No waiter and user TID is 0. We are here because the
				1120	* waiters or the owner died bit is set or called from
				1121	* requeue_cmp_pi or for whatever reason something took the
				1122	* syscall.
				1123	*/
				1124	if (!(uval & FUTEX_TID_MASK)) {
				1125	/*
				1126	* We take over the futex. No other waiters and the user space
				1127	* TID is 0. We preserve the owner died bit.
				1128	*/
				1129	newval = uval & FUTEX_OWNER_DIED;
				1130	newval \|= vpid;
				1131
				1132	/* The futex requeue_pi code can enforce the waiters bit */
				1133	if (set_waiters)
				1134	newval \|= FUTEX_WAITERS;
				1135
				1136	ret = lock_pi_update_atomic(uaddr, uval, newval);
				1137	/* If the take over worked, return 1 */
				1138	return ret < 0 ? ret : 1;
				1139	}
				1140
				1141	/*
				1142	* First waiter. Set the waiters bit before attaching ourself to
				1143	* the owner. If owner tries to unlock, it will be forced into
				1144	* the kernel and blocked on hb->lock.
				1145	*/
				1146	newval = uval \| FUTEX_WAITERS;
				1147	ret = lock_pi_update_atomic(uaddr, uval, newval);
				1148	if (ret)
				1149	return ret;
				1150	/*
				1151	* If the update of the user space value succeeded, we try to
				1152	* attach to the owner. If that fails, no harm done, we only
				1153	* set the FUTEX_WAITERS bit in the user space variable.
				1154	*/
				1155	return attach_to_pi_owner(uval, key, ps);
				1156	}
				1157
				1158	/**
				1159	* __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
				1160	* @q: The futex_q to unqueue
				1161	*
				1162	* The q->lock_ptr must not be NULL and must be held by the caller.
				1163	*/
				1164	static void __unqueue_futex(struct futex_q *q)
				1165	{
				1166	struct futex_hash_bucket *hb;
				1167
				1168	if (WARN_ON_SMP(!q->lock_ptr \|\| !spin_is_locked(q->lock_ptr))
				1169	\|\| WARN_ON(plist_node_empty(&q->list)))
				1170	return;
				1171
				1172	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
				1173	plist_del(&q->list, &hb->chain);
				1174	hb_waiters_dec(hb);
				1175	}
				1176
				1177	/*
				1178	* The hash bucket lock must be held when this is called.
				1179	* Afterwards, the futex_q must not be accessed. Callers
				1180	* must ensure to later call wake_up_q() for the actual
				1181	* wakeups to occur.
				1182	*/
				1183	static void mark_wake_futex(struct wake_q_head wake_q, struct futex_q q)
				1184	{
				1185	struct task_struct *p = q->task;
				1186
				1187	if (WARN(q->pi_state \|\| q->rt_waiter, "refusing to wake PI futex\n"))
				1188	return;
				1189
				1190	/*
				1191	* Queue the task for later wakeup for after we've released
				1192	* the hb->lock. wake_q_add() grabs reference to p.
				1193	*/
				1194	wake_q_add(wake_q, p);
				1195	__unqueue_futex(q);
				1196	/*
				1197	* The waiting task can free the futex_q as soon as
				1198	* q->lock_ptr = NULL is written, without taking any locks. A
				1199	* memory barrier is required here to prevent the following
				1200	* store to lock_ptr from getting ahead of the plist_del.
				1201	*/
				1202	smp_wmb();
				1203	q->lock_ptr = NULL;
				1204	}
				1205
				1206	static int wake_futex_pi(u32 __user uaddr, u32 uval, struct futex_q this,
				1207	struct futex_hash_bucket *hb)
				1208	{
				1209	struct task_struct *new_owner;
				1210	struct futex_pi_state *pi_state = this->pi_state;
				1211	u32 uninitialized_var(curval), newval;
				1212	WAKE_Q(wake_q);
				1213	bool deboost;
				1214	int ret = 0;
				1215
				1216	if (!pi_state)
				1217	return -EINVAL;
				1218
				1219	/*
				1220	* If current does not own the pi_state then the futex is
				1221	* inconsistent and user space fiddled with the futex value.
				1222	*/
				1223	if (pi_state->owner != current)
				1224	return -EINVAL;
				1225
				1226	raw_spin_lock(&pi_state->pi_mutex.wait_lock);
				1227	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
				1228
				1229	/*
				1230	* It is possible that the next waiter (the one that brought
				1231	* this owner to the kernel) timed out and is no longer
				1232	* waiting on the lock.
				1233	*/
				1234	if (!new_owner)
				1235	new_owner = this->task;
				1236
				1237	/*
				1238	* We pass it to the next owner. The WAITERS bit is always
				1239	* kept enabled while there is PI state around. We cleanup the
				1240	* owner died bit, because we are the owner.
				1241	*/
				1242	newval = FUTEX_WAITERS \| task_pid_vnr(new_owner);
				1243
				1244	if (unlikely(should_fail_futex(true)))
				1245	ret = -EFAULT;
				1246
				1247	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
				1248	ret = -EFAULT;
				1249	} else if (curval != uval) {
				1250	/*
				1251	* If a unconditional UNLOCK_PI operation (user space did not
				1252	* try the TID->0 transition) raced with a waiter setting the
				1253	* FUTEX_WAITERS flag between get_user() and locking the hash
				1254	* bucket lock, retry the operation.
				1255	*/
				1256	if ((FUTEX_TID_MASK & curval) == uval)
				1257	ret = -EAGAIN;
				1258	else
				1259	ret = -EINVAL;
				1260	}
				1261	if (ret) {
				1262	raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
				1263	return ret;
				1264	}
				1265
				1266	raw_spin_lock_irq(&pi_state->owner->pi_lock);
				1267	WARN_ON(list_empty(&pi_state->list));
				1268	list_del_init(&pi_state->list);
				1269	raw_spin_unlock_irq(&pi_state->owner->pi_lock);
				1270
				1271	raw_spin_lock_irq(&new_owner->pi_lock);
				1272	WARN_ON(!list_empty(&pi_state->list));
				1273	list_add(&pi_state->list, &new_owner->pi_state_list);
				1274	pi_state->owner = new_owner;
				1275	raw_spin_unlock_irq(&new_owner->pi_lock);
				1276
				1277	raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
				1278
				1279	deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
				1280
				1281	/*
				1282	* First unlock HB so the waiter does not spin on it once he got woken
				1283	* up. Second wake up the waiter before the priority is adjusted. If we
				1284	* deboost first (and lose our higher priority), then the task might get
				1285	* scheduled away before the wake up can take place.
				1286	*/
				1287	spin_unlock(&hb->lock);
				1288	wake_up_q(&wake_q);
				1289	if (deboost)
				1290	rt_mutex_adjust_prio(current);
				1291
				1292	return 0;
				1293	}
				1294
				1295	/*
				1296	* Express the locking dependencies for lockdep:
				1297	*/
				1298	static inline void
				1299	double_lock_hb(struct futex_hash_bucket hb1, struct futex_hash_bucket hb2)
				1300	{
				1301	if (hb1 <= hb2) {
				1302	spin_lock(&hb1->lock);
				1303	if (hb1 < hb2)
				1304	spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
				1305	} else { /* hb1 > hb2 */
				1306	spin_lock(&hb2->lock);
				1307	spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
				1308	}
				1309	}
				1310
				1311	static inline void
				1312	double_unlock_hb(struct futex_hash_bucket hb1, struct futex_hash_bucket hb2)
				1313	{
				1314	spin_unlock(&hb1->lock);
				1315	if (hb1 != hb2)
				1316	spin_unlock(&hb2->lock);
				1317	}
				1318
				1319	/*
				1320	* Wake up waiters matching bitset queued on this futex (uaddr).
				1321	*/
				1322	static int
				1323	futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
				1324	{
				1325	struct futex_hash_bucket *hb;
				1326	struct futex_q this, next;
				1327	union futex_key key = FUTEX_KEY_INIT;
				1328	int ret;
				1329	WAKE_Q(wake_q);
				1330
				1331	if (!bitset)
				1332	return -EINVAL;
				1333
				1334	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
				1335	if (unlikely(ret != 0))
				1336	goto out;
				1337
				1338	hb = hash_futex(&key);
				1339
				1340	/* Make sure we really have tasks to wakeup */
				1341	if (!hb_waiters_pending(hb))
				1342	goto out_put_key;
				1343
				1344	spin_lock(&hb->lock);
				1345
				1346	plist_for_each_entry_safe(this, next, &hb->chain, list) {
				1347	if (match_futex (&this->key, &key)) {
				1348	if (this->pi_state \|\| this->rt_waiter) {
				1349	ret = -EINVAL;
				1350	break;
				1351	}
				1352
				1353	/* Check if one of the bits is set in both bitsets */
				1354	if (!(this->bitset & bitset))
				1355	continue;
				1356
				1357	mark_wake_futex(&wake_q, this);
				1358	if (++ret >= nr_wake)
				1359	break;
				1360	}
				1361	}
				1362
				1363	spin_unlock(&hb->lock);
				1364	wake_up_q(&wake_q);
				1365	out_put_key:
				1366	put_futex_key(&key);
				1367	out:
				1368	return ret;
				1369	}
				1370
				1371	/*
				1372	* Wake up all waiters hashed on the physical page that is mapped
				1373	* to this virtual address:
				1374	*/
				1375	static int
				1376	futex_wake_op(u32 __user uaddr1, unsigned int flags, u32 __user uaddr2,
				1377	int nr_wake, int nr_wake2, int op)
				1378	{
				1379	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
				1380	struct futex_hash_bucket hb1, hb2;
				1381	struct futex_q this, next;
				1382	int ret, op_ret;
				1383	WAKE_Q(wake_q);
				1384
				1385	retry:
				1386	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
				1387	if (unlikely(ret != 0))
				1388	goto out;
				1389	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
				1390	if (unlikely(ret != 0))
				1391	goto out_put_key1;
				1392
				1393	hb1 = hash_futex(&key1);
				1394	hb2 = hash_futex(&key2);
				1395
				1396	retry_private:
				1397	double_lock_hb(hb1, hb2);
				1398	op_ret = futex_atomic_op_inuser(op, uaddr2);
				1399	if (unlikely(op_ret < 0)) {
				1400
				1401	double_unlock_hb(hb1, hb2);
				1402
				1403	#ifndef CONFIG_MMU
				1404	/*
				1405	* we don't get EFAULT from MMU faults if we don't have an MMU,
				1406	* but we might get them from range checking
				1407	*/
				1408	ret = op_ret;
				1409	goto out_put_keys;
				1410	#endif
				1411
				1412	if (unlikely(op_ret != -EFAULT)) {
				1413	ret = op_ret;
				1414	goto out_put_keys;
				1415	}
				1416
				1417	ret = fault_in_user_writeable(uaddr2);
				1418	if (ret)
				1419	goto out_put_keys;
				1420
				1421	if (!(flags & FLAGS_SHARED))
				1422	goto retry_private;
				1423
				1424	put_futex_key(&key2);
				1425	put_futex_key(&key1);
				1426	goto retry;
				1427	}
				1428
				1429	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
				1430	if (match_futex (&this->key, &key1)) {
				1431	if (this->pi_state \|\| this->rt_waiter) {
				1432	ret = -EINVAL;
				1433	goto out_unlock;
				1434	}
				1435	mark_wake_futex(&wake_q, this);
				1436	if (++ret >= nr_wake)
				1437	break;
				1438	}
				1439	}
				1440
				1441	if (op_ret > 0) {
				1442	op_ret = 0;
				1443	plist_for_each_entry_safe(this, next, &hb2->chain, list) {
				1444	if (match_futex (&this->key, &key2)) {
				1445	if (this->pi_state \|\| this->rt_waiter) {
				1446	ret = -EINVAL;
				1447	goto out_unlock;
				1448	}
				1449	mark_wake_futex(&wake_q, this);
				1450	if (++op_ret >= nr_wake2)
				1451	break;
				1452	}
				1453	}
				1454	ret += op_ret;
				1455	}
				1456
				1457	out_unlock:
				1458	double_unlock_hb(hb1, hb2);
				1459	wake_up_q(&wake_q);
				1460	out_put_keys:
				1461	put_futex_key(&key2);
				1462	out_put_key1:
				1463	put_futex_key(&key1);
				1464	out:
				1465	return ret;
				1466	}
				1467
				1468	/**
				1469	* requeue_futex() - Requeue a futex_q from one hb to another
				1470	* @q: the futex_q to requeue
				1471	* @hb1: the source hash_bucket
				1472	* @hb2: the target hash_bucket
				1473	* @key2: the new key for the requeued futex_q
				1474	*/
				1475	static inline
				1476	void requeue_futex(struct futex_q q, struct futex_hash_bucket hb1,
				1477	struct futex_hash_bucket hb2, union futex_key key2)
				1478	{
				1479
				1480	/*
				1481	* If key1 and key2 hash to the same bucket, no need to
				1482	* requeue.
				1483	*/
				1484	if (likely(&hb1->chain != &hb2->chain)) {
				1485	plist_del(&q->list, &hb1->chain);
				1486	hb_waiters_dec(hb1);
				1487	hb_waiters_inc(hb2);
				1488	plist_add(&q->list, &hb2->chain);
				1489	q->lock_ptr = &hb2->lock;
				1490	}
				1491	get_futex_key_refs(key2);
				1492	q->key = *key2;
				1493	}
				1494
				1495	/**
				1496	* requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
				1497	* @q: the futex_q
				1498	* @key: the key of the requeue target futex
				1499	* @hb: the hash_bucket of the requeue target futex
				1500	*
				1501	* During futex_requeue, with requeue_pi=1, it is possible to acquire the
				1502	* target futex if it is uncontended or via a lock steal. Set the futex_q key
				1503	* to the requeue target futex so the waiter can detect the wakeup on the right
				1504	* futex, but remove it from the hb and NULL the rt_waiter so it can detect
				1505	* atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
				1506	* to protect access to the pi_state to fixup the owner later. Must be called
				1507	* with both q->lock_ptr and hb->lock held.
				1508	*/
				1509	static inline
				1510	void requeue_pi_wake_futex(struct futex_q q, union futex_key key,
				1511	struct futex_hash_bucket *hb)
				1512	{
				1513	get_futex_key_refs(key);
				1514	q->key = *key;
				1515
				1516	__unqueue_futex(q);
				1517
				1518	WARN_ON(!q->rt_waiter);
				1519	q->rt_waiter = NULL;
				1520
				1521	q->lock_ptr = &hb->lock;
				1522
				1523	wake_up_state(q->task, TASK_NORMAL);
				1524	}
				1525
				1526	/**
				1527	* futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
				1528	* @pifutex: the user address of the to futex
				1529	* @hb1: the from futex hash bucket, must be locked by the caller
				1530	* @hb2: the to futex hash bucket, must be locked by the caller
				1531	* @key1: the from futex key
				1532	* @key2: the to futex key
				1533	* @ps: address to store the pi_state pointer
				1534	* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
				1535	*
				1536	* Try and get the lock on behalf of the top waiter if we can do it atomically.
				1537	* Wake the top waiter if we succeed. If the caller specified set_waiters,
				1538	* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
				1539	* hb1 and hb2 must be held by the caller.
				1540	*
				1541	* Return:
				1542	* 0 - failed to acquire the lock atomically;
				1543	* >0 - acquired the lock, return value is vpid of the top_waiter
				1544	* <0 - error
				1545	*/
				1546	static int futex_proxy_trylock_atomic(u32 __user *pifutex,
				1547	struct futex_hash_bucket *hb1,
				1548	struct futex_hash_bucket *hb2,
				1549	union futex_key key1, union futex_key key2,
				1550	struct futex_pi_state **ps, int set_waiters)
				1551	{
				1552	struct futex_q *top_waiter = NULL;
				1553	u32 curval;
				1554	int ret, vpid;
				1555
				1556	if (get_futex_value_locked(&curval, pifutex))
				1557	return -EFAULT;
				1558
				1559	if (unlikely(should_fail_futex(true)))
				1560	return -EFAULT;
				1561
				1562	/*
				1563	* Find the top_waiter and determine if there are additional waiters.
				1564	* If the caller intends to requeue more than 1 waiter to pifutex,
				1565	* force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
				1566	* as we have means to handle the possible fault. If not, don't set
				1567	* the bit unecessarily as it will force the subsequent unlock to enter
				1568	* the kernel.
				1569	*/
				1570	top_waiter = futex_top_waiter(hb1, key1);
				1571
				1572	/* There are no waiters, nothing for us to do. */
				1573	if (!top_waiter)
				1574	return 0;
				1575
				1576	/* Ensure we requeue to the expected futex. */
				1577	if (!match_futex(top_waiter->requeue_pi_key, key2))
				1578	return -EINVAL;
				1579
				1580	/*
				1581	* Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
				1582	* the contended case or if set_waiters is 1. The pi_state is returned
				1583	* in ps in contended cases.
				1584	*/
				1585	vpid = task_pid_vnr(top_waiter->task);
				1586	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
				1587	set_waiters);
				1588	if (ret == 1) {
				1589	requeue_pi_wake_futex(top_waiter, key2, hb2);
				1590	return vpid;
				1591	}
				1592	return ret;
				1593	}
				1594
				1595	/**
				1596	* futex_requeue() - Requeue waiters from uaddr1 to uaddr2
				1597	* @uaddr1: source futex user address
				1598	* @flags: futex flags (FLAGS_SHARED, etc.)
				1599	* @uaddr2: target futex user address
				1600	* @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
				1601	* @nr_requeue: number of waiters to requeue (0-INT_MAX)
				1602	* @cmpval: @uaddr1 expected value (or %NULL)
				1603	* @requeue_pi: if we are attempting to requeue from a non-pi futex to a
				1604	* pi futex (pi to pi requeue is not supported)
				1605	*
				1606	* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
				1607	* uaddr2 atomically on behalf of the top waiter.
				1608	*
				1609	* Return:
				1610	* >=0 - on success, the number of tasks requeued or woken;
				1611	* <0 - on error
				1612	*/
				1613	static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
				1614	u32 __user *uaddr2, int nr_wake, int nr_requeue,
				1615	u32 *cmpval, int requeue_pi)
				1616	{
				1617	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
				1618	int drop_count = 0, task_count = 0, ret;
				1619	struct futex_pi_state *pi_state = NULL;
				1620	struct futex_hash_bucket hb1, hb2;
				1621	struct futex_q this, next;
				1622	WAKE_Q(wake_q);
				1623
				1624	if (requeue_pi) {
				1625	/*
				1626	* Requeue PI only works on two distinct uaddrs. This
				1627	* check is only valid for private futexes. See below.
				1628	*/
				1629	if (uaddr1 == uaddr2)
				1630	return -EINVAL;
				1631
				1632	/*
				1633	* requeue_pi requires a pi_state, try to allocate it now
				1634	* without any locks in case it fails.
				1635	*/
				1636	if (refill_pi_state_cache())
				1637	return -ENOMEM;
				1638	/*
				1639	* requeue_pi must wake as many tasks as it can, up to nr_wake
				1640	* + nr_requeue, since it acquires the rt_mutex prior to
				1641	* returning to userspace, so as to not leave the rt_mutex with
				1642	* waiters and no owner. However, second and third wake-ups
				1643	* cannot be predicted as they involve race conditions with the
				1644	* first wake and a fault while looking up the pi_state. Both
				1645	* pthread_cond_signal() and pthread_cond_broadcast() should
				1646	* use nr_wake=1.
				1647	*/
				1648	if (nr_wake != 1)
				1649	return -EINVAL;
				1650	}
				1651
				1652	retry:
				1653	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
				1654	if (unlikely(ret != 0))
				1655	goto out;
				1656	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
				1657	requeue_pi ? VERIFY_WRITE : VERIFY_READ);
				1658	if (unlikely(ret != 0))
				1659	goto out_put_key1;
				1660
				1661	/*
				1662	* The check above which compares uaddrs is not sufficient for
				1663	* shared futexes. We need to compare the keys:
				1664	*/
				1665	if (requeue_pi && match_futex(&key1, &key2)) {
				1666	ret = -EINVAL;
				1667	goto out_put_keys;
				1668	}
				1669
				1670	hb1 = hash_futex(&key1);
				1671	hb2 = hash_futex(&key2);
				1672
				1673	retry_private:
				1674	hb_waiters_inc(hb2);
				1675	double_lock_hb(hb1, hb2);
				1676
				1677	if (likely(cmpval != NULL)) {
				1678	u32 curval;
				1679
				1680	ret = get_futex_value_locked(&curval, uaddr1);
				1681
				1682	if (unlikely(ret)) {
				1683	double_unlock_hb(hb1, hb2);
				1684	hb_waiters_dec(hb2);
				1685
				1686	ret = get_user(curval, uaddr1);
				1687	if (ret)
				1688	goto out_put_keys;
				1689
				1690	if (!(flags & FLAGS_SHARED))
				1691	goto retry_private;
				1692
				1693	put_futex_key(&key2);
				1694	put_futex_key(&key1);
				1695	goto retry;
				1696	}
				1697	if (curval != *cmpval) {
				1698	ret = -EAGAIN;
				1699	goto out_unlock;
				1700	}
				1701	}
				1702
				1703	if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
				1704	/*
				1705	* Attempt to acquire uaddr2 and wake the top waiter. If we
				1706	* intend to requeue waiters, force setting the FUTEX_WAITERS
				1707	* bit. We force this here where we are able to easily handle
				1708	* faults rather in the requeue loop below.
				1709	*/
				1710	ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
				1711	&key2, &pi_state, nr_requeue);
				1712
				1713	/*
				1714	* At this point the top_waiter has either taken uaddr2 or is
				1715	* waiting on it. If the former, then the pi_state will not
				1716	* exist yet, look it up one more time to ensure we have a
				1717	* reference to it. If the lock was taken, ret contains the
				1718	* vpid of the top waiter task.
				1719	*/
				1720	if (ret > 0) {
				1721	WARN_ON(pi_state);
				1722	drop_count++;
				1723	task_count++;
				1724	/*
				1725	* If we acquired the lock, then the user
				1726	* space value of uaddr2 should be vpid. It
				1727	* cannot be changed by the top waiter as it
				1728	* is blocked on hb2 lock if it tries to do
				1729	* so. If something fiddled with it behind our
				1730	* back the pi state lookup might unearth
				1731	* it. So we rather use the known value than
				1732	* rereading and handing potential crap to
				1733	* lookup_pi_state.
				1734	*/
				1735	ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
				1736	}
				1737
				1738	switch (ret) {
				1739	case 0:
				1740	break;
				1741	case -EFAULT:
				1742	free_pi_state(pi_state);
				1743	pi_state = NULL;
				1744	double_unlock_hb(hb1, hb2);
				1745	hb_waiters_dec(hb2);
				1746	put_futex_key(&key2);
				1747	put_futex_key(&key1);
				1748	ret = fault_in_user_writeable(uaddr2);
				1749	if (!ret)
				1750	goto retry;
				1751	goto out;
				1752	case -EAGAIN:
				1753	/*
				1754	* Two reasons for this:
				1755	* - Owner is exiting and we just wait for the
				1756	* exit to complete.
				1757	* - The user space value changed.
				1758	*/
				1759	free_pi_state(pi_state);
				1760	pi_state = NULL;
				1761	double_unlock_hb(hb1, hb2);
				1762	hb_waiters_dec(hb2);
				1763	put_futex_key(&key2);
				1764	put_futex_key(&key1);
				1765	cond_resched();
				1766	goto retry;
				1767	default:
				1768	goto out_unlock;
				1769	}
				1770	}
				1771
				1772	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
				1773	if (task_count - nr_wake >= nr_requeue)
				1774	break;
				1775
				1776	if (!match_futex(&this->key, &key1))
				1777	continue;
				1778
				1779	/*
				1780	* FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
				1781	* be paired with each other and no other futex ops.
				1782	*
				1783	* We should never be requeueing a futex_q with a pi_state,
				1784	* which is awaiting a futex_unlock_pi().
				1785	*/
				1786	if ((requeue_pi && !this->rt_waiter) \|\|
				1787	(!requeue_pi && this->rt_waiter) \|\|
				1788	this->pi_state) {
				1789	ret = -EINVAL;
				1790	break;
				1791	}
				1792
				1793	/*
				1794	* Wake nr_wake waiters. For requeue_pi, if we acquired the
				1795	* lock, we already woke the top_waiter. If not, it will be
				1796	* woken by futex_unlock_pi().
				1797	*/
				1798	if (++task_count <= nr_wake && !requeue_pi) {
				1799	mark_wake_futex(&wake_q, this);
				1800	continue;
				1801	}
				1802
				1803	/* Ensure we requeue to the expected futex for requeue_pi. */
				1804	if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
				1805	ret = -EINVAL;
				1806	break;
				1807	}
				1808
				1809	/*
				1810	* Requeue nr_requeue waiters and possibly one more in the case
				1811	* of requeue_pi if we couldn't acquire the lock atomically.
				1812	*/
				1813	if (requeue_pi) {
				1814	/* Prepare the waiter to take the rt_mutex. */
				1815	atomic_inc(&pi_state->refcount);
				1816	this->pi_state = pi_state;
				1817	ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
				1818	this->rt_waiter,
				1819	this->task);
				1820	if (ret == 1) {
				1821	/* We got the lock. */
				1822	requeue_pi_wake_futex(this, &key2, hb2);
				1823	drop_count++;
				1824	continue;
				1825	} else if (ret) {
				1826	/* -EDEADLK */
				1827	this->pi_state = NULL;
				1828	free_pi_state(pi_state);
				1829	goto out_unlock;
				1830	}
				1831	}
				1832	requeue_futex(this, hb1, hb2, &key2);
				1833	drop_count++;
				1834	}
				1835
				1836	out_unlock:
				1837	free_pi_state(pi_state);
				1838	double_unlock_hb(hb1, hb2);
				1839	wake_up_q(&wake_q);
				1840	hb_waiters_dec(hb2);
				1841
				1842	/*
				1843	* drop_futex_key_refs() must be called outside the spinlocks. During
				1844	* the requeue we moved futex_q's from the hash bucket at key1 to the
				1845	* one at key2 and updated their key pointer. We no longer need to
				1846	* hold the references to key1.
				1847	*/
				1848	while (--drop_count >= 0)
				1849	drop_futex_key_refs(&key1);
				1850
				1851	out_put_keys:
				1852	put_futex_key(&key2);
				1853	out_put_key1:
				1854	put_futex_key(&key1);
				1855	out:
				1856	return ret ? ret : task_count;
				1857	}
				1858
				1859	/* The key must be already stored in q->key. */
				1860	static inline struct futex_hash_bucket queue_lock(struct futex_q q)
				1861	__acquires(&hb->lock)
				1862	{
				1863	struct futex_hash_bucket *hb;
				1864
				1865	hb = hash_futex(&q->key);
				1866
				1867	/*
				1868	* Increment the counter before taking the lock so that
				1869	* a potential waker won't miss a to-be-slept task that is
				1870	* waiting for the spinlock. This is safe as all queue_lock()
				1871	* users end up calling queue_me(). Similarly, for housekeeping,
				1872	* decrement the counter at queue_unlock() when some error has
				1873	* occurred and we don't end up adding the task to the list.
				1874	*/
				1875	hb_waiters_inc(hb);
				1876
				1877	q->lock_ptr = &hb->lock;
				1878
				1879	spin_lock(&hb->lock); /* implies MB (A) */
				1880	return hb;
				1881	}
				1882
				1883	static inline void
				1884	queue_unlock(struct futex_hash_bucket *hb)
				1885	__releases(&hb->lock)
				1886	{
				1887	spin_unlock(&hb->lock);
				1888	hb_waiters_dec(hb);
				1889	}
				1890
				1891	/**
				1892	* queue_me() - Enqueue the futex_q on the futex_hash_bucket
				1893	* @q: The futex_q to enqueue
				1894	* @hb: The destination hash bucket
				1895	*
				1896	* The hb->lock must be held by the caller, and is released here. A call to
				1897	* queue_me() is typically paired with exactly one call to unqueue_me(). The
				1898	* exceptions involve the PI related operations, which may use unqueue_me_pi()
				1899	* or nothing if the unqueue is done as part of the wake process and the unqueue
				1900	* state is implicit in the state of woken task (see futex_wait_requeue_pi() for
				1901	* an example).
				1902	*/
				1903	static inline void queue_me(struct futex_q q, struct futex_hash_bucket hb)
				1904	__releases(&hb->lock)
				1905	{
				1906	int prio;
				1907
				1908	/*
				1909	* The priority used to register this element is
				1910	* - either the real thread-priority for the real-time threads
				1911	* (i.e. threads with a priority lower than MAX_RT_PRIO)
				1912	* - or MAX_RT_PRIO for non-RT threads.
				1913	* Thus, all RT-threads are woken first in priority order, and
				1914	* the others are woken last, in FIFO order.
				1915	*/
				1916	prio = min(current->normal_prio, MAX_RT_PRIO);
				1917
				1918	plist_node_init(&q->list, prio);
				1919	plist_add(&q->list, &hb->chain);
				1920	q->task = current;
				1921	spin_unlock(&hb->lock);
				1922	}
				1923
				1924	/**
				1925	* unqueue_me() - Remove the futex_q from its futex_hash_bucket
				1926	* @q: The futex_q to unqueue
				1927	*
				1928	* The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
				1929	* be paired with exactly one earlier call to queue_me().
				1930	*
				1931	* Return:
				1932	* 1 - if the futex_q was still queued (and we removed unqueued it);
				1933	* 0 - if the futex_q was already removed by the waking thread
				1934	*/
				1935	static int unqueue_me(struct futex_q *q)
				1936	{
				1937	spinlock_t *lock_ptr;
				1938	int ret = 0;
				1939
				1940	/* In the common case we don't take the spinlock, which is nice. */
				1941	retry:
				1942	lock_ptr = q->lock_ptr;
				1943	barrier();
				1944	if (lock_ptr != NULL) {
				1945	spin_lock(lock_ptr);
				1946	/*
				1947	* q->lock_ptr can change between reading it and
				1948	* spin_lock(), causing us to take the wrong lock. This
				1949	* corrects the race condition.
				1950	*
				1951	* Reasoning goes like this: if we have the wrong lock,
				1952	* q->lock_ptr must have changed (maybe several times)
				1953	* between reading it and the spin_lock(). It can
				1954	* change again after the spin_lock() but only if it was
				1955	* already changed before the spin_lock(). It cannot,
				1956	* however, change back to the original value. Therefore
				1957	* we can detect whether we acquired the correct lock.
				1958	*/
				1959	if (unlikely(lock_ptr != q->lock_ptr)) {
				1960	spin_unlock(lock_ptr);
				1961	goto retry;
				1962	}
				1963	__unqueue_futex(q);
				1964
				1965	BUG_ON(q->pi_state);
				1966
				1967	spin_unlock(lock_ptr);
				1968	ret = 1;
				1969	}
				1970
				1971	drop_futex_key_refs(&q->key);
				1972	return ret;
				1973	}
				1974
				1975	/*
				1976	* PI futexes can not be requeued and must remove themself from the
				1977	* hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
				1978	* and dropped here.
				1979	*/
				1980	static void unqueue_me_pi(struct futex_q *q)
				1981	__releases(q->lock_ptr)
				1982	{
				1983	__unqueue_futex(q);
				1984
				1985	BUG_ON(!q->pi_state);
				1986	free_pi_state(q->pi_state);
				1987	q->pi_state = NULL;
				1988
				1989	spin_unlock(q->lock_ptr);
				1990	}
				1991
				1992	/*
				1993	* Fixup the pi_state owner with the new owner.
				1994	*
				1995	* Must be called with hash bucket lock held and mm->sem held for non
				1996	* private futexes.
				1997	*/
				1998	static int fixup_pi_state_owner(u32 __user uaddr, struct futex_q q,
				1999	struct task_struct *newowner)
				2000	{
				2001	u32 newtid = task_pid_vnr(newowner) \| FUTEX_WAITERS;
				2002	struct futex_pi_state *pi_state = q->pi_state;
				2003	struct task_struct *oldowner = pi_state->owner;
				2004	u32 uval, uninitialized_var(curval), newval;
				2005	int ret;
				2006
				2007	/* Owner died? */
				2008	if (!pi_state->owner)
				2009	newtid \|= FUTEX_OWNER_DIED;
				2010
				2011	/*
				2012	* We are here either because we stole the rtmutex from the
				2013	* previous highest priority waiter or we are the highest priority
				2014	* waiter but failed to get the rtmutex the first time.
				2015	* We have to replace the newowner TID in the user space variable.
				2016	* This must be atomic as we have to preserve the owner died bit here.
				2017	*
				2018	* Note: We write the user space value _before_ changing the pi_state
				2019	* because we can fault here. Imagine swapped out pages or a fork
				2020	* that marked all the anonymous memory readonly for cow.
				2021	*
				2022	* Modifying pi_state _before_ the user space value would
				2023	* leave the pi_state in an inconsistent state when we fault
				2024	* here, because we need to drop the hash bucket lock to
				2025	* handle the fault. This might be observed in the PID check
				2026	* in lookup_pi_state.
				2027	*/
				2028	retry:
				2029	if (get_futex_value_locked(&uval, uaddr))
				2030	goto handle_fault;
				2031
				2032	while (1) {
				2033	newval = (uval & FUTEX_OWNER_DIED) \| newtid;
				2034
				2035	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
				2036	goto handle_fault;
				2037	if (curval == uval)
				2038	break;
				2039	uval = curval;
				2040	}
				2041
				2042	/*
				2043	* We fixed up user space. Now we need to fix the pi_state
				2044	* itself.
				2045	*/
				2046	if (pi_state->owner != NULL) {
				2047	raw_spin_lock_irq(&pi_state->owner->pi_lock);
				2048	WARN_ON(list_empty(&pi_state->list));
				2049	list_del_init(&pi_state->list);
				2050	raw_spin_unlock_irq(&pi_state->owner->pi_lock);
				2051	}
				2052
				2053	pi_state->owner = newowner;
				2054
				2055	raw_spin_lock_irq(&newowner->pi_lock);
				2056	WARN_ON(!list_empty(&pi_state->list));
				2057	list_add(&pi_state->list, &newowner->pi_state_list);
				2058	raw_spin_unlock_irq(&newowner->pi_lock);
				2059	return 0;
				2060
				2061	/*
				2062	* To handle the page fault we need to drop the hash bucket
				2063	* lock here. That gives the other task (either the highest priority
				2064	* waiter itself or the task which stole the rtmutex) the
				2065	* chance to try the fixup of the pi_state. So once we are
				2066	* back from handling the fault we need to check the pi_state
				2067	* after reacquiring the hash bucket lock and before trying to
				2068	* do another fixup. When the fixup has been done already we
				2069	* simply return.
				2070	*/
				2071	handle_fault:
				2072	spin_unlock(q->lock_ptr);
				2073
				2074	ret = fault_in_user_writeable(uaddr);
				2075
				2076	spin_lock(q->lock_ptr);
				2077
				2078	/*
				2079	* Check if someone else fixed it for us:
				2080	*/
				2081	if (pi_state->owner != oldowner)
				2082	return 0;
				2083
				2084	if (ret)
				2085	return ret;
				2086
				2087	goto retry;
				2088	}
				2089
				2090	static long futex_wait_restart(struct restart_block *restart);
				2091
				2092	/**
				2093	* fixup_owner() - Post lock pi_state and corner case management
				2094	* @uaddr: user address of the futex
				2095	* @q: futex_q (contains pi_state and access to the rt_mutex)
				2096	* @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
				2097	*
				2098	* After attempting to lock an rt_mutex, this function is called to cleanup
				2099	* the pi_state owner as well as handle race conditions that may allow us to
				2100	* acquire the lock. Must be called with the hb lock held.
				2101	*
				2102	* Return:
				2103	* 1 - success, lock taken;
				2104	* 0 - success, lock not taken;
				2105	* <0 - on error (-EFAULT)
				2106	*/
				2107	static int fixup_owner(u32 __user uaddr, struct futex_q q, int locked)
				2108	{
				2109	struct task_struct *owner;
				2110	int ret = 0;
				2111
				2112	if (locked) {
				2113	/*
				2114	* Got the lock. We might not be the anticipated owner if we
				2115	* did a lock-steal - fix up the PI-state in that case:
				2116	*/
				2117	if (q->pi_state->owner != current)
				2118	ret = fixup_pi_state_owner(uaddr, q, current);
				2119	goto out;
				2120	}
				2121
				2122	/*
				2123	* Catch the rare case, where the lock was released when we were on the
				2124	* way back before we locked the hash bucket.
				2125	*/
				2126	if (q->pi_state->owner == current) {
				2127	/*
				2128	* Try to get the rt_mutex now. This might fail as some other
				2129	* task acquired the rt_mutex after we removed ourself from the
				2130	* rt_mutex waiters list.
				2131	*/
				2132	if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
				2133	locked = 1;
				2134	goto out;
				2135	}
				2136
				2137	/*
				2138	* pi_state is incorrect, some other task did a lock steal and
				2139	* we returned due to timeout or signal without taking the
				2140	* rt_mutex. Too late.
				2141	*/
				2142	raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
				2143	owner = rt_mutex_owner(&q->pi_state->pi_mutex);
				2144	if (!owner)
				2145	owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
				2146	raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
				2147	ret = fixup_pi_state_owner(uaddr, q, owner);
				2148	goto out;
				2149	}
				2150
				2151	/*
				2152	* Paranoia check. If we did not take the lock, then we should not be
				2153	* the owner of the rt_mutex.
				2154	*/
				2155	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
				2156	printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
				2157	"pi-state %p\n", ret,
				2158	q->pi_state->pi_mutex.owner,
				2159	q->pi_state->owner);
				2160
				2161	out:
				2162	return ret ? ret : locked;
				2163	}
				2164
				2165	/**
				2166	* futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
				2167	* @hb: the futex hash bucket, must be locked by the caller
				2168	* @q: the futex_q to queue up on
				2169	* @timeout: the prepared hrtimer_sleeper, or null for no timeout
				2170	*/
				2171	static void futex_wait_queue_me(struct futex_hash_bucket hb, struct futex_q q,
				2172	struct hrtimer_sleeper *timeout)
				2173	{
				2174	/*
				2175	* The task state is guaranteed to be set before another task can
				2176	* wake it. set_current_state() is implemented using smp_store_mb() and
				2177	* queue_me() calls spin_unlock() upon completion, both serializing
				2178	* access to the hash list and forcing another memory barrier.
				2179	*/
				2180	set_current_state(TASK_INTERRUPTIBLE);
				2181	queue_me(q, hb);
				2182
				2183	/* Arm the timer */
				2184	if (timeout)
				2185	hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
				2186
				2187	/*
				2188	* If we have been removed from the hash list, then another task
				2189	* has tried to wake us, and we can skip the call to schedule().
				2190	*/
				2191	if (likely(!plist_node_empty(&q->list))) {
				2192	/*
				2193	* If the timer has already expired, current will already be
				2194	* flagged for rescheduling. Only call schedule if there
				2195	* is no timeout, or if it has yet to expire.
				2196	*/
				2197	if (!timeout \|\| timeout->task)
				2198	freezable_schedule();
				2199	}
				2200	__set_current_state(TASK_RUNNING);
				2201	}
				2202
				2203	/**
				2204	* futex_wait_setup() - Prepare to wait on a futex
				2205	* @uaddr: the futex userspace address
				2206	* @val: the expected value
				2207	* @flags: futex flags (FLAGS_SHARED, etc.)
				2208	* @q: the associated futex_q
				2209	* @hb: storage for hash_bucket pointer to be returned to caller
				2210	*
				2211	* Setup the futex_q and locate the hash_bucket. Get the futex value and
				2212	* compare it with the expected value. Handle atomic faults internally.
				2213	* Return with the hb lock held and a q.key reference on success, and unlocked
				2214	* with no q.key reference on failure.
				2215	*
				2216	* Return:
				2217	* 0 - uaddr contains val and hb has been locked;
				2218	* <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
				2219	*/
				2220	static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
				2221	struct futex_q q, struct futex_hash_bucket *hb)
				2222	{
				2223	u32 uval;
				2224	int ret;
				2225
				2226	/*
				2227	* Access the page AFTER the hash-bucket is locked.
				2228	* Order is important:
				2229	*
				2230	* Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
				2231	* Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
				2232	*
				2233	* The basic logical guarantee of a futex is that it blocks ONLY
				2234	* if cond(var) is known to be true at the time of blocking, for
				2235	* any cond. If we locked the hash-bucket after testing *uaddr, that
				2236	* would open a race condition where we could block indefinitely with
				2237	* cond(var) false, which would violate the guarantee.
				2238	*
				2239	* On the other hand, we insert q and release the hash-bucket only
				2240	* after testing *uaddr. This guarantees that futex_wait() will NOT
				2241	* absorb a wakeup if *uaddr does not match the desired values
				2242	* while the syscall executes.
				2243	*/
				2244	retry:
				2245	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
				2246	if (unlikely(ret != 0))
				2247	return ret;
				2248
				2249	retry_private:
				2250	*hb = queue_lock(q);
				2251
				2252	ret = get_futex_value_locked(&uval, uaddr);
				2253
				2254	if (ret) {
				2255	queue_unlock(*hb);
				2256
				2257	ret = get_user(uval, uaddr);
				2258	if (ret)
				2259	goto out;
				2260
				2261	if (!(flags & FLAGS_SHARED))
				2262	goto retry_private;
				2263
				2264	put_futex_key(&q->key);
				2265	goto retry;
				2266	}
				2267
				2268	if (uval != val) {
				2269	queue_unlock(*hb);
				2270	ret = -EWOULDBLOCK;
				2271	}
				2272
				2273	out:
				2274	if (ret)
				2275	put_futex_key(&q->key);
				2276	return ret;
				2277	}
				2278
				2279	static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
				2280	ktime_t *abs_time, u32 bitset)
				2281	{
				2282	struct hrtimer_sleeper timeout, *to = NULL;
				2283	struct restart_block *restart;
				2284	struct futex_hash_bucket *hb;
				2285	struct futex_q q = futex_q_init;
				2286	int ret;
				2287
				2288	if (!bitset)
				2289	return -EINVAL;
				2290	q.bitset = bitset;
				2291
				2292	if (abs_time) {
				2293	to = &timeout;
				2294
				2295	hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
				2296	CLOCK_REALTIME : CLOCK_MONOTONIC,
				2297	HRTIMER_MODE_ABS);
				2298	hrtimer_init_sleeper(to, current);
				2299	hrtimer_set_expires_range_ns(&to->timer, *abs_time,
				2300	current->timer_slack_ns);
				2301	}
				2302
				2303	retry:
				2304	/*
				2305	* Prepare to wait on uaddr. On success, holds hb lock and increments
				2306	* q.key refs.
				2307	*/
				2308	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
				2309	if (ret)
				2310	goto out;
				2311
				2312	/* queue_me and wait for wakeup, timeout, or a signal. */
				2313	futex_wait_queue_me(hb, &q, to);
				2314
				2315	/* If we were woken (and unqueued), we succeeded, whatever. */
				2316	ret = 0;
				2317	/* unqueue_me() drops q.key ref */
				2318	if (!unqueue_me(&q))
				2319	goto out;
				2320	ret = -ETIMEDOUT;
				2321	if (to && !to->task)
				2322	goto out;
				2323
				2324	/*
				2325	* We expect signal_pending(current), but we might be the
				2326	* victim of a spurious wakeup as well.
				2327	*/
				2328	if (!signal_pending(current))
				2329	goto retry;
				2330
				2331	ret = -ERESTARTSYS;
				2332	if (!abs_time)
				2333	goto out;
				2334
				2335	restart = &current->restart_block;
				2336	restart->fn = futex_wait_restart;
				2337	restart->futex.uaddr = uaddr;
				2338	restart->futex.val = val;
				2339	restart->futex.time = abs_time->tv64;
				2340	restart->futex.bitset = bitset;
				2341	restart->futex.flags = flags \| FLAGS_HAS_TIMEOUT;
				2342
				2343	ret = -ERESTART_RESTARTBLOCK;
				2344
				2345	out:
				2346	if (to) {
				2347	hrtimer_cancel(&to->timer);
				2348	destroy_hrtimer_on_stack(&to->timer);
				2349	}
				2350	return ret;
				2351	}
				2352
				2353
				2354	static long futex_wait_restart(struct restart_block *restart)
				2355	{
				2356	u32 __user *uaddr = restart->futex.uaddr;
				2357	ktime_t t, *tp = NULL;
				2358
				2359	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
				2360	t.tv64 = restart->futex.time;
				2361	tp = &t;
				2362	}
				2363	restart->fn = do_no_restart_syscall;
				2364
				2365	return (long)futex_wait(uaddr, restart->futex.flags,
				2366	restart->futex.val, tp, restart->futex.bitset);
				2367	}
				2368
				2369
				2370	/*
				2371	* Userspace tried a 0 -> TID atomic transition of the futex value
				2372	* and failed. The kernel side here does the whole locking operation:
				2373	* if there are waiters then it will block as a consequence of relying
				2374	* on rt-mutexes, it does PI, etc. (Due to races the kernel might see
				2375	* a 0 value of the futex too.).
				2376	*
				2377	* Also serves as futex trylock_pi()'ing, and due semantics.
				2378	*/
				2379	static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
				2380	ktime_t *time, int trylock)
				2381	{
				2382	struct hrtimer_sleeper timeout, *to = NULL;
				2383	struct futex_hash_bucket *hb;
				2384	struct futex_q q = futex_q_init;
				2385	int res, ret;
				2386
				2387	if (refill_pi_state_cache())
				2388	return -ENOMEM;
				2389
				2390	if (time) {
				2391	to = &timeout;
				2392	hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
				2393	HRTIMER_MODE_ABS);
				2394	hrtimer_init_sleeper(to, current);
				2395	hrtimer_set_expires(&to->timer, *time);
				2396	}
				2397
				2398	retry:
				2399	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
				2400	if (unlikely(ret != 0))
				2401	goto out;
				2402
				2403	retry_private:
				2404	hb = queue_lock(&q);
				2405
				2406	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
				2407	if (unlikely(ret)) {
				2408	/*
				2409	* Atomic work succeeded and we got the lock,
				2410	* or failed. Either way, we do _not_ block.
				2411	*/
				2412	switch (ret) {
				2413	case 1:
				2414	/* We got the lock. */
				2415	ret = 0;
				2416	goto out_unlock_put_key;
				2417	case -EFAULT:
				2418	goto uaddr_faulted;
				2419	case -EAGAIN:
				2420	/*
				2421	* Two reasons for this:
				2422	* - Task is exiting and we just wait for the
				2423	* exit to complete.
				2424	* - The user space value changed.
				2425	*/
				2426	queue_unlock(hb);
				2427	put_futex_key(&q.key);
				2428	cond_resched();
				2429	goto retry;
				2430	default:
				2431	goto out_unlock_put_key;
				2432	}
				2433	}
				2434
				2435	/*
				2436	* Only actually queue now that the atomic ops are done:
				2437	*/
				2438	queue_me(&q, hb);
				2439
				2440	WARN_ON(!q.pi_state);
				2441	/*
				2442	* Block on the PI mutex:
				2443	*/
				2444	if (!trylock) {
				2445	ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
				2446	} else {
				2447	ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
				2448	/* Fixup the trylock return value: */
				2449	ret = ret ? 0 : -EWOULDBLOCK;
				2450	}
				2451
				2452	spin_lock(q.lock_ptr);
				2453	/*
				2454	* Fixup the pi_state owner and possibly acquire the lock if we
				2455	* haven't already.
				2456	*/
				2457	res = fixup_owner(uaddr, &q, !ret);
				2458	/*
				2459	* If fixup_owner() returned an error, proprogate that. If it acquired
				2460	* the lock, clear our -ETIMEDOUT or -EINTR.
				2461	*/
				2462	if (res)
				2463	ret = (res < 0) ? res : 0;
				2464
				2465	/*
				2466	* If fixup_owner() faulted and was unable to handle the fault, unlock
				2467	* it and return the fault to userspace.
				2468	*/
				2469	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
				2470	rt_mutex_unlock(&q.pi_state->pi_mutex);
				2471
				2472	/* Unqueue and drop the lock */
				2473	unqueue_me_pi(&q);
				2474
				2475	goto out_put_key;
				2476
				2477	out_unlock_put_key:
				2478	queue_unlock(hb);
				2479
				2480	out_put_key:
				2481	put_futex_key(&q.key);
				2482	out:
				2483	if (to)
				2484	destroy_hrtimer_on_stack(&to->timer);
				2485	return ret != -EINTR ? ret : -ERESTARTNOINTR;
				2486
				2487	uaddr_faulted:
				2488	queue_unlock(hb);
				2489
				2490	ret = fault_in_user_writeable(uaddr);
				2491	if (ret)
				2492	goto out_put_key;
				2493
				2494	if (!(flags & FLAGS_SHARED))
				2495	goto retry_private;
				2496
				2497	put_futex_key(&q.key);
				2498	goto retry;
				2499	}
				2500
				2501	/*
				2502	* Userspace attempted a TID -> 0 atomic transition, and failed.
				2503	* This is the in-kernel slowpath: we look up the PI state (if any),
				2504	* and do the rt-mutex unlock.
				2505	*/
				2506	static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
				2507	{
				2508	u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
				2509	union futex_key key = FUTEX_KEY_INIT;
				2510	struct futex_hash_bucket *hb;
				2511	struct futex_q *match;
				2512	int ret;
				2513
				2514	retry:
				2515	if (get_user(uval, uaddr))
				2516	return -EFAULT;
				2517	/*
				2518	* We release only a lock we actually own:
				2519	*/
				2520	if ((uval & FUTEX_TID_MASK) != vpid)
				2521	return -EPERM;
				2522
				2523	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
				2524	if (ret)
				2525	return ret;
				2526
				2527	hb = hash_futex(&key);
				2528	spin_lock(&hb->lock);
				2529
				2530	/*
				2531	* Check waiters first. We do not trust user space values at
				2532	* all and we at least want to know if user space fiddled
				2533	* with the futex value instead of blindly unlocking.
				2534	*/
				2535	match = futex_top_waiter(hb, &key);
				2536	if (match) {
				2537	ret = wake_futex_pi(uaddr, uval, match, hb);
				2538	/*
				2539	* In case of success wake_futex_pi dropped the hash
				2540	* bucket lock.
				2541	*/
				2542	if (!ret)
				2543	goto out_putkey;
				2544	/*
				2545	* The atomic access to the futex value generated a
				2546	* pagefault, so retry the user-access and the wakeup:
				2547	*/
				2548	if (ret == -EFAULT)
				2549	goto pi_faulted;
				2550	/*
				2551	* A unconditional UNLOCK_PI op raced against a waiter
				2552	* setting the FUTEX_WAITERS bit. Try again.
				2553	*/
				2554	if (ret == -EAGAIN) {
				2555	spin_unlock(&hb->lock);
				2556	put_futex_key(&key);
				2557	goto retry;
				2558	}
				2559	/*
				2560	* wake_futex_pi has detected invalid state. Tell user
				2561	* space.
				2562	*/
				2563	goto out_unlock;
				2564	}
				2565
				2566	/*
				2567	* We have no kernel internal state, i.e. no waiters in the
				2568	* kernel. Waiters which are about to queue themselves are stuck
				2569	* on hb->lock. So we can safely ignore them. We do neither
				2570	* preserve the WAITERS bit not the OWNER_DIED one. We are the
				2571	* owner.
				2572	*/
				2573	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
				2574	goto pi_faulted;
				2575
				2576	/*
				2577	* If uval has changed, let user space handle it.
				2578	*/
				2579	ret = (curval == uval) ? 0 : -EAGAIN;
				2580
				2581	out_unlock:
				2582	spin_unlock(&hb->lock);
				2583	out_putkey:
				2584	put_futex_key(&key);
				2585	return ret;
				2586
				2587	pi_faulted:
				2588	spin_unlock(&hb->lock);
				2589	put_futex_key(&key);
				2590
				2591	ret = fault_in_user_writeable(uaddr);
				2592	if (!ret)
				2593	goto retry;
				2594
				2595	return ret;
				2596	}
				2597
				2598	/**
				2599	* handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
				2600	* @hb: the hash_bucket futex_q was original enqueued on
				2601	* @q: the futex_q woken while waiting to be requeued
				2602	* @key2: the futex_key of the requeue target futex
				2603	* @timeout: the timeout associated with the wait (NULL if none)
				2604	*
				2605	* Detect if the task was woken on the initial futex as opposed to the requeue
				2606	* target futex. If so, determine if it was a timeout or a signal that caused
				2607	* the wakeup and return the appropriate error code to the caller. Must be
				2608	* called with the hb lock held.
				2609	*
				2610	* Return:
				2611	* 0 = no early wakeup detected;
				2612	* <0 = -ETIMEDOUT or -ERESTARTNOINTR
				2613	*/
				2614	static inline
				2615	int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
				2616	struct futex_q q, union futex_key key2,
				2617	struct hrtimer_sleeper *timeout)
				2618	{
				2619	int ret = 0;
				2620
				2621	/*
				2622	* With the hb lock held, we avoid races while we process the wakeup.
				2623	* We only need to hold hb (and not hb2) to ensure atomicity as the
				2624	* wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
				2625	* It can't be requeued from uaddr2 to something else since we don't
				2626	* support a PI aware source futex for requeue.
				2627	*/
				2628	if (!match_futex(&q->key, key2)) {
				2629	WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
				2630	/*
				2631	* We were woken prior to requeue by a timeout or a signal.
				2632	* Unqueue the futex_q and determine which it was.
				2633	*/
				2634	plist_del(&q->list, &hb->chain);
				2635	hb_waiters_dec(hb);
				2636
				2637	/* Handle spurious wakeups gracefully */
				2638	ret = -EWOULDBLOCK;
				2639	if (timeout && !timeout->task)
				2640	ret = -ETIMEDOUT;
				2641	else if (signal_pending(current))
				2642	ret = -ERESTARTNOINTR;
				2643	}
				2644	return ret;
				2645	}
				2646
				2647	/**
				2648	* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
				2649	* @uaddr: the futex we initially wait on (non-pi)
				2650	* @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
				2651	* the same type, no requeueing from private to shared, etc.
				2652	* @val: the expected value of uaddr
				2653	* @abs_time: absolute timeout
				2654	* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
				2655	* @uaddr2: the pi futex we will take prior to returning to user-space
				2656	*
				2657	* The caller will wait on uaddr and will be requeued by futex_requeue() to
				2658	* uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
				2659	* on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
				2660	* userspace. This ensures the rt_mutex maintains an owner when it has waiters;
				2661	* without one, the pi logic would not know which task to boost/deboost, if
				2662	* there was a need to.
				2663	*
				2664	* We call schedule in futex_wait_queue_me() when we enqueue and return there
				2665	* via the following--
				2666	* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
				2667	* 2) wakeup on uaddr2 after a requeue
				2668	* 3) signal
				2669	* 4) timeout
				2670	*
				2671	* If 3, cleanup and return -ERESTARTNOINTR.
				2672	*
				2673	* If 2, we may then block on trying to take the rt_mutex and return via:
				2674	* 5) successful lock
				2675	* 6) signal
				2676	* 7) timeout
				2677	* 8) other lock acquisition failure
				2678	*
				2679	* If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
				2680	*
				2681	* If 4 or 7, we cleanup and return with -ETIMEDOUT.
				2682	*
				2683	* Return:
				2684	* 0 - On success;
				2685	* <0 - On error
				2686	*/
				2687	static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
				2688	u32 val, ktime_t *abs_time, u32 bitset,
				2689	u32 __user *uaddr2)
				2690	{
				2691	struct hrtimer_sleeper timeout, *to = NULL;
				2692	struct rt_mutex_waiter rt_waiter;
				2693	struct futex_hash_bucket *hb;
				2694	union futex_key key2 = FUTEX_KEY_INIT;
				2695	struct futex_q q = futex_q_init;
				2696	int res, ret;
				2697
				2698	if (uaddr == uaddr2)
				2699	return -EINVAL;
				2700
				2701	if (!bitset)
				2702	return -EINVAL;
				2703
				2704	if (abs_time) {
				2705	to = &timeout;
				2706	hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
				2707	CLOCK_REALTIME : CLOCK_MONOTONIC,
				2708	HRTIMER_MODE_ABS);
				2709	hrtimer_init_sleeper(to, current);
				2710	hrtimer_set_expires_range_ns(&to->timer, *abs_time,
				2711	current->timer_slack_ns);
				2712	}
				2713
				2714	/*
				2715	* The waiter is allocated on our stack, manipulated by the requeue
				2716	* code while we sleep on uaddr.
				2717	*/
				2718	debug_rt_mutex_init_waiter(&rt_waiter);
				2719	RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
				2720	RB_CLEAR_NODE(&rt_waiter.tree_entry);
				2721	rt_waiter.task = NULL;
				2722
				2723	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
				2724	if (unlikely(ret != 0))
				2725	goto out;
				2726
				2727	q.bitset = bitset;
				2728	q.rt_waiter = &rt_waiter;
				2729	q.requeue_pi_key = &key2;
				2730
				2731	/*
				2732	* Prepare to wait on uaddr. On success, increments q.key (key1) ref
				2733	* count.
				2734	*/
				2735	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
				2736	if (ret)
				2737	goto out_key2;
				2738
				2739	/*
				2740	* The check above which compares uaddrs is not sufficient for
				2741	* shared futexes. We need to compare the keys:
				2742	*/
				2743	if (match_futex(&q.key, &key2)) {
				2744	queue_unlock(hb);
				2745	ret = -EINVAL;
				2746	goto out_put_keys;
				2747	}
				2748
				2749	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
				2750	futex_wait_queue_me(hb, &q, to);
				2751
				2752	spin_lock(&hb->lock);
				2753	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
				2754	spin_unlock(&hb->lock);
				2755	if (ret)
				2756	goto out_put_keys;
				2757
				2758	/*
				2759	* In order for us to be here, we know our q.key == key2, and since
				2760	* we took the hb->lock above, we also know that futex_requeue() has
				2761	* completed and we no longer have to concern ourselves with a wakeup
				2762	* race with the atomic proxy lock acquisition by the requeue code. The
				2763	* futex_requeue dropped our key1 reference and incremented our key2
				2764	* reference count.
				2765	*/
				2766
				2767	/* Check if the requeue code acquired the second futex for us. */
				2768	if (!q.rt_waiter) {
				2769	/*
				2770	* Got the lock. We might not be the anticipated owner if we
				2771	* did a lock-steal - fix up the PI-state in that case.
				2772	*/
				2773	if (q.pi_state && (q.pi_state->owner != current)) {
				2774	spin_lock(q.lock_ptr);
				2775	ret = fixup_pi_state_owner(uaddr2, &q, current);
				2776	if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
				2777	rt_mutex_unlock(&q.pi_state->pi_mutex);
				2778	/*
				2779	* Drop the reference to the pi state which
				2780	* the requeue_pi() code acquired for us.
				2781	*/
				2782	free_pi_state(q.pi_state);
				2783	spin_unlock(q.lock_ptr);
				2784	}
				2785	} else {
				2786	struct rt_mutex *pi_mutex;
				2787
				2788	/*
				2789	* We have been woken up by futex_unlock_pi(), a timeout, or a
				2790	* signal. futex_unlock_pi() will not destroy the lock_ptr nor
				2791	* the pi_state.
				2792	*/
				2793	WARN_ON(!q.pi_state);
				2794	pi_mutex = &q.pi_state->pi_mutex;
				2795	ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
				2796	debug_rt_mutex_free_waiter(&rt_waiter);
				2797
				2798	spin_lock(q.lock_ptr);
				2799	/*
				2800	* Fixup the pi_state owner and possibly acquire the lock if we
				2801	* haven't already.
				2802	*/
				2803	res = fixup_owner(uaddr2, &q, !ret);
				2804	/*
				2805	* If fixup_owner() returned an error, proprogate that. If it
				2806	* acquired the lock, clear -ETIMEDOUT or -EINTR.
				2807	*/
				2808	if (res)
				2809	ret = (res < 0) ? res : 0;
				2810
				2811	/*
				2812	* If fixup_pi_state_owner() faulted and was unable to handle
				2813	* the fault, unlock the rt_mutex and return the fault to
				2814	* userspace.
				2815	*/
				2816	if (ret && rt_mutex_owner(pi_mutex) == current)
				2817	rt_mutex_unlock(pi_mutex);
				2818
				2819	/* Unqueue and drop the lock. */
				2820	unqueue_me_pi(&q);
				2821	}
				2822
				2823	if (ret == -EINTR) {
				2824	/*
				2825	* We've already been requeued, but cannot restart by calling
				2826	* futex_lock_pi() directly. We could restart this syscall, but
				2827	* it would detect that the user space "val" changed and return
				2828	* -EWOULDBLOCK. Save the overhead of the restart and return
				2829	* -EWOULDBLOCK directly.
				2830	*/
				2831	ret = -EWOULDBLOCK;
				2832	}
				2833
				2834	out_put_keys:
				2835	put_futex_key(&q.key);
				2836	out_key2:
				2837	put_futex_key(&key2);
				2838
				2839	out:
				2840	if (to) {
				2841	hrtimer_cancel(&to->timer);
				2842	destroy_hrtimer_on_stack(&to->timer);
				2843	}
				2844	return ret;
				2845	}
				2846
				2847	/*
				2848	* Support for robust futexes: the kernel cleans up held futexes at
				2849	* thread exit time.
				2850	*
				2851	* Implementation: user-space maintains a per-thread list of locks it
				2852	* is holding. Upon do_exit(), the kernel carefully walks this list,
				2853	* and marks all locks that are owned by this thread with the
				2854	* FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
				2855	* always manipulated with the lock held, so the list is private and
				2856	* per-thread. Userspace also maintains a per-thread 'list_op_pending'
				2857	* field, to allow the kernel to clean up if the thread dies after
				2858	* acquiring the lock, but just before it could have added itself to
				2859	* the list. There can only be one such pending lock.
				2860	*/
				2861
				2862	/**
				2863	* sys_set_robust_list() - Set the robust-futex list head of a task
				2864	* @head: pointer to the list-head
				2865	* @len: length of the list-head, as userspace expects
				2866	*/
				2867	SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
				2868	size_t, len)
				2869	{
				2870	if (!futex_cmpxchg_enabled)
				2871	return -ENOSYS;
				2872	/*
				2873	* The kernel knows only one size for now:
				2874	*/
				2875	if (unlikely(len != sizeof(*head)))
				2876	return -EINVAL;
				2877
				2878	current->robust_list = head;
				2879
				2880	return 0;
				2881	}
				2882
				2883	/**
				2884	* sys_get_robust_list() - Get the robust-futex list head of a task
				2885	* @pid: pid of the process [zero for current task]
				2886	* @head_ptr: pointer to a list-head pointer, the kernel fills it in
				2887	* @len_ptr: pointer to a length field, the kernel fills in the header size
				2888	*/
				2889	SYSCALL_DEFINE3(get_robust_list, int, pid,
				2890	struct robust_list_head __user * __user *, head_ptr,
				2891	size_t __user *, len_ptr)
				2892	{
				2893	struct robust_list_head __user *head;
				2894	unsigned long ret;
				2895	struct task_struct *p;
				2896
				2897	if (!futex_cmpxchg_enabled)
				2898	return -ENOSYS;
				2899
				2900	rcu_read_lock();
				2901
				2902	ret = -ESRCH;
				2903	if (!pid)
				2904	p = current;
				2905	else {
				2906	p = find_task_by_vpid(pid);
				2907	if (!p)
				2908	goto err_unlock;
				2909	}
				2910
				2911	ret = -EPERM;
				2912	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
				2913	goto err_unlock;
				2914
				2915	head = p->robust_list;
				2916	rcu_read_unlock();
				2917
				2918	if (put_user(sizeof(*head), len_ptr))
				2919	return -EFAULT;
				2920	return put_user(head, head_ptr);
				2921
				2922	err_unlock:
				2923	rcu_read_unlock();
				2924
				2925	return ret;
				2926	}
				2927
				2928	/*
				2929	* Process a futex-list entry, check whether it's owned by the
				2930	* dying task, and do notification if so:
				2931	*/
				2932	int handle_futex_death(u32 __user uaddr, struct task_struct curr, int pi)
				2933	{
				2934	u32 uval, uninitialized_var(nval), mval;
				2935
				2936	retry:
				2937	if (get_user(uval, uaddr))
				2938	return -1;
				2939
				2940	if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
				2941	/*
				2942	* Ok, this dying thread is truly holding a futex
				2943	* of interest. Set the OWNER_DIED bit atomically
				2944	* via cmpxchg, and if the value had FUTEX_WAITERS
				2945	* set, wake up a waiter (if any). (We have to do a
				2946	* futex_wake() even if OWNER_DIED is already set -
				2947	* to handle the rare but possible case of recursive
				2948	* thread-death.) The rest of the cleanup is done in
				2949	* userspace.
				2950	*/
				2951	mval = (uval & FUTEX_WAITERS) \| FUTEX_OWNER_DIED;
				2952	/*
				2953	* We are not holding a lock here, but we want to have
				2954	* the pagefault_disable/enable() protection because
				2955	* we want to handle the fault gracefully. If the
				2956	* access fails we try to fault in the futex with R/W
				2957	* verification via get_user_pages. get_user() above
				2958	* does not guarantee R/W access. If that fails we
				2959	* give up and leave the futex locked.
				2960	*/
				2961	if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
				2962	if (fault_in_user_writeable(uaddr))
				2963	return -1;
				2964	goto retry;
				2965	}
				2966	if (nval != uval)
				2967	goto retry;
				2968
				2969	/*
				2970	* Wake robust non-PI futexes here. The wakeup of
				2971	* PI futexes happens in exit_pi_state():
				2972	*/
				2973	if (!pi && (uval & FUTEX_WAITERS))
				2974	futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
				2975	}
				2976	return 0;
				2977	}
				2978
				2979	/*
				2980	* Fetch a robust-list pointer. Bit 0 signals PI futexes:
				2981	*/
				2982	static inline int fetch_robust_entry(struct robust_list __user **entry,
				2983	struct robust_list __user * __user *head,
				2984	unsigned int *pi)
				2985	{
				2986	unsigned long uentry;
				2987
				2988	if (get_user(uentry, (unsigned long __user *)head))
				2989	return -EFAULT;
				2990
				2991	entry = (void __user )(uentry & ~1UL);
				2992	*pi = uentry & 1;
				2993
				2994	return 0;
				2995	}
				2996
				2997	/*
				2998	* Walk curr->robust_list (very carefully, it's a userspace list!)
				2999	* and mark any locks found there dead, and notify any waiters.
				3000	*
				3001	* We silently return on any sign of list-walking problem.
				3002	*/
				3003	void exit_robust_list(struct task_struct *curr)
				3004	{
				3005	struct robust_list_head __user *head = curr->robust_list;
				3006	struct robust_list __user entry, next_entry, *pending;
				3007	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
				3008	unsigned int uninitialized_var(next_pi);
				3009	unsigned long futex_offset;
				3010	int rc;
				3011
				3012	if (!futex_cmpxchg_enabled)
				3013	return;
				3014
				3015	/*
				3016	* Fetch the list head (which was registered earlier, via
				3017	* sys_set_robust_list()):
				3018	*/
				3019	if (fetch_robust_entry(&entry, &head->list.next, &pi))
				3020	return;
				3021	/*
				3022	* Fetch the relative futex offset:
				3023	*/
				3024	if (get_user(futex_offset, &head->futex_offset))
				3025	return;
				3026	/*
				3027	* Fetch any possibly pending lock-add first, and handle it
				3028	* if it exists:
				3029	*/
				3030	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
				3031	return;
				3032
				3033	next_entry = NULL; /* avoid warning with gcc */
				3034	while (entry != &head->list) {
				3035	/*
				3036	* Fetch the next entry in the list before calling
				3037	* handle_futex_death:
				3038	*/
				3039	rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
				3040	/*
				3041	* A pending lock might already be on the list, so
				3042	* don't process it twice:
				3043	*/
				3044	if (entry != pending)
				3045	if (handle_futex_death((void __user *)entry + futex_offset,
				3046	curr, pi))
				3047	return;
				3048	if (rc)
				3049	return;
				3050	entry = next_entry;
				3051	pi = next_pi;
				3052	/*
				3053	* Avoid excessively long or circular lists:
				3054	*/
				3055	if (!--limit)
				3056	break;
				3057
				3058	cond_resched();
				3059	}
				3060
				3061	if (pending)
				3062	handle_futex_death((void __user *)pending + futex_offset,
				3063	curr, pip);
				3064	}
				3065
				3066	long do_futex(u32 __user uaddr, int op, u32 val, ktime_t timeout,
				3067	u32 __user *uaddr2, u32 val2, u32 val3)
				3068	{
				3069	int cmd = op & FUTEX_CMD_MASK;
				3070	unsigned int flags = 0;
				3071
				3072	if (!(op & FUTEX_PRIVATE_FLAG))
				3073	flags \|= FLAGS_SHARED;
				3074
				3075	if (op & FUTEX_CLOCK_REALTIME) {
				3076	flags \|= FLAGS_CLOCKRT;
				3077	if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
				3078	return -ENOSYS;
				3079	}
				3080
				3081	switch (cmd) {
				3082	case FUTEX_LOCK_PI:
				3083	case FUTEX_UNLOCK_PI:
				3084	case FUTEX_TRYLOCK_PI:
				3085	case FUTEX_WAIT_REQUEUE_PI:
				3086	case FUTEX_CMP_REQUEUE_PI:
				3087	if (!futex_cmpxchg_enabled)
				3088	return -ENOSYS;
				3089	}
				3090
				3091	switch (cmd) {
				3092	case FUTEX_WAIT:
				3093	val3 = FUTEX_BITSET_MATCH_ANY;
				3094	case FUTEX_WAIT_BITSET:
				3095	return futex_wait(uaddr, flags, val, timeout, val3);
				3096	case FUTEX_WAKE:
				3097	val3 = FUTEX_BITSET_MATCH_ANY;
				3098	case FUTEX_WAKE_BITSET:
				3099	return futex_wake(uaddr, flags, val, val3);
				3100	case FUTEX_REQUEUE:
				3101	return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
				3102	case FUTEX_CMP_REQUEUE:
				3103	return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
				3104	case FUTEX_WAKE_OP:
				3105	return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
				3106	case FUTEX_LOCK_PI:
				3107	return futex_lock_pi(uaddr, flags, timeout, 0);
				3108	case FUTEX_UNLOCK_PI:
				3109	return futex_unlock_pi(uaddr, flags);
				3110	case FUTEX_TRYLOCK_PI:
				3111	return futex_lock_pi(uaddr, flags, NULL, 1);
				3112	case FUTEX_WAIT_REQUEUE_PI:
				3113	val3 = FUTEX_BITSET_MATCH_ANY;
				3114	return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
				3115	uaddr2);
				3116	case FUTEX_CMP_REQUEUE_PI:
				3117	return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
				3118	}
				3119	return -ENOSYS;
				3120	}
				3121
				3122
				3123	SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
				3124	struct timespec __user , utime, u32 __user , uaddr2,
				3125	u32, val3)
				3126	{
				3127	struct timespec ts;
				3128	ktime_t t, *tp = NULL;
				3129	u32 val2 = 0;
				3130	int cmd = op & FUTEX_CMD_MASK;
				3131
				3132	if (utime && (cmd == FUTEX_WAIT \|\| cmd == FUTEX_LOCK_PI \|\|
				3133	cmd == FUTEX_WAIT_BITSET \|\|
				3134	cmd == FUTEX_WAIT_REQUEUE_PI)) {
				3135	if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
				3136	return -EFAULT;
				3137	if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
				3138	return -EFAULT;
				3139	if (!timespec_valid(&ts))
				3140	return -EINVAL;
				3141
				3142	t = timespec_to_ktime(ts);
				3143	if (cmd == FUTEX_WAIT)
				3144	t = ktime_add_safe(ktime_get(), t);
				3145	tp = &t;
				3146	}
				3147	/*
				3148	* requeue parameter in 'utime' if cmd == FUTEX__REQUEUE_.
				3149	* number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
				3150	*/
				3151	if (cmd == FUTEX_REQUEUE \|\| cmd == FUTEX_CMP_REQUEUE \|\|
				3152	cmd == FUTEX_CMP_REQUEUE_PI \|\| cmd == FUTEX_WAKE_OP)
				3153	val2 = (u32) (unsigned long) utime;
				3154
				3155	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
				3156	}
				3157
				3158	static void __init futex_detect_cmpxchg(void)
				3159	{
				3160	#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
				3161	u32 curval;
				3162
				3163	/*
				3164	* This will fail and we want it. Some arch implementations do
				3165	* runtime detection of the futex_atomic_cmpxchg_inatomic()
				3166	* functionality. We want to know that before we call in any
				3167	* of the complex code paths. Also we want to prevent
				3168	* registration of robust lists in that case. NULL is
				3169	* guaranteed to fault and we get -EFAULT on functional
				3170	* implementation, the non-functional ones will return
				3171	* -ENOSYS.
				3172	*/
				3173	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
				3174	futex_cmpxchg_enabled = 1;
				3175	#endif
				3176	}
				3177
				3178	static int __init futex_init(void)
				3179	{
				3180	unsigned int futex_shift;
				3181	unsigned long i;
				3182
				3183	#if CONFIG_BASE_SMALL
				3184	futex_hashsize = 16;
				3185	#else
				3186	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
				3187	#endif
				3188
				3189	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
				3190	futex_hashsize, 0,
				3191	futex_hashsize < 256 ? HASH_SMALL : 0,
				3192	&futex_shift, NULL,
				3193	futex_hashsize, futex_hashsize);
				3194	futex_hashsize = 1UL << futex_shift;
				3195
				3196	futex_detect_cmpxchg();
				3197
				3198	for (i = 0; i < futex_hashsize; i++) {
				3199	atomic_set(&futex_queues[i].waiters, 0);
				3200	plist_head_init(&futex_queues[i].chain);
				3201	spin_lock_init(&futex_queues[i].lock);
				3202	}
				3203
				3204	return 0;
				3205	}
				3206	core_initcall(futex_init);