/*
 * linux/ipc/sem.c
 * Copyright (C) 1992 Krishna Balasubramanian
 * Copyright (C) 1995 Eric Schenk, Bruno Haible
 *
 * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 *
 * SMP-threaded, sysctl's added
 * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
 * Enforced range limit on SEM_UNDO
 * (c) 2001 Red Hat Inc
 * Lockless wakeup
 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
 * Further wakeup optimizations, documentation
 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
 *
 * support for audit of ipc object properties and permission changes
 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
 *
 * namespaces support
 * OpenVZ, SWsoft Inc.
 * Pavel Emelianov <xemul@openvz.org>
 *
 * Implementation notes: (May 2010)
 * This file implements System V semaphores.
 *
 * User space visible behavior:
 * - FIFO ordering for semop() operations (just FIFO, not starvation
 *   protection)
 * - multiple semaphore operations that alter the same semaphore in
 *   one semop() are handled.
 * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
 *   SETALL calls.
 * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
 * - undo adjustments at process exit are limited to 0..SEMVMX.
 * - namespaces are supported.
 * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
 *   to /proc/sys/kernel/sem.
 * - statistics about the usage are reported in /proc/sysvipc/sem.
 *
 * Internals:
 * - scalability:
 *   - all global variables are read-mostly.
 *   - semop() calls and semctl(RMID) are synchronized by RCU.
 *   - most operations do write operations (actually: spin_lock calls) to
 *     the per-semaphore array structure.
 *     Thus: Perfect SMP scaling between independent semaphore arrays.
 *     If multiple semaphores in one array are used, then cache line
 *     thrashing on the semaphore array spinlock will limit the scaling.
 * - semncnt and semzcnt are calculated on demand in count_semcnt()
 * - the task that performs a successful semop() scans the list of all
 *   sleeping tasks and completes any pending operations that can be fulfilled.
 *   Semaphores are actively given to waiting tasks (necessary for FIFO).
 *   (see update_queue())
 * - To improve the scalability, the actual wake-up calls are performed after
 *   dropping all locks. (see wake_up_sem_queue_prepare(),
 *   wake_up_sem_queue_do())
 * - All work is done by the waker, the woken up task does not have to do
 *   anything - not even acquiring a lock or dropping a refcount.
 * - A woken up task may not even touch the semaphore array anymore, it may
 *   have been destroyed already by a semctl(RMID).
 * - The synchronization between wake-ups due to a timeout/signal and a
 *   wake-up due to a completed semaphore operation is achieved by using an
 *   intermediate state (IN_WAKEUP).
 * - UNDO values are stored in an array (one per process and per
 *   semaphore array, lazily allocated). For backwards compatibility, multiple
 *   modes for the UNDO variables are supported (per process, per thread)
 *   (see copy_semundo, CLONE_SYSVSEM)
 * - There are two lists of the pending operations: a per-array list
 *   and per-semaphore list (stored in the array). This allows achieving FIFO
 *   ordering without always scanning all pending operations.
 *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
 */
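/*
 * For illustration only (not kernel code): a minimal userspace sketch of
 * the interface implemented by this file. semget() creates a one-semaphore
 * set with an arbitrary example key, semctl(SETVAL) initializes it to 1,
 * and the two semop() calls form the classic P()/V() pair: the decrement
 * may sleep (woken in FIFO order), the increment wakes one sleeper.
 * Error handling is omitted; the semun union must be defined by the
 * caller on Linux.
 *
 *	#include <sys/types.h>
 *	#include <sys/ipc.h>
 *	#include <sys/sem.h>
 *
 *	union semun { int val; struct semid_ds *buf; unsigned short *array; };
 *
 *	int main(void)
 *	{
 *		union semun arg = { .val = 1 };
 *		struct sembuf down = { 0, -1, SEM_UNDO };
 *		struct sembuf up = { 0, 1, SEM_UNDO };
 *		int semid = semget(0x1234, 1, IPC_CREAT | 0600);
 *
 *		semctl(semid, 0, SETVAL, arg);
 *		semop(semid, &down, 1);
 *		semop(semid, &up, 1);
 *		semctl(semid, 0, IPC_RMID);
 *		return 0;
 *	}
 */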

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/time.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/seq_file.h>
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>

#include <linux/uaccess.h>
#include "util.h"

/* One semaphore structure for each semaphore in the system. */
struct sem {
	int	semval;		/* current value */
	int	sempid;		/* pid of last operation */
	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
	struct list_head pending_alter; /* pending single-sop operations */
					/* that alter the semaphore */
	struct list_head pending_const; /* pending single-sop operations */
					/* that do not alter the semaphore*/
	time_t	sem_otime;	/* candidate for sem_otime */
} ____cacheline_aligned_in_smp;

/* One queue for each sleeping process in the system. */
struct sem_queue {
	struct list_head	list;	 /* queue of pending operations */
	struct task_struct	*sleeper; /* this process */
	struct sem_undo		*undo;	 /* undo structure */
	int			pid;	 /* process id of requesting process */
	int			status;	 /* completion status of operation */
	struct sembuf		*sops;	 /* array of pending operations */
	struct sembuf		*blocking; /* the operation that blocked */
	int			nsops;	 /* number of operations */
	int			alter;	 /* does *sops alter the array? */
};

/* Each task has a list of undo requests. They are executed automatically
 * when the process exits.
 */
struct sem_undo {
	struct list_head	list_proc;	/* per-process list: *
						 * all undos from one process
						 * rcu protected */
	struct rcu_head		rcu;		/* rcu struct for sem_undo */
	struct sem_undo_list	*ulp;		/* back ptr to sem_undo_list */
	struct list_head	list_id;	/* per semaphore array list:
						 * all undos for one array */
	int			semid;		/* semaphore set identifier */
	short			*semadj;	/* array of adjustments */
						/* one per semaphore */
};

/* sem_undo_list controls shared access to the list of sem_undo structures
 * that may be shared among all tasks in a CLONE_SYSVSEM task group.
 */
struct sem_undo_list {
	atomic_t		refcnt;
	spinlock_t		lock;
	struct list_head	list_proc;
};


#define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])

#define sem_checkid(sma, semid)	ipc_checkid(&sma->sem_perm, semid)

static int newary(struct ipc_namespace *, struct ipc_params *);
static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
#ifdef CONFIG_PROC_FS
static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#endif

#define SEMMSL_FAST	256 /* 512 bytes on stack */
#define SEMOPM_FAST	64  /* ~ 372 bytes on stack */

/*
 * Locking:
 * a) global sem_lock() for read/write
 *	sem_undo.id_next,
 *	sem_array.complex_count,
 *	sem_array.complex_mode
 *	sem_array.pending{_alter,_const},
 *	sem_array.sem_undo
 *
 * b) global or semaphore sem_lock() for read/write:
 *	sem_array.sem_base[i].pending_{const,alter}:
 *	sem_array.complex_mode (for read)
 *
 * c) special:
 *	sem_undo_list.list_proc:
 *	* undo_list->lock for write
 *	* rcu for read
 */

#define sc_semmsl	sem_ctls[0]
#define sc_semmns	sem_ctls[1]
#define sc_semopm	sem_ctls[2]
#define sc_semmni	sem_ctls[3]

void sem_init_ns(struct ipc_namespace *ns)
{
	ns->sc_semmsl = SEMMSL;
	ns->sc_semmns = SEMMNS;
	ns->sc_semopm = SEMOPM;
	ns->sc_semmni = SEMMNI;
	ns->used_sems = 0;
	ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
}

#ifdef CONFIG_IPC_NS
void sem_exit_ns(struct ipc_namespace *ns)
{
	free_ipcs(ns, &sem_ids(ns), freeary);
	idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
}
#endif

void __init sem_init(void)
{
	sem_init_ns(&init_ipc_ns);
	ipc_init_proc_interface("sysvipc/sem",
				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
				IPC_SEM_IDS, sysvipc_sem_proc_show);
}

/**
 * unmerge_queues - unmerge queues, if possible.
 * @sma: semaphore array
 *
 * The function unmerges the wait queues if complex_count is 0.
 * It must be called prior to dropping the global semaphore array lock.
 */
static void unmerge_queues(struct sem_array *sma)
{
	struct sem_queue *q, *tq;

	/* complex operations still around? */
	if (sma->complex_count)
		return;
	/*
	 * We will switch back to simple mode.
	 * Move all pending operations back into the per-semaphore
	 * queues.
	 */
	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
		struct sem *curr;
		curr = &sma->sem_base[q->sops[0].sem_num];

		list_add_tail(&q->list, &curr->pending_alter);
	}
	INIT_LIST_HEAD(&sma->pending_alter);
}

/**
 * merge_queues - merge single semop queues into global queue
 * @sma: semaphore array
 *
 * This function merges all per-semaphore queues into the global queue.
 * It is necessary to achieve FIFO ordering for the pending single-sop
 * operations when a multi-semop operation must sleep.
 * Only the alter operations must be moved, the const operations can stay.
 */
static void merge_queues(struct sem_array *sma)
{
	int i;
	for (i = 0; i < sma->sem_nsems; i++) {
		struct sem *sem = sma->sem_base + i;

		list_splice_init(&sem->pending_alter, &sma->pending_alter);
	}
}

static void sem_rcu_free(struct rcu_head *head)
{
	struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
	struct sem_array *sma = ipc_rcu_to_struct(p);

	security_sem_free(sma);
	ipc_rcu_free(head);
}

/*
 * spin_unlock_wait() and !spin_is_locked() are not memory barriers, they
 * are only control barriers.
 * The code must pair with spin_unlock(&sem->lock) or
 * spin_unlock(&sem_perm.lock), thus just the control barrier is insufficient.
 *
 * smp_rmb() is sufficient, as writes cannot pass the control barrier.
 */
#define ipc_smp_acquire__after_spin_is_unlocked()	smp_rmb()

/*
 * Enter the mode suitable for non-simple operations:
 * Caller must own sem_perm.lock.
 */
static void complexmode_enter(struct sem_array *sma)
{
	int i;
	struct sem *sem;

	if (sma->complex_mode) {
		/* We are already in complex_mode. Nothing to do */
		return;
	}

	/* We need a full barrier after setting complex_mode:
	 * The write to complex_mode must be visible
	 * before we read the first sem->lock spinlock state.
	 */
	smp_store_mb(sma->complex_mode, true);

	for (i = 0; i < sma->sem_nsems; i++) {
		sem = sma->sem_base + i;
		spin_unlock_wait(&sem->lock);
	}
	ipc_smp_acquire__after_spin_is_unlocked();
}

/*
 * Try to leave the mode that disallows simple operations:
 * Caller must own sem_perm.lock.
 */
static void complexmode_tryleave(struct sem_array *sma)
{
	if (sma->complex_count) {
		/* Complex ops are sleeping.
		 * We must stay in complex mode
		 */
		return;
	}
	/*
	 * Immediately after setting complex_mode to false,
	 * a simple op can start. Thus: all memory writes
	 * performed by the current operation must be visible
	 * before we set complex_mode to false.
	 */
	smp_store_release(&sma->complex_mode, false);
}

#define SEM_GLOBAL_LOCK	(-1)
/*
 * If the request contains only one semaphore operation, and there are
 * no complex transactions pending, lock only the semaphore involved.
 * Otherwise, lock the entire semaphore array, since we either have
 * multiple semaphores in our own semops, or we need to look at
 * semaphores from other pending complex operations.
 */
static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
			      int nsops)
{
	struct sem *sem;

	if (nsops != 1) {
		/* Complex operation - acquire a full lock */
		ipc_lock_object(&sma->sem_perm);

		/* Prevent parallel simple ops */
		complexmode_enter(sma);
		return SEM_GLOBAL_LOCK;
	}

	/*
	 * Only one semaphore affected - try to optimize locking.
	 * Optimized locking is possible if no complex operation
	 * is either enqueued or processed right now.
	 *
	 * Both facts are tracked by complex_mode.
	 */
	sem = sma->sem_base + sops->sem_num;

	/*
	 * Initial check for complex_mode. Just an optimization,
	 * no locking, no memory barrier.
	 */
	if (!sma->complex_mode) {
		/*
		 * It appears that no complex operation is around.
		 * Acquire the per-semaphore lock.
		 */
		spin_lock(&sem->lock);

		/*
		 * See 51d7d5205d33
		 * ("powerpc: Add smp_mb() to arch_spin_is_locked()"):
		 * A full barrier is required: the write of sem->lock
		 * must be visible before the read is executed
		 */
		smp_mb();

		if (!smp_load_acquire(&sma->complex_mode)) {
			/* fast path successful! */
			return sops->sem_num;
		}
		spin_unlock(&sem->lock);
	}

	/* slow path: acquire the full lock */
	ipc_lock_object(&sma->sem_perm);

	if (sma->complex_count == 0) {
		/* False alarm:
		 * There is no complex operation, thus we can switch
		 * back to the fast path.
		 */
		spin_lock(&sem->lock);
		ipc_unlock_object(&sma->sem_perm);
		return sops->sem_num;
	} else {
		/* Not a false alarm, thus complete the sequence for a
		 * full lock.
		 */
		complexmode_enter(sma);
		return SEM_GLOBAL_LOCK;
	}
}

static inline void sem_unlock(struct sem_array *sma, int locknum)
{
	if (locknum == SEM_GLOBAL_LOCK) {
		unmerge_queues(sma);
		complexmode_tryleave(sma);
		ipc_unlock_object(&sma->sem_perm);
	} else {
		struct sem *sem = sma->sem_base + locknum;
		spin_unlock(&sem->lock);
	}
}

/*
 * sem_lock_(check_) routines are called in the paths where the rwsem
 * is not held.
 *
 * The caller holds the RCU read lock.
 */
static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
			int id, struct sembuf *sops, int nsops, int *locknum)
{
	struct kern_ipc_perm *ipcp;
	struct sem_array *sma;

	ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	sma = container_of(ipcp, struct sem_array, sem_perm);
	*locknum = sem_lock(sma, sops, nsops);

	/* ipc_rmid() may have already freed the ID while sem_lock
	 * was spinning: verify that the structure is still valid
	 */
	if (ipc_valid_object(ipcp))
		return container_of(ipcp, struct sem_array, sem_perm);

	sem_unlock(sma, *locknum);
	return ERR_PTR(-EINVAL);
}

static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
{
	struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);

	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	return container_of(ipcp, struct sem_array, sem_perm);
}

static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns,
							int id)
{
	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id);

	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	return container_of(ipcp, struct sem_array, sem_perm);
}

static inline void sem_lock_and_putref(struct sem_array *sma)
{
	sem_lock(sma, NULL, -1);
	ipc_rcu_putref(sma, sem_rcu_free);
}

static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
{
	ipc_rmid(&sem_ids(ns), &s->sem_perm);
}

/*
 * Lockless wakeup algorithm:
 * Without the check/retry algorithm a lockless wakeup is possible:
 * - queue.status is initialized to -EINTR before blocking.
 * - wakeup is performed by
 *	* unlinking the queue entry from the pending list
 *	* setting queue.status to IN_WAKEUP
 *	  This is the notification for the blocked thread that a
 *	  result value is imminent.
 *	* call wake_up_process
 *	* set queue.status to the final value.
 * - the previously blocked thread checks queue.status:
 *	* if it's IN_WAKEUP, then it must wait until the value changes
 *	* if it's not -EINTR, then the operation was completed by
 *	  update_queue. semtimedop can return queue.status without
 *	  performing any operation on the sem array.
 *	* otherwise it must acquire the spinlock and check what's up.
 *
 * The two-stage algorithm is necessary to protect against the following
 * races:
 * - if queue.status is set after wake_up_process, then the woken up idle
 *   thread could race forward and try (and fail) to acquire sma->lock
 *   before update_queue had a chance to set queue.status
 * - if queue.status is written before wake_up_process and if the
 *   blocked process is woken up by a signal between writing
 *   queue.status and the wake_up_process, then the woken up
 *   process could return from semtimedop and die by calling
 *   sys_exit before wake_up_process is called. Then wake_up_process
 *   will oops, because the task structure is already invalid.
 *   (yes, this happened on s390 with sysv msg).
 *
 */
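/*
 * A hedged sketch of the sleeper side (the real code lives in the
 * semtimedop() path further down in this file): after returning from
 * schedule(), the task spins while the status is still IN_WAKEUP and
 * only then trusts the value as the final return code:
 *
 *	error = q->status;
 *	while (unlikely(error == IN_WAKEUP)) {
 *		cpu_relax();
 *		error = q->status;
 *	}
 *	if (error != -EINTR)
 *		return error;
 */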
#define IN_WAKEUP	1

/**
 * newary - Create a new semaphore set
 * @ns: namespace
 * @params: ptr to the structure that contains key, semflg and nsems
 *
 * Called with sem_ids.rwsem held (as a writer)
 */
static int newary(struct ipc_namespace *ns, struct ipc_params *params)
{
	int id;
	int retval;
	struct sem_array *sma;
	int size;
	key_t key = params->key;
	int nsems = params->u.nsems;
	int semflg = params->flg;
	int i;

	if (!nsems)
		return -EINVAL;
	if (ns->used_sems + nsems > ns->sc_semmns)
		return -ENOSPC;

	size = sizeof(*sma) + nsems * sizeof(struct sem);
	sma = ipc_rcu_alloc(size);
	if (!sma)
		return -ENOMEM;

	memset(sma, 0, size);

	sma->sem_perm.mode = (semflg & S_IRWXUGO);
	sma->sem_perm.key = key;

	sma->sem_perm.security = NULL;
	retval = security_sem_alloc(sma);
	if (retval) {
		ipc_rcu_putref(sma, ipc_rcu_free);
		return retval;
	}

	sma->sem_base = (struct sem *) &sma[1];

	for (i = 0; i < nsems; i++) {
		INIT_LIST_HEAD(&sma->sem_base[i].pending_alter);
		INIT_LIST_HEAD(&sma->sem_base[i].pending_const);
		spin_lock_init(&sma->sem_base[i].lock);
	}

	sma->complex_count = 0;
	sma->complex_mode = true; /* dropped by sem_unlock below */
	INIT_LIST_HEAD(&sma->pending_alter);
	INIT_LIST_HEAD(&sma->pending_const);
	INIT_LIST_HEAD(&sma->list_id);
	sma->sem_nsems = nsems;
	sma->sem_ctime = get_seconds();

	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
	if (id < 0) {
		ipc_rcu_putref(sma, sem_rcu_free);
		return id;
	}
	ns->used_sems += nsems;

	sem_unlock(sma, -1);
	rcu_read_unlock();

	return sma->sem_perm.id;
}


/*
 * Called with sem_ids.rwsem and ipcp locked.
 */
static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg)
{
	struct sem_array *sma;

	sma = container_of(ipcp, struct sem_array, sem_perm);
	return security_sem_associate(sma, semflg);
}

/*
 * Called with sem_ids.rwsem and ipcp locked.
 */
static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
				struct ipc_params *params)
{
	struct sem_array *sma;

	sma = container_of(ipcp, struct sem_array, sem_perm);
	if (params->u.nsems > sma->sem_nsems)
		return -EINVAL;

	return 0;
}

SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
{
	struct ipc_namespace *ns;
	static const struct ipc_ops sem_ops = {
		.getnew = newary,
		.associate = sem_security,
		.more_checks = sem_more_checks,
	};
	struct ipc_params sem_params;

	ns = current->nsproxy->ipc_ns;

	if (nsems < 0 || nsems > ns->sc_semmsl)
		return -EINVAL;

	sem_params.key = key;
	sem_params.flg = semflg;
	sem_params.u.nsems = nsems;

	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
}

/**
 * perform_atomic_semop - Perform (if possible) a semaphore operation
 * @sma: semaphore array
 * @q: struct sem_queue that describes the operation
 *
 * Returns 0 if the operation was possible.
 * Returns 1 if the operation is impossible, the caller must sleep.
 * Negative values are error codes.
 */
static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
{
	int result, sem_op, nsops, pid;
	struct sembuf *sop;
	struct sem *curr;
	struct sembuf *sops;
	struct sem_undo *un;

	sops = q->sops;
	nsops = q->nsops;
	un = q->undo;

	for (sop = sops; sop < sops + nsops; sop++) {
		curr = sma->sem_base + sop->sem_num;
		sem_op = sop->sem_op;
		result = curr->semval;

		if (!sem_op && result)
			goto would_block;

		result += sem_op;
		if (result < 0)
			goto would_block;
		if (result > SEMVMX)
			goto out_of_range;

		if (sop->sem_flg & SEM_UNDO) {
			int undo = un->semadj[sop->sem_num] - sem_op;
			/* Exceeding the undo range is an error. */
			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
				goto out_of_range;
			un->semadj[sop->sem_num] = undo;
		}

		curr->semval = result;
	}

	sop--;
	pid = q->pid;
	while (sop >= sops) {
		sma->sem_base[sop->sem_num].sempid = pid;
		sop--;
	}

	return 0;

out_of_range:
	result = -ERANGE;
	goto undo;

would_block:
	q->blocking = sop;

	if (sop->sem_flg & IPC_NOWAIT)
		result = -EAGAIN;
	else
		result = 1;

undo:
	sop--;
	while (sop >= sops) {
		sem_op = sop->sem_op;
		sma->sem_base[sop->sem_num].semval -= sem_op;
		if (sop->sem_flg & SEM_UNDO)
			un->semadj[sop->sem_num] += sem_op;
		sop--;
	}

	return result;
}
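/*
 * For illustration (userspace, not kernel code): a single semop() call may
 * carry several operations, and perform_atomic_semop() applies them
 * all-or-nothing. E.g. on a two-semaphore set, "wait until semaphore 0
 * is zero, then raise semaphore 1" as one atomic step. Note that a
 * multi-sop request like this is a complex operation and takes the
 * global lock in sem_lock() above:
 *
 *	struct sembuf ops[2] = {
 *		{ 0, 0, 0 },
 *		{ 1, 1, 0 },
 *	};
 *	semop(semid, ops, 2);
 */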

/**
 * wake_up_sem_queue_prepare - prepare wake-up
 * @pt: list of tasks to be woken up
 * @q: queue entry that must be signaled
 * @error: error value for the signal
 *
 * Prepare the wake-up of the queue entry q.
 */
static void wake_up_sem_queue_prepare(struct list_head *pt,
				struct sem_queue *q, int error)
{
	if (list_empty(pt)) {
		/*
		 * Hold preempt off so that we don't get preempted and have the
		 * wakee busy-wait until we're scheduled back on.
		 */
		preempt_disable();
	}
	q->status = IN_WAKEUP;
	q->pid = error;

	list_add_tail(&q->list, pt);
}

/**
 * wake_up_sem_queue_do - do the actual wake-up
 * @pt: list of tasks to be woken up
 *
 * Do the actual wake-up.
 * The function is called without any locks held, thus the semaphore array
 * could be destroyed already and the tasks can disappear as soon as the
 * status is set to the actual return code.
 */
static void wake_up_sem_queue_do(struct list_head *pt)
{
	struct sem_queue *q, *t;
	int did_something;

	did_something = !list_empty(pt);
	list_for_each_entry_safe(q, t, pt, list) {
		wake_up_process(q->sleeper);
		/* q can disappear immediately after writing q->status. */
		smp_wmb();
		q->status = q->pid;
	}
	if (did_something)
		preempt_enable();
}

static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
{
	list_del(&q->list);
	if (q->nsops > 1)
		sma->complex_count--;
}

/**
 * check_restart - check if the queue scan must be restarted
 * @sma: semaphore array
 * @q: the operation that just completed
 *
 * update_queue is O(N^2) when it restarts scanning the whole queue of
 * waiting operations. Therefore this function checks if the restart is
 * really necessary. It is called after a previously waiting operation
 * modified the array.
 * Note that wait-for-zero operations are handled without restart.
 */
static int check_restart(struct sem_array *sma, struct sem_queue *q)
{
	/* pending complex alter operations are too difficult to analyse */
	if (!list_empty(&sma->pending_alter))
		return 1;

	/* we were a sleeping complex operation. Too difficult */
	if (q->nsops > 1)
		return 1;

	/* It is impossible that someone waits for the new value:
	 * - complex operations always restart.
	 * - wait-for-zero are handled separately.
	 * - q is a previously sleeping simple operation that
	 *   altered the array. It must be a decrement, because
	 *   simple increments never sleep.
	 * - If there are older (higher priority) decrements
	 *   in the queue, then they have observed the original
	 *   semval value and couldn't proceed. The operation
	 *   decreased the value - thus they won't proceed either.
	 */
	return 0;
}

/**
 * wake_const_ops - wake up non-alter tasks
 * @sma: semaphore array.
 * @semnum: semaphore that was modified.
 * @pt: list head for the tasks that must be woken up.
 *
 * wake_const_ops must be called after a semaphore in a semaphore array
 * was set to 0. If complex const operations are pending, wake_const_ops must
 * be called with semnum = -1, as well as with the number of each modified
 * semaphore.
 * The tasks that must be woken up are added to @pt. The return code
 * is stored in q->pid.
 * The function returns 1 if at least one operation was completed successfully.
 */
static int wake_const_ops(struct sem_array *sma, int semnum,
				struct list_head *pt)
{
	struct sem_queue *q;
	struct list_head *walk;
	struct list_head *pending_list;
	int semop_completed = 0;

	if (semnum == -1)
		pending_list = &sma->pending_const;
	else
		pending_list = &sma->sem_base[semnum].pending_const;

	walk = pending_list->next;
	while (walk != pending_list) {
		int error;

		q = container_of(walk, struct sem_queue, list);
		walk = walk->next;

		error = perform_atomic_semop(sma, q);

		if (error <= 0) {
			/* operation completed, remove from queue & wakeup */

			unlink_queue(sma, q);

			wake_up_sem_queue_prepare(pt, q, error);
			if (error == 0)
				semop_completed = 1;
		}
	}
	return semop_completed;
}

/**
 * do_smart_wakeup_zero - wakeup all wait for zero tasks
 * @sma: semaphore array
 * @sops: operations that were performed
 * @nsops: number of operations
 * @pt: list head of the tasks that must be woken up.
 *
 * Checks all required queues for wait-for-zero operations, based
 * on the actual changes that were performed on the semaphore array.
 * The function returns 1 if at least one operation was completed successfully.
 */
static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
					int nsops, struct list_head *pt)
{
	int i;
	int semop_completed = 0;
	int got_zero = 0;

	/* first: the per-semaphore queues, if known */
	if (sops) {
		for (i = 0; i < nsops; i++) {
			int num = sops[i].sem_num;

			if (sma->sem_base[num].semval == 0) {
				got_zero = 1;
				semop_completed |= wake_const_ops(sma, num, pt);
			}
		}
	} else {
		/*
		 * No sops means modified semaphores not known.
		 * Assume all were changed.
		 */
		for (i = 0; i < sma->sem_nsems; i++) {
			if (sma->sem_base[i].semval == 0) {
				got_zero = 1;
				semop_completed |= wake_const_ops(sma, i, pt);
			}
		}
	}
	/*
	 * If one of the modified semaphores got 0,
	 * then check the global queue, too.
	 */
	if (got_zero)
		semop_completed |= wake_const_ops(sma, -1, pt);

	return semop_completed;
}


/**
 * update_queue - look for tasks that can be completed.
 * @sma: semaphore array.
 * @semnum: semaphore that was modified.
 * @pt: list head for the tasks that must be woken up.
 *
 * update_queue must be called after a semaphore in a semaphore array
 * was modified. If multiple semaphores were modified, update_queue must
 * be called with semnum = -1, as well as with the number of each modified
 * semaphore.
 * The tasks that must be woken up are added to @pt. The return code
 * is stored in q->pid.
 * The function internally checks if const operations can now succeed.
 *
 * The function returns 1 if at least one semop was completed successfully.
 */
static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
{
	struct sem_queue *q;
	struct list_head *walk;
	struct list_head *pending_list;
	int semop_completed = 0;

	if (semnum == -1)
		pending_list = &sma->pending_alter;
	else
		pending_list = &sma->sem_base[semnum].pending_alter;

again:
	walk = pending_list->next;
	while (walk != pending_list) {
		int error, restart;

		q = container_of(walk, struct sem_queue, list);
		walk = walk->next;

		/* If we are scanning the single sop, per-semaphore list of
		 * one semaphore and that semaphore is 0, then it is not
		 * necessary to scan further: simple increments
		 * that affect only one entry succeed immediately and cannot
		 * be in the per semaphore pending queue, and decrements
		 * cannot be successful if the value is already 0.
		 */
		if (semnum != -1 && sma->sem_base[semnum].semval == 0)
			break;

		error = perform_atomic_semop(sma, q);

		/* Does q->sleeper still need to sleep? */
		if (error > 0)
			continue;

		unlink_queue(sma, q);

		if (error) {
			restart = 0;
		} else {
			semop_completed = 1;
			do_smart_wakeup_zero(sma, q->sops, q->nsops, pt);
			restart = check_restart(sma, q);
		}

		wake_up_sem_queue_prepare(pt, q, error);
		if (restart)
			goto again;
	}
	return semop_completed;
}

/**
 * set_semotime - set sem_otime
 * @sma: semaphore array
 * @sops: operations that modified the array, may be NULL
 *
 * sem_otime is replicated to avoid cache line thrashing.
 * This function sets one instance to the current time.
 */
static void set_semotime(struct sem_array *sma, struct sembuf *sops)
{
	if (sops == NULL) {
		sma->sem_base[0].sem_otime = get_seconds();
	} else {
		sma->sem_base[sops[0].sem_num].sem_otime =
							get_seconds();
	}
}

/**
 * do_smart_update - optimized update_queue
 * @sma: semaphore array
 * @sops: operations that were performed
 * @nsops: number of operations
 * @otime: force setting otime
 * @pt: list head of the tasks that must be woken up.
 *
 * do_smart_update() does the required calls to update_queue and wakeup_zero,
 * based on the actual changes that were performed on the semaphore array.
 * Note that the function does not do the actual wake-up: the caller is
 * responsible for calling wake_up_sem_queue_do(@pt).
 * It is safe to perform this call after dropping all locks.
 */
static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
			int otime, struct list_head *pt)
{
	int i;

	otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);

	if (!list_empty(&sma->pending_alter)) {
		/* semaphore array uses the global queue - just process it. */
		otime |= update_queue(sma, -1, pt);
	} else {
		if (!sops) {
			/*
			 * No sops, thus the modified semaphores are not
			 * known. Check all.
			 */
			for (i = 0; i < sma->sem_nsems; i++)
				otime |= update_queue(sma, i, pt);
		} else {
			/*
			 * Check the semaphores that were increased:
			 * - No complex ops, thus all sleeping ops are
			 *   decrements.
			 * - if we decreased the value, then any sleeping
			 *   semaphore ops won't be able to run: if the
			 *   previous value was too small, then the new
			 *   value will be too small, too.
			 */
			for (i = 0; i < nsops; i++) {
				if (sops[i].sem_op > 0) {
					otime |= update_queue(sma,
							sops[i].sem_num, pt);
				}
			}
		}
	}
	if (otime)
		set_semotime(sma, sops);
}

/*
 * check_qop: Test if a queued operation sleeps on the semaphore semnum
 */
static int check_qop(struct sem_array *sma, int semnum, struct sem_queue *q,
			bool count_zero)
{
	struct sembuf *sop = q->blocking;

	/*
	 * Linux always (since 0.99.10) reported a task as sleeping on all
	 * semaphores. This violates SUS, therefore it was changed to the
	 * standard compliant behavior.
	 * Give the administrators a chance to notice that an application
	 * might misbehave because it relies on the Linux behavior.
	 */
	pr_info_once("semctl(GETNCNT/GETZCNT) is since 3.16 Single Unix Specification compliant.\n"
			"The task %s (%d) triggered the difference, watch for misbehavior.\n",
			current->comm, task_pid_nr(current));

	if (sop->sem_num != semnum)
		return 0;

	if (count_zero && sop->sem_op == 0)
		return 1;
	if (!count_zero && sop->sem_op < 0)
		return 1;

	return 0;
}

/* The following counts are associated to each semaphore:
 *   semncnt	number of tasks waiting on semval being nonzero
 *   semzcnt	number of tasks waiting on semval being zero
 *
 * Per definition, a task waits only on the semaphore of the first semop
 * that cannot proceed, even if additional operations would block, too.
 */
static int count_semcnt(struct sem_array *sma, ushort semnum,
			bool count_zero)
{
	struct list_head *l;
	struct sem_queue *q;
	int semcnt;

	semcnt = 0;
	/* First: check the simple operations. They are easy to evaluate */
	if (count_zero)
		l = &sma->sem_base[semnum].pending_const;
	else
		l = &sma->sem_base[semnum].pending_alter;

	list_for_each_entry(q, l, list) {
		/* all tasks on a per-semaphore list sleep on exactly
		 * that semaphore
		 */
		semcnt++;
	}

	/* Then: check the complex operations. */
	list_for_each_entry(q, &sma->pending_alter, list) {
		semcnt += check_qop(sma, semnum, q, count_zero);
	}
	if (count_zero) {
		list_for_each_entry(q, &sma->pending_const, list) {
			semcnt += check_qop(sma, semnum, q, count_zero);
		}
	}
	return semcnt;
}
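/*
 * Userspace view (illustration only): the counts computed above are what
 * semctl(GETNCNT)/semctl(GETZCNT) report, e.g. for semaphore 0:
 *
 *	int ncnt = semctl(semid, 0, GETNCNT);
 *	int zcnt = semctl(semid, 0, GETZCNT);
 *
 * ncnt is the number of tasks blocked waiting for semval to increase,
 * zcnt the number of tasks blocked waiting for semval to become zero.
 */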

/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
 * as a writer and the spinlock for this semaphore set held. sem_ids.rwsem
 * remains locked on exit.
 */
static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
{
	struct sem_undo *un, *tu;
	struct sem_queue *q, *tq;
	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
	struct list_head tasks;
	int i;

	/* Free the existing undo structures for this semaphore set. */
	ipc_assert_locked_object(&sma->sem_perm);
	list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
		list_del(&un->list_id);
		spin_lock(&un->ulp->lock);
		un->semid = -1;
		list_del_rcu(&un->list_proc);
		spin_unlock(&un->ulp->lock);
		kfree_rcu(un, rcu);
	}

	/* Wake up all pending processes and let them fail with EIDRM. */
	INIT_LIST_HEAD(&tasks);
	list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
		unlink_queue(sma, q);
		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
	}

	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
		unlink_queue(sma, q);
		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
	}
	for (i = 0; i < sma->sem_nsems; i++) {
		struct sem *sem = sma->sem_base + i;
		list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
			unlink_queue(sma, q);
			wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
		}
		list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
			unlink_queue(sma, q);
			wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
		}
	}

	/* Remove the semaphore set from the IDR */
	sem_rmid(ns, sma);
	sem_unlock(sma, -1);
	rcu_read_unlock();

	wake_up_sem_queue_do(&tasks);
	ns->used_sems -= sma->sem_nsems;
	ipc_rcu_putref(sma, sem_rcu_free);
}

static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
{
	switch (version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	{
		struct semid_ds out;

		memset(&out, 0, sizeof(out));

		ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm);

		out.sem_otime = in->sem_otime;
		out.sem_ctime = in->sem_ctime;
		out.sem_nsems = in->sem_nsems;

		return copy_to_user(buf, &out, sizeof(out));
	}
	default:
		return -EINVAL;
	}
}

static time_t get_semotime(struct sem_array *sma)
{
	int i;
	time_t res;

	res = sma->sem_base[0].sem_otime;
	for (i = 1; i < sma->sem_nsems; i++) {
		time_t to = sma->sem_base[i].sem_otime;

		if (to > res)
			res = to;
	}
	return res;
}

static int semctl_nolock(struct ipc_namespace *ns, int semid,
			 int cmd, int version, void __user *p)
{
	int err;
	struct sem_array *sma;

	switch (cmd) {
	case IPC_INFO:
	case SEM_INFO:
	{
		struct seminfo seminfo;
		int max_id;

		err = security_sem_semctl(NULL, cmd);
		if (err)
			return err;

		memset(&seminfo, 0, sizeof(seminfo));
		seminfo.semmni = ns->sc_semmni;
		seminfo.semmns = ns->sc_semmns;
		seminfo.semmsl = ns->sc_semmsl;
		seminfo.semopm = ns->sc_semopm;
		seminfo.semvmx = SEMVMX;
		seminfo.semmnu = SEMMNU;
		seminfo.semmap = SEMMAP;
		seminfo.semume = SEMUME;
		down_read(&sem_ids(ns).rwsem);
		if (cmd == SEM_INFO) {
			seminfo.semusz = sem_ids(ns).in_use;
			seminfo.semaem = ns->used_sems;
		} else {
			seminfo.semusz = SEMUSZ;
			seminfo.semaem = SEMAEM;
		}
		max_id = ipc_get_maxid(&sem_ids(ns));
		up_read(&sem_ids(ns).rwsem);
		if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
			return -EFAULT;
		return (max_id < 0) ? 0 : max_id;
	}
	case IPC_STAT:
	case SEM_STAT:
	{
		struct semid64_ds tbuf;
		int id = 0;

		memset(&tbuf, 0, sizeof(tbuf));

		rcu_read_lock();
		if (cmd == SEM_STAT) {
			sma = sem_obtain_object(ns, semid);
			if (IS_ERR(sma)) {
				err = PTR_ERR(sma);
				goto out_unlock;
			}
			id = sma->sem_perm.id;
		} else {
			sma = sem_obtain_object_check(ns, semid);
			if (IS_ERR(sma)) {
				err = PTR_ERR(sma);
				goto out_unlock;
			}
		}

		err = -EACCES;
		if (ipcperms(ns, &sma->sem_perm, S_IRUGO))
			goto out_unlock;

		err = security_sem_semctl(sma, cmd);
		if (err)
			goto out_unlock;

		kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm);
		tbuf.sem_otime = get_semotime(sma);
		tbuf.sem_ctime = sma->sem_ctime;
		tbuf.sem_nsems = sma->sem_nsems;
		rcu_read_unlock();
		if (copy_semid_to_user(p, &tbuf, version))
			return -EFAULT;
		return id;
	}
	default:
		return -EINVAL;
	}
out_unlock:
	rcu_read_unlock();
	return err;
}

static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
		unsigned long arg)
{
	struct sem_undo *un;
	struct sem_array *sma;
	struct sem *curr;
	int err;
	struct list_head tasks;
	int val;
#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
	/* big-endian 64bit */
	val = arg >> 32;
#else
	/* 32bit or little-endian 64bit */
	val = arg;
#endif

	if (val > SEMVMX || val < 0)
		return -ERANGE;

	INIT_LIST_HEAD(&tasks);

	rcu_read_lock();
	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		return PTR_ERR(sma);
	}

	if (semnum < 0 || semnum >= sma->sem_nsems) {
		rcu_read_unlock();
		return -EINVAL;
	}


	if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) {
		rcu_read_unlock();
		return -EACCES;
	}

	err = security_sem_semctl(sma, SETVAL);
	if (err) {
		rcu_read_unlock();
		return -EACCES;
	}

	sem_lock(sma, NULL, -1);

	if (!ipc_valid_object(&sma->sem_perm)) {
		sem_unlock(sma, -1);
		rcu_read_unlock();
		return -EIDRM;
	}

	curr = &sma->sem_base[semnum];

	ipc_assert_locked_object(&sma->sem_perm);
	list_for_each_entry(un, &sma->list_id, list_id)
		un->semadj[semnum] = 0;

	curr->semval = val;
	curr->sempid = task_tgid_vnr(current);
	sma->sem_ctime = get_seconds();
	/* maybe some queued-up processes were waiting for this */
	do_smart_update(sma, NULL, 0, 0, &tasks);
	sem_unlock(sma, -1);
	rcu_read_unlock();
	wake_up_sem_queue_do(&tasks);
	return 0;
}
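/*
 * Illustration (userspace, not kernel code): the value set here arrives
 * through the semun union, which glibc passes as the variadic semctl()
 * argument and which the caller must define on Linux. The #if above
 * extracts the int member from the register image of that union, which
 * lands in the upper half of the unsigned long on 64-bit big-endian:
 *
 *	union semun { int val; struct semid_ds *buf; unsigned short *array; };
 *	union semun arg = { .val = 3 };
 *	semctl(semid, 0, SETVAL, arg);
 */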

static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
		int cmd, void __user *p)
{
	struct sem_array *sma;
	struct sem *curr;
	int err, nsems;
	ushort fast_sem_io[SEMMSL_FAST];
	ushort *sem_io = fast_sem_io;
	struct list_head tasks;

	INIT_LIST_HEAD(&tasks);

	rcu_read_lock();
	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		return PTR_ERR(sma);
	}

	nsems = sma->sem_nsems;

	err = -EACCES;
	if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO))
		goto out_rcu_wakeup;

	err = security_sem_semctl(sma, cmd);
	if (err)
		goto out_rcu_wakeup;

	err = -EACCES;
	switch (cmd) {
	case GETALL:
	{
		ushort __user *array = p;
		int i;

		sem_lock(sma, NULL, -1);
		if (!ipc_valid_object(&sma->sem_perm)) {
			err = -EIDRM;
			goto out_unlock;
		}
		if (nsems > SEMMSL_FAST) {
			if (!ipc_rcu_getref(sma)) {
				err = -EIDRM;
				goto out_unlock;
			}
			sem_unlock(sma, -1);
			rcu_read_unlock();
			sem_io = ipc_alloc(sizeof(ushort)*nsems);
			if (sem_io == NULL) {
				ipc_rcu_putref(sma, sem_rcu_free);
				return -ENOMEM;
			}

			rcu_read_lock();
			sem_lock_and_putref(sma);
			if (!ipc_valid_object(&sma->sem_perm)) {
				err = -EIDRM;
				goto out_unlock;
			}
		}
		for (i = 0; i < sma->sem_nsems; i++)
			sem_io[i] = sma->sem_base[i].semval;
		sem_unlock(sma, -1);
		rcu_read_unlock();
		err = 0;
		if (copy_to_user(array, sem_io, nsems*sizeof(ushort)))
			err = -EFAULT;
		goto out_free;
	}
	case SETALL:
	{
		int i;
		struct sem_undo *un;

		if (!ipc_rcu_getref(sma)) {
			err = -EIDRM;
			goto out_rcu_wakeup;
		}
		rcu_read_unlock();

		if (nsems > SEMMSL_FAST) {
			sem_io = ipc_alloc(sizeof(ushort)*nsems);
			if (sem_io == NULL) {
				ipc_rcu_putref(sma, sem_rcu_free);
				return -ENOMEM;
			}
		}

		if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) {
			ipc_rcu_putref(sma, sem_rcu_free);
			err = -EFAULT;
			goto out_free;
		}

		for (i = 0; i < nsems; i++) {
			if (sem_io[i] > SEMVMX) {
				ipc_rcu_putref(sma, sem_rcu_free);
				err = -ERANGE;
				goto out_free;
			}
		}
		rcu_read_lock();
		sem_lock_and_putref(sma);
		if (!ipc_valid_object(&sma->sem_perm)) {
			err = -EIDRM;
			goto out_unlock;
		}

		for (i = 0; i < nsems; i++)
			sma->sem_base[i].semval = sem_io[i];

		ipc_assert_locked_object(&sma->sem_perm);
		list_for_each_entry(un, &sma->list_id, list_id) {
			for (i = 0; i < nsems; i++)
				un->semadj[i] = 0;
		}
		sma->sem_ctime = get_seconds();
		/* maybe some queued-up processes were waiting for this */
		do_smart_update(sma, NULL, 0, 0, &tasks);
		err = 0;
		goto out_unlock;
	}
	/* GETVAL, GETPID, GETNCNT, GETZCNT: fall-through */
| 1480 | } |
| 1481 | err = -EINVAL; |
| 1482 | if (semnum < 0 || semnum >= nsems) |
| 1483 | goto out_rcu_wakeup; |
| 1484 | |
| 1485 | sem_lock(sma, NULL, -1); |
| 1486 | if (!ipc_valid_object(&sma->sem_perm)) { |
| 1487 | err = -EIDRM; |
| 1488 | goto out_unlock; |
| 1489 | } |
| 1490 | curr = &sma->sem_base[semnum]; |
| 1491 | |
| 1492 | switch (cmd) { |
| 1493 | case GETVAL: |
| 1494 | err = curr->semval; |
| 1495 | goto out_unlock; |
| 1496 | case GETPID: |
| 1497 | err = curr->sempid; |
| 1498 | goto out_unlock; |
| 1499 | case GETNCNT: |
| 1500 | err = count_semcnt(sma, semnum, 0); |
| 1501 | goto out_unlock; |
| 1502 | case GETZCNT: |
| 1503 | err = count_semcnt(sma, semnum, 1); |
| 1504 | goto out_unlock; |
| 1505 | } |
| 1506 | |
| 1507 | out_unlock: |
| 1508 | sem_unlock(sma, -1); |
| 1509 | out_rcu_wakeup: |
| 1510 | rcu_read_unlock(); |
| 1511 | wake_up_sem_queue_do(&tasks); |
| 1512 | out_free: |
| 1513 | if (sem_io != fast_sem_io) |
| 1514 | ipc_free(sem_io, sizeof(ushort)*nsems); |
| 1515 | return err; |
| 1516 | } |
| 1517 | |
| 1518 | static inline unsigned long |
| 1519 | copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version) |
| 1520 | { |
| 1521 | switch (version) { |
| 1522 | case IPC_64: |
| 1523 | if (copy_from_user(out, buf, sizeof(*out))) |
| 1524 | return -EFAULT; |
| 1525 | return 0; |
| 1526 | case IPC_OLD: |
| 1527 | { |
| 1528 | struct semid_ds tbuf_old; |
| 1529 | |
| 1530 | if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) |
| 1531 | return -EFAULT; |
| 1532 | |
| 1533 | out->sem_perm.uid = tbuf_old.sem_perm.uid; |
| 1534 | out->sem_perm.gid = tbuf_old.sem_perm.gid; |
| 1535 | out->sem_perm.mode = tbuf_old.sem_perm.mode; |
| 1536 | |
| 1537 | return 0; |
| 1538 | } |
| 1539 | default: |
| 1540 | return -EINVAL; |
| 1541 | } |
| 1542 | } |
| 1543 | |
| 1544 | /* |
| 1545 | * This function handles some semctl commands which require the rwsem |
| 1546 | * to be held in write mode. |
| 1547 | * NOTE: no locks must be held, the rwsem is taken inside this function. |
| 1548 | */ |
| 1549 | static int semctl_down(struct ipc_namespace *ns, int semid, |
| 1550 | int cmd, int version, void __user *p) |
| 1551 | { |
| 1552 | struct sem_array *sma; |
| 1553 | int err; |
| 1554 | struct semid64_ds semid64; |
| 1555 | struct kern_ipc_perm *ipcp; |
| 1556 | |
| 1557 | if (cmd == IPC_SET) { |
| 1558 | if (copy_semid_from_user(&semid64, p, version)) |
| 1559 | return -EFAULT; |
| 1560 | } |
| 1561 | |
| 1562 | down_write(&sem_ids(ns).rwsem); |
| 1563 | rcu_read_lock(); |
| 1564 | |
| 1565 | ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd, |
| 1566 | &semid64.sem_perm, 0); |
| 1567 | if (IS_ERR(ipcp)) { |
| 1568 | err = PTR_ERR(ipcp); |
| 1569 | goto out_unlock1; |
| 1570 | } |
| 1571 | |
| 1572 | sma = container_of(ipcp, struct sem_array, sem_perm); |
| 1573 | |
| 1574 | err = security_sem_semctl(sma, cmd); |
| 1575 | if (err) |
| 1576 | goto out_unlock1; |
| 1577 | |
| 1578 | switch (cmd) { |
| 1579 | case IPC_RMID: |
| 1580 | sem_lock(sma, NULL, -1); |
| 1581 | /* freeary unlocks the ipc object and rcu */ |
| 1582 | freeary(ns, ipcp); |
| 1583 | goto out_up; |
| 1584 | case IPC_SET: |
| 1585 | sem_lock(sma, NULL, -1); |
| 1586 | err = ipc_update_perm(&semid64.sem_perm, ipcp); |
| 1587 | if (err) |
| 1588 | goto out_unlock0; |
| 1589 | sma->sem_ctime = get_seconds(); |
| 1590 | break; |
| 1591 | default: |
| 1592 | err = -EINVAL; |
| 1593 | goto out_unlock1; |
| 1594 | } |
| 1595 | |
| 1596 | out_unlock0: |
| 1597 | sem_unlock(sma, -1); |
| 1598 | out_unlock1: |
| 1599 | rcu_read_unlock(); |
| 1600 | out_up: |
| 1601 | up_write(&sem_ids(ns).rwsem); |
| 1602 | return err; |
| 1603 | } |
| 1604 | |
| 1605 | SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg) |
| 1606 | { |
| 1607 | int version; |
| 1608 | struct ipc_namespace *ns; |
| 1609 | void __user *p = (void __user *)arg; |
| 1610 | |
| 1611 | if (semid < 0) |
| 1612 | return -EINVAL; |
| 1613 | |
| 1614 | version = ipc_parse_version(&cmd); |
| 1615 | ns = current->nsproxy->ipc_ns; |
| 1616 | |
| 1617 | switch (cmd) { |
| 1618 | case IPC_INFO: |
| 1619 | case SEM_INFO: |
| 1620 | case IPC_STAT: |
| 1621 | case SEM_STAT: |
| 1622 | return semctl_nolock(ns, semid, cmd, version, p); |
| 1623 | case GETALL: |
| 1624 | case GETVAL: |
| 1625 | case GETPID: |
| 1626 | case GETNCNT: |
| 1627 | case GETZCNT: |
| 1628 | case SETALL: |
| 1629 | return semctl_main(ns, semid, semnum, cmd, p); |
| 1630 | case SETVAL: |
| 1631 | return semctl_setval(ns, semid, semnum, arg); |
| 1632 | case IPC_RMID: |
| 1633 | case IPC_SET: |
| 1634 | return semctl_down(ns, semid, cmd, version, p); |
| 1635 | default: |
| 1636 | return -EINVAL; |
| 1637 | } |
| 1638 | } |
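| | |
| | /* |
| | * For illustration only: a minimal user-space sketch of the dispatch above, |
| | * assuming "key" is an application-chosen key (error handling omitted). |
| | * |
| | * union semun { int val; struct semid_ds *buf; unsigned short *array; }; |
| | * |
| | * int id = semget(key, 1, IPC_CREAT | 0600); |
| | * union semun arg = { .val = 1 }; |
| | * semctl(id, 0, SETVAL, arg); - routed to semctl_setval() |
| | * int v = semctl(id, 0, GETVAL); - routed to semctl_main() |
| | * semctl(id, 0, IPC_RMID); - routed to semctl_down() |
| | */ |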
| 1639 | |
| 1640 | /* If the task doesn't already have an undo_list, then allocate one |
| 1641 | * here. We guarantee there is only one thread using this undo list, |
| 1642 | * and current is THE ONE |
| 1643 | * |
| 1644 | * If this allocation and assignment succeeds, but later |
| 1645 | * portions of this code fail, there is no need to free the sem_undo_list. |
| 1646 | * Just let it stay associated with the task, and it'll be freed later |
| 1647 | * at exit time. |
| 1648 | * |
| 1649 | * This can block, so callers must hold no locks. |
| 1650 | */ |
| 1651 | static inline int get_undo_list(struct sem_undo_list **undo_listp) |
| 1652 | { |
| 1653 | struct sem_undo_list *undo_list; |
| 1654 | |
| 1655 | undo_list = current->sysvsem.undo_list; |
| 1656 | if (!undo_list) { |
| 1657 | undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL); |
| 1658 | if (undo_list == NULL) |
| 1659 | return -ENOMEM; |
| 1660 | spin_lock_init(&undo_list->lock); |
| 1661 | atomic_set(&undo_list->refcnt, 1); |
| 1662 | INIT_LIST_HEAD(&undo_list->list_proc); |
| 1663 | |
| 1664 | current->sysvsem.undo_list = undo_list; |
| 1665 | } |
| 1666 | *undo_listp = undo_list; |
| 1667 | return 0; |
| 1668 | } |
| 1669 | |
| 1670 | static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid) |
| 1671 | { |
| 1672 | struct sem_undo *un; |
| 1673 | |
| 1674 | list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) { |
| 1675 | if (un->semid == semid) |
| 1676 | return un; |
| 1677 | } |
| 1678 | return NULL; |
| 1679 | } |
| 1680 | |
| 1681 | static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid) |
| 1682 | { |
| 1683 | struct sem_undo *un; |
| 1684 | |
| 1685 | assert_spin_locked(&ulp->lock); |
| 1686 | |
| 1687 | un = __lookup_undo(ulp, semid); |
| 1688 | if (un) { |
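| | /* |
| | * Move the hit to the front of the list: repeated operations on |
| | * the same array then find it on the first step of the RCU list |
| | * walk in __lookup_undo(). |
| | */ |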
| 1689 | list_del_rcu(&un->list_proc); |
| 1690 | list_add_rcu(&un->list_proc, &ulp->list_proc); |
| 1691 | } |
| 1692 | return un; |
| 1693 | } |
| 1694 | |
| 1695 | /** |
| 1696 | * find_alloc_undo - lookup (and if not present create) undo array |
| 1697 | * @ns: namespace |
| 1698 | * @semid: semaphore array id |
| 1699 | * |
| 1700 | * The function looks up (and if not present creates) the undo structure. |
| 1701 | * The size of the undo structure depends on the size of the semaphore |
| 1702 | * array, thus the alloc path is not that straightforward. |
| 1703 | * Lifetime rules: sem_undo is RCU-protected; on success, the function |
| 1704 | * returns with rcu_read_lock() held. |
| 1705 | */ |
| 1706 | static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) |
| 1707 | { |
| 1708 | struct sem_array *sma; |
| 1709 | struct sem_undo_list *ulp; |
| 1710 | struct sem_undo *un, *new; |
| 1711 | int nsems, error; |
| 1712 | |
| 1713 | error = get_undo_list(&ulp); |
| 1714 | if (error) |
| 1715 | return ERR_PTR(error); |
| 1716 | |
| 1717 | rcu_read_lock(); |
| 1718 | spin_lock(&ulp->lock); |
| 1719 | un = lookup_undo(ulp, semid); |
| 1720 | spin_unlock(&ulp->lock); |
| 1721 | if (likely(un != NULL)) |
| 1722 | goto out; |
| 1723 | |
| 1724 | /* no undo structure around - allocate one. */ |
| 1725 | /* step 1: figure out the size of the semaphore array */ |
| 1726 | sma = sem_obtain_object_check(ns, semid); |
| 1727 | if (IS_ERR(sma)) { |
| 1728 | rcu_read_unlock(); |
| 1729 | return ERR_CAST(sma); |
| 1730 | } |
| 1731 | |
| 1732 | nsems = sma->sem_nsems; |
| 1733 | if (!ipc_rcu_getref(sma)) { |
| 1734 | rcu_read_unlock(); |
| 1735 | un = ERR_PTR(-EIDRM); |
| 1736 | goto out; |
| 1737 | } |
| 1738 | rcu_read_unlock(); |
| 1739 | |
| 1740 | /* step 2: allocate new undo structure */ |
| 1741 | new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); |
| 1742 | if (!new) { |
| 1743 | ipc_rcu_putref(sma, sem_rcu_free); |
| 1744 | return ERR_PTR(-ENOMEM); |
| 1745 | } |
| 1746 | |
| 1747 | /* step 3: Acquire the lock on semaphore array */ |
| 1748 | rcu_read_lock(); |
| 1749 | sem_lock_and_putref(sma); |
| 1750 | if (!ipc_valid_object(&sma->sem_perm)) { |
| 1751 | sem_unlock(sma, -1); |
| 1752 | rcu_read_unlock(); |
| 1753 | kfree(new); |
| 1754 | un = ERR_PTR(-EIDRM); |
| 1755 | goto out; |
| 1756 | } |
| 1757 | spin_lock(&ulp->lock); |
| 1758 | |
| 1759 | /* |
| 1760 | * step 4: check for races: did someone else allocate the undo struct? |
| 1761 | */ |
| 1762 | un = lookup_undo(ulp, semid); |
| 1763 | if (un) { |
| 1764 | kfree(new); |
| 1765 | goto success; |
| 1766 | } |
| 1767 | /* step 5: initialize & link new undo structure */ |
| 1768 | new->semadj = (short *) &new[1]; |
| 1769 | new->ulp = ulp; |
| 1770 | new->semid = semid; |
| 1771 | assert_spin_locked(&ulp->lock); |
| 1772 | list_add_rcu(&new->list_proc, &ulp->list_proc); |
| 1773 | ipc_assert_locked_object(&sma->sem_perm); |
| 1774 | list_add(&new->list_id, &sma->list_id); |
| 1775 | un = new; |
| 1776 | |
| 1777 | success: |
| 1778 | spin_unlock(&ulp->lock); |
| 1779 | sem_unlock(sma, -1); |
| 1780 | out: |
| 1781 | return un; |
| 1782 | } |
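| | |
| | /* |
| | * find_alloc_undo() is an instance of a common idiom: a sleeping |
| | * allocation (GFP_KERNEL) is not allowed under a spinlock, so the object |
| | * is allocated with all locks dropped and the lookup is repeated under |
| | * the lock to catch a concurrent allocator. A generic sketch of the |
| | * idiom, with hypothetical lookup()/insert() helpers: |
| | * |
| | * new = kzalloc(sizeof(*new), GFP_KERNEL); - may sleep, no locks held |
| | * spin_lock(&lock); |
| | * obj = lookup(key); |
| | * if (!obj) { |
| | * insert(new, key); - won the race: publish "new" |
| | * obj = new; |
| | * } else { |
| | * kfree(new); - lost the race: use the existing object |
| | * } |
| | * spin_unlock(&lock); |
| | */ |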
| 1783 | |
| 1784 | |
| 1785 | /** |
| 1786 | * get_queue_result - retrieve the result code from sem_queue |
| 1787 | * @q: Pointer to queue structure |
| 1788 | * |
| 1789 | * Retrieve the return code from the pending queue. If IN_WAKEUP is found in |
| 1790 | * q->status, then we must loop until the value is replaced with the final |
| 1791 | * value: This may happen if a task is woken up by an unrelated event (e.g. |
| 1792 | * signal) and in parallel the task is woken up by another task because it got |
| 1793 | * the requested semaphores. |
| 1794 | * |
| 1795 | * The function can be called with or without holding the semaphore spinlock. |
| 1796 | */ |
| 1797 | static int get_queue_result(struct sem_queue *q) |
| 1798 | { |
| 1799 | int error; |
| 1800 | |
| 1801 | error = q->status; |
| 1802 | while (unlikely(error == IN_WAKEUP)) { |
| 1803 | cpu_relax(); |
| 1804 | error = q->status; |
| 1805 | } |
| 1806 | |
| 1807 | return error; |
| 1808 | } |
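| | |
| | /* |
| | * For reference, the waker side of this handshake, simplified from |
| | * wake_up_sem_queue_prepare() and wake_up_sem_queue_do(): |
| | * |
| | * q->status = IN_WAKEUP; - claim the entry, locks still held |
| | * ...drop all locks... |
| | * wake_up_process(q->sleeper); |
| | * smp_wmb(); |
| | * q->status = error; - only now may "q" disappear |
| | * |
| | * Thus the busy-wait above is bounded: IN_WAKEUP is only a short-lived |
| | * intermediate state. |
| | */ |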
| 1809 | |
| 1810 | SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, |
| 1811 | unsigned, nsops, const struct timespec __user *, timeout) |
| 1812 | { |
| 1813 | int error = -EINVAL; |
| 1814 | struct sem_array *sma; |
| 1815 | struct sembuf fast_sops[SEMOPM_FAST]; |
| 1816 | struct sembuf *sops = fast_sops, *sop; |
| 1817 | struct sem_undo *un; |
| 1818 | int undos = 0, alter = 0, max, locknum; |
| 1819 | struct sem_queue queue; |
| 1820 | unsigned long jiffies_left = 0; |
| 1821 | struct ipc_namespace *ns; |
| 1822 | struct list_head tasks; |
| 1823 | |
| 1824 | ns = current->nsproxy->ipc_ns; |
| 1825 | |
| 1826 | if (nsops < 1 || semid < 0) |
| 1827 | return -EINVAL; |
| 1828 | if (nsops > ns->sc_semopm) |
| 1829 | return -E2BIG; |
| 1830 | if (nsops > SEMOPM_FAST) { |
| 1831 | sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL); |
| 1832 | if (sops == NULL) |
| 1833 | return -ENOMEM; |
| 1834 | } |
| 1835 | if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { |
| 1836 | error = -EFAULT; |
| 1837 | goto out_free; |
| 1838 | } |
| 1839 | if (timeout) { |
| 1840 | struct timespec _timeout; |
| 1841 | if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { |
| 1842 | error = -EFAULT; |
| 1843 | goto out_free; |
| 1844 | } |
| 1845 | if (_timeout.tv_sec < 0 || _timeout.tv_nsec < 0 || |
| 1846 | _timeout.tv_nsec >= 1000000000L) { |
| 1847 | error = -EINVAL; |
| 1848 | goto out_free; |
| 1849 | } |
| 1850 | jiffies_left = timespec_to_jiffies(&_timeout); |
| 1851 | } |
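| | /* |
| | * Scan the operations once: record the highest semaphore number that |
| | * is referenced, and whether any operation requests SEM_UNDO or |
| | * alters a semaphore value. |
| | */ |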
| 1852 | max = 0; |
| 1853 | for (sop = sops; sop < sops + nsops; sop++) { |
| 1854 | if (sop->sem_num >= max) |
| 1855 | max = sop->sem_num; |
| 1856 | if (sop->sem_flg & SEM_UNDO) |
| 1857 | undos = 1; |
| 1858 | if (sop->sem_op != 0) |
| 1859 | alter = 1; |
| 1860 | } |
| 1861 | |
| 1862 | INIT_LIST_HEAD(&tasks); |
| 1863 | |
| 1864 | if (undos) { |
| 1865 | /* On success, find_alloc_undo takes the rcu_read_lock */ |
| 1866 | un = find_alloc_undo(ns, semid); |
| 1867 | if (IS_ERR(un)) { |
| 1868 | error = PTR_ERR(un); |
| 1869 | goto out_free; |
| 1870 | } |
| 1871 | } else { |
| 1872 | un = NULL; |
| 1873 | rcu_read_lock(); |
| 1874 | } |
| 1875 | |
| 1876 | sma = sem_obtain_object_check(ns, semid); |
| 1877 | if (IS_ERR(sma)) { |
| 1878 | rcu_read_unlock(); |
| 1879 | error = PTR_ERR(sma); |
| 1880 | goto out_free; |
| 1881 | } |
| 1882 | |
| 1883 | error = -EFBIG; |
| 1884 | if (max >= sma->sem_nsems) |
| 1885 | goto out_rcu_wakeup; |
| 1886 | |
| 1887 | error = -EACCES; |
| 1888 | if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) |
| 1889 | goto out_rcu_wakeup; |
| 1890 | |
| 1891 | error = security_sem_semop(sma, sops, nsops, alter); |
| 1892 | if (error) |
| 1893 | goto out_rcu_wakeup; |
| 1894 | |
| 1895 | error = -EIDRM; |
| 1896 | locknum = sem_lock(sma, sops, nsops); |
| 1897 | /* |
| 1898 | * We eventually might perform the following check in a lockless |
| 1899 | * fashion, considering ipc_valid_object() locking constraints. |
| 1900 | * If nsops == 1 and there is no contention for sem_perm.lock, then |
| 1901 | * only a per-semaphore lock is held and it's OK to proceed with the |
| 1902 | * check below. More details on the fine-grained locking scheme |
| 1903 | * entangled here, and on why it is RMID race safe, are in the comments at sem_lock(). |
| 1904 | */ |
| 1905 | if (!ipc_valid_object(&sma->sem_perm)) |
| 1906 | goto out_unlock_free; |
| 1907 | /* |
| 1908 | * semid identifiers are not unique - find_alloc_undo may have |
| 1909 | * allocated an undo structure, it was invalidated by an RMID, |
| 1910 | * and now a new array has received the same id. Check and fail. |
| 1911 | * This case can be detected by checking un->semid. The existence of |
| 1912 | * "un" itself is guaranteed by rcu. |
| 1913 | */ |
| 1914 | if (un && un->semid == -1) |
| 1915 | goto out_unlock_free; |
| 1916 | |
| 1917 | queue.sops = sops; |
| 1918 | queue.nsops = nsops; |
| 1919 | queue.undo = un; |
| 1920 | queue.pid = task_tgid_vnr(current); |
| 1921 | queue.alter = alter; |
| 1922 | |
| 1923 | error = perform_atomic_semop(sma, &queue); |
| 1924 | if (error == 0) { |
| 1925 | /* If the operation was successful, then do |
| 1926 | * the required updates. |
| 1927 | */ |
| 1928 | if (alter) |
| 1929 | do_smart_update(sma, sops, nsops, 1, &tasks); |
| 1930 | else |
| 1931 | set_semotime(sma, sops); |
| 1932 | } |
| 1933 | if (error <= 0) |
| 1934 | goto out_unlock_free; |
| 1935 | |
| 1936 | /* We need to sleep on this operation, so we put the current |
| 1937 | * task into the pending queue and go to sleep. |
| 1938 | */ |
| 1939 | |
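| | /* |
| | * Single-semaphore operations are queued on the per-semaphore lists, |
| | * so that a wakeup scan only visits waiters whom the change could |
| | * satisfy - unless complex operations are already pending, in which |
| | * case FIFO ordering demands the array-wide list. Complex operations |
| | * always use the array-wide lists and increment complex_count. |
| | */ |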
| 1940 | if (nsops == 1) { |
| 1941 | struct sem *curr; |
| 1942 | curr = &sma->sem_base[sops->sem_num]; |
| 1943 | |
| 1944 | if (alter) { |
| 1945 | if (sma->complex_count) { |
| 1946 | list_add_tail(&queue.list, |
| 1947 | &sma->pending_alter); |
| 1948 | } else { |
| 1950 | list_add_tail(&queue.list, |
| 1951 | &curr->pending_alter); |
| 1952 | } |
| 1953 | } else { |
| 1954 | list_add_tail(&queue.list, &curr->pending_const); |
| 1955 | } |
| 1956 | } else { |
| 1957 | if (!sma->complex_count) |
| 1958 | merge_queues(sma); |
| 1959 | |
| 1960 | if (alter) |
| 1961 | list_add_tail(&queue.list, &sma->pending_alter); |
| 1962 | else |
| 1963 | list_add_tail(&queue.list, &sma->pending_const); |
| 1964 | |
| 1965 | sma->complex_count++; |
| 1966 | } |
| 1967 | |
| 1968 | queue.status = -EINTR; |
| 1969 | queue.sleeper = current; |
| 1970 | |
| 1971 | sleep_again: |
| 1972 | __set_current_state(TASK_INTERRUPTIBLE); |
| 1973 | sem_unlock(sma, locknum); |
| 1974 | rcu_read_unlock(); |
| 1975 | |
| 1976 | if (timeout) |
| 1977 | jiffies_left = schedule_timeout(jiffies_left); |
| 1978 | else |
| 1979 | schedule(); |
| 1980 | |
| 1981 | error = get_queue_result(&queue); |
| 1982 | |
| 1983 | if (error != -EINTR) { |
| 1984 | /* fast path: update_queue already obtained all requested |
| 1985 | * resources. |
| 1986 | * Perform a smp_mb(): User space could assume that semop() |
| 1987 | * is a memory barrier: without the mb(), the CPU could |
| 1988 | * speculatively read stale user-space data that was |
| 1989 | * overwritten by the previous owner of the semaphore. |
| 1990 | */ |
| 1991 | smp_mb(); |
| 1992 | |
| 1993 | goto out_free; |
| 1994 | } |
| 1995 | |
| 1996 | rcu_read_lock(); |
| 1997 | sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum); |
| 1998 | |
| 1999 | /* |
| 2000 | * Wait until it's guaranteed that no wake_up_sem_queue_do() is ongoing. |
| 2001 | */ |
| 2002 | error = get_queue_result(&queue); |
| 2003 | |
| 2004 | /* |
| 2005 | * Array removed? If yes, leave without sem_unlock(). |
| 2006 | */ |
| 2007 | if (IS_ERR(sma)) { |
| 2008 | rcu_read_unlock(); |
| 2009 | goto out_free; |
| 2010 | } |
| 2011 | |
| 2012 | |
| 2013 | /* |
| 2014 | * If queue.status != -EINTR, then we were woken up by another process. |
| 2015 | * Leave without unlink_queue(), but with sem_unlock(). |
| 2016 | */ |
| 2017 | if (error != -EINTR) |
| 2018 | goto out_unlock_free; |
| 2019 | |
| 2020 | /* |
| 2021 | * A signal or timeout interrupted the sleep: the queue entry is unlinked below; if the timeout expired, report -EAGAIN instead of -EINTR. |
| 2022 | */ |
| 2023 | if (timeout && jiffies_left == 0) |
| 2024 | error = -EAGAIN; |
| 2025 | |
| 2026 | /* |
| 2027 | * If the wakeup was spurious, just retry |
| 2028 | */ |
| 2029 | if (error == -EINTR && !signal_pending(current)) |
| 2030 | goto sleep_again; |
| 2031 | |
| 2032 | unlink_queue(sma, &queue); |
| 2033 | |
| 2034 | out_unlock_free: |
| 2035 | sem_unlock(sma, locknum); |
| 2036 | out_rcu_wakeup: |
| 2037 | rcu_read_unlock(); |
| 2038 | wake_up_sem_queue_do(&tasks); |
| 2039 | out_free: |
| 2040 | if (sops != fast_sops) |
| 2041 | kfree(sops); |
| 2042 | return error; |
| 2043 | } |
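| | |
| | /* |
| | * For illustration only: a minimal user-space sketch of the syscall |
| | * above, assuming "id" names an existing semaphore set. |
| | * |
| | * struct sembuf sop = { |
| | * .sem_num = 0, |
| | * .sem_op = -1, - "P": wait until semval >= 1, then decrement |
| | * .sem_flg = SEM_UNDO, - roll the operation back at process exit |
| | * }; |
| | * struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 }; |
| | * |
| | * if (semtimedop(id, &sop, 1, &ts) == -1 && errno == EAGAIN) |
| | * ...gave up after five seconds... |
| | */ |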
| 2044 | |
| 2045 | SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops, |
| 2046 | unsigned, nsops) |
| 2047 | { |
| 2048 | return sys_semtimedop(semid, tsops, nsops, NULL); |
| 2049 | } |
| 2050 | |
| 2051 | /* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between |
| 2052 | * parent and child tasks. |
| 2053 | */ |
| 2054 | |
| 2055 | int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) |
| 2056 | { |
| 2057 | struct sem_undo_list *undo_list; |
| 2058 | int error; |
| 2059 | |
| 2060 | if (clone_flags & CLONE_SYSVSEM) { |
| 2061 | error = get_undo_list(&undo_list); |
| 2062 | if (error) |
| 2063 | return error; |
| 2064 | atomic_inc(&undo_list->refcnt); |
| 2065 | tsk->sysvsem.undo_list = undo_list; |
| 2066 | } else |
| 2067 | tsk->sysvsem.undo_list = NULL; |
| 2068 | |
| 2069 | return 0; |
| 2070 | } |
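| | |
| | /* |
| | * Note that glibc's NPTL passes CLONE_SYSVSEM in pthread_create(), so |
| | * all threads of a process share one undo list - which yields the |
| | * per-process semadj semantics that POSIX specifies. |
| | */ |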
| 2071 | |
| 2072 | /* |
| 2073 | * add semadj values to semaphores, free undo structures. |
| 2074 | * undo structures are not freed when semaphore arrays are destroyed, |
| 2075 | * so some of them may be out of date. |
| 2076 | * IMPLEMENTATION NOTE: There is some confusion over whether the |
| 2077 | * set of adjustments that needs to be done should be done in an atomic |
| 2078 | * manner or not. That is, if we are attempting to decrement the semval |
| 2079 | * should we queue up and wait until we can do so legally? |
| 2080 | * The original implementation attempted to do this (queue and wait). |
| 2081 | * The current implementation does not do so. The POSIX standard |
| 2082 | * and SVID should be consulted to determine what behavior is mandated. |
| 2083 | */ |
| 2084 | void exit_sem(struct task_struct *tsk) |
| 2085 | { |
| 2086 | struct sem_undo_list *ulp; |
| 2087 | |
| 2088 | ulp = tsk->sysvsem.undo_list; |
| 2089 | if (!ulp) |
| 2090 | return; |
| 2091 | tsk->sysvsem.undo_list = NULL; |
| 2092 | |
| 2093 | if (!atomic_dec_and_test(&ulp->refcnt)) |
| 2094 | return; |
| 2095 | |
| 2096 | for (;;) { |
| 2097 | struct sem_array *sma; |
| 2098 | struct sem_undo *un; |
| 2099 | struct list_head tasks; |
| 2100 | int semid, i; |
| 2101 | |
| 2102 | rcu_read_lock(); |
| 2103 | un = list_entry_rcu(ulp->list_proc.next, |
| 2104 | struct sem_undo, list_proc); |
| 2105 | if (&un->list_proc == &ulp->list_proc) { |
| 2106 | /* |
| 2107 | * We must wait for freeary() before freeing this ulp, |
| 2108 | * in case we raced with the last sem_undo. There is a small |
| 2109 | * window in which we could exit while freeary() has not yet |
| 2110 | * finished unlocking the sem_undo_list. |
| 2111 | */ |
| 2112 | spin_unlock_wait(&ulp->lock); |
| 2113 | rcu_read_unlock(); |
| 2114 | break; |
| 2115 | } |
| 2116 | spin_lock(&ulp->lock); |
| 2117 | semid = un->semid; |
| 2118 | spin_unlock(&ulp->lock); |
| 2119 | |
| 2120 | /* exit_sem raced with IPC_RMID, nothing to do */ |
| 2121 | if (semid == -1) { |
| 2122 | rcu_read_unlock(); |
| 2123 | continue; |
| 2124 | } |
| 2125 | |
| 2126 | sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, semid); |
| 2127 | /* exit_sem raced with IPC_RMID, nothing to do */ |
| 2128 | if (IS_ERR(sma)) { |
| 2129 | rcu_read_unlock(); |
| 2130 | continue; |
| 2131 | } |
| 2132 | |
| 2133 | sem_lock(sma, NULL, -1); |
| 2134 | /* exit_sem raced with IPC_RMID, nothing to do */ |
| 2135 | if (!ipc_valid_object(&sma->sem_perm)) { |
| 2136 | sem_unlock(sma, -1); |
| 2137 | rcu_read_unlock(); |
| 2138 | continue; |
| 2139 | } |
| 2140 | un = __lookup_undo(ulp, semid); |
| 2141 | if (un == NULL) { |
| 2142 | /* exit_sem raced with IPC_RMID+semget() that created |
| 2143 | * exactly the same semid. Nothing to do. |
| 2144 | */ |
| 2145 | sem_unlock(sma, -1); |
| 2146 | rcu_read_unlock(); |
| 2147 | continue; |
| 2148 | } |
| 2149 | |
| 2150 | /* remove un from the linked lists */ |
| 2151 | ipc_assert_locked_object(&sma->sem_perm); |
| 2152 | list_del(&un->list_id); |
| 2153 | |
| 2154 | /* We are the last process using this ulp, so acquiring ulp->lock |
| 2155 | * isn't required. Besides that, we are also protected against |
| 2156 | * IPC_RMID, as we hold the sma->sem_perm lock now. |
| 2157 | */ |
| 2158 | list_del_rcu(&un->list_proc); |
| 2159 | |
| 2160 | /* perform adjustments registered in un */ |
| 2161 | for (i = 0; i < sma->sem_nsems; i++) { |
| 2162 | struct sem *semaphore = &sma->sem_base[i]; |
| 2163 | if (un->semadj[i]) { |
| 2164 | semaphore->semval += un->semadj[i]; |
| 2165 | /* |
| 2166 | * Range checks of the new semaphore value, |
| 2167 | * not defined by SUS (the Single UNIX Specification): |
| 2168 | * - Some unices ignore the undo entirely |
| 2169 | * (e.g. HP UX 11i 11.22, Tru64 V5.1) |
| 2170 | * - some cap the value (e.g. FreeBSD caps |
| 2171 | * at 0, but doesn't enforce SEMVMX) |
| 2172 | * |
| 2173 | * Linux caps the semaphore value, both at 0 |
| 2174 | * and at SEMVMX. |
| 2175 | * |
| 2176 | * Manfred <manfred@colorfullife.com> |
| 2177 | */ |
| 2178 | if (semaphore->semval < 0) |
| 2179 | semaphore->semval = 0; |
| 2180 | if (semaphore->semval > SEMVMX) |
| 2181 | semaphore->semval = SEMVMX; |
| 2182 | semaphore->sempid = task_tgid_vnr(current); |
| 2183 | } |
| 2184 | } |
| 2185 | /* maybe some queued-up processes were waiting for this */ |
| 2186 | INIT_LIST_HEAD(&tasks); |
| 2187 | do_smart_update(sma, NULL, 0, 1, &tasks); |
| 2188 | sem_unlock(sma, -1); |
| 2189 | rcu_read_unlock(); |
| 2190 | wake_up_sem_queue_do(&tasks); |
| 2191 | |
| 2192 | kfree_rcu(un, rcu); |
| 2193 | } |
| 2194 | kfree(ulp); |
| 2195 | } |
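| | |
| | /* |
| | * The effect of the rollback above, seen from user space (assuming an |
| | * existing set "id" whose semaphore 0 has the value 1): |
| | * |
| | * struct sembuf sop = { .sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO }; |
| | * semop(id, &sop, 1); - semval 1 -> 0, this task's semadj -> +1 |
| | * _exit(0); - exit_sem() adds semadj back: semval -> 1 |
| | * |
| | * Without SEM_UNDO the semaphore would remain at 0 and later waiters |
| | * could block forever. |
| | */ |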
| 2196 | |
| 2197 | #ifdef CONFIG_PROC_FS |
| 2198 | static int sysvipc_sem_proc_show(struct seq_file *s, void *it) |
| 2199 | { |
| 2200 | struct user_namespace *user_ns = seq_user_ns(s); |
| 2201 | struct sem_array *sma = it; |
| 2202 | time_t sem_otime; |
| 2203 | |
| 2204 | /* |
| 2205 | * The proc interface isn't aware of sem_lock(); it calls |
| 2206 | * ipc_lock_object() directly (in sysvipc_find_ipc). |
| 2207 | * In order to stay compatible with sem_lock(), we must |
| 2208 | * enter / leave complex_mode. |
| 2209 | */ |
| 2210 | complexmode_enter(sma); |
| 2211 | |
| 2212 | sem_otime = get_semotime(sma); |
| 2213 | |
| 2214 | seq_printf(s, |
| 2215 | "%10d %10d %4o %10u %5u %5u %5u %5u %10lu %10lu\n", |
| 2216 | sma->sem_perm.key, |
| 2217 | sma->sem_perm.id, |
| 2218 | sma->sem_perm.mode, |
| 2219 | sma->sem_nsems, |
| 2220 | from_kuid_munged(user_ns, sma->sem_perm.uid), |
| 2221 | from_kgid_munged(user_ns, sma->sem_perm.gid), |
| 2222 | from_kuid_munged(user_ns, sma->sem_perm.cuid), |
| 2223 | from_kgid_munged(user_ns, sma->sem_perm.cgid), |
| 2224 | sem_otime, |
| 2225 | sma->sem_ctime); |
| 2226 | |
| 2227 | complexmode_tryleave(sma); |
| 2228 | |
| 2229 | return 0; |
| 2230 | } |
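| | |
| | /* |
| | * With the format string above, a /proc/sysvipc/sem line looks like |
| | * this (values purely illustrative): |
| | * |
| | * key semid perms nsems uid gid cuid cgid otime ctime |
| | * 0 65536 600 1 1000 1000 1000 1000 1431702639 1431702630 |
| | */ |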
| 2231 | #endif |