Blame - security/commoncap.c - codeaurora/cp-linux

blob: 48071ed7c445d025fa4ae57c12f032bfa916521f [file] [log] [blame]

Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1	/* Common capabilities, needed by capability.o.
				2	*
				3	* This program is free software; you can redistribute it and/or modify
				4	* it under the terms of the GNU General Public License as published by
				5	* the Free Software Foundation; either version 2 of the License, or
				6	* (at your option) any later version.
				7	*
				8	*/
				9
				10	#include <linux/capability.h>
				11	#include <linux/audit.h>
				12	#include <linux/module.h>
				13	#include <linux/init.h>
				14	#include <linux/kernel.h>
				15	#include <linux/lsm_hooks.h>
				16	#include <linux/file.h>
				17	#include <linux/mm.h>
				18	#include <linux/mman.h>
				19	#include <linux/pagemap.h>
				20	#include <linux/swap.h>
				21	#include <linux/skbuff.h>
				22	#include <linux/netlink.h>
				23	#include <linux/ptrace.h>
				24	#include <linux/xattr.h>
				25	#include <linux/hugetlb.h>
				26	#include <linux/mount.h>
				27	#include <linux/sched.h>
				28	#include <linux/prctl.h>
				29	#include <linux/securebits.h>
				30	#include <linux/user_namespace.h>
				31	#include <linux/binfmts.h>
				32	#include <linux/personality.h>
				33
				34	/*
				35	* If a non-root user executes a setuid-root binary in
				36	* !secure(SECURE_NOROOT) mode, then we raise capabilities.
				37	* However if fE is also set, then the intent is for only
				38	* the file capabilities to be applied, and the setuid-root
				39	* bit is left on either to change the uid (plausible) or
				40	* to get full privilege on a kernel without file capabilities
				41	* support. So in that case we do not raise capabilities.
				42	*
				43	* Warn if that happens, once per boot.
				44	*/
				45	static void warn_setuid_and_fcaps_mixed(const char *fname)
				46	{
				47	static int warned;
				48	if (!warned) {
				49	printk(KERN_INFO "warning: `%s' has both setuid-root and"
				50	" effective capabilities. Therefore not raising all"
				51	" capabilities.\n", fname);
				52	warned = 1;
				53	}
				54	}
				55
				56	/**
				57	* cap_capable - Determine whether a task has a particular effective capability
				58	* @cred: The credentials to use
				59	* @ns: The user namespace in which we need the capability
				60	* @cap: The capability to check for
				61	* @audit: Whether to write an audit message or not
				62	*
				63	* Determine whether the nominated task has the specified capability amongst
				64	* its effective set, returning 0 if it does, -ve if it does not.
				65	*
				66	* NOTE WELL: cap_has_capability() cannot be used like the kernel's capable()
				67	* and has_capability() functions. That is, it has the reverse semantics:
				68	* cap_has_capability() returns 0 when a task has a capability, but the
				69	* kernel's capable() and has_capability() returns 1 for this case.
				70	*/
				71	int cap_capable(const struct cred cred, struct user_namespace targ_ns,
				72	int cap, int audit)
				73	{
				74	struct user_namespace *ns = targ_ns;
				75
				76	/* See if cred has the capability in the target user namespace
				77	* by examining the target user namespace and all of the target
				78	* user namespace's parents.
				79	*/
				80	for (;;) {
				81	/* Do we have the necessary capabilities? */
				82	if (ns == cred->user_ns)
				83	return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
				84
				85	/* Have we tried all of the parent namespaces? */
				86	if (ns == &init_user_ns)
				87	return -EPERM;
				88
				89	/*
				90	* The owner of the user namespace in the parent of the
				91	* user namespace has all caps.
				92	*/
				93	if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
				94	return 0;
				95
				96	/*
				97	* If you have a capability in a parent user ns, then you have
				98	* it over all children user namespaces as well.
				99	*/
				100	ns = ns->parent;
				101	}
				102
				103	/* We never get here */
				104	}
				105
				106	/**
				107	* cap_settime - Determine whether the current process may set the system clock
				108	* @ts: The time to set
				109	* @tz: The timezone to set
				110	*
				111	* Determine whether the current process may set the system clock and timezone
				112	* information, returning 0 if permission granted, -ve if denied.
				113	*/
				114	int cap_settime(const struct timespec ts, const struct timezone tz)
				115	{
				116	if (!capable(CAP_SYS_TIME))
				117	return -EPERM;
				118	return 0;
				119	}
				120
				121	/**
				122	* cap_ptrace_access_check - Determine whether the current process may access
				123	* another
				124	* @child: The process to be accessed
				125	* @mode: The mode of attachment.
				126	*
				127	* If we are in the same or an ancestor user_ns and have all the target
				128	* task's capabilities, then ptrace access is allowed.
				129	* If we have the ptrace capability to the target user_ns, then ptrace
				130	* access is allowed.
				131	* Else denied.
				132	*
				133	* Determine whether a process may access another, returning 0 if permission
				134	* granted, -ve if denied.
				135	*/
				136	int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
				137	{
				138	int ret = 0;
				139	const struct cred cred, child_cred;
				140	const kernel_cap_t *caller_caps;
				141
				142	rcu_read_lock();
				143	cred = current_cred();
				144	child_cred = __task_cred(child);
				145	if (mode & PTRACE_MODE_FSCREDS)
				146	caller_caps = &cred->cap_effective;
				147	else
				148	caller_caps = &cred->cap_permitted;
				149	if (cred->user_ns == child_cred->user_ns &&
				150	cap_issubset(child_cred->cap_permitted, *caller_caps))
				151	goto out;
				152	if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE))
				153	goto out;
				154	ret = -EPERM;
				155	out:
				156	rcu_read_unlock();
				157	return ret;
				158	}
				159
				160	/**
				161	* cap_ptrace_traceme - Determine whether another process may trace the current
				162	* @parent: The task proposed to be the tracer
				163	*
				164	* If parent is in the same or an ancestor user_ns and has all current's
				165	* capabilities, then ptrace access is allowed.
				166	* If parent has the ptrace capability to current's user_ns, then ptrace
				167	* access is allowed.
				168	* Else denied.
				169	*
				170	* Determine whether the nominated task is permitted to trace the current
				171	* process, returning 0 if permission is granted, -ve if denied.
				172	*/
				173	int cap_ptrace_traceme(struct task_struct *parent)
				174	{
				175	int ret = 0;
				176	const struct cred cred, child_cred;
				177
				178	rcu_read_lock();
				179	cred = __task_cred(parent);
				180	child_cred = current_cred();
				181	if (cred->user_ns == child_cred->user_ns &&
				182	cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
				183	goto out;
				184	if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE))
				185	goto out;
				186	ret = -EPERM;
				187	out:
				188	rcu_read_unlock();
				189	return ret;
				190	}
				191
				192	/**
				193	* cap_capget - Retrieve a task's capability sets
				194	* @target: The task from which to retrieve the capability sets
				195	* @effective: The place to record the effective set
				196	* @inheritable: The place to record the inheritable set
				197	* @permitted: The place to record the permitted set
				198	*
				199	* This function retrieves the capabilities of the nominated task and returns
				200	* them to the caller.
				201	*/
				202	int cap_capget(struct task_struct target, kernel_cap_t effective,
				203	kernel_cap_t inheritable, kernel_cap_t permitted)
				204	{
				205	const struct cred *cred;
				206
				207	/* Derived from kernel/capability.c:sys_capget. */
				208	rcu_read_lock();
				209	cred = __task_cred(target);
				210	*effective = cred->cap_effective;
				211	*inheritable = cred->cap_inheritable;
				212	*permitted = cred->cap_permitted;
				213	rcu_read_unlock();
				214	return 0;
				215	}
				216
				217	/*
				218	* Determine whether the inheritable capabilities are limited to the old
				219	* permitted set. Returns 1 if they are limited, 0 if they are not.
				220	*/
				221	static inline int cap_inh_is_capped(void)
				222	{
				223
				224	/* they are so limited unless the current task has the CAP_SETPCAP
				225	* capability
				226	*/
				227	if (cap_capable(current_cred(), current_cred()->user_ns,
				228	CAP_SETPCAP, SECURITY_CAP_AUDIT) == 0)
				229	return 0;
				230	return 1;
				231	}
				232
				233	/**
				234	* cap_capset - Validate and apply proposed changes to current's capabilities
				235	* @new: The proposed new credentials; alterations should be made here
				236	* @old: The current task's current credentials
				237	* @effective: A pointer to the proposed new effective capabilities set
				238	* @inheritable: A pointer to the proposed new inheritable capabilities set
				239	* @permitted: A pointer to the proposed new permitted capabilities set
				240	*
				241	* This function validates and applies a proposed mass change to the current
				242	* process's capability sets. The changes are made to the proposed new
				243	* credentials, and assuming no error, will be committed by the caller of LSM.
				244	*/
				245	int cap_capset(struct cred *new,
				246	const struct cred *old,
				247	const kernel_cap_t *effective,
				248	const kernel_cap_t *inheritable,
				249	const kernel_cap_t *permitted)
				250	{
				251	if (cap_inh_is_capped() &&
				252	!cap_issubset(*inheritable,
				253	cap_combine(old->cap_inheritable,
				254	old->cap_permitted)))
				255	/* incapable of using this inheritable set */
				256	return -EPERM;
				257
				258	if (!cap_issubset(*inheritable,
				259	cap_combine(old->cap_inheritable,
				260	old->cap_bset)))
				261	/* no new pI capabilities outside bounding set */
				262	return -EPERM;
				263
				264	/* verify restrictions on target's new Permitted set */
				265	if (!cap_issubset(*permitted, old->cap_permitted))
				266	return -EPERM;
				267
				268	/* verify the _new_Effective_ is a subset of the _new_Permitted_ */
				269	if (!cap_issubset(effective, permitted))
				270	return -EPERM;
				271
				272	new->cap_effective = *effective;
				273	new->cap_inheritable = *inheritable;
				274	new->cap_permitted = *permitted;
				275
				276	/*
				277	* Mask off ambient bits that are no longer both permitted and
				278	* inheritable.
				279	*/
				280	new->cap_ambient = cap_intersect(new->cap_ambient,
				281	cap_intersect(*permitted,
				282	*inheritable));
				283	if (WARN_ON(!cap_ambient_invariant_ok(new)))
				284	return -EINVAL;
				285	return 0;
				286	}
				287
				288	/*
				289	* Clear proposed capability sets for execve().
				290	*/
				291	static inline void bprm_clear_caps(struct linux_binprm *bprm)
				292	{
				293	cap_clear(bprm->cred->cap_permitted);
				294	bprm->cap_effective = false;
				295	}
				296
				297	/**
				298	* cap_inode_need_killpriv - Determine if inode change affects privileges
				299	* @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV
				300	*
				301	* Determine if an inode having a change applied that's marked ATTR_KILL_PRIV
				302	* affects the security markings on that inode, and if it is, should
				303	* inode_killpriv() be invoked or the change rejected?
				304	*
				305	* Returns 0 if granted; +ve if granted, but inode_killpriv() is required; and
				306	* -ve to deny the change.
				307	*/
				308	int cap_inode_need_killpriv(struct dentry *dentry)
				309	{
				310	struct inode *inode = d_backing_inode(dentry);
				311	int error;
				312
				313	if (!inode->i_op->getxattr)
				314	return 0;
				315
				316	error = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0);
				317	if (error <= 0)
				318	return 0;
				319	return 1;
				320	}
				321
				322	/**
				323	* cap_inode_killpriv - Erase the security markings on an inode
				324	* @dentry: The inode/dentry to alter
				325	*
				326	* Erase the privilege-enhancing security markings on an inode.
				327	*
				328	* Returns 0 if successful, -ve on error.
				329	*/
				330	int cap_inode_killpriv(struct dentry *dentry)
				331	{
				332	struct inode *inode = d_backing_inode(dentry);
				333
				334	if (!inode->i_op->removexattr)
				335	return 0;
				336
				337	return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
				338	}
				339
				340	/*
				341	* Calculate the new process capability sets from the capability sets attached
				342	* to a file.
				343	*/
				344	static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
				345	struct linux_binprm *bprm,
				346	bool *effective,
				347	bool *has_cap)
				348	{
				349	struct cred *new = bprm->cred;
				350	unsigned i;
				351	int ret = 0;
				352
				353	if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
				354	*effective = true;
				355
				356	if (caps->magic_etc & VFS_CAP_REVISION_MASK)
				357	*has_cap = true;
				358
				359	CAP_FOR_EACH_U32(i) {
				360	__u32 permitted = caps->permitted.cap[i];
				361	__u32 inheritable = caps->inheritable.cap[i];
				362
				363	/*
				364	* pP' = (X & fP) \| (pI & fI)
				365	* The addition of pA' is handled later.
				366	*/
				367	new->cap_permitted.cap[i] =
				368	(new->cap_bset.cap[i] & permitted) \|
				369	(new->cap_inheritable.cap[i] & inheritable);
				370
				371	if (permitted & ~new->cap_permitted.cap[i])
				372	/* insufficient to execute correctly */
				373	ret = -EPERM;
				374	}
				375
				376	/*
				377	* For legacy apps, with no internal support for recognizing they
				378	* do not have enough capabilities, we return an error if they are
				379	* missing some "forced" (aka file-permitted) capabilities.
				380	*/
				381	return *effective ? ret : 0;
				382	}
				383
				384	/*
				385	* Extract the on-exec-apply capability sets for an executable file.
				386	*/
				387	int get_vfs_caps_from_disk(const struct dentry dentry, struct cpu_vfs_cap_data cpu_caps)
				388	{
				389	struct inode *inode = d_backing_inode(dentry);
				390	__u32 magic_etc;
				391	unsigned tocopy, i;
				392	int size;
				393	struct vfs_cap_data caps;
				394
				395	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
				396
				397	if (!inode \|\| !inode->i_op->getxattr)
				398	return -ENODATA;
				399
				400	size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_CAPS, &caps,
				401	XATTR_CAPS_SZ);
				402	if (size == -ENODATA \|\| size == -EOPNOTSUPP)
				403	/* no data, that's ok */
				404	return -ENODATA;
				405	if (size < 0)
				406	return size;
				407
				408	if (size < sizeof(magic_etc))
				409	return -EINVAL;
				410
				411	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc);
				412
				413	switch (magic_etc & VFS_CAP_REVISION_MASK) {
				414	case VFS_CAP_REVISION_1:
				415	if (size != XATTR_CAPS_SZ_1)
				416	return -EINVAL;
				417	tocopy = VFS_CAP_U32_1;
				418	break;
				419	case VFS_CAP_REVISION_2:
				420	if (size != XATTR_CAPS_SZ_2)
				421	return -EINVAL;
				422	tocopy = VFS_CAP_U32_2;
				423	break;
				424	default:
				425	return -EINVAL;
				426	}
				427
				428	CAP_FOR_EACH_U32(i) {
				429	if (i >= tocopy)
				430	break;
				431	cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted);
				432	cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable);
				433	}
				434
				435	cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
				436	cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
				437
				438	return 0;
				439	}
				440
				441	/*
				442	* Attempt to get the on-exec apply capability sets for an executable file from
				443	* its xattrs and, if present, apply them to the proposed credentials being
				444	* constructed by execve().
				445	*/
				446	static int get_file_caps(struct linux_binprm bprm, bool effective, bool *has_cap)
				447	{
				448	int rc = 0;
				449	struct cpu_vfs_cap_data vcaps;
				450
				451	bprm_clear_caps(bprm);
				452
				453	if (!file_caps_enabled)
				454	return 0;
				455
				456	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
				457	return 0;
				458
				459	rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
				460	if (rc < 0) {
				461	if (rc == -EINVAL)
				462	printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
				463	__func__, rc, bprm->filename);
				464	else if (rc == -ENODATA)
				465	rc = 0;
				466	goto out;
				467	}
				468
				469	rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_cap);
				470	if (rc == -EINVAL)
				471	printk(KERN_NOTICE "%s: cap_from_disk returned %d for %s\n",
				472	__func__, rc, bprm->filename);
				473
				474	out:
				475	if (rc)
				476	bprm_clear_caps(bprm);
				477
				478	return rc;
				479	}
				480
				481	/**
				482	* cap_bprm_set_creds - Set up the proposed credentials for execve().
				483	* @bprm: The execution parameters, including the proposed creds
				484	*
				485	* Set up the proposed credentials for a new execution context being
				486	* constructed by execve(). The proposed creds in @bprm->cred is altered,
				487	* which won't take effect immediately. Returns 0 if successful, -ve on error.
				488	*/
				489	int cap_bprm_set_creds(struct linux_binprm *bprm)
				490	{
				491	const struct cred *old = current_cred();
				492	struct cred *new = bprm->cred;
				493	bool effective, has_cap = false, is_setid;
				494	int ret;
				495	kuid_t root_uid;
				496
				497	if (WARN_ON(!cap_ambient_invariant_ok(old)))
				498	return -EPERM;
				499
				500	effective = false;
				501	ret = get_file_caps(bprm, &effective, &has_cap);
				502	if (ret < 0)
				503	return ret;
				504
				505	root_uid = make_kuid(new->user_ns, 0);
				506
				507	if (!issecure(SECURE_NOROOT)) {
				508	/*
				509	* If the legacy file capability is set, then don't set privs
				510	* for a setuid root binary run by a non-root user. Do set it
				511	* for a root user just to cause least surprise to an admin.
				512	*/
				513	if (has_cap && !uid_eq(new->uid, root_uid) && uid_eq(new->euid, root_uid)) {
				514	warn_setuid_and_fcaps_mixed(bprm->filename);
				515	goto skip;
				516	}
				517	/*
				518	* To support inheritance of root-permissions and suid-root
				519	* executables under compatibility mode, we override the
				520	* capability sets for the file.
				521	*
				522	* If only the real uid is 0, we do not set the effective bit.
				523	*/
				524	if (uid_eq(new->euid, root_uid) \|\| uid_eq(new->uid, root_uid)) {
				525	/* pP' = (cap_bset & ~0) \| (pI & ~0) */
				526	new->cap_permitted = cap_combine(old->cap_bset,
				527	old->cap_inheritable);
				528	}
				529	if (uid_eq(new->euid, root_uid))
				530	effective = true;
				531	}
				532	skip:
				533
				534	/* if we have fs caps, clear dangerous personality flags */
				535	if (!cap_issubset(new->cap_permitted, old->cap_permitted))
				536	bprm->per_clear \|= PER_CLEAR_ON_SETID;
				537
				538
				539	/* Don't let someone trace a set[ug]id/setpcap binary with the revised
				540	* credentials unless they have the appropriate permit.
				541	*
				542	* In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
				543	*/
				544	is_setid = !uid_eq(new->euid, old->uid) \|\| !gid_eq(new->egid, old->gid);
				545
				546	if ((is_setid \|\|
				547	!cap_issubset(new->cap_permitted, old->cap_permitted)) &&
				548	bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
				549	/* downgrade; they get no more than they had, and maybe less */
				550	if (!capable(CAP_SETUID) \|\|
				551	(bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
				552	new->euid = new->uid;
				553	new->egid = new->gid;
				554	}
				555	new->cap_permitted = cap_intersect(new->cap_permitted,
				556	old->cap_permitted);
				557	}
				558
				559	new->suid = new->fsuid = new->euid;
				560	new->sgid = new->fsgid = new->egid;
				561
				562	/* File caps or setid cancels ambient. */
				563	if (has_cap \|\| is_setid)
				564	cap_clear(new->cap_ambient);
				565
				566	/*
				567	* Now that we've computed pA', update pP' to give:
				568	* pP' = (X & fP) \| (pI & fI) \| pA'
				569	*/
				570	new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);
				571
				572	/*
				573	* Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set,
				574	* this is the same as pE' = (fE ? pP' : 0) \| pA'.
				575	*/
				576	if (effective)
				577	new->cap_effective = new->cap_permitted;
				578	else
				579	new->cap_effective = new->cap_ambient;
				580
				581	if (WARN_ON(!cap_ambient_invariant_ok(new)))
				582	return -EPERM;
				583
				584	bprm->cap_effective = effective;
				585
				586	/*
				587	* Audit candidate if current->cap_effective is set
				588	*
				589	* We do not bother to audit if 3 things are true:
				590	* 1) cap_effective has all caps
				591	* 2) we are root
				592	* 3) root is supposed to have all caps (SECURE_NOROOT)
				593	* Since this is just a normal root execing a process.
				594	*
				595	* Number 1 above might fail if you don't have a full bset, but I think
				596	* that is interesting information to audit.
				597	*/
				598	if (!cap_issubset(new->cap_effective, new->cap_ambient)) {
				599	if (!cap_issubset(CAP_FULL_SET, new->cap_effective) \|\|
				600	!uid_eq(new->euid, root_uid) \|\| !uid_eq(new->uid, root_uid) \|\|
				601	issecure(SECURE_NOROOT)) {
				602	ret = audit_log_bprm_fcaps(bprm, new, old);
				603	if (ret < 0)
				604	return ret;
				605	}
				606	}
				607
				608	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
				609
				610	if (WARN_ON(!cap_ambient_invariant_ok(new)))
				611	return -EPERM;
				612
				613	return 0;
				614	}
				615
				616	/**
				617	* cap_bprm_secureexec - Determine whether a secure execution is required
				618	* @bprm: The execution parameters
				619	*
				620	* Determine whether a secure execution is required, return 1 if it is, and 0
				621	* if it is not.
				622	*
				623	* The credentials have been committed by this point, and so are no longer
				624	* available through @bprm->cred.
				625	*/
				626	int cap_bprm_secureexec(struct linux_binprm *bprm)
				627	{
				628	const struct cred *cred = current_cred();
				629	kuid_t root_uid = make_kuid(cred->user_ns, 0);
				630
				631	if (!uid_eq(cred->uid, root_uid)) {
				632	if (bprm->cap_effective)
				633	return 1;
				634	if (!cap_issubset(cred->cap_permitted, cred->cap_ambient))
				635	return 1;
				636	}
				637
				638	return (!uid_eq(cred->euid, cred->uid) \|\|
				639	!gid_eq(cred->egid, cred->gid));
				640	}
				641
				642	/**
				643	* cap_inode_setxattr - Determine whether an xattr may be altered
				644	* @dentry: The inode/dentry being altered
				645	* @name: The name of the xattr to be changed
				646	* @value: The value that the xattr will be changed to
				647	* @size: The size of value
				648	* @flags: The replacement flag
				649	*
				650	* Determine whether an xattr may be altered or set on an inode, returning 0 if
				651	* permission is granted, -ve if denied.
				652	*
				653	* This is used to make sure security xattrs don't get updated or set by those
				654	* who aren't privileged to do so.
				655	*/
				656	int cap_inode_setxattr(struct dentry dentry, const char name,
				657	const void *value, size_t size, int flags)
				658	{
				659	if (!strcmp(name, XATTR_NAME_CAPS)) {
				660	if (!capable(CAP_SETFCAP))
				661	return -EPERM;
				662	return 0;
				663	}
				664
				665	if (!strncmp(name, XATTR_SECURITY_PREFIX,
				666	sizeof(XATTR_SECURITY_PREFIX) - 1) &&
				667	!capable(CAP_SYS_ADMIN))
				668	return -EPERM;
				669	return 0;
				670	}
				671
				672	/**
				673	* cap_inode_removexattr - Determine whether an xattr may be removed
				674	* @dentry: The inode/dentry being altered
				675	* @name: The name of the xattr to be changed
				676	*
				677	* Determine whether an xattr may be removed from an inode, returning 0 if
				678	* permission is granted, -ve if denied.
				679	*
				680	* This is used to make sure security xattrs don't get removed by those who
				681	* aren't privileged to remove them.
				682	*/
				683	int cap_inode_removexattr(struct dentry dentry, const char name)
				684	{
				685	if (!strcmp(name, XATTR_NAME_CAPS)) {
				686	if (!capable(CAP_SETFCAP))
				687	return -EPERM;
				688	return 0;
				689	}
				690
				691	if (!strncmp(name, XATTR_SECURITY_PREFIX,
				692	sizeof(XATTR_SECURITY_PREFIX) - 1) &&
				693	!capable(CAP_SYS_ADMIN))
				694	return -EPERM;
				695	return 0;
				696	}
				697
				698	/*
				699	* cap_emulate_setxuid() fixes the effective / permitted capabilities of
				700	* a process after a call to setuid, setreuid, or setresuid.
				701	*
				702	* 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
				703	* {r,e,s}uid != 0, the permitted and effective capabilities are
				704	* cleared.
				705	*
				706	* 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
				707	* capabilities of the process are cleared.
				708	*
				709	* 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
				710	* capabilities are set to the permitted capabilities.
				711	*
				712	* fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
				713	* never happen.
				714	*
				715	* -astor
				716	*
				717	* cevans - New behaviour, Oct '99
				718	* A process may, via prctl(), elect to keep its capabilities when it
				719	* calls setuid() and switches away from uid==0. Both permitted and
				720	* effective sets will be retained.
				721	* Without this change, it was impossible for a daemon to drop only some
				722	* of its privilege. The call to setuid(!=0) would drop all privileges!
				723	* Keeping uid 0 is not an option because uid 0 owns too many vital
				724	* files..
				725	* Thanks to Olaf Kirch and Peter Benie for spotting this.
				726	*/
				727	static inline void cap_emulate_setxuid(struct cred new, const struct cred old)
				728	{
				729	kuid_t root_uid = make_kuid(old->user_ns, 0);
				730
				731	if ((uid_eq(old->uid, root_uid) \|\|
				732	uid_eq(old->euid, root_uid) \|\|
				733	uid_eq(old->suid, root_uid)) &&
				734	(!uid_eq(new->uid, root_uid) &&
				735	!uid_eq(new->euid, root_uid) &&
				736	!uid_eq(new->suid, root_uid))) {
				737	if (!issecure(SECURE_KEEP_CAPS)) {
				738	cap_clear(new->cap_permitted);
				739	cap_clear(new->cap_effective);
				740	}
				741
				742	/*
				743	* Pre-ambient programs expect setresuid to nonroot followed
				744	* by exec to drop capabilities. We should make sure that
				745	* this remains the case.
				746	*/
				747	cap_clear(new->cap_ambient);
				748	}
				749	if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
				750	cap_clear(new->cap_effective);
				751	if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid))
				752	new->cap_effective = new->cap_permitted;
				753	}
				754
				755	/**
				756	* cap_task_fix_setuid - Fix up the results of setuid() call
				757	* @new: The proposed credentials
				758	* @old: The current task's current credentials
				759	* @flags: Indications of what has changed
				760	*
				761	* Fix up the results of setuid() call before the credential changes are
				762	* actually applied, returning 0 to grant the changes, -ve to deny them.
				763	*/
				764	int cap_task_fix_setuid(struct cred new, const struct cred old, int flags)
				765	{
				766	switch (flags) {
				767	case LSM_SETID_RE:
				768	case LSM_SETID_ID:
				769	case LSM_SETID_RES:
				770	/* juggle the capabilities to follow [RES]UID changes unless
				771	* otherwise suppressed */
				772	if (!issecure(SECURE_NO_SETUID_FIXUP))
				773	cap_emulate_setxuid(new, old);
				774	break;
				775
				776	case LSM_SETID_FS:
				777	/* juggle the capabilties to follow FSUID changes, unless
				778	* otherwise suppressed
				779	*
				780	* FIXME - is fsuser used for all CAP_FS_MASK capabilities?
				781	* if not, we might be a bit too harsh here.
				782	*/
				783	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
				784	kuid_t root_uid = make_kuid(old->user_ns, 0);
				785	if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid))
				786	new->cap_effective =
				787	cap_drop_fs_set(new->cap_effective);
				788
				789	if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid))
				790	new->cap_effective =
				791	cap_raise_fs_set(new->cap_effective,
				792	new->cap_permitted);
				793	}
				794	break;
				795
				796	default:
				797	return -EINVAL;
				798	}
				799
				800	return 0;
				801	}
				802
				803	/*
				804	* Rationale: code calling task_setscheduler, task_setioprio, and
				805	* task_setnice, assumes that
				806	* . if capable(cap_sys_nice), then those actions should be allowed
				807	* . if not capable(cap_sys_nice), but acting on your own processes,
				808	* then those actions should be allowed
				809	* This is insufficient now since you can call code without suid, but
				810	* yet with increased caps.
				811	* So we check for increased caps on the target process.
				812	*/
				813	static int cap_safe_nice(struct task_struct *p)
				814	{
				815	int is_subset, ret = 0;
				816
				817	rcu_read_lock();
				818	is_subset = cap_issubset(__task_cred(p)->cap_permitted,
				819	current_cred()->cap_permitted);
				820	if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
				821	ret = -EPERM;
				822	rcu_read_unlock();
				823
				824	return ret;
				825	}
				826
				827	/**
				828	* cap_task_setscheduler - Detemine if scheduler policy change is permitted
				829	* @p: The task to affect
				830	*
				831	* Detemine if the requested scheduler policy change is permitted for the
				832	* specified task, returning 0 if permission is granted, -ve if denied.
				833	*/
				834	int cap_task_setscheduler(struct task_struct *p)
				835	{
				836	return cap_safe_nice(p);
				837	}
				838
				839	/**
				840	* cap_task_ioprio - Detemine if I/O priority change is permitted
				841	* @p: The task to affect
				842	* @ioprio: The I/O priority to set
				843	*
				844	* Detemine if the requested I/O priority change is permitted for the specified
				845	* task, returning 0 if permission is granted, -ve if denied.
				846	*/
				847	int cap_task_setioprio(struct task_struct *p, int ioprio)
				848	{
				849	return cap_safe_nice(p);
				850	}
				851
				852	/**
				853	* cap_task_ioprio - Detemine if task priority change is permitted
				854	* @p: The task to affect
				855	* @nice: The nice value to set
				856	*
				857	* Detemine if the requested task priority change is permitted for the
				858	* specified task, returning 0 if permission is granted, -ve if denied.
				859	*/
				860	int cap_task_setnice(struct task_struct *p, int nice)
				861	{
				862	return cap_safe_nice(p);
				863	}
				864
				865	/*
				866	* Implement PR_CAPBSET_DROP. Attempt to remove the specified capability from
				867	* the current task's bounding set. Returns 0 on success, -ve on error.
				868	*/
				869	static int cap_prctl_drop(unsigned long cap)
				870	{
				871	struct cred *new;
				872
				873	if (!ns_capable(current_user_ns(), CAP_SETPCAP))
				874	return -EPERM;
				875	if (!cap_valid(cap))
				876	return -EINVAL;
				877
				878	new = prepare_creds();
				879	if (!new)
				880	return -ENOMEM;
				881	cap_lower(new->cap_bset, cap);
				882	return commit_creds(new);
				883	}
				884
				885	/**
				886	* cap_task_prctl - Implement process control functions for this security module
				887	* @option: The process control function requested
				888	* @arg2, @arg3, @arg4, @arg5: The argument data for this function
				889	*
				890	* Allow process control functions (sys_prctl()) to alter capabilities; may
				891	* also deny access to other functions not otherwise implemented here.
				892	*
				893	* Returns 0 or +ve on success, -ENOSYS if this function is not implemented
				894	* here, other -ve on error. If -ENOSYS is returned, sys_prctl() and other LSM
				895	* modules will consider performing the function.
				896	*/
				897	int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
				898	unsigned long arg4, unsigned long arg5)
				899	{
				900	const struct cred *old = current_cred();
				901	struct cred *new;
				902
				903	switch (option) {
				904	case PR_CAPBSET_READ:
				905	if (!cap_valid(arg2))
				906	return -EINVAL;
				907	return !!cap_raised(old->cap_bset, arg2);
				908
				909	case PR_CAPBSET_DROP:
				910	return cap_prctl_drop(arg2);
				911
				912	/*
				913	* The next four prctl's remain to assist with transitioning a
				914	* system from legacy UID=0 based privilege (when filesystem
				915	* capabilities are not in use) to a system using filesystem
				916	* capabilities only - as the POSIX.1e draft intended.
				917	*
				918	* Note:
				919	*
				920	* PR_SET_SECUREBITS =
				921	* issecure_mask(SECURE_KEEP_CAPS_LOCKED)
				922	* \| issecure_mask(SECURE_NOROOT)
				923	* \| issecure_mask(SECURE_NOROOT_LOCKED)
				924	* \| issecure_mask(SECURE_NO_SETUID_FIXUP)
				925	* \| issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
				926	*
				927	* will ensure that the current process and all of its
				928	* children will be locked into a pure
				929	* capability-based-privilege environment.
				930	*/
				931	case PR_SET_SECUREBITS:
				932	if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
				933	& (old->securebits ^ arg2)) /[1]/
				934	\|\| ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /[2]/
				935	\|\| (arg2 & ~(SECURE_ALL_LOCKS \| SECURE_ALL_BITS)) /[3]/
				936	\|\| (cap_capable(current_cred(),
				937	current_cred()->user_ns, CAP_SETPCAP,
				938	SECURITY_CAP_AUDIT) != 0) /[4]/
				939	/*
				940	* [1] no changing of bits that are locked
				941	* [2] no unlocking of locks
				942	* [3] no setting of unsupported bits
				943	* [4] doing anything requires privilege (go read about
				944	* the "sendmail capabilities bug")
				945	*/
				946	)
				947	/* cannot change a locked bit */
				948	return -EPERM;
				949
				950	new = prepare_creds();
				951	if (!new)
				952	return -ENOMEM;
				953	new->securebits = arg2;
				954	return commit_creds(new);
				955
				956	case PR_GET_SECUREBITS:
				957	return old->securebits;
				958
				959	case PR_GET_KEEPCAPS:
				960	return !!issecure(SECURE_KEEP_CAPS);
				961
				962	case PR_SET_KEEPCAPS:
				963	if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
				964	return -EINVAL;
				965	if (issecure(SECURE_KEEP_CAPS_LOCKED))
				966	return -EPERM;
				967
				968	new = prepare_creds();
				969	if (!new)
				970	return -ENOMEM;
				971	if (arg2)
				972	new->securebits \|= issecure_mask(SECURE_KEEP_CAPS);
				973	else
				974	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
				975	return commit_creds(new);
				976
				977	case PR_CAP_AMBIENT:
				978	if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
				979	if (arg3 \| arg4 \| arg5)
				980	return -EINVAL;
				981
				982	new = prepare_creds();
				983	if (!new)
				984	return -ENOMEM;
				985	cap_clear(new->cap_ambient);
				986	return commit_creds(new);
				987	}
				988
				989	if (((!cap_valid(arg3)) \| arg4 \| arg5))
				990	return -EINVAL;
				991
				992	if (arg2 == PR_CAP_AMBIENT_IS_SET) {
				993	return !!cap_raised(current_cred()->cap_ambient, arg3);
				994	} else if (arg2 != PR_CAP_AMBIENT_RAISE &&
				995	arg2 != PR_CAP_AMBIENT_LOWER) {
				996	return -EINVAL;
				997	} else {
				998	if (arg2 == PR_CAP_AMBIENT_RAISE &&
				999	(!cap_raised(current_cred()->cap_permitted, arg3) \|\|
				1000	!cap_raised(current_cred()->cap_inheritable,
				1001	arg3) \|\|
				1002	issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
				1003	return -EPERM;
				1004
				1005	new = prepare_creds();
				1006	if (!new)
				1007	return -ENOMEM;
				1008	if (arg2 == PR_CAP_AMBIENT_RAISE)
				1009	cap_raise(new->cap_ambient, arg3);
				1010	else
				1011	cap_lower(new->cap_ambient, arg3);
				1012	return commit_creds(new);
				1013	}
				1014
				1015	default:
				1016	/* No functionality available - continue with default */
				1017	return -ENOSYS;
				1018	}
				1019	}
				1020
				1021	/**
				1022	* cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
				1023	* @mm: The VM space in which the new mapping is to be made
				1024	* @pages: The size of the mapping
				1025	*
				1026	* Determine whether the allocation of a new virtual mapping by the current
				1027	* task is permitted, returning 1 if permission is granted, 0 if not.
				1028	*/
				1029	int cap_vm_enough_memory(struct mm_struct *mm, long pages)
				1030	{
				1031	int cap_sys_admin = 0;
				1032
				1033	if (cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN,
				1034	SECURITY_CAP_NOAUDIT) == 0)
				1035	cap_sys_admin = 1;
				1036	return cap_sys_admin;
				1037	}
				1038
				1039	/*
				1040	* cap_mmap_addr - check if able to map given addr
				1041	* @addr: address attempting to be mapped
				1042	*
				1043	* If the process is attempting to map memory below dac_mmap_min_addr they need
				1044	* CAP_SYS_RAWIO. The other parameters to this function are unused by the
				1045	* capability security module. Returns 0 if this mapping should be allowed
				1046	* -EPERM if not.
				1047	*/
				1048	int cap_mmap_addr(unsigned long addr)
				1049	{
				1050	int ret = 0;
				1051
				1052	if (addr < dac_mmap_min_addr) {
				1053	ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
				1054	SECURITY_CAP_AUDIT);
				1055	/* set PF_SUPERPRIV if it turns out we allow the low mmap */
				1056	if (ret == 0)
				1057	current->flags \|= PF_SUPERPRIV;
				1058	}
				1059	return ret;
				1060	}
				1061
				/*
				 * cap_mmap_file - file mapping check for the capability module
				 * @file: the file being mapped (not examined)
				 * @reqprot: the requested protection (not examined)
				 * @prot: the protection to be applied (not examined)
				 * @flags: the mapping flags (not examined)
				 *
				 * Imposes no restriction: always returns 0.
				 */
				int cap_mmap_file(struct file *file, unsigned long reqprot,
						  unsigned long prot, unsigned long flags)
				{
					return 0;
				}
				1067
				1068	#ifdef CONFIG_SECURITY
				1069
				1070	struct security_hook_list capability_hooks[] = {
				1071	LSM_HOOK_INIT(capable, cap_capable),
				1072	LSM_HOOK_INIT(settime, cap_settime),
				1073	LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
				1074	LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
				1075	LSM_HOOK_INIT(capget, cap_capget),
				1076	LSM_HOOK_INIT(capset, cap_capset),
				1077	LSM_HOOK_INIT(bprm_set_creds, cap_bprm_set_creds),
				1078	LSM_HOOK_INIT(bprm_secureexec, cap_bprm_secureexec),
				1079	LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
				1080	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
				1081	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
				1082	LSM_HOOK_INIT(mmap_file, cap_mmap_file),
				1083	LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
				1084	LSM_HOOK_INIT(task_prctl, cap_task_prctl),
				1085	LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
				1086	LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
				1087	LSM_HOOK_INIT(task_setnice, cap_task_setnice),
				1088	LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
				1089	};
				1090
				1091	void __init capability_add_hooks(void)
				1092	{
				1093	security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks));
				1094	}
				1095
				1096	#endif /* CONFIG_SECURITY */