Blame - fs/namespace.c - codeaurora/cp-linux

blob: ec4078d16eb7cd2914bdcf9b7b26bf546173670e [file] [log] [blame]

Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1	/*
				2	* linux/fs/namespace.c
				3	*
				4	* (C) Copyright Al Viro 2000, 2001
				5	* Released under GPL v2.
				6	*
				7	* Based on code from fs/super.c, copyright Linus Torvalds and others.
				8	* Heavily rewritten.
				9	*/
				10
				11	#include <linux/syscalls.h>
				12	#include <linux/export.h>
				13	#include <linux/capability.h>
				14	#include <linux/mnt_namespace.h>
				15	#include <linux/user_namespace.h>
				16	#include <linux/namei.h>
				17	#include <linux/security.h>
				18	#include <linux/idr.h>
				19	#include <linux/init.h> /* init_rootfs */
				20	#include <linux/fs_struct.h> /* get_fs_root et.al. */
				21	#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
				22	#include <linux/uaccess.h>
				23	#include <linux/proc_ns.h>
				24	#include <linux/magic.h>
				25	#include <linux/bootmem.h>
				26	#include <linux/task_work.h>
				27	#include "pnode.h"
				28	#include "internal.h"
				29
				30	/* Maximum number of mounts in a mount namespace */
				31	unsigned int sysctl_mount_max __read_mostly = 100000;
				32
				33	static unsigned int m_hash_mask __read_mostly;
				34	static unsigned int m_hash_shift __read_mostly;
				35	static unsigned int mp_hash_mask __read_mostly;
				36	static unsigned int mp_hash_shift __read_mostly;
				37
				38	static __initdata unsigned long mhash_entries;
				39	static int __init set_mhash_entries(char *str)
				40	{
				41	if (!str)
				42	return 0;
				43	mhash_entries = simple_strtoul(str, &str, 0);
				44	return 1;
				45	}
				46	__setup("mhash_entries=", set_mhash_entries);
				47
				48	static __initdata unsigned long mphash_entries;
				49	static int __init set_mphash_entries(char *str)
				50	{
				51	if (!str)
				52	return 0;
				53	mphash_entries = simple_strtoul(str, &str, 0);
				54	return 1;
				55	}
				56	__setup("mphash_entries=", set_mphash_entries);
				57
				58	static u64 event;
				59	static DEFINE_IDA(mnt_id_ida);
				60	static DEFINE_IDA(mnt_group_ida);
				61	static DEFINE_SPINLOCK(mnt_id_lock);
				62	static int mnt_id_start = 0;
				63	static int mnt_group_start = 1;
				64
				65	static struct hlist_head *mount_hashtable __read_mostly;
				66	static struct hlist_head *mountpoint_hashtable __read_mostly;
				67	static struct kmem_cache *mnt_cache __read_mostly;
				68	static DECLARE_RWSEM(namespace_sem);
				69
				/* kobject for /sys/fs */
				struct kobject *fs_kobj;
				EXPORT_SYMBOL_GPL(fs_kobj);
				73
				74	/*
				75	* vfsmount lock may be taken for read to prevent changes to the
				76	* vfsmount hash, ie. during mountpoint lookups or walking back
				77	* up the tree.
				78	*
				79	* It should be taken for write in all cases where the vfsmount
				80	* tree or hash is modified or when a vfsmount structure is modified.
				81	*/
				82	__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
				83
				84	static inline struct hlist_head m_hash(struct vfsmount mnt, struct dentry *dentry)
				85	{
				86	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
				87	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
				88	tmp = tmp + (tmp >> m_hash_shift);
				89	return &mount_hashtable[tmp & m_hash_mask];
				90	}
				91
				92	static inline struct hlist_head mp_hash(struct dentry dentry)
				93	{
				94	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
				95	tmp = tmp + (tmp >> mp_hash_shift);
				96	return &mountpoint_hashtable[tmp & mp_hash_mask];
				97	}
				98
				99	/*
				100	* allocation is serialized by namespace_sem, but we need the spinlock to
				101	* serialize with freeing.
				102	*/
				103	static int mnt_alloc_id(struct mount *mnt)
				104	{
				105	int res;
				106
				107	retry:
				108	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
				109	spin_lock(&mnt_id_lock);
				110	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
				111	if (!res)
				112	mnt_id_start = mnt->mnt_id + 1;
				113	spin_unlock(&mnt_id_lock);
				114	if (res == -EAGAIN)
				115	goto retry;
				116
				117	return res;
				118	}
				119
				120	static void mnt_free_id(struct mount *mnt)
				121	{
				122	int id = mnt->mnt_id;
				123	spin_lock(&mnt_id_lock);
				124	ida_remove(&mnt_id_ida, id);
				125	if (mnt_id_start > id)
				126	mnt_id_start = id;
				127	spin_unlock(&mnt_id_lock);
				128	}
				129
				130	/*
				131	* Allocate a new peer group ID
				132	*
				133	* mnt_group_ida is protected by namespace_sem
				134	*/
				135	static int mnt_alloc_group_id(struct mount *mnt)
				136	{
				137	int res;
				138
				139	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
				140	return -ENOMEM;
				141
				142	res = ida_get_new_above(&mnt_group_ida,
				143	mnt_group_start,
				144	&mnt->mnt_group_id);
				145	if (!res)
				146	mnt_group_start = mnt->mnt_group_id + 1;
				147
				148	return res;
				149	}
				150
				151	/*
				152	* Release a peer group ID
				153	*/
				154	void mnt_release_group_id(struct mount *mnt)
				155	{
				156	int id = mnt->mnt_group_id;
				157	ida_remove(&mnt_group_ida, id);
				158	if (mnt_group_start > id)
				159	mnt_group_start = id;
				160	mnt->mnt_group_id = 0;
				161	}
				162
				163	/*
				164	* vfsmount lock must be held for read
				165	*/
				166	static inline void mnt_add_count(struct mount *mnt, int n)
				167	{
				168	#ifdef CONFIG_SMP
				169	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
				170	#else
				171	preempt_disable();
				172	mnt->mnt_count += n;
				173	preempt_enable();
				174	#endif
				175	}
				176
				177	/*
				178	* vfsmount lock must be held for write
				179	*/
				180	unsigned int mnt_get_count(struct mount *mnt)
				181	{
				182	#ifdef CONFIG_SMP
				183	unsigned int count = 0;
				184	int cpu;
				185
				186	for_each_possible_cpu(cpu) {
				187	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
				188	}
				189
				190	return count;
				191	#else
				192	return mnt->mnt_count;
				193	#endif
				194	}
				195
				196	static void drop_mountpoint(struct fs_pin *p)
				197	{
				198	struct mount *m = container_of(p, struct mount, mnt_umount);
				199	dput(m->mnt_ex_mountpoint);
				200	pin_remove(p);
				201	mntput(&m->mnt);
				202	}
				203
				204	static struct mount alloc_vfsmnt(const char name)
				205	{
				206	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
				207	if (mnt) {
				208	int err;
				209
				210	err = mnt_alloc_id(mnt);
				211	if (err)
				212	goto out_free_cache;
				213
				214	if (name) {
				215	mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
				216	if (!mnt->mnt_devname)
				217	goto out_free_id;
				218	}
				219
				220	#ifdef CONFIG_SMP
				221	mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
				222	if (!mnt->mnt_pcp)
				223	goto out_free_devname;
				224
				225	this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
				226	#else
				227	mnt->mnt_count = 1;
				228	mnt->mnt_writers = 0;
				229	#endif
				230
				231	INIT_HLIST_NODE(&mnt->mnt_hash);
				232	INIT_LIST_HEAD(&mnt->mnt_child);
				233	INIT_LIST_HEAD(&mnt->mnt_mounts);
				234	INIT_LIST_HEAD(&mnt->mnt_list);
				235	INIT_LIST_HEAD(&mnt->mnt_expire);
				236	INIT_LIST_HEAD(&mnt->mnt_share);
				237	INIT_LIST_HEAD(&mnt->mnt_slave_list);
				238	INIT_LIST_HEAD(&mnt->mnt_slave);
				239	INIT_HLIST_NODE(&mnt->mnt_mp_list);
				240	INIT_LIST_HEAD(&mnt->mnt_umounting);
				241	#ifdef CONFIG_FSNOTIFY
				242	INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
				243	#endif
				244	init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
				245	}
				246	return mnt;
				247
				248	#ifdef CONFIG_SMP
				249	out_free_devname:
				250	kfree_const(mnt->mnt_devname);
				251	#endif
				252	out_free_id:
				253	mnt_free_id(mnt);
				254	out_free_cache:
				255	kmem_cache_free(mnt_cache, mnt);
				256	return NULL;
				257	}
				258
				259	/*
				260	* Most r/o checks on a fs are for operations that take
				261	* discrete amounts of time, like a write() or unlink().
				262	* We must keep track of when those operations start
				263	* (for permission checks) and when they end, so that
				264	* we can determine when writes are able to occur to
				265	* a filesystem.
				266	*/
				267	/*
				268	* __mnt_is_readonly: check whether a mount is read-only
				269	* @mnt: the mount to check for its write status
				270	*
				271	* This shouldn't be used directly ouside of the VFS.
				272	* It does not guarantee that the filesystem will stay
				273	* r/w, just that it is right now. This can not and
				274	* should not be used in place of IS_RDONLY(inode).
				275	* mnt_want/drop_write() will _keep_ the filesystem
				276	* r/w.
				277	*/
				278	int __mnt_is_readonly(struct vfsmount *mnt)
				279	{
				280	if (mnt->mnt_flags & MNT_READONLY)
				281	return 1;
				282	if (mnt->mnt_sb->s_flags & MS_RDONLY)
				283	return 1;
				284	return 0;
				285	}
				286	EXPORT_SYMBOL_GPL(__mnt_is_readonly);
				287
				288	static inline void mnt_inc_writers(struct mount *mnt)
				289	{
				290	#ifdef CONFIG_SMP
				291	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
				292	#else
				293	mnt->mnt_writers++;
				294	#endif
				295	}
				296
				297	static inline void mnt_dec_writers(struct mount *mnt)
				298	{
				299	#ifdef CONFIG_SMP
				300	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
				301	#else
				302	mnt->mnt_writers--;
				303	#endif
				304	}
				305
				306	static unsigned int mnt_get_writers(struct mount *mnt)
				307	{
				308	#ifdef CONFIG_SMP
				309	unsigned int count = 0;
				310	int cpu;
				311
				312	for_each_possible_cpu(cpu) {
				313	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
				314	}
				315
				316	return count;
				317	#else
				318	return mnt->mnt_writers;
				319	#endif
				320	}
				321
				322	static int mnt_is_readonly(struct vfsmount *mnt)
				323	{
				324	if (mnt->mnt_sb->s_readonly_remount)
				325	return 1;
				326	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
				327	smp_rmb();
				328	return __mnt_is_readonly(mnt);
				329	}
				330
				331	/*
				332	* Most r/o & frozen checks on a fs are for operations that take discrete
				333	* amounts of time, like a write() or unlink(). We must keep track of when
				334	* those operations start (for permission checks) and when they end, so that we
				335	* can determine when writes are able to occur to a filesystem.
				336	*/
				337	/**
				338	* __mnt_want_write - get write access to a mount without freeze protection
				339	* @m: the mount on which to take a write
				340	*
				341	* This tells the low-level filesystem that a write is about to be performed to
				342	* it, and makes sure that writes are allowed (mnt it read-write) before
				343	* returning success. This operation does not protect against filesystem being
				344	* frozen. When the write operation is finished, __mnt_drop_write() must be
				345	* called. This is effectively a refcount.
				346	*/
				347	int __mnt_want_write(struct vfsmount *m)
				348	{
				349	struct mount *mnt = real_mount(m);
				350	int ret = 0;
				351
				352	preempt_disable();
				353	mnt_inc_writers(mnt);
				354	/*
				355	* The store to mnt_inc_writers must be visible before we pass
				356	* MNT_WRITE_HOLD loop below, so that the slowpath can see our
				357	* incremented count after it has set MNT_WRITE_HOLD.
				358	*/
				359	smp_mb();
				360	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
				361	cpu_relax();
				362	/*
				363	* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
				364	* be set to match its requirements. So we must not load that until
				365	* MNT_WRITE_HOLD is cleared.
				366	*/
				367	smp_rmb();
				368	if (mnt_is_readonly(m)) {
				369	mnt_dec_writers(mnt);
				370	ret = -EROFS;
				371	}
				372	preempt_enable();
				373
				374	return ret;
				375	}
				376
				377	/**
				378	* mnt_want_write - get write access to a mount
				379	* @m: the mount on which to take a write
				380	*
				381	* This tells the low-level filesystem that a write is about to be performed to
				382	* it, and makes sure that writes are allowed (mount is read-write, filesystem
				383	* is not frozen) before returning success. When the write operation is
				384	* finished, mnt_drop_write() must be called. This is effectively a refcount.
				385	*/
				386	int mnt_want_write(struct vfsmount *m)
				387	{
				388	int ret;
				389
				390	sb_start_write(m->mnt_sb);
				391	ret = __mnt_want_write(m);
				392	if (ret)
				393	sb_end_write(m->mnt_sb);
				394	return ret;
				395	}
				396	EXPORT_SYMBOL_GPL(mnt_want_write);
				397
				398	/**
				399	* mnt_clone_write - get write access to a mount
				400	* @mnt: the mount on which to take a write
				401	*
				402	* This is effectively like mnt_want_write, except
				403	* it must only be used to take an extra write reference
				404	* on a mountpoint that we already know has a write reference
				405	* on it. This allows some optimisation.
				406	*
				407	* After finished, mnt_drop_write must be called as usual to
				408	* drop the reference.
				409	*/
				410	int mnt_clone_write(struct vfsmount *mnt)
				411	{
				412	/* superblock may be r/o */
				413	if (__mnt_is_readonly(mnt))
				414	return -EROFS;
				415	preempt_disable();
				416	mnt_inc_writers(real_mount(mnt));
				417	preempt_enable();
				418	return 0;
				419	}
				420	EXPORT_SYMBOL_GPL(mnt_clone_write);
				421
				422	/**
				423	* __mnt_want_write_file - get write access to a file's mount
				424	* @file: the file who's mount on which to take a write
				425	*
				426	* This is like __mnt_want_write, but it takes a file and can
				427	* do some optimisations if the file is open for write already
				428	*/
				429	int __mnt_want_write_file(struct file *file)
				430	{
				431	if (!(file->f_mode & FMODE_WRITER))
				432	return __mnt_want_write(file->f_path.mnt);
				433	else
				434	return mnt_clone_write(file->f_path.mnt);
				435	}
				436
				437	/**
				438	* mnt_want_write_file - get write access to a file's mount
				439	* @file: the file who's mount on which to take a write
				440	*
				441	* This is like mnt_want_write, but it takes a file and can
				442	* do some optimisations if the file is open for write already
				443	*/
				444	int mnt_want_write_file(struct file *file)
				445	{
				446	int ret;
				447
				448	sb_start_write(file->f_path.mnt->mnt_sb);
				449	ret = __mnt_want_write_file(file);
				450	if (ret)
				451	sb_end_write(file->f_path.mnt->mnt_sb);
				452	return ret;
				453	}
				454	EXPORT_SYMBOL_GPL(mnt_want_write_file);
				455
				/**
				 * __mnt_drop_write - give up write access to a mount
				 * @mnt: the mount on which to give up write access
				 *
				 * Tells the low-level filesystem that we are done performing writes to it.
				 * Must be matched with __mnt_want_write() call above.
				 */
				void __mnt_drop_write(struct vfsmount *mnt)
				{
					struct mount *m = real_mount(mnt);

					preempt_disable();
					mnt_dec_writers(m);
					preempt_enable();
				}
				470
				471	/**
				472	* mnt_drop_write - give up write access to a mount
				473	* @mnt: the mount on which to give up write access
				474	*
				475	* Tells the low-level filesystem that we are done performing writes to it and
				476	* also allows filesystem to be frozen again. Must be matched with
				477	* mnt_want_write() call above.
				478	*/
				479	void mnt_drop_write(struct vfsmount *mnt)
				480	{
				481	__mnt_drop_write(mnt);
				482	sb_end_write(mnt->mnt_sb);
				483	}
				484	EXPORT_SYMBOL_GPL(mnt_drop_write);
				485
				486	void __mnt_drop_write_file(struct file *file)
				487	{
				488	__mnt_drop_write(file->f_path.mnt);
				489	}
				490
				491	void mnt_drop_write_file(struct file *file)
				492	{
				493	mnt_drop_write(file->f_path.mnt);
				494	}
				495	EXPORT_SYMBOL(mnt_drop_write_file);
				496
				497	static int mnt_make_readonly(struct mount *mnt)
				498	{
				499	int ret = 0;
				500
				501	lock_mount_hash();
				502	mnt->mnt.mnt_flags \|= MNT_WRITE_HOLD;
				503	/*
				504	* After storing MNT_WRITE_HOLD, we'll read the counters. This store
				505	* should be visible before we do.
				506	*/
				507	smp_mb();
				508
				509	/*
				510	* With writers on hold, if this value is zero, then there are
				511	* definitely no active writers (although held writers may subsequently
				512	* increment the count, they'll have to wait, and decrement it after
				513	* seeing MNT_READONLY).
				514	*
				515	* It is OK to have counter incremented on one CPU and decremented on
				516	* another: the sum will add up correctly. The danger would be when we
				517	* sum up each counter, if we read a counter before it is incremented,
				518	* but then read another CPU's count which it has been subsequently
				519	* decremented from -- we would see more decrements than we should.
				520	* MNT_WRITE_HOLD protects against this scenario, because
				521	* mnt_want_write first increments count, then smp_mb, then spins on
				522	* MNT_WRITE_HOLD, so it can't be decremented by another CPU while
				523	* we're counting up here.
				524	*/
				525	if (mnt_get_writers(mnt) > 0)
				526	ret = -EBUSY;
				527	else
				528	mnt->mnt.mnt_flags \|= MNT_READONLY;
				529	/*
				530	* MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
				531	* that become unheld will see MNT_READONLY.
				532	*/
				533	smp_wmb();
				534	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
				535	unlock_mount_hash();
				536	return ret;
				537	}
				538
				539	static void __mnt_unmake_readonly(struct mount *mnt)
				540	{
				541	lock_mount_hash();
				542	mnt->mnt.mnt_flags &= ~MNT_READONLY;
				543	unlock_mount_hash();
				544	}
				545
				546	int sb_prepare_remount_readonly(struct super_block *sb)
				547	{
				548	struct mount *mnt;
				549	int err = 0;
				550
				551	/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
				552	if (atomic_long_read(&sb->s_remove_count))
				553	return -EBUSY;
				554
				555	lock_mount_hash();
				556	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
				557	if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
				558	mnt->mnt.mnt_flags \|= MNT_WRITE_HOLD;
				559	smp_mb();
				560	if (mnt_get_writers(mnt) > 0) {
				561	err = -EBUSY;
				562	break;
				563	}
				564	}
				565	}
				566	if (!err && atomic_long_read(&sb->s_remove_count))
				567	err = -EBUSY;
				568
				569	if (!err) {
				570	sb->s_readonly_remount = 1;
				571	smp_wmb();
				572	}
				573	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
				574	if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
				575	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
				576	}
				577	unlock_mount_hash();
				578
				579	return err;
				580	}
				581
				582	static void free_vfsmnt(struct mount *mnt)
				583	{
				584	kfree_const(mnt->mnt_devname);
				585	#ifdef CONFIG_SMP
				586	free_percpu(mnt->mnt_pcp);
				587	#endif
				588	kmem_cache_free(mnt_cache, mnt);
				589	}
				590
				591	static void delayed_free_vfsmnt(struct rcu_head *head)
				592	{
				593	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
				594	}
				595
				596	/* call under rcu_read_lock */
				597	int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
				598	{
				599	struct mount *mnt;
				600	if (read_seqretry(&mount_lock, seq))
				601	return 1;
				602	if (bastard == NULL)
				603	return 0;
				604	mnt = real_mount(bastard);
				605	mnt_add_count(mnt, 1);
				606	if (likely(!read_seqretry(&mount_lock, seq)))
				607	return 0;
				608	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
				609	mnt_add_count(mnt, -1);
				610	return 1;
				611	}
				612	return -1;
				613	}
				614
				615	/* call under rcu_read_lock */
				616	bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
				617	{
				618	int res = __legitimize_mnt(bastard, seq);
				619	if (likely(!res))
				620	return true;
				621	if (unlikely(res < 0)) {
				622	rcu_read_unlock();
				623	mntput(bastard);
				624	rcu_read_lock();
				625	}
				626	return false;
				627	}
				628
				629	/*
				630	* find the first mount at @dentry on vfsmount @mnt.
				631	* call under rcu_read_lock()
				632	*/
				633	struct mount __lookup_mnt(struct vfsmount mnt, struct dentry *dentry)
				634	{
				635	struct hlist_head *head = m_hash(mnt, dentry);
				636	struct mount *p;
				637
				638	hlist_for_each_entry_rcu(p, head, mnt_hash)
				639	if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
				640	return p;
				641	return NULL;
				642	}
				643
				644	/*
				645	* lookup_mnt - Return the first child mount mounted at path
				646	*
				647	* "First" means first mounted chronologically. If you create the
				648	* following mounts:
				649	*
				650	* mount /dev/sda1 /mnt
				651	* mount /dev/sda2 /mnt
				652	* mount /dev/sda3 /mnt
				653	*
				654	* Then lookup_mnt() on the base /mnt dentry in the root mount will
				655	* return successively the root dentry and vfsmount of /dev/sda1, then
				656	* /dev/sda2, then /dev/sda3, then NULL.
				657	*
				658	* lookup_mnt takes a reference to the found vfsmount.
				659	*/
				660	struct vfsmount lookup_mnt(struct path path)
				661	{
				662	struct mount *child_mnt;
				663	struct vfsmount *m;
				664	unsigned seq;
				665
				666	rcu_read_lock();
				667	do {
				668	seq = read_seqbegin(&mount_lock);
				669	child_mnt = __lookup_mnt(path->mnt, path->dentry);
				670	m = child_mnt ? &child_mnt->mnt : NULL;
				671	} while (!legitimize_mnt(m, seq));
				672	rcu_read_unlock();
				673	return m;
				674	}
				675
				676	/*
				677	* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
				678	* current mount namespace.
				679	*
				680	* The common case is dentries are not mountpoints at all and that
				681	* test is handled inline. For the slow case when we are actually
				682	* dealing with a mountpoint of some kind, walk through all of the
				683	* mounts in the current mount namespace and test to see if the dentry
				684	* is a mountpoint.
				685	*
				686	* The mount_hashtable is not usable in the context because we
				687	* need to identify all mounts that may be in the current mount
				688	* namespace not just a mount that happens to have some specified
				689	* parent mount.
				690	*/
				691	bool __is_local_mountpoint(struct dentry *dentry)
				692	{
				693	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
				694	struct mount *mnt;
				695	bool is_covered = false;
				696
				697	if (!d_mountpoint(dentry))
				698	goto out;
				699
				700	down_read(&namespace_sem);
				701	list_for_each_entry(mnt, &ns->list, mnt_list) {
				702	is_covered = (mnt->mnt_mountpoint == dentry);
				703	if (is_covered)
				704	break;
				705	}
				706	up_read(&namespace_sem);
				707	out:
				708	return is_covered;
				709	}
				710
				711	static struct mountpoint lookup_mountpoint(struct dentry dentry)
				712	{
				713	struct hlist_head *chain = mp_hash(dentry);
				714	struct mountpoint *mp;
				715
				716	hlist_for_each_entry(mp, chain, m_hash) {
				717	if (mp->m_dentry == dentry) {
				718	/* might be worth a WARN_ON() */
				719	if (d_unlinked(dentry))
				720	return ERR_PTR(-ENOENT);
				721	mp->m_count++;
				722	return mp;
				723	}
				724	}
				725	return NULL;
				726	}
				727
				728	static struct mountpoint get_mountpoint(struct dentry dentry)
				729	{
				730	struct mountpoint mp, new = NULL;
				731	int ret;
				732
				733	if (d_mountpoint(dentry)) {
				734	mountpoint:
				735	read_seqlock_excl(&mount_lock);
				736	mp = lookup_mountpoint(dentry);
				737	read_sequnlock_excl(&mount_lock);
				738	if (mp)
				739	goto done;
				740	}
				741
				742	if (!new)
				743	new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
				744	if (!new)
				745	return ERR_PTR(-ENOMEM);
				746
				747
				748	/* Exactly one processes may set d_mounted */
				749	ret = d_set_mounted(dentry);
				750
				751	/* Someone else set d_mounted? */
				752	if (ret == -EBUSY)
				753	goto mountpoint;
				754
				755	/* The dentry is not available as a mountpoint? */
				756	mp = ERR_PTR(ret);
				757	if (ret)
				758	goto done;
				759
				760	/* Add the new mountpoint to the hash table */
				761	read_seqlock_excl(&mount_lock);
				762	new->m_dentry = dentry;
				763	new->m_count = 1;
				764	hlist_add_head(&new->m_hash, mp_hash(dentry));
				765	INIT_HLIST_HEAD(&new->m_list);
				766	read_sequnlock_excl(&mount_lock);
				767
				768	mp = new;
				769	new = NULL;
				770	done:
				771	kfree(new);
				772	return mp;
				773	}
				774
				775	static void put_mountpoint(struct mountpoint *mp)
				776	{
				777	if (!--mp->m_count) {
				778	struct dentry *dentry = mp->m_dentry;
				779	BUG_ON(!hlist_empty(&mp->m_list));
				780	spin_lock(&dentry->d_lock);
				781	dentry->d_flags &= ~DCACHE_MOUNTED;
				782	spin_unlock(&dentry->d_lock);
				783	hlist_del(&mp->m_hash);
				784	kfree(mp);
				785	}
				786	}
				787
				788	static inline int check_mnt(struct mount *mnt)
				789	{
				790	return mnt->mnt_ns == current->nsproxy->mnt_ns;
				791	}
				792
				793	/*
				794	* vfsmount lock must be held for write
				795	*/
				796	static void touch_mnt_namespace(struct mnt_namespace *ns)
				797	{
				798	if (ns) {
				799	ns->event = ++event;
				800	wake_up_interruptible(&ns->poll);
				801	}
				802	}
				803
				804	/*
				805	* vfsmount lock must be held for write
				806	*/
				807	static void __touch_mnt_namespace(struct mnt_namespace *ns)
				808	{
				809	if (ns && ns->event != event) {
				810	ns->event = event;
				811	wake_up_interruptible(&ns->poll);
				812	}
				813	}
				814
				815	/*
				816	* vfsmount lock must be held for write
				817	*/
				818	static void unhash_mnt(struct mount *mnt)
				819	{
				820	mnt->mnt_parent = mnt;
				821	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
				822	list_del_init(&mnt->mnt_child);
				823	hlist_del_init_rcu(&mnt->mnt_hash);
				824	hlist_del_init(&mnt->mnt_mp_list);
				825	put_mountpoint(mnt->mnt_mp);
				826	mnt->mnt_mp = NULL;
				827	}
				828
				829	/*
				830	* vfsmount lock must be held for write
				831	*/
				832	static void detach_mnt(struct mount mnt, struct path old_path)
				833	{
				834	old_path->dentry = mnt->mnt_mountpoint;
				835	old_path->mnt = &mnt->mnt_parent->mnt;
				836	unhash_mnt(mnt);
				837	}
				838
				839	/*
				840	* vfsmount lock must be held for write
				841	*/
				842	static void umount_mnt(struct mount *mnt)
				843	{
				844	/* old mountpoint will be dropped when we can do that */
				845	mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
				846	unhash_mnt(mnt);
				847	}
				848
				849	/*
				850	* vfsmount lock must be held for write
				851	*/
				852	void mnt_set_mountpoint(struct mount *mnt,
				853	struct mountpoint *mp,
				854	struct mount *child_mnt)
				855	{
				856	mp->m_count++;
				857	mnt_add_count(mnt, 1); /* essentially, that's mntget */
				858	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
				859	child_mnt->mnt_parent = mnt;
				860	child_mnt->mnt_mp = mp;
				861	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
				862	}
				863
				864	static void __attach_mnt(struct mount mnt, struct mount parent)
				865	{
				866	hlist_add_head_rcu(&mnt->mnt_hash,
				867	m_hash(&parent->mnt, mnt->mnt_mountpoint));
				868	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
				869	}
				870
				/*
				 * Attach @mnt to @parent at mountpoint @mp: record the mountpoint on
				 * @mnt, then hash it and link it into @parent's children.
				 *
				 * vfsmount lock must be held for write.
				 */
				static void attach_mnt(struct mount *mnt, struct mount *parent,
						       struct mountpoint *mp)
				{
					mnt_set_mountpoint(parent, mp, mnt);
					__attach_mnt(mnt, parent);
				}
				881
				882	void mnt_change_mountpoint(struct mount parent, struct mountpoint mp, struct mount *mnt)
				883	{
				884	struct mountpoint *old_mp = mnt->mnt_mp;
				885	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
				886	struct mount *old_parent = mnt->mnt_parent;
				887
				888	list_del_init(&mnt->mnt_child);
				889	hlist_del_init(&mnt->mnt_mp_list);
				890	hlist_del_init_rcu(&mnt->mnt_hash);
				891
				892	attach_mnt(mnt, parent, mp);
				893
				894	put_mountpoint(old_mp);
				895
				896	/*
				897	* Safely avoid even the suggestion this code might sleep or
				898	* lock the mount hash by taking advantage of the knowledge that
				899	* mnt_change_mountpoint will not release the final reference
				900	* to a mountpoint.
				901	*
				902	* During mounting, the mount passed in as the parent mount will
				903	* continue to use the old mountpoint and during unmounting, the
				904	* old mountpoint will continue to exist until namespace_unlock,
				905	* which happens well after mnt_change_mountpoint.
				906	*/
				907	spin_lock(&old_mountpoint->d_lock);
				908	old_mountpoint->d_lockref.count--;
				909	spin_unlock(&old_mountpoint->d_lock);
				910
				911	mnt_add_count(old_parent, -1);
				912	}
				913
				914	/*
				915	* vfsmount lock must be held for write
				916	*/
				917	static void commit_tree(struct mount *mnt)
				918	{
				919	struct mount *parent = mnt->mnt_parent;
				920	struct mount *m;
				921	LIST_HEAD(head);
				922	struct mnt_namespace *n = parent->mnt_ns;
				923
				924	BUG_ON(parent == mnt);
				925
				926	list_add_tail(&head, &mnt->mnt_list);
				927	list_for_each_entry(m, &head, mnt_list)
				928	m->mnt_ns = n;
				929
				930	list_splice(&head, n->list.prev);
				931
				932	n->mounts += n->pending_mounts;
				933	n->pending_mounts = 0;
				934
				935	__attach_mnt(mnt, parent);
				936	touch_mnt_namespace(n);
				937	}
				938
				939	static struct mount next_mnt(struct mount p, struct mount *root)
				940	{
				941	struct list_head *next = p->mnt_mounts.next;
				942	if (next == &p->mnt_mounts) {
				943	while (1) {
				944	if (p == root)
				945	return NULL;
				946	next = p->mnt_child.next;
				947	if (next != &p->mnt_parent->mnt_mounts)
				948	break;
				949	p = p->mnt_parent;
				950	}
				951	}
				952	return list_entry(next, struct mount, mnt_child);
				953	}
				954
				955	static struct mount skip_mnt_tree(struct mount p)
				956	{
				957	struct list_head *prev = p->mnt_mounts.prev;
				958	while (prev != &p->mnt_mounts) {
				959	p = list_entry(prev, struct mount, mnt_child);
				960	prev = p->mnt_mounts.prev;
				961	}
				962	return p;
				963	}
				964
				965	struct vfsmount *
				966	vfs_kern_mount(struct file_system_type type, int flags, const char name, void *data)
				967	{
				968	struct mount *mnt;
				969	struct dentry *root;
				970
				971	if (!type)
				972	return ERR_PTR(-ENODEV);
				973
				974	mnt = alloc_vfsmnt(name);
				975	if (!mnt)
				976	return ERR_PTR(-ENOMEM);
				977
				978	if (flags & MS_KERNMOUNT)
				979	mnt->mnt.mnt_flags = MNT_INTERNAL;
				980
				981	root = mount_fs(type, flags, name, data);
				982	if (IS_ERR(root)) {
				983	mnt_free_id(mnt);
				984	free_vfsmnt(mnt);
				985	return ERR_CAST(root);
				986	}
				987
				988	mnt->mnt.mnt_root = root;
				989	mnt->mnt.mnt_sb = root->d_sb;
				990	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
				991	mnt->mnt_parent = mnt;
				992	lock_mount_hash();
				993	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
				994	unlock_mount_hash();
				995	return &mnt->mnt;
				996	}
				997	EXPORT_SYMBOL_GPL(vfs_kern_mount);
				998
				999	static struct mount clone_mnt(struct mount old, struct dentry *root,
				1000	int flag)
				1001	{
				1002	struct super_block *sb = old->mnt.mnt_sb;
				1003	struct mount *mnt;
				1004	int err;
				1005
				1006	mnt = alloc_vfsmnt(old->mnt_devname);
				1007	if (!mnt)
				1008	return ERR_PTR(-ENOMEM);
				1009
				1010	if (flag & (CL_SLAVE \| CL_PRIVATE \| CL_SHARED_TO_SLAVE))
				1011	mnt->mnt_group_id = 0; /* not a peer of original */
				1012	else
				1013	mnt->mnt_group_id = old->mnt_group_id;
				1014
				1015	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
				1016	err = mnt_alloc_group_id(mnt);
				1017	if (err)
				1018	goto out_free;
				1019	}
				1020
				1021	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD\|MNT_MARKED);
				1022	/* Don't allow unprivileged users to change mount flags */
				1023	if (flag & CL_UNPRIVILEGED) {
				1024	mnt->mnt.mnt_flags \|= MNT_LOCK_ATIME;
				1025
				1026	if (mnt->mnt.mnt_flags & MNT_READONLY)
				1027	mnt->mnt.mnt_flags \|= MNT_LOCK_READONLY;
				1028
				1029	if (mnt->mnt.mnt_flags & MNT_NODEV)
				1030	mnt->mnt.mnt_flags \|= MNT_LOCK_NODEV;
				1031
				1032	if (mnt->mnt.mnt_flags & MNT_NOSUID)
				1033	mnt->mnt.mnt_flags \|= MNT_LOCK_NOSUID;
				1034
				1035	if (mnt->mnt.mnt_flags & MNT_NOEXEC)
				1036	mnt->mnt.mnt_flags \|= MNT_LOCK_NOEXEC;
				1037	}
				1038
				1039	/* Don't allow unprivileged users to reveal what is under a mount */
				1040	if ((flag & CL_UNPRIVILEGED) &&
				1041	(!(flag & CL_EXPIRE) \|\| list_empty(&old->mnt_expire)))
				1042	mnt->mnt.mnt_flags \|= MNT_LOCKED;
				1043
				1044	atomic_inc(&sb->s_active);
				1045	mnt->mnt.mnt_sb = sb;
				1046	mnt->mnt.mnt_root = dget(root);
				1047	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
				1048	mnt->mnt_parent = mnt;
				1049	lock_mount_hash();
				1050	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
				1051	unlock_mount_hash();
				1052
				1053	if ((flag & CL_SLAVE) \|\|
				1054	((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
				1055	list_add(&mnt->mnt_slave, &old->mnt_slave_list);
				1056	mnt->mnt_master = old;
				1057	CLEAR_MNT_SHARED(mnt);
				1058	} else if (!(flag & CL_PRIVATE)) {
				1059	if ((flag & CL_MAKE_SHARED) \|\| IS_MNT_SHARED(old))
				1060	list_add(&mnt->mnt_share, &old->mnt_share);
				1061	if (IS_MNT_SLAVE(old))
				1062	list_add(&mnt->mnt_slave, &old->mnt_slave);
				1063	mnt->mnt_master = old->mnt_master;
				1064	}
				1065	if (flag & CL_MAKE_SHARED)
				1066	set_mnt_shared(mnt);
				1067
				1068	/* stick the duplicate mount on the same expiry list
				1069	* as the original if that was on one */
				1070	if (flag & CL_EXPIRE) {
				1071	if (!list_empty(&old->mnt_expire))
				1072	list_add(&mnt->mnt_expire, &old->mnt_expire);
				1073	}
				1074
				1075	return mnt;
				1076
				1077	out_free:
				1078	mnt_free_id(mnt);
				1079	free_vfsmnt(mnt);
				1080	return ERR_PTR(err);
				1081	}
				1082
				/*
				 * cleanup_mnt - final teardown of a mount.
				 *
				 * Warns if writers are still accounted, kills any pins, notifies
				 * fsnotify, drops the root dentry and the superblock reference,
				 * releases the mount id and finally queues the structure to be freed
				 * by delayed_free_vfsmnt() via call_rcu().
				 */
				1083	static void cleanup_mnt(struct mount *mnt)
				1084	{
				1085	/*
				1086	* This probably indicates that somebody messed
				1087	* up a mnt_want/drop_write() pair. If this
				1088	* happens, the filesystem was probably unable
				1089	* to make r/w->r/o transitions.
				1090	*/
				1091	/*
				1092	* The locking used to deal with mnt_count decrement provides barriers,
				1093	* so mnt_get_writers() below is safe.
				1094	*/
				1095	WARN_ON(mnt_get_writers(mnt));
				/* Only call mnt_pin_kill() when the pin list is non-empty. */
				1096	if (unlikely(mnt->mnt_pins.first))
				1097	mnt_pin_kill(mnt);
				1098	fsnotify_vfsmount_delete(&mnt->mnt);
				1099	dput(mnt->mnt.mnt_root);
				1100	deactivate_super(mnt->mnt.mnt_sb);
				1101	mnt_free_id(mnt);
				/* The structure itself is freed from the RCU callback. */
				1102	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
				1103	}
				1104
				/*
				 * Callback adapter: recover the mount that embeds @head as its mnt_rcu
				 * member and run cleanup_mnt() on it.  mntput_no_expire() registers this
				 * function with init_task_work().
				 */
				1105	static void __cleanup_mnt(struct rcu_head *head)
				1106	{
				1107	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
				1108	}
				1109
				1110	static LLIST_HEAD(delayed_mntput_list);
				1111	static void delayed_mntput(struct work_struct *unused)
				1112	{
				1113	struct llist_node *node = llist_del_all(&delayed_mntput_list);
				1114	struct llist_node *next;
				1115
				1116	for (; node; node = next) {
				1117	next = llist_next(node);
				1118	cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
				1119	}
				1120	}
				1121	static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
				1122
				1123	static void mntput_no_expire(struct mount *mnt)
				1124	{
				1125	rcu_read_lock();
				1126	mnt_add_count(mnt, -1);
				1127	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
				1128	rcu_read_unlock();
				1129	return;
				1130	}
				1131	lock_mount_hash();
				1132	if (mnt_get_count(mnt)) {
				1133	rcu_read_unlock();
				1134	unlock_mount_hash();
				1135	return;
				1136	}
				1137	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
				1138	rcu_read_unlock();
				1139	unlock_mount_hash();
				1140	return;
				1141	}
				1142	mnt->mnt.mnt_flags \|= MNT_DOOMED;
				1143	rcu_read_unlock();
				1144
				1145	list_del(&mnt->mnt_instance);
				1146
				1147	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
				1148	struct mount p, tmp;
				1149	list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
				1150	umount_mnt(p);
				1151	}
				1152	}
				1153	unlock_mount_hash();
				1154
				1155	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
				1156	struct task_struct *task = current;
				1157	if (likely(!(task->flags & PF_KTHREAD))) {
				1158	init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
				1159	if (!task_work_add(task, &mnt->mnt_rcu, true))
				1160	return;
				1161	}
				1162	if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
				1163	schedule_delayed_work(&delayed_mntput_work, 1);
				1164	return;
				1165	}
				1166	cleanup_mnt(mnt);
				1167	}
				1168
				1169	void mntput(struct vfsmount *mnt)
				1170	{
				1171	if (mnt) {
				1172	struct mount *m = real_mount(mnt);
				1173	/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
				1174	if (unlikely(m->mnt_expiry_mark))
				1175	m->mnt_expiry_mark = 0;
				1176	mntput_no_expire(m);
				1177	}
				1178	}
				1179	EXPORT_SYMBOL(mntput);
				1180
				/*
				 * mntget - take a reference to a mount
				 * @mnt: mount to reference; may be NULL
				 *
				 * Increments the count of a non-NULL mount and returns @mnt unchanged,
				 * so the call can be used inline in an expression.
				 */
				struct vfsmount *mntget(struct vfsmount *mnt)
				{
					if (mnt)
						mnt_add_count(real_mount(mnt), 1);
					return mnt;
				}
				EXPORT_SYMBOL(mntget);
				1188
				1189	struct vfsmount mnt_clone_internal(struct path path)
				1190	{
				1191	struct mount *p;
				1192	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
				1193	if (IS_ERR(p))
				1194	return ERR_CAST(p);
				1195	p->mnt.mnt_flags \|= MNT_INTERNAL;
				1196	return &p->mnt;
				1197	}
				1198
				/*
				 * Emit @s into @m through seq_escape(), escaping space, tab, newline and
				 * backslash.
				 */
				static inline void mangle(struct seq_file *m, const char *s)
				{
					seq_escape(m, s, " \t\n\\");
				}
				1203
				1204	/*
				1205	* Simple .show_options callback for filesystems which don't want to
				1206	* implement more complex mount option showing.
				1207	*
				1208	* See also save_mount_options().
				1209	*/
				1210	int generic_show_options(struct seq_file m, struct dentry root)
				1211	{
				1212	const char *options;
				1213
				1214	rcu_read_lock();
				1215	options = rcu_dereference(root->d_sb->s_options);
				1216
				1217	if (options != NULL && options[0]) {
				1218	seq_putc(m, ',');
				1219	mangle(m, options);
				1220	}
				1221	rcu_read_unlock();
				1222
				1223	return 0;
				1224	}
				1225	EXPORT_SYMBOL(generic_show_options);
				1226
				1227	/*
				1228	* If filesystem uses generic_show_options(), this function should be
				1229	* called from the fill_super() callback.
				1230	*
				1231	* The .remount_fs callback usually needs to be handled in a special
				1232	* way, to make sure, that previous options are not overwritten if the
				1233	* remount fails.
				1234	*
				1235	* Also note, that if the filesystem's .remount_fs function doesn't
				1236	* reset all options to their default value, but changes only newly
				1237	* given options, then the displayed options will not reflect reality
				1238	* any more.
				1239	*/
				1240	void save_mount_options(struct super_block sb, char options)
				1241	{
				1242	BUG_ON(sb->s_options);
				1243	rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
				1244	}
				1245	EXPORT_SYMBOL(save_mount_options);
				1246
				1247	void replace_mount_options(struct super_block sb, char options)
				1248	{
				1249	char *old = sb->s_options;
				1250	rcu_assign_pointer(sb->s_options, options);
				1251	if (old) {
				1252	synchronize_rcu();
				1253	kfree(old);
				1254	}
				1255	}
				1256	EXPORT_SYMBOL(replace_mount_options);
				1257
				#ifdef CONFIG_PROC_FS
				/* iterator; we want it to have access to namespace_sem, thus here... */

				/*
				 * seq_file .start: take namespace_sem for reading and return the list
				 * position for *pos.  When the namespace's event counter has not changed
				 * since the position was cached, a request for the cached index or the
				 * one after it is answered from the cache instead of rescanning the list.
				 */
				static void *m_start(struct seq_file *m, loff_t *pos)
				{
					struct proc_mounts *p = m->private;

					down_read(&namespace_sem);
					if (p->cached_event == p->ns->event) {
						void *v = p->cached_mount;
						if (*pos == p->cached_index)
							return v;
						if (*pos == p->cached_index + 1) {
							v = seq_list_next(v, &p->ns->list, &p->cached_index);
							return p->cached_mount = v;
						}
					}

					/* Cache miss: look the position up and remember it. */
					p->cached_event = p->ns->event;
					p->cached_mount = seq_list_start(&p->ns->list, *pos);
					p->cached_index = *pos;
					return p->cached_mount;
				}

				/* seq_file .next: advance to the following mount and update the cache. */
				static void *m_next(struct seq_file *m, void *v, loff_t *pos)
				{
					struct proc_mounts *p = m->private;

					p->cached_mount = seq_list_next(v, &p->ns->list, pos);
					p->cached_index = *pos;
					return p->cached_mount;
				}

				/* seq_file .stop: release the namespace_sem taken by m_start(). */
				static void m_stop(struct seq_file *m, void *v)
				{
					up_read(&namespace_sem);
				}

				/* seq_file .show: hand the mount behind list node @v to p->show(). */
				static int m_show(struct seq_file *m, void *v)
				{
					struct proc_mounts *p = m->private;
					struct mount *r = list_entry(v, struct mount, mnt_list);
					return p->show(m, &r->mnt);
				}

				const struct seq_operations mounts_op = {
					.start	= m_start,
					.next	= m_next,
					.stop	= m_stop,
					.show	= m_show,
				};
				#endif  /* CONFIG_PROC_FS */
				1309
				/**
				 * may_umount_tree - check if a mount tree is busy
				 * @m: root of mount tree
				 *
				 * Sums the reference counts of every mount in the tree and compares the
				 * total against a baseline of two references per mount.  Returns 0 when
				 * the tree holds more references than that baseline (busy), 1 otherwise.
				 */
				int may_umount_tree(struct vfsmount *m)
				{
					struct mount *root = real_mount(m);
					struct mount *cur;
					int refs_seen = 0;
					int refs_floor = 0;

					BUG_ON(!m);

					/* write lock needed for mnt_get_count */
					lock_mount_hash();
					for (cur = root; cur; cur = next_mnt(cur, root)) {
						refs_seen += mnt_get_count(cur);
						refs_floor += 2;
					}
					unlock_mount_hash();

					return refs_seen > refs_floor ? 0 : 1;
				}

				EXPORT_SYMBOL(may_umount_tree);
				1341
				1342	/**
				1343	* may_umount - check if a mount point is busy
				1344	* @mnt: root of mount
				1345	*
				1346	* This is called to check if a mount point has any
				1347	* open files, pwds, chroots or sub mounts. If the
				1348	* mount has sub mounts this will return busy
				1349	* regardless of whether the sub mounts are busy.
				1350	*
				1351	* Doesn't take quota and stuff into account. IOW, in some cases it will
				1352	* give false negatives. The main reason why it's here is that we need
				1353	* a non-destructive way to look for easily umountable filesystems.
				1354	*/
				1355	int may_umount(struct vfsmount *mnt)
				1356	{
				1357	int ret = 1;
				1358	down_read(&namespace_sem);
				1359	lock_mount_hash();
				1360	if (propagate_mount_busy(real_mount(mnt), 2))
				1361	ret = 0;
				1362	unlock_mount_hash();
				1363	up_read(&namespace_sem);
				1364	return ret;
				1365	}
				1366
				1367	EXPORT_SYMBOL(may_umount);
				1368
				/* List onto which umount paths in this file queue detached mounts. */
				1369	static HLIST_HEAD(unmounted); /* protected by namespace_sem */
				1370
				/*
				 * namespace_unlock - release namespace_sem (write side) and dispose of
				 * whatever was queued on 'unmounted' while it was held.
				 *
				 * The queued entries are moved to a private list before the semaphore
				 * is dropped.  If there are any, the function waits for an RCU grace
				 * period and then hands the list to group_pin_kill().
				 */
				1371	static void namespace_unlock(void)
				1372	{
				1373	struct hlist_head head;
				1374
				/* Steal the list while namespace_sem still protects it. */
				1375	hlist_move_list(&unmounted, &head);
				1376
				1377	up_write(&namespace_sem);
				1378
				/* Common case: nothing was unmounted, nothing to wait for. */
				1379	if (likely(hlist_empty(&head)))
				1380	return;
				1381
				1382	synchronize_rcu();
				1383
				1384	group_pin_kill(&head);
				1385	}
				1386
				/* Take namespace_sem for writing; paired with namespace_unlock(). */
				1387	static inline void namespace_lock(void)
				1388	{
				1389	down_write(&namespace_sem);
				1390	}
				1391
				/* Bit flags selecting how umount_tree() tears a mount tree down. */
				1392	enum umount_tree_flags {
				/* Mark each mount MNT_SYNC_UMOUNT and always disconnect it. */
				1393	UMOUNT_SYNC = 1,
				/* Also run propagate_mount_unlock() and propagate_umount(). */
				1394	UMOUNT_PROPAGATE = 2,
				/* Ask that a mount stay connected to an already-unmounted parent. */
				1395	UMOUNT_CONNECTED = 4,
				1396	};
				1397
				1398	static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
				1399	{
				1400	/* Leaving mounts connected is only valid for lazy umounts */
				1401	if (how & UMOUNT_SYNC)
				1402	return true;
				1403
				1404	/* A mount without a parent has nothing to be connected to */
				1405	if (!mnt_has_parent(mnt))
				1406	return true;
				1407
				1408	/* Because the reference counting rules change when mounts are
				1409	* unmounted and connected, umounted mounts may not be
				1410	* connected to mounted mounts.
				1411	*/
				1412	if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
				1413	return true;
				1414
				1415	/* Has it been requested that the mount remain connected? */
				1416	if (how & UMOUNT_CONNECTED)
				1417	return false;
				1418
				1419	/* Is the mount locked such that it needs to remain connected? */
				1420	if (IS_MNT_LOCKED(mnt))
				1421	return false;
				1422
				1423	/* By default disconnect the mount */
				1424	return true;
				1425	}
				1426
				1427	/*
				1428	* mount_lock must be held
				1429	* namespace_sem must be held for write
				1430	*/
				1431	static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
				1432	{
				1433	LIST_HEAD(tmp_list);
				1434	struct mount *p;
				1435
				1436	if (how & UMOUNT_PROPAGATE)
				1437	propagate_mount_unlock(mnt);
				1438
				1439	/* Gather the mounts to umount */
				1440	for (p = mnt; p; p = next_mnt(p, mnt)) {
				1441	p->mnt.mnt_flags \|= MNT_UMOUNT;
				1442	list_move(&p->mnt_list, &tmp_list);
				1443	}
				1444
				1445	/* Hide the mounts from mnt_mounts */
				1446	list_for_each_entry(p, &tmp_list, mnt_list) {
				1447	list_del_init(&p->mnt_child);
				1448	}
				1449
				1450	/* Add propogated mounts to the tmp_list */
				1451	if (how & UMOUNT_PROPAGATE)
				1452	propagate_umount(&tmp_list);
				1453
				1454	while (!list_empty(&tmp_list)) {
				1455	struct mnt_namespace *ns;
				1456	bool disconnect;
				1457	p = list_first_entry(&tmp_list, struct mount, mnt_list);
				1458	list_del_init(&p->mnt_expire);
				1459	list_del_init(&p->mnt_list);
				1460	ns = p->mnt_ns;
				1461	if (ns) {
				1462	ns->mounts--;
				1463	__touch_mnt_namespace(ns);
				1464	}
				1465	p->mnt_ns = NULL;
				1466	if (how & UMOUNT_SYNC)
				1467	p->mnt.mnt_flags \|= MNT_SYNC_UMOUNT;
				1468
				1469	disconnect = disconnect_mount(p, how);
				1470
				1471	pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
				1472	disconnect ? &unmounted : NULL);
				1473	if (mnt_has_parent(p)) {
				1474	mnt_add_count(p->mnt_parent, -1);
				1475	if (!disconnect) {
				1476	/* Don't forget about p */
				1477	list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
				1478	} else {
				1479	umount_mnt(p);
				1480	}
				1481	}
				1482	change_mnt_propagation(p, MS_PRIVATE);
				1483	}
				1484	}
				1485
				1486	static void shrink_submounts(struct mount *mnt);
				1487
				1488	static int do_umount(struct mount *mnt, int flags)
				1489	{
				1490	struct super_block *sb = mnt->mnt.mnt_sb;
				1491	int retval;
				1492
				1493	retval = security_sb_umount(&mnt->mnt, flags);
				1494	if (retval)
				1495	return retval;
				1496
				1497	/*
				1498	* Allow userspace to request a mountpoint be expired rather than
				1499	* unmounting unconditionally. Unmount only happens if:
				1500	* (1) the mark is already set (the mark is cleared by mntput())
				1501	* (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
				1502	*/
				1503	if (flags & MNT_EXPIRE) {
				1504	if (&mnt->mnt == current->fs->root.mnt \|\|
				1505	flags & (MNT_FORCE \| MNT_DETACH))
				1506	return -EINVAL;
				1507
				1508	/*
				1509	* probably don't strictly need the lock here if we examined
				1510	* all race cases, but it's a slowpath.
				1511	*/
				1512	lock_mount_hash();
				1513	if (mnt_get_count(mnt) != 2) {
				1514	unlock_mount_hash();
				1515	return -EBUSY;
				1516	}
				1517	unlock_mount_hash();
				1518
				1519	if (!xchg(&mnt->mnt_expiry_mark, 1))
				1520	return -EAGAIN;
				1521	}
				1522
				1523	/*
				1524	* If we may have to abort operations to get out of this
				1525	* mount, and they will themselves hold resources we must
				1526	* allow the fs to do things. In the Unix tradition of
				1527	* 'Gee thats tricky lets do it in userspace' the umount_begin
				1528	* might fail to complete on the first run through as other tasks
				1529	* must return, and the like. Thats for the mount program to worry
				1530	* about for the moment.
				1531	*/
				1532
				1533	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
				1534	sb->s_op->umount_begin(sb);
				1535	}
				1536
				1537	/*
				1538	* No sense to grab the lock for this test, but test itself looks
				1539	* somewhat bogus. Suggestions for better replacement?
				1540	* Ho-hum... In principle, we might treat that as umount + switch
				1541	* to rootfs. GC would eventually take care of the old vfsmount.
				1542	* Actually it makes sense, especially if rootfs would contain a
				1543	* /reboot - static binary that would close all descriptors and
				1544	* call reboot(9). Then init(8) could umount root and exec /reboot.
				1545	*/
				1546	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
				1547	/*
				1548	* Special case for "unmounting" root ...
				1549	* we just try to remount it readonly.
				1550	*/
				1551	if (!capable(CAP_SYS_ADMIN))
				1552	return -EPERM;
				1553	down_write(&sb->s_umount);
				1554	if (!(sb->s_flags & MS_RDONLY))
				1555	retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
				1556	up_write(&sb->s_umount);
				1557	return retval;
				1558	}
				1559
				1560	namespace_lock();
				1561	lock_mount_hash();
				1562	event++;
				1563
				1564	if (flags & MNT_DETACH) {
				1565	if (!list_empty(&mnt->mnt_list))
				1566	umount_tree(mnt, UMOUNT_PROPAGATE);
				1567	retval = 0;
				1568	} else {
				1569	shrink_submounts(mnt);
				1570	retval = -EBUSY;
				1571	if (!propagate_mount_busy(mnt, 2)) {
				1572	if (!list_empty(&mnt->mnt_list))
				1573	umount_tree(mnt, UMOUNT_PROPAGATE\|UMOUNT_SYNC);
				1574	retval = 0;
				1575	}
				1576	}
				1577	unlock_mount_hash();
				1578	namespace_unlock();
				1579	return retval;
				1580	}
				1581
				1582	/*
				1583	* __detach_mounts - lazily unmount all mounts on the specified dentry
				1584	*
				1585	* During unlink, rmdir, and d_drop it is possible to lose the path
				1586	* to an existing mountpoint, and wind up leaking the mount.
				1587	* detach_mounts allows lazily unmounting those mounts instead of
				1588	* leaking them.
				1589	*
				1590	* The caller may hold dentry->d_inode->i_mutex.
				1591	*/
				1592	void __detach_mounts(struct dentry *dentry)
				1593	{
				1594	struct mountpoint *mp;
				1595	struct mount *mnt;
				1596
				1597	namespace_lock();
				1598	lock_mount_hash();
				1599	mp = lookup_mountpoint(dentry);
				/* Nothing to do when the lookup yields no mountpoint or an error. */
				1600	if (IS_ERR_OR_NULL(mp))
				1601	goto out_unlock;
				1602
				1603	event++;
				/* Each iteration is expected to remove the head entry from m_list. */
				1604	while (!hlist_empty(&mp->m_list)) {
				1605	mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
				/*
				 * A mount already marked MNT_UMOUNT is queued on 'unmounted' and
				 * detached directly; any other mount goes through umount_tree()
				 * with UMOUNT_CONNECTED.
				 */
				1606	if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
				1607	hlist_add_head(&mnt->mnt_umount.s_list, &unmounted);
				1608	umount_mnt(mnt);
				1609	}
				1610	else umount_tree(mnt, UMOUNT_CONNECTED);
				1611	}
				/* Drop the mountpoint obtained from lookup_mountpoint() above. */
				1612	put_mountpoint(mp);
				1613	out_unlock:
				1614	unlock_mount_hash();
				1615	namespace_unlock();
				1616	}
				1617
				1618	/*
				1619	* Is the caller allowed to modify his namespace?
				1620	*
				1620	* True when the current task has CAP_SYS_ADMIN in the user namespace
				1620	* recorded in its mount namespace (ns_capable() check).
				1620	*/
				1621	static inline bool may_mount(void)
				1622	{
				1623	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
				1624	}
				1625
				1626	/*
				1627	* Now umount can handle mount points as well as block devices.
				1628	* This is important for filesystems which use unnamed block devices.
				1629	*
				1630	* We now support a flag for forced unmount like the other 'big iron'
				1631	* unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
				1632	*/
				1633
				1634	SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
				1635	{
				1636	struct path path;
				1637	struct mount *mnt;
				1638	int retval;
				1639	int lookup_flags = 0;
				1640
				1641	if (flags & ~(MNT_FORCE \| MNT_DETACH \| MNT_EXPIRE \| UMOUNT_NOFOLLOW))
				1642	return -EINVAL;
				1643
				1644	if (!may_mount())
				1645	return -EPERM;
				1646
				1647	if (!(flags & UMOUNT_NOFOLLOW))
				1648	lookup_flags \|= LOOKUP_FOLLOW;
				1649
				1650	retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
				1651	if (retval)
				1652	goto out;
				1653	mnt = real_mount(path.mnt);
				1654	retval = -EINVAL;
				1655	if (path.dentry != path.mnt->mnt_root)
				1656	goto dput_and_out;
				1657	if (!check_mnt(mnt))
				1658	goto dput_and_out;
				1659	if (mnt->mnt.mnt_flags & MNT_LOCKED)
				1660	goto dput_and_out;
				1661	retval = -EPERM;
				1662	if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
				1663	goto dput_and_out;
				1664
				1665	retval = do_umount(mnt, flags);
				1666	dput_and_out:
				1667	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
				1668	dput(path.dentry);
				1669	mntput_no_expire(mnt);
				1670	out:
				1671	return retval;
				1672	}
				1673
				1674	#ifdef __ARCH_WANT_SYS_OLDUMOUNT
				1675
				1676	/*
				1677	* The 2.0 compatible umount. No flags.
				1677	*
				1677	* Simply forwards to sys_umount() with a flags value of 0.
				1678	*/
				1679	SYSCALL_DEFINE1(oldumount, char __user *, name)
				1680	{
				1681	return sys_umount(name, 0);
				1682	}
				1683
				1684	#endif
				1685
				1686	static bool is_mnt_ns_file(struct dentry *dentry)
				1687	{
				1688	/* Is this a proxy for a mount namespace? */
				1689	return dentry->d_op == &ns_dentry_operations &&
				1690	dentry->d_fsdata == &mntns_operations;
				1691	}
				1692
				1693	struct mnt_namespace to_mnt_ns(struct ns_common ns)
				1694	{
				1695	return container_of(ns, struct mnt_namespace, ns);
				1696	}
				1697
				1698	static bool mnt_ns_loop(struct dentry *dentry)
				1699	{
				1700	/* Could bind mounting the mount namespace inode cause a
				1701	* mount namespace loop?
				1702	*/
				1703	struct mnt_namespace *mnt_ns;
				1704	if (!is_mnt_ns_file(dentry))
				1705	return false;
				1706
				1707	mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
				1708	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
				1709	}
				1710
				1711	struct mount copy_tree(struct mount mnt, struct dentry *dentry,
				1712	int flag)
				1713	{
				1714	struct mount res, p, q, r, *parent;
				1715
				1716	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
				1717	return ERR_PTR(-EINVAL);
				1718
				1719	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
				1720	return ERR_PTR(-EINVAL);
				1721
				1722	res = q = clone_mnt(mnt, dentry, flag);
				1723	if (IS_ERR(q))
				1724	return q;
				1725
				1726	q->mnt_mountpoint = mnt->mnt_mountpoint;
				1727
				1728	p = mnt;
				1729	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
				1730	struct mount *s;
				1731	if (!is_subdir(r->mnt_mountpoint, dentry))
				1732	continue;
				1733
				1734	for (s = r; s; s = next_mnt(s, r)) {
				1735	if (!(flag & CL_COPY_UNBINDABLE) &&
				1736	IS_MNT_UNBINDABLE(s)) {
				1737	s = skip_mnt_tree(s);
				1738	continue;
				1739	}
				1740	if (!(flag & CL_COPY_MNT_NS_FILE) &&
				1741	is_mnt_ns_file(s->mnt.mnt_root)) {
				1742	s = skip_mnt_tree(s);
				1743	continue;
				1744	}
				1745	while (p != s->mnt_parent) {
				1746	p = p->mnt_parent;
				1747	q = q->mnt_parent;
				1748	}
				1749	p = s;
				1750	parent = q;
				1751	q = clone_mnt(p, p->mnt.mnt_root, flag);
				1752	if (IS_ERR(q))
				1753	goto out;
				1754	lock_mount_hash();
				1755	list_add_tail(&q->mnt_list, &res->mnt_list);
				1756	attach_mnt(q, parent, p->mnt_mp);
				1757	unlock_mount_hash();
				1758	}
				1759	}
				1760	return res;
				1761	out:
				1762	if (res) {
				1763	lock_mount_hash();
				1764	umount_tree(res, UMOUNT_SYNC);
				1765	unlock_mount_hash();
				1766	}
				1767	return q;
				1768	}
				1769
				1770	/* Caller should check returned pointer for errors */
				1771
				1772	struct vfsmount collect_mounts(struct path path)
				1773	{
				1774	struct mount *tree;
				1775	namespace_lock();
				1776	if (!check_mnt(real_mount(path->mnt)))
				1777	tree = ERR_PTR(-EINVAL);
				1778	else
				1779	tree = copy_tree(real_mount(path->mnt), path->dentry,
				1780	CL_COPY_ALL \| CL_PRIVATE);
				1781	namespace_unlock();
				1782	if (IS_ERR(tree))
				1783	return ERR_CAST(tree);
				1784	return &tree->mnt;
				1785	}
				1786
				/*
				 * drop_collected_mounts - synchronously unmount the tree rooted at @mnt.
				 *
				 * Takes namespace_lock() and the mount hash lock around a
				 * umount_tree(..., UMOUNT_SYNC) call.  Presumably the counterpart of
				 * collect_mounts() - confirm against callers.
				 */
				1787	void drop_collected_mounts(struct vfsmount *mnt)
				1788	{
				1789	namespace_lock();
				1790	lock_mount_hash();
				1791	umount_tree(real_mount(mnt), UMOUNT_SYNC);
				1792	unlock_mount_hash();
				1793	namespace_unlock();
				1794	}
				1795
				1796	/**
				1797	* clone_private_mount - create a private clone of a path
				1798	*
				1799	* This creates a new vfsmount, which will be the clone of @path. The new will
				1800	* not be attached anywhere in the namespace and will be private (i.e. changes
				1801	* to the originating mount won't be propagated into this).
				1802	*
				1803	* Release with mntput().
				1804	*/
				1805	struct vfsmount clone_private_mount(struct path path)
				1806	{
				1807	struct mount *old_mnt = real_mount(path->mnt);
				1808	struct mount *new_mnt;
				1809
				1810	if (IS_MNT_UNBINDABLE(old_mnt))
				1811	return ERR_PTR(-EINVAL);
				1812
				1813	down_read(&namespace_sem);
				1814	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
				1815	up_read(&namespace_sem);
				1816	if (IS_ERR(new_mnt))
				1817	return ERR_CAST(new_mnt);
				1818
				1819	return &new_mnt->mnt;
				1820	}
				1821	EXPORT_SYMBOL_GPL(clone_private_mount);
				1822
				1823	int iterate_mounts(int (f)(struct vfsmount , void ), void arg,
				1824	struct vfsmount *root)
				1825	{
				1826	struct mount *mnt;
				1827	int res = f(root, arg);
				1828	if (res)
				1829	return res;
				1830	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
				1831	res = f(&mnt->mnt, arg);
				1832	if (res)
				1833	return res;
				1834	}
				1835	return 0;
				1836	}
				1837
				1838	static void cleanup_group_ids(struct mount mnt, struct mount end)
				1839	{
				1840	struct mount *p;
				1841
				1842	for (p = mnt; p != end; p = next_mnt(p, mnt)) {
				1843	if (p->mnt_group_id && !IS_MNT_SHARED(p))
				1844	mnt_release_group_id(p);
				1845	}
				1846	}
				1847
				1848	static int invent_group_ids(struct mount *mnt, bool recurse)
				1849	{
				1850	struct mount *p;
				1851
				1852	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
				1853	if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
				1854	int err = mnt_alloc_group_id(p);
				1855	if (err) {
				1856	cleanup_group_ids(mnt, p);
				1857	return err;
				1858	}
				1859	}
				1860	}
				1861
				1862	return 0;
				1863	}
				1864
				1865	int count_mounts(struct mnt_namespace ns, struct mount mnt)
				1866	{
				1867	unsigned int max = READ_ONCE(sysctl_mount_max);
				1868	unsigned int mounts = 0, old, pending, sum;
				1869	struct mount *p;
				1870
				1871	for (p = mnt; p; p = next_mnt(p, mnt))
				1872	mounts++;
				1873
				1874	old = ns->mounts;
				1875	pending = ns->pending_mounts;
				1876	sum = old + pending;
				1877	if ((old > sum) \|\|
				1878	(pending > sum) \|\|
				1879	(max < sum) \|\|
				1880	(mounts > (max - sum)))
				1881	return -ENOSPC;
				1882
				1883	ns->pending_mounts = pending + mounts;
				1884	return 0;
				1885	}
				1886
				1887	/*
				1888	* @source_mnt : mount tree to be attached
				1889	* @nd : place the mount tree @source_mnt is attached
				1890	* @parent_nd : if non-null, detach the source_mnt from its parent and
				1891	* store the parent mount and mountpoint dentry.
				1892	* (done when source_mnt is moved)
				1893	*
				1894	* NOTE: in the table below explains the semantics when a source mount
				1895	* of a given type is attached to a destination mount of a given type.
				1896	* ---------------------------------------------------------------------------
				1897	* \| BIND MOUNT OPERATION \|
				1898	* \|**************************************************************************
				1899	* \| source-->\| shared \| private \| slave \| unbindable \|
				1900	* \| dest \| \| \| \| \|
				1901	* \| \| \| \| \| \| \|
				1902	* \| v \| \| \| \| \|
				1903	* \|**************************************************************************
				1904	* \| shared \| shared (++) \| shared (+) \| shared(+++)\| invalid \|
				1905	* \| \| \| \| \| \|
				1906	* \|non-shared\| shared (+) \| private \| slave (*) \| invalid \|
				1907	* ***************************************************************************
				1908	* A bind operation clones the source mount and mounts the clone on the
				1909	* destination mount.
				1910	*
				1911	* (++) the cloned mount is propagated to all the mounts in the propagation
				1912	* tree of the destination mount and the cloned mount is added to
				1913	* the peer group of the source mount.
				1914	* (+) the cloned mount is created under the destination mount and is marked
				1915	* as shared. The cloned mount is added to the peer group of the source
				1916	* mount.
				1917	* (+++) the mount is propagated to all the mounts in the propagation tree
				1918	* of the destination mount and the cloned mount is made slave
				1919	* of the same master as that of the source mount. The cloned mount
				1920	* is marked as 'shared and slave'.
				1921	* (*) the cloned mount is made a slave of the same master as that of the
				1922	* source mount.
				1923	*
				1924	* ---------------------------------------------------------------------------
				1925	* \| MOVE MOUNT OPERATION \|
				1926	* \|**************************************************************************
				1927	* \| source-->\| shared \| private \| slave \| unbindable \|
				1928	* \| dest \| \| \| \| \|
				1929	* \| \| \| \| \| \| \|
				1930	* \| v \| \| \| \| \|
				1931	* \|**************************************************************************
				1932	* \| shared \| shared (+) \| shared (+) \| shared(+++) \| invalid \|
				1933	* \| \| \| \| \| \|
				1934	* \|non-shared\| shared (+) \| private \| slave () \| unbindable \|
				1935	* ***************************************************************************
				1936	*
				1937	* (+) the mount is moved to the destination. And is then propagated to
				1938	* all the mounts in the propagation tree of the destination mount.
				1939	* (+*) the mount is moved to the destination.
				1940	* (+++) the mount is moved to the destination and is then propagated to
				1941	* all the mounts belonging to the destination mount's propagation tree.
				1942	* the mount is marked as 'shared and slave'.
				1943	* (*) the mount continues to be a slave at the new location.
				1944	*
				1945	* if the source mount is a tree, the operations explained above is
				1946	* applied to each mount in the tree.
				1947	* Must be called without spinlocks held, since this function can sleep
				1948	* in allocations.
				1949	*/
				1950	static int attach_recursive_mnt(struct mount *source_mnt,
				1951	struct mount *dest_mnt,
				1952	struct mountpoint *dest_mp,
				1953	struct path *parent_path)
				1954	{
				1955	HLIST_HEAD(tree_list);
				1956	struct mnt_namespace *ns = dest_mnt->mnt_ns;
				1957	struct mountpoint *smp;
				1958	struct mount child, p;
				1959	struct hlist_node *n;
				1960	int err;
				1961
				1962	/* Preallocate a mountpoint in case the new mounts need
				1963	* to be tucked under other mounts.
				1964	*/
				1965	smp = get_mountpoint(source_mnt->mnt.mnt_root);
				1966	if (IS_ERR(smp))
				1967	return PTR_ERR(smp);
				1968
				1969	/* Is there space to add these mounts to the mount namespace? */
				1970	if (!parent_path) {
				1971	err = count_mounts(ns, source_mnt);
				1972	if (err)
				1973	goto out;
				1974	}
				1975
				1976	if (IS_MNT_SHARED(dest_mnt)) {
				1977	err = invent_group_ids(source_mnt, true);
				1978	if (err)
				1979	goto out;
				1980	err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
				1981	lock_mount_hash();
				1982	if (err)
				1983	goto out_cleanup_ids;
				1984	for (p = source_mnt; p; p = next_mnt(p, source_mnt))
				1985	set_mnt_shared(p);
				1986	} else {
				1987	lock_mount_hash();
				1988	}
				1989	if (parent_path) {
				1990	detach_mnt(source_mnt, parent_path);
				1991	attach_mnt(source_mnt, dest_mnt, dest_mp);
				1992	touch_mnt_namespace(source_mnt->mnt_ns);
				1993	} else {
				1994	mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
				1995	commit_tree(source_mnt);
				1996	}
				1997
				1998	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
				1999	struct mount *q;
				2000	hlist_del_init(&child->mnt_hash);
				2001	q = __lookup_mnt(&child->mnt_parent->mnt,
				2002	child->mnt_mountpoint);
				2003	if (q)
				2004	mnt_change_mountpoint(child, smp, q);
				2005	commit_tree(child);
				2006	}
				2007	put_mountpoint(smp);
				2008	unlock_mount_hash();
				2009
				2010	return 0;
				2011
				2012	out_cleanup_ids:
				2013	while (!hlist_empty(&tree_list)) {
				2014	child = hlist_entry(tree_list.first, struct mount, mnt_hash);
				2015	child->mnt_parent->mnt_ns->pending_mounts = 0;
				2016	umount_tree(child, UMOUNT_SYNC);
				2017	}
				2018	unlock_mount_hash();
				2019	cleanup_group_ids(source_mnt, NULL);
				2020	out:
				2021	ns->pending_mounts = 0;
				2022
				2023	read_seqlock_excl(&mount_lock);
				2024	put_mountpoint(smp);
				2025	read_sequnlock_excl(&mount_lock);
				2026
				2027	return err;
				2028	}
				2029
				2030	static struct mountpoint lock_mount(struct path path)
				2031	{
				2032	struct vfsmount *mnt;
				2033	struct dentry *dentry = path->dentry;
				2034	retry:
				2035	mutex_lock(&dentry->d_inode->i_mutex);
				2036	if (unlikely(cant_mount(dentry))) {
				2037	mutex_unlock(&dentry->d_inode->i_mutex);
				2038	return ERR_PTR(-ENOENT);
				2039	}
				2040	namespace_lock();
				2041	mnt = lookup_mnt(path);
				2042	if (likely(!mnt)) {
				2043	struct mountpoint *mp = get_mountpoint(dentry);
				2044	if (IS_ERR(mp)) {
				2045	namespace_unlock();
				2046	mutex_unlock(&dentry->d_inode->i_mutex);
				2047	return mp;
				2048	}
				2049	return mp;
				2050	}
				2051	namespace_unlock();
				2052	mutex_unlock(&path->dentry->d_inode->i_mutex);
				2053	path_put(path);
				2054	path->mnt = mnt;
				2055	dentry = path->dentry = dget(mnt->mnt_root);
				2056	goto retry;
				2057	}
				2058
				2059	static void unlock_mount(struct mountpoint *where)
				2060	{
				2061	struct dentry *dentry = where->m_dentry;
				2062
				2063	read_seqlock_excl(&mount_lock);
				2064	put_mountpoint(where);
				2065	read_sequnlock_excl(&mount_lock);
				2066
				2067	namespace_unlock();
				2068	mutex_unlock(&dentry->d_inode->i_mutex);
				2069	}
				2070
				2071	static int graft_tree(struct mount mnt, struct mount p, struct mountpoint *mp)
				2072	{
				2073	if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
				2074	return -EINVAL;
				2075
				2076	if (d_is_dir(mp->m_dentry) !=
				2077	d_is_dir(mnt->mnt.mnt_root))
				2078	return -ENOTDIR;
				2079
				2080	return attach_recursive_mnt(mnt, p, mp, NULL);
				2081	}
				2082
				2083	/*
				2084	* Sanity check the flags to change_mnt_propagation.
				2085	*/
				2086
				2087	static int flags_to_propagation_type(int flags)
				2088	{
				2089	int type = flags & ~(MS_REC \| MS_SILENT);
				2090
				2091	/* Fail if any non-propagation flags are set */
				2092	if (type & ~(MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
				2093	return 0;
				2094	/* Only one propagation flag should be set */
				2095	if (!is_power_of_2(type))
				2096	return 0;
				2097	return type;
				2098	}
				2099
				2100	/*
				2101	* recursively change the type of the mountpoint.
				2102	*/
				2103	static int do_change_type(struct path *path, int flag)
				2104	{
				2105	struct mount *m;
				2106	struct mount *mnt = real_mount(path->mnt);
				2107	int recurse = flag & MS_REC;
				2108	int type;
				2109	int err = 0;
				2110
				2111	if (path->dentry != path->mnt->mnt_root)
				2112	return -EINVAL;
				2113
				2114	type = flags_to_propagation_type(flag);
				2115	if (!type)
				2116	return -EINVAL;
				2117
				2118	namespace_lock();
				2119	if (type == MS_SHARED) {
				2120	err = invent_group_ids(mnt, recurse);
				2121	if (err)
				2122	goto out_unlock;
				2123	}
				2124
				2125	lock_mount_hash();
				2126	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
				2127	change_mnt_propagation(m, type);
				2128	unlock_mount_hash();
				2129
				2130	out_unlock:
				2131	namespace_unlock();
				2132	return err;
				2133	}
				2134
				2135	static bool has_locked_children(struct mount mnt, struct dentry dentry)
				2136	{
				2137	struct mount *child;
				2138	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
				2139	if (!is_subdir(child->mnt_mountpoint, dentry))
				2140	continue;
				2141
				2142	if (child->mnt.mnt_flags & MNT_LOCKED)
				2143	return true;
				2144	}
				2145	return false;
				2146	}
				2147
				2148	/*
				2149	* do loopback mount.
				2150	*/
				2151	static int do_loopback(struct path path, const char old_name,
				2152	int recurse)
				2153	{
				2154	struct path old_path;
				2155	struct mount mnt = NULL, old, *parent;
				2156	struct mountpoint *mp;
				2157	int err;
				2158	if (!old_name \|\| !*old_name)
				2159	return -EINVAL;
				2160	err = kern_path(old_name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &old_path);
				2161	if (err)
				2162	return err;
				2163
				2164	err = -EINVAL;
				2165	if (mnt_ns_loop(old_path.dentry))
				2166	goto out;
				2167
				2168	mp = lock_mount(path);
				2169	err = PTR_ERR(mp);
				2170	if (IS_ERR(mp))
				2171	goto out;
				2172
				2173	old = real_mount(old_path.mnt);
				2174	parent = real_mount(path->mnt);
				2175
				2176	err = -EINVAL;
				2177	if (IS_MNT_UNBINDABLE(old))
				2178	goto out2;
				2179
				2180	if (!check_mnt(parent))
				2181	goto out2;
				2182
				2183	if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
				2184	goto out2;
				2185
				2186	if (!recurse && has_locked_children(old, old_path.dentry))
				2187	goto out2;
				2188
				2189	if (recurse)
				2190	mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
				2191	else
				2192	mnt = clone_mnt(old, old_path.dentry, 0);
				2193
				2194	if (IS_ERR(mnt)) {
				2195	err = PTR_ERR(mnt);
				2196	goto out2;
				2197	}
				2198
				2199	mnt->mnt.mnt_flags &= ~MNT_LOCKED;
				2200
				2201	err = graft_tree(mnt, parent, mp);
				2202	if (err) {
				2203	lock_mount_hash();
				2204	umount_tree(mnt, UMOUNT_SYNC);
				2205	unlock_mount_hash();
				2206	}
				2207	out2:
				2208	unlock_mount(mp);
				2209	out:
				2210	path_put(&old_path);
				2211	return err;
				2212	}
				2213
				2214	static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
				2215	{
				2216	int error = 0;
				2217	int readonly_request = 0;
				2218
				2219	if (ms_flags & MS_RDONLY)
				2220	readonly_request = 1;
				2221	if (readonly_request == __mnt_is_readonly(mnt))
				2222	return 0;
				2223
				2224	if (readonly_request)
				2225	error = mnt_make_readonly(real_mount(mnt));
				2226	else
				2227	__mnt_unmake_readonly(real_mount(mnt));
				2228	return error;
				2229	}
				2230
				2231	/*
				2232	* change filesystem flags. dir should be a physical root of filesystem.
				2233	* If you've mounted a non-root directory somewhere and want to do remount
				2234	* on it - tough luck.
				2235	*/
				2236	static int do_remount(struct path *path, int flags, int mnt_flags,
				2237	void *data)
				2238	{
				2239	int err;
				2240	struct super_block *sb = path->mnt->mnt_sb;
				2241	struct mount *mnt = real_mount(path->mnt);
				2242
				2243	if (!check_mnt(mnt))
				2244	return -EINVAL;
				2245
				2246	if (path->dentry != path->mnt->mnt_root)
				2247	return -EINVAL;
				2248
				2249	/* Don't allow changing of locked mnt flags.
				2250	*
				2251	* No locks need to be held here while testing the various
				2252	* MNT_LOCK flags because those flags can never be cleared
				2253	* once they are set.
				2254	*/
				2255	if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
				2256	!(mnt_flags & MNT_READONLY)) {
				2257	return -EPERM;
				2258	}
				2259	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
				2260	!(mnt_flags & MNT_NODEV)) {
				2261	/* Was the nodev implicitly added in mount? */
				2262	if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
				2263	!(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
				2264	mnt_flags \|= MNT_NODEV;
				2265	} else {
				2266	return -EPERM;
				2267	}
				2268	}
				2269	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
				2270	!(mnt_flags & MNT_NOSUID)) {
				2271	return -EPERM;
				2272	}
				2273	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
				2274	!(mnt_flags & MNT_NOEXEC)) {
				2275	return -EPERM;
				2276	}
				2277	if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
				2278	((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
				2279	return -EPERM;
				2280	}
				2281
				2282	err = security_sb_remount(sb, data);
				2283	if (err)
				2284	return err;
				2285
				2286	down_write(&sb->s_umount);
				2287	if (flags & MS_BIND)
				2288	err = change_mount_flags(path->mnt, flags);
				2289	else if (!capable(CAP_SYS_ADMIN))
				2290	err = -EPERM;
				2291	else
				2292	err = do_remount_sb(sb, flags, data, 0);
				2293	if (!err) {
				2294	lock_mount_hash();
				2295	mnt_flags \|= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
				2296	mnt->mnt.mnt_flags = mnt_flags;
				2297	touch_mnt_namespace(mnt->mnt_ns);
				2298	unlock_mount_hash();
				2299	}
				2300	up_write(&sb->s_umount);
				2301	return err;
				2302	}
				2303
				/*
				 * Return 1 if @mnt or any mount reached from it via next_mnt() is
				 * unbindable, 0 otherwise.
				 */
				static inline int tree_contains_unbindable(struct mount *mnt)
				{
					struct mount *cur = mnt;

					while (cur) {
						if (IS_MNT_UNBINDABLE(cur))
							return 1;
						cur = next_mnt(cur, mnt);
					}
					return 0;
				}
				2313
				2314	static int do_move_mount(struct path path, const char old_name)
				2315	{
				2316	struct path old_path, parent_path;
				2317	struct mount *p;
				2318	struct mount *old;
				2319	struct mountpoint *mp;
				2320	int err;
				2321	if (!old_name \|\| !*old_name)
				2322	return -EINVAL;
				2323	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
				2324	if (err)
				2325	return err;
				2326
				2327	mp = lock_mount(path);
				2328	err = PTR_ERR(mp);
				2329	if (IS_ERR(mp))
				2330	goto out;
				2331
				2332	old = real_mount(old_path.mnt);
				2333	p = real_mount(path->mnt);
				2334
				2335	err = -EINVAL;
				2336	if (!check_mnt(p) \|\| !check_mnt(old))
				2337	goto out1;
				2338
				2339	if (old->mnt.mnt_flags & MNT_LOCKED)
				2340	goto out1;
				2341
				2342	err = -EINVAL;
				2343	if (old_path.dentry != old_path.mnt->mnt_root)
				2344	goto out1;
				2345
				2346	if (!mnt_has_parent(old))
				2347	goto out1;
				2348
				2349	if (d_is_dir(path->dentry) !=
				2350	d_is_dir(old_path.dentry))
				2351	goto out1;
				2352	/*
				2353	* Don't move a mount residing in a shared parent.
				2354	*/
				2355	if (IS_MNT_SHARED(old->mnt_parent))
				2356	goto out1;
				2357	/*
				2358	* Don't move a mount tree containing unbindable mounts to a destination
				2359	* mount which is shared.
				2360	*/
				2361	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
				2362	goto out1;
				2363	err = -ELOOP;
				2364	for (; mnt_has_parent(p); p = p->mnt_parent)
				2365	if (p == old)
				2366	goto out1;
				2367
				2368	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
				2369	if (err)
				2370	goto out1;
				2371
				2372	/* if the mount is moved, it should no longer be expire
				2373	* automatically */
				2374	list_del_init(&old->mnt_expire);
				2375	out1:
				2376	unlock_mount(mp);
				2377	out:
				2378	if (!err)
				2379	path_put(&parent_path);
				2380	path_put(&old_path);
				2381	return err;
				2382	}
				2383
				2384	static struct vfsmount fs_set_subtype(struct vfsmount mnt, const char *fstype)
				2385	{
				2386	int err;
				2387	const char *subtype = strchr(fstype, '.');
				2388	if (subtype) {
				2389	subtype++;
				2390	err = -EINVAL;
				2391	if (!subtype[0])
				2392	goto err;
				2393	} else
				2394	subtype = "";
				2395
				2396	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
				2397	err = -ENOMEM;
				2398	if (!mnt->mnt_sb->s_subtype)
				2399	goto err;
				2400	return mnt;
				2401
				2402	err:
				2403	mntput(mnt);
				2404	return ERR_PTR(err);
				2405	}
				2406
				2407	/*
				2408	* add a mount into a namespace's mount tree
				2409	*/
				2410	static int do_add_mount(struct mount newmnt, struct path path, int mnt_flags)
				2411	{
				2412	struct mountpoint *mp;
				2413	struct mount *parent;
				2414	int err;
				2415
				2416	mnt_flags &= ~MNT_INTERNAL_FLAGS;
				2417
				2418	mp = lock_mount(path);
				2419	if (IS_ERR(mp))
				2420	return PTR_ERR(mp);
				2421
				2422	parent = real_mount(path->mnt);
				2423	err = -EINVAL;
				2424	if (unlikely(!check_mnt(parent))) {
				2425	/* that's acceptable only for automounts done in private ns */
				2426	if (!(mnt_flags & MNT_SHRINKABLE))
				2427	goto unlock;
				2428	/* ... and for those we'd better have mountpoint still alive */
				2429	if (!parent->mnt_ns)
				2430	goto unlock;
				2431	}
				2432
				2433	/* Refuse the same filesystem on the same mount point */
				2434	err = -EBUSY;
				2435	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
				2436	path->mnt->mnt_root == path->dentry)
				2437	goto unlock;
				2438
				2439	err = -EINVAL;
				2440	if (d_is_symlink(newmnt->mnt.mnt_root))
				2441	goto unlock;
				2442
				2443	newmnt->mnt.mnt_flags = mnt_flags;
				2444	err = graft_tree(newmnt, parent, mp);
				2445
				2446	unlock:
				2447	unlock_mount(mp);
				2448	return err;
				2449	}
				2450
				2451	static bool fs_fully_visible(struct file_system_type fs_type, int new_mnt_flags);
				2452
				2453	/*
				2454	* create a new mount for userspace and request it to be added into the
				2455	* namespace's tree
				2456	*/
				2457	static int do_new_mount(struct path path, const char fstype, int flags,
				2458	int mnt_flags, const char name, void data)
				2459	{
				2460	struct file_system_type *type;
				2461	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
				2462	struct vfsmount *mnt;
				2463	int err;
				2464
				2465	if (!fstype)
				2466	return -EINVAL;
				2467
				2468	type = get_fs_type(fstype);
				2469	if (!type)
				2470	return -ENODEV;
				2471
				2472	if (user_ns != &init_user_ns) {
				2473	if (!(type->fs_flags & FS_USERNS_MOUNT)) {
				2474	put_filesystem(type);
				2475	return -EPERM;
				2476	}
				2477	/* Only in special cases allow devices from mounts
				2478	* created outside the initial user namespace.
				2479	*/
				2480	if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
				2481	flags \|= MS_NODEV;
				2482	mnt_flags \|= MNT_NODEV \| MNT_LOCK_NODEV;
				2483	}
				2484	if (type->fs_flags & FS_USERNS_VISIBLE) {
				2485	if (!fs_fully_visible(type, &mnt_flags)) {
				2486	put_filesystem(type);
				2487	return -EPERM;
				2488	}
				2489	}
				2490	}
				2491
				2492	mnt = vfs_kern_mount(type, flags, name, data);
				2493	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
				2494	!mnt->mnt_sb->s_subtype)
				2495	mnt = fs_set_subtype(mnt, fstype);
				2496
				2497	put_filesystem(type);
				2498	if (IS_ERR(mnt))
				2499	return PTR_ERR(mnt);
				2500
				2501	err = do_add_mount(real_mount(mnt), path, mnt_flags);
				2502	if (err)
				2503	mntput(mnt);
				2504	return err;
				2505	}
				2506
				2507	int finish_automount(struct vfsmount m, struct path path)
				2508	{
				2509	struct mount *mnt = real_mount(m);
				2510	int err;
				2511	/* The new mount record should have at least 2 refs to prevent it being
				2512	* expired before we get a chance to add it
				2513	*/
				2514	BUG_ON(mnt_get_count(mnt) < 2);
				2515
				2516	if (m->mnt_sb == path->mnt->mnt_sb &&
				2517	m->mnt_root == path->dentry) {
				2518	err = -ELOOP;
				2519	goto fail;
				2520	}
				2521
				2522	err = do_add_mount(mnt, path, path->mnt->mnt_flags \| MNT_SHRINKABLE);
				2523	if (!err)
				2524	return 0;
				2525	fail:
				2526	/* remove m from any expiration list it may be on */
				2527	if (!list_empty(&mnt->mnt_expire)) {
				2528	namespace_lock();
				2529	list_del_init(&mnt->mnt_expire);
				2530	namespace_unlock();
				2531	}
				2532	mntput(m);
				2533	mntput(m);
				2534	return err;
				2535	}
				2536
				2537	/**
				2538	* mnt_set_expiry - Put a mount on an expiration list
				2539	* @mnt: The mount to list.
				2540	* @expiry_list: The list to add the mount to.
				2541	*/
				2542	void mnt_set_expiry(struct vfsmount mnt, struct list_head expiry_list)
				2543	{
				2544	namespace_lock();
				2545
				2546	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
				2547
				2548	namespace_unlock();
				2549	}
				2550	EXPORT_SYMBOL(mnt_set_expiry);
				2551
				2552	/*
				2553	* process a list of expirable mountpoints with the intent of discarding any
				2554	* mountpoints that aren't in use and haven't been touched since last we came
				2555	* here
				2556	*/
				2557	void mark_mounts_for_expiry(struct list_head *mounts)
				2558	{
				2559	struct mount mnt, next;
				2560	LIST_HEAD(graveyard);
				2561
				2562	if (list_empty(mounts))
				2563	return;
				2564
				2565	namespace_lock();
				2566	lock_mount_hash();
				2567
				2568	/* extract from the expiration list every vfsmount that matches the
				2569	* following criteria:
				2570	* - only referenced by its parent vfsmount
				2571	* - still marked for expiry (marked on the last call here; marks are
				2572	* cleared by mntput())
				2573	*/
				2574	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
				2575	if (!xchg(&mnt->mnt_expiry_mark, 1) \|\|
				2576	propagate_mount_busy(mnt, 1))
				2577	continue;
				2578	list_move(&mnt->mnt_expire, &graveyard);
				2579	}
				2580	while (!list_empty(&graveyard)) {
				2581	mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
				2582	touch_mnt_namespace(mnt->mnt_ns);
				2583	umount_tree(mnt, UMOUNT_PROPAGATE\|UMOUNT_SYNC);
				2584	}
				2585	unlock_mount_hash();
				2586	namespace_unlock();
				2587	}
				2588
				2589	EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
				2590
				2591	/*
				2592	* Ripoff of 'select_parent()'
				2593	*
				2594	* search the list of submounts for a given mountpoint, and move any
				2595	* shrinkable submounts to the 'graveyard' list.
				2596	*/
				2597	static int select_submounts(struct mount parent, struct list_head graveyard)
				2598	{
				2599	struct mount *this_parent = parent;
				2600	struct list_head *next;
				2601	int found = 0;
				2602
				2603	repeat:
				2604	next = this_parent->mnt_mounts.next;
				2605	resume:
				2606	while (next != &this_parent->mnt_mounts) {
				2607	struct list_head *tmp = next;
				2608	struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
				2609
				2610	next = tmp->next;
				2611	if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
				2612	continue;
				2613	/*
				2614	* Descend a level if the d_mounts list is non-empty.
				2615	*/
				2616	if (!list_empty(&mnt->mnt_mounts)) {
				2617	this_parent = mnt;
				2618	goto repeat;
				2619	}
				2620
				2621	if (!propagate_mount_busy(mnt, 1)) {
				2622	list_move_tail(&mnt->mnt_expire, graveyard);
				2623	found++;
				2624	}
				2625	}
				2626	/*
				2627	* All done at this level ... ascend and resume the search
				2628	*/
				2629	if (this_parent != parent) {
				2630	next = this_parent->mnt_child.next;
				2631	this_parent = this_parent->mnt_parent;
				2632	goto resume;
				2633	}
				2634	return found;
				2635	}
				2636
				2637	/*
				2638	* process a list of expirable mountpoints with the intent of discarding any
				2639	* submounts of a specific parent mountpoint
				2640	*
				2641	* mount_lock must be held for write
				2642	*/
				2643	static void shrink_submounts(struct mount *mnt)
				2644	{
				2645	LIST_HEAD(graveyard);
				2646	struct mount *m;
				2647
				2648	/* extract submounts of 'mountpoint' from the expiration list */
				2649	while (select_submounts(mnt, &graveyard)) {
				2650	while (!list_empty(&graveyard)) {
				2651	m = list_first_entry(&graveyard, struct mount,
				2652	mnt_expire);
				2653	touch_mnt_namespace(m->mnt_ns);
				2654	umount_tree(m, UMOUNT_PROPAGATE\|UMOUNT_SYNC);
				2655	}
				2656	}
				2657	}
				2658
				2659	/*
				2660	* Some copy_from_user() implementations do not return the exact number of
				2661	* bytes remaining to copy on a fault. But copy_mount_options() requires that.
				2662	* Note that this function differs from copy_from_user() in that it will oops
				2663	* on bad values of `to', rather than returning a short copy.
				2664	*/
				2665	static long exact_copy_from_user(void to, const void __user from,
				2666	unsigned long n)
				2667	{
				2668	char *t = to;
				2669	const char __user *f = from;
				2670	char c;
				2671
				2672	if (!access_ok(VERIFY_READ, from, n))
				2673	return n;
				2674
				2675	while (n) {
				2676	if (__get_user(c, f)) {
				2677	memset(t, 0, n);
				2678	break;
				2679	}
				2680	*t++ = c;
				2681	f++;
				2682	n--;
				2683	}
				2684	return n;
				2685	}
				2686
				2687	int copy_mount_options(const void __user * data, unsigned long *where)
				2688	{
				2689	int i;
				2690	unsigned long page;
				2691	unsigned long size;
				2692
				2693	*where = 0;
				2694	if (!data)
				2695	return 0;
				2696
				2697	if (!(page = __get_free_page(GFP_KERNEL)))
				2698	return -ENOMEM;
				2699
				2700	/* We only care that some data at the address the user
				2701	* gave us is valid. Just in case, we'll zero
				2702	* the remainder of the page.
				2703	*/
				2704	/* copy_from_user cannot cross TASK_SIZE ! */
				2705	size = TASK_SIZE - (unsigned long)data;
				2706	if (size > PAGE_SIZE)
				2707	size = PAGE_SIZE;
				2708
				2709	i = size - exact_copy_from_user((void *)page, data, size);
				2710	if (!i) {
				2711	free_page(page);
				2712	return -EFAULT;
				2713	}
				2714	if (i != PAGE_SIZE)
				2715	memset((char *)page + i, 0, PAGE_SIZE - i);
				2716	*where = page;
				2717	return 0;
				2718	}
				2719
				2720	char copy_mount_string(const void __user data)
				2721	{
				2722	return data ? strndup_user(data, PAGE_SIZE) : NULL;
				2723	}
				2724
				2725	/*
				2726	* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
				2727	* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
				2728	*
				2729	* data is a (void *) that can point to any structure up to
				2730	* PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
				2731	* information (or be NULL).
				2732	*
				2733	* Pre-0.97 versions of mount() didn't have a flags word.
				2734	* When the flags word was introduced its top half was required
				2735	* to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
				2736	* Therefore, if this magic number is present, it carries no information
				2737	* and must be discarded.
				2738	*/
				2739	long do_mount(const char dev_name, const char __user dir_name,
				2740	const char type_page, unsigned long flags, void data_page)
				2741	{
				2742	struct path path;
				2743	int retval = 0;
				2744	int mnt_flags = 0;
				2745
				2746	/* Discard magic */
				2747	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
				2748	flags &= ~MS_MGC_MSK;
				2749
				2750	/* Basic sanity checks */
				2751	if (data_page)
				2752	((char *)data_page)[PAGE_SIZE - 1] = 0;
				2753
				2754	/* ... and get the mountpoint */
				2755	retval = user_path(dir_name, &path);
				2756	if (retval)
				2757	return retval;
				2758
				2759	retval = security_sb_mount(dev_name, &path,
				2760	type_page, flags, data_page);
				2761	if (!retval && !may_mount())
				2762	retval = -EPERM;
				2763	if (retval)
				2764	goto dput_out;
				2765
				2766	/* Default to relatime unless overriden */
				2767	if (!(flags & MS_NOATIME))
				2768	mnt_flags \|= MNT_RELATIME;
				2769
				2770	/* Separate the per-mountpoint flags */
				2771	if (flags & MS_NOSUID)
				2772	mnt_flags \|= MNT_NOSUID;
				2773	if (flags & MS_NODEV)
				2774	mnt_flags \|= MNT_NODEV;
				2775	if (flags & MS_NOEXEC)
				2776	mnt_flags \|= MNT_NOEXEC;
				2777	if (flags & MS_NOATIME)
				2778	mnt_flags \|= MNT_NOATIME;
				2779	if (flags & MS_NODIRATIME)
				2780	mnt_flags \|= MNT_NODIRATIME;
				2781	if (flags & MS_STRICTATIME)
				2782	mnt_flags &= ~(MNT_RELATIME \| MNT_NOATIME);
				2783	if (flags & MS_RDONLY)
				2784	mnt_flags \|= MNT_READONLY;
				2785
				2786	/* The default atime for remount is preservation */
				2787	if ((flags & MS_REMOUNT) &&
				2788	((flags & (MS_NOATIME \| MS_NODIRATIME \| MS_RELATIME \|
				2789	MS_STRICTATIME)) == 0)) {
				2790	mnt_flags &= ~MNT_ATIME_MASK;
				2791	mnt_flags \|= path.mnt->mnt_flags & MNT_ATIME_MASK;
				2792	}
				2793
				2794	flags &= ~(MS_NOSUID \| MS_NOEXEC \| MS_NODEV \| MS_ACTIVE \| MS_BORN \|
				2795	MS_NOATIME \| MS_NODIRATIME \| MS_RELATIME\| MS_KERNMOUNT \|
				2796	MS_STRICTATIME);
				2797
				2798	if (flags & MS_REMOUNT)
				2799	retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				2800	data_page);
				2801	else if (flags & MS_BIND)
				2802	retval = do_loopback(&path, dev_name, flags & MS_REC);
				2803	else if (flags & (MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
				2804	retval = do_change_type(&path, flags);
				2805	else if (flags & MS_MOVE)
				2806	retval = do_move_mount(&path, dev_name);
				2807	else
				2808	retval = do_new_mount(&path, type_page, flags, mnt_flags,
				2809	dev_name, data_page);
				2810	dput_out:
				2811	path_put(&path);
				2812	return retval;
				2813	}
				2814
				2815	static void free_mnt_ns(struct mnt_namespace *ns)
				2816	{
				2817	ns_free_inum(&ns->ns);
				2818	put_user_ns(ns->user_ns);
				2819	kfree(ns);
				2820	}
				2821
				2822	/*
				2823	* Assign a sequence number so we can detect when we attempt to bind
				2824	* mount a reference to an older mount namespace into the current
				2825	* mount namespace, preventing reference counting loops. A 64bit
				2826	* number incrementing at 10GHz will take 12,427 years to wrap which
				2827	* is effectively never, so we can ignore the possibility.
					*
					* NOTE(review): 2^64 increments at 10^10 per second is about 1.8e9
					* seconds, i.e. roughly 58 years rather than 12,427 - confirm the
					* figure above. Wrap-around presumably remains impractical at
					* realistic namespace creation rates.
					*
					* The counter starts at 1 and alloc_mnt_ns() stores the value
					* returned by atomic64_add_return(1, &mnt_ns_seq), so the first
					* namespace allocated gets sequence number 2.
				2828	*/
				2829	static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
				2830
				2831	static struct mnt_namespace alloc_mnt_ns(struct user_namespace user_ns)
				2832	{
				2833	struct mnt_namespace *new_ns;
				2834	int ret;
				2835
				2836	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
				2837	if (!new_ns)
				2838	return ERR_PTR(-ENOMEM);
				2839	ret = ns_alloc_inum(&new_ns->ns);
				2840	if (ret) {
				2841	kfree(new_ns);
				2842	return ERR_PTR(ret);
				2843	}
				2844	new_ns->ns.ops = &mntns_operations;
				2845	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
				2846	atomic_set(&new_ns->count, 1);
				2847	new_ns->root = NULL;
				2848	INIT_LIST_HEAD(&new_ns->list);
				2849	init_waitqueue_head(&new_ns->poll);
				2850	new_ns->event = 0;
				2851	new_ns->user_ns = get_user_ns(user_ns);
				2852	new_ns->mounts = 0;
				2853	new_ns->pending_mounts = 0;
				2854	return new_ns;
				2855	}
				2856
				2857	struct mnt_namespace copy_mnt_ns(unsigned long flags, struct mnt_namespace ns,
				2858	struct user_namespace user_ns, struct fs_struct new_fs)
				2859	{
				2860	struct mnt_namespace *new_ns;
				2861	struct vfsmount rootmnt = NULL, pwdmnt = NULL;
				2862	struct mount p, q;
				2863	struct mount *old;
				2864	struct mount *new;
				2865	int copy_flags;
				2866
				2867	BUG_ON(!ns);
				2868
				2869	if (likely(!(flags & CLONE_NEWNS))) {
				2870	get_mnt_ns(ns);
				2871	return ns;
				2872	}
				2873
				2874	old = ns->root;
				2875
				2876	new_ns = alloc_mnt_ns(user_ns);
				2877	if (IS_ERR(new_ns))
				2878	return new_ns;
				2879
				2880	namespace_lock();
				2881	/* First pass: copy the tree topology */
				2882	copy_flags = CL_COPY_UNBINDABLE \| CL_EXPIRE;
				2883	if (user_ns != ns->user_ns)
				2884	copy_flags \|= CL_SHARED_TO_SLAVE \| CL_UNPRIVILEGED;
				2885	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
				2886	if (IS_ERR(new)) {
				2887	namespace_unlock();
				2888	free_mnt_ns(new_ns);
				2889	return ERR_CAST(new);
				2890	}
				2891	new_ns->root = new;
				2892	list_add_tail(&new_ns->list, &new->mnt_list);
				2893
				2894	/*
				2895	* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
				2896	* as belonging to new namespace. We have already acquired a private
				2897	* fs_struct, so tsk->fs->lock is not needed.
				2898	*/
				2899	p = old;
				2900	q = new;
				2901	while (p) {
				2902	q->mnt_ns = new_ns;
				2903	new_ns->mounts++;
				2904	if (new_fs) {
				2905	if (&p->mnt == new_fs->root.mnt) {
				2906	new_fs->root.mnt = mntget(&q->mnt);
				2907	rootmnt = &p->mnt;
				2908	}
				2909	if (&p->mnt == new_fs->pwd.mnt) {
				2910	new_fs->pwd.mnt = mntget(&q->mnt);
				2911	pwdmnt = &p->mnt;
				2912	}
				2913	}
				2914	p = next_mnt(p, old);
				2915	q = next_mnt(q, new);
				2916	if (!q)
				2917	break;
				2918	while (p->mnt.mnt_root != q->mnt.mnt_root)
				2919	p = next_mnt(p, old);
				2920	}
				2921	namespace_unlock();
				2922
				2923	if (rootmnt)
				2924	mntput(rootmnt);
				2925	if (pwdmnt)
				2926	mntput(pwdmnt);
				2927
				2928	return new_ns;
				2929	}
				2930
				2931	/**
				2932	* create_mnt_ns - creates a private namespace and adds a root filesystem
				2933	* @mnt: pointer to the new root filesystem mountpoint
				2934	*/
				2935	static struct mnt_namespace create_mnt_ns(struct vfsmount m)
				2936	{
				2937	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
				2938	if (!IS_ERR(new_ns)) {
				2939	struct mount *mnt = real_mount(m);
				2940	mnt->mnt_ns = new_ns;
				2941	new_ns->root = mnt;
				2942	new_ns->mounts++;
				2943	list_add(&mnt->mnt_list, &new_ns->list);
				2944	} else {
				2945	mntput(m);
				2946	}
				2947	return new_ns;
				2948	}
				2949
				2950	struct dentry mount_subtree(struct vfsmount mnt, const char *name)
				2951	{
				2952	struct mnt_namespace *ns;
				2953	struct super_block *s;
				2954	struct path path;
				2955	int err;
				2956
				2957	ns = create_mnt_ns(mnt);
				2958	if (IS_ERR(ns))
				2959	return ERR_CAST(ns);
				2960
				2961	err = vfs_path_lookup(mnt->mnt_root, mnt,
				2962	name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &path);
				2963
				2964	put_mnt_ns(ns);
				2965
				2966	if (err)
				2967	return ERR_PTR(err);
				2968
				2969	/* trade a vfsmount reference for active sb one */
				2970	s = path.mnt->mnt_sb;
				2971	atomic_inc(&s->s_active);
				2972	mntput(path.mnt);
				2973	/* lock the sucker */
				2974	down_write(&s->s_umount);
				2975	/* ... and return the root of (sub)tree on it */
				2976	return path.dentry;
				2977	}
				2978	EXPORT_SYMBOL(mount_subtree);
				2979
				2980	SYSCALL_DEFINE5(mount, char __user , dev_name, char __user , dir_name,
				2981	char __user , type, unsigned long, flags, void __user , data)
				2982	{
				2983	int ret;
				2984	char *kernel_type;
				2985	char *kernel_dev;
				2986	unsigned long data_page;
				2987
				2988	kernel_type = copy_mount_string(type);
				2989	ret = PTR_ERR(kernel_type);
				2990	if (IS_ERR(kernel_type))
				2991	goto out_type;
				2992
				2993	kernel_dev = copy_mount_string(dev_name);
				2994	ret = PTR_ERR(kernel_dev);
				2995	if (IS_ERR(kernel_dev))
				2996	goto out_dev;
				2997
				2998	ret = copy_mount_options(data, &data_page);
				2999	if (ret < 0)
				3000	goto out_data;
				3001
				3002	ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
				3003	(void *) data_page);
				3004
				3005	free_page(data_page);
				3006	out_data:
				3007	kfree(kernel_dev);
				3008	out_dev:
				3009	kfree(kernel_type);
				3010	out_type:
				3011	return ret;
				3012	}
				3013
				3014	/*
				3015	* Return true if path is reachable from root
				3016	*
				3017	* namespace_sem or mount_lock is held
				3018	*/
				3019	bool is_path_reachable(struct mount mnt, struct dentry dentry,
				3020	const struct path *root)
				3021	{
				3022	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
				3023	dentry = mnt->mnt_mountpoint;
				3024	mnt = mnt->mnt_parent;
				3025	}
				3026	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
				3027	}
				3028
				3029	int path_is_under(struct path path1, struct path path2)
				3030	{
				3031	int res;
				3032	read_seqlock_excl(&mount_lock);
				3033	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
				3034	read_sequnlock_excl(&mount_lock);
				3035	return res;
				3036	}
				3037	EXPORT_SYMBOL(path_is_under);
				3038
				3039	/*
				3040	* pivot_root Semantics:
				3041	* Moves the root file system of the current process to the directory put_old,
				3042	* makes new_root as the new root file system of the current process, and sets
				3043	* root/cwd of all processes which had them on the current root to new_root.
				3044	*
				3045	* Restrictions:
				3046	* The new_root and put_old must be directories, and must not be on the
				3047	* same file system as the current process root. The put_old must be
				3048	* underneath new_root, i.e. adding a non-zero number of /.. to the string
				3049	* pointed to by put_old must yield the same directory as new_root. No other
				3050	* file system may be mounted on put_old. After all, new_root is a mountpoint.
				3051	*
				3052	* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
				3053	* See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
				3054	* in this situation.
				3055	*
				3056	* Notes:
				3057	* - we don't move root/cwd if they are not at the root (reason: if something
				3058	* cared enough to change them, it's probably wrong to force them elsewhere)
				3059	* - it's okay to pick a root that isn't the root of a file system, e.g.
				3060	* /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
				3061	* though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
				3062	* first.
				3063	*/
				3064	SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
				3065	const char __user *, put_old)
				3066	{
				3067	struct path new, old, parent_path, root_parent, root;
				3068	struct mount new_mnt, root_mnt, *old_mnt;
				3069	struct mountpoint old_mp, root_mp;
				3070	int error;
				3071
				3072	if (!may_mount())
				3073	return -EPERM;
				3074
				3075	error = user_path_dir(new_root, &new);
				3076	if (error)
				3077	goto out0;
				3078
				3079	error = user_path_dir(put_old, &old);
				3080	if (error)
				3081	goto out1;
				3082
				3083	error = security_sb_pivotroot(&old, &new);
				3084	if (error)
				3085	goto out2;
				3086
				3087	get_fs_root(current->fs, &root);
				3088	old_mp = lock_mount(&old);
				3089	error = PTR_ERR(old_mp);
				3090	if (IS_ERR(old_mp))
				3091	goto out3;
				3092
				3093	error = -EINVAL;
				3094	new_mnt = real_mount(new.mnt);
				3095	root_mnt = real_mount(root.mnt);
				3096	old_mnt = real_mount(old.mnt);
				3097	if (IS_MNT_SHARED(old_mnt) \|\|
				3098	IS_MNT_SHARED(new_mnt->mnt_parent) \|\|
				3099	IS_MNT_SHARED(root_mnt->mnt_parent))
				3100	goto out4;
				3101	if (!check_mnt(root_mnt) \|\| !check_mnt(new_mnt))
				3102	goto out4;
				3103	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
				3104	goto out4;
				3105	error = -ENOENT;
				3106	if (d_unlinked(new.dentry))
				3107	goto out4;
				3108	error = -EBUSY;
				3109	if (new_mnt == root_mnt \|\| old_mnt == root_mnt)
				3110	goto out4; /* loop, on the same file system */
				3111	error = -EINVAL;
				3112	if (root.mnt->mnt_root != root.dentry)
				3113	goto out4; /* not a mountpoint */
				3114	if (!mnt_has_parent(root_mnt))
				3115	goto out4; /* not attached */
				3116	root_mp = root_mnt->mnt_mp;
				3117	if (new.mnt->mnt_root != new.dentry)
				3118	goto out4; /* not a mountpoint */
				3119	if (!mnt_has_parent(new_mnt))
				3120	goto out4; /* not attached */
				3121	/* make sure we can reach put_old from new_root */
				3122	if (!is_path_reachable(old_mnt, old.dentry, &new))
				3123	goto out4;
				3124	/* make certain new is below the root */
				3125	if (!is_path_reachable(new_mnt, new.dentry, &root))
				3126	goto out4;
				3127	root_mp->m_count++; /* pin it so it won't go away */
				3128	lock_mount_hash();
				3129	detach_mnt(new_mnt, &parent_path);
				3130	detach_mnt(root_mnt, &root_parent);
				3131	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
				3132	new_mnt->mnt.mnt_flags \|= MNT_LOCKED;
				3133	root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
				3134	}
				3135	/* mount old root on put_old */
				3136	attach_mnt(root_mnt, old_mnt, old_mp);
				3137	/* mount new_root on / */
				3138	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
				3139	touch_mnt_namespace(current->nsproxy->mnt_ns);
				3140	/* A moved mount should not expire automatically */
				3141	list_del_init(&new_mnt->mnt_expire);
				3142	put_mountpoint(root_mp);
				3143	unlock_mount_hash();
				3144	chroot_fs_refs(&root, &new);
				3145	error = 0;
				3146	out4:
				3147	unlock_mount(old_mp);
				3148	if (!error) {
				3149	path_put(&root_parent);
				3150	path_put(&parent_path);
				3151	}
				3152	out3:
				3153	path_put(&root);
				3154	out2:
				3155	path_put(&old);
				3156	out1:
				3157	path_put(&new);
				3158	out0:
				3159	return error;
				3160	}
				3161
				3162	static void __init init_mount_tree(void)
				3163	{
				3164	struct vfsmount *mnt;
				3165	struct mnt_namespace *ns;
				3166	struct path root;
				3167	struct file_system_type *type;
				3168
				3169	type = get_fs_type("rootfs");
				3170	if (!type)
				3171	panic("Can't find rootfs type");
				3172	mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
				3173	put_filesystem(type);
				3174	if (IS_ERR(mnt))
				3175	panic("Can't create rootfs");
				3176
				3177	ns = create_mnt_ns(mnt);
				3178	if (IS_ERR(ns))
				3179	panic("Can't allocate initial namespace");
				3180
				3181	init_task.nsproxy->mnt_ns = ns;
				3182	get_mnt_ns(ns);
				3183
				3184	root.mnt = mnt;
				3185	root.dentry = mnt->mnt_root;
				3186	mnt->mnt_flags \|= MNT_LOCKED;
				3187
				3188	set_fs_pwd(current->fs, &root);
				3189	set_fs_root(current->fs, &root);
				3190	}
				3191
				3192	void __init mnt_init(void)
				3193	{
				3194	unsigned u;
				3195	int err;
				3196
				3197	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
				3198	0, SLAB_HWCACHE_ALIGN \| SLAB_PANIC, NULL);
				3199
				3200	mount_hashtable = alloc_large_system_hash("Mount-cache",
				3201	sizeof(struct hlist_head),
				3202	mhash_entries, 19,
				3203	0,
				3204	&m_hash_shift, &m_hash_mask, 0, 0);
				3205	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
				3206	sizeof(struct hlist_head),
				3207	mphash_entries, 19,
				3208	0,
				3209	&mp_hash_shift, &mp_hash_mask, 0, 0);
				3210
				3211	if (!mount_hashtable \|\| !mountpoint_hashtable)
				3212	panic("Failed to allocate mount hash table\n");
				3213
				3214	for (u = 0; u <= m_hash_mask; u++)
				3215	INIT_HLIST_HEAD(&mount_hashtable[u]);
				3216	for (u = 0; u <= mp_hash_mask; u++)
				3217	INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
				3218
				3219	kernfs_init();
				3220
				3221	err = sysfs_init();
				3222	if (err)
				3223	printk(KERN_WARNING "%s: sysfs_init error: %d\n",
				3224	__func__, err);
				3225	fs_kobj = kobject_create_and_add("fs", NULL);
				3226	if (!fs_kobj)
				3227	printk(KERN_WARNING "%s: kobj create error\n", __func__);
				3228	init_rootfs();
				3229	init_mount_tree();
				3230	}
				3231
				3232	void put_mnt_ns(struct mnt_namespace *ns)
				3233	{
				3234	if (!atomic_dec_and_test(&ns->count))
				3235	return;
				3236	drop_collected_mounts(&ns->root->mnt);
				3237	free_mnt_ns(ns);
				3238	}
				3239
				3240	struct vfsmount kern_mount_data(struct file_system_type type, void *data)
				3241	{
				3242	struct vfsmount *mnt;
				3243	mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
				3244	if (!IS_ERR(mnt)) {
				3245	/*
				3246	* it is a longterm mount, don't release mnt until
				3247	* we unmount before file sys is unregistered
				3248	*/
				3249	real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
				3250	}
				3251	return mnt;
				3252	}
				3253	EXPORT_SYMBOL_GPL(kern_mount_data);
				3254
				3255	void kern_unmount(struct vfsmount *mnt)
				3256	{
				3257	/* release long term mount so mount point can be released */
				3258	if (!IS_ERR_OR_NULL(mnt)) {
				3259	real_mount(mnt)->mnt_ns = NULL;
				3260	synchronize_rcu(); /* yecchhh... */
				3261	mntput(mnt);
				3262	}
				3263	}
				3264	EXPORT_SYMBOL(kern_unmount);
				3265
				3266	bool our_mnt(struct vfsmount *mnt)
				3267	{
				3268	return check_mnt(real_mount(mnt));
				3269	}
				3270
				3271	bool current_chrooted(void)
				3272	{
				3273	/* Does the current process have a non-standard root */
				3274	struct path ns_root;
				3275	struct path fs_root;
				3276	bool chrooted;
				3277
				3278	/* Find the namespace root */
				3279	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
				3280	ns_root.dentry = ns_root.mnt->mnt_root;
				3281	path_get(&ns_root);
				3282	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
				3283	;
				3284
				3285	get_fs_root(current->fs, &fs_root);
				3286
				3287	chrooted = !path_equal(&fs_root, &ns_root);
				3288
				3289	path_put(&fs_root);
				3290	path_put(&ns_root);
				3291
				3292	return chrooted;
				3293	}
				3294
				3295	static bool fs_fully_visible(struct file_system_type type, int new_mnt_flags)
				3296	{
				3297	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
				3298	int new_flags = *new_mnt_flags;
				3299	struct mount *mnt;
				3300	bool visible = false;
				3301
				3302	if (unlikely(!ns))
				3303	return false;
				3304
				3305	down_read(&namespace_sem);
				3306	list_for_each_entry(mnt, &ns->list, mnt_list) {
				3307	struct mount *child;
				3308	int mnt_flags;
				3309
				3310	if (mnt->mnt.mnt_sb->s_type != type)
				3311	continue;
				3312
				3313	/* This mount is not fully visible if it's root directory
				3314	* is not the root directory of the filesystem.
				3315	*/
				3316	if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
				3317	continue;
				3318
				3319	/* Read the mount flags and filter out flags that
				3320	* may safely be ignored.
				3321	*/
				3322	mnt_flags = mnt->mnt.mnt_flags;
				3323	if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC)
				3324	mnt_flags &= ~(MNT_LOCK_NOSUID \| MNT_LOCK_NOEXEC);
				3325
				3326	/* Don't miss readonly hidden in the superblock flags */
				3327	if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY)
				3328	mnt_flags \|= MNT_LOCK_READONLY;
				3329
				3330	/* Verify the mount flags are equal to or more permissive
				3331	* than the proposed new mount.
				3332	*/
				3333	if ((mnt_flags & MNT_LOCK_READONLY) &&
				3334	!(new_flags & MNT_READONLY))
				3335	continue;
				3336	if ((mnt_flags & MNT_LOCK_NODEV) &&
				3337	!(new_flags & MNT_NODEV))
				3338	continue;
				3339	if ((mnt_flags & MNT_LOCK_NOSUID) &&
				3340	!(new_flags & MNT_NOSUID))
				3341	continue;
				3342	if ((mnt_flags & MNT_LOCK_NOEXEC) &&
				3343	!(new_flags & MNT_NOEXEC))
				3344	continue;
				3345	if ((mnt_flags & MNT_LOCK_ATIME) &&
				3346	((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
				3347	continue;
				3348
				3349	/* This mount is not fully visible if there are any
				3350	* locked child mounts that cover anything except for
				3351	* empty directories.
				3352	*/
				3353	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
				3354	struct inode *inode = child->mnt_mountpoint->d_inode;
				3355	/* Only worry about locked mounts */
				3356	if (!(child->mnt.mnt_flags & MNT_LOCKED))
				3357	continue;
				3358	/* Is the directory permanetly empty? */
				3359	if (!is_empty_dir_inode(inode))
				3360	goto next;
				3361	}
				3362	/* Preserve the locked attributes */
				3363	*new_mnt_flags \|= mnt_flags & (MNT_LOCK_READONLY \| \
				3364	MNT_LOCK_NODEV \| \
				3365	MNT_LOCK_NOSUID \| \
				3366	MNT_LOCK_NOEXEC \| \
				3367	MNT_LOCK_ATIME);
				3368	visible = true;
				3369	goto found;
				3370	next: ;
				3371	}
				3372	found:
				3373	up_read(&namespace_sem);
				3374	return visible;
				3375	}
				3376
				3377	static struct ns_common mntns_get(struct task_struct task)
				3378	{
				3379	struct ns_common *ns = NULL;
				3380	struct nsproxy *nsproxy;
				3381
				3382	task_lock(task);
				3383	nsproxy = task->nsproxy;
				3384	if (nsproxy) {
				3385	ns = &nsproxy->mnt_ns->ns;
				3386	get_mnt_ns(to_mnt_ns(ns));
				3387	}
				3388	task_unlock(task);
				3389
				3390	return ns;
				3391	}
				3392
				/* mntns_put - drop a mount namespace reference held via its ns_common */
				static void mntns_put(struct ns_common *ns)
				{
					struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

					put_mnt_ns(mnt_ns);
				}
				3397
				3398	static int mntns_install(struct nsproxy nsproxy, struct ns_common ns)
				3399	{
				3400	struct fs_struct *fs = current->fs;
				3401	struct mnt_namespace *mnt_ns = to_mnt_ns(ns);
				3402	struct path root;
				3403
				3404	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) \|\|
				3405	!ns_capable(current_user_ns(), CAP_SYS_CHROOT) \|\|
				3406	!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
				3407	return -EPERM;
				3408
				3409	if (fs->users != 1)
				3410	return -EINVAL;
				3411
				3412	get_mnt_ns(mnt_ns);
				3413	put_mnt_ns(nsproxy->mnt_ns);
				3414	nsproxy->mnt_ns = mnt_ns;
				3415
				3416	/* Find the root */
				3417	root.mnt = &mnt_ns->root->mnt;
				3418	root.dentry = mnt_ns->root->mnt.mnt_root;
				3419	path_get(&root);
				3420	while(d_mountpoint(root.dentry) && follow_down_one(&root))
				3421	;
				3422
				3423	/* Update the pwd and root */
				3424	set_fs_pwd(fs, &root);
				3425	set_fs_root(fs, &root);
				3426
				3427	path_put(&root);
				3428	return 0;
				3429	}
				3430
				3431	const struct proc_ns_operations mntns_operations = {
				3432	.name = "mnt",
				3433	.type = CLONE_NEWNS,
				3434	.get = mntns_get,
				3435	.put = mntns_put,
				3436	.install = mntns_install,
				3437	};