Blame - net/ceph/osd_client.c - codeaurora/cp-linux

blob: a28e47ff1b1b3496f753ae1aaf0ede6032a75b7b [file] [log] [blame]

Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1
				2	#include <linux/ceph/ceph_debug.h>
				3
				4	#include <linux/module.h>
				5	#include <linux/err.h>
				6	#include <linux/highmem.h>
				7	#include <linux/mm.h>
				8	#include <linux/pagemap.h>
				9	#include <linux/slab.h>
				10	#include <linux/uaccess.h>
				11	#ifdef CONFIG_BLOCK
				12	#include <linux/bio.h>
				13	#endif
				14
				15	#include <linux/ceph/libceph.h>
				16	#include <linux/ceph/osd_client.h>
				17	#include <linux/ceph/messenger.h>
				18	#include <linux/ceph/decode.h>
				19	#include <linux/ceph/auth.h>
				20	#include <linux/ceph/pagelist.h>
				21
				22	#define OSD_OP_FRONT_LEN 4096
				23	#define OSD_OPREPLY_FRONT_LEN 512
				24
				25	static struct kmem_cache *ceph_osd_request_cache;
				26
				27	static const struct ceph_connection_operations osd_con_ops;
				28
				29	static void __send_queued(struct ceph_osd_client *osdc);
				30	static int __reset_osd(struct ceph_osd_client osdc, struct ceph_osd osd);
				31	static void __register_request(struct ceph_osd_client *osdc,
				32	struct ceph_osd_request *req);
				33	static void __unregister_request(struct ceph_osd_client *osdc,
				34	struct ceph_osd_request *req);
				35	static void __unregister_linger_request(struct ceph_osd_client *osdc,
				36	struct ceph_osd_request *req);
				37	static void __enqueue_request(struct ceph_osd_request *req);
				38	static void __send_request(struct ceph_osd_client *osdc,
				39	struct ceph_osd_request *req);
				40
				41	/*
				42	* Implement client access to distributed object storage cluster.
				43	*
				44	* All data objects are stored within a cluster/cloud of OSDs, or
				45	* "object storage devices." (Note that Ceph OSDs have _nothing_ to
				46	* do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
				47	* remote daemons serving up and coordinating consistent and safe
				48	* access to storage.
				49	*
				50	* Cluster membership and the mapping of data objects onto storage devices
				51	* are described by the osd map.
				52	*
				53	* We keep track of pending OSD requests (read, write), resubmit
				54	* requests to different OSDs when the cluster topology/data layout
				55	* change, or retry the affected requests when the communications
				56	* channel with an OSD is reset.
				57	*/
				58
				59	/*
				60	* calculate the mapping of a file extent onto an object, and fill out the
				61	* request accordingly. shorten extent as necessary if it crosses an
				62	* object boundary.
				63	*
				64	* fill osd op in request message.
				65	*/
				66	static int calc_layout(struct ceph_file_layout layout, u64 off, u64 plen,
				67	u64 objnum, u64 objoff, u64 *objlen)
				68	{
				69	u64 orig_len = *plen;
				70	int r;
				71
				72	/* object extent? */
				73	r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
				74	objoff, objlen);
				75	if (r < 0)
				76	return r;
				77	if (*objlen < orig_len) {
				78	plen = objlen;
				79	dout(" skipping last %llu, final file extent %llu~%llu\n",
				80	orig_len - plen, off, plen);
				81	}
				82
				83	dout("calc_layout objnum=%llx %llu~%llu\n", objnum, objoff, *objlen);
				84
				85	return 0;
				86	}
				87
				88	static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
				89	{
				90	memset(osd_data, 0, sizeof (*osd_data));
				91	osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
				92	}
				93
				94	static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
				95	struct page **pages, u64 length, u32 alignment,
				96	bool pages_from_pool, bool own_pages)
				97	{
				98	osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
				99	osd_data->pages = pages;
				100	osd_data->length = length;
				101	osd_data->alignment = alignment;
				102	osd_data->pages_from_pool = pages_from_pool;
				103	osd_data->own_pages = own_pages;
				104	}
				105
				106	static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
				107	struct ceph_pagelist *pagelist)
				108	{
				109	osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
				110	osd_data->pagelist = pagelist;
				111	}
				112
				113	#ifdef CONFIG_BLOCK
				114	static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
				115	struct bio *bio, size_t bio_length)
				116	{
				117	osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
				118	osd_data->bio = bio;
				119	osd_data->bio_length = bio_length;
				120	}
				121	#endif /* CONFIG_BLOCK */
				122
				123	#define osd_req_op_data(oreq, whch, typ, fld) \
				124	({ \
				125	struct ceph_osd_request *__oreq = (oreq); \
				126	unsigned int __whch = (whch); \
				127	BUG_ON(__whch >= __oreq->r_num_ops); \
				128	&__oreq->r_ops[__whch].typ.fld; \
				129	})
				130
				131	static struct ceph_osd_data *
				132	osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
				133	{
				134	BUG_ON(which >= osd_req->r_num_ops);
				135
				136	return &osd_req->r_ops[which].raw_data_in;
				137	}
				138
				139	struct ceph_osd_data *
				140	osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
				141	unsigned int which)
				142	{
				143	return osd_req_op_data(osd_req, which, extent, osd_data);
				144	}
				145	EXPORT_SYMBOL(osd_req_op_extent_osd_data);
				146
				147	struct ceph_osd_data *
				148	osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
				149	unsigned int which)
				150	{
				151	return osd_req_op_data(osd_req, which, cls, response_data);
				152	}
				153	EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */
				154
				155	void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
				156	unsigned int which, struct page **pages,
				157	u64 length, u32 alignment,
				158	bool pages_from_pool, bool own_pages)
				159	{
				160	struct ceph_osd_data *osd_data;
				161
				162	osd_data = osd_req_op_raw_data_in(osd_req, which);
				163	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				164	pages_from_pool, own_pages);
				165	}
				166	EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);
				167
				168	void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
				169	unsigned int which, struct page **pages,
				170	u64 length, u32 alignment,
				171	bool pages_from_pool, bool own_pages)
				172	{
				173	struct ceph_osd_data *osd_data;
				174
				175	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				176	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				177	pages_from_pool, own_pages);
				178	}
				179	EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);
				180
				181	void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
				182	unsigned int which, struct ceph_pagelist *pagelist)
				183	{
				184	struct ceph_osd_data *osd_data;
				185
				186	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				187	ceph_osd_data_pagelist_init(osd_data, pagelist);
				188	}
				189	EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);
				190
				191	#ifdef CONFIG_BLOCK
				192	void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
				193	unsigned int which, struct bio *bio, size_t bio_length)
				194	{
				195	struct ceph_osd_data *osd_data;
				196
				197	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				198	ceph_osd_data_bio_init(osd_data, bio, bio_length);
				199	}
				200	EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
				201	#endif /* CONFIG_BLOCK */
				202
				203	static void osd_req_op_cls_request_info_pagelist(
				204	struct ceph_osd_request *osd_req,
				205	unsigned int which, struct ceph_pagelist *pagelist)
				206	{
				207	struct ceph_osd_data *osd_data;
				208
				209	osd_data = osd_req_op_data(osd_req, which, cls, request_info);
				210	ceph_osd_data_pagelist_init(osd_data, pagelist);
				211	}
				212
				213	void osd_req_op_cls_request_data_pagelist(
				214	struct ceph_osd_request *osd_req,
				215	unsigned int which, struct ceph_pagelist *pagelist)
				216	{
				217	struct ceph_osd_data *osd_data;
				218
				219	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
				220	ceph_osd_data_pagelist_init(osd_data, pagelist);
				221	}
				222	EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
				223
				224	void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
				225	unsigned int which, struct page **pages, u64 length,
				226	u32 alignment, bool pages_from_pool, bool own_pages)
				227	{
				228	struct ceph_osd_data *osd_data;
				229
				230	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
				231	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				232	pages_from_pool, own_pages);
				233	}
				234	EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
				235
				236	void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
				237	unsigned int which, struct page **pages, u64 length,
				238	u32 alignment, bool pages_from_pool, bool own_pages)
				239	{
				240	struct ceph_osd_data *osd_data;
				241
				242	osd_data = osd_req_op_data(osd_req, which, cls, response_data);
				243	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				244	pages_from_pool, own_pages);
				245	}
				246	EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);
				247
				248	static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
				249	{
				250	switch (osd_data->type) {
				251	case CEPH_OSD_DATA_TYPE_NONE:
				252	return 0;
				253	case CEPH_OSD_DATA_TYPE_PAGES:
				254	return osd_data->length;
				255	case CEPH_OSD_DATA_TYPE_PAGELIST:
				256	return (u64)osd_data->pagelist->length;
				257	#ifdef CONFIG_BLOCK
				258	case CEPH_OSD_DATA_TYPE_BIO:
				259	return (u64)osd_data->bio_length;
				260	#endif /* CONFIG_BLOCK */
				261	default:
				262	WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
				263	return 0;
				264	}
				265	}
				266
				267	static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
				268	{
				269	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
				270	int num_pages;
				271
				272	num_pages = calc_pages_for((u64)osd_data->alignment,
				273	(u64)osd_data->length);
				274	ceph_release_page_vector(osd_data->pages, num_pages);
				275	}
				276	ceph_osd_data_init(osd_data);
				277	}
				278
				279	static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
				280	unsigned int which)
				281	{
				282	struct ceph_osd_req_op *op;
				283
				284	BUG_ON(which >= osd_req->r_num_ops);
				285	op = &osd_req->r_ops[which];
				286
				287	switch (op->op) {
				288	case CEPH_OSD_OP_READ:
				289	case CEPH_OSD_OP_WRITE:
				290	case CEPH_OSD_OP_WRITEFULL:
				291	ceph_osd_data_release(&op->extent.osd_data);
				292	break;
				293	case CEPH_OSD_OP_CALL:
				294	ceph_osd_data_release(&op->cls.request_info);
				295	ceph_osd_data_release(&op->cls.request_data);
				296	ceph_osd_data_release(&op->cls.response_data);
				297	break;
				298	case CEPH_OSD_OP_SETXATTR:
				299	case CEPH_OSD_OP_CMPXATTR:
				300	ceph_osd_data_release(&op->xattr.osd_data);
				301	break;
				302	case CEPH_OSD_OP_STAT:
				303	ceph_osd_data_release(&op->raw_data_in);
				304	break;
				305	default:
				306	break;
				307	}
				308	}
				309
				310	/*
				311	* requests
				312	*/
				313	static void ceph_osdc_release_request(struct kref *kref)
				314	{
				315	struct ceph_osd_request *req = container_of(kref,
				316	struct ceph_osd_request, r_kref);
				317	unsigned int which;
				318
				319	dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
				320	req->r_request, req->r_reply);
				321	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
				322	WARN_ON(!list_empty(&req->r_req_lru_item));
				323	WARN_ON(!list_empty(&req->r_osd_item));
				324	WARN_ON(!list_empty(&req->r_linger_item));
				325	WARN_ON(!list_empty(&req->r_linger_osd_item));
				326	WARN_ON(req->r_osd);
				327
				328	if (req->r_request)
				329	ceph_msg_put(req->r_request);
				330	if (req->r_reply) {
				331	ceph_msg_revoke_incoming(req->r_reply);
				332	ceph_msg_put(req->r_reply);
				333	}
				334
				335	for (which = 0; which < req->r_num_ops; which++)
				336	osd_req_op_data_release(req, which);
				337
				338	ceph_put_snap_context(req->r_snapc);
				339	if (req->r_mempool)
				340	mempool_free(req, req->r_osdc->req_mempool);
				341	else
				342	kmem_cache_free(ceph_osd_request_cache, req);
				343
				344	}
				345
				346	void ceph_osdc_get_request(struct ceph_osd_request *req)
				347	{
				348	dout("%s %p (was %d)\n", __func__, req,
				349	atomic_read(&req->r_kref.refcount));
				350	kref_get(&req->r_kref);
				351	}
				352	EXPORT_SYMBOL(ceph_osdc_get_request);
				353
				354	void ceph_osdc_put_request(struct ceph_osd_request *req)
				355	{
				356	dout("%s %p (was %d)\n", __func__, req,
				357	atomic_read(&req->r_kref.refcount));
				358	kref_put(&req->r_kref, ceph_osdc_release_request);
				359	}
				360	EXPORT_SYMBOL(ceph_osdc_put_request);
				361
				362	struct ceph_osd_request ceph_osdc_alloc_request(struct ceph_osd_client osdc,
				363	struct ceph_snap_context *snapc,
				364	unsigned int num_ops,
				365	bool use_mempool,
				366	gfp_t gfp_flags)
				367	{
				368	struct ceph_osd_request *req;
				369	struct ceph_msg *msg;
				370	size_t msg_size;
				371
				372	BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX);
				373	BUG_ON(num_ops > CEPH_OSD_MAX_OP);
				374
				375	msg_size = 4 + 4 + 8 + 8 + 4+8;
				376	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
				377	msg_size += 1 + 8 + 4 + 4; /* pg_t */
				378	msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
				379	msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
				380	msg_size += 8; /* snapid */
				381	msg_size += 8; /* snap_seq */
				382	msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
				383	msg_size += 4;
				384
				385	if (use_mempool) {
				386	req = mempool_alloc(osdc->req_mempool, gfp_flags);
				387	memset(req, 0, sizeof(*req));
				388	} else {
				389	req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags);
				390	}
				391	if (req == NULL)
				392	return NULL;
				393
				394	req->r_osdc = osdc;
				395	req->r_mempool = use_mempool;
				396	req->r_num_ops = num_ops;
				397
				398	kref_init(&req->r_kref);
				399	init_completion(&req->r_completion);
				400	init_completion(&req->r_safe_completion);
				401	RB_CLEAR_NODE(&req->r_node);
				402	INIT_LIST_HEAD(&req->r_unsafe_item);
				403	INIT_LIST_HEAD(&req->r_linger_item);
				404	INIT_LIST_HEAD(&req->r_linger_osd_item);
				405	INIT_LIST_HEAD(&req->r_req_lru_item);
				406	INIT_LIST_HEAD(&req->r_osd_item);
				407
				408	req->r_base_oloc.pool = -1;
				409	req->r_target_oloc.pool = -1;
				410
				411	/* create reply message */
				412	if (use_mempool)
				413	msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
				414	else
				415	msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
				416	OSD_OPREPLY_FRONT_LEN, gfp_flags, true);
				417	if (!msg) {
				418	ceph_osdc_put_request(req);
				419	return NULL;
				420	}
				421	req->r_reply = msg;
				422
				423	/* create request message; allow space for oid */
				424	if (use_mempool)
				425	msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
				426	else
				427	msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);
				428	if (!msg) {
				429	ceph_osdc_put_request(req);
				430	return NULL;
				431	}
				432
				433	memset(msg->front.iov_base, 0, msg->front.iov_len);
				434
				435	req->r_request = msg;
				436
				437	return req;
				438	}
				439	EXPORT_SYMBOL(ceph_osdc_alloc_request);
				440
				441	static bool osd_req_opcode_valid(u16 opcode)
				442	{
				443	switch (opcode) {
				444	#define GENERATE_CASE(op, opcode, str) case CEPH_OSD_OP_##op: return true;
				445	__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
				446	#undef GENERATE_CASE
				447	default:
				448	return false;
				449	}
				450	}
				451
				452	/*
				453	* This is an osd op init function for opcodes that have no data or
				454	* other information associated with them. It also serves as a
				455	* common init routine for all the other init functions, below.
				456	*/
				457	static struct ceph_osd_req_op *
				458	_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
				459	u16 opcode, u32 flags)
				460	{
				461	struct ceph_osd_req_op *op;
				462
				463	BUG_ON(which >= osd_req->r_num_ops);
				464	BUG_ON(!osd_req_opcode_valid(opcode));
				465
				466	op = &osd_req->r_ops[which];
				467	memset(op, 0, sizeof (*op));
				468	op->op = opcode;
				469	op->flags = flags;
				470
				471	return op;
				472	}
				473
				474	void osd_req_op_init(struct ceph_osd_request *osd_req,
				475	unsigned int which, u16 opcode, u32 flags)
				476	{
				477	(void)_osd_req_op_init(osd_req, which, opcode, flags);
				478	}
				479	EXPORT_SYMBOL(osd_req_op_init);
				480
				481	void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
				482	unsigned int which, u16 opcode,
				483	u64 offset, u64 length,
				484	u64 truncate_size, u32 truncate_seq)
				485	{
				486	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				487	opcode, 0);
				488	size_t payload_len = 0;
				489
				490	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
				491	opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
				492	opcode != CEPH_OSD_OP_TRUNCATE);
				493
				494	op->extent.offset = offset;
				495	op->extent.length = length;
				496	op->extent.truncate_size = truncate_size;
				497	op->extent.truncate_seq = truncate_seq;
				498	if (opcode == CEPH_OSD_OP_WRITE \|\| opcode == CEPH_OSD_OP_WRITEFULL)
				499	payload_len += length;
				500
				501	op->payload_len = payload_len;
				502	}
				503	EXPORT_SYMBOL(osd_req_op_extent_init);
				504
				505	void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
				506	unsigned int which, u64 length)
				507	{
				508	struct ceph_osd_req_op *op;
				509	u64 previous;
				510
				511	BUG_ON(which >= osd_req->r_num_ops);
				512	op = &osd_req->r_ops[which];
				513	previous = op->extent.length;
				514
				515	if (length == previous)
				516	return; /* Nothing to do */
				517	BUG_ON(length > previous);
				518
				519	op->extent.length = length;
				520	op->payload_len -= previous - length;
				521	}
				522	EXPORT_SYMBOL(osd_req_op_extent_update);
				523
				524	void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
				525	u16 opcode, const char class, const char method)
				526	{
				527	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				528	opcode, 0);
				529	struct ceph_pagelist *pagelist;
				530	size_t payload_len = 0;
				531	size_t size;
				532
				533	BUG_ON(opcode != CEPH_OSD_OP_CALL);
				534
				535	pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
				536	BUG_ON(!pagelist);
				537	ceph_pagelist_init(pagelist);
				538
				539	op->cls.class_name = class;
				540	size = strlen(class);
				541	BUG_ON(size > (size_t) U8_MAX);
				542	op->cls.class_len = size;
				543	ceph_pagelist_append(pagelist, class, size);
				544	payload_len += size;
				545
				546	op->cls.method_name = method;
				547	size = strlen(method);
				548	BUG_ON(size > (size_t) U8_MAX);
				549	op->cls.method_len = size;
				550	ceph_pagelist_append(pagelist, method, size);
				551	payload_len += size;
				552
				553	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
				554
				555	op->cls.argc = 0; /* currently unused */
				556
				557	op->payload_len = payload_len;
				558	}
				559	EXPORT_SYMBOL(osd_req_op_cls_init);
				560
				561	int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
				562	u16 opcode, const char name, const void value,
				563	size_t size, u8 cmp_op, u8 cmp_mode)
				564	{
				565	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				566	opcode, 0);
				567	struct ceph_pagelist *pagelist;
				568	size_t payload_len;
				569
				570	BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
				571
				572	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
				573	if (!pagelist)
				574	return -ENOMEM;
				575
				576	ceph_pagelist_init(pagelist);
				577
				578	payload_len = strlen(name);
				579	op->xattr.name_len = payload_len;
				580	ceph_pagelist_append(pagelist, name, payload_len);
				581
				582	op->xattr.value_len = size;
				583	ceph_pagelist_append(pagelist, value, size);
				584	payload_len += size;
				585
				586	op->xattr.cmp_op = cmp_op;
				587	op->xattr.cmp_mode = cmp_mode;
				588
				589	ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
				590	op->payload_len = payload_len;
				591	return 0;
				592	}
				593	EXPORT_SYMBOL(osd_req_op_xattr_init);
				594
				595	void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
				596	unsigned int which, u16 opcode,
				597	u64 cookie, u64 version, int flag)
				598	{
				599	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				600	opcode, 0);
				601
				602	BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
				603
				604	op->watch.cookie = cookie;
				605	op->watch.ver = version;
				606	if (opcode == CEPH_OSD_OP_WATCH && flag)
				607	op->watch.flag = (u8)1;
				608	}
				609	EXPORT_SYMBOL(osd_req_op_watch_init);
				610
				611	void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
				612	unsigned int which,
				613	u64 expected_object_size,
				614	u64 expected_write_size)
				615	{
				616	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				617	CEPH_OSD_OP_SETALLOCHINT,
				618	0);
				619
				620	op->alloc_hint.expected_object_size = expected_object_size;
				621	op->alloc_hint.expected_write_size = expected_write_size;
				622
				623	/*
				624	* CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
				625	* not worth a feature bit. Set FAILOK per-op flag to make
				626	* sure older osds don't trip over an unsupported opcode.
				627	*/
				628	op->flags \|= CEPH_OSD_OP_FLAG_FAILOK;
				629	}
				630	EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
				631
				632	static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
				633	struct ceph_osd_data *osd_data)
				634	{
				635	u64 length = ceph_osd_data_length(osd_data);
				636
				637	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
				638	BUG_ON(length > (u64) SIZE_MAX);
				639	if (length)
				640	ceph_msg_data_add_pages(msg, osd_data->pages,
				641	length, osd_data->alignment);
				642	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
				643	BUG_ON(!length);
				644	ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
				645	#ifdef CONFIG_BLOCK
				646	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
				647	ceph_msg_data_add_bio(msg, osd_data->bio, length);
				648	#endif
				649	} else {
				650	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
				651	}
				652	}
				653
				654	static u64 osd_req_encode_op(struct ceph_osd_request *req,
				655	struct ceph_osd_op *dst, unsigned int which)
				656	{
				657	struct ceph_osd_req_op *src;
				658	struct ceph_osd_data *osd_data;
				659	u64 request_data_len = 0;
				660	u64 data_length;
				661
				662	BUG_ON(which >= req->r_num_ops);
				663	src = &req->r_ops[which];
				664	if (WARN_ON(!osd_req_opcode_valid(src->op))) {
				665	pr_err("unrecognized osd opcode %d\n", src->op);
				666
				667	return 0;
				668	}
				669
				670	switch (src->op) {
				671	case CEPH_OSD_OP_STAT:
				672	osd_data = &src->raw_data_in;
				673	ceph_osdc_msg_data_add(req->r_reply, osd_data);
				674	break;
				675	case CEPH_OSD_OP_READ:
				676	case CEPH_OSD_OP_WRITE:
				677	case CEPH_OSD_OP_WRITEFULL:
				678	case CEPH_OSD_OP_ZERO:
				679	case CEPH_OSD_OP_TRUNCATE:
				680	if (src->op == CEPH_OSD_OP_WRITE \|\|
				681	src->op == CEPH_OSD_OP_WRITEFULL)
				682	request_data_len = src->extent.length;
				683	dst->extent.offset = cpu_to_le64(src->extent.offset);
				684	dst->extent.length = cpu_to_le64(src->extent.length);
				685	dst->extent.truncate_size =
				686	cpu_to_le64(src->extent.truncate_size);
				687	dst->extent.truncate_seq =
				688	cpu_to_le32(src->extent.truncate_seq);
				689	osd_data = &src->extent.osd_data;
				690	if (src->op == CEPH_OSD_OP_WRITE \|\|
				691	src->op == CEPH_OSD_OP_WRITEFULL)
				692	ceph_osdc_msg_data_add(req->r_request, osd_data);
				693	else
				694	ceph_osdc_msg_data_add(req->r_reply, osd_data);
				695	break;
				696	case CEPH_OSD_OP_CALL:
				697	dst->cls.class_len = src->cls.class_len;
				698	dst->cls.method_len = src->cls.method_len;
				699	osd_data = &src->cls.request_info;
				700	ceph_osdc_msg_data_add(req->r_request, osd_data);
				701	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
				702	request_data_len = osd_data->pagelist->length;
				703
				704	osd_data = &src->cls.request_data;
				705	data_length = ceph_osd_data_length(osd_data);
				706	if (data_length) {
				707	BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
				708	dst->cls.indata_len = cpu_to_le32(data_length);
				709	ceph_osdc_msg_data_add(req->r_request, osd_data);
				710	src->payload_len += data_length;
				711	request_data_len += data_length;
				712	}
				713	osd_data = &src->cls.response_data;
				714	ceph_osdc_msg_data_add(req->r_reply, osd_data);
				715	break;
				716	case CEPH_OSD_OP_STARTSYNC:
				717	break;
				718	case CEPH_OSD_OP_NOTIFY_ACK:
				719	case CEPH_OSD_OP_WATCH:
				720	dst->watch.cookie = cpu_to_le64(src->watch.cookie);
				721	dst->watch.ver = cpu_to_le64(src->watch.ver);
				722	dst->watch.flag = src->watch.flag;
				723	break;
				724	case CEPH_OSD_OP_SETALLOCHINT:
				725	dst->alloc_hint.expected_object_size =
				726	cpu_to_le64(src->alloc_hint.expected_object_size);
				727	dst->alloc_hint.expected_write_size =
				728	cpu_to_le64(src->alloc_hint.expected_write_size);
				729	break;
				730	case CEPH_OSD_OP_SETXATTR:
				731	case CEPH_OSD_OP_CMPXATTR:
				732	dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
				733	dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
				734	dst->xattr.cmp_op = src->xattr.cmp_op;
				735	dst->xattr.cmp_mode = src->xattr.cmp_mode;
				736	osd_data = &src->xattr.osd_data;
				737	ceph_osdc_msg_data_add(req->r_request, osd_data);
				738	request_data_len = osd_data->pagelist->length;
				739	break;
				740	case CEPH_OSD_OP_CREATE:
				741	case CEPH_OSD_OP_DELETE:
				742	break;
				743	default:
				744	pr_err("unsupported osd opcode %s\n",
				745	ceph_osd_op_name(src->op));
				746	WARN_ON(1);
				747
				748	return 0;
				749	}
				750
				751	dst->op = cpu_to_le16(src->op);
				752	dst->flags = cpu_to_le32(src->flags);
				753	dst->payload_len = cpu_to_le32(src->payload_len);
				754
				755	return request_data_len;
				756	}
				757
				758	/*
				759	* build new request AND message, calculate layout, and adjust file
				760	* extent as needed.
				761	*
				762	* if the file was recently truncated, we include information about its
				763	* old and new size so that the object can be updated appropriately. (we
				764	* avoid synchronously deleting truncated objects because it's slow.)
				765	*
				766	* if @do_sync, include a 'startsync' command so that the osd will flush
				767	* data quickly.
				768	*/
				769	struct ceph_osd_request ceph_osdc_new_request(struct ceph_osd_client osdc,
				770	struct ceph_file_layout *layout,
				771	struct ceph_vino vino,
				772	u64 off, u64 *plen,
				773	unsigned int which, int num_ops,
				774	int opcode, int flags,
				775	struct ceph_snap_context *snapc,
				776	u32 truncate_seq,
				777	u64 truncate_size,
				778	bool use_mempool)
				779	{
				780	struct ceph_osd_request *req;
				781	u64 objnum = 0;
				782	u64 objoff = 0;
				783	u64 objlen = 0;
				784	int r;
				785
				786	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
				787	opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
				788	opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE);
				789
				790	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
				791	GFP_NOFS);
				792	if (!req)
				793	return ERR_PTR(-ENOMEM);
				794
				795	req->r_flags = flags;
				796
				797	/* calculate max write size */
				798	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
				799	if (r < 0) {
				800	ceph_osdc_put_request(req);
				801	return ERR_PTR(r);
				802	}
				803
				804	if (opcode == CEPH_OSD_OP_CREATE \|\| opcode == CEPH_OSD_OP_DELETE) {
				805	osd_req_op_init(req, which, opcode, 0);
				806	} else {
				807	u32 object_size = le32_to_cpu(layout->fl_object_size);
				808	u32 object_base = off - objoff;
				809	if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
				810	if (truncate_size <= object_base) {
				811	truncate_size = 0;
				812	} else {
				813	truncate_size -= object_base;
				814	if (truncate_size > object_size)
				815	truncate_size = object_size;
				816	}
				817	}
				818	osd_req_op_extent_init(req, which, opcode, objoff, objlen,
				819	truncate_size, truncate_seq);
				820	}
				821
				822	req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
				823
				824	snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
				825	"%llx.%08llx", vino.ino, objnum);
				826	req->r_base_oid.name_len = strlen(req->r_base_oid.name);
				827
				828	return req;
				829	}
				830	EXPORT_SYMBOL(ceph_osdc_new_request);
				831
				832	/*
				833	* We keep osd requests in an rbtree, sorted by ->r_tid.
				834	*/
				835	static void __insert_request(struct ceph_osd_client *osdc,
				836	struct ceph_osd_request *new)
				837	{
				838	struct rb_node **p = &osdc->requests.rb_node;
				839	struct rb_node *parent = NULL;
				840	struct ceph_osd_request *req = NULL;
				841
				842	while (*p) {
				843	parent = *p;
				844	req = rb_entry(parent, struct ceph_osd_request, r_node);
				845	if (new->r_tid < req->r_tid)
				846	p = &(*p)->rb_left;
				847	else if (new->r_tid > req->r_tid)
				848	p = &(*p)->rb_right;
				849	else
				850	BUG();
				851	}
				852
				853	rb_link_node(&new->r_node, parent, p);
				854	rb_insert_color(&new->r_node, &osdc->requests);
				855	}
				856
				857	static struct ceph_osd_request __lookup_request(struct ceph_osd_client osdc,
				858	u64 tid)
				859	{
				860	struct ceph_osd_request *req;
				861	struct rb_node *n = osdc->requests.rb_node;
				862
				863	while (n) {
				864	req = rb_entry(n, struct ceph_osd_request, r_node);
				865	if (tid < req->r_tid)
				866	n = n->rb_left;
				867	else if (tid > req->r_tid)
				868	n = n->rb_right;
				869	else
				870	return req;
				871	}
				872	return NULL;
				873	}
				874
				875	static struct ceph_osd_request *
				876	__lookup_request_ge(struct ceph_osd_client *osdc,
				877	u64 tid)
				878	{
				879	struct ceph_osd_request *req;
				880	struct rb_node *n = osdc->requests.rb_node;
				881
				882	while (n) {
				883	req = rb_entry(n, struct ceph_osd_request, r_node);
				884	if (tid < req->r_tid) {
				885	if (!n->rb_left)
				886	return req;
				887	n = n->rb_left;
				888	} else if (tid > req->r_tid) {
				889	n = n->rb_right;
				890	} else {
				891	return req;
				892	}
				893	}
				894	return NULL;
				895	}
				896
				897	static void __kick_linger_request(struct ceph_osd_request *req)
				898	{
				899	struct ceph_osd_client *osdc = req->r_osdc;
				900	struct ceph_osd *osd = req->r_osd;
				901
				902	/*
				903	* Linger requests need to be resent with a new tid to avoid
				904	* the dup op detection logic on the OSDs. Achieve this with
				905	* a re-register dance instead of open-coding.
				906	*/
				907	ceph_osdc_get_request(req);
				908	if (!list_empty(&req->r_linger_item))
				909	__unregister_linger_request(osdc, req);
				910	else
				911	__unregister_request(osdc, req);
				912	__register_request(osdc, req);
				913	ceph_osdc_put_request(req);
				914
				915	/*
				916	* Unless request has been registered as both normal and
				917	* lingering, __unregister{,_linger}_request clears r_osd.
				918	* However, here we need to preserve r_osd to make sure we
				919	* requeue on the same OSD.
				920	*/
				921	WARN_ON(req->r_osd \|\| !osd);
				922	req->r_osd = osd;
				923
				924	dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
				925	__enqueue_request(req);
				926	}
				927
				928	/*
				929	* Resubmit requests pending on the given osd.
				930	*/
				931	static void __kick_osd_requests(struct ceph_osd_client *osdc,
				932	struct ceph_osd *osd)
				933	{
				934	struct ceph_osd_request req, nreq;
				935	LIST_HEAD(resend);
				936	LIST_HEAD(resend_linger);
				937	int err;
				938
				939	dout("%s osd%d\n", __func__, osd->o_osd);
				940	err = __reset_osd(osdc, osd);
				941	if (err)
				942	return;
				943
				944	/*
				945	* Build up a list of requests to resend by traversing the
				946	* osd's list of requests. Requests for a given object are
				947	* sent in tid order, and that is also the order they're
				948	* kept on this list. Therefore all requests that are in
				949	* flight will be found first, followed by all requests that
				950	* have not yet been sent. And to resend requests while
				951	* preserving this order we will want to put any sent
				952	* requests back on the front of the osd client's unsent
				953	* list.
				954	*
				955	* So we build a separate ordered list of already-sent
				956	* requests for the affected osd and splice it onto the
				957	* front of the osd client's unsent list. Once we've seen a
				958	* request that has not yet been sent we're done. Those
				959	* requests are already sitting right where they belong.
				960	*/
				961	list_for_each_entry(req, &osd->o_requests, r_osd_item) {
				962	if (!req->r_sent)
				963	break;
				964
				965	if (!req->r_linger) {
				966	dout("%s requeueing %p tid %llu\n", __func__, req,
				967	req->r_tid);
				968	list_move_tail(&req->r_req_lru_item, &resend);
				969	req->r_flags \|= CEPH_OSD_FLAG_RETRY;
				970	} else {
				971	list_move_tail(&req->r_req_lru_item, &resend_linger);
				972	}
				973	}
				974	list_splice(&resend, &osdc->req_unsent);
				975
				976	/*
				977	* Both registered and not yet registered linger requests are
				978	* enqueued with a new tid on the same OSD. We add/move them
				979	* to req_unsent/o_requests at the end to keep things in tid
				980	* order.
				981	*/
				982	list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
				983	r_linger_osd_item) {
				984	WARN_ON(!list_empty(&req->r_req_lru_item));
				985	__kick_linger_request(req);
				986	}
				987
				988	list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
				989	__kick_linger_request(req);
				990	}
				991
				992	/*
				993	* If the osd connection drops, we need to resubmit all requests.
				994	*/
				995	static void osd_reset(struct ceph_connection *con)
				996	{
				997	struct ceph_osd *osd = con->private;
				998	struct ceph_osd_client *osdc;
				999
				1000	if (!osd)
				1001	return;
				1002	dout("osd_reset osd%d\n", osd->o_osd);
				1003	osdc = osd->o_osdc;
				1004	down_read(&osdc->map_sem);
				1005	mutex_lock(&osdc->request_mutex);
				1006	__kick_osd_requests(osdc, osd);
				1007	__send_queued(osdc);
				1008	mutex_unlock(&osdc->request_mutex);
				1009	up_read(&osdc->map_sem);
				1010	}
				1011
				1012	/*
				1013	* Track open sessions with osds.
				1014	*/
				1015	static struct ceph_osd create_osd(struct ceph_osd_client osdc, int onum)
				1016	{
				1017	struct ceph_osd *osd;
				1018
				1019	osd = kzalloc(sizeof(*osd), GFP_NOFS);
				1020	if (!osd)
				1021	return NULL;
				1022
				1023	atomic_set(&osd->o_ref, 1);
				1024	osd->o_osdc = osdc;
				1025	osd->o_osd = onum;
				1026	RB_CLEAR_NODE(&osd->o_node);
				1027	INIT_LIST_HEAD(&osd->o_requests);
				1028	INIT_LIST_HEAD(&osd->o_linger_requests);
				1029	INIT_LIST_HEAD(&osd->o_osd_lru);
				1030	osd->o_incarnation = 1;
				1031
				1032	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
				1033
				1034	INIT_LIST_HEAD(&osd->o_keepalive_item);
				1035	return osd;
				1036	}
				1037
				1038	static struct ceph_osd get_osd(struct ceph_osd osd)
				1039	{
				1040	if (atomic_inc_not_zero(&osd->o_ref)) {
				1041	dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
				1042	atomic_read(&osd->o_ref));
				1043	return osd;
				1044	} else {
				1045	dout("get_osd %p FAIL\n", osd);
				1046	return NULL;
				1047	}
				1048	}
				1049
				1050	static void put_osd(struct ceph_osd *osd)
				1051	{
				1052	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
				1053	atomic_read(&osd->o_ref) - 1);
				1054	if (atomic_dec_and_test(&osd->o_ref)) {
				1055	struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
				1056
				1057	if (osd->o_auth.authorizer)
				1058	ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer);
				1059	kfree(osd);
				1060	}
				1061	}
				1062
				1063	/*
				1064	* remove an osd from our map
				1065	*/
				1066	static void __remove_osd(struct ceph_osd_client osdc, struct ceph_osd osd)
				1067	{
				1068	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
				1069	WARN_ON(!list_empty(&osd->o_requests));
				1070	WARN_ON(!list_empty(&osd->o_linger_requests));
				1071
				1072	list_del_init(&osd->o_osd_lru);
				1073	rb_erase(&osd->o_node, &osdc->osds);
				1074	RB_CLEAR_NODE(&osd->o_node);
				1075	}
				1076
				1077	static void remove_osd(struct ceph_osd_client osdc, struct ceph_osd osd)
				1078	{
				1079	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
				1080
				1081	if (!RB_EMPTY_NODE(&osd->o_node)) {
				1082	ceph_con_close(&osd->o_con);
				1083	__remove_osd(osdc, osd);
				1084	put_osd(osd);
				1085	}
				1086	}
				1087
				1088	static void remove_all_osds(struct ceph_osd_client *osdc)
				1089	{
				1090	dout("%s %p\n", __func__, osdc);
				1091	mutex_lock(&osdc->request_mutex);
				1092	while (!RB_EMPTY_ROOT(&osdc->osds)) {
				1093	struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
				1094	struct ceph_osd, o_node);
				1095	remove_osd(osdc, osd);
				1096	}
				1097	mutex_unlock(&osdc->request_mutex);
				1098	}
				1099
				1100	static void __move_osd_to_lru(struct ceph_osd_client *osdc,
				1101	struct ceph_osd *osd)
				1102	{
				1103	dout("%s %p\n", __func__, osd);
				1104	BUG_ON(!list_empty(&osd->o_osd_lru));
				1105
				1106	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
				1107	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
				1108	}
				1109
				1110	static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
				1111	struct ceph_osd *osd)
				1112	{
				1113	dout("%s %p\n", __func__, osd);
				1114
				1115	if (list_empty(&osd->o_requests) &&
				1116	list_empty(&osd->o_linger_requests))
				1117	__move_osd_to_lru(osdc, osd);
				1118	}
				1119
				1120	static void __remove_osd_from_lru(struct ceph_osd *osd)
				1121	{
				1122	dout("__remove_osd_from_lru %p\n", osd);
				1123	if (!list_empty(&osd->o_osd_lru))
				1124	list_del_init(&osd->o_osd_lru);
				1125	}
				1126
				1127	static void remove_old_osds(struct ceph_osd_client *osdc)
				1128	{
				1129	struct ceph_osd osd, nosd;
				1130
				1131	dout("__remove_old_osds %p\n", osdc);
				1132	mutex_lock(&osdc->request_mutex);
				1133	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
				1134	if (time_before(jiffies, osd->lru_ttl))
				1135	break;
				1136	remove_osd(osdc, osd);
				1137	}
				1138	mutex_unlock(&osdc->request_mutex);
				1139	}
				1140
				1141	/*
				1142	* reset osd connect
				1143	*/
				1144	static int __reset_osd(struct ceph_osd_client osdc, struct ceph_osd osd)
				1145	{
				1146	struct ceph_entity_addr *peer_addr;
				1147
				1148	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
				1149	if (list_empty(&osd->o_requests) &&
				1150	list_empty(&osd->o_linger_requests)) {
				1151	remove_osd(osdc, osd);
				1152	return -ENODEV;
				1153	}
				1154
				1155	peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
				1156	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
				1157	!ceph_con_opened(&osd->o_con)) {
				1158	struct ceph_osd_request *req;
				1159
				1160	dout("osd addr hasn't changed and connection never opened, "
				1161	"letting msgr retry\n");
				1162	/* touch each r_stamp for handle_timeout()'s benfit */
				1163	list_for_each_entry(req, &osd->o_requests, r_osd_item)
				1164	req->r_stamp = jiffies;
				1165
				1166	return -EAGAIN;
				1167	}
				1168
				1169	ceph_con_close(&osd->o_con);
				1170	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
				1171	osd->o_incarnation++;
				1172
				1173	return 0;
				1174	}
				1175
				1176	static void __insert_osd(struct ceph_osd_client osdc, struct ceph_osd new)
				1177	{
				1178	struct rb_node **p = &osdc->osds.rb_node;
				1179	struct rb_node *parent = NULL;
				1180	struct ceph_osd *osd = NULL;
				1181
				1182	dout("__insert_osd %p osd%d\n", new, new->o_osd);
				1183	while (*p) {
				1184	parent = *p;
				1185	osd = rb_entry(parent, struct ceph_osd, o_node);
				1186	if (new->o_osd < osd->o_osd)
				1187	p = &(*p)->rb_left;
				1188	else if (new->o_osd > osd->o_osd)
				1189	p = &(*p)->rb_right;
				1190	else
				1191	BUG();
				1192	}
				1193
				1194	rb_link_node(&new->o_node, parent, p);
				1195	rb_insert_color(&new->o_node, &osdc->osds);
				1196	}
				1197
				1198	static struct ceph_osd __lookup_osd(struct ceph_osd_client osdc, int o)
				1199	{
				1200	struct ceph_osd *osd;
				1201	struct rb_node *n = osdc->osds.rb_node;
				1202
				1203	while (n) {
				1204	osd = rb_entry(n, struct ceph_osd, o_node);
				1205	if (o < osd->o_osd)
				1206	n = n->rb_left;
				1207	else if (o > osd->o_osd)
				1208	n = n->rb_right;
				1209	else
				1210	return osd;
				1211	}
				1212	return NULL;
				1213	}
				1214
				1215	static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
				1216	{
				1217	schedule_delayed_work(&osdc->timeout_work,
				1218	osdc->client->options->osd_keepalive_timeout);
				1219	}
				1220
				1221	static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
				1222	{
				1223	cancel_delayed_work(&osdc->timeout_work);
				1224	}
				1225
				1226	/*
				1227	* Register request, assign tid. If this is the first request, set up
				1228	* the timeout event.
				1229	*/
				1230	static void __register_request(struct ceph_osd_client *osdc,
				1231	struct ceph_osd_request *req)
				1232	{
				1233	req->r_tid = ++osdc->last_tid;
				1234	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
				1235	dout("__register_request %p tid %lld\n", req, req->r_tid);
				1236	__insert_request(osdc, req);
				1237	ceph_osdc_get_request(req);
				1238	osdc->num_requests++;
				1239	if (osdc->num_requests == 1) {
				1240	dout(" first request, scheduling timeout\n");
				1241	__schedule_osd_timeout(osdc);
				1242	}
				1243	}
				1244
				1245	/*
				1246	* called under osdc->request_mutex
				1247	*/
				1248	static void __unregister_request(struct ceph_osd_client *osdc,
				1249	struct ceph_osd_request *req)
				1250	{
				1251	if (RB_EMPTY_NODE(&req->r_node)) {
				1252	dout("__unregister_request %p tid %lld not registered\n",
				1253	req, req->r_tid);
				1254	return;
				1255	}
				1256
				1257	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
				1258	rb_erase(&req->r_node, &osdc->requests);
				1259	RB_CLEAR_NODE(&req->r_node);
				1260	osdc->num_requests--;
				1261
				1262	if (req->r_osd) {
				1263	/* make sure the original request isn't in flight. */
				1264	ceph_msg_revoke(req->r_request);
				1265
				1266	list_del_init(&req->r_osd_item);
				1267	maybe_move_osd_to_lru(osdc, req->r_osd);
				1268	if (list_empty(&req->r_linger_osd_item))
				1269	req->r_osd = NULL;
				1270	}
				1271
				1272	list_del_init(&req->r_req_lru_item);
				1273	ceph_osdc_put_request(req);
				1274
				1275	if (osdc->num_requests == 0) {
				1276	dout(" no requests, canceling timeout\n");
				1277	__cancel_osd_timeout(osdc);
				1278	}
				1279	}
				1280
				1281	/*
				1282	* Cancel a previously queued request message
				1283	*/
				1284	static void __cancel_request(struct ceph_osd_request *req)
				1285	{
				1286	if (req->r_sent && req->r_osd) {
				1287	ceph_msg_revoke(req->r_request);
				1288	req->r_sent = 0;
				1289	}
				1290	}
				1291
				1292	static void __register_linger_request(struct ceph_osd_client *osdc,
				1293	struct ceph_osd_request *req)
				1294	{
				1295	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
				1296	WARN_ON(!req->r_linger);
				1297
				1298	ceph_osdc_get_request(req);
				1299	list_add_tail(&req->r_linger_item, &osdc->req_linger);
				1300	if (req->r_osd)
				1301	list_add_tail(&req->r_linger_osd_item,
				1302	&req->r_osd->o_linger_requests);
				1303	}
				1304
				1305	static void __unregister_linger_request(struct ceph_osd_client *osdc,
				1306	struct ceph_osd_request *req)
				1307	{
				1308	WARN_ON(!req->r_linger);
				1309
				1310	if (list_empty(&req->r_linger_item)) {
				1311	dout("%s %p tid %llu not registered\n", __func__, req,
				1312	req->r_tid);
				1313	return;
				1314	}
				1315
				1316	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
				1317	list_del_init(&req->r_linger_item);
				1318
				1319	if (req->r_osd) {
				1320	list_del_init(&req->r_linger_osd_item);
				1321	maybe_move_osd_to_lru(osdc, req->r_osd);
				1322	if (list_empty(&req->r_osd_item))
				1323	req->r_osd = NULL;
				1324	}
				1325	ceph_osdc_put_request(req);
				1326	}
				1327
				1328	void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
				1329	struct ceph_osd_request *req)
				1330	{
				1331	if (!req->r_linger) {
				1332	dout("set_request_linger %p\n", req);
				1333	req->r_linger = 1;
				1334	}
				1335	}
				1336	EXPORT_SYMBOL(ceph_osdc_set_request_linger);
				1337
				1338	/*
				1339	* Returns whether a request should be blocked from being sent
				1340	* based on the current osdmap and osd_client settings.
				1341	*
				1342	* Caller should hold map_sem for read.
				1343	*/
				1344	static bool __req_should_be_paused(struct ceph_osd_client *osdc,
				1345	struct ceph_osd_request *req)
				1346	{
				1347	bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
				1348	bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) \|\|
				1349	ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
				1350	return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) \|\|
				1351	(req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
				1352	}
				1353
				1354	/*
				1355	* Calculate mapping of a request to a PG. Takes tiering into account.
				1356	*/
				1357	static int __calc_request_pg(struct ceph_osdmap *osdmap,
				1358	struct ceph_osd_request *req,
				1359	struct ceph_pg *pg_out)
				1360	{
				1361	bool need_check_tiering;
				1362
				1363	need_check_tiering = false;
				1364	if (req->r_target_oloc.pool == -1) {
				1365	req->r_target_oloc = req->r_base_oloc; /* struct */
				1366	need_check_tiering = true;
				1367	}
				1368	if (req->r_target_oid.name_len == 0) {
				1369	ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
				1370	need_check_tiering = true;
				1371	}
				1372
				1373	if (need_check_tiering &&
				1374	(req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
				1375	struct ceph_pg_pool_info *pi;
				1376
				1377	pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
				1378	if (pi) {
				1379	if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
				1380	pi->read_tier >= 0)
				1381	req->r_target_oloc.pool = pi->read_tier;
				1382	if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
				1383	pi->write_tier >= 0)
				1384	req->r_target_oloc.pool = pi->write_tier;
				1385	}
				1386	/* !pi is caught in ceph_oloc_oid_to_pg() */
				1387	}
				1388
				1389	return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
				1390	&req->r_target_oid, pg_out);
				1391	}
				1392
				1393	static void __enqueue_request(struct ceph_osd_request *req)
				1394	{
				1395	struct ceph_osd_client *osdc = req->r_osdc;
				1396
				1397	dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid,
				1398	req->r_osd ? req->r_osd->o_osd : -1);
				1399
				1400	if (req->r_osd) {
				1401	__remove_osd_from_lru(req->r_osd);
				1402	list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
				1403	list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
				1404	} else {
				1405	list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
				1406	}
				1407	}
				1408
				1409	/*
				1410	* Pick an osd (the first 'up' osd in the pg), allocate the osd struct
				1411	* (as needed), and set the request r_osd appropriately. If there is
				1412	* no up osd, set r_osd to NULL. Move the request to the appropriate list
				1413	* (unsent, homeless) or leave on in-flight lru.
				1414	*
				1415	* Return 0 if unchanged, 1 if changed, or negative on error.
				1416	*
				1417	* Caller should hold map_sem for read and request_mutex.
				1418	*/
				1419	static int __map_request(struct ceph_osd_client *osdc,
				1420	struct ceph_osd_request *req, int force_resend)
				1421	{
				1422	struct ceph_pg pgid;
				1423	int acting[CEPH_PG_MAX_SIZE];
				1424	int num, o;
				1425	int err;
				1426	bool was_paused;
				1427
				1428	dout("map_request %p tid %lld\n", req, req->r_tid);
				1429
				1430	err = __calc_request_pg(osdc->osdmap, req, &pgid);
				1431	if (err) {
				1432	list_move(&req->r_req_lru_item, &osdc->req_notarget);
				1433	return err;
				1434	}
				1435	req->r_pgid = pgid;
				1436
				1437	num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
				1438	if (num < 0)
				1439	num = 0;
				1440
				1441	was_paused = req->r_paused;
				1442	req->r_paused = __req_should_be_paused(osdc, req);
				1443	if (was_paused && !req->r_paused)
				1444	force_resend = 1;
				1445
				1446	if ((!force_resend &&
				1447	req->r_osd && req->r_osd->o_osd == o &&
				1448	req->r_sent >= req->r_osd->o_incarnation &&
				1449	req->r_num_pg_osds == num &&
				1450	memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) \|\|
				1451	(req->r_osd == NULL && o == -1) \|\|
				1452	req->r_paused)
				1453	return 0; /* no change */
				1454
				1455	dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
				1456	req->r_tid, pgid.pool, pgid.seed, o,
				1457	req->r_osd ? req->r_osd->o_osd : -1);
				1458
				1459	/* record full pg acting set */
				1460	memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
				1461	req->r_num_pg_osds = num;
				1462
				1463	if (req->r_osd) {
				1464	__cancel_request(req);
				1465	list_del_init(&req->r_osd_item);
				1466	list_del_init(&req->r_linger_osd_item);
				1467	req->r_osd = NULL;
				1468	}
				1469
				1470	req->r_osd = __lookup_osd(osdc, o);
				1471	if (!req->r_osd && o >= 0) {
				1472	err = -ENOMEM;
				1473	req->r_osd = create_osd(osdc, o);
				1474	if (!req->r_osd) {
				1475	list_move(&req->r_req_lru_item, &osdc->req_notarget);
				1476	goto out;
				1477	}
				1478
				1479	dout("map_request osd %p is osd%d\n", req->r_osd, o);
				1480	__insert_osd(osdc, req->r_osd);
				1481
				1482	ceph_con_open(&req->r_osd->o_con,
				1483	CEPH_ENTITY_TYPE_OSD, o,
				1484	&osdc->osdmap->osd_addr[o]);
				1485	}
				1486
				1487	__enqueue_request(req);
				1488	err = 1; /* osd or pg changed */
				1489
				1490	out:
				1491	return err;
				1492	}
				1493
				1494	/*
				1495	* caller should hold map_sem (for read) and request_mutex
				1496	*/
				1497	static void __send_request(struct ceph_osd_client *osdc,
				1498	struct ceph_osd_request *req)
				1499	{
				1500	void *p;
				1501
				1502	dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
				1503	req, req->r_tid, req->r_osd->o_osd, req->r_flags,
				1504	(unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
				1505
				1506	/* fill in message content that changes each time we send it */
				1507	put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
				1508	put_unaligned_le32(req->r_flags, req->r_request_flags);
				1509	put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
				1510	p = req->r_request_pgid;
				1511	ceph_encode_64(&p, req->r_pgid.pool);
				1512	ceph_encode_32(&p, req->r_pgid.seed);
				1513	put_unaligned_le64(1, req->r_request_attempts); /* FIXME */
				1514	memcpy(req->r_request_reassert_version, &req->r_reassert_version,
				1515	sizeof(req->r_reassert_version));
				1516
				1517	req->r_stamp = jiffies;
				1518	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
				1519
				1520	ceph_msg_get(req->r_request); /* send consumes a ref */
				1521
				1522	req->r_sent = req->r_osd->o_incarnation;
				1523
				1524	ceph_con_send(&req->r_osd->o_con, req->r_request);
				1525	}
				1526
				1527	/*
				1528	* Send any requests in the queue (req_unsent).
				1529	*/
				1530	static void __send_queued(struct ceph_osd_client *osdc)
				1531	{
				1532	struct ceph_osd_request req, tmp;
				1533
				1534	dout("__send_queued\n");
				1535	list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
				1536	__send_request(osdc, req);
				1537	}
				1538
				1539	/*
				1540	* Caller should hold map_sem for read and request_mutex.
				1541	*/
				1542	static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
				1543	struct ceph_osd_request *req,
				1544	bool nofail)
				1545	{
				1546	int rc;
				1547
				1548	__register_request(osdc, req);
				1549	req->r_sent = 0;
				1550	req->r_got_reply = 0;
				1551	rc = __map_request(osdc, req, 0);
				1552	if (rc < 0) {
				1553	if (nofail) {
				1554	dout("osdc_start_request failed map, "
				1555	" will retry %lld\n", req->r_tid);
				1556	rc = 0;
				1557	} else {
				1558	__unregister_request(osdc, req);
				1559	}
				1560	return rc;
				1561	}
				1562
				1563	if (req->r_osd == NULL) {
				1564	dout("send_request %p no up osds in pg\n", req);
				1565	ceph_monc_request_next_osdmap(&osdc->client->monc);
				1566	} else {
				1567	__send_queued(osdc);
				1568	}
				1569
				1570	return 0;
				1571	}
				1572
				1573	/*
				1574	* Timeout callback, called every N seconds when 1 or more osd
				1575	* requests has been active for more than N seconds. When this
				1576	* happens, we ping all OSDs with requests who have timed out to
				1577	* ensure any communications channel reset is detected. Reset the
				1578	* request timeouts another N seconds in the future as we go.
				1579	* Reschedule the timeout event another N seconds in future (unless
				1580	* there are no open requests).
				1581	*/
				1582	static void handle_timeout(struct work_struct *work)
				1583	{
				1584	struct ceph_osd_client *osdc =
				1585	container_of(work, struct ceph_osd_client, timeout_work.work);
				1586	struct ceph_options *opts = osdc->client->options;
				1587	struct ceph_osd_request *req;
				1588	struct ceph_osd *osd;
				1589	struct list_head slow_osds;
				1590	dout("timeout\n");
				1591	down_read(&osdc->map_sem);
				1592
				1593	ceph_monc_request_next_osdmap(&osdc->client->monc);
				1594
				1595	mutex_lock(&osdc->request_mutex);
				1596
				1597	/*
				1598	* ping osds that are a bit slow. this ensures that if there
				1599	* is a break in the TCP connection we will notice, and reopen
				1600	* a connection with that osd (from the fault callback).
				1601	*/
				1602	INIT_LIST_HEAD(&slow_osds);
				1603	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
				1604	if (time_before(jiffies,
				1605	req->r_stamp + opts->osd_keepalive_timeout))
				1606	break;
				1607
				1608	osd = req->r_osd;
				1609	BUG_ON(!osd);
				1610	dout(" tid %llu is slow, will send keepalive on osd%d\n",
				1611	req->r_tid, osd->o_osd);
				1612	list_move_tail(&osd->o_keepalive_item, &slow_osds);
				1613	}
				1614	while (!list_empty(&slow_osds)) {
				1615	osd = list_entry(slow_osds.next, struct ceph_osd,
				1616	o_keepalive_item);
				1617	list_del_init(&osd->o_keepalive_item);
				1618	ceph_con_keepalive(&osd->o_con);
				1619	}
				1620
				1621	__schedule_osd_timeout(osdc);
				1622	__send_queued(osdc);
				1623	mutex_unlock(&osdc->request_mutex);
				1624	up_read(&osdc->map_sem);
				1625	}
				1626
				1627	static void handle_osds_timeout(struct work_struct *work)
				1628	{
				1629	struct ceph_osd_client *osdc =
				1630	container_of(work, struct ceph_osd_client,
				1631	osds_timeout_work.work);
				1632	unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
				1633
				1634	dout("osds timeout\n");
				1635	down_read(&osdc->map_sem);
				1636	remove_old_osds(osdc);
				1637	up_read(&osdc->map_sem);
				1638
				1639	schedule_delayed_work(&osdc->osds_timeout_work,
				1640	round_jiffies_relative(delay));
				1641	}
				1642
				1643	static int ceph_oloc_decode(void *p, void end,
				1644	struct ceph_object_locator *oloc)
				1645	{
				1646	u8 struct_v, struct_cv;
				1647	u32 len;
				1648	void *struct_end;
				1649	int ret = 0;
				1650
				1651	ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
				1652	struct_v = ceph_decode_8(p);
				1653	struct_cv = ceph_decode_8(p);
				1654	if (struct_v < 3) {
				1655	pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
				1656	struct_v, struct_cv);
				1657	goto e_inval;
				1658	}
				1659	if (struct_cv > 6) {
				1660	pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
				1661	struct_v, struct_cv);
				1662	goto e_inval;
				1663	}
				1664	len = ceph_decode_32(p);
				1665	ceph_decode_need(p, end, len, e_inval);
				1666	struct_end = *p + len;
				1667
				1668	oloc->pool = ceph_decode_64(p);
				1669	p += 4; / skip preferred */
				1670
				1671	len = ceph_decode_32(p);
				1672	if (len > 0) {
				1673	pr_warn("ceph_object_locator::key is set\n");
				1674	goto e_inval;
				1675	}
				1676
				1677	if (struct_v >= 5) {
				1678	len = ceph_decode_32(p);
				1679	if (len > 0) {
				1680	pr_warn("ceph_object_locator::nspace is set\n");
				1681	goto e_inval;
				1682	}
				1683	}
				1684
				1685	if (struct_v >= 6) {
				1686	s64 hash = ceph_decode_64(p);
				1687	if (hash != -1) {
				1688	pr_warn("ceph_object_locator::hash is set\n");
				1689	goto e_inval;
				1690	}
				1691	}
				1692
				1693	/* skip the rest */
				1694	*p = struct_end;
				1695	out:
				1696	return ret;
				1697
				1698	e_inval:
				1699	ret = -EINVAL;
				1700	goto out;
				1701	}
				1702
				1703	static int ceph_redirect_decode(void *p, void end,
				1704	struct ceph_request_redirect *redir)
				1705	{
				1706	u8 struct_v, struct_cv;
				1707	u32 len;
				1708	void *struct_end;
				1709	int ret;
				1710
				1711	ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
				1712	struct_v = ceph_decode_8(p);
				1713	struct_cv = ceph_decode_8(p);
				1714	if (struct_cv > 1) {
				1715	pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
				1716	struct_v, struct_cv);
				1717	goto e_inval;
				1718	}
				1719	len = ceph_decode_32(p);
				1720	ceph_decode_need(p, end, len, e_inval);
				1721	struct_end = *p + len;
				1722
				1723	ret = ceph_oloc_decode(p, end, &redir->oloc);
				1724	if (ret)
				1725	goto out;
				1726
				1727	len = ceph_decode_32(p);
				1728	if (len > 0) {
				1729	pr_warn("ceph_request_redirect::object_name is set\n");
				1730	goto e_inval;
				1731	}
				1732
				1733	len = ceph_decode_32(p);
				1734	p += len; / skip osd_instructions */
				1735
				1736	/* skip the rest */
				1737	*p = struct_end;
				1738	out:
				1739	return ret;
				1740
				1741	e_inval:
				1742	ret = -EINVAL;
				1743	goto out;
				1744	}
				1745
				1746	static void complete_request(struct ceph_osd_request *req)
				1747	{
				1748	complete_all(&req->r_safe_completion); /* fsync waiter */
				1749	}
				1750
				1751	/*
				1752	* handle osd op reply. either call the callback if it is specified,
				1753	* or do the completion to wake up the waiting thread.
				1754	*/
				1755	static void handle_reply(struct ceph_osd_client osdc, struct ceph_msg msg)
				1756	{
				1757	void p, end;
				1758	struct ceph_osd_request *req;
				1759	struct ceph_request_redirect redir;
				1760	u64 tid;
				1761	int object_len;
				1762	unsigned int numops;
				1763	int payload_len, flags;
				1764	s32 result;
				1765	s32 retry_attempt;
				1766	struct ceph_pg pg;
				1767	int err;
				1768	u32 reassert_epoch;
				1769	u64 reassert_version;
				1770	u32 osdmap_epoch;
				1771	int already_completed;
				1772	u32 bytes;
				1773	unsigned int i;
				1774
				1775	tid = le64_to_cpu(msg->hdr.tid);
				1776	dout("handle_reply %p tid %llu\n", msg, tid);
				1777
				1778	p = msg->front.iov_base;
				1779	end = p + msg->front.iov_len;
				1780
				1781	ceph_decode_need(&p, end, 4, bad);
				1782	object_len = ceph_decode_32(&p);
				1783	ceph_decode_need(&p, end, object_len, bad);
				1784	p += object_len;
				1785
				1786	err = ceph_decode_pgid(&p, end, &pg);
				1787	if (err)
				1788	goto bad;
				1789
				1790	ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
				1791	flags = ceph_decode_64(&p);
				1792	result = ceph_decode_32(&p);
				1793	reassert_epoch = ceph_decode_32(&p);
				1794	reassert_version = ceph_decode_64(&p);
				1795	osdmap_epoch = ceph_decode_32(&p);
				1796
				1797	/* lookup */
				1798	down_read(&osdc->map_sem);
				1799	mutex_lock(&osdc->request_mutex);
				1800	req = __lookup_request(osdc, tid);
				1801	if (req == NULL) {
				1802	dout("handle_reply tid %llu dne\n", tid);
				1803	goto bad_mutex;
				1804	}
				1805	ceph_osdc_get_request(req);
				1806
				1807	dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
				1808	req, result);
				1809
				1810	ceph_decode_need(&p, end, 4, bad_put);
				1811	numops = ceph_decode_32(&p);
				1812	if (numops > CEPH_OSD_MAX_OP)
				1813	goto bad_put;
				1814	if (numops != req->r_num_ops)
				1815	goto bad_put;
				1816	payload_len = 0;
				1817	ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
				1818	for (i = 0; i < numops; i++) {
				1819	struct ceph_osd_op *op = p;
				1820	int len;
				1821
				1822	len = le32_to_cpu(op->payload_len);
				1823	req->r_reply_op_len[i] = len;
				1824	dout(" op %d has %d bytes\n", i, len);
				1825	payload_len += len;
				1826	p += sizeof(*op);
				1827	}
				1828	bytes = le32_to_cpu(msg->hdr.data_len);
				1829	if (payload_len != bytes) {
				1830	pr_warn("sum of op payload lens %d != data_len %d\n",
				1831	payload_len, bytes);
				1832	goto bad_put;
				1833	}
				1834
				1835	ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
				1836	retry_attempt = ceph_decode_32(&p);
				1837	for (i = 0; i < numops; i++)
				1838	req->r_reply_op_result[i] = ceph_decode_32(&p);
				1839
				1840	if (le16_to_cpu(msg->hdr.version) >= 6) {
				1841	p += 8 + 4; /* skip replay_version */
				1842	p += 8; /* skip user_version */
				1843
				1844	err = ceph_redirect_decode(&p, end, &redir);
				1845	if (err)
				1846	goto bad_put;
				1847	} else {
				1848	redir.oloc.pool = -1;
				1849	}
				1850
				1851	if (redir.oloc.pool != -1) {
				1852	dout("redirect pool %lld\n", redir.oloc.pool);
				1853
				1854	__unregister_request(osdc, req);
				1855
				1856	req->r_target_oloc = redir.oloc; /* struct */
				1857
				1858	/*
				1859	* Start redirect requests with nofail=true. If
				1860	* mapping fails, request will end up on the notarget
				1861	* list, waiting for the new osdmap (which can take
				1862	* a while), even though the original request mapped
				1863	* successfully. In the future we might want to follow
				1864	* original request's nofail setting here.
				1865	*/
				1866	err = __ceph_osdc_start_request(osdc, req, true);
				1867	BUG_ON(err);
				1868
				1869	goto out_unlock;
				1870	}
				1871
				1872	already_completed = req->r_got_reply;
				1873	if (!req->r_got_reply) {
				1874	req->r_result = result;
				1875	dout("handle_reply result %d bytes %d\n", req->r_result,
				1876	bytes);
				1877	if (req->r_result == 0)
				1878	req->r_result = bytes;
				1879
				1880	/* in case this is a write and we need to replay, */
				1881	req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
				1882	req->r_reassert_version.version = cpu_to_le64(reassert_version);
				1883
				1884	req->r_got_reply = 1;
				1885	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
				1886	dout("handle_reply tid %llu dup ack\n", tid);
				1887	goto out_unlock;
				1888	}
				1889
				1890	dout("handle_reply tid %llu flags %d\n", tid, flags);
				1891
				1892	if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
				1893	__register_linger_request(osdc, req);
				1894
				1895	/* either this is a read, or we got the safe response */
				1896	if (result < 0 \|\|
				1897	(flags & CEPH_OSD_FLAG_ONDISK) \|\|
				1898	((flags & CEPH_OSD_FLAG_WRITE) == 0))
				1899	__unregister_request(osdc, req);
				1900
				1901	mutex_unlock(&osdc->request_mutex);
				1902	up_read(&osdc->map_sem);
				1903
				1904	if (!already_completed) {
				1905	if (req->r_unsafe_callback &&
				1906	result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
				1907	req->r_unsafe_callback(req, true);
				1908	if (req->r_callback)
				1909	req->r_callback(req, msg);
				1910	else
				1911	complete_all(&req->r_completion);
				1912	}
				1913
				1914	if (flags & CEPH_OSD_FLAG_ONDISK) {
				1915	if (req->r_unsafe_callback && already_completed)
				1916	req->r_unsafe_callback(req, false);
				1917	complete_request(req);
				1918	}
				1919
				1920	out:
				1921	dout("req=%p req->r_linger=%d\n", req, req->r_linger);
				1922	ceph_osdc_put_request(req);
				1923	return;
				1924	out_unlock:
				1925	mutex_unlock(&osdc->request_mutex);
				1926	up_read(&osdc->map_sem);
				1927	goto out;
				1928
				1929	bad_put:
				1930	req->r_result = -EIO;
				1931	__unregister_request(osdc, req);
				1932	if (req->r_callback)
				1933	req->r_callback(req, msg);
				1934	else
				1935	complete_all(&req->r_completion);
				1936	complete_request(req);
				1937	ceph_osdc_put_request(req);
				1938	bad_mutex:
				1939	mutex_unlock(&osdc->request_mutex);
				1940	up_read(&osdc->map_sem);
				1941	bad:
				1942	pr_err("corrupt osd_op_reply got %d %d\n",
				1943	(int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
				1944	ceph_msg_dump(msg);
				1945	}
				1946
				1947	static void reset_changed_osds(struct ceph_osd_client *osdc)
				1948	{
				1949	struct rb_node p, n;
				1950
				1951	dout("%s %p\n", __func__, osdc);
				1952	for (p = rb_first(&osdc->osds); p; p = n) {
				1953	struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
				1954
				1955	n = rb_next(p);
				1956	if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) \|\|
				1957	memcmp(&osd->o_con.peer_addr,
				1958	ceph_osd_addr(osdc->osdmap,
				1959	osd->o_osd),
				1960	sizeof(struct ceph_entity_addr)) != 0)
				1961	__reset_osd(osdc, osd);
				1962	}
				1963	}
				1964
				1965	/*
				1966	* Requeue requests whose mapping to an OSD has changed. If requests map to
				1967	* no osd, request a new map.
				1968	*
				1969	* Caller should hold map_sem for read.
				1970	*/
				1971	static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
				1972	bool force_resend_writes)
				1973	{
				1974	struct ceph_osd_request req, nreq;
				1975	struct rb_node *p;
				1976	int needmap = 0;
				1977	int err;
				1978	bool force_resend_req;
				1979
				1980	dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
				1981	force_resend_writes ? " (force resend writes)" : "");
				1982	mutex_lock(&osdc->request_mutex);
				1983	for (p = rb_first(&osdc->requests); p; ) {
				1984	req = rb_entry(p, struct ceph_osd_request, r_node);
				1985	p = rb_next(p);
				1986
				1987	/*
				1988	* For linger requests that have not yet been
				1989	* registered, move them to the linger list; they'll
				1990	* be sent to the osd in the loop below. Unregister
				1991	* the request before re-registering it as a linger
				1992	* request to ensure the __map_request() below
				1993	* will decide it needs to be sent.
				1994	*/
				1995	if (req->r_linger && list_empty(&req->r_linger_item)) {
				1996	dout("%p tid %llu restart on osd%d\n",
				1997	req, req->r_tid,
				1998	req->r_osd ? req->r_osd->o_osd : -1);
				1999	ceph_osdc_get_request(req);
				2000	__unregister_request(osdc, req);
				2001	__register_linger_request(osdc, req);
				2002	ceph_osdc_put_request(req);
				2003	continue;
				2004	}
				2005
				2006	force_resend_req = force_resend \|\|
				2007	(force_resend_writes &&
				2008	req->r_flags & CEPH_OSD_FLAG_WRITE);
				2009	err = __map_request(osdc, req, force_resend_req);
				2010	if (err < 0)
				2011	continue; /* error */
				2012	if (req->r_osd == NULL) {
				2013	dout("%p tid %llu maps to no osd\n", req, req->r_tid);
				2014	needmap++; /* request a newer map */
				2015	} else if (err > 0) {
				2016	if (!req->r_linger) {
				2017	dout("%p tid %llu requeued on osd%d\n", req,
				2018	req->r_tid,
				2019	req->r_osd ? req->r_osd->o_osd : -1);
				2020	req->r_flags \|= CEPH_OSD_FLAG_RETRY;
				2021	}
				2022	}
				2023	}
				2024
				2025	list_for_each_entry_safe(req, nreq, &osdc->req_linger,
				2026	r_linger_item) {
				2027	dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
				2028
				2029	err = __map_request(osdc, req,
				2030	force_resend \|\| force_resend_writes);
				2031	dout("__map_request returned %d\n", err);
				2032	if (err < 0)
				2033	continue; /* hrm! */
				2034	if (req->r_osd == NULL \|\| err > 0) {
				2035	if (req->r_osd == NULL) {
				2036	dout("lingering %p tid %llu maps to no osd\n",
				2037	req, req->r_tid);
				2038	/*
				2039	* A homeless lingering request makes
				2040	* no sense, as it's job is to keep
				2041	* a particular OSD connection open.
				2042	* Request a newer map and kick the
				2043	* request, knowing that it won't be
				2044	* resent until we actually get a map
				2045	* that can tell us where to send it.
				2046	*/
				2047	needmap++;
				2048	}
				2049
				2050	dout("kicking lingering %p tid %llu osd%d\n", req,
				2051	req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
				2052	__register_request(osdc, req);
				2053	__unregister_linger_request(osdc, req);
				2054	}
				2055	}
				2056	reset_changed_osds(osdc);
				2057	mutex_unlock(&osdc->request_mutex);
				2058
				2059	if (needmap) {
				2060	dout("%d requests for down osds, need new map\n", needmap);
				2061	ceph_monc_request_next_osdmap(&osdc->client->monc);
				2062	}
				2063	}
				2064
				2065
				2066	/*
				2067	* Process updated osd map.
				2068	*
				2069	* The message contains any number of incremental and full maps, normally
				2070	* indicating some sort of topology change in the cluster. Kick requests
				2071	* off to different OSDs as needed.
				2072	*/
				2073	void ceph_osdc_handle_map(struct ceph_osd_client osdc, struct ceph_msg msg)
				2074	{
				2075	void p, end, *next;
				2076	u32 nr_maps, maplen;
				2077	u32 epoch;
				2078	struct ceph_osdmap newmap = NULL, oldmap;
				2079	int err;
				2080	struct ceph_fsid fsid;
				2081	bool was_full;
				2082
				2083	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
				2084	p = msg->front.iov_base;
				2085	end = p + msg->front.iov_len;
				2086
				2087	/* verify fsid */
				2088	ceph_decode_need(&p, end, sizeof(fsid), bad);
				2089	ceph_decode_copy(&p, &fsid, sizeof(fsid));
				2090	if (ceph_check_fsid(osdc->client, &fsid) < 0)
				2091	return;
				2092
				2093	down_write(&osdc->map_sem);
				2094
				2095	was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
				2096
				2097	/* incremental maps */
				2098	ceph_decode_32_safe(&p, end, nr_maps, bad);
				2099	dout(" %d inc maps\n", nr_maps);
				2100	while (nr_maps > 0) {
				2101	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
				2102	epoch = ceph_decode_32(&p);
				2103	maplen = ceph_decode_32(&p);
				2104	ceph_decode_need(&p, end, maplen, bad);
				2105	next = p + maplen;
				2106	if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
				2107	dout("applying incremental map %u len %d\n",
				2108	epoch, maplen);
				2109	newmap = osdmap_apply_incremental(&p, next,
				2110	osdc->osdmap,
				2111	&osdc->client->msgr);
				2112	if (IS_ERR(newmap)) {
				2113	err = PTR_ERR(newmap);
				2114	goto bad;
				2115	}
				2116	BUG_ON(!newmap);
				2117	if (newmap != osdc->osdmap) {
				2118	ceph_osdmap_destroy(osdc->osdmap);
				2119	osdc->osdmap = newmap;
				2120	}
				2121	was_full = was_full \|\|
				2122	ceph_osdmap_flag(osdc->osdmap,
				2123	CEPH_OSDMAP_FULL);
				2124	kick_requests(osdc, 0, was_full);
				2125	} else {
				2126	dout("ignoring incremental map %u len %d\n",
				2127	epoch, maplen);
				2128	}
				2129	p = next;
				2130	nr_maps--;
				2131	}
				2132	if (newmap)
				2133	goto done;
				2134
				2135	/* full maps */
				2136	ceph_decode_32_safe(&p, end, nr_maps, bad);
				2137	dout(" %d full maps\n", nr_maps);
				2138	while (nr_maps) {
				2139	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
				2140	epoch = ceph_decode_32(&p);
				2141	maplen = ceph_decode_32(&p);
				2142	ceph_decode_need(&p, end, maplen, bad);
				2143	if (nr_maps > 1) {
				2144	dout("skipping non-latest full map %u len %d\n",
				2145	epoch, maplen);
				2146	} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
				2147	dout("skipping full map %u len %d, "
				2148	"older than our %u\n", epoch, maplen,
				2149	osdc->osdmap->epoch);
				2150	} else {
				2151	int skipped_map = 0;
				2152
				2153	dout("taking full map %u len %d\n", epoch, maplen);
				2154	newmap = ceph_osdmap_decode(&p, p+maplen);
				2155	if (IS_ERR(newmap)) {
				2156	err = PTR_ERR(newmap);
				2157	goto bad;
				2158	}
				2159	BUG_ON(!newmap);
				2160	oldmap = osdc->osdmap;
				2161	osdc->osdmap = newmap;
				2162	if (oldmap) {
				2163	if (oldmap->epoch + 1 < newmap->epoch)
				2164	skipped_map = 1;
				2165	ceph_osdmap_destroy(oldmap);
				2166	}
				2167	was_full = was_full \|\|
				2168	ceph_osdmap_flag(osdc->osdmap,
				2169	CEPH_OSDMAP_FULL);
				2170	kick_requests(osdc, skipped_map, was_full);
				2171	}
				2172	p += maplen;
				2173	nr_maps--;
				2174	}
				2175
				2176	if (!osdc->osdmap)
				2177	goto bad;
				2178	done:
				2179	downgrade_write(&osdc->map_sem);
				2180	ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
				2181
				2182	/*
				2183	* subscribe to subsequent osdmap updates if full to ensure
				2184	* we find out when we are no longer full and stop returning
				2185	* ENOSPC.
				2186	*/
				2187	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) \|\|
				2188	ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) \|\|
				2189	ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
				2190	ceph_monc_request_next_osdmap(&osdc->client->monc);
				2191
				2192	mutex_lock(&osdc->request_mutex);
				2193	__send_queued(osdc);
				2194	mutex_unlock(&osdc->request_mutex);
				2195	up_read(&osdc->map_sem);
				2196	wake_up_all(&osdc->client->auth_wq);
				2197	return;
				2198
				2199	bad:
				2200	pr_err("osdc handle_map corrupt msg\n");
				2201	ceph_msg_dump(msg);
				2202	up_write(&osdc->map_sem);
				2203	}
				2204
				2205	/*
				2206	* watch/notify callback event infrastructure
				2207	*
				2208	* These callbacks are used both for watch and notify operations.
				2209	*/
				2210	static void __release_event(struct kref *kref)
				2211	{
				2212	struct ceph_osd_event *event =
				2213	container_of(kref, struct ceph_osd_event, kref);
				2214
				2215	dout("__release_event %p\n", event);
				2216	kfree(event);
				2217	}
				2218
				2219	static void get_event(struct ceph_osd_event *event)
				2220	{
				2221	kref_get(&event->kref);
				2222	}
				2223
				2224	void ceph_osdc_put_event(struct ceph_osd_event *event)
				2225	{
				2226	kref_put(&event->kref, __release_event);
				2227	}
				2228	EXPORT_SYMBOL(ceph_osdc_put_event);
				2229
				2230	static void __insert_event(struct ceph_osd_client *osdc,
				2231	struct ceph_osd_event *new)
				2232	{
				2233	struct rb_node **p = &osdc->event_tree.rb_node;
				2234	struct rb_node *parent = NULL;
				2235	struct ceph_osd_event *event = NULL;
				2236
				2237	while (*p) {
				2238	parent = *p;
				2239	event = rb_entry(parent, struct ceph_osd_event, node);
				2240	if (new->cookie < event->cookie)
				2241	p = &(*p)->rb_left;
				2242	else if (new->cookie > event->cookie)
				2243	p = &(*p)->rb_right;
				2244	else
				2245	BUG();
				2246	}
				2247
				2248	rb_link_node(&new->node, parent, p);
				2249	rb_insert_color(&new->node, &osdc->event_tree);
				2250	}
				2251
				2252	static struct ceph_osd_event __find_event(struct ceph_osd_client osdc,
				2253	u64 cookie)
				2254	{
				2255	struct rb_node **p = &osdc->event_tree.rb_node;
				2256	struct rb_node *parent = NULL;
				2257	struct ceph_osd_event *event = NULL;
				2258
				2259	while (*p) {
				2260	parent = *p;
				2261	event = rb_entry(parent, struct ceph_osd_event, node);
				2262	if (cookie < event->cookie)
				2263	p = &(*p)->rb_left;
				2264	else if (cookie > event->cookie)
				2265	p = &(*p)->rb_right;
				2266	else
				2267	return event;
				2268	}
				2269	return NULL;
				2270	}
				2271
				2272	static void __remove_event(struct ceph_osd_event *event)
				2273	{
				2274	struct ceph_osd_client *osdc = event->osdc;
				2275
				2276	if (!RB_EMPTY_NODE(&event->node)) {
				2277	dout("__remove_event removed %p\n", event);
				2278	rb_erase(&event->node, &osdc->event_tree);
				2279	ceph_osdc_put_event(event);
				2280	} else {
				2281	dout("__remove_event didn't remove %p\n", event);
				2282	}
				2283	}
				2284
				2285	int ceph_osdc_create_event(struct ceph_osd_client *osdc,
				2286	void (event_cb)(u64, u64, u8, void ),
				2287	void data, struct ceph_osd_event *pevent)
				2288	{
				2289	struct ceph_osd_event *event;
				2290
				2291	event = kmalloc(sizeof(*event), GFP_NOIO);
				2292	if (!event)
				2293	return -ENOMEM;
				2294
				2295	dout("create_event %p\n", event);
				2296	event->cb = event_cb;
				2297	event->one_shot = 0;
				2298	event->data = data;
				2299	event->osdc = osdc;
				2300	INIT_LIST_HEAD(&event->osd_node);
				2301	RB_CLEAR_NODE(&event->node);
				2302	kref_init(&event->kref); /* one ref for us */
				2303	kref_get(&event->kref); /* one ref for the caller */
				2304
				2305	spin_lock(&osdc->event_lock);
				2306	event->cookie = ++osdc->event_count;
				2307	__insert_event(osdc, event);
				2308	spin_unlock(&osdc->event_lock);
				2309
				2310	*pevent = event;
				2311	return 0;
				2312	}
				2313	EXPORT_SYMBOL(ceph_osdc_create_event);
				2314
				2315	void ceph_osdc_cancel_event(struct ceph_osd_event *event)
				2316	{
				2317	struct ceph_osd_client *osdc = event->osdc;
				2318
				2319	dout("cancel_event %p\n", event);
				2320	spin_lock(&osdc->event_lock);
				2321	__remove_event(event);
				2322	spin_unlock(&osdc->event_lock);
				2323	ceph_osdc_put_event(event); /* caller's */
				2324	}
				2325	EXPORT_SYMBOL(ceph_osdc_cancel_event);
				2326
				2327
				2328	static void do_event_work(struct work_struct *work)
				2329	{
				2330	struct ceph_osd_event_work *event_work =
				2331	container_of(work, struct ceph_osd_event_work, work);
				2332	struct ceph_osd_event *event = event_work->event;
				2333	u64 ver = event_work->ver;
				2334	u64 notify_id = event_work->notify_id;
				2335	u8 opcode = event_work->opcode;
				2336
				2337	dout("do_event_work completing %p\n", event);
				2338	event->cb(ver, notify_id, opcode, event->data);
				2339	dout("do_event_work completed %p\n", event);
				2340	ceph_osdc_put_event(event);
				2341	kfree(event_work);
				2342	}
				2343
				2344
				2345	/*
				2346	* Process osd watch notifications
				2347	*/
				2348	static void handle_watch_notify(struct ceph_osd_client *osdc,
				2349	struct ceph_msg *msg)
				2350	{
				2351	void p, end;
				2352	u8 proto_ver;
				2353	u64 cookie, ver, notify_id;
				2354	u8 opcode;
				2355	struct ceph_osd_event *event;
				2356	struct ceph_osd_event_work *event_work;
				2357
				2358	p = msg->front.iov_base;
				2359	end = p + msg->front.iov_len;
				2360
				2361	ceph_decode_8_safe(&p, end, proto_ver, bad);
				2362	ceph_decode_8_safe(&p, end, opcode, bad);
				2363	ceph_decode_64_safe(&p, end, cookie, bad);
				2364	ceph_decode_64_safe(&p, end, ver, bad);
				2365	ceph_decode_64_safe(&p, end, notify_id, bad);
				2366
				2367	spin_lock(&osdc->event_lock);
				2368	event = __find_event(osdc, cookie);
				2369	if (event) {
				2370	BUG_ON(event->one_shot);
				2371	get_event(event);
				2372	}
				2373	spin_unlock(&osdc->event_lock);
				2374	dout("handle_watch_notify cookie %lld ver %lld event %p\n",
				2375	cookie, ver, event);
				2376	if (event) {
				2377	event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
				2378	if (!event_work) {
				2379	pr_err("couldn't allocate event_work\n");
				2380	ceph_osdc_put_event(event);
				2381	return;
				2382	}
				2383	INIT_WORK(&event_work->work, do_event_work);
				2384	event_work->event = event;
				2385	event_work->ver = ver;
				2386	event_work->notify_id = notify_id;
				2387	event_work->opcode = opcode;
				2388
				2389	queue_work(osdc->notify_wq, &event_work->work);
				2390	}
				2391
				2392	return;
				2393
				2394	bad:
				2395	pr_err("osdc handle_watch_notify corrupt msg\n");
				2396	}
				2397
				2398	/*
				2399	* build new request AND message
				2400	*
				2401	*/
				2402	void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
				2403	struct ceph_snap_context *snapc, u64 snap_id,
				2404	struct timespec *mtime)
				2405	{
				2406	struct ceph_msg *msg = req->r_request;
				2407	void *p;
				2408	size_t msg_size;
				2409	int flags = req->r_flags;
				2410	u64 data_len;
				2411	unsigned int i;
				2412
				2413	req->r_snapid = snap_id;
				2414	req->r_snapc = ceph_get_snap_context(snapc);
				2415
				2416	/* encode request */
				2417	msg->hdr.version = cpu_to_le16(4);
				2418
				2419	p = msg->front.iov_base;
				2420	ceph_encode_32(&p, 1); /* client_inc is always 1 */
				2421	req->r_request_osdmap_epoch = p;
				2422	p += 4;
				2423	req->r_request_flags = p;
				2424	p += 4;
				2425	if (req->r_flags & CEPH_OSD_FLAG_WRITE)
				2426	ceph_encode_timespec(p, mtime);
				2427	p += sizeof(struct ceph_timespec);
				2428	req->r_request_reassert_version = p;
				2429	p += sizeof(struct ceph_eversion); /* will get filled in */
				2430
				2431	/* oloc */
				2432	ceph_encode_8(&p, 4);
				2433	ceph_encode_8(&p, 4);
				2434	ceph_encode_32(&p, 8 + 4 + 4);
				2435	req->r_request_pool = p;
				2436	p += 8;
				2437	ceph_encode_32(&p, -1); /* preferred */
				2438	ceph_encode_32(&p, 0); /* key len */
				2439
				2440	ceph_encode_8(&p, 1);
				2441	req->r_request_pgid = p;
				2442	p += 8 + 4;
				2443	ceph_encode_32(&p, -1); /* preferred */
				2444
				2445	/* oid */
				2446	ceph_encode_32(&p, req->r_base_oid.name_len);
				2447	memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
				2448	dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
				2449	req->r_base_oid.name, req->r_base_oid.name_len);
				2450	p += req->r_base_oid.name_len;
				2451
				2452	/* ops--can imply data */
				2453	ceph_encode_16(&p, (u16)req->r_num_ops);
				2454	data_len = 0;
				2455	for (i = 0; i < req->r_num_ops; i++) {
				2456	data_len += osd_req_encode_op(req, p, i);
				2457	p += sizeof(struct ceph_osd_op);
				2458	}
				2459
				2460	/* snaps */
				2461	ceph_encode_64(&p, req->r_snapid);
				2462	ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
				2463	ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
				2464	if (req->r_snapc) {
				2465	for (i = 0; i < snapc->num_snaps; i++) {
				2466	ceph_encode_64(&p, req->r_snapc->snaps[i]);
				2467	}
				2468	}
				2469
				2470	req->r_request_attempts = p;
				2471	p += 4;
				2472
				2473	/* data */
				2474	if (flags & CEPH_OSD_FLAG_WRITE) {
				2475	u16 data_off;
				2476
				2477	/*
				2478	* The header "data_off" is a hint to the receiver
				2479	* allowing it to align received data into its
				2480	* buffers such that there's no need to re-copy
				2481	* it before writing it to disk (direct I/O).
				2482	*/
				2483	data_off = (u16) (off & 0xffff);
				2484	req->r_request->hdr.data_off = cpu_to_le16(data_off);
				2485	}
				2486	req->r_request->hdr.data_len = cpu_to_le32(data_len);
				2487
				2488	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
				2489	msg_size = p - msg->front.iov_base;
				2490	msg->front.iov_len = msg_size;
				2491	msg->hdr.front_len = cpu_to_le32(msg_size);
				2492
				2493	dout("build_request msg_size was %d\n", (int)msg_size);
				2494	}
				2495	EXPORT_SYMBOL(ceph_osdc_build_request);
				2496
				2497	/*
				2498	* Register request, send initial attempt.
				2499	*/
				2500	int ceph_osdc_start_request(struct ceph_osd_client *osdc,
				2501	struct ceph_osd_request *req,
				2502	bool nofail)
				2503	{
				2504	int rc;
				2505
				2506	down_read(&osdc->map_sem);
				2507	mutex_lock(&osdc->request_mutex);
				2508
				2509	rc = __ceph_osdc_start_request(osdc, req, nofail);
				2510
				2511	mutex_unlock(&osdc->request_mutex);
				2512	up_read(&osdc->map_sem);
				2513
				2514	return rc;
				2515	}
				2516	EXPORT_SYMBOL(ceph_osdc_start_request);
				2517
				2518	/*
				2519	* Unregister a registered request. The request is not completed (i.e.
				2520	* no callbacks or wakeups) - higher layers are supposed to know what
				2521	* they are canceling.
				2522	*/
				2523	void ceph_osdc_cancel_request(struct ceph_osd_request *req)
				2524	{
				2525	struct ceph_osd_client *osdc = req->r_osdc;
				2526
				2527	mutex_lock(&osdc->request_mutex);
				2528	if (req->r_linger)
				2529	__unregister_linger_request(osdc, req);
				2530	__unregister_request(osdc, req);
				2531	mutex_unlock(&osdc->request_mutex);
				2532
				2533	dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
				2534	}
				2535	EXPORT_SYMBOL(ceph_osdc_cancel_request);
				2536
				2537	/*
				2538	* wait for a request to complete
				2539	*/
				2540	int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
				2541	struct ceph_osd_request *req)
				2542	{
				2543	int rc;
				2544
				2545	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
				2546
				2547	rc = wait_for_completion_interruptible(&req->r_completion);
				2548	if (rc < 0) {
				2549	dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
				2550	ceph_osdc_cancel_request(req);
				2551	complete_request(req);
				2552	return rc;
				2553	}
				2554
				2555	dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
				2556	req->r_result);
				2557	return req->r_result;
				2558	}
				2559	EXPORT_SYMBOL(ceph_osdc_wait_request);
				2560
				2561	/*
				2562	* sync - wait for all in-flight requests to flush. avoid starvation.
				2563	*/
				2564	void ceph_osdc_sync(struct ceph_osd_client *osdc)
				2565	{
				2566	struct ceph_osd_request *req;
				2567	u64 last_tid, next_tid = 0;
				2568
				2569	mutex_lock(&osdc->request_mutex);
				2570	last_tid = osdc->last_tid;
				2571	while (1) {
				2572	req = __lookup_request_ge(osdc, next_tid);
				2573	if (!req)
				2574	break;
				2575	if (req->r_tid > last_tid)
				2576	break;
				2577
				2578	next_tid = req->r_tid + 1;
				2579	if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
				2580	continue;
				2581
				2582	ceph_osdc_get_request(req);
				2583	mutex_unlock(&osdc->request_mutex);
				2584	dout("sync waiting on tid %llu (last is %llu)\n",
				2585	req->r_tid, last_tid);
				2586	wait_for_completion(&req->r_safe_completion);
				2587	mutex_lock(&osdc->request_mutex);
				2588	ceph_osdc_put_request(req);
				2589	}
				2590	mutex_unlock(&osdc->request_mutex);
				2591	dout("sync done (thru tid %llu)\n", last_tid);
				2592	}
				2593	EXPORT_SYMBOL(ceph_osdc_sync);
				2594
				2595	/*
				2596	* Call all pending notify callbacks - for use after a watch is
				2597	* unregistered, to make sure no more callbacks for it will be invoked
				2598	*/
				2599	void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
				2600	{
				2601	flush_workqueue(osdc->notify_wq);
				2602	}
				2603	EXPORT_SYMBOL(ceph_osdc_flush_notifies);
				2604
				2605
				2606	/*
				2607	* init, shutdown
				2608	*/
				2609	int ceph_osdc_init(struct ceph_osd_client osdc, struct ceph_client client)
				2610	{
				2611	int err;
				2612
				2613	dout("init\n");
				2614	osdc->client = client;
				2615	osdc->osdmap = NULL;
				2616	init_rwsem(&osdc->map_sem);
				2617	init_completion(&osdc->map_waiters);
				2618	osdc->last_requested_map = 0;
				2619	mutex_init(&osdc->request_mutex);
				2620	osdc->last_tid = 0;
				2621	osdc->osds = RB_ROOT;
				2622	INIT_LIST_HEAD(&osdc->osd_lru);
				2623	osdc->requests = RB_ROOT;
				2624	INIT_LIST_HEAD(&osdc->req_lru);
				2625	INIT_LIST_HEAD(&osdc->req_unsent);
				2626	INIT_LIST_HEAD(&osdc->req_notarget);
				2627	INIT_LIST_HEAD(&osdc->req_linger);
				2628	osdc->num_requests = 0;
				2629	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
				2630	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
				2631	spin_lock_init(&osdc->event_lock);
				2632	osdc->event_tree = RB_ROOT;
				2633	osdc->event_count = 0;
				2634
				2635	schedule_delayed_work(&osdc->osds_timeout_work,
				2636	round_jiffies_relative(osdc->client->options->osd_idle_ttl));
				2637
				2638	err = -ENOMEM;
				2639	osdc->req_mempool = mempool_create_kmalloc_pool(10,
				2640	sizeof(struct ceph_osd_request));
				2641	if (!osdc->req_mempool)
				2642	goto out;
				2643
				2644	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
				2645	OSD_OP_FRONT_LEN, 10, true,
				2646	"osd_op");
				2647	if (err < 0)
				2648	goto out_mempool;
				2649	err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
				2650	OSD_OPREPLY_FRONT_LEN, 10, true,
				2651	"osd_op_reply");
				2652	if (err < 0)
				2653	goto out_msgpool;
				2654
				2655	err = -ENOMEM;
				2656	osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
				2657	if (!osdc->notify_wq)
				2658	goto out_msgpool_reply;
				2659
				2660	return 0;
				2661
				2662	out_msgpool_reply:
				2663	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
				2664	out_msgpool:
				2665	ceph_msgpool_destroy(&osdc->msgpool_op);
				2666	out_mempool:
				2667	mempool_destroy(osdc->req_mempool);
				2668	out:
				2669	return err;
				2670	}
				2671
				2672	void ceph_osdc_stop(struct ceph_osd_client *osdc)
				2673	{
				2674	flush_workqueue(osdc->notify_wq);
				2675	destroy_workqueue(osdc->notify_wq);
				2676	cancel_delayed_work_sync(&osdc->timeout_work);
				2677	cancel_delayed_work_sync(&osdc->osds_timeout_work);
				2678	if (osdc->osdmap) {
				2679	ceph_osdmap_destroy(osdc->osdmap);
				2680	osdc->osdmap = NULL;
				2681	}
				2682	remove_all_osds(osdc);
				2683	mempool_destroy(osdc->req_mempool);
				2684	ceph_msgpool_destroy(&osdc->msgpool_op);
				2685	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
				2686	}
				2687
				2688	/*
				2689	* Read some contiguous pages. If we cross a stripe boundary, shorten
				2690	* *plen. Return number of bytes read, or error.
				2691	*/
				2692	int ceph_osdc_readpages(struct ceph_osd_client *osdc,
				2693	struct ceph_vino vino, struct ceph_file_layout *layout,
				2694	u64 off, u64 *plen,
				2695	u32 truncate_seq, u64 truncate_size,
				2696	struct page **pages, int num_pages, int page_align)
				2697	{
				2698	struct ceph_osd_request *req;
				2699	int rc = 0;
				2700
				2701	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
				2702	vino.snap, off, *plen);
				2703	req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
				2704	CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				2705	NULL, truncate_seq, truncate_size,
				2706	false);
				2707	if (IS_ERR(req))
				2708	return PTR_ERR(req);
				2709
				2710	/* it may be a short read due to an object boundary */
				2711
				2712	osd_req_op_extent_osd_data_pages(req, 0,
				2713	pages, *plen, page_align, false, false);
				2714
				2715	dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
				2716	off, plen, plen, page_align);
				2717
				2718	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
				2719
				2720	rc = ceph_osdc_start_request(osdc, req, false);
				2721	if (!rc)
				2722	rc = ceph_osdc_wait_request(osdc, req);
				2723
				2724	ceph_osdc_put_request(req);
				2725	dout("readpages result %d\n", rc);
				2726	return rc;
				2727	}
				2728	EXPORT_SYMBOL(ceph_osdc_readpages);
				2729
				2730	/*
				2731	* do a synchronous write on N pages
				2732	*/
				2733	int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
				2734	struct ceph_file_layout *layout,
				2735	struct ceph_snap_context *snapc,
				2736	u64 off, u64 len,
				2737	u32 truncate_seq, u64 truncate_size,
				2738	struct timespec *mtime,
				2739	struct page **pages, int num_pages)
				2740	{
				2741	struct ceph_osd_request *req;
				2742	int rc = 0;
				2743	int page_align = off & ~PAGE_MASK;
				2744
				2745	BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */
				2746	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
				2747	CEPH_OSD_OP_WRITE,
				2748	CEPH_OSD_FLAG_ONDISK \| CEPH_OSD_FLAG_WRITE,
				2749	snapc, truncate_seq, truncate_size,
				2750	true);
				2751	if (IS_ERR(req))
				2752	return PTR_ERR(req);
				2753
				2754	/* it may be a short write due to an object boundary */
				2755	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
				2756	false, false);
				2757	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
				2758
				2759	ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
				2760
				2761	rc = ceph_osdc_start_request(osdc, req, true);
				2762	if (!rc)
				2763	rc = ceph_osdc_wait_request(osdc, req);
				2764
				2765	ceph_osdc_put_request(req);
				2766	if (rc == 0)
				2767	rc = len;
				2768	dout("writepages result %d\n", rc);
				2769	return rc;
				2770	}
				2771	EXPORT_SYMBOL(ceph_osdc_writepages);
				2772
				2773	int ceph_osdc_setup(void)
				2774	{
				2775	BUG_ON(ceph_osd_request_cache);
				2776	ceph_osd_request_cache = kmem_cache_create("ceph_osd_request",
				2777	sizeof (struct ceph_osd_request),
				2778	__alignof__(struct ceph_osd_request),
				2779	0, NULL);
				2780
				2781	return ceph_osd_request_cache ? 0 : -ENOMEM;
				2782	}
				2783	EXPORT_SYMBOL(ceph_osdc_setup);
				2784
				2785	void ceph_osdc_cleanup(void)
				2786	{
				2787	BUG_ON(!ceph_osd_request_cache);
				2788	kmem_cache_destroy(ceph_osd_request_cache);
				2789	ceph_osd_request_cache = NULL;
				2790	}
				2791	EXPORT_SYMBOL(ceph_osdc_cleanup);
				2792
				2793	/*
				2794	* handle incoming message
				2795	*/
				2796	static void dispatch(struct ceph_connection con, struct ceph_msg msg)
				2797	{
				2798	struct ceph_osd *osd = con->private;
				2799	struct ceph_osd_client *osdc;
				2800	int type = le16_to_cpu(msg->hdr.type);
				2801
				2802	if (!osd)
				2803	goto out;
				2804	osdc = osd->o_osdc;
				2805
				2806	switch (type) {
				2807	case CEPH_MSG_OSD_MAP:
				2808	ceph_osdc_handle_map(osdc, msg);
				2809	break;
				2810	case CEPH_MSG_OSD_OPREPLY:
				2811	handle_reply(osdc, msg);
				2812	break;
				2813	case CEPH_MSG_WATCH_NOTIFY:
				2814	handle_watch_notify(osdc, msg);
				2815	break;
				2816
				2817	default:
				2818	pr_err("received unknown message type %d %s\n", type,
				2819	ceph_msg_type_name(type));
				2820	}
				2821	out:
				2822	ceph_msg_put(msg);
				2823	}
				2824
				2825	/*
				2826	* Lookup and return message for incoming reply. Don't try to do
				2827	* anything about a larger than preallocated data portion of the
				2828	* message at the moment - for now, just skip the message.
				2829	*/
				2830	static struct ceph_msg get_reply(struct ceph_connection con,
				2831	struct ceph_msg_header *hdr,
				2832	int *skip)
				2833	{
				2834	struct ceph_osd *osd = con->private;
				2835	struct ceph_osd_client *osdc = osd->o_osdc;
				2836	struct ceph_msg *m;
				2837	struct ceph_osd_request *req;
				2838	int front_len = le32_to_cpu(hdr->front_len);
				2839	int data_len = le32_to_cpu(hdr->data_len);
				2840	u64 tid;
				2841
				2842	tid = le64_to_cpu(hdr->tid);
				2843	mutex_lock(&osdc->request_mutex);
				2844	req = __lookup_request(osdc, tid);
				2845	if (!req) {
				2846	dout("%s osd%d tid %llu unknown, skipping\n", __func__,
				2847	osd->o_osd, tid);
				2848	m = NULL;
				2849	*skip = 1;
				2850	goto out;
				2851	}
				2852
				2853	ceph_msg_revoke_incoming(req->r_reply);
				2854
				2855	if (front_len > req->r_reply->front_alloc_len) {
				2856	pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
				2857	__func__, osd->o_osd, req->r_tid, front_len,
				2858	req->r_reply->front_alloc_len);
				2859	m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
				2860	false);
				2861	if (!m)
				2862	goto out;
				2863	ceph_msg_put(req->r_reply);
				2864	req->r_reply = m;
				2865	}
				2866
				2867	if (data_len > req->r_reply->data_length) {
				2868	pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
				2869	__func__, osd->o_osd, req->r_tid, data_len,
				2870	req->r_reply->data_length);
				2871	m = NULL;
				2872	*skip = 1;
				2873	goto out;
				2874	}
				2875
				2876	m = ceph_msg_get(req->r_reply);
				2877	dout("get_reply tid %lld %p\n", tid, m);
				2878
				2879	out:
				2880	mutex_unlock(&osdc->request_mutex);
				2881	return m;
				2882	}
				2883
				2884	static struct ceph_msg alloc_msg(struct ceph_connection con,
				2885	struct ceph_msg_header *hdr,
				2886	int *skip)
				2887	{
				2888	struct ceph_osd *osd = con->private;
				2889	int type = le16_to_cpu(hdr->type);
				2890	int front = le32_to_cpu(hdr->front_len);
				2891
				2892	*skip = 0;
				2893	switch (type) {
				2894	case CEPH_MSG_OSD_MAP:
				2895	case CEPH_MSG_WATCH_NOTIFY:
				2896	return ceph_msg_new(type, front, GFP_NOFS, false);
				2897	case CEPH_MSG_OSD_OPREPLY:
				2898	return get_reply(con, hdr, skip);
				2899	default:
				2900	pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
				2901	osd->o_osd);
				2902	*skip = 1;
				2903	return NULL;
				2904	}
				2905	}
				2906
				2907	/*
				2908	* Wrappers to refcount containing ceph_osd struct
				2909	*/
				2910	static struct ceph_connection get_osd_con(struct ceph_connection con)
				2911	{
				2912	struct ceph_osd *osd = con->private;
				2913	if (get_osd(osd))
				2914	return con;
				2915	return NULL;
				2916	}
				2917
				2918	static void put_osd_con(struct ceph_connection *con)
				2919	{
				2920	struct ceph_osd *osd = con->private;
				2921	put_osd(osd);
				2922	}
				2923
				2924	/*
				2925	* authentication
				2926	*/
				2927	/*
				2928	* Note: returned pointer is the address of a structure that's
				2929	* managed separately. Caller must not attempt to free it.
				2930	*/
				2931	static struct ceph_auth_handshake get_authorizer(struct ceph_connection con,
				2932	int *proto, int force_new)
				2933	{
				2934	struct ceph_osd *o = con->private;
				2935	struct ceph_osd_client *osdc = o->o_osdc;
				2936	struct ceph_auth_client *ac = osdc->client->monc.auth;
				2937	struct ceph_auth_handshake *auth = &o->o_auth;
				2938
				2939	if (force_new && auth->authorizer) {
				2940	ceph_auth_destroy_authorizer(ac, auth->authorizer);
				2941	auth->authorizer = NULL;
				2942	}
				2943	if (!auth->authorizer) {
				2944	int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
				2945	auth);
				2946	if (ret)
				2947	return ERR_PTR(ret);
				2948	} else {
				2949	int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
				2950	auth);
				2951	if (ret)
				2952	return ERR_PTR(ret);
				2953	}
				2954	*proto = ac->protocol;
				2955
				2956	return auth;
				2957	}
				2958
				2959
				2960	static int verify_authorizer_reply(struct ceph_connection *con, int len)
				2961	{
				2962	struct ceph_osd *o = con->private;
				2963	struct ceph_osd_client *osdc = o->o_osdc;
				2964	struct ceph_auth_client *ac = osdc->client->monc.auth;
				2965
				2966	return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len);
				2967	}
				2968
				2969	static int invalidate_authorizer(struct ceph_connection *con)
				2970	{
				2971	struct ceph_osd *o = con->private;
				2972	struct ceph_osd_client *osdc = o->o_osdc;
				2973	struct ceph_auth_client *ac = osdc->client->monc.auth;
				2974
				2975	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
				2976	return ceph_monc_validate_auth(&osdc->client->monc);
				2977	}
				2978
				2979	static int osd_sign_message(struct ceph_msg *msg)
				2980	{
				2981	struct ceph_osd *o = msg->con->private;
				2982	struct ceph_auth_handshake *auth = &o->o_auth;
				2983
				2984	return ceph_auth_sign_message(auth, msg);
				2985	}
				2986
				2987	static int osd_check_message_signature(struct ceph_msg *msg)
				2988	{
				2989	struct ceph_osd *o = msg->con->private;
				2990	struct ceph_auth_handshake *auth = &o->o_auth;
				2991
				2992	return ceph_auth_check_message_signature(auth, msg);
				2993	}
				2994
				2995	static const struct ceph_connection_operations osd_con_ops = {
				2996	.get = get_osd_con,
				2997	.put = put_osd_con,
				2998	.dispatch = dispatch,
				2999	.get_authorizer = get_authorizer,
				3000	.verify_authorizer_reply = verify_authorizer_reply,
				3001	.invalidate_authorizer = invalidate_authorizer,
				3002	.alloc_msg = alloc_msg,
				3003	.sign_message = osd_sign_message,
				3004	.check_message_signature = osd_check_message_signature,
				3005	.fault = osd_reset,
				3006	};