Blame - drivers/block/drbd/drbd_req.c - codeaurora/cp-linux

blob: 3ae2c00865635f889e4040d0e218786c237ff3e6 [file] [log] [blame]

Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame^]	1	/*
				2	drbd_req.c
				3
				4	This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
				5
				6	Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
				7	Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
				8	Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
				9
				10	drbd is free software; you can redistribute it and/or modify
				11	it under the terms of the GNU General Public License as published by
				12	the Free Software Foundation; either version 2, or (at your option)
				13	any later version.
				14
				15	drbd is distributed in the hope that it will be useful,
				16	but WITHOUT ANY WARRANTY; without even the implied warranty of
				17	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				18	GNU General Public License for more details.
				19
				20	You should have received a copy of the GNU General Public License
				21	along with drbd; see the file COPYING. If not, write to
				22	the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
				23
				24	*/
				25
				26	#include <linux/module.h>
				27
				28	#include <linux/slab.h>
				29	#include <linux/drbd.h>
				30	#include "drbd_int.h"
				31	#include "drbd_req.h"
				32
				33
				34	static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size);
				35
				36	/* Update disk stats at start of I/O request */
				37	static void _drbd_start_io_acct(struct drbd_device device, struct drbd_request req)
				38	{
				39	generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9,
				40	&device->vdisk->part0);
				41	}
				42
				43	/* Update disk stats when completing request upwards */
				44	static void _drbd_end_io_acct(struct drbd_device device, struct drbd_request req)
				45	{
				46	generic_end_io_acct(bio_data_dir(req->master_bio),
				47	&device->vdisk->part0, req->start_jif);
				48	}
				49
				50	static struct drbd_request drbd_req_new(struct drbd_device device,
				51	struct bio *bio_src)
				52	{
				53	struct drbd_request *req;
				54
				55	req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
				56	if (!req)
				57	return NULL;
				58	memset(req, 0, sizeof(*req));
				59
				60	drbd_req_make_private_bio(req, bio_src);
				61	req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
				62	req->device = device;
				63	req->master_bio = bio_src;
				64	req->epoch = 0;
				65
				66	drbd_clear_interval(&req->i);
				67	req->i.sector = bio_src->bi_iter.bi_sector;
				68	req->i.size = bio_src->bi_iter.bi_size;
				69	req->i.local = true;
				70	req->i.waiting = false;
				71
				72	INIT_LIST_HEAD(&req->tl_requests);
				73	INIT_LIST_HEAD(&req->w.list);
				74	INIT_LIST_HEAD(&req->req_pending_master_completion);
				75	INIT_LIST_HEAD(&req->req_pending_local);
				76
				77	/* one reference to be put by __drbd_make_request */
				78	atomic_set(&req->completion_ref, 1);
				79	/* one kref as long as completion_ref > 0 */
				80	kref_init(&req->kref);
				81	return req;
				82	}
				83
				84	static void drbd_remove_request_interval(struct rb_root *root,
				85	struct drbd_request *req)
				86	{
				87	struct drbd_device *device = req->device;
				88	struct drbd_interval *i = &req->i;
				89
				90	drbd_remove_interval(root, i);
				91
				92	/* Wake up any processes waiting for this request to complete. */
				93	if (i->waiting)
				94	wake_up(&device->misc_wait);
				95	}
				96
				97	void drbd_req_destroy(struct kref *kref)
				98	{
				99	struct drbd_request *req = container_of(kref, struct drbd_request, kref);
				100	struct drbd_device *device = req->device;
				101	const unsigned s = req->rq_state;
				102
				103	if ((req->master_bio && !(s & RQ_POSTPONED)) \|\|
				104	atomic_read(&req->completion_ref) \|\|
				105	(s & RQ_LOCAL_PENDING) \|\|
				106	((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) {
				107	drbd_err(device, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n",
				108	s, atomic_read(&req->completion_ref));
				109	return;
				110	}
				111
				112	/* If called from mod_rq_state (expected normal case) or
				113	* drbd_send_and_submit (the less likely normal path), this holds the
				114	* req_lock, and req->tl_requests will typicaly be on ->transfer_log,
				115	* though it may be still empty (never added to the transfer log).
				116	*
				117	* If called from do_retry(), we do NOT hold the req_lock, but we are
				118	* still allowed to unconditionally list_del(&req->tl_requests),
				119	* because it will be on a local on-stack list only. */
				120	list_del_init(&req->tl_requests);
				121
				122	/* finally remove the request from the conflict detection
				123	* respective block_id verification interval tree. */
				124	if (!drbd_interval_empty(&req->i)) {
				125	struct rb_root *root;
				126
				127	if (s & RQ_WRITE)
				128	root = &device->write_requests;
				129	else
				130	root = &device->read_requests;
				131	drbd_remove_request_interval(root, req);
				132	} else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0)
				133	drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n",
				134	s, (unsigned long long)req->i.sector, req->i.size);
				135
				136	/* if it was a write, we may have to set the corresponding
				137	* bit(s) out-of-sync first. If it had a local part, we need to
				138	* release the reference to the activity log. */
				139	if (s & RQ_WRITE) {
				140	/* Set out-of-sync unless both OK flags are set
				141	* (local only or remote failed).
				142	* Other places where we set out-of-sync:
				143	* READ with local io-error */
				144
				145	/* There is a special case:
				146	* we may notice late that IO was suspended,
				147	* and postpone, or schedule for retry, a write,
				148	* before it even was submitted or sent.
				149	* In that case we do not want to touch the bitmap at all.
				150	*/
				151	if ((s & (RQ_POSTPONED\|RQ_LOCAL_MASK\|RQ_NET_MASK)) != RQ_POSTPONED) {
				152	if (!(s & RQ_NET_OK) \|\| !(s & RQ_LOCAL_OK))
				153	drbd_set_out_of_sync(device, req->i.sector, req->i.size);
				154
				155	if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
				156	drbd_set_in_sync(device, req->i.sector, req->i.size);
				157	}
				158
				159	/* one might be tempted to move the drbd_al_complete_io
				160	* to the local io completion callback drbd_request_endio.
				161	* but, if this was a mirror write, we may only
				162	* drbd_al_complete_io after this is RQ_NET_DONE,
				163	* otherwise the extent could be dropped from the al
				164	* before it has actually been written on the peer.
				165	* if we crash before our peer knows about the request,
				166	* but after the extent has been dropped from the al,
				167	* we would forget to resync the corresponding extent.
				168	*/
				169	if (s & RQ_IN_ACT_LOG) {
				170	if (get_ldev_if_state(device, D_FAILED)) {
				171	drbd_al_complete_io(device, &req->i);
				172	put_ldev(device);
				173	} else if (__ratelimit(&drbd_ratelimit_state)) {
				174	drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), "
				175	"but my Disk seems to have failed :(\n",
				176	(unsigned long long) req->i.sector, req->i.size);
				177	}
				178	}
				179	}
				180
				181	mempool_free(req, drbd_request_mempool);
				182	}
				183
				184	static void wake_all_senders(struct drbd_connection *connection)
				185	{
				186	wake_up(&connection->sender_work.q_wait);
				187	}
				188
				189	/* must hold resource->req_lock */
				190	void start_new_tl_epoch(struct drbd_connection *connection)
				191	{
				192	/* no point closing an epoch, if it is empty, anyways. */
				193	if (connection->current_tle_writes == 0)
				194	return;
				195
				196	connection->current_tle_writes = 0;
				197	atomic_inc(&connection->current_tle_nr);
				198	wake_all_senders(connection);
				199	}
				200
				201	void complete_master_bio(struct drbd_device *device,
				202	struct bio_and_error *m)
				203	{
				204	m->bio->bi_error = m->error;
				205	bio_endio(m->bio);
				206	dec_ap_bio(device);
				207	}
				208
				209
				210	/* Helper for __req_mod().
				211	* Set m->bio to the master bio, if it is fit to be completed,
				212	* or leave it alone (it is initialized to NULL in __req_mod),
				213	* if it has already been completed, or cannot be completed yet.
				214	* If m->bio is set, the error status to be returned is placed in m->error.
				215	*/
				216	static
				217	void drbd_req_complete(struct drbd_request req, struct bio_and_error m)
				218	{
				219	const unsigned s = req->rq_state;
				220	struct drbd_device *device = req->device;
				221	int rw;
				222	int error, ok;
				223
				224	/* we must not complete the master bio, while it is
				225	* still being processed by _drbd_send_zc_bio (drbd_send_dblock)
				226	* not yet acknowledged by the peer
				227	* not yet completed by the local io subsystem
				228	* these flags may get cleared in any order by
				229	* the worker,
				230	* the receiver,
				231	* the bio_endio completion callbacks.
				232	*/
				233	if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) \|\|
				234	(s & RQ_NET_QUEUED) \|\| (s & RQ_NET_PENDING) \|\|
				235	(s & RQ_COMPLETION_SUSP)) {
				236	drbd_err(device, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s);
				237	return;
				238	}
				239
				240	if (!req->master_bio) {
				241	drbd_err(device, "drbd_req_complete: Logic BUG, master_bio == NULL!\n");
				242	return;
				243	}
				244
				245	rw = bio_rw(req->master_bio);
				246
				247	/*
				248	* figure out whether to report success or failure.
				249	*
				250	* report success when at least one of the operations succeeded.
				251	* or, to put the other way,
				252	* only report failure, when both operations failed.
				253	*
				254	* what to do about the failures is handled elsewhere.
				255	* what we need to do here is just: complete the master_bio.
				256	*
				257	* local completion error, if any, has been stored as ERR_PTR
				258	* in private_bio within drbd_request_endio.
				259	*/
				260	ok = (s & RQ_LOCAL_OK) \|\| (s & RQ_NET_OK);
				261	error = PTR_ERR(req->private_bio);
				262
				263	/* Before we can signal completion to the upper layers,
				264	* we may need to close the current transfer log epoch.
				265	* We are within the request lock, so we can simply compare
				266	* the request epoch number with the current transfer log
				267	* epoch number. If they match, increase the current_tle_nr,
				268	* and reset the transfer log epoch write_cnt.
				269	*/
				270	if (rw == WRITE &&
				271	req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr))
				272	start_new_tl_epoch(first_peer_device(device)->connection);
				273
				274	/* Update disk stats */
				275	_drbd_end_io_acct(device, req);
				276
				277	/* If READ failed,
				278	* have it be pushed back to the retry work queue,
				279	* so it will re-enter __drbd_make_request(),
				280	* and be re-assigned to a suitable local or remote path,
				281	* or failed if we do not have access to good data anymore.
				282	*
				283	* Unless it was failed early by __drbd_make_request(),
				284	* because no path was available, in which case
				285	* it was not even added to the transfer_log.
				286	*
				287	* READA may fail, and will not be retried.
				288	*
				289	* WRITE should have used all available paths already.
				290	*/
				291	if (!ok && rw == READ && !list_empty(&req->tl_requests))
				292	req->rq_state \|= RQ_POSTPONED;
				293
				294	if (!(req->rq_state & RQ_POSTPONED)) {
				295	m->error = ok ? 0 : (error ?: -EIO);
				296	m->bio = req->master_bio;
				297	req->master_bio = NULL;
				298	/* We leave it in the tree, to be able to verify later
				299	* write-acks in protocol != C during resync.
				300	* But we mark it as "complete", so it won't be counted as
				301	* conflict in a multi-primary setup. */
				302	req->i.completed = true;
				303	}
				304
				305	if (req->i.waiting)
				306	wake_up(&device->misc_wait);
				307
				308	/* Either we are about to complete to upper layers,
				309	* or we will restart this request.
				310	* In either case, the request object will be destroyed soon,
				311	* so better remove it from all lists. */
				312	list_del_init(&req->req_pending_master_completion);
				313	}
				314
				315	/* still holds resource->req_lock */
				316	static int drbd_req_put_completion_ref(struct drbd_request req, struct bio_and_error m, int put)
				317	{
				318	struct drbd_device *device = req->device;
				319	D_ASSERT(device, m \|\| (req->rq_state & RQ_POSTPONED));
				320
				321	if (!atomic_sub_and_test(put, &req->completion_ref))
				322	return 0;
				323
				324	drbd_req_complete(req, m);
				325
				326	if (req->rq_state & RQ_POSTPONED) {
				327	/* don't destroy the req object just yet,
				328	* but queue it for retry */
				329	drbd_restart_request(req);
				330	return 0;
				331	}
				332
				333	return 1;
				334	}
				335
				336	static void set_if_null_req_next(struct drbd_peer_device peer_device, struct drbd_request req)
				337	{
				338	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
				339	if (!connection)
				340	return;
				341	if (connection->req_next == NULL)
				342	connection->req_next = req;
				343	}
				344
				345	static void advance_conn_req_next(struct drbd_peer_device peer_device, struct drbd_request req)
				346	{
				347	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
				348	if (!connection)
				349	return;
				350	if (connection->req_next != req)
				351	return;
				352	list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
				353	const unsigned s = req->rq_state;
				354	if (s & RQ_NET_QUEUED)
				355	break;
				356	}
				357	if (&req->tl_requests == &connection->transfer_log)
				358	req = NULL;
				359	connection->req_next = req;
				360	}
				361
				362	static void set_if_null_req_ack_pending(struct drbd_peer_device peer_device, struct drbd_request req)
				363	{
				364	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
				365	if (!connection)
				366	return;
				367	if (connection->req_ack_pending == NULL)
				368	connection->req_ack_pending = req;
				369	}
				370
				371	static void advance_conn_req_ack_pending(struct drbd_peer_device peer_device, struct drbd_request req)
				372	{
				373	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
				374	if (!connection)
				375	return;
				376	if (connection->req_ack_pending != req)
				377	return;
				378	list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
				379	const unsigned s = req->rq_state;
				380	if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING))
				381	break;
				382	}
				383	if (&req->tl_requests == &connection->transfer_log)
				384	req = NULL;
				385	connection->req_ack_pending = req;
				386	}
				387
				388	static void set_if_null_req_not_net_done(struct drbd_peer_device peer_device, struct drbd_request req)
				389	{
				390	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
				391	if (!connection)
				392	return;
				393	if (connection->req_not_net_done == NULL)
				394	connection->req_not_net_done = req;
				395	}
				396
				397	static void advance_conn_req_not_net_done(struct drbd_peer_device peer_device, struct drbd_request req)
				398	{
				399	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
				400	if (!connection)
				401	return;
				402	if (connection->req_not_net_done != req)
				403	return;
				404	list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
				405	const unsigned s = req->rq_state;
				406	if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE))
				407	break;
				408	}
				409	if (&req->tl_requests == &connection->transfer_log)
				410	req = NULL;
				411	connection->req_not_net_done = req;
				412	}
				413
				414	/* I'd like this to be the only place that manipulates
				415	* req->completion_ref and req->kref. */
				416	static void mod_rq_state(struct drbd_request req, struct bio_and_error m,
				417	int clear, int set)
				418	{
				419	struct drbd_device *device = req->device;
				420	struct drbd_peer_device *peer_device = first_peer_device(device);
				421	unsigned s = req->rq_state;
				422	int c_put = 0;
				423	int k_put = 0;
				424
				425	if (drbd_suspended(device) && !((s \| clear) & RQ_COMPLETION_SUSP))
				426	set \|= RQ_COMPLETION_SUSP;
				427
				428	/* apply */
				429
				430	req->rq_state &= ~clear;
				431	req->rq_state \|= set;
				432
				433	/* no change? */
				434	if (req->rq_state == s)
				435	return;
				436
				437	/* intent: get references */
				438
				439	if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING))
				440	atomic_inc(&req->completion_ref);
				441
				442	if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) {
				443	inc_ap_pending(device);
				444	atomic_inc(&req->completion_ref);
				445	}
				446
				447	if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) {
				448	atomic_inc(&req->completion_ref);
				449	set_if_null_req_next(peer_device, req);
				450	}
				451
				452	if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
				453	kref_get(&req->kref); /* wait for the DONE */
				454
				455	if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
				456	/* potentially already completed in the asender thread */
				457	if (!(s & RQ_NET_DONE)) {
				458	atomic_add(req->i.size >> 9, &device->ap_in_flight);
				459	set_if_null_req_not_net_done(peer_device, req);
				460	}
				461	if (s & RQ_NET_PENDING)
				462	set_if_null_req_ack_pending(peer_device, req);
				463	}
				464
				465	if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
				466	atomic_inc(&req->completion_ref);
				467
				468	/* progress: put references */
				469
				470	if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP))
				471	++c_put;
				472
				473	if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) {
				474	D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING);
				475	/* local completion may still come in later,
				476	* we need to keep the req object around. */
				477	kref_get(&req->kref);
				478	++c_put;
				479	}
				480
				481	if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) {
				482	if (req->rq_state & RQ_LOCAL_ABORTED)
				483	++k_put;
				484	else
				485	++c_put;
				486	list_del_init(&req->req_pending_local);
				487	}
				488
				489	if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
				490	dec_ap_pending(device);
				491	++c_put;
				492	req->acked_jif = jiffies;
				493	advance_conn_req_ack_pending(peer_device, req);
				494	}
				495
				496	if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) {
				497	++c_put;
				498	advance_conn_req_next(peer_device, req);
				499	}
				500
				501	if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
				502	if (s & RQ_NET_SENT)
				503	atomic_sub(req->i.size >> 9, &device->ap_in_flight);
				504	if (s & RQ_EXP_BARR_ACK)
				505	++k_put;
				506	req->net_done_jif = jiffies;
				507
				508	/* in ahead/behind mode, or just in case,
				509	* before we finally destroy this request,
				510	* the caching pointers must not reference it anymore */
				511	advance_conn_req_next(peer_device, req);
				512	advance_conn_req_ack_pending(peer_device, req);
				513	advance_conn_req_not_net_done(peer_device, req);
				514	}
				515
				516	/* potentially complete and destroy */
				517
				518	if (k_put \|\| c_put) {
				519	/* Completion does it's own kref_put. If we are going to
				520	* kref_sub below, we need req to be still around then. */
				521	int at_least = k_put + !!c_put;
				522	int refcount = atomic_read(&req->kref.refcount);
				523	if (refcount < at_least)
				524	drbd_err(device,
				525	"mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
				526	s, req->rq_state, refcount, at_least);
				527	}
				528
				529	/* If we made progress, retry conflicting peer requests, if any. */
				530	if (req->i.waiting)
				531	wake_up(&device->misc_wait);
				532
				533	if (c_put)
				534	k_put += drbd_req_put_completion_ref(req, m, c_put);
				535	if (k_put)
				536	kref_sub(&req->kref, k_put, drbd_req_destroy);
				537	}
				538
				539	static void drbd_report_io_error(struct drbd_device device, struct drbd_request req)
				540	{
				541	char b[BDEVNAME_SIZE];
				542
				543	if (!__ratelimit(&drbd_ratelimit_state))
				544	return;
				545
				546	drbd_warn(device, "local %s IO error sector %llu+%u on %s\n",
				547	(req->rq_state & RQ_WRITE) ? "WRITE" : "READ",
				548	(unsigned long long)req->i.sector,
				549	req->i.size >> 9,
				550	bdevname(device->ldev->backing_bdev, b));
				551	}
				552
				553	/* Helper for HANDED_OVER_TO_NETWORK.
				554	* Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)?
				555	* Is it also still "PENDING"?
				556	* --> If so, clear PENDING and set NET_OK below.
				557	* If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster
				558	* (and we must not set RQ_NET_OK) */
				559	static inline bool is_pending_write_protocol_A(struct drbd_request *req)
				560	{
				561	return (req->rq_state &
				562	(RQ_WRITE\|RQ_NET_PENDING\|RQ_EXP_WRITE_ACK\|RQ_EXP_RECEIVE_ACK))
				563	== (RQ_WRITE\|RQ_NET_PENDING);
				564	}
				565
				566	/* obviously this could be coded as many single functions
				567	* instead of one huge switch,
				568	* or by putting the code directly in the respective locations
				569	* (as it has been before).
				570	*
				571	* but having it this way
				572	* enforces that it is all in this one place, where it is easier to audit,
				573	* it makes it obvious that whatever "event" "happens" to a request should
				574	* happen "atomically" within the req_lock,
				575	* and it enforces that we have to think in a very structured manner
				576	* about the "events" that may happen to a request during its life time ...
				577	*/
				578	int __req_mod(struct drbd_request *req, enum drbd_req_event what,
				579	struct bio_and_error *m)
				580	{
				581	struct drbd_device *const device = req->device;
				582	struct drbd_peer_device *const peer_device = first_peer_device(device);
				583	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
				584	struct net_conf *nc;
				585	int p, rv = 0;
				586
				587	if (m)
				588	m->bio = NULL;
				589
				590	switch (what) {
				591	default:
				592	drbd_err(device, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
				593	break;
				594
				595	/* does not happen...
				596	* initialization done in drbd_req_new
				597	case CREATED:
				598	break;
				599	*/
				600
				601	case TO_BE_SENT: /* via network */
				602	/* reached via __drbd_make_request
				603	* and from w_read_retry_remote */
				604	D_ASSERT(device, !(req->rq_state & RQ_NET_MASK));
				605	rcu_read_lock();
				606	nc = rcu_dereference(connection->net_conf);
				607	p = nc->wire_protocol;
				608	rcu_read_unlock();
				609	req->rq_state \|=
				610	p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK :
				611	p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0;
				612	mod_rq_state(req, m, 0, RQ_NET_PENDING);
				613	break;
				614
				615	case TO_BE_SUBMITTED: /* locally */
				616	/* reached via __drbd_make_request */
				617	D_ASSERT(device, !(req->rq_state & RQ_LOCAL_MASK));
				618	mod_rq_state(req, m, 0, RQ_LOCAL_PENDING);
				619	break;
				620
				621	case COMPLETED_OK:
				622	if (req->rq_state & RQ_WRITE)
				623	device->writ_cnt += req->i.size >> 9;
				624	else
				625	device->read_cnt += req->i.size >> 9;
				626
				627	mod_rq_state(req, m, RQ_LOCAL_PENDING,
				628	RQ_LOCAL_COMPLETED\|RQ_LOCAL_OK);
				629	break;
				630
				631	case ABORT_DISK_IO:
				632	mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED);
				633	break;
				634
				635	case WRITE_COMPLETED_WITH_ERROR:
				636	drbd_report_io_error(device, req);
				637	__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
				638	mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
				639	break;
				640
				641	case READ_COMPLETED_WITH_ERROR:
				642	drbd_set_out_of_sync(device, req->i.sector, req->i.size);
				643	drbd_report_io_error(device, req);
				644	__drbd_chk_io_error(device, DRBD_READ_ERROR);
				645	/* fall through. */
				646	case READ_AHEAD_COMPLETED_WITH_ERROR:
				647	/* it is legal to fail READA, no __drbd_chk_io_error in that case. */
				648	mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
				649	break;
				650
				651	case DISCARD_COMPLETED_NOTSUPP:
				652	case DISCARD_COMPLETED_WITH_ERROR:
				653	/* I'd rather not detach from local disk just because it
				654	* failed a REQ_DISCARD. */
				655	mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
				656	break;
				657
				658	case QUEUE_FOR_NET_READ:
				659	/* READ or READA, and
				660	* no local disk,
				661	* or target area marked as invalid,
				662	* or just got an io-error. */
				663	/* from __drbd_make_request
				664	* or from bio_endio during read io-error recovery */
				665
				666	/* So we can verify the handle in the answer packet.
				667	* Corresponding drbd_remove_request_interval is in
				668	* drbd_req_complete() */
				669	D_ASSERT(device, drbd_interval_empty(&req->i));
				670	drbd_insert_interval(&device->read_requests, &req->i);
				671
				672	set_bit(UNPLUG_REMOTE, &device->flags);
				673
				674	D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
				675	D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0);
				676	mod_rq_state(req, m, 0, RQ_NET_QUEUED);
				677	req->w.cb = w_send_read_req;
				678	drbd_queue_work(&connection->sender_work,
				679	&req->w);
				680	break;
				681
				682	case QUEUE_FOR_NET_WRITE:
				683	/* assert something? */
				684	/* from __drbd_make_request only */
				685
				686	/* Corresponding drbd_remove_request_interval is in
				687	* drbd_req_complete() */
				688	D_ASSERT(device, drbd_interval_empty(&req->i));
				689	drbd_insert_interval(&device->write_requests, &req->i);
				690
				691	/* NOTE
				692	* In case the req ended up on the transfer log before being
				693	* queued on the worker, it could lead to this request being
				694	* missed during cleanup after connection loss.
				695	* So we have to do both operations here,
				696	* within the same lock that protects the transfer log.
				697	*
				698	* _req_add_to_epoch(req); this has to be after the
				699	* _maybe_start_new_epoch(req); which happened in
				700	* __drbd_make_request, because we now may set the bit
				701	* again ourselves to close the current epoch.
				702	*
				703	* Add req to the (now) current epoch (barrier). */
				704
				705	/* otherwise we may lose an unplug, which may cause some remote
				706	* io-scheduler timeout to expire, increasing maximum latency,
				707	* hurting performance. */
				708	set_bit(UNPLUG_REMOTE, &device->flags);
				709
				710	/* queue work item to send data */
				711	D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
				712	mod_rq_state(req, m, 0, RQ_NET_QUEUED\|RQ_EXP_BARR_ACK);
				713	req->w.cb = w_send_dblock;
				714	drbd_queue_work(&connection->sender_work,
				715	&req->w);
				716
				717	/* close the epoch, in case it outgrew the limit */
				718	rcu_read_lock();
				719	nc = rcu_dereference(connection->net_conf);
				720	p = nc->max_epoch_size;
				721	rcu_read_unlock();
				722	if (connection->current_tle_writes >= p)
				723	start_new_tl_epoch(connection);
				724
				725	break;
				726
				727	case QUEUE_FOR_SEND_OOS:
				728	mod_rq_state(req, m, 0, RQ_NET_QUEUED);
				729	req->w.cb = w_send_out_of_sync;
				730	drbd_queue_work(&connection->sender_work,
				731	&req->w);
				732	break;
				733
				734	case READ_RETRY_REMOTE_CANCELED:
				735	case SEND_CANCELED:
				736	case SEND_FAILED:
				737	/* real cleanup will be done from tl_clear. just update flags
				738	* so it is no longer marked as on the worker queue */
				739	mod_rq_state(req, m, RQ_NET_QUEUED, 0);
				740	break;
				741
				742	case HANDED_OVER_TO_NETWORK:
				743	/* assert something? */
				744	if (is_pending_write_protocol_A(req))
				745	/* this is what is dangerous about protocol A:
				746	* pretend it was successfully written on the peer. */
				747	mod_rq_state(req, m, RQ_NET_QUEUED\|RQ_NET_PENDING,
				748	RQ_NET_SENT\|RQ_NET_OK);
				749	else
				750	mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
				751	/* It is still not yet RQ_NET_DONE until the
				752	* corresponding epoch barrier got acked as well,
				753	* so we know what to dirty on connection loss. */
				754	break;
				755
				756	case OOS_HANDED_TO_NETWORK:
				757	/* Was not set PENDING, no longer QUEUED, so is now DONE
				758	* as far as this connection is concerned. */
				759	mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE);
				760	break;
				761
				762	case CONNECTION_LOST_WHILE_PENDING:
				763	/* transfer log cleanup after connection loss */
				764	mod_rq_state(req, m,
				765	RQ_NET_OK\|RQ_NET_PENDING\|RQ_COMPLETION_SUSP,
				766	RQ_NET_DONE);
				767	break;
				768
				769	case CONFLICT_RESOLVED:
				770	/* for superseded conflicting writes of multiple primaries,
				771	* there is no need to keep anything in the tl, potential
				772	* node crashes are covered by the activity log.
				773	*
				774	* If this request had been marked as RQ_POSTPONED before,
				775	* it will actually not be completed, but "restarted",
				776	* resubmitted from the retry worker context. */
				777	D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
				778	D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
				779	mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE\|RQ_NET_OK);
				780	break;
				781
				782	case WRITE_ACKED_BY_PEER_AND_SIS:
				783	req->rq_state \|= RQ_NET_SIS;
				784	case WRITE_ACKED_BY_PEER:
				785	/* Normal operation protocol C: successfully written on peer.
				786	* During resync, even in protocol != C,
				787	* we requested an explicit write ack anyways.
				788	* Which means we cannot even assert anything here.
				789	* Nothing more to do here.
				790	* We want to keep the tl in place for all protocols, to cater
				791	* for volatile write-back caches on lower level devices. */
				792	goto ack_common;
				793	case RECV_ACKED_BY_PEER:
				794	D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK);
				795	/* protocol B; pretends to be successfully written on peer.
				796	* see also notes above in HANDED_OVER_TO_NETWORK about
				797	* protocol != C */
				798	ack_common:
				799	mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
				800	break;
				801
				802	case POSTPONE_WRITE:
				803	D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
				804	/* If this node has already detected the write conflict, the
				805	* worker will be waiting on misc_wait. Wake it up once this
				806	* request has completed locally.
				807	*/
				808	D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
				809	req->rq_state \|= RQ_POSTPONED;
				810	if (req->i.waiting)
				811	wake_up(&device->misc_wait);
				812	/* Do not clear RQ_NET_PENDING. This request will make further
				813	* progress via restart_conflicting_writes() or
				814	* fail_postponed_requests(). Hopefully. */
				815	break;
				816
				817	case NEG_ACKED:
				818	mod_rq_state(req, m, RQ_NET_OK\|RQ_NET_PENDING, 0);
				819	break;
				820
				821	case FAIL_FROZEN_DISK_IO:
				822	if (!(req->rq_state & RQ_LOCAL_COMPLETED))
				823	break;
				824	mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
				825	break;
				826
				827	case RESTART_FROZEN_DISK_IO:
				828	if (!(req->rq_state & RQ_LOCAL_COMPLETED))
				829	break;
				830
				831	mod_rq_state(req, m,
				832	RQ_COMPLETION_SUSP\|RQ_LOCAL_COMPLETED,
				833	RQ_LOCAL_PENDING);
				834
				835	rv = MR_READ;
				836	if (bio_data_dir(req->master_bio) == WRITE)
				837	rv = MR_WRITE;
				838
				839	get_ldev(device); /* always succeeds in this call path */
				840	req->w.cb = w_restart_disk_io;
				841	drbd_queue_work(&connection->sender_work,
				842	&req->w);
				843	break;
				844
				845	case RESEND:
				846	/* Simply complete (local only) READs. */
				847	if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
				848	mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
				849	break;
				850	}
				851
				852	/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
				853	before the connection loss (B&C only); only P_BARRIER_ACK
				854	(or the local completion?) was missing when we suspended.
				855	Throwing them out of the TL here by pretending we got a BARRIER_ACK.
				856	During connection handshake, we ensure that the peer was not rebooted. */
				857	if (!(req->rq_state & RQ_NET_OK)) {
				858	/* FIXME could this possibly be a req->dw.cb == w_send_out_of_sync?
				859	* in that case we must not set RQ_NET_PENDING. */
				860
				861	mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED\|RQ_NET_PENDING);
				862	if (req->w.cb) {
				863	/* w.cb expected to be w_send_dblock, or w_send_read_req */
				864	drbd_queue_work(&connection->sender_work,
				865	&req->w);
				866	rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
				867	} /* else: FIXME can this happen? */
				868	break;
				869	}
				870	/* else, fall through to BARRIER_ACKED */
				871
				872	case BARRIER_ACKED:
				873	/* barrier ack for READ requests does not make sense */
				874	if (!(req->rq_state & RQ_WRITE))
				875	break;
				876
				877	if (req->rq_state & RQ_NET_PENDING) {
				878	/* barrier came in before all requests were acked.
				879	* this is bad, because if the connection is lost now,
				880	* we won't be able to clean them up... */
				881	drbd_err(device, "FIXME (BARRIER_ACKED but pending)\n");
				882	}
				883	/* Allowed to complete requests, even while suspended.
				884	* As this is called for all requests within a matching epoch,
				885	* we need to filter, and only set RQ_NET_DONE for those that
				886	* have actually been on the wire. */
				887	mod_rq_state(req, m, RQ_COMPLETION_SUSP,
				888	(req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0);
				889	break;
				890
				891	case DATA_RECEIVED:
				892	D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
				893	mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK\|RQ_NET_DONE);
				894	break;
				895
				896	case QUEUE_AS_DRBD_BARRIER:
				897	start_new_tl_epoch(connection);
				898	mod_rq_state(req, m, 0, RQ_NET_OK\|RQ_NET_DONE);
				899	break;
				900	};
				901
				902	return rv;
				903	}
				904
				905	/* we may do a local read if:
				906	* - we are consistent (of course),
				907	* - or we are generally inconsistent,
				908	* BUT we are still/already IN SYNC for this area.
				909	* since size may be bigger than BM_BLOCK_SIZE,
				910	* we may need to check several bits.
				911	*/
				912	static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size)
				913	{
				914	unsigned long sbnr, ebnr;
				915	sector_t esector, nr_sectors;
				916
				917	if (device->state.disk == D_UP_TO_DATE)
				918	return true;
				919	if (device->state.disk != D_INCONSISTENT)
				920	return false;
				921	esector = sector + (size >> 9) - 1;
				922	nr_sectors = drbd_get_capacity(device->this_bdev);
				923	D_ASSERT(device, sector < nr_sectors);
				924	D_ASSERT(device, esector < nr_sectors);
				925
				926	sbnr = BM_SECT_TO_BIT(sector);
				927	ebnr = BM_SECT_TO_BIT(esector);
				928
				929	return drbd_bm_count_bits(device, sbnr, ebnr) == 0;
				930	}
				931
				932	static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector,
				933	enum drbd_read_balancing rbm)
				934	{
				935	struct backing_dev_info *bdi;
				936	int stripe_shift;
				937
				938	switch (rbm) {
				939	case RB_CONGESTED_REMOTE:
				940	bdi = &device->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
				941	return bdi_read_congested(bdi);
				942	case RB_LEAST_PENDING:
				943	return atomic_read(&device->local_cnt) >
				944	atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt);
				945	case RB_32K_STRIPING: /* stripe_shift = 15 */
				946	case RB_64K_STRIPING:
				947	case RB_128K_STRIPING:
				948	case RB_256K_STRIPING:
				949	case RB_512K_STRIPING:
				950	case RB_1M_STRIPING: /* stripe_shift = 20 */
				951	stripe_shift = (rbm - RB_32K_STRIPING + 15);
				952	return (sector >> (stripe_shift - 9)) & 1;
				953	case RB_ROUND_ROBIN:
				954	return test_and_change_bit(READ_BALANCE_RR, &device->flags);
				955	case RB_PREFER_REMOTE:
				956	return true;
				957	case RB_PREFER_LOCAL:
				958	default:
				959	return false;
				960	}
				961	}
				962
				963	/*
				964	* complete_conflicting_writes - wait for any conflicting write requests
				965	*
				966	* The write_requests tree contains all active write requests which we
				967	* currently know about. Wait for any requests to complete which conflict with
				968	* the new one.
				969	*
				970	* Only way out: remove the conflicting intervals from the tree.
				971	*/
				972	static void complete_conflicting_writes(struct drbd_request *req)
				973	{
				974	DEFINE_WAIT(wait);
				975	struct drbd_device *device = req->device;
				976	struct drbd_interval *i;
				977	sector_t sector = req->i.sector;
				978	int size = req->i.size;
				979
				980	i = drbd_find_overlap(&device->write_requests, sector, size);
				981	if (!i)
				982	return;
				983
				984	for (;;) {
				985	prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
				986	i = drbd_find_overlap(&device->write_requests, sector, size);
				987	if (!i)
				988	break;
				989	/* Indicate to wake up device->misc_wait on progress. */
				990	i->waiting = true;
				991	spin_unlock_irq(&device->resource->req_lock);
				992	schedule();
				993	spin_lock_irq(&device->resource->req_lock);
				994	}
				995	finish_wait(&device->misc_wait, &wait);
				996	}
				997
				998	/* called within req_lock and rcu_read_lock() */
				999	static void maybe_pull_ahead(struct drbd_device *device)
				1000	{
				1001	struct drbd_connection *connection = first_peer_device(device)->connection;
				1002	struct net_conf *nc;
				1003	bool congested = false;
				1004	enum drbd_on_congestion on_congestion;
				1005
				1006	rcu_read_lock();
				1007	nc = rcu_dereference(connection->net_conf);
				1008	on_congestion = nc ? nc->on_congestion : OC_BLOCK;
				1009	rcu_read_unlock();
				1010	if (on_congestion == OC_BLOCK \|\|
				1011	connection->agreed_pro_version < 96)
				1012	return;
				1013
				1014	if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD)
				1015	return; /* nothing to do ... */
				1016
				1017	/* If I don't even have good local storage, we can not reasonably try
				1018	* to pull ahead of the peer. We also need the local reference to make
				1019	* sure device->act_log is there.
				1020	*/
				1021	if (!get_ldev_if_state(device, D_UP_TO_DATE))
				1022	return;
				1023
				1024	if (nc->cong_fill &&
				1025	atomic_read(&device->ap_in_flight) >= nc->cong_fill) {
				1026	drbd_info(device, "Congestion-fill threshold reached\n");
				1027	congested = true;
				1028	}
				1029
				1030	if (device->act_log->used >= nc->cong_extents) {
				1031	drbd_info(device, "Congestion-extents threshold reached\n");
				1032	congested = true;
				1033	}
				1034
				1035	if (congested) {
				1036	/* start a new epoch for non-mirrored writes */
				1037	start_new_tl_epoch(first_peer_device(device)->connection);
				1038
				1039	if (on_congestion == OC_PULL_AHEAD)
				1040	_drbd_set_state(_NS(device, conn, C_AHEAD), 0, NULL);
				1041	else /nc->on_congestion == OC_DISCONNECT /
				1042	_drbd_set_state(_NS(device, conn, C_DISCONNECTING), 0, NULL);
				1043	}
				1044	put_ldev(device);
				1045	}
				1046
				1047	/* If this returns false, and req->private_bio is still set,
				1048	* this should be submitted locally.
				1049	*
				1050	* If it returns false, but req->private_bio is not set,
				1051	* we do not have access to good data :(
				1052	*
				1053	* Otherwise, this destroys req->private_bio, if any,
				1054	* and returns true.
				1055	*/
				1056	static bool do_remote_read(struct drbd_request *req)
				1057	{
				1058	struct drbd_device *device = req->device;
				1059	enum drbd_read_balancing rbm;
				1060
				1061	if (req->private_bio) {
				1062	if (!drbd_may_do_local_read(device,
				1063	req->i.sector, req->i.size)) {
				1064	bio_put(req->private_bio);
				1065	req->private_bio = NULL;
				1066	put_ldev(device);
				1067	}
				1068	}
				1069
				1070	if (device->state.pdsk != D_UP_TO_DATE)
				1071	return false;
				1072
				1073	if (req->private_bio == NULL)
				1074	return true;
				1075
				1076	/* TODO: improve read balancing decisions, take into account drbd
				1077	* protocol, pending requests etc. */
				1078
				1079	rcu_read_lock();
				1080	rbm = rcu_dereference(device->ldev->disk_conf)->read_balancing;
				1081	rcu_read_unlock();
				1082
				1083	if (rbm == RB_PREFER_LOCAL && req->private_bio)
				1084	return false; /* submit locally */
				1085
				1086	if (remote_due_to_read_balancing(device, req->i.sector, rbm)) {
				1087	if (req->private_bio) {
				1088	bio_put(req->private_bio);
				1089	req->private_bio = NULL;
				1090	put_ldev(device);
				1091	}
				1092	return true;
				1093	}
				1094
				1095	return false;
				1096	}
				1097
				1098	/* returns number of connections (== 1, for drbd 8.4)
				1099	* expected to actually write this data,
				1100	* which does NOT include those that we are L_AHEAD for. */
				1101	static int drbd_process_write_request(struct drbd_request *req)
				1102	{
				1103	struct drbd_device *device = req->device;
				1104	int remote, send_oos;
				1105
				1106	remote = drbd_should_do_remote(device->state);
				1107	send_oos = drbd_should_send_out_of_sync(device->state);
				1108
				1109	/* Need to replicate writes. Unless it is an empty flush,
				1110	* which is better mapped to a DRBD P_BARRIER packet,
				1111	* also for drbd wire protocol compatibility reasons.
				1112	* If this was a flush, just start a new epoch.
				1113	* Unless the current epoch was empty anyways, or we are not currently
				1114	* replicating, in which case there is no point. */
				1115	if (unlikely(req->i.size == 0)) {
				1116	/* The only size==0 bios we expect are empty flushes. */
				1117	D_ASSERT(device, req->master_bio->bi_rw & REQ_FLUSH);
				1118	if (remote)
				1119	_req_mod(req, QUEUE_AS_DRBD_BARRIER);
				1120	return remote;
				1121	}
				1122
				1123	if (!remote && !send_oos)
				1124	return 0;
				1125
				1126	D_ASSERT(device, !(remote && send_oos));
				1127
				1128	if (remote) {
				1129	_req_mod(req, TO_BE_SENT);
				1130	_req_mod(req, QUEUE_FOR_NET_WRITE);
				1131	} else if (drbd_set_out_of_sync(device, req->i.sector, req->i.size))
				1132	_req_mod(req, QUEUE_FOR_SEND_OOS);
				1133
				1134	return remote;
				1135	}
				1136
				1137	static void
				1138	drbd_submit_req_private_bio(struct drbd_request *req)
				1139	{
				1140	struct drbd_device *device = req->device;
				1141	struct bio *bio = req->private_bio;
				1142	const int rw = bio_rw(bio);
				1143
				1144	bio->bi_bdev = device->ldev->backing_bdev;
				1145
				1146	/* State may have changed since we grabbed our reference on the
				1147	* ->ldev member. Double check, and short-circuit to endio.
				1148	* In case the last activity log transaction failed to get on
				1149	* stable storage, and this is a WRITE, we may not even submit
				1150	* this bio. */
				1151	if (get_ldev(device)) {
				1152	req->pre_submit_jif = jiffies;
				1153	if (drbd_insert_fault(device,
				1154	rw == WRITE ? DRBD_FAULT_DT_WR
				1155	: rw == READ ? DRBD_FAULT_DT_RD
				1156	: DRBD_FAULT_DT_RA))
				1157	bio_io_error(bio);
				1158	else
				1159	generic_make_request(bio);
				1160	put_ldev(device);
				1161	} else
				1162	bio_io_error(bio);
				1163	}
				1164
				1165	static void drbd_queue_write(struct drbd_device device, struct drbd_request req)
				1166	{
				1167	spin_lock_irq(&device->resource->req_lock);
				1168	list_add_tail(&req->tl_requests, &device->submit.writes);
				1169	list_add_tail(&req->req_pending_master_completion,
				1170	&device->pending_master_completion[1 /* WRITE */]);
				1171	spin_unlock_irq(&device->resource->req_lock);
				1172	queue_work(device->submit.wq, &device->submit.worker);
				1173	/* do_submit() may sleep internally on al_wait, too */
				1174	wake_up(&device->al_wait);
				1175	}
				1176
				1177	/* returns the new drbd_request pointer, if the caller is expected to
				1178	* drbd_send_and_submit() it (to save latency), or NULL if we queued the
				1179	* request on the submitter thread.
				1180	* Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
				1181	*/
				1182	static struct drbd_request *
				1183	drbd_request_prepare(struct drbd_device device, struct bio bio, unsigned long start_jif)
				1184	{
				1185	const int rw = bio_data_dir(bio);
				1186	struct drbd_request *req;
				1187
				1188	/* allocate outside of all locks; */
				1189	req = drbd_req_new(device, bio);
				1190	if (!req) {
				1191	dec_ap_bio(device);
				1192	/* only pass the error to the upper layers.
				1193	* if user cannot handle io errors, that's not our business. */
				1194	drbd_err(device, "could not kmalloc() req\n");
				1195	bio->bi_error = -ENOMEM;
				1196	bio_endio(bio);
				1197	return ERR_PTR(-ENOMEM);
				1198	}
				1199	req->start_jif = start_jif;
				1200
				1201	if (!get_ldev(device)) {
				1202	bio_put(req->private_bio);
				1203	req->private_bio = NULL;
				1204	}
				1205
				1206	/* Update disk stats */
				1207	_drbd_start_io_acct(device, req);
				1208
				1209	if (rw == WRITE && req->private_bio && req->i.size
				1210	&& !test_bit(AL_SUSPENDED, &device->flags)) {
				1211	if (!drbd_al_begin_io_fastpath(device, &req->i)) {
				1212	atomic_inc(&device->ap_actlog_cnt);
				1213	drbd_queue_write(device, req);
				1214	return NULL;
				1215	}
				1216	req->rq_state \|= RQ_IN_ACT_LOG;
				1217	req->in_actlog_jif = jiffies;
				1218	}
				1219
				1220	return req;
				1221	}
				1222
				1223	static void drbd_send_and_submit(struct drbd_device device, struct drbd_request req)
				1224	{
				1225	struct drbd_resource *resource = device->resource;
				1226	const int rw = bio_rw(req->master_bio);
				1227	struct bio_and_error m = { NULL, };
				1228	bool no_remote = false;
				1229	bool submit_private_bio = false;
				1230
				1231	spin_lock_irq(&resource->req_lock);
				1232	if (rw == WRITE) {
				1233	/* This may temporarily give up the req_lock,
				1234	* but will re-aquire it before it returns here.
				1235	* Needs to be before the check on drbd_suspended() */
				1236	complete_conflicting_writes(req);
				1237	/* no more giving up req_lock from now on! */
				1238
				1239	/* check for congestion, and potentially stop sending
				1240	* full data updates, but start sending "dirty bits" only. */
				1241	maybe_pull_ahead(device);
				1242	}
				1243
				1244
				1245	if (drbd_suspended(device)) {
				1246	/* push back and retry: */
				1247	req->rq_state \|= RQ_POSTPONED;
				1248	if (req->private_bio) {
				1249	bio_put(req->private_bio);
				1250	req->private_bio = NULL;
				1251	put_ldev(device);
				1252	}
				1253	goto out;
				1254	}
				1255
				1256	/* We fail READ/READA early, if we can not serve it.
				1257	* We must do this before req is registered on any lists.
				1258	* Otherwise, drbd_req_complete() will queue failed READ for retry. */
				1259	if (rw != WRITE) {
				1260	if (!do_remote_read(req) && !req->private_bio)
				1261	goto nodata;
				1262	}
				1263
				1264	/* which transfer log epoch does this belong to? */
				1265	req->epoch = atomic_read(&first_peer_device(device)->connection->current_tle_nr);
				1266
				1267	/* no point in adding empty flushes to the transfer log,
				1268	* they are mapped to drbd barriers already. */
				1269	if (likely(req->i.size!=0)) {
				1270	if (rw == WRITE)
				1271	first_peer_device(device)->connection->current_tle_writes++;
				1272
				1273	list_add_tail(&req->tl_requests, &first_peer_device(device)->connection->transfer_log);
				1274	}
				1275
				1276	if (rw == WRITE) {
				1277	if (!drbd_process_write_request(req))
				1278	no_remote = true;
				1279	} else {
				1280	/* We either have a private_bio, or we can read from remote.
				1281	* Otherwise we had done the goto nodata above. */
				1282	if (req->private_bio == NULL) {
				1283	_req_mod(req, TO_BE_SENT);
				1284	_req_mod(req, QUEUE_FOR_NET_READ);
				1285	} else
				1286	no_remote = true;
				1287	}
				1288
				1289	/* If it took the fast path in drbd_request_prepare, add it here.
				1290	* The slow path has added it already. */
				1291	if (list_empty(&req->req_pending_master_completion))
				1292	list_add_tail(&req->req_pending_master_completion,
				1293	&device->pending_master_completion[rw == WRITE]);
				1294	if (req->private_bio) {
				1295	/* needs to be marked within the same spinlock */
				1296	list_add_tail(&req->req_pending_local,
				1297	&device->pending_completion[rw == WRITE]);
				1298	_req_mod(req, TO_BE_SUBMITTED);
				1299	/* but we need to give up the spinlock to submit */
				1300	submit_private_bio = true;
				1301	} else if (no_remote) {
				1302	nodata:
				1303	if (__ratelimit(&drbd_ratelimit_state))
				1304	drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
				1305	(unsigned long long)req->i.sector, req->i.size >> 9);
				1306	/* A write may have been queued for send_oos, however.
				1307	* So we can not simply free it, we must go through drbd_req_put_completion_ref() */
				1308	}
				1309
				1310	out:
				1311	if (drbd_req_put_completion_ref(req, &m, 1))
				1312	kref_put(&req->kref, drbd_req_destroy);
				1313	spin_unlock_irq(&resource->req_lock);
				1314
				1315	/* Even though above is a kref_put(), this is safe.
				1316	* As long as we still need to submit our private bio,
				1317	* we hold a completion ref, and the request cannot disappear.
				1318	* If however this request did not even have a private bio to submit
				1319	* (e.g. remote read), req may already be invalid now.
				1320	* That's why we cannot check on req->private_bio. */
				1321	if (submit_private_bio)
				1322	drbd_submit_req_private_bio(req);
				1323	if (m.bio)
				1324	complete_master_bio(device, &m);
				1325	}
				1326
				1327	void __drbd_make_request(struct drbd_device device, struct bio bio, unsigned long start_jif)
				1328	{
				1329	struct drbd_request *req = drbd_request_prepare(device, bio, start_jif);
				1330	if (IS_ERR_OR_NULL(req))
				1331	return;
				1332	drbd_send_and_submit(device, req);
				1333	}
				1334
				1335	static void submit_fast_path(struct drbd_device device, struct list_head incoming)
				1336	{
				1337	struct drbd_request req, tmp;
				1338	list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
				1339	const int rw = bio_data_dir(req->master_bio);
				1340
				1341	if (rw == WRITE /* rw != WRITE should not even end up here! */
				1342	&& req->private_bio && req->i.size
				1343	&& !test_bit(AL_SUSPENDED, &device->flags)) {
				1344	if (!drbd_al_begin_io_fastpath(device, &req->i))
				1345	continue;
				1346
				1347	req->rq_state \|= RQ_IN_ACT_LOG;
				1348	req->in_actlog_jif = jiffies;
				1349	atomic_dec(&device->ap_actlog_cnt);
				1350	}
				1351
				1352	list_del_init(&req->tl_requests);
				1353	drbd_send_and_submit(device, req);
				1354	}
				1355	}
				1356
				1357	static bool prepare_al_transaction_nonblock(struct drbd_device *device,
				1358	struct list_head *incoming,
				1359	struct list_head *pending,
				1360	struct list_head *later)
				1361	{
				1362	struct drbd_request req, tmp;
				1363	int wake = 0;
				1364	int err;
				1365
				1366	spin_lock_irq(&device->al_lock);
				1367	list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
				1368	err = drbd_al_begin_io_nonblock(device, &req->i);
				1369	if (err == -ENOBUFS)
				1370	break;
				1371	if (err == -EBUSY)
				1372	wake = 1;
				1373	if (err)
				1374	list_move_tail(&req->tl_requests, later);
				1375	else
				1376	list_move_tail(&req->tl_requests, pending);
				1377	}
				1378	spin_unlock_irq(&device->al_lock);
				1379	if (wake)
				1380	wake_up(&device->al_wait);
				1381	return !list_empty(pending);
				1382	}
				1383
				1384	void send_and_submit_pending(struct drbd_device device, struct list_head pending)
				1385	{
				1386	struct drbd_request req, tmp;
				1387
				1388	list_for_each_entry_safe(req, tmp, pending, tl_requests) {
				1389	req->rq_state \|= RQ_IN_ACT_LOG;
				1390	req->in_actlog_jif = jiffies;
				1391	atomic_dec(&device->ap_actlog_cnt);
				1392	list_del_init(&req->tl_requests);
				1393	drbd_send_and_submit(device, req);
				1394	}
				1395	}
				1396
				1397	void do_submit(struct work_struct *ws)
				1398	{
				1399	struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker);
				1400	LIST_HEAD(incoming); /* from drbd_make_request() */
				1401	LIST_HEAD(pending); /* to be submitted after next AL-transaction commit */
				1402	LIST_HEAD(busy); /* blocked by resync requests */
				1403
				1404	/* grab new incoming requests */
				1405	spin_lock_irq(&device->resource->req_lock);
				1406	list_splice_tail_init(&device->submit.writes, &incoming);
				1407	spin_unlock_irq(&device->resource->req_lock);
				1408
				1409	for (;;) {
				1410	DEFINE_WAIT(wait);
				1411
				1412	/* move used-to-be-busy back to front of incoming */
				1413	list_splice_init(&busy, &incoming);
				1414	submit_fast_path(device, &incoming);
				1415	if (list_empty(&incoming))
				1416	break;
				1417
				1418	for (;;) {
				1419	prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE);
				1420
				1421	list_splice_init(&busy, &incoming);
				1422	prepare_al_transaction_nonblock(device, &incoming, &pending, &busy);
				1423	if (!list_empty(&pending))
				1424	break;
				1425
				1426	schedule();
				1427
				1428	/* If all currently "hot" activity log extents are kept busy by
				1429	* incoming requests, we still must not totally starve new
				1430	* requests to "cold" extents.
				1431	* Something left on &incoming means there had not been
				1432	* enough update slots available, and the activity log
				1433	* has been marked as "starving".
				1434	*
				1435	* Try again now, without looking for new requests,
				1436	* effectively blocking all new requests until we made
				1437	* at least _some_ progress with what we currently have.
				1438	*/
				1439	if (!list_empty(&incoming))
				1440	continue;
				1441
				1442	/* Nothing moved to pending, but nothing left
				1443	* on incoming: all moved to busy!
				1444	* Grab new and iterate. */
				1445	spin_lock_irq(&device->resource->req_lock);
				1446	list_splice_tail_init(&device->submit.writes, &incoming);
				1447	spin_unlock_irq(&device->resource->req_lock);
				1448	}
				1449	finish_wait(&device->al_wait, &wait);
				1450
				1451	/* If the transaction was full, before all incoming requests
				1452	* had been processed, skip ahead to commit, and iterate
				1453	* without splicing in more incoming requests from upper layers.
				1454	*
				1455	* Else, if all incoming have been processed,
				1456	* they have become either "pending" (to be submitted after
				1457	* next transaction commit) or "busy" (blocked by resync).
				1458	*
				1459	* Maybe more was queued, while we prepared the transaction?
				1460	* Try to stuff those into this transaction as well.
				1461	* Be strictly non-blocking here,
				1462	* we already have something to commit.
				1463	*
				1464	* Commit if we don't make any more progres.
				1465	*/
				1466
				1467	while (list_empty(&incoming)) {
				1468	LIST_HEAD(more_pending);
				1469	LIST_HEAD(more_incoming);
				1470	bool made_progress;
				1471
				1472	/* It is ok to look outside the lock,
				1473	* it's only an optimization anyways */
				1474	if (list_empty(&device->submit.writes))
				1475	break;
				1476
				1477	spin_lock_irq(&device->resource->req_lock);
				1478	list_splice_tail_init(&device->submit.writes, &more_incoming);
				1479	spin_unlock_irq(&device->resource->req_lock);
				1480
				1481	if (list_empty(&more_incoming))
				1482	break;
				1483
				1484	made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy);
				1485
				1486	list_splice_tail_init(&more_pending, &pending);
				1487	list_splice_tail_init(&more_incoming, &incoming);
				1488	if (!made_progress)
				1489	break;
				1490	}
				1491
				1492	drbd_al_begin_io_commit(device);
				1493	send_and_submit_pending(device, &pending);
				1494	}
				1495	}
				1496
				1497	blk_qc_t drbd_make_request(struct request_queue q, struct bio bio)
				1498	{
				1499	struct drbd_device device = (struct drbd_device ) q->queuedata;
				1500	unsigned long start_jif;
				1501
				1502	blk_queue_split(q, &bio, q->bio_split);
				1503
				1504	start_jif = jiffies;
				1505
				1506	/*
				1507	* what we "blindly" assume:
				1508	*/
				1509	D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
				1510
				1511	inc_ap_bio(device);
				1512	__drbd_make_request(device, bio, start_jif);
				1513	return BLK_QC_T_NONE;
				1514	}
				1515
				1516	void request_timer_fn(unsigned long data)
				1517	{
				1518	struct drbd_device device = (struct drbd_device ) data;
				1519	struct drbd_connection *connection = first_peer_device(device)->connection;
				1520	struct drbd_request req_read, req_write, req_peer; / oldest request */
				1521	struct net_conf *nc;
				1522	unsigned long oldest_submit_jif;
				1523	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
				1524	unsigned long now;
				1525
				1526	rcu_read_lock();
				1527	nc = rcu_dereference(connection->net_conf);
				1528	if (nc && device->state.conn >= C_WF_REPORT_PARAMS)
				1529	ent = nc->timeout * HZ/10 * nc->ko_count;
				1530
				1531	if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
				1532	dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10;
				1533	put_ldev(device);
				1534	}
				1535	rcu_read_unlock();
				1536
				1537	et = min_not_zero(dt, ent);
				1538
				1539	if (!et)
				1540	return; /* Recurring timer stopped */
				1541
				1542	now = jiffies;
				1543	nt = now + et;
				1544
				1545	spin_lock_irq(&device->resource->req_lock);
				1546	req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
				1547	req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
				1548	req_peer = connection->req_not_net_done;
				1549	/* maybe the oldest request waiting for the peer is in fact still
				1550	* blocking in tcp sendmsg */
				1551	if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
				1552	req_peer = connection->req_next;
				1553
				1554	/* evaluate the oldest peer request only in one timer! */
				1555	if (req_peer && req_peer->device != device)
				1556	req_peer = NULL;
				1557
				1558	/* do we have something to evaluate? */
				1559	if (req_peer == NULL && req_write == NULL && req_read == NULL)
				1560	goto out;
				1561
				1562	oldest_submit_jif =
				1563	(req_write && req_read)
				1564	? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif)
				1565	? req_write->pre_submit_jif : req_read->pre_submit_jif )
				1566	: req_write ? req_write->pre_submit_jif
				1567	: req_read ? req_read->pre_submit_jif : now;
				1568
				1569	/* The request is considered timed out, if
				1570	* - we have some effective timeout from the configuration,
				1571	* with above state restrictions applied,
				1572	* - the oldest request is waiting for a response from the network
				1573	* resp. the local disk,
				1574	* - the oldest request is in fact older than the effective timeout,
				1575	* - the connection was established (resp. disk was attached)
				1576	* for longer than the timeout already.
				1577	* Note that for 32bit jiffies and very stable connections/disks,
				1578	* we may have a wrap around, which is catched by
				1579	* !time_in_range(now, last_..._jif, last_..._jif + timeout).
				1580	*
				1581	* Side effect: once per 32bit wrap-around interval, which means every
				1582	* ~198 days with 250 HZ, we have a window where the timeout would need
				1583	* to expire twice (worst case) to become effective. Good enough.
				1584	*/
				1585	if (ent && req_peer &&
				1586	time_after(now, req_peer->pre_send_jif + ent) &&
				1587	!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
				1588	drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
				1589	_conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE \| CS_HARD);
				1590	}
				1591	if (dt && oldest_submit_jif != now &&
				1592	time_after(now, oldest_submit_jif + dt) &&
				1593	!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
				1594	drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
				1595	__drbd_chk_io_error(device, DRBD_FORCE_DETACH);
				1596	}
				1597
				1598	/* Reschedule timer for the nearest not already expired timeout.
				1599	* Fallback to now + min(effective network timeout, disk timeout). */
				1600	ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent))
				1601	? req_peer->pre_send_jif + ent : now + et;
				1602	dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt))
				1603	? oldest_submit_jif + dt : now + et;
				1604	nt = time_before(ent, dt) ? ent : dt;
				1605	out:
				1606	spin_unlock_irq(&device->resource->req_lock);
				1607	mod_timer(&device->request_timer, nt);
				1608	}