blob: 2d964ce4560609778c8d88e9d1d6e4ce7539d3a8 [file] [log] [blame]
Kyle Swenson8d8f6542021-03-15 11:02:55 -06001/*
2 * linux/fs/jbd2/commit.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/jbd2.h>
19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
23#include <linux/jiffies.h>
24#include <linux/crc32.h>
25#include <linux/writeback.h>
26#include <linux/backing-dev.h>
27#include <linux/bio.h>
28#include <linux/blkdev.h>
29#include <linux/bitops.h>
30#include <trace/events/jbd2.h>
31
32/*
33 * IO end handler for temporary buffer_heads handling writes to the journal.
34 */
35static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
36{
37 struct buffer_head *orig_bh = bh->b_private;
38
39 BUFFER_TRACE(bh, "");
40 if (uptodate)
41 set_buffer_uptodate(bh);
42 else
43 clear_buffer_uptodate(bh);
44 if (orig_bh) {
45 clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
46 smp_mb__after_atomic();
47 wake_up_bit(&orig_bh->b_state, BH_Shadow);
48 }
49 unlock_buffer(bh);
50}
51
52/*
53 * When an ext4 file is truncated, it is possible that some pages are not
54 * successfully freed, because they are attached to a committing transaction.
55 * After the transaction commits, these pages are left on the LRU, with no
56 * ->mapping, and with attached buffers. These pages are trivially reclaimable
57 * by the VM, but their apparent absence upsets the VM accounting, and it makes
58 * the numbers in /proc/meminfo look odd.
59 *
60 * So here, we have a buffer which has just come off the forget list. Look to
61 * see if we can strip all buffers from the backing page.
62 *
63 * Called under lock_journal(), and possibly under journal_datalist_lock. The
64 * caller provided us with a ref against the buffer, and we drop that here.
65 */
66static void release_buffer_page(struct buffer_head *bh)
67{
68 struct page *page;
69
70 if (buffer_dirty(bh))
71 goto nope;
72 if (atomic_read(&bh->b_count) != 1)
73 goto nope;
74 page = bh->b_page;
75 if (!page)
76 goto nope;
77 if (page->mapping)
78 goto nope;
79
80 /* OK, it's a truncated page */
81 if (!trylock_page(page))
82 goto nope;
83
84 page_cache_get(page);
85 __brelse(bh);
86 try_to_free_buffers(page);
87 unlock_page(page);
88 page_cache_release(page);
89 return;
90
91nope:
92 __brelse(bh);
93}
94
95static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
96{
97 struct commit_header *h;
98 __u32 csum;
99
100 if (!jbd2_journal_has_csum_v2or3(j))
101 return;
102
103 h = (struct commit_header *)(bh->b_data);
104 h->h_chksum_type = 0;
105 h->h_chksum_size = 0;
106 h->h_chksum[0] = 0;
107 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
108 h->h_chksum[0] = cpu_to_be32(csum);
109}
110
111/*
112 * Done it all: now submit the commit record. We should have
113 * cleaned up our previous buffers by now, so if we are in abort
114 * mode we can now just skip the rest of the journal write
115 * entirely.
116 *
117 * Returns 1 if the journal needs to be aborted or 0 on success
118 */
119static int journal_submit_commit_record(journal_t *journal,
120 transaction_t *commit_transaction,
121 struct buffer_head **cbh,
122 __u32 crc32_sum)
123{
124 struct commit_header *tmp;
125 struct buffer_head *bh;
126 int ret;
127 struct timespec64 now = current_kernel_time64();
128
129 *cbh = NULL;
130
131 if (is_journal_aborted(journal))
132 return 0;
133
134 bh = jbd2_journal_get_descriptor_buffer(journal);
135 if (!bh)
136 return 1;
137
138 tmp = (struct commit_header *)bh->b_data;
139 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
140 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
141 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
142 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
143 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
144
145 if (jbd2_has_feature_checksum(journal)) {
146 tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
147 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
148 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
149 }
150 jbd2_commit_block_csum_set(journal, bh);
151
152 BUFFER_TRACE(bh, "submit commit block");
153 lock_buffer(bh);
154 clear_buffer_dirty(bh);
155 set_buffer_uptodate(bh);
156 bh->b_end_io = journal_end_buffer_io_sync;
157
158 if (journal->j_flags & JBD2_BARRIER &&
159 !jbd2_has_feature_async_commit(journal))
160 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
161 else
162 ret = submit_bh(WRITE_SYNC, bh);
163
164 *cbh = bh;
165 return ret;
166}
167
168/*
169 * This function along with journal_submit_commit_record
170 * allows to write the commit record asynchronously.
171 */
172static int journal_wait_on_commit_record(journal_t *journal,
173 struct buffer_head *bh)
174{
175 int ret = 0;
176
177 clear_buffer_dirty(bh);
178 wait_on_buffer(bh);
179
180 if (unlikely(!buffer_uptodate(bh)))
181 ret = -EIO;
182 put_bh(bh); /* One for getblk() */
183
184 return ret;
185}
186
187/*
188 * write the filemap data using writepage() address_space_operations.
189 * We don't do block allocation here even for delalloc. We don't
190 * use writepages() because with dealyed allocation we may be doing
191 * block allocation in writepages().
192 */
193static int journal_submit_inode_data_buffers(struct address_space *mapping)
194{
195 int ret;
196 struct writeback_control wbc = {
197 .sync_mode = WB_SYNC_ALL,
198 .nr_to_write = mapping->nrpages * 2,
199 .range_start = 0,
200 .range_end = i_size_read(mapping->host),
201 };
202
203 ret = generic_writepages(mapping, &wbc);
204 return ret;
205}
206
207/*
208 * Submit all the data buffers of inode associated with the transaction to
209 * disk.
210 *
211 * We are in a committing transaction. Therefore no new inode can be added to
212 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
213 * operate on from being released while we write out pages.
214 */
215static int journal_submit_data_buffers(journal_t *journal,
216 transaction_t *commit_transaction)
217{
218 struct jbd2_inode *jinode;
219 int err, ret = 0;
220 struct address_space *mapping;
221
222 spin_lock(&journal->j_list_lock);
223 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
224 mapping = jinode->i_vfs_inode->i_mapping;
225 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
226 spin_unlock(&journal->j_list_lock);
227 /*
228 * submit the inode data buffers. We use writepage
229 * instead of writepages. Because writepages can do
230 * block allocation with delalloc. We need to write
231 * only allocated blocks here.
232 */
233 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
234 err = journal_submit_inode_data_buffers(mapping);
235 if (!ret)
236 ret = err;
237 spin_lock(&journal->j_list_lock);
238 J_ASSERT(jinode->i_transaction == commit_transaction);
239 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
240 smp_mb__after_atomic();
241 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
242 }
243 spin_unlock(&journal->j_list_lock);
244 return ret;
245}
246
247/*
248 * Wait for data submitted for writeout, refile inodes to proper
249 * transaction if needed.
250 *
251 */
252static int journal_finish_inode_data_buffers(journal_t *journal,
253 transaction_t *commit_transaction)
254{
255 struct jbd2_inode *jinode, *next_i;
256 int err, ret = 0;
257
258 /* For locking, see the comment in journal_submit_data_buffers() */
259 spin_lock(&journal->j_list_lock);
260 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
261 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
262 spin_unlock(&journal->j_list_lock);
263 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
264 if (err) {
265 /*
266 * Because AS_EIO is cleared by
267 * filemap_fdatawait_range(), set it again so
268 * that user process can get -EIO from fsync().
269 */
270 set_bit(AS_EIO,
271 &jinode->i_vfs_inode->i_mapping->flags);
272
273 if (!ret)
274 ret = err;
275 }
276 spin_lock(&journal->j_list_lock);
277 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
278 smp_mb__after_atomic();
279 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
280 }
281
282 /* Now refile inode to proper lists */
283 list_for_each_entry_safe(jinode, next_i,
284 &commit_transaction->t_inode_list, i_list) {
285 list_del(&jinode->i_list);
286 if (jinode->i_next_transaction) {
287 jinode->i_transaction = jinode->i_next_transaction;
288 jinode->i_next_transaction = NULL;
289 list_add(&jinode->i_list,
290 &jinode->i_transaction->t_inode_list);
291 } else {
292 jinode->i_transaction = NULL;
293 }
294 }
295 spin_unlock(&journal->j_list_lock);
296
297 return ret;
298}
299
300static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
301{
302 struct page *page = bh->b_page;
303 char *addr;
304 __u32 checksum;
305
306 addr = kmap_atomic(page);
307 checksum = crc32_be(crc32_sum,
308 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
309 kunmap_atomic(addr);
310
311 return checksum;
312}
313
314static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
315 unsigned long long block)
316{
317 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
318 if (jbd2_has_feature_64bit(j))
319 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
320}
321
322static void jbd2_descr_block_csum_set(journal_t *j,
323 struct buffer_head *bh)
324{
325 struct jbd2_journal_block_tail *tail;
326 __u32 csum;
327
328 if (!jbd2_journal_has_csum_v2or3(j))
329 return;
330
331 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
332 sizeof(struct jbd2_journal_block_tail));
333 tail->t_checksum = 0;
334 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
335 tail->t_checksum = cpu_to_be32(csum);
336}
337
338static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
339 struct buffer_head *bh, __u32 sequence)
340{
341 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
342 struct page *page = bh->b_page;
343 __u8 *addr;
344 __u32 csum32;
345 __be32 seq;
346
347 if (!jbd2_journal_has_csum_v2or3(j))
348 return;
349
350 seq = cpu_to_be32(sequence);
351 addr = kmap_atomic(page);
352 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
353 csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
354 bh->b_size);
355 kunmap_atomic(addr);
356
357 if (jbd2_has_feature_csum3(j))
358 tag3->t_checksum = cpu_to_be32(csum32);
359 else
360 tag->t_checksum = cpu_to_be16(csum32);
361}
362/*
363 * jbd2_journal_commit_transaction
364 *
365 * The primary function for committing a transaction to the log. This
366 * function is called by the journal thread to begin a complete commit.
367 */
368void jbd2_journal_commit_transaction(journal_t *journal)
369{
370 struct transaction_stats_s stats;
371 transaction_t *commit_transaction;
372 struct journal_head *jh;
373 struct buffer_head *descriptor;
374 struct buffer_head **wbuf = journal->j_wbuf;
375 int bufs;
376 int flags;
377 int err;
378 unsigned long long blocknr;
379 ktime_t start_time;
380 u64 commit_time;
381 char *tagp = NULL;
382 journal_header_t *header;
383 journal_block_tag_t *tag = NULL;
384 int space_left = 0;
385 int first_tag = 0;
386 int tag_flag;
387 int i;
388 int tag_bytes = journal_tag_bytes(journal);
389 struct buffer_head *cbh = NULL; /* For transactional checksums */
390 __u32 crc32_sum = ~0;
391 struct blk_plug plug;
392 /* Tail of the journal */
393 unsigned long first_block;
394 tid_t first_tid;
395 int update_tail;
396 int csum_size = 0;
397 LIST_HEAD(io_bufs);
398 LIST_HEAD(log_bufs);
399
400 if (jbd2_journal_has_csum_v2or3(journal))
401 csum_size = sizeof(struct jbd2_journal_block_tail);
402
403 /*
404 * First job: lock down the current transaction and wait for
405 * all outstanding updates to complete.
406 */
407
408 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
409 if (journal->j_flags & JBD2_FLUSHED) {
410 jbd_debug(3, "super block updated\n");
411 mutex_lock(&journal->j_checkpoint_mutex);
412 /*
413 * We hold j_checkpoint_mutex so tail cannot change under us.
414 * We don't need any special data guarantees for writing sb
415 * since journal is empty and it is ok for write to be
416 * flushed only with transaction commit.
417 */
418 jbd2_journal_update_sb_log_tail(journal,
419 journal->j_tail_sequence,
420 journal->j_tail,
421 WRITE_SYNC);
422 mutex_unlock(&journal->j_checkpoint_mutex);
423 } else {
424 jbd_debug(3, "superblock not updated\n");
425 }
426
427 J_ASSERT(journal->j_running_transaction != NULL);
428 J_ASSERT(journal->j_committing_transaction == NULL);
429
430 commit_transaction = journal->j_running_transaction;
431
432 trace_jbd2_start_commit(journal, commit_transaction);
433 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
434 commit_transaction->t_tid);
435
436 write_lock(&journal->j_state_lock);
437 J_ASSERT(commit_transaction->t_state == T_RUNNING);
438 commit_transaction->t_state = T_LOCKED;
439
440 trace_jbd2_commit_locking(journal, commit_transaction);
441 stats.run.rs_wait = commit_transaction->t_max_wait;
442 stats.run.rs_request_delay = 0;
443 stats.run.rs_locked = jiffies;
444 if (commit_transaction->t_requested)
445 stats.run.rs_request_delay =
446 jbd2_time_diff(commit_transaction->t_requested,
447 stats.run.rs_locked);
448 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
449 stats.run.rs_locked);
450
451 spin_lock(&commit_transaction->t_handle_lock);
452 while (atomic_read(&commit_transaction->t_updates)) {
453 DEFINE_WAIT(wait);
454
455 prepare_to_wait(&journal->j_wait_updates, &wait,
456 TASK_UNINTERRUPTIBLE);
457 if (atomic_read(&commit_transaction->t_updates)) {
458 spin_unlock(&commit_transaction->t_handle_lock);
459 write_unlock(&journal->j_state_lock);
460 schedule();
461 write_lock(&journal->j_state_lock);
462 spin_lock(&commit_transaction->t_handle_lock);
463 }
464 finish_wait(&journal->j_wait_updates, &wait);
465 }
466 spin_unlock(&commit_transaction->t_handle_lock);
467
468 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
469 journal->j_max_transaction_buffers);
470
471 /*
472 * First thing we are allowed to do is to discard any remaining
473 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
474 * that there are no such buffers: if a large filesystem
475 * operation like a truncate needs to split itself over multiple
476 * transactions, then it may try to do a jbd2_journal_restart() while
477 * there are still BJ_Reserved buffers outstanding. These must
478 * be released cleanly from the current transaction.
479 *
480 * In this case, the filesystem must still reserve write access
481 * again before modifying the buffer in the new transaction, but
482 * we do not require it to remember exactly which old buffers it
483 * has reserved. This is consistent with the existing behaviour
484 * that multiple jbd2_journal_get_write_access() calls to the same
485 * buffer are perfectly permissible.
486 */
487 while (commit_transaction->t_reserved_list) {
488 jh = commit_transaction->t_reserved_list;
489 JBUFFER_TRACE(jh, "reserved, unused: refile");
490 /*
491 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
492 * leave undo-committed data.
493 */
494 if (jh->b_committed_data) {
495 struct buffer_head *bh = jh2bh(jh);
496
497 jbd_lock_bh_state(bh);
498 jbd2_free(jh->b_committed_data, bh->b_size);
499 jh->b_committed_data = NULL;
500 jbd_unlock_bh_state(bh);
501 }
502 jbd2_journal_refile_buffer(journal, jh);
503 }
504
505 /*
506 * Now try to drop any written-back buffers from the journal's
507 * checkpoint lists. We do this *before* commit because it potentially
508 * frees some memory
509 */
510 spin_lock(&journal->j_list_lock);
511 __jbd2_journal_clean_checkpoint_list(journal, false);
512 spin_unlock(&journal->j_list_lock);
513
514 jbd_debug(3, "JBD2: commit phase 1\n");
515
516 /*
517 * Clear revoked flag to reflect there is no revoked buffers
518 * in the next transaction which is going to be started.
519 */
520 jbd2_clear_buffer_revoked_flags(journal);
521
522 /*
523 * Switch to a new revoke table.
524 */
525 jbd2_journal_switch_revoke_table(journal);
526
527 /*
528 * Reserved credits cannot be claimed anymore, free them
529 */
530 atomic_sub(atomic_read(&journal->j_reserved_credits),
531 &commit_transaction->t_outstanding_credits);
532
533 trace_jbd2_commit_flushing(journal, commit_transaction);
534 stats.run.rs_flushing = jiffies;
535 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
536 stats.run.rs_flushing);
537
538 commit_transaction->t_state = T_FLUSH;
539 journal->j_committing_transaction = commit_transaction;
540 journal->j_running_transaction = NULL;
541 start_time = ktime_get();
542 commit_transaction->t_log_start = journal->j_head;
543 wake_up(&journal->j_wait_transaction_locked);
544 write_unlock(&journal->j_state_lock);
545
546 jbd_debug(3, "JBD2: commit phase 2a\n");
547
548 /*
549 * Now start flushing things to disk, in the order they appear
550 * on the transaction lists. Data blocks go first.
551 */
552 err = journal_submit_data_buffers(journal, commit_transaction);
553 if (err)
554 jbd2_journal_abort(journal, err);
555
556 blk_start_plug(&plug);
557 jbd2_journal_write_revoke_records(journal, commit_transaction,
558 &log_bufs, WRITE_SYNC);
559
560 jbd_debug(3, "JBD2: commit phase 2b\n");
561
562 /*
563 * Way to go: we have now written out all of the data for a
564 * transaction! Now comes the tricky part: we need to write out
565 * metadata. Loop over the transaction's entire buffer list:
566 */
567 write_lock(&journal->j_state_lock);
568 commit_transaction->t_state = T_COMMIT;
569 write_unlock(&journal->j_state_lock);
570
571 trace_jbd2_commit_logging(journal, commit_transaction);
572 stats.run.rs_logging = jiffies;
573 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
574 stats.run.rs_logging);
575 stats.run.rs_blocks =
576 atomic_read(&commit_transaction->t_outstanding_credits);
577 stats.run.rs_blocks_logged = 0;
578
579 J_ASSERT(commit_transaction->t_nr_buffers <=
580 atomic_read(&commit_transaction->t_outstanding_credits));
581
582 err = 0;
583 bufs = 0;
584 descriptor = NULL;
585 while (commit_transaction->t_buffers) {
586
587 /* Find the next buffer to be journaled... */
588
589 jh = commit_transaction->t_buffers;
590
591 /* If we're in abort mode, we just un-journal the buffer and
592 release it. */
593
594 if (is_journal_aborted(journal)) {
595 clear_buffer_jbddirty(jh2bh(jh));
596 JBUFFER_TRACE(jh, "journal is aborting: refile");
597 jbd2_buffer_abort_trigger(jh,
598 jh->b_frozen_data ?
599 jh->b_frozen_triggers :
600 jh->b_triggers);
601 jbd2_journal_refile_buffer(journal, jh);
602 /* If that was the last one, we need to clean up
603 * any descriptor buffers which may have been
604 * already allocated, even if we are now
605 * aborting. */
606 if (!commit_transaction->t_buffers)
607 goto start_journal_io;
608 continue;
609 }
610
611 /* Make sure we have a descriptor block in which to
612 record the metadata buffer. */
613
614 if (!descriptor) {
615 J_ASSERT (bufs == 0);
616
617 jbd_debug(4, "JBD2: get descriptor\n");
618
619 descriptor = jbd2_journal_get_descriptor_buffer(journal);
620 if (!descriptor) {
621 jbd2_journal_abort(journal, -EIO);
622 continue;
623 }
624
625 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
626 (unsigned long long)descriptor->b_blocknr,
627 descriptor->b_data);
628 header = (journal_header_t *)descriptor->b_data;
629 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
630 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
631 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
632
633 tagp = &descriptor->b_data[sizeof(journal_header_t)];
634 space_left = descriptor->b_size -
635 sizeof(journal_header_t);
636 first_tag = 1;
637 set_buffer_jwrite(descriptor);
638 set_buffer_dirty(descriptor);
639 wbuf[bufs++] = descriptor;
640
641 /* Record it so that we can wait for IO
642 completion later */
643 BUFFER_TRACE(descriptor, "ph3: file as descriptor");
644 jbd2_file_log_bh(&log_bufs, descriptor);
645 }
646
647 /* Where is the buffer to be written? */
648
649 err = jbd2_journal_next_log_block(journal, &blocknr);
650 /* If the block mapping failed, just abandon the buffer
651 and repeat this loop: we'll fall into the
652 refile-on-abort condition above. */
653 if (err) {
654 jbd2_journal_abort(journal, err);
655 continue;
656 }
657
658 /*
659 * start_this_handle() uses t_outstanding_credits to determine
660 * the free space in the log, but this counter is changed
661 * by jbd2_journal_next_log_block() also.
662 */
663 atomic_dec(&commit_transaction->t_outstanding_credits);
664
665 /* Bump b_count to prevent truncate from stumbling over
666 the shadowed buffer! @@@ This can go if we ever get
667 rid of the shadow pairing of buffers. */
668 atomic_inc(&jh2bh(jh)->b_count);
669
670 /*
671 * Make a temporary IO buffer with which to write it out
672 * (this will requeue the metadata buffer to BJ_Shadow).
673 */
674 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
675 JBUFFER_TRACE(jh, "ph3: write metadata");
676 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
677 jh, &wbuf[bufs], blocknr);
678 if (flags < 0) {
679 jbd2_journal_abort(journal, flags);
680 continue;
681 }
682 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
683
684 /* Record the new block's tag in the current descriptor
685 buffer */
686
687 tag_flag = 0;
688 if (flags & 1)
689 tag_flag |= JBD2_FLAG_ESCAPE;
690 if (!first_tag)
691 tag_flag |= JBD2_FLAG_SAME_UUID;
692
693 tag = (journal_block_tag_t *) tagp;
694 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
695 tag->t_flags = cpu_to_be16(tag_flag);
696 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
697 commit_transaction->t_tid);
698 tagp += tag_bytes;
699 space_left -= tag_bytes;
700 bufs++;
701
702 if (first_tag) {
703 memcpy (tagp, journal->j_uuid, 16);
704 tagp += 16;
705 space_left -= 16;
706 first_tag = 0;
707 }
708
709 /* If there's no more to do, or if the descriptor is full,
710 let the IO rip! */
711
712 if (bufs == journal->j_wbufsize ||
713 commit_transaction->t_buffers == NULL ||
714 space_left < tag_bytes + 16 + csum_size) {
715
716 jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
717
718 /* Write an end-of-descriptor marker before
719 submitting the IOs. "tag" still points to
720 the last tag we set up. */
721
722 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
723
724 jbd2_descr_block_csum_set(journal, descriptor);
725start_journal_io:
726 for (i = 0; i < bufs; i++) {
727 struct buffer_head *bh = wbuf[i];
728 /*
729 * Compute checksum.
730 */
731 if (jbd2_has_feature_checksum(journal)) {
732 crc32_sum =
733 jbd2_checksum_data(crc32_sum, bh);
734 }
735
736 lock_buffer(bh);
737 clear_buffer_dirty(bh);
738 set_buffer_uptodate(bh);
739 bh->b_end_io = journal_end_buffer_io_sync;
740 submit_bh(WRITE_SYNC, bh);
741 }
742 cond_resched();
743 stats.run.rs_blocks_logged += bufs;
744
745 /* Force a new descriptor to be generated next
746 time round the loop. */
747 descriptor = NULL;
748 bufs = 0;
749 }
750 }
751
752 err = journal_finish_inode_data_buffers(journal, commit_transaction);
753 if (err) {
754 printk(KERN_WARNING
755 "JBD2: Detected IO errors while flushing file data "
756 "on %s\n", journal->j_devname);
757 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
758 jbd2_journal_abort(journal, err);
759 err = 0;
760 }
761
762 /*
763 * Get current oldest transaction in the log before we issue flush
764 * to the filesystem device. After the flush we can be sure that
765 * blocks of all older transactions are checkpointed to persistent
766 * storage and we will be safe to update journal start in the
767 * superblock with the numbers we get here.
768 */
769 update_tail =
770 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
771
772 write_lock(&journal->j_state_lock);
773 if (update_tail) {
774 long freed = first_block - journal->j_tail;
775
776 if (first_block < journal->j_tail)
777 freed += journal->j_last - journal->j_first;
778 /* Update tail only if we free significant amount of space */
779 if (freed < journal->j_maxlen / 4)
780 update_tail = 0;
781 }
782 J_ASSERT(commit_transaction->t_state == T_COMMIT);
783 commit_transaction->t_state = T_COMMIT_DFLUSH;
784 write_unlock(&journal->j_state_lock);
785
786 /*
787 * If the journal is not located on the file system device,
788 * then we must flush the file system device before we issue
789 * the commit record
790 */
791 if (commit_transaction->t_need_data_flush &&
792 (journal->j_fs_dev != journal->j_dev) &&
793 (journal->j_flags & JBD2_BARRIER))
794 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
795
796 /* Done it all: now write the commit record asynchronously. */
797 if (jbd2_has_feature_async_commit(journal)) {
798 err = journal_submit_commit_record(journal, commit_transaction,
799 &cbh, crc32_sum);
800 if (err)
801 __jbd2_journal_abort_hard(journal);
802 }
803
804 blk_finish_plug(&plug);
805
806 /* Lo and behold: we have just managed to send a transaction to
807 the log. Before we can commit it, wait for the IO so far to
808 complete. Control buffers being written are on the
809 transaction's t_log_list queue, and metadata buffers are on
810 the io_bufs list.
811
812 Wait for the buffers in reverse order. That way we are
813 less likely to be woken up until all IOs have completed, and
814 so we incur less scheduling load.
815 */
816
817 jbd_debug(3, "JBD2: commit phase 3\n");
818
819 while (!list_empty(&io_bufs)) {
820 struct buffer_head *bh = list_entry(io_bufs.prev,
821 struct buffer_head,
822 b_assoc_buffers);
823
824 wait_on_buffer(bh);
825 cond_resched();
826
827 if (unlikely(!buffer_uptodate(bh)))
828 err = -EIO;
829 jbd2_unfile_log_bh(bh);
830
831 /*
832 * The list contains temporary buffer heads created by
833 * jbd2_journal_write_metadata_buffer().
834 */
835 BUFFER_TRACE(bh, "dumping temporary bh");
836 __brelse(bh);
837 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
838 free_buffer_head(bh);
839
840 /* We also have to refile the corresponding shadowed buffer */
841 jh = commit_transaction->t_shadow_list->b_tprev;
842 bh = jh2bh(jh);
843 clear_buffer_jwrite(bh);
844 J_ASSERT_BH(bh, buffer_jbddirty(bh));
845 J_ASSERT_BH(bh, !buffer_shadow(bh));
846
847 /* The metadata is now released for reuse, but we need
848 to remember it against this transaction so that when
849 we finally commit, we can do any checkpointing
850 required. */
851 JBUFFER_TRACE(jh, "file as BJ_Forget");
852 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
853 JBUFFER_TRACE(jh, "brelse shadowed buffer");
854 __brelse(bh);
855 }
856
857 J_ASSERT (commit_transaction->t_shadow_list == NULL);
858
859 jbd_debug(3, "JBD2: commit phase 4\n");
860
861 /* Here we wait for the revoke record and descriptor record buffers */
862 while (!list_empty(&log_bufs)) {
863 struct buffer_head *bh;
864
865 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
866 wait_on_buffer(bh);
867 cond_resched();
868
869 if (unlikely(!buffer_uptodate(bh)))
870 err = -EIO;
871
872 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
873 clear_buffer_jwrite(bh);
874 jbd2_unfile_log_bh(bh);
875 __brelse(bh); /* One for getblk */
876 /* AKPM: bforget here */
877 }
878
879 if (err)
880 jbd2_journal_abort(journal, err);
881
882 jbd_debug(3, "JBD2: commit phase 5\n");
883 write_lock(&journal->j_state_lock);
884 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
885 commit_transaction->t_state = T_COMMIT_JFLUSH;
886 write_unlock(&journal->j_state_lock);
887
888 if (!jbd2_has_feature_async_commit(journal)) {
889 err = journal_submit_commit_record(journal, commit_transaction,
890 &cbh, crc32_sum);
891 if (err)
892 __jbd2_journal_abort_hard(journal);
893 }
894 if (cbh)
895 err = journal_wait_on_commit_record(journal, cbh);
896 if (jbd2_has_feature_async_commit(journal) &&
897 journal->j_flags & JBD2_BARRIER) {
898 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
899 }
900
901 if (err)
902 jbd2_journal_abort(journal, err);
903
904 /*
905 * Now disk caches for filesystem device are flushed so we are safe to
906 * erase checkpointed transactions from the log by updating journal
907 * superblock.
908 */
909 if (update_tail)
910 jbd2_update_log_tail(journal, first_tid, first_block);
911
912 /* End of a transaction! Finally, we can do checkpoint
913 processing: any buffers committed as a result of this
914 transaction can be removed from any checkpoint list it was on
915 before. */
916
917 jbd_debug(3, "JBD2: commit phase 6\n");
918
919 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
920 J_ASSERT(commit_transaction->t_buffers == NULL);
921 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
922 J_ASSERT(commit_transaction->t_shadow_list == NULL);
923
924restart_loop:
925 /*
926 * As there are other places (journal_unmap_buffer()) adding buffers
927 * to this list we have to be careful and hold the j_list_lock.
928 */
929 spin_lock(&journal->j_list_lock);
930 while (commit_transaction->t_forget) {
931 transaction_t *cp_transaction;
932 struct buffer_head *bh;
933 int try_to_free = 0;
934
935 jh = commit_transaction->t_forget;
936 spin_unlock(&journal->j_list_lock);
937 bh = jh2bh(jh);
938 /*
939 * Get a reference so that bh cannot be freed before we are
940 * done with it.
941 */
942 get_bh(bh);
943 jbd_lock_bh_state(bh);
944 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
945
946 /*
947 * If there is undo-protected committed data against
948 * this buffer, then we can remove it now. If it is a
949 * buffer needing such protection, the old frozen_data
950 * field now points to a committed version of the
951 * buffer, so rotate that field to the new committed
952 * data.
953 *
954 * Otherwise, we can just throw away the frozen data now.
955 *
956 * We also know that the frozen data has already fired
957 * its triggers if they exist, so we can clear that too.
958 */
959 if (jh->b_committed_data) {
960 jbd2_free(jh->b_committed_data, bh->b_size);
961 jh->b_committed_data = NULL;
962 if (jh->b_frozen_data) {
963 jh->b_committed_data = jh->b_frozen_data;
964 jh->b_frozen_data = NULL;
965 jh->b_frozen_triggers = NULL;
966 }
967 } else if (jh->b_frozen_data) {
968 jbd2_free(jh->b_frozen_data, bh->b_size);
969 jh->b_frozen_data = NULL;
970 jh->b_frozen_triggers = NULL;
971 }
972
973 spin_lock(&journal->j_list_lock);
974 cp_transaction = jh->b_cp_transaction;
975 if (cp_transaction) {
976 JBUFFER_TRACE(jh, "remove from old cp transaction");
977 cp_transaction->t_chp_stats.cs_dropped++;
978 __jbd2_journal_remove_checkpoint(jh);
979 }
980
981 /* Only re-checkpoint the buffer_head if it is marked
982 * dirty. If the buffer was added to the BJ_Forget list
983 * by jbd2_journal_forget, it may no longer be dirty and
984 * there's no point in keeping a checkpoint record for
985 * it. */
986
987 /*
988 * A buffer which has been freed while still being journaled by
989 * a previous transaction.
990 */
991 if (buffer_freed(bh)) {
992 /*
993 * If the running transaction is the one containing
994 * "add to orphan" operation (b_next_transaction !=
995 * NULL), we have to wait for that transaction to
996 * commit before we can really get rid of the buffer.
997 * So just clear b_modified to not confuse transaction
998 * credit accounting and refile the buffer to
999 * BJ_Forget of the running transaction. If the just
1000 * committed transaction contains "add to orphan"
1001 * operation, we can completely invalidate the buffer
1002 * now. We are rather thorough in that since the
1003 * buffer may be still accessible when blocksize <
1004 * pagesize and it is attached to the last partial
1005 * page.
1006 */
1007 jh->b_modified = 0;
1008 if (!jh->b_next_transaction) {
1009 clear_buffer_freed(bh);
1010 clear_buffer_jbddirty(bh);
1011 clear_buffer_mapped(bh);
1012 clear_buffer_new(bh);
1013 clear_buffer_req(bh);
1014 bh->b_bdev = NULL;
1015 }
1016 }
1017
1018 if (buffer_jbddirty(bh)) {
1019 JBUFFER_TRACE(jh, "add to new checkpointing trans");
1020 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
1021 if (is_journal_aborted(journal))
1022 clear_buffer_jbddirty(bh);
1023 } else {
1024 J_ASSERT_BH(bh, !buffer_dirty(bh));
1025 /*
1026 * The buffer on BJ_Forget list and not jbddirty means
1027 * it has been freed by this transaction and hence it
1028 * could not have been reallocated until this
1029 * transaction has committed. *BUT* it could be
1030 * reallocated once we have written all the data to
1031 * disk and before we process the buffer on BJ_Forget
1032 * list.
1033 */
1034 if (!jh->b_next_transaction)
1035 try_to_free = 1;
1036 }
1037 JBUFFER_TRACE(jh, "refile or unfile buffer");
1038 __jbd2_journal_refile_buffer(jh);
1039 jbd_unlock_bh_state(bh);
1040 if (try_to_free)
1041 release_buffer_page(bh); /* Drops bh reference */
1042 else
1043 __brelse(bh);
1044 cond_resched_lock(&journal->j_list_lock);
1045 }
1046 spin_unlock(&journal->j_list_lock);
1047 /*
1048 * This is a bit sleazy. We use j_list_lock to protect transition
1049 * of a transaction into T_FINISHED state and calling
1050 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1051 * other checkpointing code processing the transaction...
1052 */
1053 write_lock(&journal->j_state_lock);
1054 spin_lock(&journal->j_list_lock);
1055 /*
1056 * Now recheck if some buffers did not get attached to the transaction
1057 * while the lock was dropped...
1058 */
1059 if (commit_transaction->t_forget) {
1060 spin_unlock(&journal->j_list_lock);
1061 write_unlock(&journal->j_state_lock);
1062 goto restart_loop;
1063 }
1064
1065 /* Add the transaction to the checkpoint list.
1066 * __jbd2_journal_remove_checkpoint() cannot destroy the transaction
1067 * under us because it is not marked as T_FINISHED yet */
1068 if (journal->j_checkpoint_transactions == NULL) {
1069 journal->j_checkpoint_transactions = commit_transaction;
1070 commit_transaction->t_cpnext = commit_transaction;
1071 commit_transaction->t_cpprev = commit_transaction;
1072 } else {
1073 commit_transaction->t_cpnext =
1074 journal->j_checkpoint_transactions;
1075 commit_transaction->t_cpprev =
1076 commit_transaction->t_cpnext->t_cpprev;
1077 commit_transaction->t_cpnext->t_cpprev =
1078 commit_transaction;
1079 commit_transaction->t_cpprev->t_cpnext =
1080 commit_transaction;
1081 }
1082 spin_unlock(&journal->j_list_lock);
1083
1084 /* Done with this transaction! */
1085
1086 jbd_debug(3, "JBD2: commit phase 7\n");
1087
1088 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1089
1090 commit_transaction->t_start = jiffies;
1091 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1092 commit_transaction->t_start);
1093
1094 /*
1095 * File the transaction statistics
1096 */
1097 stats.ts_tid = commit_transaction->t_tid;
1098 stats.run.rs_handle_count =
1099 atomic_read(&commit_transaction->t_handle_count);
1100 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1101 commit_transaction->t_tid, &stats.run);
1102 stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1103
1104 commit_transaction->t_state = T_COMMIT_CALLBACK;
1105 J_ASSERT(commit_transaction == journal->j_committing_transaction);
1106 journal->j_commit_sequence = commit_transaction->t_tid;
1107 journal->j_committing_transaction = NULL;
1108 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1109
1110 /*
1111 * weight the commit time higher than the average time so we don't
1112 * react too strongly to vast changes in the commit time
1113 */
1114 if (likely(journal->j_average_commit_time))
1115 journal->j_average_commit_time = (commit_time +
1116 journal->j_average_commit_time*3) / 4;
1117 else
1118 journal->j_average_commit_time = commit_time;
1119
1120 write_unlock(&journal->j_state_lock);
1121
1122 if (journal->j_commit_callback)
1123 journal->j_commit_callback(journal, commit_transaction);
1124
1125 trace_jbd2_end_commit(journal, commit_transaction);
1126 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1127 journal->j_commit_sequence, journal->j_tail_sequence);
1128
1129 write_lock(&journal->j_state_lock);
1130 spin_lock(&journal->j_list_lock);
1131 commit_transaction->t_state = T_FINISHED;
1132 /* Check if the transaction can be dropped now that we are finished */
1133 if (commit_transaction->t_checkpoint_list == NULL &&
1134 commit_transaction->t_checkpoint_io_list == NULL) {
1135 __jbd2_journal_drop_transaction(journal, commit_transaction);
1136 jbd2_journal_free_transaction(commit_transaction);
1137 }
1138 spin_unlock(&journal->j_list_lock);
1139 write_unlock(&journal->j_state_lock);
1140 wake_up(&journal->j_wait_done_commit);
1141
1142 /*
1143 * Calculate overall stats
1144 */
1145 spin_lock(&journal->j_history_lock);
1146 journal->j_stats.ts_tid++;
1147 journal->j_stats.ts_requested += stats.ts_requested;
1148 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1149 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1150 journal->j_stats.run.rs_running += stats.run.rs_running;
1151 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1152 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1153 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1154 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1155 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1156 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1157 spin_unlock(&journal->j_history_lock);
1158}