--- linux-2.4.19-ext3/fs/buffer.c.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/fs/buffer.c	Fri Oct 11 20:18:58 2002
@@ -1754,9 +1754,14 @@
 	}
 
 	/* Stage 3: start the IO */
-	for (i = 0; i < nr; i++)
-		submit_bh(READ, arr[i]);
-
+	for (i = 0; i < nr; i++) {
+		struct buffer_head * bh = arr[i];
+		if (buffer_uptodate(bh))
+			end_buffer_io_async(bh, 1);
+		else
+			submit_bh(READ, bh);
+	}
+
 	return 0;
 }
 
--- linux-2.4.19-ext3/fs/ext3/file.c.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/fs/ext3/file.c	Fri Oct 11 20:18:58 2002
@@ -61,19 +61,52 @@
 static ssize_t
 ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
 {
+	int ret, err;
 	struct inode *inode = file->f_dentry->d_inode;
 
-	/*
-	 * Nasty: if the file is subject to synchronous writes then we need
-	 * to force generic_osync_inode() to call ext3_write_inode().
-	 * We do that by marking the inode dirty.  This adds much more
-	 * computational expense than we need, but we're going to sync
-	 * anyway.
-	 */
-	if (IS_SYNC(inode) || (file->f_flags & O_SYNC))
-		mark_inode_dirty(inode);
+	ret = generic_file_write(file, buf, count, ppos);
 
-	return generic_file_write(file, buf, count, ppos);
+	/* Skip file flushing code if there was an error, or if nothing
+	   was written. */
+	if (ret <= 0)
+		return ret;
+
+	/* If the inode is IS_SYNC, or is O_SYNC and we are doing
+	   data-journaling, then we need to make sure that we force the
+	   transaction to disk to keep all metadata uptodate
+	   synchronously. */
+
+	if (file->f_flags & O_SYNC) {
+		/* If we are non-data-journaled, then the dirty data has
+		   already been flushed to backing store by
+		   generic_osync_inode, and the inode has been flushed
+		   too if there have been any modifications other than
+		   mere timestamp updates.
+
+		   Open question --- do we care about flushing
+		   timestamps too if the inode is IS_SYNC? */
+		if (!ext3_should_journal_data(inode))
+			return ret;
+
+		goto force_commit;
+	}
+
+	/* So we know that there has been no forced data flush.  If the
+	   inode is marked IS_SYNC, we need to force one ourselves. */
+	if (!IS_SYNC(inode))
+		return ret;
+
+	/* Open question #2 --- should we force data to disk here too?
+	   If we don't, the only impact is that data=writeback
+	   filesystems won't flush data to disk automatically on
+	   IS_SYNC, only metadata (but historically, that is what ext2
+	   has done.) */
+
+force_commit:
+	err = ext3_force_commit(inode->i_sb);
+	if (err)
+		return err;
+	return ret;
 }
 
 struct file_operations ext3_file_operations = {
--- linux-2.4.19-ext3/fs/ext3/fsync.c.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/fs/ext3/fsync.c	Fri Oct 11 20:18:58 2002
@@ -62,7 +62,12 @@
 	 *  we'll end up waiting on them in commit.
 	 */
 	ret = fsync_inode_buffers(inode);
-	ret |= fsync_inode_data_buffers(inode);
+
+	/* In writeback mode, we need to force out data buffers too.  In
+	 * the other modes, ext3_force_commit takes care of forcing out
+	 * just the right data blocks. */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
+		ret |= fsync_inode_data_buffers(inode);
 
 	ext3_force_commit(inode->i_sb);
 
--- linux-2.4.19-ext3/fs/ext3/ialloc.c.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/fs/ext3/ialloc.c	Fri Oct 11 20:18:58 2002
@@ -392,7 +392,7 @@
 
 	err = -ENOSPC;
 	if (!gdp)
-		goto fail;
+		goto out;
 
 	err = -EIO;
 	bitmap_nr = load_inode_bitmap (sb, i);
@@ -523,9 +523,10 @@
 	return inode;
 
 fail:
+	ext3_std_error(sb, err);
+out:
 	unlock_super(sb);
 	iput(inode);
-	ext3_std_error(sb, err);
 	return ERR_PTR(err);
 }
 
--- linux-2.4.19-ext3/fs/ext3/inode.c.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/fs/ext3/inode.c	Fri Oct 11 20:18:58 2002
@@ -412,6 +412,7 @@
 	return NULL;
 
 changed:
+	brelse(bh);
 	*err = -EAGAIN;
 	goto no_block;
 failure:
@@ -948,11 +949,13 @@
 }
 
 static int walk_page_buffers(	handle_t *handle,
+				struct inode *inode,
 				struct buffer_head *head,
 				unsigned from,
 				unsigned to,
 				int *partial,
 				int (*fn)(	handle_t *handle,
+						struct inode *inode,
 						struct buffer_head *bh))
 {
 	struct buffer_head *bh;
@@ -970,7 +973,7 @@
 			*partial = 1;
 			continue;
 		}
-		err = (*fn)(handle, bh);
+		err = (*fn)(handle, inode, bh);
 		if (!ret)
 			ret = err;
 	}
@@ -1003,7 +1006,7 @@
  * write.
  */
-static int do_journal_get_write_access(handle_t *handle,
+static int do_journal_get_write_access(handle_t *handle, struct inode *inode,
 					struct buffer_head *bh)
 {
 	return ext3_journal_get_write_access(handle, bh);
 }
@@ -1029,7 +1032,7 @@
 		goto prepare_write_failed;
 
 	if (ext3_should_journal_data(inode)) {
-		ret = walk_page_buffers(handle, page->buffers,
+		ret = walk_page_buffers(handle, inode, page->buffers,
 			from, to, NULL, do_journal_get_write_access);
 		if (ret) {
 			/*
@@ -1050,24 +1053,32 @@
 	return ret;
 }
 
-static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh)
+static int journal_dirty_sync_data(handle_t *handle, struct inode *inode,
+				   struct buffer_head *bh)
 {
-	return ext3_journal_dirty_data(handle, bh, 0);
+	int ret = ext3_journal_dirty_data(handle, bh, 0);
+	if (bh->b_inode != inode)
+		buffer_insert_inode_data_queue(bh, inode);
+	return ret;
 }
 
 /*
  * For ext3_writepage().  We also brelse() the buffer to account for
  * the bget() which ext3_writepage() performs.
  */
-static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh)
+static int journal_dirty_async_data(handle_t *handle, struct inode *inode,
+				    struct buffer_head *bh)
 {
 	int ret = ext3_journal_dirty_data(handle, bh, 1);
+	if (bh->b_inode != inode)
+		buffer_insert_inode_data_queue(bh, inode);
 	__brelse(bh);
 	return ret;
 }
 
 /* For commit_write() in data=journal mode */
-static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+static int commit_write_fn(handle_t *handle, struct inode *inode,
+			   struct buffer_head *bh)
 {
 	set_bit(BH_Uptodate, &bh->b_state);
 	return ext3_journal_dirty_metadata(handle, bh);
 }
@@ -1102,7 +1113,7 @@
 		int partial = 0;
 		loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
 
-		ret = walk_page_buffers(handle, page->buffers,
+		ret = walk_page_buffers(handle, inode, page->buffers,
 			from, to, &partial, commit_write_fn);
 		if (!partial)
 			SetPageUptodate(page);
@@ -1112,7 +1123,7 @@
 		EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	} else {
 		if (ext3_should_order_data(inode)) {
-			ret = walk_page_buffers(handle, page->buffers,
+			ret = walk_page_buffers(handle, inode, page->buffers,
 				from, to, NULL, journal_dirty_sync_data);
 		}
 		/* Be careful here if generic_commit_write becomes a
@@ -1194,7 +1205,8 @@
 	return generic_block_bmap(mapping,block,ext3_get_block);
 }
 
-static int bget_one(handle_t *handle, struct buffer_head *bh)
+static int bget_one(handle_t *handle, struct inode *inode,
+		    struct buffer_head *bh)
 {
 	atomic_inc(&bh->b_count);
 	return 0;
@@ -1293,7 +1305,7 @@
 			create_empty_buffers(page, inode->i_dev,
 					inode->i_sb->s_blocksize);
 		page_buffers = page->buffers;
-		walk_page_buffers(handle, page_buffers, 0,
+		walk_page_buffers(handle, inode, page_buffers, 0,
 				PAGE_CACHE_SIZE, NULL, bget_one);
 	}
 
@@ -1311,7 +1323,7 @@
 
 	/* And attach them to the current transaction */
 	if (order_data) {
-		err = walk_page_buffers(handle, page_buffers,
+		err = walk_page_buffers(handle, inode, page_buffers,
 			0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
 		if (!ret)
 			ret = err;
@@ -1579,8 +1591,10 @@
 	}
 	ext3_mark_inode_dirty(handle, inode);
 	ext3_journal_test_restart(handle, inode);
-	BUFFER_TRACE(bh, "get_write_access");
-	ext3_journal_get_write_access(handle, bh);
+	if (bh) {
+		BUFFER_TRACE(bh, "retaking write access");
+		ext3_journal_get_write_access(handle, bh);
+	}
 }
 
 /*
--- linux-2.4.19-ext3/fs/ext3/namei.c.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/fs/ext3/namei.c	Fri Oct 11 20:18:58 2002
@@ -354,8 +354,8 @@
 	 */
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 	dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
-	ext3_mark_inode_dirty(handle, dir);
 	dir->i_version = ++event;
+	ext3_mark_inode_dirty(handle, dir);
 	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
 	ext3_journal_dirty_metadata(handle, bh);
 	brelse(bh);
@@ -464,8 +464,8 @@
 		inode->i_op = &ext3_file_inode_operations;
 		inode->i_fop = &ext3_file_operations;
 		inode->i_mapping->a_ops = &ext3_aops;
-		ext3_mark_inode_dirty(handle, inode);
 		err = ext3_add_nondir(handle, dentry, inode);
+		ext3_mark_inode_dirty(handle, inode);
 	}
 	ext3_journal_stop(handle, dir);
 	return err;
@@ -489,8 +489,8 @@
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, mode, rdev);
-		ext3_mark_inode_dirty(handle, inode);
 		err = ext3_add_nondir(handle, dentry, inode);
+		ext3_mark_inode_dirty(handle, inode);
 	}
 	ext3_journal_stop(handle, dir);
 	return err;
@@ -829,9 +829,9 @@
 	 * recovery. */
 	inode->i_size = 0;
 	ext3_orphan_add(handle, inode);
-	ext3_mark_inode_dirty(handle, inode);
 	dir->i_nlink--;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	ext3_mark_inode_dirty(handle, inode);
 	dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
 	ext3_mark_inode_dirty(handle, dir);
 
@@ -883,8 +883,8 @@
 	inode->i_nlink--;
 	if (!inode->i_nlink)
 		ext3_orphan_add(handle, inode);
-	ext3_mark_inode_dirty(handle, inode);
 	inode->i_ctime = dir->i_ctime;
+	ext3_mark_inode_dirty(handle, inode);
 	retval = 0;
 
 end_unlink:
@@ -933,8 +933,8 @@
 		inode->i_size = l-1;
 	}
 	inode->u.ext3_i.i_disksize = inode->i_size;
-	ext3_mark_inode_dirty(handle, inode);
 	err = ext3_add_nondir(handle, dentry, inode);
+	ext3_mark_inode_dirty(handle, inode);
 out_stop:
 	ext3_journal_stop(handle, dir);
 	return err;
@@ -970,8 +970,8 @@
 	ext3_inc_count(handle, inode);
 	atomic_inc(&inode->i_count);
 
-	ext3_mark_inode_dirty(handle, inode);
 	err = ext3_add_nondir(handle, dentry, inode);
+	ext3_mark_inode_dirty(handle, inode);
 	ext3_journal_stop(handle, dir);
 	return err;
 }
--- linux-2.4.19-ext3/fs/ext3/super.c.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/fs/ext3/super.c	Fri Oct 11 20:18:58 2002
@@ -646,6 +646,11 @@
 			*mount_options &= ~EXT3_MOUNT_DATA_FLAGS;
 			*mount_options |= data_opt;
 		}
+	} else if (!strcmp (this_char, "commit")) {
+		unsigned long v;
+		if (want_numeric(value, "commit", &v))
+			return 0;
+		sbi->s_commit_interval = (HZ * v);
 	} else {
 		printk (KERN_ERR
 			"EXT3-fs: Unrecognized mount option %s\n",
@@ -1229,6 +1234,22 @@
 	return NULL;
 }
 
+/*
+ * Setup any per-fs journal parameters now.  We'll do this both on
+ * initial mount, once the journal has been initialised but before we've
+ * done any recovery; and again on any subsequent remount.
+ */
+static void ext3_init_journal_params(struct ext3_sb_info *sbi,
+				     journal_t *journal)
+{
+	if (sbi->s_commit_interval)
+		journal->j_commit_interval = sbi->s_commit_interval;
+	/* We could also set up an ext3-specific default for the commit
+	 * interval here, but for now we'll just fall back to the jbd
+	 * default. */
+}
+
+
 static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum)
 {
 	struct inode *journal_inode;
@@ -1263,7 +1284,7 @@
 		printk(KERN_ERR "EXT3-fs: Could not load journal inode\n");
 		iput(journal_inode);
 	}
-
+	ext3_init_journal_params(EXT3_SB(sb), journal);
 	return journal;
 }
 
@@ -1341,6 +1362,7 @@
 		goto out_journal;
 	}
 	EXT3_SB(sb)->journal_bdev = bdev;
+	ext3_init_journal_params(EXT3_SB(sb), journal);
 	return journal;
 out_journal:
 	journal_destroy(journal);
@@ -1589,8 +1611,10 @@
 		journal_t *journal = EXT3_SB(sb)->s_journal;
 
 		/* Now we set up the journal barrier. */
+		unlock_super(sb);
 		journal_lock_updates(journal);
 		journal_flush(journal);
+		lock_super(sb);
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
 
@@ -1636,6 +1660,8 @@
 
 	es = sbi->s_es;
 
+	ext3_init_journal_params(sbi, sbi->s_journal);
+
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
 		if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
 			return -EROFS;
--- linux-2.4.19-ext3/fs/jbd/checkpoint.c.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/fs/jbd/checkpoint.c	Fri Oct 11 20:18:58 2002
@@ -594,7 +594,8 @@
 	J_ASSERT (transaction->t_log_list == NULL);
 	J_ASSERT (transaction->t_checkpoint_list == NULL);
 	J_ASSERT (transaction->t_updates == 0);
-
+	J_ASSERT (list_empty(&transaction->t_jcb));
+
 	J_ASSERT (transaction->t_journal->j_committing_transaction !=
 			transaction);
 
--- linux-2.4.19-ext3/fs/jbd/commit.c.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/fs/jbd/commit.c	Fri Oct 11 20:18:58 2002
@@ -475,7 +475,7 @@
    transaction's t_log_list queue, and metadata buffers are on
    the t_iobuf_list queue.
 
-   Wait for the transactions in reverse order.  That way we are
+   Wait for the buffers in reverse order.  That way we are
    less likely to be woken up until all IOs have completed, and
    so we incur less scheduling load.
 */
@@ -566,8 +566,10 @@
 
 	jbd_debug(3, "JBD: commit phase 6\n");
 
-	if (is_journal_aborted(journal))
+	if (is_journal_aborted(journal)) {
+		unlock_journal(journal);
 		goto skip_commit;
+	}
 
 	/* Done it all: now write the commit record.  We should have
 	 * cleaned up our previous buffers by now, so if we are in abort
@@ -577,9 +579,10 @@
 	descriptor = journal_get_descriptor_buffer(journal);
 	if (!descriptor) {
 		__journal_abort_hard(journal);
+		unlock_journal(journal);
 		goto skip_commit;
 	}
-
+
 	/* AKPM: buglet - add `i' to tmp! */
 	for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
 		journal_header_t *tmp =
@@ -600,14 +603,32 @@
 		put_bh(bh);		/* One for getblk() */
 		journal_unlock_journal_head(descriptor);
 	}
-	lock_journal(journal);
 
 	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
          transaction can be removed from any checkpoint list it was on
          before. */
 
-skip_commit:
+skip_commit: /* The journal should be unlocked by now. */
+
+	/* Call any callbacks that had been registered for handles in this
+	 * transaction.  It is up to the callback to free any allocated
+	 * memory.
+	 */
+	if (!list_empty(&commit_transaction->t_jcb)) {
+		struct list_head *p, *n;
+		int error = is_journal_aborted(journal);
+
+		list_for_each_safe(p, n, &commit_transaction->t_jcb) {
+			struct journal_callback *jcb;
+
+			jcb = list_entry(p, struct journal_callback, jcb_list);
+			list_del(p);
+			jcb->jcb_func(jcb, error);
+		}
+	}
+
+	lock_journal(journal);
 
 	jbd_debug(3, "JBD: commit phase 7\n");
 
@@ -663,19 +684,45 @@
 			 * there's no point in keeping a checkpoint record for
 			 * it. */
 			bh = jh2bh(jh);
+
+			/* A buffer which has been freed while still being
+			 * journaled by a previous transaction may end up still
+			 * being dirty here, but we want to avoid writing back
+			 * that buffer in the future now that the last use has
+			 * been committed.  That's not only a performance gain,
+			 * it also stops aliasing problems if the buffer is left
+			 * behind for writeback and gets reallocated for another
+			 * use in a different page. */
+			if (__buffer_state(bh, Freed)) {
+				clear_bit(BH_Freed, &bh->b_state);
+				clear_bit(BH_JBDDirty, &bh->b_state);
+			}
+
 			if (buffer_jdirty(bh)) {
 				JBUFFER_TRACE(jh, "add to new checkpointing trans");
 				__journal_insert_checkpoint(jh, commit_transaction);
 				JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 				__journal_refile_buffer(jh);
 			} else {
+				struct page *page = bh->b_page;
+
 				J_ASSERT_BH(bh, !buffer_dirty(bh));
 				J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
 				__journal_unfile_buffer(jh);
 				jh->b_transaction = 0;
 				__journal_remove_journal_head(bh);
-				__brelse(bh);
+
+				if (TryLockPage(page)) {
+					__brelse(bh);
+				} else {
+					__brelse(bh);
+					page_cache_get(page);
+					try_to_free_buffers(page, 0);
+					unlock_page(page);
+					page_cache_release(page);
+				}
 			}
+
 			spin_unlock(&journal_datalist_lock);
 		}
 
--- linux-2.4.19-ext3/fs/jbd/journal.c.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/fs/jbd/journal.c	Fri Oct 11 20:18:58 2002
@@ -58,6 +58,7 @@
 #endif
 EXPORT_SYMBOL(journal_flush);
 EXPORT_SYMBOL(journal_revoke);
+EXPORT_SYMBOL(journal_callback_set);
 
 EXPORT_SYMBOL(journal_init_dev);
 EXPORT_SYMBOL(journal_init_inode);
@@ -1488,6 +1489,49 @@
 	unlock_journal(journal);
 }
 
+
+/*
+ * Report any unexpected dirty buffers which turn up.  Normally those
+ * indicate an error, but they can occur if the user is running (say)
+ * tune2fs to modify the live filesystem, so we need the option of
+ * continuing as gracefully as possible.
+ *
+ * The caller should already hold the journal lock and
+ * journal_datalist_lock spinlock: most callers will need those anyway
+ * in order to probe the buffer's journaling state safely.
+ */
+void __jbd_unexpected_dirty_buffer(char *function, int line,
+				   struct journal_head *jh)
+{
+	struct buffer_head *bh = jh2bh(jh);
+	int jlist;
+
+	if (buffer_dirty(bh)) {
+		printk ("%sUnexpected dirty buffer encountered at "
+			"%s:%d (%s blocknr %lu)\n",
+			KERN_WARNING, function, line,
+			kdevname(bh->b_dev), bh->b_blocknr);
+#ifdef JBD_PARANOID_WRITES
+		J_ASSERT_BH (bh, !buffer_dirty(bh));
+#endif
+
+		/* If this buffer is one which might reasonably be dirty
+		 * --- ie. data, or not part of this journal --- then
+		 * we're OK to leave it alone, but otherwise we need to
+		 * move the dirty bit to the journal's own internal
+		 * JBDDirty bit. */
+		jlist = jh->b_jlist;
+
+		if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+		    jlist == BJ_Shadow || jlist == BJ_Forget) {
+			if (atomic_set_buffer_clean(jh2bh(jh))) {
+				set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
+			}
+		}
+	}
+}
+
+
 int journal_blocks_per_page(struct inode *inode)
 {
 	return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
--- linux-2.4.19-ext3/fs/jbd/transaction.c.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/fs/jbd/transaction.c	Fri Oct 11 20:18:58 2002
@@ -57,6 +57,7 @@
 	transaction->t_state = T_RUNNING;
 	transaction->t_tid = journal->j_transaction_sequence++;
 	transaction->t_expires = jiffies + journal->j_commit_interval;
+	INIT_LIST_HEAD(&transaction->t_jcb);
 
 	/* Set up the commit timer for the new transaction. */
 	J_ASSERT (!journal->j_commit_timer_active);
@@ -90,7 +91,14 @@
 	transaction_t *transaction;
 	int needed;
 	int nblocks = handle->h_buffer_credits;
-
+
+	if (nblocks > journal->j_max_transaction_buffers) {
+		jbd_debug(1, "JBD: %s wants too many credits (%d > %d)\n",
+			  current->comm, nblocks,
+			  journal->j_max_transaction_buffers);
+		return -ENOSPC;
+	}
+
 	jbd_debug(3, "New handle %p going live.\n", handle);
 
 repeat:
@@ -201,6 +209,20 @@
 	return 0;
 }
 
+/* Allocate a new handle.  This should probably be in a slab... */
+static handle_t *new_handle(int nblocks)
+{
+	handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
+	if (!handle)
+		return NULL;
+	memset(handle, 0, sizeof (handle_t));
+	handle->h_buffer_credits = nblocks;
+	handle->h_ref = 1;
+	INIT_LIST_HEAD(&handle->h_jcb);
+
+	return handle;
+}
+
 /*
  * Obtain a new handle.
  *
@@ -227,14 +249,11 @@
 		handle->h_ref++;
 		return handle;
 	}
-
-	handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
+
+	handle = new_handle(nblocks);
 	if (!handle)
 		return ERR_PTR(-ENOMEM);
-	memset (handle, 0, sizeof (handle_t));
-	handle->h_buffer_credits = nblocks;
-	handle->h_ref = 1;
 
 	current->journal_info = handle;
 	err = start_this_handle(journal, handle);
@@ -333,14 +352,11 @@
 
 	if (is_journal_aborted(journal))
 		return ERR_PTR(-EIO);
-
-	handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
+
+	handle = new_handle(nblocks);
 	if (!handle)
 		return ERR_PTR(-ENOMEM);
-	memset (handle, 0, sizeof (handle_t));
-	handle->h_buffer_credits = nblocks;
-	handle->h_ref = 1;
 
 	current->journal_info = handle;
 	err = try_start_this_handle(journal, handle);
@@ -539,76 +555,67 @@
 static int do_get_write_access(handle_t *handle, struct journal_head *jh,
 			       int force_copy)
 {
+	struct buffer_head *bh;
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
 	int error;
 	char *frozen_buffer = NULL;
 	int need_copy = 0;
-
+	int locked;
+
 	jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
 
 	JBUFFER_TRACE(jh, "entry");
 repeat:
+	bh = jh2bh(jh);
+
 	/* @@@ Need to check for errors here at some point. */
 
 	/*
-	 * AKPM: neither bdflush nor kupdate run with the BKL.  There's
-	 * nothing we can do to prevent them from starting writeout of a
-	 * BUF_DIRTY buffer at any time.  And checkpointing buffers are on
-	 * BUF_DIRTY.  So.  We no longer assert that the buffer is unlocked.
-	 *
-	 * However.  It is very wrong for us to allow ext3 to start directly
-	 * altering the ->b_data of buffers which may at that very time be
-	 * undergoing writeout to the client filesystem.  This can leave
-	 * the filesystem in an inconsistent, transient state if we crash.
-	 * So what we do is to steal the buffer if it is in checkpoint
-	 * mode and dirty.  The journal lock will keep out checkpoint-mode
-	 * state transitions within journal_remove_checkpoint() and the buffer
-	 * is locked to keep bdflush/kupdate/whoever away from it as well.
-	 *
 	 * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a
 	 * simple lock_journal().  This code here will care for locked buffers.
 	 */
-	/*
-	 * The buffer_locked() || buffer_dirty() tests here are simply an
-	 * optimisation tweak.  If anyone else in the system decides to
-	 * lock this buffer later on, we'll blow up.  There doesn't seem
-	 * to be a good reason why they should do this.
-	 */
-	if (jh->b_cp_transaction &&
-	    (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) {
+	locked = test_and_set_bit(BH_Lock, &bh->b_state);
+	if (locked) {
+		/* We can't reliably test the buffer state if we found
+		 * it already locked, so just wait for the lock and
+		 * retry. */
 		unlock_journal(journal);
-		lock_buffer(jh2bh(jh));
-		spin_lock(&journal_datalist_lock);
-		if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) {
-			/* OK, we need to steal it */
-			JBUFFER_TRACE(jh, "stealing from checkpoint mode");
-			J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-			J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
-
-			J_ASSERT(handle->h_buffer_credits > 0);
-			handle->h_buffer_credits--;
-
-			/* This will clear BH_Dirty and set BH_JBDDirty. */
-			JBUFFER_TRACE(jh, "file as BJ_Reserved");
-			__journal_file_buffer(jh, transaction, BJ_Reserved);
-
-			/* And pull it off BUF_DIRTY, onto BUF_CLEAN */
-			refile_buffer(jh2bh(jh));
+		__wait_on_buffer(bh);
+		lock_journal(journal);
+		goto repeat;
+	}
+
+	/* We now hold the buffer lock so it is safe to query the buffer
+	 * state.  Is the buffer dirty?
+	 *
+	 * If so, there are two possibilities.  The buffer may be
+	 * non-journaled, and undergoing a quite legitimate writeback.
+	 * Otherwise, it is journaled, and we don't expect dirty buffers
+	 * in that state (the buffers should be marked JBD_Dirty
+	 * instead.)  So either the IO is being done under our own
+	 * control and this is a bug, or it's a third party IO such as
+	 * dump(8) (which may leave the buffer scheduled for read ---
+	 * ie. locked but not dirty) or tune2fs (which may actually have
+	 * the buffer dirtied, ugh.) */
 
-			/*
-			 * The buffer is now hidden from bdflush. It is
-			 * metadata against the current transaction.
-			 */
-			JBUFFER_TRACE(jh, "steal from cp mode is complete");
+	if (buffer_dirty(bh)) {
+		spin_lock(&journal_datalist_lock);
+		/* First question: is this buffer already part of the
+		 * current transaction or the existing committing
+		 * transaction? */
+		if (jh->b_transaction) {
+			J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+				jh->b_transaction == journal->j_committing_transaction);
+			if (jh->b_next_transaction)
+				J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
+			JBUFFER_TRACE(jh, "Unexpected dirty buffer");
+			jbd_unexpected_dirty_buffer(jh);
 		}
 		spin_unlock(&journal_datalist_lock);
-		unlock_buffer(jh2bh(jh));
-		lock_journal(journal);
-		goto repeat;
 	}
 
-	J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh)));
+	unlock_buffer(bh);
 
 	error = -EROFS;
 	if (is_handle_aborted(handle))
@@ -1328,6 +1335,28 @@
 #endif
 
 /*
+ * Register a callback function for this handle.  The function will be
+ * called when the transaction that this handle is part of has been
+ * committed to disk with the original callback data struct and the
+ * error status of the journal as parameters.  There is no guarantee of
+ * ordering between handles within a single transaction, nor between
+ * callbacks registered on the same handle.
+ *
+ * The caller is responsible for allocating the journal_callback struct.
+ * This is to allow the caller to add as much extra data to the callback
+ * as needed, but reduce the overhead of multiple allocations.  The caller
+ * allocated struct must start with a struct journal_callback at offset 0,
+ * and has the caller-specific data afterwards.
+ */
+void journal_callback_set(handle_t *handle,
+			  void (*func)(struct journal_callback *jcb, int error),
+			  struct journal_callback *jcb)
+{
+	list_add_tail(&jcb->jcb_list, &handle->h_jcb);
+	jcb->jcb_func = func;
+}
+
+/*
  * All done for a particular handle.
  *
  * There is not much action needed here.  We just return any remaining
@@ -1393,7 +1422,10 @@
 		wake_up(&journal->j_wait_transaction_locked);
 	}
 
-	/*
+	/* Move callbacks from the handle to the transaction. */
+	list_splice(&handle->h_jcb, &transaction->t_jcb);
+
+	/*
 	 * If the handle is marked SYNC, we need to set another commit
 	 * going!  We also want to force a commit if the current
 	 * transaction is occupying too much of the log, or if the
 
@@ -1516,9 +1548,6 @@
 
 	assert_spin_locked(&journal_datalist_lock);
 	transaction = jh->b_transaction;
-#ifdef __SMP__
-	J_ASSERT (current->lock_depth >= 0);
-#endif
 	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
 
 	if (jh->b_jlist != BJ_None)
@@ -1843,6 +1872,7 @@
 		 * running transaction if that is set, but nothing
 		 * else. */
 		JBUFFER_TRACE(jh, "on committing transaction");
+		set_bit(BH_Freed, &bh->b_state);
 		if (jh->b_next_transaction) {
 			J_ASSERT(jh->b_next_transaction ==
 					journal->j_running_transaction);
@@ -1912,8 +1942,29 @@
 	unlock_journal(journal);
 
 	if (!offset) {
-		if (!may_free || !try_to_free_buffers(page, 0))
+		if (!may_free || !try_to_free_buffers(page, 0)) {
+			if (!offset) {
+				/* We are still using the page, but only
+				   because a transaction is pinning the
+				   page.  Once it commits, we want to
+				   encourage the page to be reaped as
+				   quickly as possible. */
+				ClearPageReferenced(page);
+
+#if 0
+				/* Ugh, this is not exactly portable
+				   between VMs: we need a modular
+				   solution for this some day.. */
+				if (PageActive(page)) {
+					spin_lock(&pagemap_lru_lock);
+					del_page_from_active_list(page);
+					add_page_to_inactive_list(page);
+					spin_unlock(&pagemap_lru_lock);
+				}
+#endif
+			}
 			return 0;
+		}
 		J_ASSERT(page->buffers == NULL);
 	}
 	return 1;
@@ -1926,23 +1977,32 @@
 		transaction_t *transaction, int jlist)
 {
 	struct journal_head **list = 0;
+	int was_dirty = 0;
 
 	assert_spin_locked(&journal_datalist_lock);
-#ifdef __SMP__
-	J_ASSERT (current->lock_depth >= 0);
-#endif
 	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
 	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
 				jh->b_transaction == 0);
 
-	if (jh->b_transaction) {
-		if (jh->b_jlist == jlist)
-			return;
+	if (jh->b_transaction && jh->b_jlist == jlist)
+		return;
+
+	/* The following list of buffer states needs to be consistent
+	 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
+	 * state. */
+
+	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+		if (atomic_set_buffer_clean(jh2bh(jh)) ||
+		    test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state))
+			was_dirty = 1;
+	}
+
+	if (jh->b_transaction)
 		__journal_unfile_buffer(jh);
-	} else {
+	else
 		jh->b_transaction = transaction;
-	}
 
 	switch (jlist) {
 	case BJ_None:
@@ -1979,12 +2039,8 @@
 		__blist_add_buffer(list, jh);
 	jh->b_jlist = jlist;
 
-	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
-		jlist == BJ_Shadow || jlist == BJ_Forget) {
-		if (atomic_set_buffer_clean(jh2bh(jh))) {
-			set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
-		}
-	}
+	if (was_dirty)
+		set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
 }
 
 void journal_file_buffer(struct journal_head *jh,
@@ -2004,26 +2060,33 @@
 
 void __journal_refile_buffer(struct journal_head *jh)
 {
+	int was_dirty = 0;
+
 	assert_spin_locked(&journal_datalist_lock);
-#ifdef __SMP__
-	J_ASSERT_JH(jh, current->lock_depth >= 0);
-#endif
-	__journal_unfile_buffer(jh);
+	/* If the buffer is now unused, just drop it. */
+	if (jh->b_next_transaction == NULL) {
+		__journal_unfile_buffer(jh);
+		jh->b_transaction = NULL;
+		/* Onto BUF_DIRTY for writeback */
+		refile_buffer(jh2bh(jh));
+		return;
+	}
+
+	/* It has been modified by a later transaction: add it to the
+	 * new transaction's metadata list. */
 
-	/* If the buffer is now unused, just drop it.  If it has been
-	   modified by a later transaction, add it to the new
-	   transaction's metadata list. */
+	if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state))
+		was_dirty = 1;
+	__journal_unfile_buffer(jh);
 	jh->b_transaction = jh->b_next_transaction;
 	jh->b_next_transaction = NULL;
+	__journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
+	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
+
+	if (was_dirty)
+		set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
 
-	if (jh->b_transaction != NULL) {
-		__journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
-		J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
-	} else {
-		/* Onto BUF_DIRTY for writeback */
-		refile_buffer(jh2bh(jh));
-	}
 }
 
 /*
--- linux-2.4.19-ext3/include/linux/ext3_fs.h.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/include/linux/ext3_fs.h	Fri Oct 11 20:18:58 2002
@@ -36,8 +36,8 @@
 /*
  * The second extended file system version
  */
-#define EXT3FS_DATE		"10 Jan 2002"
-#define EXT3FS_VERSION		"2.4-0.9.17"
+#define EXT3FS_DATE		"19 August 2002"
+#define EXT3FS_VERSION		"2.4-0.9.19"
 
 /*
  * Debug code
--- linux-2.4.19-ext3/include/linux/jbd.h.=K0000=.orig	Fri Oct 11 20:18:42 2002
+++ linux-2.4.19-ext3/include/linux/jbd.h	Fri Oct 11 20:18:58 2002
@@ -32,6 +32,14 @@
 
 #define journal_oom_retry 1
 
+/*
+ * Define JBD_PARANOID_WRITES to cause a kernel BUG() check if ext3
+ * finds a buffer unexpectedly dirty.  This is useful for debugging, but
+ * can cause spurious kernel panics if there are applications such as
+ * tune2fs modifying our buffer_heads behind our backs.
+ */
+#undef JBD_PARANOID_WRITES
+
 #ifdef CONFIG_JBD_DEBUG
 /*
  * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
@@ -249,6 +257,13 @@
 	return bh->b_private;
 }
 
+#define HAVE_JOURNAL_CALLBACK_STATUS
+struct journal_callback {
+	struct list_head jcb_list;
+	void (*jcb_func)(struct journal_callback *jcb, int error);
+	/* user data goes here */
+};
+
 struct jbd_revoke_table_s;
 
 /* The handle_t type represents a single atomic update being performed
@@ -279,6 +294,12 @@
 					   operations */
 	int			h_err;
 
+	/* List of application registered callbacks for this handle.
+	 * The function(s) will be called after the transaction that
+	 * this handle is part of has been committed to disk.
+	 */
+	struct list_head	h_jcb;
+
 	/* Flags */
 	unsigned int	h_sync:		1;	/* sync-on-close */
 	unsigned int	h_jdata:	1;	/* force data journaling */
@@ -398,6 +419,10 @@
 
 	/* How many handles used this transaction? */
 	int t_handle_count;
+
+	/* List of registered callback functions for this transaction.
+	 * Called when the transaction is committed. */
+	struct list_head	t_jcb;
 };
 
@@ -646,6 +671,9 @@
 extern int	 journal_try_to_free_buffers(journal_t *, struct page *, int);
 extern int	 journal_stop(handle_t *);
 extern int	 journal_flush (journal_t *);
+extern void	 journal_callback_set(handle_t *handle,
+				      void (*fn)(struct journal_callback *,int),
+				      struct journal_callback *jcb);
 
 extern void	 journal_lock_updates (journal_t *);
 extern void	 journal_unlock_updates (journal_t *);
@@ -730,6 +758,10 @@
 		schedule();						\
 	} while (1)
 
+extern void __jbd_unexpected_dirty_buffer(char *, int, struct journal_head *);
+#define jbd_unexpected_dirty_buffer(jh) \
+	__jbd_unexpected_dirty_buffer(__FUNCTION__, __LINE__, (jh))
+
 /*
  * is_journal_abort
 *
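
A minimal usage sketch for the journal_callback_set() API added above
(illustrative only --- my_commit_cb, my_commit_done and
register_commit_callback are hypothetical caller-side names, not part of
this patch).  Per the comment in transaction.c, the caller allocates a
struct which begins with a struct journal_callback at offset 0, places
its private data after it, and frees the allocation from the callback
itself, which runs once the transaction commits, with the journal's
abort status passed in `error':

	#include <linux/jbd.h>
	#include <linux/slab.h>

	struct my_commit_cb {
		struct journal_callback jcb;	/* must be at offset 0 */
		struct inode *inode;		/* caller-private data follows */
	};

	/* Runs at commit time; error is nonzero if the journal aborted.
	 * The callback owns the allocation and must free it. */
	static void my_commit_done(struct journal_callback *jcb, int error)
	{
		struct my_commit_cb *cb = (struct my_commit_cb *)jcb;

		if (!error)
			printk(KERN_DEBUG "inode %lu reached disk\n",
			       cb->inode->i_ino);
		kfree(cb);
	}

	/* Somewhere under a running handle: */
	static void register_commit_callback(handle_t *handle,
					     struct inode *inode)
	{
		struct my_commit_cb *cb = kmalloc(sizeof(*cb), GFP_NOFS);

		if (cb) {
			cb->inode = inode;
			journal_callback_set(handle, my_commit_done, &cb->jcb);
		}
	}

Note that commit.c makes no ordering promise between callbacks, and a
callback may see error != 0 after a journal abort, so callers must not
assume their data reached disk in that case.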
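Because jbd.h now defines HAVE_JOURNAL_CALLBACK_STATUS next to the
struct, out-of-tree code can probe for the API at compile time.  A
hedged sketch (the fallback branch is an assumption about such a caller;
nothing in this patch requires it):

	#ifdef HAVE_JOURNAL_CALLBACK_STATUS
		journal_callback_set(handle, my_commit_done, &cb->jcb);
	#else
		/* Older jbd with no commit callbacks: the caller would
		 * have to wait for the commit synchronously instead,
		 * e.g. journal_stop() followed by journal_flush(). */
	#endif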
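A short note on the new "commit" mount option wired up in super.c above
(the mount invocation is an illustrative assumption; the arithmetic is
taken directly from the parser): mounting with -o commit=5 makes
want_numeric() yield v == 5, the superblock stores s_commit_interval =
HZ * 5 jiffies, and ext3_init_journal_params() copies that into
journal->j_commit_interval on mount and on every remount.  Since
transaction.c arms each new transaction with t_expires = jiffies +
journal->j_commit_interval, the effect is that a commit is forced
roughly every five seconds; with no commit= option the jbd default
interval is left untouched.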