From: Chris Mason <mason@suse.com>

reiserfs logging rework, making things much faster for small
transactions.  metadata buffers are dirtied when they are safe to
write, so normal kernel mechanisms can contribute to log cleaning.

---

 25-akpm/fs/reiserfs/do_balan.c         |   25
 25-akpm/fs/reiserfs/fix_node.c         |   34
 25-akpm/fs/reiserfs/ibalance.c         |    2
 25-akpm/fs/reiserfs/inode.c            |    4
 25-akpm/fs/reiserfs/journal.c          | 1616 ++++++++++++++++++---------------
 25-akpm/fs/reiserfs/objectid.c         |    3
 25-akpm/fs/reiserfs/procfs.c           |    5
 25-akpm/fs/reiserfs/super.c            |   31
 25-akpm/include/linux/reiserfs_fs.h    |   29
 25-akpm/include/linux/reiserfs_fs_i.h  |    4
 25-akpm/include/linux/reiserfs_fs_sb.h |   70 -
 11 files changed, 972 insertions(+), 851 deletions(-)

diff -puN fs/reiserfs/do_balan.c~reiserfs-logging fs/reiserfs/do_balan.c
--- 25/fs/reiserfs/do_balan.c~reiserfs-logging	Wed Mar 24 15:14:39 2004
+++ 25-akpm/fs/reiserfs/do_balan.c	Wed Mar 24 15:14:39 2004
@@ -30,32 +30,11 @@ struct tree_balance * cur_tb = NULL; /*
 				      is interrupting do_balance */
 #endif
 
-/*
- * AKPM: The __mark_buffer_dirty() call here will not
- * put the buffer on the dirty buffer LRU because we've just
- * set BH_Dirty.  That's a thinko in reiserfs.
- *
- * I'm reluctant to "fix" this bug because that would change
- * behaviour.  Using mark_buffer_dirty() here would make the
- * buffer eligible for VM and periodic writeback, which may
- * violate ordering constraints.  I'll just leave the code
- * as-is by removing the __mark_buffer_dirty call altogether.
- *
- * Chris says this code has "probably never been run" anyway.
- * It is due to go away.
- */
-
 inline void do_balance_mark_leaf_dirty (struct tree_balance * tb,
 					struct buffer_head * bh, int flag)
 {
-    if (reiserfs_dont_log(tb->tb_sb)) {
-	if (!test_set_buffer_dirty(bh)) {
-//	    __mark_buffer_dirty(bh) ;
-	    tb->need_balance_dirty = 1;
-	}
-    } else {
-	journal_mark_dirty(tb->transaction_handle, tb->transaction_handle->t_super, bh) ;
-    }
+    journal_mark_dirty(tb->transaction_handle,
+		       tb->transaction_handle->t_super, bh) ;
 }
 
 #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
 
diff -puN fs/reiserfs/fix_node.c~reiserfs-logging fs/reiserfs/fix_node.c
--- 25/fs/reiserfs/fix_node.c~reiserfs-logging	Wed Mar 24 15:14:39 2004
+++ 25-akpm/fs/reiserfs/fix_node.c	Wed Mar 24 15:14:39 2004
@@ -2106,9 +2106,9 @@ static void tb_buffer_sanity_check (stru
 {;}
 #endif
 
-static void clear_all_dirty_bits(struct super_block *s,
+static int clear_all_dirty_bits(struct super_block *s,
                          struct buffer_head *bh) {
-  reiserfs_prepare_for_journal(s, bh, 0) ;
+  return reiserfs_prepare_for_journal(s, bh, 0) ;
 }
 
 static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
@@ -2137,11 +2137,11 @@ static int wait_tb_buffers_until_unlocke
 				p_s_tb->tb_path->path_length - i);
 	    }
 #endif
-	    clear_all_dirty_bits(p_s_tb->tb_sb,
-				 PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) ;
-
-	    if ( buffer_locked (PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) )
+	    if (!clear_all_dirty_bits(p_s_tb->tb_sb,
+				 PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)))
+	    {
 		locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i);
+	    }
 	}
     }
 
@@ -2151,22 +2151,19 @@ static int wait_tb_buffers_until_unlocke
 	    if ( p_s_tb->L[i] ) {
 		tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i);
-		clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]) ;
-		if ( buffer_locked (p_s_tb->L[i]) )
+		if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]))
 		    locked = p_s_tb->L[i];
 	    }
 
 	    if ( !locked && p_s_tb->FL[i] ) {
 		tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i);
-
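[Note on the hunk above and the FL/CFL/R/FR/CFR hunks that follow: they are
all the same transformation.  The old two-step "prepare, then test
buffer_locked()" becomes a single test on the return value of
reiserfs_prepare_for_journal(), which now reports nonzero only when the
buffer could be prepared without waiting.  Distilled into a hypothetical
helper (try_prepare is not in the patch, just shorthand for each loop body):

	static struct buffer_head *try_prepare(struct super_block *s,
					       struct buffer_head *bh)
	{
		/* third arg 0 == don't wait; returns 0 if bh is busy */
		if (bh && !reiserfs_prepare_for_journal(s, bh, 0))
			return bh;  /* caller waits on this, then rescans */
		return NULL;        /* prepared, keep scanning */
	}
]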
clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]) ; - if ( buffer_locked (p_s_tb->FL[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i])) locked = p_s_tb->FL[i]; } if ( !locked && p_s_tb->CFL[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]) ; - if ( buffer_locked (p_s_tb->CFL[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i])) locked = p_s_tb->CFL[i]; } @@ -2176,23 +2173,20 @@ static int wait_tb_buffers_until_unlocke if ( p_s_tb->R[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]) ; - if ( buffer_locked (p_s_tb->R[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i])) locked = p_s_tb->R[i]; } if ( !locked && p_s_tb->FR[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]) ; - if ( buffer_locked (p_s_tb->FR[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i])) locked = p_s_tb->FR[i]; } if ( !locked && p_s_tb->CFR[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]) ; - if ( buffer_locked (p_s_tb->CFR[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i])) locked = p_s_tb->CFR[i]; } } @@ -2207,10 +2201,8 @@ static int wait_tb_buffers_until_unlocke */ for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) { if ( p_s_tb->FEB[i] ) { - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]) ; - if (buffer_locked(p_s_tb->FEB[i])) { + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i])) locked = p_s_tb->FEB[i] ; - } } } diff -puN fs/reiserfs/ibalance.c~reiserfs-logging fs/reiserfs/ibalance.c --- 25/fs/reiserfs/ibalance.c~reiserfs-logging Wed Mar 24 15:14:39 2004 +++ 25-akpm/fs/reiserfs/ibalance.c Wed Mar 24 15:14:39 2004 @@ -633,7 +633,6 @@ static void balance_internal_when_delete /* use check_internal if new root is an internal node */ check_internal (new_root); /*&&&&&&&&&&&&&&&&&&&&&&*/ - tb->tb_sb->s_dirt = 1; /* do what is needed for buffer thrown from tree */ reiserfs_invalidate_buffer(tb, tbSh); @@ -951,7 +950,6 @@ int balance_internal (struct tree_balanc PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr ); PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 ); do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); - tb->tb_sb->s_dirt = 1; } if ( tb->blknum[h] == 2 ) { diff -puN fs/reiserfs/inode.c~reiserfs-logging fs/reiserfs/inode.c --- 25/fs/reiserfs/inode.c~reiserfs-logging Wed Mar 24 15:14:39 2004 +++ 25-akpm/fs/reiserfs/inode.c Wed Mar 24 15:14:39 2004 @@ -964,7 +964,7 @@ static void init_inode (struct inode * i REISERFS_I(inode)->i_prealloc_block = 0; REISERFS_I(inode)->i_prealloc_count = 0; REISERFS_I(inode)->i_trans_id = 0; - REISERFS_I(inode)->i_trans_index = 0; + REISERFS_I(inode)->i_jl = NULL; if (stat_data_v1 (ih)) { struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih); @@ -1621,7 +1621,7 @@ int reiserfs_new_inode (struct reiserfs_ REISERFS_I(inode)->i_prealloc_block = 0; REISERFS_I(inode)->i_prealloc_count = 0; REISERFS_I(inode)->i_trans_id = 0; - REISERFS_I(inode)->i_trans_index = 0; + REISERFS_I(inode)->i_jl = 0; REISERFS_I(inode)->i_attrs = REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode ); diff -puN fs/reiserfs/journal.c~reiserfs-logging fs/reiserfs/journal.c --- 25/fs/reiserfs/journal.c~reiserfs-logging Wed Mar 24 15:14:39 2004 +++ 
25-akpm/fs/reiserfs/journal.c Wed Mar 24 15:14:39 2004 @@ -32,13 +32,6 @@ ** around too long. ** -- Note, if you call this as an immediate flush from ** from within kupdate, it will ignore the immediate flag -** -** The commit thread -- a writer process for async commits. It allows a -** a process to request a log flush on a task queue. -** the commit will happen once the commit thread wakes up. -** The benefit here is the writer (with whatever -** related locks it has) doesn't have to wait for the -** log blocks to hit disk if it doesn't want to. */ #include <linux/config.h> @@ -60,6 +53,14 @@ #include <linux/suspend.h> #include <linux/buffer_head.h> #include <linux/workqueue.h> +#include <linux/writeback.h> + + +/* gets a struct reiserfs_journal_list * from a list head */ +#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_list)) +#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_working_list)) /* the number of mounted filesystems. This is used to decide when to ** start and kill the commit workqueue @@ -78,6 +79,12 @@ static struct workqueue_struct *commit_w #define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ #define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */ +#define BLOCK_DIRTIED 5 + + +/* journal list state bits */ +#define LIST_TOUCHED 1 +#define LIST_DIRTY 2 /* flags for do_journal_end */ #define FLUSH_ALL 1 /* flush commit and real blocks */ @@ -86,6 +93,9 @@ static struct workqueue_struct *commit_w /* state bits for the journal */ #define WRITERS_BLOCKED 1 /* set when new writers not allowed */ +#define WRITERS_QUEUED 2 /* set when log is full due to too many + * writers + */ static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ; static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ; @@ -94,6 +104,9 @@ static int can_dirty(struct reiserfs_jou static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks); static int release_journal_dev( struct super_block *super, struct reiserfs_journal *journal ); +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl); +static void flush_async_commits(void *p); static void init_journal_hash(struct super_block *p_s_sb) { memset(SB_JOURNAL(p_s_sb)->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; @@ -105,8 +118,10 @@ static void init_journal_hash(struct sup ** more details. */ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) { - if (bh) + if (bh) { clear_buffer_dirty(bh); + clear_bit(BH_JTest, &bh->b_state); + } return 0 ; } @@ -367,6 +382,7 @@ static void free_cnode(struct super_bloc static int clear_prepared_bits(struct buffer_head *bh) { clear_bit(BH_JPrepared, &bh->b_state) ; + clear_bit(BH_JRestore_dirty, &bh->b_state) ; return 0 ; } @@ -471,11 +487,6 @@ int reiserfs_in_journal(struct super_blo *next_zero_bit = 0 ; /* always start this at zero. */ - /* we aren't logging all blocks are safe for reuse */ - if (reiserfs_dont_log(p_s_sb)) { - return 0 ; - } - PROC_INFO_INC( p_s_sb, journal.in_journal ); /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. ** if we crash before the transaction that freed it commits, this transaction won't @@ -503,6 +514,7 @@ int reiserfs_in_journal(struct super_blo /* is it in the current transaction. 
This should never happen */ if ((cn = get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_hash_table, bl))) { + BUG(); return 1; } @@ -527,18 +539,30 @@ inline void insert_journal_hash(struct r /* lock the current transaction */ inline static void lock_journal(struct super_block *p_s_sb) { - PROC_INFO_INC( p_s_sb, journal.lock_journal ); - while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) { - PROC_INFO_INC( p_s_sb, journal.lock_journal_wait ); - sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ; - } - atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 1) ; + PROC_INFO_INC( p_s_sb, journal.lock_journal ); + down(&SB_JOURNAL(p_s_sb)->j_lock); } /* unlock the current transaction */ inline static void unlock_journal(struct super_block *p_s_sb) { - atomic_dec(&(SB_JOURNAL(p_s_sb)->j_wlock)) ; - wake_up(&(SB_JOURNAL(p_s_sb)->j_wait)) ; + up(&SB_JOURNAL(p_s_sb)->j_lock); +} + +static inline void get_journal_list(struct reiserfs_journal_list *jl) +{ + jl->j_refcount++; +} + +static inline void put_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + if (jl->j_refcount < 1) { + printk("trans id %lu, refcount at %d\n", jl->j_trans_id, + jl->j_refcount); + BUG(); + } + if (--jl->j_refcount == 0) + reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s); } /* @@ -556,6 +580,83 @@ static void cleanup_freed_for_journal_li jl->j_list_bitmap = NULL ; } +static int journal_list_still_alive(struct super_block *s, + unsigned long trans_id) +{ + struct list_head *entry = &SB_JOURNAL(s)->j_journal_list; + struct reiserfs_journal_list *jl; + + if (!list_empty(entry)) { + jl = JOURNAL_LIST_ENTRY(entry->next); + if (jl->j_trans_id <= trans_id) { + return 1; + } + } + return 0; +} + +static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) { + struct reiserfs_journal_list *other_jl; + struct reiserfs_journal_list *first_jl; + struct list_head *entry; + unsigned long trans_id = jl->j_trans_id; + unsigned long other_trans_id; + unsigned long first_trans_id; + +find_first: + /* + * first we walk backwards to find the oldest uncommitted transation + */ + first_jl = jl; + entry = jl->j_list.prev; + while(1) { + other_jl = JOURNAL_LIST_ENTRY(entry); + if (entry == &SB_JOURNAL(s)->j_journal_list || + atomic_read(&other_jl->j_older_commits_done)) + break; + + first_jl = other_jl; + entry = other_jl->j_list.prev; + } + + /* if we didn't find any older uncommitted transactions, return now */ + if (first_jl == jl) { + return 0; + } + + first_trans_id = first_jl->j_trans_id; + + entry = &first_jl->j_list; + while(1) { + other_jl = JOURNAL_LIST_ENTRY(entry); + other_trans_id = other_jl->j_trans_id; + + if (other_trans_id < trans_id) { + if (atomic_read(&other_jl->j_commit_left) != 0) { + flush_commit_list(s, other_jl, 0); + + /* list we were called with is gone, return */ + if (!journal_list_still_alive(s, trans_id)) + return 1; + + /* the one we just flushed is gone, this means all + * older lists are also gone, so first_jl is no longer + * valid either. Go back to the beginning. + */ + if (!journal_list_still_alive(s, other_trans_id)) { + goto find_first; + } + } + entry = entry->next; + if (entry == &SB_JOURNAL(s)->j_journal_list) + return 0; + } else { + return 0; + } + } + return 0; +} + /* ** if this journal list still has commit blocks unflushed, send them to disk. 
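[Note: the two primitives above, get_journal_list()/put_journal_list() and
journal_list_still_alive(), carry most of the new lifetime rules.  Journal
lists are now dynamically allocated and refcounted, and any path that can
sleep revalidates by transaction id rather than trusting a pointer.  The
pattern flush_commit_list() below uses, in miniature:

	struct reiserfs_journal_list *jl = SB_JOURNAL(s)->j_current_jl;
	unsigned long trans_id = jl->j_trans_id;

	get_journal_list(jl);             /* pin the structure itself */
	down(&jl->j_commit_lock);         /* may sleep for a long time */
	if (!journal_list_still_alive(s, trans_id)) {
		/* list was flushed and freed while we slept; only the
		 * pinned memory is valid, none of the contents are */
		up(&jl->j_commit_lock);
		goto put_jl;
	}
	/* ... jl is safe to use here ... */
	up(&jl->j_commit_lock);
put_jl:
	put_journal_list(s, jl);
]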
** @@ -564,13 +665,10 @@ static void cleanup_freed_for_journal_li ** */ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { - int i, count ; - int index = 0 ; + int i; int bn ; - int retry_count = 0 ; - int orig_commit_left = 0 ; struct buffer_head *tbh = NULL ; - struct reiserfs_journal_list *other_jl ; + unsigned long trans_id = jl->j_trans_id; reiserfs_check_lock_depth("flush_commit_list") ; @@ -581,133 +679,100 @@ static int flush_commit_list(struct supe /* before we can put our commit blocks on disk, we have to make sure everyone older than ** us is on disk too */ - if (jl->j_len <= 0) { - return 0 ; - } + if (jl->j_len <= 0) + BUG(); + if (trans_id == SB_JOURNAL(s)->j_trans_id) + BUG(); + + get_journal_list(jl); if (flushall) { - /* we _must_ make sure the transactions are committed in order. Start with the - ** index after this one, wrap all the way around - */ - index = (jl - SB_JOURNAL_LIST(s)) + 1 ; - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { - other_jl = SB_JOURNAL_LIST(s) + ( (index + i) % JOURNAL_LIST_COUNT) ; - if (other_jl && other_jl != jl && other_jl->j_len > 0 && other_jl->j_trans_id > 0 && - other_jl->j_trans_id <= jl->j_trans_id && (atomic_read(&(jl->j_older_commits_done)) == 0)) { - flush_commit_list(s, other_jl, 0) ; - } + if (flush_older_commits(s, jl) == 1) { + /* list disappeared during flush_older_commits. return */ + goto put_jl; } } - count = 0 ; - /* don't flush the commit list for the current transactoin */ - if (jl == ((SB_JOURNAL_LIST(s) + SB_JOURNAL_LIST_INDEX(s)))) { - return 0 ; - } - /* make sure nobody is trying to flush this one at the same time */ - if (atomic_read(&(jl->j_commit_flushing))) { - sleep_on(&(jl->j_commit_wait)) ; - if (flushall) { - atomic_set(&(jl->j_older_commits_done), 1) ; - } - return 0 ; + down(&jl->j_commit_lock); + if (!journal_list_still_alive(s, trans_id)) { + up(&jl->j_commit_lock); + goto put_jl; } - + if (jl->j_trans_id == 0) + BUG(); + /* this commit is done, exit */ if (atomic_read(&(jl->j_commit_left)) <= 0) { if (flushall) { atomic_set(&(jl->j_older_commits_done), 1) ; } - return 0 ; + up(&jl->j_commit_lock); + goto put_jl; } - /* keeps others from flushing while we are flushing */ - atomic_set(&(jl->j_commit_flushing), 1) ; - - if (jl->j_len > SB_JOURNAL_TRANS_MAX(s)) { - reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, list number %d\n", jl->j_len, jl - SB_JOURNAL_LIST(s)) ; - return 0 ; - } - - orig_commit_left = atomic_read(&(jl->j_commit_left)) ; - - /* start by checking all the commit blocks in this transaction. - ** Add anyone not on disk into tbh. 
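[Note: the rewrite below replaces the old retry/panic loop with a strict
three-phase order, and the correctness argument is just "wait before
proceeding".  Schematically, with log_bh(s, jl, i) as shorthand (not a real
function) for journal_find_get_block() on the i-th wrapped log offset:

	struct buffer_head *bh;
	int i;

	/* phase 1: submit the description block + j_len log blocks */
	for (i = 0; i < jl->j_len + 1; i++) {
		bh = log_bh(s, jl, i);
		ll_rw_block(WRITE, 1, &bh);
	}

	/* phase 2: wait; nothing below starts until these are stable */
	for (i = 0; i < jl->j_len + 1; i++)
		wait_on_buffer(log_bh(s, jl, i));

	/* phase 3: only now may the commit block go out.  replay treats
	 * a matching desc/commit pair as proof that phase 1 completed */
	mark_buffer_dirty(jl->j_commit_bh);
	sync_dirty_buffer(jl->j_commit_bh);
]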
Stop checking once commit_left <= 1, because that means we - ** only have the commit block left - */ -retry: - count = 0 ; - for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && i < (jl->j_len + 1) ; i++) { /* everything but commit_bh */ - bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) % SB_ONDISK_JOURNAL_SIZE(s); + /* + * for the description block and all the log blocks, submit any buffers + * that haven't already reached the disk + */ + for (i = 0 ; i < (jl->j_len + 1) ; i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) % + SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn) ; - -/* kill this sanity check */ -if (count > (orig_commit_left + 2)) { -reiserfs_panic(s, "journal-539: flush_commit_list: BAD count(%d) > orig_commit_left(%d)!\n", count, orig_commit_left) ; -} - if (tbh) { - if (buffer_locked(tbh)) { /* wait on it, redo it just to make sure */ - wait_on_buffer(tbh) ; - if (!buffer_uptodate(tbh)) { - reiserfs_panic(s, "journal-584, buffer write failed\n") ; - } - } - if (buffer_dirty(tbh)) { - printk("journal-569: flush_commit_list, block already dirty!\n") ; - } else { - mark_buffer_dirty(tbh) ; - } - ll_rw_block(WRITE, 1, &tbh) ; - count++ ; - put_bh(tbh) ; /* once for our get_hash */ - } + wait_on_buffer(tbh) ; + ll_rw_block(WRITE, 1, &tbh) ; + put_bh(tbh) ; } - /* wait on everyone in tbh before writing commit block*/ - if (count > 0) { - for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && - i < (jl->j_len + 1) ; i++) { /* everything but commit_bh */ - bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ; - tbh = journal_find_get_block(s, bn) ; + /* wait on everything written so far before writing the commit */ + for (i = 0 ; i < (jl->j_len + 1) ; i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ; + tbh = journal_find_get_block(s, bn) ; - wait_on_buffer(tbh) ; - if (!buffer_uptodate(tbh)) { - reiserfs_panic(s, "journal-601, buffer write failed\n") ; - } - put_bh(tbh) ; /* once for our get_hash */ - bforget(tbh) ; /* once due to original getblk in do_journal_end */ - atomic_dec(&(jl->j_commit_left)) ; - } + wait_on_buffer(tbh) ; + if (buffer_dirty(tbh)) + BUG(); + if (!buffer_uptodate(tbh)) { + reiserfs_panic(s, "journal-601, buffer write failed\n") ; + } + put_bh(tbh) ; /* once for journal_find_get_block */ + put_bh(tbh) ; /* once due to original getblk in do_journal_end */ + atomic_dec(&(jl->j_commit_left)) ; } - if (atomic_read(&(jl->j_commit_left)) != 1) { /* just the commit_bh left, flush it without calling getblk for everyone */ - if (retry_count < 2) { - printk("journal-582: flush_commit_list, not all log blocks on disk yet, trying again\n") ; - retry_count++ ; - goto retry; - } - reiserfs_panic(s, "journal-563: flush_commit_list: BAD, j_commit_left is %u, should be 1\n", - atomic_read(&(jl->j_commit_left))); - } + if (atomic_read(&(jl->j_commit_left)) != 1) + BUG(); + if (buffer_dirty(jl->j_commit_bh)) + BUG(); mark_buffer_dirty(jl->j_commit_bh) ; sync_dirty_buffer(jl->j_commit_bh) ; if (!buffer_uptodate(jl->j_commit_bh)) { reiserfs_panic(s, "journal-615: buffer write failed\n") ; } - atomic_dec(&(jl->j_commit_left)) ; bforget(jl->j_commit_bh) ; + if (SB_JOURNAL(s)->j_last_commit_id != 0 && + (jl->j_trans_id - SB_JOURNAL(s)->j_last_commit_id) != 1) { + reiserfs_warning("clm-2200: last commit %lu, current %lu\n", + SB_JOURNAL(s)->j_last_commit_id, + jl->j_trans_id); + } + SB_JOURNAL(s)->j_last_commit_id = jl->j_trans_id; /* now, every commit block is on the 
disk. It is safe to allow blocks freed during this transaction to be reallocated */ cleanup_freed_for_journal_list(s, jl) ; + /* mark the metadata dirty */ + dirty_one_transaction(s, jl); + atomic_dec(&(jl->j_commit_left)) ; + if (flushall) { atomic_set(&(jl->j_older_commits_done), 1) ; } - atomic_set(&(jl->j_commit_flushing), 0) ; - wake_up(&(jl->j_commit_wait)) ; + up(&jl->j_commit_lock); +put_jl: + put_journal_list(s, jl); - s->s_dirt = 1 ; return 0 ; } @@ -804,22 +869,27 @@ static int update_journal_header_block(s ** flush any and all journal lists older than you are ** can only be called from flush_journal_list */ -static int flush_older_journal_lists(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, unsigned long trans_id) { - int i, index ; - struct reiserfs_journal_list *other_jl ; - - index = jl - SB_JOURNAL_LIST(p_s_sb) ; - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { - other_jl = SB_JOURNAL_LIST(p_s_sb) + ((index + i) % JOURNAL_LIST_COUNT) ; - if (other_jl && other_jl->j_len > 0 && - other_jl->j_trans_id > 0 && - other_jl->j_trans_id < trans_id && - other_jl != jl) { - /* do not flush all */ - flush_journal_list(p_s_sb, other_jl, 0) ; +static int flush_older_journal_lists(struct super_block *p_s_sb, + struct reiserfs_journal_list *jl) +{ + struct list_head *entry; + struct reiserfs_journal_list *other_jl ; + unsigned long trans_id = jl->j_trans_id; + + /* we know we are the only ones flushing things, no extra race + * protection is required. + */ +restart: + entry = SB_JOURNAL(p_s_sb)->j_journal_list.next; + other_jl = JOURNAL_LIST_ENTRY(entry); + if (other_jl->j_trans_id < trans_id) { + /* do not flush all */ + flush_journal_list(p_s_sb, other_jl, 0) ; + + /* other_jl is now deleted from the list */ + goto restart; } - } - return 0 ; + return 0 ; } static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { @@ -836,15 +906,27 @@ static void reiserfs_end_buffer_io_sync( unlock_buffer(bh) ; put_bh(bh) ; } + static void submit_logged_buffer(struct buffer_head *bh) { - lock_buffer(bh) ; get_bh(bh) ; bh->b_end_io = reiserfs_end_buffer_io_sync ; mark_buffer_notjournal_new(bh) ; clear_buffer_dirty(bh) ; + if (!test_and_clear_bit(BH_JTest, &bh->b_state)) + BUG(); + if (!buffer_uptodate(bh)) + BUG(); submit_bh(WRITE, bh) ; } +static void del_from_work_list(struct super_block *s, + struct reiserfs_journal_list *jl) { + if (!list_empty(&jl->j_working_list)) { + list_del_init(&jl->j_working_list); + SB_JOURNAL(s)->j_num_work_lists--; + } +} + /* flush a journal list, both commit and real blocks ** ** always set flushall to 1, unless you are calling from inside @@ -865,29 +947,26 @@ static int flush_journal_list(struct sup unsigned long j_len_saved = jl->j_len ; if (j_len_saved <= 0) { - return 0 ; + BUG(); } if (atomic_read(&SB_JOURNAL(s)->j_wcount) != 0) { reiserfs_warning("clm-2048: flush_journal_list called with wcount %d\n", atomic_read(&SB_JOURNAL(s)->j_wcount)) ; } - /* if someone is getting the commit list, we must wait for them */ - while (atomic_read(&(jl->j_commit_flushing))) { - sleep_on(&(jl->j_commit_wait)) ; - } - /* if someone is flushing this list, we must wait for them */ - while (atomic_read(&(jl->j_flushing))) { - sleep_on(&(jl->j_flush_wait)) ; - } + if (jl->j_trans_id == 0) + BUG(); - /* this list is now ours, we can change anything we want */ - atomic_set(&(jl->j_flushing), 1) ; + /* if flushall == 0, the lock is already held */ + if (flushall) { + down(&SB_JOURNAL(s)->j_flush_sem); + } else if 
(!down_trylock(&SB_JOURNAL(s)->j_flush_sem)) { + BUG(); + } count = 0 ; if (j_len_saved > SB_JOURNAL_TRANS_MAX(s)) { - reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, list number %d\n", j_len_saved, jl - SB_JOURNAL_LIST(s)) ; - atomic_dec(&(jl->j_flushing)) ; + reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, trans id %lu\n", j_len_saved, jl->j_trans_id); return 0 ; } @@ -902,6 +981,9 @@ static int flush_journal_list(struct sup */ flush_commit_list(s, jl, 1) ; + if (!(jl->j_state & LIST_DIRTY)) + BUG(); + /* are we done now? */ if (atomic_read(&(jl->j_nonzerolen)) <= 0 && atomic_read(&(jl->j_commit_left)) <= 0) { @@ -937,13 +1019,13 @@ static int flush_journal_list(struct sup get_bh(saved_bh) ; if (buffer_journal_dirty(saved_bh)) { + if (!can_dirty(cn)) + BUG(); was_jwait = 1 ; - mark_buffer_notjournal_dirty(saved_bh) ; - /* undo the inc from journal_mark_dirty */ - put_bh(saved_bh) ; - } - if (can_dirty(cn)) { was_dirty = 1 ; + } else if (can_dirty(cn)) { + /* everything with !pjl && jwait should be writable */ + BUG(); } } @@ -951,7 +1033,8 @@ static int flush_journal_list(struct sup ** sure they are commited, and don't try writing it to disk */ if (pjl) { - flush_commit_list(s, pjl, 1) ; + if (atomic_read(&pjl->j_commit_left)) + flush_commit_list(s, pjl, 1) ; goto free_cnode ; } @@ -970,22 +1053,17 @@ static int flush_journal_list(struct sup printk("journal-813: BAD! buffer %llu %cdirty %cjwait, not in a newer tranasction\n", (unsigned long long)saved_bh->b_blocknr, was_dirty ? ' ' : '!', was_jwait ? ' ' : '!') ; } - /* kupdate_one_transaction waits on the buffers it is writing, so we - ** should never see locked buffers here - */ - if (buffer_locked(saved_bh)) { - printk("clm-2083: locked buffer %llu in flush_journal_list\n", - (unsigned long long)saved_bh->b_blocknr) ; - wait_on_buffer(saved_bh) ; - if (!buffer_uptodate(saved_bh)) { - reiserfs_panic(s, "journal-923: buffer write failed\n") ; - } - } if (was_dirty) { /* we inc again because saved_bh gets decremented at free_cnode */ get_bh(saved_bh) ; set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; - submit_logged_buffer(saved_bh) ; + lock_buffer(saved_bh); + if (cn->blocknr != saved_bh->b_blocknr) + BUG(); + if (buffer_dirty(saved_bh)) + submit_logged_buffer(saved_bh) ; + else + unlock_buffer(saved_bh); count++ ; } else { printk("clm-2082: Unable to flush buffer %llu in flush_journal_list\n", @@ -1016,6 +1094,14 @@ free_cnode: if (!buffer_uptodate(cn->bh)) { reiserfs_panic(s, "journal-949: buffer write failed\n") ; } + /* note, we must clear the JDirty_wait bit after the up to date + ** check, otherwise we race against our flushpage routine + */ + if (!test_and_clear_bit(BH_JDirty_wait, &cn->bh->b_state)) + BUG(); + + /* undo the inc from journal_mark_dirty */ + put_bh(cn->bh) ; brelse(cn->bh) ; } cn = cn->next ; @@ -1029,7 +1115,7 @@ flush_older_and_return: ** replayed after a crash */ if (flushall) { - flush_older_journal_lists(s, jl, jl->j_trans_id) ; + flush_older_journal_lists(s, jl); } /* before we can remove everything from the hash tables for this @@ -1044,181 +1130,246 @@ flush_older_and_return: update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ; } remove_all_from_journal_list(s, jl, 0) ; + list_del(&jl->j_list); + SB_JOURNAL(s)->j_num_lists--; + del_from_work_list(s, jl); + + if (SB_JOURNAL(s)->j_last_flush_id != 0 && + (jl->j_trans_id - SB_JOURNAL(s)->j_last_flush_id) != 1) { + reiserfs_warning("clm-2201: last flush %lu, current %lu\n", + 
SB_JOURNAL(s)->j_last_flush_id, + jl->j_trans_id); + } + SB_JOURNAL(s)->j_last_flush_id = jl->j_trans_id; + + /* not strictly required since we are freeing the list, but it should + * help find code using dead lists later on + */ jl->j_len = 0 ; atomic_set(&(jl->j_nonzerolen), 0) ; jl->j_start = 0 ; jl->j_realblock = NULL ; jl->j_commit_bh = NULL ; jl->j_trans_id = 0 ; - atomic_dec(&(jl->j_flushing)) ; - wake_up(&(jl->j_flush_wait)) ; + jl->j_state = 0; + put_journal_list(s, jl); + if (flushall) + up(&SB_JOURNAL(s)->j_flush_sem); return 0 ; } - -static int kupdate_one_transaction(struct super_block *s, - struct reiserfs_journal_list *jl) +#define CHUNK_SIZE 32 +struct buffer_chunk { + struct buffer_head *bh[CHUNK_SIZE]; + int nr; +}; + +static void write_chunk(struct buffer_chunk *chunk) { + int i; + for (i = 0; i < chunk->nr ; i++) { + submit_logged_buffer(chunk->bh[i]) ; + } + chunk->nr = 0; +} + +static void add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh) { + if (chunk->nr >= CHUNK_SIZE) + BUG(); + chunk->bh[chunk->nr++] = bh; + if (chunk->nr >= CHUNK_SIZE) + write_chunk(chunk); +} + +static int write_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl, + struct buffer_chunk *chunk) { - struct reiserfs_journal_list *pjl ; /* previous list for this cn */ - struct reiserfs_journal_cnode *cn, *walk_cn ; - b_blocknr_t blocknr ; - int run = 0 ; - int orig_trans_id = jl->j_trans_id ; - struct buffer_head *saved_bh ; + struct reiserfs_journal_cnode *cn; int ret = 0 ; - /* if someone is getting the commit list, we must wait for them */ - while (atomic_read(&(jl->j_commit_flushing))) { - sleep_on(&(jl->j_commit_wait)) ; - } - /* if someone is flushing this list, we must wait for them */ - while (atomic_read(&(jl->j_flushing))) { - sleep_on(&(jl->j_flush_wait)) ; - } - /* was it flushed while we slept? */ - if (jl->j_len <= 0 || jl->j_trans_id != orig_trans_id) { - return 0 ; + jl->j_state |= LIST_TOUCHED; + del_from_work_list(s, jl); + if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) { + return 0; } - /* this list is now ours, we can change anything we want */ - atomic_set(&(jl->j_flushing), 1) ; - -loop_start: cn = jl->j_realblock ; while(cn) { - saved_bh = NULL ; /* if the blocknr == 0, this has been cleared from the hash, ** skip it */ if (cn->blocknr == 0) { goto next ; } + if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { + struct buffer_head *tmp_bh; + /* we can race against journal_mark_freed when we try + * to lock_buffer(cn->bh), so we have to inc the buffer + * count, and recheck things after locking + */ + tmp_bh = cn->bh; + get_bh(tmp_bh); + lock_buffer(tmp_bh); + if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) { + if (!buffer_journal_dirty(tmp_bh) || + reiserfs_buffer_prepared(tmp_bh)) + BUG(); + add_to_chunk(chunk, tmp_bh); + ret++; + } else { + /* note, cn->bh might be null now */ + unlock_buffer(tmp_bh); + } + put_bh(tmp_bh); + } +next: + cn = cn->next ; + cond_resched(); + } + return ret ; +} + +/* used by flush_commit_list */ +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal_list *pjl; + int ret = 0 ; + + jl->j_state |= LIST_DIRTY; + cn = jl->j_realblock ; + while(cn) { /* look for a more recent transaction that logged this ** buffer. 
Only the most recent transaction with a buffer in ** it is allowed to send that buffer to disk */ - pjl = find_newer_jl_for_cn(cn) ; - if (run == 0 && !pjl && cn->bh && buffer_journal_dirty(cn->bh) && - can_dirty(cn)) - { - if (!test_bit(BH_JPrepared, &cn->bh->b_state)) { - set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; - submit_logged_buffer(cn->bh) ; - } else { - /* someone else is using this buffer. We can't - ** send it to disk right now because they might - ** be changing/logging it. - */ - ret = 1 ; - } - } else if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { - clear_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; - if (!pjl && cn->bh) { - wait_on_buffer(cn->bh) ; - } - /* check again, someone could have logged while we scheduled */ - pjl = find_newer_jl_for_cn(cn) ; - - /* before the JDirty_wait bit is set, the - ** buffer is added to the hash list. So, if we are - ** run in the middle of a do_journal_end, we will notice - ** if this buffer was logged and added from the latest - ** transaction. In this case, we don't want to decrement - ** b_count - */ - if (!pjl && cn->bh && buffer_journal_dirty(cn->bh)) { - blocknr = cn->blocknr ; - walk_cn = cn ; - saved_bh= cn->bh ; - /* update all older transactions to show this block - ** was flushed - */ - mark_buffer_notjournal_dirty(cn->bh) ; - while(walk_cn) { - if (walk_cn->bh && walk_cn->blocknr == blocknr && - walk_cn->sb == cn->sb) { - if (walk_cn->jlist) { - atomic_dec(&(walk_cn->jlist->j_nonzerolen)) ; - } - walk_cn->bh = NULL ; - } - walk_cn = walk_cn->hnext ; - } - if (atomic_read(&saved_bh->b_count) < 1) { - reiserfs_warning("clm-2081: bad count on %lu\n", - saved_bh->b_blocknr) ; - } - brelse(saved_bh) ; - } - } - /* - ** if the more recent transaction is committed to the log, - ** this buffer can be considered flushed. Decrement our - ** counters to reflect one less buffer that needs writing. - ** - ** note, this relies on all of the above code being - ** schedule free once pjl comes back non-null. - */ - if (pjl && cn->bh && atomic_read(&pjl->j_commit_left) == 0) { - atomic_dec(&cn->jlist->j_nonzerolen) ; - cn->bh = NULL ; + pjl = find_newer_jl_for_cn(cn) ; + if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh)) + { + if (!can_dirty(cn)) + BUG(); + /* if the buffer is prepared, it will either be logged + * or restored. If restored, we need to make sure + * it actually gets marked dirty + */ + mark_buffer_notjournal_new(cn->bh) ; + if (test_bit(BH_JPrepared, &cn->bh->b_state)) { + set_bit(BH_JRestore_dirty, &cn->bh->b_state); + } else { + set_bit(BH_JTest, &cn->bh->b_state); + mark_buffer_dirty(cn->bh); + } } -next: cn = cn->next ; } - /* the first run through the loop sends all the dirty buffers to - ** ll_rw_block. - ** the second run through the loop does all the accounting - */ - if (run++ == 0) { - goto loop_start ; - } - - atomic_set(&(jl->j_flushing), 0) ; - wake_up(&(jl->j_flush_wait)) ; return ret ; } -/* since we never give dirty buffers to bdflush/kupdate, we have to -** flush them ourselves. This runs through the journal lists, finds -** old metadata in need of flushing and sends it to disk. -** this does not end transactions, commit anything, or free -** cnodes. 
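[Note: dirty_one_transaction() above is where the changelog's one-line
summary actually happens.  It runs from flush_commit_list() once the commit
block is safely on disk, and hands the transaction's metadata back to the
normal dirty-buffer machinery instead of keeping it hidden from the VM.
Per buffer, the transition is (condensed from the code above; BH_JTest tags
buffers the journal itself dirtied):

	mark_buffer_notjournal_new(bh);
	if (test_bit(BH_JPrepared, &bh->b_state)) {
		/* being relogged right now; whoever finishes the prepare
		 * must redirty it, so just leave a flag */
		set_bit(BH_JRestore_dirty, &bh->b_state);
	} else {
		set_bit(BH_JTest, &bh->b_state);
		mark_buffer_dirty(bh);  /* kupdate/VM may write it now */
	}
]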
-** -** returns the highest transaction id that was flushed last time -*/ -static unsigned long reiserfs_journal_kupdate(struct super_block *s) { - struct reiserfs_journal_list *jl ; - int i ; - int start ; - time_t age ; - int ret = 0 ; - start = SB_JOURNAL_LIST_INDEX(s) ; +static int kupdate_transactions(struct super_block *s, + struct reiserfs_journal_list *jl, + struct reiserfs_journal_list **next_jl, + unsigned long *next_trans_id, + int num_blocks, + int num_trans) { + int ret = 0; + int written = 0 ; + int transactions_flushed = 0; + unsigned long orig_trans_id = jl->j_trans_id; + struct buffer_chunk chunk; + struct list_head *entry; + chunk.nr = 0; + + down(&SB_JOURNAL(s)->j_flush_sem); + if (!journal_list_still_alive(s, orig_trans_id)) { + goto done; + } + + /* we've got j_flush_sem held, nobody is going to delete any + * of these lists out from underneath us + */ + while((num_trans && transactions_flushed < num_trans) || + (!num_trans && written < num_blocks)) { + + if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) || + atomic_read(&jl->j_commit_left)) + { + del_from_work_list(s, jl); + break; + } + ret = write_one_transaction(s, jl, &chunk); - /* safety check to prevent flush attempts during a mount */ - if (start < 0) { - return 0 ; - } - i = (start + 1) % JOURNAL_LIST_COUNT ; - while(i != start) { - jl = SB_JOURNAL_LIST(s) + i ; - age = get_seconds() - jl->j_timestamp ; - if (jl->j_len > 0 && // age >= (JOURNAL_MAX_COMMIT_AGE * 2) && - atomic_read(&(jl->j_nonzerolen)) > 0 && - atomic_read(&(jl->j_commit_left)) == 0) { - - if (jl->j_trans_id == SB_JOURNAL(s)->j_trans_id) { - break ; - } - /* if ret was already 1, we want to preserve that */ - ret |= kupdate_one_transaction(s, jl) ; - } - if (atomic_read(&(jl->j_nonzerolen)) > 0) { - ret |= 1 ; + if (ret < 0) + goto done; + transactions_flushed++; + written += ret; + entry = jl->j_list.next; + + /* did we wrap? */ + if (entry == &SB_JOURNAL(s)->j_journal_list) { + break; } - i = (i + 1) % JOURNAL_LIST_COUNT ; + jl = JOURNAL_LIST_ENTRY(entry); + + /* don't bother with older transactions */ + if (jl->j_trans_id <= orig_trans_id) + break; } - return ret ; + if (chunk.nr) { + write_chunk(&chunk); + } + +done: + up(&SB_JOURNAL(s)->j_flush_sem); + return ret; +} + +/* for o_sync and fsync heavy applications, they tend to use +** all the journa list slots with tiny transactions. These +** trigger lots and lots of calls to update the header block, which +** adds seeks and slows things down. 
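[Note: kupdate_transactions() above pairs with the buffer_chunk writer:
metadata goes to ll_rw_block in bursts of CHUNK_SIZE (32) buffers so the
elevator can merge them, and the function described below spans many small
transactions so the journal header is updated once per batch instead of
once per tiny transaction.  A minimal caller, assuming a list taken off
j_working_list with j_flush_sem held:

	struct buffer_chunk chunk;
	chunk.nr = 0;

	/* queue this list's dirty, safe-to-write buffers; add_to_chunk()
	 * auto-submits each time 32 have accumulated */
	write_one_transaction(s, jl, &chunk);

	if (chunk.nr)               /* submit the partial tail */
		write_chunk(&chunk);
]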
+** +** This function tries to clear out a large chunk of the journal lists +** at once, which makes everything faster since only the newest journal +** list updates the header block +*/ +static int flush_used_journal_lists(struct super_block *s, + struct reiserfs_journal_list *jl) { + unsigned long len = 0; + unsigned long cur_len; + int ret; + int i; + struct reiserfs_journal_list *tjl; + struct reiserfs_journal_list *flush_jl; + unsigned long trans_id; + + flush_jl = tjl = jl; + + /* flush for 256 transactions or 256 blocks, whichever comes first */ + for(i = 0 ; i < 256 && len < 256 ; i++) { + if (atomic_read(&tjl->j_commit_left) || + tjl->j_trans_id < jl->j_trans_id) { + break; + } + cur_len = atomic_read(&tjl->j_nonzerolen); + if (cur_len > 0) { + tjl->j_state &= ~LIST_TOUCHED; + } + len += cur_len; + flush_jl = tjl; + if (tjl->j_list.next == &SB_JOURNAL(s)->j_journal_list) + break; + tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next); + } + /* try to find a group of blocks we can flush across all the + ** transactions, but only bother if we've actually spanned + ** across multiple lists + */ + if (flush_jl != jl) { + ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); + } + flush_journal_list(s, flush_jl, 1); + return 0; } /* @@ -1262,6 +1413,10 @@ void remove_journal_hash(struct super_bl } static void free_journal_ram(struct super_block *p_s_sb) { + reiserfs_kfree(SB_JOURNAL(p_s_sb)->j_current_jl, + sizeof(struct reiserfs_journal_list), p_s_sb); + SB_JOURNAL(p_s_sb)->j_num_lists--; + vfree(SB_JOURNAL(p_s_sb)->j_cnode_free_orig) ; free_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap) ; free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */ @@ -1392,7 +1547,7 @@ static int journal_transaction_is_valid( } brelse(c_bh) ; reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid " - "transaction start offset %lu, len %d id %d\n", + "transaction start offset %llu, len %d id %d\n", d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), get_desc_trans_len(desc), get_desc_trans_id(desc)) ; return 1 ; @@ -1432,7 +1587,7 @@ static int journal_read_transaction(stru desc = (struct reiserfs_journal_desc *)d_bh->b_data ; trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: " - "journal_read_transaction, offset %lu, len %d mount_id %d\n", + "journal_read_transaction, offset %llu, len %d mount_id %d\n", d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), get_desc_trans_len(desc), get_desc_mount_id(desc)) ; if (get_desc_trans_id(desc) < oldest_trans_id) { @@ -1460,7 +1615,7 @@ static int journal_read_transaction(stru commit = (struct reiserfs_journal_commit *)c_bh->b_data ; if (journal_compare_desc_commit(p_s_sb, desc, commit)) { reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, " - "commit offset %ld had bad time %d or length %d\n", + "commit offset %llu had bad time %d or length %d\n", c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), get_commit_trans_id(commit), get_commit_trans_len(commit)); brelse(c_bh) ; @@ -1628,7 +1783,7 @@ static int journal_read(struct super_blo printk("reiserfs: checking transaction log (%s) for (%s)\n", bdevname(SB_JOURNAL(p_s_sb)->j_dev_bd, b), reiserfs_bdevname(p_s_sb)); - start = get_seconds() ; + start = get_seconds(); /* step 1, read in the journal header block. 
Check the transaction it says ** is the first unflushed, and if that transaction is not valid, @@ -1688,7 +1843,7 @@ static int journal_read(struct super_blo oldest_start = d_bh->b_blocknr ; newest_mount_id = get_desc_mount_id(desc) ; reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting " - "oldest_start to offset %lu, trans_id %lu\n", + "oldest_start to offset %llu, trans_id %lu\n", oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), oldest_trans_id) ; } else if (oldest_trans_id > get_desc_trans_id(desc)) { @@ -1716,7 +1871,7 @@ start_log_replay: cur_dblock = oldest_start ; if (oldest_trans_id) { reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay " - "from offset %lu, trans_id %lu\n", + "from offset %llu, trans_id %lu\n", cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), oldest_trans_id) ; @@ -1770,70 +1925,26 @@ start_log_replay: return 0 ; } - -struct reiserfs_journal_commit_task { - struct super_block *p_s_sb ; - int jindex ; - int wake_on_finish ; /* if this is one, we wake the task_done queue, if it - ** is zero, we free the whole struct on finish - */ - struct reiserfs_journal_commit_task *self ; - struct work_struct work; -} ; - -static void reiserfs_journal_commit_task_func(void *__ct) { - struct reiserfs_journal_commit_task *ct = __ct; - struct reiserfs_journal_list *jl ; - - reiserfs_write_lock(ct->p_s_sb); - - jl = SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex ; - - flush_commit_list(ct->p_s_sb, SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex, 1) ; - - if (jl->j_len > 0 && atomic_read(&(jl->j_nonzerolen)) > 0 && - atomic_read(&(jl->j_commit_left)) == 0) { - kupdate_one_transaction(ct->p_s_sb, jl) ; - } - reiserfs_kfree(ct->self, sizeof(struct reiserfs_journal_commit_task), ct->p_s_sb) ; - reiserfs_write_unlock(ct->p_s_sb); -} - -static void setup_commit_task_arg(struct reiserfs_journal_commit_task *ct, - struct super_block *p_s_sb, - int jindex) { - if (!ct) { - reiserfs_panic(NULL, "journal-1360: setup_commit_task_arg called with NULL struct\n") ; - } - ct->p_s_sb = p_s_sb ; - ct->jindex = jindex ; - INIT_WORK(&ct->work, reiserfs_journal_commit_task_func, ct); - ct->self = ct ; -} - -static void commit_flush_async(struct super_block *p_s_sb, int jindex) { - struct reiserfs_journal_commit_task *ct ; - /* using GFP_NOFS, GFP_KERNEL could try to flush inodes, which will try - ** to start/join a transaction, which will deadlock - */ - ct = reiserfs_kmalloc(sizeof(struct reiserfs_journal_commit_task), GFP_NOFS, p_s_sb) ; - if (ct) { - setup_commit_task_arg(ct, p_s_sb, jindex) ; - queue_work(commit_wq, &ct->work) ; - } else { -#ifdef CONFIG_REISERFS_CHECK - reiserfs_warning("journal-1540: kmalloc failed, doing sync commit\n") ; -#endif - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ; - } +static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) +{ + struct reiserfs_journal_list *jl; +retry: + jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s); + if (!jl) { + yield(); + goto retry; + } + memset(jl, 0, sizeof(*jl)); + INIT_LIST_HEAD(&jl->j_list); + INIT_LIST_HEAD(&jl->j_working_list); + sema_init(&jl->j_commit_lock, 1); + SB_JOURNAL(s)->j_num_lists++; + get_journal_list(jl); + return jl; } static void journal_list_init(struct super_block *p_s_sb) { - int i ; - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { - init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_commit_wait)) ; - init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_flush_wait)) ; - } + SB_JOURNAL(p_s_sb)->j_current_jl = 
alloc_journal_list(p_s_sb); } static int release_journal_dev( struct super_block *super, @@ -1924,6 +2035,7 @@ int journal_init(struct super_block *p_s struct reiserfs_super_block * rs; struct reiserfs_journal_header *jh; struct reiserfs_journal *journal; + struct reiserfs_journal_list *jl; char b[BDEVNAME_SIZE]; journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ; @@ -1934,6 +2046,8 @@ int journal_init(struct super_block *p_s memset(journal, 0, sizeof(struct reiserfs_journal)) ; INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_bitmap_nodes) ; INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_prealloc_list); + INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_working_list); + INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_journal_list); reiserfs_allocate_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap, SB_BMAP_NR(p_s_sb)) ; allocate_bitmap_nodes(p_s_sb) ; @@ -2041,10 +2155,6 @@ int journal_init(struct super_block *p_s brelse (bhjh); SB_JOURNAL(p_s_sb)->j_list_bitmap_index = 0 ; - SB_JOURNAL_LIST_INDEX(p_s_sb) = -10000 ; /* make sure flush_old_commits does not try to flush a list while replay is on */ - - /* clear out the journal list array */ - memset(SB_JOURNAL_LIST(p_s_sb), 0, sizeof(struct reiserfs_journal_list) * JOURNAL_LIST_COUNT) ; journal_list_init(p_s_sb) ; memset(SB_JOURNAL(p_s_sb)->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; @@ -2061,13 +2171,13 @@ int journal_init(struct super_block *p_s SB_JOURNAL(p_s_sb)->j_last = NULL ; SB_JOURNAL(p_s_sb)->j_first = NULL ; init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_wait)) ; + sema_init(&SB_JOURNAL(p_s_sb)->j_lock, 1); + sema_init(&SB_JOURNAL(p_s_sb)->j_flush_sem, 1); SB_JOURNAL(p_s_sb)->j_trans_id = 10 ; SB_JOURNAL(p_s_sb)->j_mount_id = 10 ; SB_JOURNAL(p_s_sb)->j_state = 0 ; atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ; - atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 0) ; SB_JOURNAL(p_s_sb)->j_cnode_free_list = allocate_cnodes(num_cnodes) ; SB_JOURNAL(p_s_sb)->j_cnode_free_orig = SB_JOURNAL(p_s_sb)->j_cnode_free_list ; SB_JOURNAL(p_s_sb)->j_cnode_free = SB_JOURNAL(p_s_sb)->j_cnode_free_list ? 
num_cnodes : 0 ; @@ -2075,8 +2185,9 @@ int journal_init(struct super_block *p_s SB_JOURNAL(p_s_sb)->j_must_wait = 0 ; init_journal_hash(p_s_sb) ; - SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb)) ; - if (!(SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap)) { + jl = SB_JOURNAL(p_s_sb)->j_current_jl; + jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); + if (!jl->j_list_bitmap) { reiserfs_warning("journal-2005, get_list_bitmap failed for journal list 0\n") ; goto free_and_return; } @@ -2084,16 +2195,12 @@ int journal_init(struct super_block *p_s reiserfs_warning("Replay Failure, unable to mount\n") ; goto free_and_return; } - SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ; /* once the read is done, we can set this - where it belongs */ - - if (reiserfs_dont_log (p_s_sb)) - return 0; reiserfs_mounted_fs_count++ ; if (reiserfs_mounted_fs_count <= 1) commit_wq = create_workqueue("reiserfs"); + INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb); return 0 ; free_and_return: free_journal_ram(p_s_sb); @@ -2107,8 +2214,6 @@ free_and_return: */ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { time_t now = get_seconds() ; - if (reiserfs_dont_log(th->t_super)) - return 0 ; /* cannot restart while nested */ if (th->t_refcount > 1) return 0 ; @@ -2148,6 +2253,35 @@ void reiserfs_wait_on_write_block(struct !test_bit(WRITERS_BLOCKED, &SB_JOURNAL(s)->j_state)) ; } +static void queue_log_writer(struct super_block *s) { + set_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state); + sleep_on(&SB_JOURNAL(s)->j_join_wait); +} + +static void wake_queued_writers(struct super_block *s) { + if (test_and_clear_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state)) + wake_up(&SB_JOURNAL(s)->j_join_wait); +} + +static void let_transaction_grow(struct super_block *sb, + unsigned long trans_id) +{ + unsigned long bcount = SB_JOURNAL(sb)->j_bcount; + while(1) { + yield(); + while ((atomic_read(&SB_JOURNAL(sb)->j_wcount) > 0 || + atomic_read(&SB_JOURNAL(sb)->j_jlock)) && + SB_JOURNAL(sb)->j_trans_id == trans_id) { + queue_log_writer(sb); + } + if (SB_JOURNAL(sb)->j_trans_id != trans_id) + break; + if (bcount == SB_JOURNAL(sb)->j_bcount) + break; + bcount = SB_JOURNAL(sb)->j_bcount; + } +} + /* join == true if you must join an existing transaction. 
** join == false if you can deal with waiting for others to finish ** @@ -2157,15 +2291,14 @@ void reiserfs_wait_on_write_block(struct static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) { time_t now = get_seconds() ; int old_trans_id ; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_transaction_handle myth; + int sched_count = 0; reiserfs_check_lock_depth("journal_begin") ; RFALSE( p_s_sb->s_flags & MS_RDONLY, "clm-2078: calling journal_begin on readonly FS") ; - if (reiserfs_dont_log(p_s_sb)) { - th->t_super = p_s_sb ; /* others will check this for the don't log flag */ - return 0 ; - } PROC_INFO_INC( p_s_sb, journal.journal_being ); /* set here for journal_join */ th->t_refcount = 1; @@ -2173,66 +2306,76 @@ static int do_journal_begin_r(struct rei relock: lock_journal(p_s_sb) ; + journal->j_bcount++; - if (test_bit(WRITERS_BLOCKED, &SB_JOURNAL(p_s_sb)->j_state)) { + if (test_bit(WRITERS_BLOCKED, &journal->j_state)) { unlock_journal(p_s_sb) ; reiserfs_wait_on_write_block(p_s_sb) ; PROC_INFO_INC( p_s_sb, journal.journal_relock_writers ); goto relock ; } + now = get_seconds(); /* if there is no room in the journal OR ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning ** we don't sleep if there aren't other writers */ - if ( (!join && SB_JOURNAL(p_s_sb)->j_must_wait > 0) || - ( !join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) || - (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0 && SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && - (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) || - (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) ) || - (!join && SB_JOURNAL(p_s_sb)->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) { + if ( (!join && journal->j_must_wait > 0) || + ( !join && (journal->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) || + (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 && + (now - journal->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) || + (!join && atomic_read(&journal->j_jlock)) || + (!join && journal->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) { + old_trans_id = journal->j_trans_id; unlock_journal(p_s_sb) ; /* allow others to finish this transaction */ - /* if writer count is 0, we can just force this transaction to end, and start - ** a new one afterwards. 
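[Note: the replacement logic below no longer forces a commit just because
the batch limit is hit.  If most of the transaction is reserved-but-unlogged
blocks (j_len_alloc far ahead of j_len) and there are many writers, ending
now would produce lots of tiny commits, so the new writer sleeps in
queue_log_writer() instead.  Sketch of the test, with example numbers:

	/* e.g. j_len_alloc = 900 reserved, j_len = 300 logged,
	 * nblocks = 10:  (300+10+2)*100 = 31200 < 900*75 = 67500,
	 * so the log is only "prematurely" full -- wait, don't commit */
	if (!join &&
	    (journal->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(sb) &&
	    ((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75) &&
	    atomic_read(&journal->j_wcount) > 10) {
		queue_log_writer(sb);   /* woken by wake_queued_writers() */
		goto relock;
	}
]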
- */ - if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) { - struct reiserfs_transaction_handle myth ; - journal_join(&myth, p_s_sb, 1) ; - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - do_journal_end(&myth, p_s_sb,1,COMMIT_NOW) ; + if (!join && (journal->j_len_alloc + nblocks + 2) >= + SB_JOURNAL_MAX_BATCH(p_s_sb) && + ((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75)) + { + if (atomic_read(&journal->j_wcount) > 10) { + sched_count++; + queue_log_writer(p_s_sb); + goto relock; + } + } + /* don't mess with joining the transaction if all we have to do is + * wait for someone else to do a commit + */ + if (atomic_read(&journal->j_jlock)) { + while (journal->j_trans_id == old_trans_id && + atomic_read(&journal->j_jlock)) { + queue_log_writer(p_s_sb); + } + goto relock; + } + journal_join(&myth, p_s_sb, 1) ; + + /* someone might have ended the transaction while we joined */ + if (old_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) { + do_journal_end(&myth, p_s_sb, 1, 0) ; } else { - /* but if the writer count isn't zero, we have to wait for the current writers to finish. - ** They won't batch on transaction end once we set j_jlock - */ - atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ; - old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; - while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) && - SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id) { - sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - } + do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ; } + PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount ); goto relock ; } - - if (SB_JOURNAL(p_s_sb)->j_trans_start_time == 0) { /* we are the first writer, set trans_id */ - SB_JOURNAL(p_s_sb)->j_trans_start_time = now ; + /* we are the first writer, set trans_id */ + if (journal->j_trans_start_time == 0) { + journal->j_trans_start_time = get_seconds(); } - atomic_inc(&(SB_JOURNAL(p_s_sb)->j_wcount)) ; - SB_JOURNAL(p_s_sb)->j_len_alloc += nblocks ; + atomic_inc(&(journal->j_wcount)) ; + journal->j_len_alloc += nblocks ; th->t_blocks_logged = 0 ; th->t_blocks_allocated = nblocks ; - th->t_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; + th->t_trans_id = journal->j_trans_id ; unlock_journal(p_s_sb) ; - p_s_sb->s_dirt = 1; return 0 ; } - static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { struct reiserfs_transaction_handle *cur_th = current->journal_info; @@ -2277,11 +2420,6 @@ int journal_begin(struct reiserfs_transa return ret ; } -/* not used at all */ -int journal_prepare(struct super_block * p_s_sb, struct buffer_head *bh) { - return 0 ; -} - /* ** puts bh into the current transaction. If it was already there, reorders removes the ** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order). 
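[Note: the "!prepared || buffer_locked || buffer_dirty" check in the hunk
below encodes the write protocol every caller must follow; a buffer that
reaches journal_mark_dirty() still plain-dirty could be written to its real
location before it is logged.  The expected sequence, as used throughout
this patch (see reiserfs_flush_old_commits below for a live example):

	journal_begin(&th, s, jblocks);
	reiserfs_prepare_for_journal(s, bh, 1); /* wait = 1; clears dirty,
						 * sets BH_JPrepared */
	/* ... modify bh's contents ... */
	journal_mark_dirty(&th, s, bh);         /* logs it; clears JPrepared
						 * and JRestore_dirty */
	journal_end(&th, s, jblocks);
]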
@@ -2297,18 +2435,14 @@ int journal_mark_dirty(struct reiserfs_t int prepared = 0 ; PROC_INFO_INC( p_s_sb, journal.mark_dirty ); - if (reiserfs_dont_log(th->t_super)) { - mark_buffer_dirty(bh) ; - return 0 ; - } - if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) { reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", th->t_trans_id, SB_JOURNAL(p_s_sb)->j_trans_id); } - p_s_sb->s_dirt = 1 ; + p_s_sb->s_dirt = 1; prepared = test_and_clear_bit(BH_JPrepared, &bh->b_state) ; + clear_bit(BH_JRestore_dirty, &bh->b_state); /* already in this transaction, we are done */ if (buffer_journaled(bh)) { PROC_INFO_INC( p_s_sb, journal.mark_dirty_already ); @@ -2319,13 +2453,12 @@ int journal_mark_dirty(struct reiserfs_t ** a dirty or journal_dirty or locked buffer to be logged, as some changes ** could get to disk too early. NOT GOOD. */ - if (!prepared || buffer_locked(bh)) { + if (!prepared || buffer_locked(bh) || buffer_dirty(bh)) { printk("journal-1777: buffer %llu bad state %cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT\n", (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!', buffer_locked(bh) ? ' ' : '!', buffer_dirty(bh) ? ' ' : '!', buffer_journal_dirty(bh) ? ' ' : '!') ; } - count_already_incd = clear_prepared_bits(bh) ; if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) { printk("journal-1409: journal_mark_dirty returning because j_wcount was %d\n", atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount))) ; @@ -2344,14 +2477,6 @@ int journal_mark_dirty(struct reiserfs_t mark_buffer_notjournal_dirty(bh) ; } - if (buffer_dirty(bh)) { - clear_buffer_dirty(bh) ; - } - - if (buffer_journaled(bh)) { /* must double check after getting lock */ - goto done ; - } - if (SB_JOURNAL(p_s_sb)->j_len > SB_JOURNAL(p_s_sb)->j_len_alloc) { SB_JOURNAL(p_s_sb)->j_len_alloc = SB_JOURNAL(p_s_sb)->j_len + JOURNAL_PER_BALANCE_CNT ; } @@ -2391,24 +2516,6 @@ int journal_mark_dirty(struct reiserfs_t SB_JOURNAL(p_s_sb)->j_first = cn ; SB_JOURNAL(p_s_sb)->j_last = cn ; } -done: - return 0 ; -} - -/* -** if buffer already in current transaction, do a journal_mark_dirty -** otherwise, just mark it dirty and move on. Used for writes to meta blocks -** that don't need journaling -*/ -int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) { - if (reiserfs_dont_log(th->t_super) || buffer_journaled(bh) || - buffer_journal_dirty(bh)) { - return journal_mark_dirty(th, p_s_sb, bh) ; - } - if (get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_hash_table, bh->b_blocknr)) { - return journal_mark_dirty(th, p_s_sb, bh) ; - } - mark_buffer_dirty(bh) ; return 0 ; } @@ -2474,7 +2581,6 @@ static int remove_from_transaction(struc if (atomic_read(&(bh->b_count)) < 0) { printk("journal-1752: remove from trans, b_count < 0\n") ; } - if (!buffer_locked(bh)) reiserfs_clean_and_file_buffer(bh) ; ret = 1 ; } SB_JOURNAL(p_s_sb)->j_len-- ; @@ -2500,7 +2606,7 @@ static int can_dirty(struct reiserfs_jou int can_dirty = 1 ; /* first test hprev. These are all newer than cn, so any node here - ** with the name block number and dev means this node can't be sent + ** with the same block number and dev means this node can't be sent ** to disk right now. */ while(cur && can_dirty) { @@ -2551,72 +2657,56 @@ int journal_end_sync(struct reiserfs_tra ** change flush_commit_lists to have a repeat parameter too. 
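[Note: flush_async_commits() below is now a workqueue function (see the
INIT_WORK in journal_init above).  Committing only the youngest list is
enough because flush_commit_list(..., flushall = 1) pulls every older
uncommitted list along first.  The trigger site is outside this excerpt,
but queueing it amounts to something like:

	/* a commit is wanted but the caller won't wait for it */
	queue_work(commit_wq, &SB_JOURNAL(s)->j_work);
]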
** */ -void flush_async_commits(struct super_block *p_s_sb) { - int i ; - - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { - if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) { - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ; - } +static void flush_async_commits(void *p) { + struct super_block *p_s_sb = p; + struct reiserfs_journal_list *jl; + struct list_head *entry; + + lock_kernel(); + if (!list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) { + /* last entry is the youngest, commit it and you get everything */ + entry = SB_JOURNAL(p_s_sb)->j_journal_list.prev; + jl = JOURNAL_LIST_ENTRY(entry); + flush_commit_list(p_s_sb, jl, 1); } + unlock_kernel(); } /* ** flushes any old transactions to disk ** ends the current transaction if it is too old -** -** also calls flush_journal_list with old_only == 1, which allows me to reclaim -** memory and such from the journal lists whose real blocks are all on disk. -** -** called by sync_dev_journal from buffer.c */ -int flush_old_commits(struct super_block *p_s_sb, int immediate) { - int i ; - int count = 0; - int start ; - time_t now ; - struct reiserfs_transaction_handle th ; - - start = SB_JOURNAL_LIST_INDEX(p_s_sb) ; - now = get_seconds() ; - - /* safety check so we don't flush while we are replaying the log during mount */ - if (SB_JOURNAL_LIST_INDEX(p_s_sb) < 0) { - return 0 ; - } - /* starting with oldest, loop until we get to the start */ - i = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ; - while(i != start) { - if (SB_JOURNAL_LIST(p_s_sb)[i].j_len > 0 && ((now - SB_JOURNAL_LIST(p_s_sb)[i].j_timestamp) > SB_JOURNAL_MAX_COMMIT_AGE(p_s_sb) || - immediate)) { - /* we have to check again to be sure the current transaction did not change */ - if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) { - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ; - } - } - i = (i + 1) % JOURNAL_LIST_COUNT ; - count++ ; - } - /* now, check the current transaction. If there are no writers, and it is too old, finish it, and - ** force the commit blocks to disk - */ - if (!immediate && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 && - SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && - SB_JOURNAL(p_s_sb)->j_len > 0 && - (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) { - journal_join(&th, p_s_sb, 1) ; - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - do_journal_end(&th, p_s_sb,1, COMMIT_NOW) ; - } else if (immediate) { /* belongs above, but I wanted this to be very explicit as a special case. If they say to - flush, we must be sure old transactions hit the disk too. */ - journal_join(&th, p_s_sb, 1) ; - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ; - } - reiserfs_journal_kupdate(p_s_sb) ; - return 0 ; +int reiserfs_flush_old_commits(struct super_block *p_s_sb) { + time_t now ; + struct reiserfs_transaction_handle th ; + + now = get_seconds(); + /* safety check so we don't flush while we are replaying the log during + * mount + */ + if (list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) { + return 0 ; + } + + /* check the current transaction. 
If there are no writers, and it is + * too old, finish it, and force the commit blocks to disk + */ + if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 && + SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && + SB_JOURNAL(p_s_sb)->j_len > 0 && + (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > + SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) + { + journal_join(&th, p_s_sb, 1) ; + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; + + /* we're only being called from kreiserfsd, it makes no sense to do + ** an async commit so that kreiserfsd can do it later + */ + do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ; + } + return p_s_sb->s_dirt; } /* @@ -2637,6 +2727,7 @@ static int check_journal_end(struct reis int flush = flags & FLUSH_ALL ; int commit_now = flags & COMMIT_NOW ; int wait_on_commit = flags & WAIT ; + struct reiserfs_journal_list *jl; if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) { reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", @@ -2653,13 +2744,7 @@ static int check_journal_end(struct reis ** care of in this trans */ if (SB_JOURNAL(p_s_sb)->j_len == 0) { - int wcount = atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) ; - unlock_journal(p_s_sb) ; - if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) > 0 && wcount <= 0) { - atomic_dec(&(SB_JOURNAL(p_s_sb)->j_jlock)) ; - wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - } - return 0 ; + BUG(); } /* if wcount > 0, and we are called to with flush or commit_now, ** we wait on j_join_wait. We will wake up when the last writer has @@ -2669,24 +2754,37 @@ static int check_journal_end(struct reis */ if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0) { if (flush || commit_now) { - int orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ; + unsigned trans_id ; + + jl = SB_JOURNAL(p_s_sb)->j_current_jl; + trans_id = jl->j_trans_id; + atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ; if (flush) { SB_JOURNAL(p_s_sb)->j_next_full_flush = 1 ; } unlock_journal(p_s_sb) ; + /* sleep while the current transaction is still j_jlocked */ - while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) && - SB_JOURNAL(p_s_sb)->j_trans_id == th->t_trans_id) { - sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - } - if (commit_now) { - if (wait_on_commit) { - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ; - } else { - commit_flush_async(p_s_sb, orig_jindex) ; + while(SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) { + if (atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) { + queue_log_writer(p_s_sb); + } else { + lock_journal(p_s_sb); + if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) { + atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ; + } + unlock_journal(p_s_sb); } } + if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) { + BUG(); + } + if (commit_now && journal_list_still_alive(p_s_sb, trans_id) && + wait_on_commit) + { + flush_commit_list(p_s_sb, jl, 1) ; + } return 0 ; } unlock_journal(p_s_sb) ; @@ -2694,7 +2792,7 @@ static int check_journal_end(struct reis } /* deal with old transactions where we are the last writers */ - now = get_seconds() ; + now = get_seconds(); if ((now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) { commit_now = 1 ; SB_JOURNAL(p_s_sb)->j_next_async_flush = 1 ; @@ -2734,25 +2832,21 @@ int journal_mark_freed(struct reiserfs_t struct buffer_head *bh = NULL ; struct reiserfs_list_bitmap *jb = NULL ; int cleaned = 0 ; - - if (reiserfs_dont_log(th->t_super)) { - bh = sb_find_get_block(p_s_sb, blocknr) ; - if 
(bh && buffer_dirty (bh)) { - printk ("journal_mark_freed(dont_log): dirty buffer on hash list: %lx %d\n", bh->b_state, blocknr); - BUG (); - } - brelse (bh); - return 0 ; + + cn = get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_hash_table, blocknr); + if (cn && cn->bh) { + bh = cn->bh ; + get_bh(bh) ; } - bh = sb_find_get_block(p_s_sb, blocknr) ; /* if it is journal new, we just remove it from this transaction */ if (bh && buffer_journal_new(bh)) { mark_buffer_notjournal_new(bh) ; clear_prepared_bits(bh) ; + reiserfs_clean_and_file_buffer(bh) ; cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; } else { /* set the bit for this block in the journal bitmap for this transaction */ - jb = SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap ; + jb = SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap; if (!jb) { reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ; } @@ -2762,6 +2856,7 @@ int journal_mark_freed(struct reiserfs_t if (bh) { clear_prepared_bits(bh) ; + reiserfs_clean_and_file_buffer(bh) ; } cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; @@ -2793,7 +2888,6 @@ int journal_mark_freed(struct reiserfs_t } if (bh) { - reiserfs_clean_and_file_buffer(bh) ; put_bh(bh) ; /* get_hash grabs the buffer */ if (atomic_read(&(bh->b_count)) < 0) { printk("journal-2165: bh->b_count < 0\n") ; @@ -2803,50 +2897,84 @@ int journal_mark_freed(struct reiserfs_t } void reiserfs_update_inode_transaction(struct inode *inode) { - - REISERFS_I(inode)->i_trans_index = SB_JOURNAL_LIST_INDEX(inode->i_sb); - + REISERFS_I(inode)->i_jl = SB_JOURNAL(inode->i_sb)->j_current_jl; REISERFS_I(inode)->i_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ; } -static int reiserfs_inode_in_this_transaction(struct inode *inode) { - if (REISERFS_I(inode)->i_trans_id == SB_JOURNAL(inode->i_sb)->j_trans_id || - REISERFS_I(inode)->i_trans_id == 0) { - return 1; - } - return 0 ; +static void __commit_trans_jl(struct inode *inode, unsigned long id, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_transaction_handle th ; + struct super_block *sb = inode->i_sb ; + + /* is it from the current transaction, or from an unknown transaction? */ + if (id == SB_JOURNAL(sb)->j_trans_id) { + jl = SB_JOURNAL(sb)->j_current_jl; + /* try to let other writers come in and grow this transaction */ + let_transaction_grow(sb, id); + if (SB_JOURNAL(sb)->j_trans_id != id) { + goto flush_commit_only; + } + + journal_begin(&th, sb, 1) ; + + /* someone might have ended this transaction while we joined */ + if (SB_JOURNAL(sb)->j_trans_id != id) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ; + journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ; + journal_end(&th, sb, 1) ; + goto flush_commit_only; + } + + journal_end_sync(&th, sb, 1) ; + + } else { + /* this gets tricky, we have to make sure the journal list in + * the inode still exists. We know the list is still around + * if we've got a larger transaction id than the oldest list + */ +flush_commit_only: + if (journal_list_still_alive(inode->i_sb, id)) { + flush_commit_list(sb, jl, 1) ; + } + } + /* otherwise the list is gone, and long since committed */ } void reiserfs_commit_for_inode(struct inode *inode) { - struct reiserfs_journal_list *jl ; - struct reiserfs_transaction_handle th ; - struct super_block *sb = inode->i_sb ; - - jl = SB_JOURNAL_LIST(sb) + REISERFS_I(inode)->i_trans_index ; - - /* is it from the current transaction, or from an unknown transaction? 
*/ - if (reiserfs_inode_in_this_transaction(inode)) { - journal_join(&th, sb, 1) ; - reiserfs_update_inode_transaction(inode) ; - journal_end_sync(&th, sb, 1) ; - } else if (jl->j_trans_id == REISERFS_I(inode)->i_trans_id) { - flush_commit_list(sb, jl, 1) ; - } - /* if the transaction id does not match, this list is long since flushed - ** and we don't have to do anything here - */ + unsigned long id = REISERFS_I(inode)->i_trans_id; + struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; + + /* for the whole inode, assume unset id means it was + * changed in the current transaction. More conservative + */ + if (!id || !jl) { + reiserfs_update_inode_transaction(inode) ; + id = REISERFS_I(inode)->i_trans_id; + /* jl will be updated in __commit_trans_jl */ + } + + __commit_trans_jl(inode, id, jl); } void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, struct buffer_head *bh) { - PROC_INFO_INC( p_s_sb, journal.restore_prepared ); - if (reiserfs_dont_log (p_s_sb)) - return; - - if (!bh) { - return ; - } - clear_bit(BH_JPrepared, &bh->b_state) ; + PROC_INFO_INC( p_s_sb, journal.restore_prepared ); + if (!bh) { + return ; + } + if (test_and_clear_bit(BH_JRestore_dirty, &bh->b_state) && + buffer_journal_dirty(bh)) { + struct reiserfs_journal_cnode *cn; + cn = get_journal_hash_dev(p_s_sb, + SB_JOURNAL(p_s_sb)->j_list_hash_table, + bh->b_blocknr); + if (cn && can_dirty(cn)) { + set_bit(BH_JTest, &bh->b_state); + mark_buffer_dirty(bh); + } + } + clear_bit(BH_JPrepared, &bh->b_state) ; } extern struct tree_balance *cur_tb ; @@ -2857,29 +2985,39 @@ extern struct tree_balance *cur_tb ; ** wait on it. ** */ -void reiserfs_prepare_for_journal(struct super_block *p_s_sb, +int reiserfs_prepare_for_journal(struct super_block *p_s_sb, struct buffer_head *bh, int wait) { - int retry_count = 0 ; - PROC_INFO_INC( p_s_sb, journal.prepare ); - if (reiserfs_dont_log (p_s_sb)) - return; - while(!test_bit(BH_JPrepared, &bh->b_state) || - (wait && buffer_locked(bh))) { - if (buffer_journaled(bh)) { - set_bit(BH_JPrepared, &bh->b_state) ; - return ; - } - set_bit(BH_JPrepared, &bh->b_state) ; - if (wait) { - RFALSE( buffer_locked(bh) && cur_tb != NULL, - "waiting while do_balance was running\n") ; - wait_on_buffer(bh) ; + if (test_set_buffer_locked(bh)) { + if (!wait) + return 0; + lock_buffer(bh); + } + set_bit(BH_JPrepared, &bh->b_state); + if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { + clear_bit(BH_JTest, &bh->b_state); + set_bit(BH_JRestore_dirty, &bh->b_state); + } + unlock_buffer(bh); + return 1; +} + +static void flush_old_journal_lists(struct super_block *s) { + struct reiserfs_journal_list *jl; + struct list_head *entry; + time_t now = get_seconds(); + + while(!list_empty(&SB_JOURNAL(s)->j_journal_list)) { + entry = SB_JOURNAL(s)->j_journal_list.next; + jl = JOURNAL_LIST_ENTRY(entry); + /* this check should always be run, to send old lists to disk */ + if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) { + flush_used_journal_lists(s, jl); + } else { + break; + } } - PROC_INFO_INC( p_s_sb, journal.prepare_retry ); - retry_count++ ; - } } /* @@ -2898,23 +3036,24 @@ static int do_journal_end(struct reiserf struct buffer_head *c_bh ; /* commit bh */ struct buffer_head *d_bh ; /* desc bh */ int cur_write_start = 0 ; /* start index of current log write */ - int cur_blocks_left = 0 ; /* number of journal blocks left to write */ int old_start ; int i ; - int jindex ; - int orig_jindex ; int flush = flags & FLUSH_ALL ; - int commit_now = flags & COMMIT_NOW ; int 
wait_on_commit = flags & WAIT ; - struct reiserfs_super_block *rs ; - int trans_half ; + struct reiserfs_journal_list *jl, *temp_jl; + struct list_head *entry, *safe; + unsigned long jindex; + unsigned long commit_trans_id; + int trans_half; if (th->t_refcount > 1) BUG() ; current->journal_info = th->t_handle_save; - if (reiserfs_dont_log(th->t_super)) { - return 0 ; + reiserfs_check_lock_depth("journal end"); + if (SB_JOURNAL(p_s_sb)->j_len == 0) { + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; } lock_journal(p_s_sb) ; @@ -2923,24 +3062,24 @@ static int do_journal_end(struct reiserf flush = 1 ; } if (SB_JOURNAL(p_s_sb)->j_next_async_flush) { - flags |= COMMIT_NOW ; - commit_now = 1 ; + flags |= COMMIT_NOW | WAIT; + wait_on_commit = 1; } /* check_journal_end locks the journal, and unlocks if it does not return 1 ** it tells us if we should continue with the journal_end, or just return */ if (!check_journal_end(th, p_s_sb, nblocks, flags)) { - return 0 ; + p_s_sb->s_dirt = 1; + wake_queued_writers(p_s_sb); + goto out ; } /* check_journal_end might set these, check again */ if (SB_JOURNAL(p_s_sb)->j_next_full_flush) { flush = 1 ; } - if (SB_JOURNAL(p_s_sb)->j_next_async_flush) { - commit_now = 1 ; - } + /* ** j must wait means we have to flush the log blocks, and the real blocks for ** this transaction @@ -2957,10 +3096,9 @@ static int do_journal_end(struct reiserf current->journal_info = th->t_handle_save ; #endif - rs = SB_DISK_SUPER_BLOCK(p_s_sb) ; /* setup description block */ d_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_JOURNAL(p_s_sb)->j_start) ; - set_buffer_uptodate(d_bh) ; + set_buffer_uptodate(d_bh); desc = (struct reiserfs_journal_desc *)(d_bh)->b_data ; memset(d_bh->b_data, 0, d_bh->b_size) ; memcpy(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8) ; @@ -2975,28 +3113,33 @@ static int do_journal_end(struct reiserf set_buffer_uptodate(c_bh) ; /* init this journal list */ - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_older_commits_done), 0) ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_bh = c_bh ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_start = SB_JOURNAL(p_s_sb)->j_start ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len = SB_JOURNAL(p_s_sb)->j_len ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_nonzerolen), SB_JOURNAL(p_s_sb)->j_len) ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_left), SB_JOURNAL(p_s_sb)->j_len + 2); - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = NULL ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ; - - /* which is faster, locking/unlocking at the start and end of the for - ** or locking once per iteration around the insert_journal_hash? - ** eitherway, we are write locking insert_journal_hash. The ENTIRE FOR - ** LOOP MUST not cause schedule to occur. 
- */ + jl = SB_JOURNAL(p_s_sb)->j_current_jl; + + /* we lock the commit before doing anything because + * we want to make sure nobody tries to run flush_commit_list until + * the new transaction is fully setup, and we've already flushed the + * ordered bh list + */ + down(&jl->j_commit_lock); + + /* save the transaction id in case we need to commit it later */ + commit_trans_id = jl->j_trans_id; + + atomic_set(&jl->j_older_commits_done, 0) ; + jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; + jl->j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ; + jl->j_commit_bh = c_bh ; + jl->j_start = SB_JOURNAL(p_s_sb)->j_start ; + jl->j_len = SB_JOURNAL(p_s_sb)->j_len ; + atomic_set(&jl->j_nonzerolen, SB_JOURNAL(p_s_sb)->j_len) ; + atomic_set(&jl->j_commit_left, SB_JOURNAL(p_s_sb)->j_len + 2); + jl->j_realblock = NULL ; - /* for each real block, add it to the journal list hash, + /* The ENTIRE FOR LOOP MUST not cause schedule to occur. + ** for each real block, add it to the journal list hash, ** copy into real block index array in the commit or desc block */ - trans_half = journal_trans_half(p_s_sb->s_blocksize) ; + trans_half = journal_trans_half(p_s_sb->s_blocksize); for (i = 0, cn = SB_JOURNAL(p_s_sb)->j_first ; cn ; cn = cn->next, i++) { if (test_bit(BH_JDirty, &cn->bh->b_state) ) { jl_cn = get_cnode(p_s_sb) ; @@ -3004,7 +3147,7 @@ static int do_journal_end(struct reiserf reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ; } if (i == 0) { - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = jl_cn ; + jl->j_realblock = jl_cn ; } jl_cn->prev = last_cn ; jl_cn->next = NULL ; @@ -3020,9 +3163,9 @@ static int do_journal_end(struct reiserf } jl_cn->blocknr = cn->bh->b_blocknr ; jl_cn->state = 0 ; - jl_cn->sb = p_s_sb ; + jl_cn->sb = p_s_sb; jl_cn->bh = cn->bh ; - jl_cn->jlist = SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb) ; + jl_cn->jlist = jl; insert_journal_hash(SB_JOURNAL(p_s_sb)->j_list_hash_table, jl_cn) ; if (i < trans_half) { desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ; @@ -3033,7 +3176,6 @@ static int do_journal_end(struct reiserf i-- ; } } - set_desc_trans_len(desc, SB_JOURNAL(p_s_sb)->j_len) ; set_desc_mount_id(desc, SB_JOURNAL(p_s_sb)->j_mount_id) ; set_desc_trans_id(desc, SB_JOURNAL(p_s_sb)->j_trans_id) ; @@ -3041,53 +3183,35 @@ static int do_journal_end(struct reiserf /* special check in case all buffers in the journal were marked for not logging */ if (SB_JOURNAL(p_s_sb)->j_len == 0) { - brelse(d_bh) ; - brelse(c_bh) ; - unlock_journal(p_s_sb) ; - printk("journal-2020: do_journal_end: BAD desc->j_len is ZERO\n") ; - atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ; - wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - return 0 ; + BUG(); } + /* we're about to dirty all the log blocks, mark the description block + * dirty now too. Don't mark the commit block dirty until all the + * others are on disk + */ + mark_buffer_dirty(d_bh); + /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ cur_write_start = SB_JOURNAL(p_s_sb)->j_start ; - cur_blocks_left = SB_JOURNAL(p_s_sb)->j_len ; cn = SB_JOURNAL(p_s_sb)->j_first ; jindex = 1 ; /* start at one so we don't get the desc again */ - while(cur_blocks_left > 0) { + while(cn) { + clear_bit(BH_JNew, &(cn->bh->b_state)) ; /* copy all the real blocks into log area. 
dirty log blocks */ if (test_bit(BH_JDirty, &cn->bh->b_state)) { struct buffer_head *tmp_bh ; tmp_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; - set_buffer_uptodate(tmp_bh) ; + set_buffer_uptodate(tmp_bh); memcpy(tmp_bh->b_data, cn->bh->b_data, cn->bh->b_size) ; + mark_buffer_dirty(tmp_bh); jindex++ ; - } else { - /* JDirty cleared sometime during transaction. don't log this one */ - printk("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ; - } - cn = cn->next ; - cur_blocks_left-- ; - } - - /* we are done with both the c_bh and d_bh, but - ** c_bh must be written after all other commit blocks, - ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. - */ - - /* now loop through and mark all buffers from this transaction as JDirty_wait - ** clear the JDirty bit, clear BH_JNew too. - ** if they weren't JDirty, they weren't logged, just relse them and move on - */ - cn = SB_JOURNAL(p_s_sb)->j_first ; - while(cn) { - clear_bit(BH_JNew, &(cn->bh->b_state)) ; - if (test_bit(BH_JDirty, &(cn->bh->b_state))) { set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ; clear_bit(BH_JDirty, &(cn->bh->b_state)) ; } else { + /* JDirty cleared sometime during transaction. don't log this one */ + reiserfs_warning("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ; brelse(cn->bh) ; } next = cn->next ; @@ -3095,30 +3219,17 @@ static int do_journal_end(struct reiserf cn = next ; } - /* unlock the journal list for committing and flushing */ - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 0) ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 0) ; - - orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ; - jindex = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ; - SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ; + /* we are done with both the c_bh and d_bh, but + ** c_bh must be written after all other commit blocks, + ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. 
+ */ - /* write any buffers that must hit disk before this commit is done */ - fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock), - &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ; + SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); - /* honor the flush and async wishes from the caller */ - if (flush) { - - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ; - flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex , 1) ; - } else if (commit_now) { - if (wait_on_commit) { - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ; - } else { - commit_flush_async(p_s_sb, orig_jindex) ; - } - } + /* now it is safe to insert this transaction on the main list */ + list_add_tail(&jl->j_list, &SB_JOURNAL(p_s_sb)->j_journal_list); + list_add_tail(&jl->j_working_list, &SB_JOURNAL(p_s_sb)->j_working_list); + SB_JOURNAL(p_s_sb)->j_num_work_lists++; /* reset journal values for the next transaction */ old_start = SB_JOURNAL(p_s_sb)->j_start ; @@ -3130,57 +3241,96 @@ static int do_journal_end(struct reiserf SB_JOURNAL(p_s_sb)->j_len = 0 ; SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ; SB_JOURNAL(p_s_sb)->j_trans_id++ ; + SB_JOURNAL(p_s_sb)->j_current_jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id; SB_JOURNAL(p_s_sb)->j_must_wait = 0 ; SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ; SB_JOURNAL(p_s_sb)->j_next_full_flush = 0 ; SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ; init_journal_hash(p_s_sb) ; + /* tail conversion targets have to hit the disk before we end the + * transaction. Otherwise a later transaction might repack the tail + * before this transaction commits, leaving the data block unflushed and + * clean, if we crash before the later transaction commits, the data block + * is lost. + */ + fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock), + &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ; + up(&jl->j_commit_lock); + + /* honor the flush wishes from the caller, simple commits can + ** be done outside the journal lock, they are done below + */ + if (flush) { + flush_commit_list(p_s_sb, jl, 1) ; + flush_journal_list(p_s_sb, jl, 1) ; + } + + /* if the next transaction has any chance of wrapping, flush ** transactions that might get overwritten. If any journal lists are very ** old flush them as well. 
*/
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-    jindex = i ;
-    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && SB_JOURNAL(p_s_sb)->j_start <= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-      if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-        flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ;
-      }
-    } else if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 &&
-              (SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
-      if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >=
-            SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-        flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ;
+first_jl:
+  list_for_each_safe(entry, safe, &SB_JOURNAL(p_s_sb)->j_journal_list) {
+    temp_jl = JOURNAL_LIST_ENTRY(entry);
+    if (SB_JOURNAL(p_s_sb)->j_start <= temp_jl->j_start) {
+      if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >=
+          temp_jl->j_start)
+      {
+        flush_used_journal_lists(p_s_sb, temp_jl);
+        goto first_jl;
+      } else if ((SB_JOURNAL(p_s_sb)->j_start +
+                  SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) <
+                  SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+      {
+        /* if we don't cross into the next transaction and we don't
+         * wrap, there is no way we can overlap any later transactions
+         * break now
+         */
+        break;
+      }
+    } else if ((SB_JOURNAL(p_s_sb)->j_start +
+                SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >
+                SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+    {
+      if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) %
+            SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start)
+      {
+        flush_used_journal_lists(p_s_sb, temp_jl);
+        goto first_jl;
+      } else {
+        /* we don't overlap anything from our start to the end of the
+         * log, and our wrapped portion doesn't overlap anything at
+         * the start of the log. We can break
+         */
+        break;
+      }
     }
   }
-    /* this check should always be run, to send old lists to disk */
-    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 &&
-        SB_JOURNAL_LIST(p_s_sb)[jindex].j_timestamp <
-        (get_seconds() - (SB_JOURNAL_MAX_TRANS_AGE(p_s_sb) * 4))) {
-      flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ;
-    }
   }
+  flush_old_journal_lists(p_s_sb);
-  /* if the next journal_list is still in use, flush it */
-  if (SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len != 0) {
-    flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb), 1) ;
-  }
-
-  /* we don't want anyone flushing the new transaction's list */
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb) +
-                                                                                         SB_JOURNAL_LIST_INDEX(p_s_sb)) ;
+  SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL(p_s_sb)->j_current_jl) ;
-  if (!(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap)) {
+  if (!(SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap)) {
     reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ;
   }
-  unlock_journal(p_s_sb) ;
+  atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
+  unlock_journal(p_s_sb) ;
   /* wake up anybody waiting to join.
*/ + clear_bit(WRITERS_QUEUED, &SB_JOURNAL(p_s_sb)->j_state); wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; + + if (!flush) { + if (wait_on_commit) { + if (journal_list_still_alive(p_s_sb, commit_trans_id)) + flush_commit_list(p_s_sb, jl, 1) ; + } else { + queue_work(commit_wq, &SB_JOURNAL(p_s_sb)->j_work); + } + } +out: + reiserfs_check_lock_depth("journal end2"); return 0 ; } - - - diff -puN fs/reiserfs/objectid.c~reiserfs-logging fs/reiserfs/objectid.c --- 25/fs/reiserfs/objectid.c~reiserfs-logging Wed Mar 24 15:14:39 2004 +++ 25-akpm/fs/reiserfs/objectid.c Wed Mar 24 15:14:39 2004 @@ -86,7 +86,6 @@ __u32 reiserfs_get_unused_objectid (stru } journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); - s->s_dirt = 1; return unused_objectid; } @@ -105,8 +104,6 @@ void reiserfs_release_objectid (struct r reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); - s->s_dirt = 1; - /* start at the beginning of the objectid map (i = 0) and go to the end of it (i = disk_sb->s_oid_cursize). Linear search is diff -puN fs/reiserfs/procfs.c~reiserfs-logging fs/reiserfs/procfs.c --- 25/fs/reiserfs/procfs.c~reiserfs-logging Wed Mar 24 15:14:39 2004 +++ 25-akpm/fs/reiserfs/procfs.c Wed Mar 24 15:14:39 2004 @@ -87,7 +87,7 @@ static int show_super(struct seq_file *m struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "state: \t%s\n" - "mount options: \t%s%s%s%s%s%s%s%s%s%s%s%s\n" + "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" "gen. counter: \t%i\n" "s_kmallocs: \t%i\n" "s_disk_reads: \t%i\n" @@ -131,7 +131,6 @@ static int show_super(struct seq_file *m reiserfs_test4( sb ) ? "TEST4 " : "", have_large_tails( sb ) ? "TAILS " : have_small_tails(sb)?"SMALL_TAILS ":"NO_TAILS ", replay_only( sb ) ? "REPLAY_ONLY " : "", - reiserfs_dont_log( sb ) ? "DONT_LOG " : "LOG ", convert_reiserfs( sb ) ? 
"CONV " : "", atomic_read( &r -> s_generation_counter ), @@ -370,7 +369,6 @@ static int show_journal(struct seq_file "j_first_unflushed_offset: \t%lu\n" "j_last_flush_trans_id: \t%lu\n" "j_trans_start_time: \t%li\n" - "j_journal_list_index: \t%i\n" "j_list_bitmap_index: \t%i\n" "j_must_wait: \t%i\n" "j_next_full_flush: \t%i\n" @@ -416,7 +414,6 @@ static int show_journal(struct seq_file JF( j_first_unflushed_offset ), JF( j_last_flush_trans_id ), JF( j_trans_start_time ), - JF( j_journal_list_index ), JF( j_list_bitmap_index ), JF( j_must_wait ), JF( j_next_full_flush ), diff -puN fs/reiserfs/super.c~reiserfs-logging fs/reiserfs/super.c --- 25/fs/reiserfs/super.c~reiserfs-logging Wed Mar 24 15:14:39 2004 +++ 25-akpm/fs/reiserfs/super.c Wed Mar 24 15:14:39 2004 @@ -59,22 +59,26 @@ static int is_any_reiserfs_magic_string static int reiserfs_remount (struct super_block * s, int * flags, char * data); static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf); -static void reiserfs_write_super (struct super_block * s) +static void reiserfs_sync_fs (struct super_block * s) { + if (!(s->s_flags & MS_RDONLY)) { + struct reiserfs_transaction_handle th; + reiserfs_write_lock(s); + journal_begin(&th, s, 1); + journal_end_sync(&th, s, 1); + reiserfs_flush_old_commits(s); + s->s_dirt = 0; + reiserfs_write_unlock(s); + } +} - int dirty = 0 ; - reiserfs_write_lock(s); - if (!(s->s_flags & MS_RDONLY)) { - dirty = flush_old_commits(s, 1) ; - } - s->s_dirt = dirty; - reiserfs_write_unlock(s); +static void reiserfs_write_super(struct super_block *s) +{ + reiserfs_sync_fs(s); } static void reiserfs_write_super_lockfs (struct super_block * s) { - - int dirty = 0 ; struct reiserfs_transaction_handle th ; reiserfs_write_lock(s); if (!(s->s_flags & MS_RDONLY)) { @@ -84,7 +88,7 @@ static void reiserfs_write_super_lockfs reiserfs_block_writes(&th) ; journal_end(&th, s, 1) ; } - s->s_dirt = dirty; + s->s_dirt = 0; reiserfs_write_unlock(s); } @@ -805,7 +809,6 @@ static int reiserfs_remount (struct supe reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; set_sb_umount_state( rs, REISERFS_SB(s)->s_mount_state ); journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); - s->s_dirt = 0; } else { /* remount read-write */ if (!(s->s_flags & MS_RDONLY)) @@ -822,12 +825,12 @@ static int reiserfs_remount (struct supe set_sb_umount_state( rs, REISERFS_ERROR_FS ); /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); - s->s_dirt = 0; REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS ; } /* this will force a full flush of all journal lists */ SB_JOURNAL(s)->j_must_wait = 1 ; journal_end(&th, s, 10) ; + s->s_dirt = 0; if (!( *mount_flags & MS_RDONLY ) ) finish_unfinished( s ); @@ -1392,8 +1395,6 @@ static int reiserfs_fill_super (struct s /* look for files which were to be removed in previous session */ finish_unfinished (s); - - s->s_dirt = 0; } else { if ( old_format_only(s) && !silent) { reiserfs_warning("reiserfs: using 3.5.x disk format\n") ; diff -puN include/linux/reiserfs_fs.h~reiserfs-logging include/linux/reiserfs_fs.h --- 25/include/linux/reiserfs_fs.h~reiserfs-logging Wed Mar 24 15:14:39 2004 +++ 25-akpm/include/linux/reiserfs_fs.h Wed Mar 24 15:14:39 2004 @@ -1702,23 +1702,39 @@ struct reiserfs_journal_header { (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12)))) #define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK]) -/* finds n'th buffer with 0 being the start of this commit. 
Needs to go away, j_ap_blocks has changed
-** since I created this. One chunk of code in journal.c needs changing before deleting it
-*/
-#define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT])
-
 // We need these to make journal.c code more readable
 #define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+/*
+** transaction handle which is passed around for all journal calls
+*/
+struct reiserfs_transaction_handle {
+  struct super_block *t_super ;  /* super for this FS when journal_begin was
+                                    called. saves calls to reiserfs_get_super
+                                    also used by nested transactions to make
+                                    sure they are nesting on the right FS
+                                    _must_ be first in the handle
+                                 */
+  int t_refcount;
+  int t_blocks_logged ;     /* number of blocks this writer has logged */
+  int t_blocks_allocated ;  /* number of blocks this writer allocated */
+  unsigned long t_trans_id ; /* sanity check, equals the current trans id */
+  void *t_handle_save ;     /* save existing current->journal_info */
+  int displace_new_blocks:1; /* if new block allocation occurs, that block
+                                should be displaced from others */
+} ;
+
+int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
+int reiserfs_flush_old_commits(struct super_block *);
 void reiserfs_commit_for_inode(struct inode *) ;
 void reiserfs_update_inode_transaction(struct inode *) ;
 void reiserfs_wait_on_write_block(struct super_block *s) ;
 void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
 void reiserfs_allow_writes(struct super_block *s) ;
 void reiserfs_check_lock_depth(char *caller) ;
-void reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
+int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
 void reiserfs_restore_prepared_buffer(struct super_block *, struct buffer_head *bh) ;
 int journal_init(struct super_block *, const char * j_dev_name, int old_format, unsigned int) ;
 int journal_release(struct reiserfs_transaction_handle*, struct super_block *) ;
@@ -1730,7 +1746,6 @@ int journal_mark_freed(struct reiserfs_t
 int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
 int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, int searchall, b_blocknr_t *next) ;
 int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
-void flush_async_commits(struct super_block *p_s_sb) ;
 int buffer_journaled(const struct buffer_head *bh) ;
 int mark_buffer_journal_new(struct buffer_head *bh) ;

diff -puN include/linux/reiserfs_fs_i.h~reiserfs-logging include/linux/reiserfs_fs_i.h
--- 25/include/linux/reiserfs_fs_i.h~reiserfs-logging	Wed Mar 24 15:14:39 2004
+++ 25-akpm/include/linux/reiserfs_fs_i.h	Wed Mar 24 15:14:39 2004
@@ -3,6 +3,8 @@
 #include <linux/list.h>
+struct reiserfs_journal_list;
+
 /** bitmasks for i_flags field in reiserfs-specific part of inode */
 typedef enum {
 /** this says what format of key do all items (but stat data) of
@@ -48,7 +50,7 @@ struct reiserfs_inode_info {
     ** needs to be committed in order for this inode to be properly
     ** flushed */
     unsigned long i_trans_id ;
-    unsigned long i_trans_index ;
+    struct reiserfs_journal_list *i_jl;
     struct inode vfs_inode;
 };

diff -puN 
include/linux/reiserfs_fs_sb.h~reiserfs-logging include/linux/reiserfs_fs_sb.h --- 25/include/linux/reiserfs_fs_sb.h~reiserfs-logging Wed Mar 24 15:14:39 2004 +++ 25-akpm/include/linux/reiserfs_fs_sb.h Wed Mar 24 15:14:39 2004 @@ -106,7 +106,6 @@ typedef enum { #define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */ #define JOURNAL_HASH_SIZE 8192 #define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating. Must be >= 2 */ -#define JOURNAL_LIST_COUNT 64 /* these are bh_state bit flag offset numbers, for use in the buffer head */ @@ -121,6 +120,7 @@ typedef enum { */ #define BH_JPrepared 20 /* block has been prepared for the log */ #define BH_JRestore_dirty 22 /* restore the dirty bit later */ +#define BH_JTest 23 /* debugging use only */ /* One of these for every block in every transaction ** Each one is in two hash tables. First, a hash of the current transaction, and after journal_end, a @@ -154,26 +154,6 @@ struct reiserfs_list_bitmap { } ; /* -** transaction handle which is passed around for all journal calls -*/ -struct reiserfs_transaction_handle { - struct super_block *t_super ; /* super for this FS when journal_begin was - called. saves calls to reiserfs_get_super - also used by nested transactions to make - sure they are nesting on the right FS - _must_ be first in the handle - */ - int t_refcount; - int t_blocks_logged ; /* number of blocks this writer has logged */ - int t_blocks_allocated ; /* number of blocks this writer allocated */ - unsigned long t_trans_id ; /* sanity check, equals the current trans id */ - void *t_handle_save ; /* save existing current->journal_info */ - int displace_new_blocks:1; /* if new block allocation occurres, that block - should be displaced from others */ - -} ; - -/* ** one of these for each transaction. The most important part here is the j_realblock. ** this list of cnodes is used to hash all the blocks in all the commits, to mark all the ** real buffer heads dirty once all the commits hit the disk, @@ -181,23 +161,25 @@ struct reiserfs_transaction_handle { ** to be overwritten */ struct reiserfs_journal_list { unsigned long j_start ; + unsigned long j_state; unsigned long j_len ; atomic_t j_nonzerolen ; atomic_t j_commit_left ; - atomic_t j_flushing ; - atomic_t j_commit_flushing ; atomic_t j_older_commits_done ; /* all commits older than this on disk*/ + struct semaphore j_commit_lock; unsigned long j_trans_id ; time_t j_timestamp ; struct reiserfs_list_bitmap *j_list_bitmap ; struct buffer_head *j_commit_bh ; /* commit buffer head */ struct reiserfs_journal_cnode *j_realblock ; struct reiserfs_journal_cnode *j_freedlist ; /* list of buffers that were freed during this trans. 
free each of these on flush */ - wait_queue_head_t j_commit_wait ; /* wait for all the commit blocks to be flushed */ - wait_queue_head_t j_flush_wait ; /* wait for all the real blocks to be flushed */ -} ; + /* time ordered list of all active transactions */ + struct list_head j_list; -struct reiserfs_page_list ; /* defined in reiserfs_fs.h */ + /* time ordered list of all transactions we haven't tried to flush yet */ + struct list_head j_working_list; + int j_refcount; +} ; struct reiserfs_journal { struct buffer_head ** j_ap_blocks ; /* journal blocks on disk */ @@ -220,16 +202,11 @@ struct reiserfs_journal { unsigned long j_last_flush_trans_id ; /* last fully flushed journal timestamp */ struct buffer_head *j_header_bh ; - /* j_flush_pages must be flushed before the current transaction can - ** commit - */ - struct reiserfs_page_list *j_flush_pages ; time_t j_trans_start_time ; /* time this transaction started */ - wait_queue_head_t j_wait ; /* wait journal_end to finish I/O */ - atomic_t j_wlock ; /* lock for j_wait */ + struct semaphore j_lock; + struct semaphore j_flush_sem; wait_queue_head_t j_join_wait ; /* wait for current transaction to finish before starting new one */ atomic_t j_jlock ; /* lock for j_join_wait */ - int j_journal_list_index ; /* journal list number of the current trans */ int j_list_bitmap_index ; /* number of next list bitmap to use */ int j_must_wait ; /* no more journal begins allowed. MUST sleep on j_join_wait */ int j_next_full_flush ; /* next journal_end will flush all journal list */ @@ -246,19 +223,37 @@ struct reiserfs_journal { struct reiserfs_journal_cnode *j_cnode_free_list ; struct reiserfs_journal_cnode *j_cnode_free_orig ; /* orig pointer returned from vmalloc */ + struct reiserfs_journal_list *j_current_jl; int j_free_bitmap_nodes ; int j_used_bitmap_nodes ; + + int j_num_lists; /* total number of active transactions */ + int j_num_work_lists; /* number that need attention from kreiserfsd */ + + /* debugging to make sure things are flushed in order */ + int j_last_flush_id; + + /* debugging to make sure things are committed in order */ + int j_last_commit_id; + struct list_head j_bitmap_nodes ; struct list_head j_dirty_buffers ; spinlock_t j_dirty_buffers_lock ; /* protects j_dirty_buffers */ + + /* list of all active transactions */ + struct list_head j_journal_list; + /* lists that haven't been touched by writeback attempts */ + struct list_head j_working_list; + struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS] ; /* array of bitmaps to record the deleted blocks */ - struct reiserfs_journal_list j_journal_list[JOURNAL_LIST_COUNT] ; /* array of all the journal lists */ struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for real buffer heads in current trans */ struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for all the real buffer heads in all the transactions */ struct list_head j_prealloc_list; /* list of inodes which have preallocated blocks */ unsigned long j_max_trans_size ; unsigned long j_max_batch_size ; + + struct work_struct j_work; }; #define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick. magic string to find desc blocks in the journal */ @@ -417,7 +412,6 @@ struct reiserfs_sb_info #define REISERFS_LARGETAIL 0 /* large tails will be created in a session */ #define REISERFS_SMALLTAIL 17 /* small (for files less than block size) tails will be created in a session */ #define REPLAYONLY 3 /* replay journal and return 0. 
Use by fsck */ -#define REISERFS_NOLOG 4 /* -o nolog: turn journalling off */ #define REISERFS_CONVERT 5 /* -o conv: causes conversion of old format super block to the new format. If not specified - old @@ -473,8 +467,6 @@ struct reiserfs_sb_info void reiserfs_file_buffer (struct buffer_head * bh, int list); extern struct file_system_type reiserfs_fs_type; -int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ; -int flush_old_commits(struct super_block *s, int) ; int reiserfs_resize(struct super_block *, unsigned long) ; #define CARRY_ON 0 @@ -484,8 +476,6 @@ int reiserfs_resize(struct super_block * #define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh) #define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal) #define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block) -#define SB_JOURNAL_LIST(s) (SB_JOURNAL(s)->j_journal_list) -#define SB_JOURNAL_LIST_INDEX(s) (SB_JOURNAL(s)->j_journal_list_index) #define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free) #define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap) _
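
A few toy models of the new mechanisms follow. Everything below is
illustration only, written as standalone userspace C; any name that does not
appear in the patch is a stand-in, not kernel API.

With the old multi-list flushing gone, reiserfs_flush_old_commits() is only
asked one question by kreiserfsd: should the currently open transaction be
ended because it has gone idle? The patch answers with a conjunction of four
tests (no active writers, the transaction has really started, it logged at
least one block, and it is past the age cutoff), roughly:

#include <stdio.h>
#include <time.h>

/* All four conditions must hold before kreiserfsd ends the transaction.
 * Function and parameter names here are ours. */
static int transaction_too_old(int wcount, time_t trans_start,
                               unsigned long trans_len,
                               time_t now, time_t max_age)
{
    return wcount <= 0 && trans_start > 0 && trans_len > 0 &&
           (now - trans_start) > max_age;
}

int main(void)
{
    time_t now = 1000;

    printf("%d\n", transaction_too_old(0, 900, 5, now, 30)); /* 1: end it */
    printf("%d\n", transaction_too_old(3, 900, 5, now, 30)); /* 0: writers */
    printf("%d\n", transaction_too_old(0,   0, 0, now, 30)); /* 0: empty */
    return 0;
}

Note that the real function then commits with COMMIT_NOW | WAIT, since an
async commit from kreiserfsd would just be queued back to kreiserfsd.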
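reiserfs_commit_for_inode() keys off the (i_trans_id, i_jl) pair that
reiserfs_update_inode_transaction() stamps on the inode. Ignoring the
join/grow races that __commit_trans_jl() also has to recheck, the decision
reduces to three cases; the enum names below are invented for the sketch:

#include <stdio.h>

enum commit_action {
    JOIN_AND_END_SYNC,   /* transaction still open: join it, end it sync */
    FLUSH_COMMIT_LIST,   /* closed, but its commit may not be on disk yet */
    NOTHING_TO_DO        /* the list is long gone, so it already committed */
};

static enum commit_action
commit_action(unsigned long inode_trans_id, unsigned long current_trans_id,
              int list_still_alive)
{
    if (inode_trans_id == current_trans_id)
        return JOIN_AND_END_SYNC;
    if (list_still_alive)
        return FLUSH_COMMIT_LIST;
    return NOTHING_TO_DO;
}

int main(void)
{
    /* fsync of a file last touched two transactions ago, list still live */
    printf("%d\n", commit_action(7, 9, 1) == FLUSH_COMMIT_LIST);
    /* fsync of a file dirtied in the running transaction */
    printf("%d\n", commit_action(9, 9, 0) == JOIN_AND_END_SYNC);
    return 0;
}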
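The reworked reiserfs_prepare_for_journal() / reiserfs_restore_prepared_buffer()
pair is a small bit protocol: prepare takes the buffer's dirty bit away and
records the debt in BH_JRestore_dirty; restore pays it back if the operation
aborts, so normal writeback can still clean the block. A toy model of just the
bit handling, which omits the buffer locking and the journal-dirty/can_dirty()
checks the real restore path makes before re-dirtying:

#include <stdio.h>

struct toy_bh {                /* simplified stand-in for a buffer head */
    int dirty;                 /* stands in for BH_Dirty */
    int jprepared;             /* BH_JPrepared: readied for logging */
    int jrestore_dirty;        /* BH_JRestore_dirty: re-dirty on restore */
};

/* prepare: steal the dirty bit, but remember that we owe it back */
static void toy_prepare(struct toy_bh *bh)
{
    bh->jprepared = 1;
    if (bh->dirty) {
        bh->dirty = 0;
        bh->jrestore_dirty = 1;
    }
}

/* restore: the balance aborted, return the dirty bit so the block can
 * still be written out by ordinary means */
static void toy_restore(struct toy_bh *bh)
{
    if (bh->jrestore_dirty) {
        bh->jrestore_dirty = 0;
        bh->dirty = 1;
    }
    bh->jprepared = 0;
}

int main(void)
{
    struct toy_bh bh = { .dirty = 1 };

    toy_prepare(&bh);
    printf("prepared: dirty=%d restore=%d\n", bh.dirty, bh.jrestore_dirty);
    toy_restore(&bh);
    printf("restored: dirty=%d restore=%d\n", bh.dirty, bh.jrestore_dirty);
    return 0;
}

This is also why prepare now returns an int: callers like
wait_tb_buffers_until_unlocked() use the return value instead of a separate
buffer_locked() test.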
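The first_jl loop in do_journal_end() decides whether an older journal list
must be flushed before the next transaction starts, by asking whether the next
transaction's worst-case extent reaches the older list's start block, including
the case where it wraps around the circular log. The test is pure modular
arithmetic and can be checked in isolation (function name ours):

#include <stdio.h>

/* Would a transaction beginning at 'start', spanning at most max_len + 1
 * blocks of a size-block circular log, reach 'target'? */
static int may_overwrite(unsigned long start, unsigned long max_len,
                         unsigned long target, unsigned long size)
{
    unsigned long end = start + max_len + 1;

    if (start <= target)        /* target lies ahead of us in the log */
        return end >= target;
    if (end > size)             /* we wrap; check the wrapped portion */
        return (end % size) >= target;
    return 0;                   /* behind us and no wrap: safe */
}

int main(void)
{
    /* 8192-block log: a transaction at 8000 with max_len 1024 wraps and
     * would overwrite a list starting at block 100 */
    printf("%d\n", may_overwrite(8000, 1024, 100, 8192));  /* 1: flush it */
    printf("%d\n", may_overwrite(100, 1024, 4000, 8192));  /* 0: safe */
    return 0;
}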
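Finally, the structural change underneath all of this: the fixed
JOURNAL_LIST_COUNT array is gone and j_journal_list keeps transactions as a
time-ordered list. Committing the youngest (tail) entry forces out every older
commit, which is what the new flush_async_commits() relies on, while
flush_old_journal_lists() cleans from the oldest (head) end until it meets a
list younger than four times the max transaction age. A userspace sketch of
those two traversals with a hand-rolled circular list (the threshold mirrors
the patch; the rest is a stand-in):

#include <stdio.h>
#include <time.h>

#define JOURNAL_MAX_TRANS_AGE 30   /* seconds, as in the stock tree */

struct jl {                        /* stand-in for reiserfs_journal_list */
    unsigned long trans_id;
    time_t timestamp;
    struct jl *next, *prev;        /* circular list anchored at a head node */
};

static void jl_add_tail(struct jl *head, struct jl *n)
{
    n->prev = head->prev;
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

/* flush_async_commits(): the tail entry is the youngest; committing it
 * implicitly commits everything older */
static struct jl *youngest(struct jl *head)
{
    return head->prev != head ? head->prev : NULL;
}

/* flush_old_journal_lists(): walk oldest-first and stop at the first
 * entry that is still too young to bother with */
static void flush_old(struct jl *head, time_t now)
{
    while (head->next != head) {
        struct jl *jl = head->next;

        if (jl->timestamp >= now - (JOURNAL_MAX_TRANS_AGE * 4))
            break;
        printf("flush trans %lu\n", jl->trans_id);
        jl->prev->next = jl->next;     /* unlink, as a full flush would */
        jl->next->prev = jl->prev;
    }
}

int main(void)
{
    struct jl head = { 0, 0, &head, &head };
    struct jl a = { 1, 0 }, b = { 2, 0 };
    time_t now = time(NULL);

    a.timestamp = now - 500;           /* aged out */
    b.timestamp = now;                 /* current */
    jl_add_tail(&head, &a);
    jl_add_tail(&head, &b);

    printf("youngest: trans %lu\n", youngest(&head)->trans_id);
    flush_old(&head, now);
    return 0;
}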