Filesystem aio write


 drivers/block/ll_rw_blk.c |   35 ++++++++++++++++++++++++++++++-----
 fs/buffer.c               |    3 ++-
 include/linux/blkdev.h    |    1 +
 include/linux/writeback.h |    4 ++--
 mm/filemap.c              |   29 ++++++++++++++++++++++++-----
 mm/page-writeback.c       |   17 ++++++++++++-----
 6 files changed, 71 insertions(+), 18 deletions(-)

diff -puN drivers/block/ll_rw_blk.c~aio-05-fs_write drivers/block/ll_rw_blk.c
--- 25/drivers/block/ll_rw_blk.c~aio-05-fs_write	2003-07-25 20:08:45.000000000 -0700
+++ 25-akpm/drivers/block/ll_rw_blk.c	2003-07-25 20:08:46.000000000 -0700
@@ -1699,25 +1699,50 @@ void blk_put_request(struct request *req
 }
 
 /**
- * blk_congestion_wait - wait for a queue to become uncongested
+ * blk_congestion_wait_wq - wait for a queue to become uncongested
  * @rw: READ or WRITE
  * @timeout: timeout in jiffies
+ * @wait: wait queue entry to use for waiting or async notification
+ * (NULL defaults to synchronous behaviour)
  *
  * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
  * If no queues are congested then just wait for the next request to be
  * returned.
+ *
+ * If the wait queue parameter specifies an async i/o callback,
+ * then instead of blocking, just register the callback on the wait
+ * queue for async notification when the queue gets uncongested.
  */
-void blk_congestion_wait(int rw, long timeout)
+int blk_congestion_wait_wq(int rw, long timeout, wait_queue_t *wait)
 {
-	DEFINE_WAIT(wait);
 	wait_queue_head_t *wqh = &congestion_wqh[rw];
+	DEFINE_WAIT(local_wait);
+
+	if (!wait)
+		wait = &local_wait;
 
 	blk_run_queues();
-	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	prepare_to_wait(wqh, wait, TASK_UNINTERRUPTIBLE);
+	if (!is_sync_wait(wait)) {
+		/*
+		 * if we've queued an async wait queue
+		 * callback do not block; just tell the
+		 * caller to return and retry later when
+		 * the callback is notified
+		 */
+		return -EIOCBRETRY;
+	}
 	io_schedule_timeout(timeout);
-	finish_wait(wqh, &wait);
+	finish_wait(wqh, wait);
+	return 0;
+}
+
+void blk_congestion_wait(int rw, long timeout)
+{
+	blk_congestion_wait_wq(rw, timeout, NULL);
 }
 
+
 /*
  * Has to be called with the request spinlock acquired
  */
diff -puN fs/buffer.c~aio-05-fs_write fs/buffer.c
--- 25/fs/buffer.c~aio-05-fs_write	2003-07-25 20:08:45.000000000 -0700
+++ 25-akpm/fs/buffer.c	2003-07-25 20:08:46.000000000 -0700
@@ -1986,7 +1986,8 @@ static int __block_prepare_write(struct 
 	 * If we issued read requests - let them complete.
 	 */
 	while(wait_bh > wait) {
-		wait_on_buffer(*--wait_bh);
+		if ((err = wait_on_buffer_wq(*--wait_bh, current->io_wait)))
+			return err;
 		if (!buffer_uptodate(*wait_bh))
 			return -EIO;
 	}
diff -puN include/linux/blkdev.h~aio-05-fs_write include/linux/blkdev.h
--- 25/include/linux/blkdev.h~aio-05-fs_write	2003-07-25 20:08:46.000000000 -0700
+++ 25-akpm/include/linux/blkdev.h	2003-07-25 20:08:46.000000000 -0700
@@ -577,6 +577,7 @@ extern void blk_queue_free_tags(request_
 extern int blk_queue_resize_tags(request_queue_t *, int);
 extern void blk_queue_invalidate_tags(request_queue_t *);
 extern void blk_congestion_wait(int rw, long timeout);
+extern int blk_congestion_wait_wq(int rw, long timeout, wait_queue_t *wait);
 
 extern void blk_rq_bio_prep(request_queue_t *, struct request *, struct bio *);
 extern void blk_rq_prep_restart(struct request *);
diff -puN include/linux/writeback.h~aio-05-fs_write include/linux/writeback.h
--- 25/include/linux/writeback.h~aio-05-fs_write	2003-07-25 20:08:46.000000000 -0700
+++ 25-akpm/include/linux/writeback.h	2003-07-25 20:08:46.000000000 -0700
@@ -84,8 +84,8 @@ int dirty_writeback_centisecs_handler(st
 					  void *, size_t *);
 
 void page_writeback_init(void);
-void balance_dirty_pages(struct address_space *mapping);
-void balance_dirty_pages_ratelimited(struct address_space *mapping);
+int balance_dirty_pages(struct address_space *mapping);
+int balance_dirty_pages_ratelimited(struct address_space *mapping);
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
 
diff -puN mm/filemap.c~aio-05-fs_write mm/filemap.c
--- 25/mm/filemap.c~aio-05-fs_write	2003-07-25 20:08:46.000000000 -0700
+++ 25-akpm/mm/filemap.c	2003-07-25 20:08:46.000000000 -0700
@@ -449,8 +449,8 @@ struct page *find_trylock_page(struct ad
  *
  * Returns zero if the page was not present. find_lock_page() may sleep.
  */
-struct page *find_lock_page(struct address_space *mapping,
-				unsigned long offset)
+struct page *find_lock_page_wq(struct address_space *mapping,
+				unsigned long offset, wait_queue_t *wait)
 {
 	struct page *page;
 
@@ -461,7 +461,10 @@ repeat:
 		page_cache_get(page);
 		if (TestSetPageLocked(page)) {
 			spin_unlock(&mapping->page_lock);
-			lock_page(page);
+			if (-EIOCBRETRY == lock_page_wq(page, wait)) {
+				page_cache_release(page);
+				return ERR_PTR(-EIOCBRETRY);
+			}
 			spin_lock(&mapping->page_lock);
 
 			/* Has the page been truncated while we slept? */
@@ -476,6 +479,12 @@ repeat:
 	return page;
 }
 
+struct page *find_lock_page(struct address_space *mapping,
+				unsigned long offset)
+{
+	return find_lock_page_wq(mapping, offset, NULL);
+}
+
 /**
  * find_or_create_page - locate or add a pagecache page
  *
@@ -1454,7 +1463,9 @@ __grab_cache_page(struct address_space *
 	int err;
 	struct page *page;
 repeat:
-	page = find_lock_page(mapping, index);
+	page = find_lock_page_wq(mapping, index, current->io_wait);
+	if (IS_ERR(page))
+		return page;
 	if (!page) {
 		if (!*cached_page) {
 			*cached_page = page_cache_alloc(mapping);
@@ -1795,6 +1806,10 @@ generic_file_aio_write_nolock(struct kio
 		fault_in_pages_readable(buf, bytes);
 
 		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
+		if (IS_ERR(page)) {
+			status = PTR_ERR(page);
+			break;
+		}
 		if (!page) {
 			status = -ENOMEM;
 			break;
@@ -1845,7 +1860,11 @@ generic_file_aio_write_nolock(struct kio
 		page_cache_release(page);
 		if (status < 0)
 			break;
-		balance_dirty_pages_ratelimited(mapping);
+		status = balance_dirty_pages_ratelimited(mapping);
+		if (status < 0) {
+			pr_debug("async balance_dirty_pages\n");
+			break;
+		}
 		cond_resched();
 	} while (count);
 	*ppos = pos;
diff -puN mm/page-writeback.c~aio-05-fs_write mm/page-writeback.c
--- 25/mm/page-writeback.c~aio-05-fs_write	2003-07-25 20:08:46.000000000 -0700
+++ 25-akpm/mm/page-writeback.c	2003-07-25 20:08:46.000000000 -0700
@@ -144,7 +144,7 @@ get_dirty_limits(struct page_state *ps, 
  * If we're over `background_thresh' then pdflush is woken to perform some
  * writeout.
  */
-void balance_dirty_pages(struct address_space *mapping)
+int balance_dirty_pages(struct address_space *mapping)
 {
 	struct page_state ps;
 	long nr_reclaimable;
@@ -161,6 +161,7 @@ void balance_dirty_pages(struct address_
 			.sync_mode	= WB_SYNC_NONE,
 			.older_than_this = NULL,
 			.nr_to_write	= write_chunk,
+			.nonblocking	= !is_sync_wait(current->io_wait)
 		};
 
 		get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
@@ -187,7 +188,11 @@ void balance_dirty_pages(struct address_
 			if (pages_written >= write_chunk)
 				break;		/* We've done our duty */
 		}
-		blk_congestion_wait(WRITE, HZ/10);
+		if (-EIOCBRETRY == blk_congestion_wait_wq(WRITE, HZ/10,
+			current->io_wait)) {
+			pr_debug("async blk congestion wait\n");
+			return -EIOCBRETRY;
+		}
 	}
 
 	if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
@@ -195,6 +200,8 @@ void balance_dirty_pages(struct address_
 
 	if (!writeback_in_progress(bdi) && nr_reclaimable > background_thresh)
 		pdflush_operation(background_writeout, 0);
+
+	return 0;
 }
 
 /**
@@ -210,7 +217,7 @@ void balance_dirty_pages(struct address_
  * decrease the ratelimiting by a lot, to prevent individual processes from
  * overshooting the limit by (ratelimit_pages) each.
  */
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+int balance_dirty_pages_ratelimited(struct address_space *mapping)
 {
 	static DEFINE_PER_CPU(int, ratelimits) = 0;
 	long ratelimit;
@@ -222,10 +229,10 @@ void balance_dirty_pages_ratelimited(str
 	if (get_cpu_var(ratelimits)++ >= ratelimit) {
 		__get_cpu_var(ratelimits) = 0;
 		put_cpu_var(ratelimits);
-		balance_dirty_pages(mapping);
-		return;
+		return balance_dirty_pages(mapping);
 	}
 	put_cpu_var(ratelimits);
+	return 0;
 }
 
 /*

_