From: Nathan Scott <nathans@sgi.com>

This patch adds a mechanism by which a filesystem can register an interest in
the completion of direct I/O.  The completion routine will be given the
inode, an offset and a length, and an optional filesystem-private field.

We have extended the use of the buffer_head-based interface (i.e.
get_blocks_t) for direct I/O such that the b_private field is now utilised.
It is defined to be initially zero at the start of I/O, and will be passed
into the filesystem unmodified by the VFS with each map request, while
setting up the direct I/O.  Once I/O has completed, the final value of this
pointer will be passed into the filesystem's I/O completion handler.  This
mechanism can be used to keep track of all of the mapping requests which
encompass an individual direct I/O request.

This has been implemented specifically for XFS, but is done so as to be as
generic as possible.  XFS uses this mechanism to provide support for
unwritten extents - these are file extents which have been pre-allocated
on-disk, but not yet written to (once written, these become regular file
extents, but only once I/O is complete).


 25-akpm/fs/block_dev.c          |    2 -
 25-akpm/fs/direct-io.c          |   28 +++++++++++++++++++++++----
 25-akpm/fs/ext2/inode.c         |    2 -
 25-akpm/fs/ext3/inode.c         |    3 +-
 25-akpm/fs/jfs/inode.c          |    2 -
 25-akpm/fs/xfs/linux/xfs_aops.c |   41 +++++++++++++++++++++++++++++++++-------
 25-akpm/include/linux/fs.h      |    4 ++-
 7 files changed, 66 insertions(+), 16 deletions(-)

diff -puN fs/block_dev.c~xfs-dio-unwritten-extents fs/block_dev.c
--- 25/fs/block_dev.c~xfs-dio-unwritten-extents	Wed Jul 30 14:16:24 2003
+++ 25-akpm/fs/block_dev.c	Wed Jul 30 14:16:24 2003
@@ -125,7 +125,7 @@ blkdev_direct_IO(int rw, struct kiocb *i
 	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
 
 	return blockdev_direct_IO(rw, iocb, inode, inode->i_bdev, iov, offset,
-				nr_segs, blkdev_get_blocks);
+				nr_segs, blkdev_get_blocks, NULL);
 }
 
 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
diff -puN fs/direct-io.c~xfs-dio-unwritten-extents fs/direct-io.c
--- 25/fs/direct-io.c~xfs-dio-unwritten-extents	Wed Jul 30 14:16:24 2003
+++ 25-akpm/fs/direct-io.c	Wed Jul 30 14:16:24 2003
@@ -15,6 +15,8 @@
  *		added support for non-aligned IO.
  * 06Nov2002	pbadari@us.ibm.com
  *		added asynchronous IO support.
+ * 21Jul2003	nathans@sgi.com
+ *		added IO completion notifier.
  */
 
 #include <linux/kernel.h>
@@ -74,6 +76,7 @@ struct dio {
 	int boundary;			/* prev block is at a boundary */
 	int reap_counter;		/* rate limit reaping */
 	get_blocks_t *get_blocks;	/* block mapping function */
+	dio_iodone_t *end_io;		/* IO completion function */
 	sector_t final_block_in_bio;	/* current final block in bio + 1 */
 	sector_t next_block_for_io;	/* next block to be put under IO,
 					   in dio_blocks units */
@@ -193,13 +196,27 @@ static struct page *dio_get_page(struct 
 }
 
 /*
+ * Called when all DIO BIO I/O has been completed - let the filesystem
+ * know, if it registered an interest earlier via get_blocks.  Pass the
+ * private field of the map buffer_head so that filesystems can use it
+ * to hold additional state between get_blocks calls and dio_complete.
+ */
+static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
+{
+	if (dio->end_io)
+		dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private);
+}
+
+/*
  * Called when a BIO has been processed.  If the count goes to zero then IO is
  * complete and we can signal this to the AIO layer.
  */
 static void finished_one_bio(struct dio *dio)
 {
 	if (atomic_dec_and_test(&dio->bio_count)) {
-		if(dio->is_async) {
+		if (dio->is_async) {
+			dio_complete(dio, dio->block_in_file << dio->blkbits,
+					dio->result);
 			aio_complete(dio->iocb, dio->result, 0);
 			kfree(dio);
 		}
@@ -824,7 +841,7 @@ out:
 static int
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
-	unsigned blkbits, get_blocks_t get_blocks)
+	unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io)
 {
 	unsigned long user_addr; 
 	int seg;
@@ -852,6 +869,8 @@ direct_io_worker(int rw, struct kiocb *i
 	dio->boundary = 0;
 	dio->reap_counter = 0;
 	dio->get_blocks = get_blocks;
+	dio->end_io = end_io;
+	dio->map_bh.b_private = NULL;
 	dio->final_block_in_bio = -1;
 	dio->next_block_for_io = -1;
 
@@ -953,6 +972,7 @@ direct_io_worker(int rw, struct kiocb *i
 			if (rw == READ && (offset + ret > i_size))
 				ret = i_size - offset;
 		}
+		dio_complete(dio, offset, ret);
 		kfree(dio);
 	}
 	return ret;
@@ -964,7 +984,7 @@ direct_io_worker(int rw, struct kiocb *i
 int
 blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
-	unsigned long nr_segs, get_blocks_t get_blocks)
+	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io)
 {
 	int seg;
 	size_t size;
@@ -999,7 +1019,7 @@ blockdev_direct_IO(int rw, struct kiocb 
 	}
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset, 
-				nr_segs, blkbits, get_blocks);
+				nr_segs, blkbits, get_blocks, end_io);
 out:
 	return retval;
 }
diff -puN fs/ext2/inode.c~xfs-dio-unwritten-extents fs/ext2/inode.c
--- 25/fs/ext2/inode.c~xfs-dio-unwritten-extents	Wed Jul 30 14:16:24 2003
+++ 25-akpm/fs/ext2/inode.c	Wed Jul 30 14:16:24 2003
@@ -662,7 +662,7 @@ ext2_direct_IO(int rw, struct kiocb *ioc
 	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
 
 	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				offset, nr_segs, ext2_get_blocks);
+				offset, nr_segs, ext2_get_blocks, NULL);
 }
 
 static int
diff -puN fs/ext3/inode.c~xfs-dio-unwritten-extents fs/ext3/inode.c
--- 25/fs/ext3/inode.c~xfs-dio-unwritten-extents	Wed Jul 30 14:16:24 2003
+++ 25-akpm/fs/ext3/inode.c	Wed Jul 30 14:16:24 2003
@@ -1562,7 +1562,8 @@ static int ext3_direct_IO(int rw, struct
 	}
 
 	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 
-				offset, nr_segs, ext3_direct_io_get_blocks);
+				 offset, nr_segs,
+				 ext3_direct_io_get_blocks, NULL);
 
 out_stop:
 	if (handle) {
diff -puN fs/jfs/inode.c~xfs-dio-unwritten-extents fs/jfs/inode.c
--- 25/fs/jfs/inode.c~xfs-dio-unwritten-extents	Wed Jul 30 14:16:24 2003
+++ 25-akpm/fs/jfs/inode.c	Wed Jul 30 14:16:24 2003
@@ -308,7 +308,7 @@ static int jfs_direct_IO(int rw, struct 
 	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
 
 	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				offset, nr_segs, jfs_get_blocks);
+				offset, nr_segs, jfs_get_blocks, NULL);
 }
 
 struct address_space_operations jfs_aops = {
diff -puN fs/xfs/linux/xfs_aops.c~xfs-dio-unwritten-extents fs/xfs/linux/xfs_aops.c
--- 25/fs/xfs/linux/xfs_aops.c~xfs-dio-unwritten-extents	Wed Jul 30 14:16:24 2003
+++ 25-akpm/fs/xfs/linux/xfs_aops.c	Wed Jul 30 14:16:24 2003
@@ -76,10 +76,10 @@ linvfs_unwritten_done(
 
 /*
  * Issue transactions to convert a buffer range from unwritten
- * to written extents.
+ * to written extents (buffered IO).
  */
 STATIC void
-linvfs_unwritten_conv(
+linvfs_unwritten_convert(
 	xfs_buf_t	*bp)
 {
 	vnode_t		*vp = XFS_BUF_FSPRIVATE(bp, vnode_t *);
@@ -96,6 +96,30 @@ linvfs_unwritten_conv(
 	pagebuf_iodone(bp, 0, 0);
 }
 
+/*
+ * Issue transactions to convert a buffer range from unwritten
+ * to written extents (direct IO).
+ */
+STATIC void
+linvfs_unwritten_convert_direct(
+	struct inode	*inode,
+	loff_t		offset,
+	ssize_t		size,
+	void		*private)
+{
+	ASSERT(!private || inode == (struct inode *)private);
+
+	/* private indicates an unwritten extent lay beneath this IO,
+	 * see linvfs_get_block_core.
+	 */
+	if (private && size > 0) {
+		vnode_t	*vp = LINVFS_GET_VP(inode);
+		int	error;
+
+		VOP_BMAP(vp, offset, size, BMAP_UNWRITTEN, NULL, NULL, error);
+	}
+}
+
 STATIC int
 map_blocks(
 	struct inode		*inode,
@@ -456,7 +480,7 @@ map_unwritten(
 	XFS_BUF_SET_SIZE(pb, size);
 	XFS_BUF_SET_OFFSET(pb, offset);
 	XFS_BUF_SET_FSPRIVATE(pb, LINVFS_GET_VP(inode));
-	XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_conv);
+	XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_convert);
 
 	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
 		pagebuf_iodone(pb, 1, 1);
@@ -804,7 +828,7 @@ STATIC int
 linvfs_get_block_core(
 	struct inode		*inode,
 	sector_t		iblock,
-	int			blocks,
+	unsigned long		blocks,
 	struct buffer_head	*bh_result,
 	int			create,
 	int			direct,
@@ -854,8 +878,11 @@ linvfs_get_block_core(
 			set_buffer_mapped(bh_result);
 		}
 		if (pbmap.pbm_flags & PBMF_UNWRITTEN) {
-			if (create)
+			if (create) {
+				if (direct)
+					bh_result->b_private = inode;
 				set_buffer_mapped(bh_result);
+			}
 			set_buffer_unwritten(bh_result);
 			set_buffer_delay(bh_result);
 		}
@@ -935,8 +962,8 @@ linvfs_direct_IO(
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
 
-        return blockdev_direct_IO(rw, iocb, inode, NULL,
-			iov, offset, nr_segs, linvfs_get_blocks_direct);
+        return blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs,
+		linvfs_get_blocks_direct, linvfs_unwritten_convert_direct);
 }
 
 
diff -puN include/linux/fs.h~xfs-dio-unwritten-extents include/linux/fs.h
--- 25/include/linux/fs.h~xfs-dio-unwritten-extents	Wed Jul 30 14:16:24 2003
+++ 25-akpm/include/linux/fs.h	Wed Jul 30 14:16:24 2003
@@ -219,6 +219,8 @@ typedef int (get_block_t)(struct inode *
 typedef int (get_blocks_t)(struct inode *inode, sector_t iblock,
 			unsigned long max_blocks,
 			struct buffer_head *bh_result, int create);
+typedef void (dio_iodone_t)(struct inode *inode, loff_t offset,
+			ssize_t bytes, void *private);
 
 /*
  * Attribute flags.  These should be or-ed together to figure out what
@@ -1291,7 +1293,7 @@ extern ssize_t generic_file_direct_IO(in
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs);
 extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
-	unsigned long nr_segs, get_blocks_t *get_blocks);
+	unsigned long nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io);
 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, 
 	unsigned long nr_segs, loff_t *ppos);
 ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, 

_