diff -urN rawio/fs/buffer.c o_direct/fs/buffer.c
--- rawio/fs/buffer.c	Tue Apr 24 07:37:00 2001
+++ o_direct/fs/buffer.c	Tue Apr 24 07:40:09 2001
@@ -565,6 +565,16 @@
 	spin_unlock(&lru_list_lock);
 }
 
+void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
+{	/* Put bh on inode's i_dirty_data_buffers list, moving it off any list it is already on. */
+	spin_lock(&lru_list_lock);
+	if (bh->b_inode)	/* non-NULL b_inode is treated as "currently queued": unlink first */
+		list_del(&bh->b_inode_buffers);
+	bh->b_inode = inode;
+	list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers);
+	spin_unlock(&lru_list_lock);
+}
+
 /* The caller must have the lru_list lock before calling the 
    remove_inode_queue functions.  */
 static void __remove_inode_queue(struct buffer_head *bh)
@@ -584,7 +594,7 @@
 	int ret;
 	
 	spin_lock(&lru_list_lock);
-	ret = !list_empty(&inode->i_dirty_buffers);
+	ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
 	spin_unlock(&lru_list_lock);
 	
 	return ret;
@@ -819,6 +829,113 @@
     bh->b_end_io = end_buffer_io_async ;
 }
 
+int osync_inode_data_buffers(struct inode *inode)
+{	/* Wait on every locked buffer of the inode's dirty-data list; starts no new I/O. */
+	struct buffer_head *bh;
+	struct list_head *list;
+	int err = 0;	/* becomes -EIO if a buffer we waited on is not uptodate */
+
+	spin_lock(&lru_list_lock);	/* held for the list walk, dropped only around the wait */
+ repeat:				/* the lock was dropped: rescan from the tail */
+	for (list = inode->i_dirty_data_buffers.prev; 
+	     bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers;
+	     list = bh->b_inode_buffers.prev) {
+		if (buffer_locked(bh)) {
+			atomic_inc(&bh->b_count);
+			spin_unlock(&lru_list_lock);
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				err = -EIO;
+			brelse(bh);
+			spin_lock(&lru_list_lock);
+			goto repeat;
+		}
+	}
+	spin_unlock(&lru_list_lock);	/* balance the lock: never return with it held */
+	return err;
+}
+
+/*
+ * osync is designed to support O_SYNC io.  It waits synchronously for
+ * all already-submitted IO to complete, but does not queue any new
+ * writes to the disk.  It takes lru_list_lock itself; returns 0 or -EIO.
+ *
+ * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
+ * you dirty the buffers, and then use osync_inode_buffers to wait for
+ * completion.  Any other dirty buffers which are not yet queued for
+ * write will not be flushed to disk by the osync.
+ */
+
+int osync_inode_buffers(struct inode *inode)
+{
+	struct buffer_head *bh;
+	struct list_head *list;
+	int err = 0;	/* becomes -EIO if a buffer we waited on is not uptodate */
+
+	spin_lock(&lru_list_lock);	/* held for the list walk, dropped only around the wait */
+ repeat:				/* the lock was dropped: rescan from the tail */
+	for (list = inode->i_dirty_buffers.prev; 
+	     bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
+	     list = bh->b_inode_buffers.prev) {
+		if (buffer_locked(bh)) {
+			atomic_inc(&bh->b_count);
+			spin_unlock(&lru_list_lock);
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				err = -EIO;
+			brelse(bh);
+			spin_lock(&lru_list_lock);
+			goto repeat;
+		}
+	}
+	spin_unlock(&lru_list_lock);	/* balance the lock: never return with it held */
+	return err;
+}
+
+int fsync_inode_data_buffers(struct inode *inode)
+{
+	struct buffer_head *bh;
+	struct inode tmp;	/* only its i_dirty_data_buffers list head is used */
+	int err = 0;
+	
+	INIT_LIST_HEAD(&tmp.i_dirty_data_buffers);
+	
+	spin_lock(&lru_list_lock);
+	/* Pass 1: empty the inode's data list, submitting a write for each dirty buffer. */
+	while (!list_empty(&inode->i_dirty_data_buffers)) {
+		bh = BH_ENTRY(inode->i_dirty_data_buffers.next);
+		list_del(&bh->b_inode_buffers);
+		if (!buffer_dirty(bh) && !buffer_locked(bh))
+			bh->b_inode = NULL;	/* neither dirty nor locked: just drop it from the list */
+		else {
+			bh->b_inode = &tmp;	/* park it on the private list for pass 2 */
+			list_add(&bh->b_inode_buffers, &tmp.i_dirty_data_buffers);
+			if (buffer_dirty(bh)) {
+				atomic_inc(&bh->b_count);	/* b_count raised across the unlocked section; brelse() below pairs with it */
+				spin_unlock(&lru_list_lock);
+				ll_rw_block(WRITE, 1, &bh);
+				brelse(bh);
+				spin_lock(&lru_list_lock);
+			}
+		}
+	}
+	/* Pass 2: wait on every parked buffer; one that is not uptodate afterwards yields -EIO. */
+	while (!list_empty(&tmp.i_dirty_data_buffers)) {
+		bh = BH_ENTRY(tmp.i_dirty_data_buffers.prev);
+		remove_inode_queue(bh);
+		atomic_inc(&bh->b_count);
+		spin_unlock(&lru_list_lock);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			err = -EIO;
+		brelse(bh);
+		spin_lock(&lru_list_lock);
+	}
+	spin_unlock(&lru_list_lock);
+
+	return err;
+}
+
 /*
  * Synchronise all the inode's dirty buffers to the disk.
  *
@@ -843,7 +960,7 @@
 {
 	struct buffer_head *bh;
 	struct inode tmp;
-	int err = 0, err2;
+	int err = 0;
 	
 	INIT_LIST_HEAD(&tmp.i_dirty_buffers);
 	
@@ -878,58 +995,11 @@
 		brelse(bh);
 		spin_lock(&lru_list_lock);
 	}
-	
 	spin_unlock(&lru_list_lock);
-	err2 = osync_inode_buffers(inode);
-
-	if (err)
-		return err;
-	else
-		return err2;
-}
 
-
-/*
- * osync is designed to support O_SYNC io.  It waits synchronously for
- * all already-submitted IO to complete, but does not queue any new
- * writes to the disk.
- *
- * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
- * you dirty the buffers, and then use osync_inode_buffers to wait for
- * completion.  Any other dirty buffers which are not yet queued for
- * write will not be flushed to disk by the osync.
- */
-
-int osync_inode_buffers(struct inode *inode)
-{
-	struct buffer_head *bh;
-	struct list_head *list;
-	int err = 0;
-
-	spin_lock(&lru_list_lock);
-	
- repeat:
-	
-	for (list = inode->i_dirty_buffers.prev; 
-	     bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
-	     list = bh->b_inode_buffers.prev) {
-		if (buffer_locked(bh)) {
-			atomic_inc(&bh->b_count);
-			spin_unlock(&lru_list_lock);
-			wait_on_buffer(bh);
-			if (!buffer_uptodate(bh))
-				err = -EIO;
-			brelse(bh);
-			spin_lock(&lru_list_lock);
-			goto repeat;
-		}
-	}
-
-	spin_unlock(&lru_list_lock);
 	return err;
 }
 
-
 /*
  * Invalidate any and all dirty buffers on a given inode.  We are
  * probably unmounting the fs, but that doesn't mean we have already
@@ -937,15 +1007,13 @@
  */
 void invalidate_inode_buffers(struct inode *inode)
 {
-	struct list_head *list, *next;
+	struct list_head * entry;
 	
 	spin_lock(&lru_list_lock);
-	list = inode->i_dirty_buffers.next; 
-	while (list != &inode->i_dirty_buffers) {
-		next = list->next;
-		remove_inode_queue(BH_ENTRY(list));
-		list = next;
-	}
+	while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
+		remove_inode_queue(BH_ENTRY(entry));
+	while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
+		remove_inode_queue(BH_ENTRY(entry));
 	spin_unlock(&lru_list_lock);
 }
 
@@ -1139,8 +1207,8 @@
 	if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf) || buffer_protected(buf))
 		goto in_use;
 	__hash_unlink(buf);
-	remove_inode_queue(buf);
 	write_unlock(&hash_table_lock);
+	remove_inode_queue(buf);
 	__remove_from_lru_list(buf, buf->b_list);
 	spin_unlock(&lru_list_lock);
 	put_last_free(buf);
@@ -1364,7 +1432,7 @@
  * we have truncated the file and are going to free the
  * blocks on-disk..
  */
-int block_flushpage(struct page *page, unsigned long offset)
+int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
 {
 	struct buffer_head *head, *bh, *next;
 	unsigned int curr_off = 0;
@@ -1401,7 +1469,8 @@
 	 */
 	if (!offset) {
 		if (!try_to_free_buffers(page, 0)) {
-			atomic_inc(&buffermem_pages);
+			if (drop_pagecache)
+				atomic_inc(&buffermem_pages);
 			return 0;
 		}
 	}
@@ -1631,7 +1700,7 @@
 			set_bit(BH_Uptodate, &bh->b_state);
 			if (!atomic_set_buffer_dirty(bh)) {
 				__mark_dirty(bh);
-				buffer_insert_inode_queue(bh, inode);
+				buffer_insert_inode_data_queue(bh, inode);
 				need_balance_dirty = 1;
 			}
 		}
@@ -1960,6 +2029,47 @@
 	return tmp.b_blocknr;
 }
 
+int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
+{	/* Map each block of iobuf through get_block, then run the whole transfer with one brw_kiovec(). */
+	int i, nr_blocks, retval;
+	unsigned long * blocks = iobuf->blocks;
+
+	nr_blocks = iobuf->length / blocksize;
+	/* build the blocklist */
+	for (i = 0; i < nr_blocks; i++, blocknr++) {
+		struct buffer_head bh;	/* scratch: only receives the mapping from get_block */
+
+		bh.b_state = 0;
+		bh.b_dev = inode->i_dev;
+		bh.b_size = blocksize;
+
+		retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1);	/* last arg is 1 only for writes (presumably "allocate" - confirm) */
+		if (retval)
+			goto out;
+
+		if (rw == READ) {
+			if (buffer_new(&bh))
+				BUG();
+			if (!buffer_mapped(&bh)) {
+				/* there was a hole in the file */
+				blocks[i] = -1UL;	/* sentinel: brw_kiovec zero-fills such blocks on READ */
+				continue;
+			}
+		} else {
+			if (buffer_new(&bh))
+				unmap_underlying_metadata(&bh);
+			if (!buffer_mapped(&bh))
+				BUG();
+		}
+		blocks[i] = bh.b_blocknr;
+	}
+
+	retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);	/* its result is returned unchanged */
+
+ out:
+	return retval;
+}
+
 /*
  * IO completion routine for a buffer_head being used for kiobuf IO: we
  * can't dispatch the kiobuf callback until io_count reaches 0.  
@@ -2075,6 +2185,18 @@
 			
 			while (length > 0) {
 				blocknr = b[bufind++];
+				if (blocknr == -1UL) {
+					if (rw == READ) {
+						/* there was a hole in the file */
+						memset(kmap(map) + offset, 0, size);
+						flush_dcache_page(map);
+						kunmap(map);
+
+						transferred += size;
+						goto skip_block;
+					} else
+						BUG();
+				}
 				tmp = bhs[bhind++];
 
 				tmp->b_dev = B_FREE;
@@ -2093,9 +2215,6 @@
 				} else
 					set_bit(BH_Uptodate, &tmp->b_state);
 
-				length -= size;
-				offset += size;
-
 				atomic_inc(&iobuf->io_count);
 
 				submit_bh(rw, tmp);
@@ -2111,7 +2230,11 @@
 						goto finished;
 					bhind = 0;
 				}
-				
+
+			skip_block:
+				length -= size;
+				offset += size;
+
 				if (offset >= PAGE_SIZE) {
 					offset = 0;
 					break;
diff -urN rawio/fs/ext2/fsync.c o_direct/fs/ext2/fsync.c
--- rawio/fs/ext2/fsync.c	Thu Dec 14 22:34:11 2000
+++ o_direct/fs/ext2/fsync.c	Tue Apr 24 07:40:09 2001
@@ -44,6 +44,7 @@
 	int err;
 	
 	err  = fsync_inode_buffers(inode);
+	err |= fsync_inode_data_buffers(inode);
 	if (!(inode->i_state & I_DIRTY))
 		return err;
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
diff -urN rawio/fs/ext2/inode.c o_direct/fs/ext2/inode.c
--- rawio/fs/ext2/inode.c	Sat Apr 21 20:04:20 2001
+++ o_direct/fs/ext2/inode.c	Tue Apr 24 07:40:09 2001
@@ -666,13 +666,18 @@
 {
 	return generic_block_bmap(mapping,block,ext2_get_block);
 }
+static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
+{	/* ext2's direct_IO hook: hand off to generic_direct_IO with ext2_get_block as the block mapper */
+	return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block);
+}
 struct address_space_operations ext2_aops = {
 	readpage: ext2_readpage,
 	writepage: ext2_writepage,
 	sync_page: block_sync_page,
 	prepare_write: ext2_prepare_write,
 	commit_write: generic_commit_write,
-	bmap: ext2_bmap
+	bmap: ext2_bmap,
+	direct_IO: ext2_direct_IO,
 };
 
 /*
diff -urN rawio/fs/fcntl.c o_direct/fs/fcntl.c
--- rawio/fs/fcntl.c	Tue Nov 28 18:40:01 2000
+++ o_direct/fs/fcntl.c	Tue Apr 24 07:47:36 2001
@@ -194,7 +194,7 @@
 	return ret;
 }
 
-#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC)
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT)
 
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
@@ -215,6 +215,25 @@
 			if (error < 0)
 				return error;
 		}
+	}
+
+	if (arg & O_DIRECT) {
+		/*
+		 * alloc_kiovec() can sleep and we are only serialized by
+		 * the big kernel lock here, so abuse the i_sem to serialize
+		 * this case too. Strictly we would not need to go down to
+		 * the inode layer; a per-file lock would suffice. But we do
+		 * not want to pay for a semaphore in every file structure,
+		 * so we reuse the inode semaphore that we are paying for
+		 * anyway.
+		 */
+		error = 0;
+		down(&inode->i_sem);
+		if (!filp->f_iobuf)
+			error = alloc_kiovec(1, &filp->f_iobuf);
+		up(&inode->i_sem);
+		if (error < 0)
+			return error;
 	}
 
 	/* required for strict SunOS emulation */
diff -urN rawio/fs/file_table.c o_direct/fs/file_table.c
--- rawio/fs/file_table.c	Sat Apr 21 20:04:20 2001
+++ o_direct/fs/file_table.c	Tue Apr 24 07:40:09 2001
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/smp_lock.h>
+#include <linux/iobuf.h>
 
 /* sysctl tunables... */
 struct files_stat_struct files_stat = {0, 0, NR_FILE};
@@ -104,6 +105,10 @@
 
 	if (atomic_dec_and_test(&file->f_count)) {
 		locks_remove_flock(file);
+
+		if (file->f_iobuf)
+			free_kiovec(1, &file->f_iobuf);
+
 		if (file->f_op && file->f_op->release)
 			file->f_op->release(inode, file);
 		fops_put(file->f_op);
diff -urN rawio/fs/inode.c o_direct/fs/inode.c
--- rawio/fs/inode.c	Sat Apr 21 20:04:20 2001
+++ o_direct/fs/inode.c	Tue Apr 24 07:40:09 2001
@@ -77,7 +77,7 @@
 	 ((struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL))
 static void destroy_inode(struct inode *inode) 
 {
-	if (!list_empty(&inode->i_dirty_buffers))
+	if (inode_has_buffers(inode))
 		BUG();
 	kmem_cache_free(inode_cachep, (inode));
 }
@@ -103,6 +103,7 @@
 		INIT_LIST_HEAD(&inode->i_data.locked_pages);
 		INIT_LIST_HEAD(&inode->i_dentry);
 		INIT_LIST_HEAD(&inode->i_dirty_buffers);
+		INIT_LIST_HEAD(&inode->i_dirty_data_buffers);
 		sema_init(&inode->i_sem, 1);
 		sema_init(&inode->i_zombie, 1);
 		spin_lock_init(&inode->i_data.i_shared_lock);
@@ -342,6 +343,8 @@
 		while (inode->i_state & I_DIRTY)
 			sync_one(inode, sync);
 		spin_unlock(&inode_lock);
+		if (sync)
+			wait_on_inode(inode);
 	}
 	else
 		printk("write_inode_now: no super block\n");
@@ -356,9 +359,9 @@
  * O_SYNC flag set, to flush dirty writes to disk.  
  */
 
-int generic_osync_inode(struct inode *inode, int datasync)
+int generic_osync_inode(struct inode *inode, int what)
 {
-	int err;
+	int err = 0, err2 = 0, need_write_inode_now = 0;
 	
 	/* 
 	 * WARNING
@@ -381,23 +384,24 @@
 	 * every O_SYNC write, not just the synchronous I/Os.  --sct
 	 */
 
-#ifdef WRITERS_QUEUE_IO
-	err = osync_inode_buffers(inode);
-#else
-	err = fsync_inode_buffers(inode);
-#endif
+	if (what & OSYNC_METADATA)
+		err = fsync_inode_buffers(inode);
+	if (what & OSYNC_DATA)
+		err2 = fsync_inode_data_buffers(inode);
+	if (!err)
+		err = err2;
 
 	spin_lock(&inode_lock);
-	if (!(inode->i_state & I_DIRTY))
-		goto out;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		goto out;
+	if ((inode->i_state & I_DIRTY) &&
+	    ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
+		need_write_inode_now = 1;
 	spin_unlock(&inode_lock);
-	write_inode_now(inode, 1);
-	return err;
 
- out:
-	spin_unlock(&inode_lock);
+	if (need_write_inode_now)
+		write_inode_now(inode, 1);
+	else
+		wait_on_inode(inode);
+
 	return err;
 }
 
@@ -412,8 +416,7 @@
  
 void clear_inode(struct inode *inode)
 {
-	if (!list_empty(&inode->i_dirty_buffers))
-		invalidate_inode_buffers(inode);
+	invalidate_inode_buffers(inode);
        
 	if (inode->i_data.nrpages)
 		BUG();
diff -urN rawio/fs/open.c o_direct/fs/open.c
--- rawio/fs/open.c	Thu Feb 22 03:45:10 2001
+++ o_direct/fs/open.c	Tue Apr 24 07:40:09 2001
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/tty.h>
+#include <linux/iobuf.h>
 
 #include <asm/uaccess.h>
 
@@ -662,6 +663,15 @@
 			goto cleanup_all;
 	}
 	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+
+	/* preallocate kiobuf for O_DIRECT */
+	f->f_iobuf = NULL;
+	f->f_iobuf_lock = 0;
+	if (f->f_flags & O_DIRECT) {
+		error = alloc_kiovec(1, &f->f_iobuf);
+		if (error)
+			goto cleanup_all;
+	}
 
 	return f;
 
diff -urN rawio/fs/reiserfs/file.c o_direct/fs/reiserfs/file.c
--- rawio/fs/reiserfs/file.c	Sat Feb 10 02:34:12 2001
+++ o_direct/fs/reiserfs/file.c	Tue Apr 24 07:40:09 2001
@@ -84,7 +84,7 @@
 			      ) {
   struct inode * p_s_inode = p_s_dentry->d_inode;
   struct reiserfs_transaction_handle th ;
-  int n_err = 0;
+  int n_err;
   int windex ;
   int jbegin_count = 1 ;
 
@@ -94,6 +94,7 @@
       BUG ();
 
   n_err = fsync_inode_buffers(p_s_inode) ;
+  n_err |= fsync_inode_data_buffers(p_s_inode);
   /* commit the current transaction to flush any metadata
   ** changes.  sys_fsync takes care of flushing the dirty pages for us
   */
diff -urN rawio/include/asm-i386/fcntl.h o_direct/include/asm-i386/fcntl.h
--- rawio/include/asm-i386/fcntl.h	Thu Nov 16 15:37:33 2000
+++ o_direct/include/asm-i386/fcntl.h	Tue Apr 24 07:40:09 2001
@@ -16,7 +16,7 @@
 #define O_NDELAY	O_NONBLOCK
 #define O_SYNC		 010000
 #define FASYNC		 020000	/* fcntl, for BSD compatibility */
-#define O_DIRECT	 040000	/* direct disk access hint - currently ignored */
+#define O_DIRECT	 040000	/* direct disk access hint */
 #define O_LARGEFILE	0100000
 #define O_DIRECTORY	0200000	/* must be a directory */
 #define O_NOFOLLOW	0400000 /* don't follow links */
diff -urN rawio/include/linux/fs.h o_direct/include/linux/fs.h
--- rawio/include/linux/fs.h	Tue Apr 24 06:15:35 2001
+++ o_direct/include/linux/fs.h	Tue Apr 24 07:40:09 2001
@@ -360,6 +360,7 @@
  */
 struct page;
 struct address_space;
+struct kiobuf;
 
 struct address_space_operations {
 	int (*writepage)(struct page *);
@@ -369,6 +370,7 @@
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	int (*bmap)(struct address_space *, long);
+	int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
 };
 
 struct address_space {
@@ -400,6 +402,7 @@
 	struct list_head	i_dentry;
 	
 	struct list_head	i_dirty_buffers;
+	struct list_head	i_dirty_data_buffers;
 
 	unsigned long		i_ino;
 	atomic_t		i_count;
@@ -495,6 +498,10 @@
 
 	/* needed for tty driver, and maybe others */
 	void			*private_data;
+
+	/* preallocated helper kiobuf to speedup O_DIRECT */
+	struct kiobuf		*f_iobuf;
+	long			f_iobuf_lock;
 };
 extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
@@ -1091,6 +1098,7 @@
 extern int check_disk_change(kdev_t);
 extern int invalidate_inodes(struct super_block *);
 extern void invalidate_inode_pages(struct inode *);
+extern void invalidate_inode_pages2(struct address_space *);
 extern void invalidate_inode_buffers(struct inode *);
 #define invalidate_buffers(dev)	__invalidate_buffers((dev), 0)
 #define destroy_buffers(dev)	__invalidate_buffers((dev), 1)
@@ -1099,8 +1107,10 @@
 extern void write_inode_now(struct inode *, int);
 extern void sync_dev(kdev_t);
 extern int fsync_dev(kdev_t);
-extern int fsync_inode_buffers(struct inode *);
 extern int osync_inode_buffers(struct inode *);
+extern int osync_inode_data_buffers(struct inode *);
+extern int fsync_inode_buffers(struct inode *);
+extern int fsync_inode_data_buffers(struct inode *);
 extern int inode_has_buffers(struct inode *);
 extern void filemap_fdatasync(struct address_space *);
 extern void filemap_fdatawait(struct address_space *);
@@ -1260,7 +1270,9 @@
 typedef int (get_block_t)(struct inode*,long,struct buffer_head*,int);
 
 /* Generic buffer handling for block filesystems.. */
-extern int block_flushpage(struct page *, unsigned long);
+extern int discard_bh_page(struct page *, unsigned long, int);
+#define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
+#define block_invalidate_page(page) discard_bh_page(page, 0, 0)
 extern int block_symlink(struct inode *, const char *, int);
 extern int block_write_full_page(struct page*, get_block_t*);
 extern int block_read_full_page(struct page*, get_block_t*);
@@ -1272,6 +1284,7 @@
 int generic_block_bmap(struct address_space *, long, get_block_t *);
 int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
+extern int generic_direct_IO(int, struct inode *, struct kiobuf *, unsigned long, int, get_block_t *);
 
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
@@ -1319,6 +1332,9 @@
 extern int file_fsync(struct file *, struct dentry *, int);
 extern int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx);
 extern int generic_osync_inode(struct inode *, int);
+#define OSYNC_METADATA (1<<0)
+#define OSYNC_DATA (1<<1)
+#define OSYNC_INODE (1<<2)
 
 extern int inode_change_ok(struct inode *, struct iattr *);
 extern void inode_setattr(struct inode *, struct iattr *);
diff -urN rawio/kernel/ksyms.c o_direct/kernel/ksyms.c
--- rawio/kernel/ksyms.c	Tue Apr 24 07:37:00 2001
+++ o_direct/kernel/ksyms.c	Tue Apr 24 07:58:26 2001
@@ -205,6 +205,7 @@
 EXPORT_SYMBOL(generic_file_read);
 EXPORT_SYMBOL(do_generic_file_read);
 EXPORT_SYMBOL(generic_file_write);
+EXPORT_SYMBOL(generic_direct_IO);
 EXPORT_SYMBOL(generic_file_mmap);
 EXPORT_SYMBOL(generic_ro_fops);
 EXPORT_SYMBOL(generic_buffer_fdatasync);
@@ -480,6 +481,7 @@
 EXPORT_SYMBOL(__wait_on_super);
 EXPORT_SYMBOL(file_fsync);
 EXPORT_SYMBOL(fsync_inode_buffers);
+EXPORT_SYMBOL(fsync_inode_data_buffers);
 EXPORT_SYMBOL(clear_inode);
 EXPORT_SYMBOL(nr_async_pages);
 EXPORT_SYMBOL(___strtok);
diff -urN rawio/mm/filemap.c o_direct/mm/filemap.c
--- rawio/mm/filemap.c	Sat Apr 21 20:04:24 2001
+++ o_direct/mm/filemap.c	Tue Apr 24 08:16:28 2001
@@ -21,6 +21,7 @@
 #include <linux/swapctl.h>
 #include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/iobuf.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -199,7 +200,7 @@
 
 }
 
-static inline void truncate_complete_page(struct page *page)
+static void truncate_complete_page(struct page *page)
 {
 	/* Leave it on the LRU if it gets converted into anonymous buffers */
 	if (!page->buffers || block_flushpage(page, 0))
@@ -234,15 +235,14 @@
 
 		/* Is one of the pages to truncate? */
 		if ((offset >= start) || (*partial && (offset + 1) == start)) {
-			if (TryLockPage(page)) {
-				page_cache_get(page);
-				spin_unlock(&pagecache_lock);
-				wait_on_page(page);
-				page_cache_release(page);
-				return 1;
-			}
+			int failed;
+			failed = TryLockPage(page);
 			page_cache_get(page);
 			spin_unlock(&pagecache_lock);
+			if (failed) {
+				wait_on_page(page);
+				goto again;
+			}
 
 			if (*partial && (offset + 1) == start) {
 				truncate_partial_page(page, *partial);
@@ -251,7 +251,12 @@
 				truncate_complete_page(page);
 
 			UnlockPage(page);
+		again:
 			page_cache_release(page);
+			if (current->need_resched) {
+				__set_current_state(TASK_RUNNING);
+				schedule();
+			}
 			return 1;
 		}
 	}
@@ -284,6 +289,82 @@
 	spin_unlock(&pagecache_lock);
 }
 
+static inline int invalidate_this_page2(struct page * page)
+{	/* caller holds pagecache_lock and the page lock (see invalidate_list_pages2) */
+	int loop = 0;	/* 1 = pagecache_lock was not dropped here */
+
+	if (page_count(page) == 1 + !!page->buffers) {	/* NOTE(review): presumably "no users besides the cache (and its buffers)" - confirm */
+		page_cache_get(page);
+		spin_unlock(&pagecache_lock);
+		truncate_complete_page(page);
+	} else {
+		if (page->buffers) {
+			page_cache_get(page);
+			spin_unlock(&pagecache_lock);
+			block_invalidate_page(page);
+		} else
+			loop = 1;
+		/* The page is not removed on this path, so clear its dirty and uptodate bits. */
+		ClearPageDirty(page);
+		ClearPageUptodate(page);
+	}
+
+	return loop;
+}
+
+static int FASTCALL(invalidate_list_pages2(struct list_head *));
+static int invalidate_list_pages2(struct list_head *head)
+{	/* entered with pagecache_lock held; the two return statements describe the exit state */
+	struct list_head *curr;
+	struct page * page;
+
+	curr = head->next;
+	while (curr != head) {
+		int loop;
+
+		page = list_entry(curr, struct page, list);
+		curr = curr->next;
+		/* Page lock not obtained: drop the spinlock and wait on the page. */
+		if (TryLockPage(page)) {
+			page_cache_get(page);
+			spin_unlock(&pagecache_lock);
+			wait_on_page(page);
+			goto again;
+		}
+
+		loop = invalidate_this_page2(page);
+		UnlockPage(page);
+		if (loop)	/* spinlock never dropped: go on to the next page */
+			continue;
+	again:
+		page_cache_release(page);	/* pairs with the page_cache_get() taken on both paths reaching here */
+		if (current->need_resched) {
+			__set_current_state(TASK_RUNNING);
+			schedule();
+		}
+		return 1;	/* pagecache_lock was dropped: caller must retake it and rescan */
+	}
+	return 0;	/* list fully walked, pagecache_lock still held */
+}
+
+/**
+ * invalidate_inode_pages2 - invalidate every page of a mapping; pages that
+ * are not freed get their dirty and uptodate bits cleared instead.
+ * @mapping: the address_space whose pages we want to invalidate
+ */
+void invalidate_inode_pages2(struct address_space * mapping)
+{
+repeat:					/* a helper returning 1 has dropped pagecache_lock */
+	spin_lock(&pagecache_lock);
+	if (invalidate_list_pages2(&mapping->clean_pages))
+		goto repeat;
+	if (invalidate_list_pages2(&mapping->dirty_pages))
+		goto repeat;
+	if (invalidate_list_pages2(&mapping->locked_pages))
+		goto repeat;
+	spin_unlock(&pagecache_lock);
+}
+
 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 {
 	goto inside;
@@ -1208,6 +1289,87 @@
 	UPDATE_ATIME(inode);
 }
 
+/*
+ * O_DIRECT transfer (@rw is READ or WRITE) of @count bytes at file @offset to or
+ * from user buffer @buf.  Returns bytes transferred, else 0 or a negative errno.
+ */
+static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
+{
+	ssize_t retval, progress;
+	int new_iobuf = 0, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize;
+	struct kiobuf * iobuf = filp->f_iobuf;
+	struct inode * inode = filp->f_dentry->d_inode;
+	struct address_space * mapping = inode->i_mapping;
+
+	if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
+		/*
+		 * A parallel read/write is using the preallocated iobuf
+		 * so just run slow and allocate a new one.
+		 */
+		retval = alloc_kiovec(1, &iobuf);
+		if (retval)
+			goto out;	/* nothing is held yet: the lock bit belongs to the other user */
+		new_iobuf = 1;
+	}
+
+	blocksize = inode->i_sb->s_blocksize;
+	blocksize_mask = blocksize - 1;
+	blocksize_bits = inode->i_sb->s_blocksize_bits;
+	chunk_size = KIO_MAX_ATOMIC_IO << 10;
+
+	retval = -EINVAL;	/* offset and count must be multiples of the fs block size */
+	if ((offset & blocksize_mask) || (count & blocksize_mask))
+		goto out_free;
+	if (!mapping->a_ops->direct_IO)
+		goto out_free;
+
+	/*
+	 * Flush to disk exclusively the _data_; metadata must remain
+	 * completely asynchronous or performance will go to /dev/null.
+	 */
+	filemap_fdatasync(mapping);
+	retval = fsync_inode_data_buffers(inode);
+	filemap_fdatawait(mapping);
+	if (retval < 0)
+		goto out_free;	/* must still release the lock bit or private kiovec taken above */
+
+	progress = retval = 0;
+	while (count > 0) {
+		iosize = chunk_size;	/* clamp before narrowing to int so a huge count cannot wrap */
+		if (count < iosize)
+			iosize = count;
+
+		retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
+		if (retval)
+			break;
+
+		retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);
+		if (rw == READ && retval > 0)
+			mark_dirty_kiobuf(iobuf, retval);
+		if (retval >= 0) {
+			count -= retval;
+			buf += retval;
+			progress += retval;
+		}
+
+		unmap_kiobuf(iobuf);
+
+		if (retval != iosize)	/* error or short transfer: stop here */
+			break;
+	}
+
+	if (progress)	/* partial success wins over a later error */
+		retval = progress;
+
+ out_free:
+	if (!new_iobuf)
+		clear_bit(0, &filp->f_iobuf_lock);
+	else
+		free_kiovec(1, &iobuf);
+ out:	
+	return retval;
+}
+
 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
 {
 	char *kaddr;
@@ -1238,6 +1400,9 @@
 {
 	ssize_t retval;
 
+	if (filp->f_flags & O_DIRECT)
+		goto o_direct;
+
 	retval = -EFAULT;
 	if (access_ok(VERIFY_WRITE, buf, count)) {
 		retval = 0;
@@ -1256,7 +1421,22 @@
 				retval = desc.error;
 		}
 	}
+ out:
 	return retval;
+
+ o_direct:
+	{
+		loff_t pos = *ppos, left = filp->f_dentry->d_inode->i_size - pos;
+
+		/* At or past EOF read nothing; otherwise clamp the request to EOF. */
+		retval = 0;
+		if (left > 0)
+			retval = generic_file_direct_IO(READ, filp, buf, count > left ? left : count, pos);
+		if (retval > 0)
+			*ppos = pos + retval;
+		UPDATE_ATIME(filp->f_dentry->d_inode);
+		goto out;
+	}
 }
 
 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
@@ -2446,7 +2626,7 @@
  *							okir@monad.swb.de
  */
 ssize_t
-generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
+generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
 {
 	struct inode	*inode = file->f_dentry->d_inode; 
 	struct address_space *mapping = inode->i_mapping;
@@ -2541,6 +2721,9 @@
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	mark_inode_dirty_sync(inode);
 
+	if (file->f_flags & O_DIRECT)
+		goto o_direct;
+
 	while (count) {
 		unsigned long index, offset;
 		char *kaddr;
@@ -2615,7 +2798,7 @@
 	/* For now, when the user asks for O_SYNC, we'll actually
 	 * provide O_DSYNC. */
 	if ((status >= 0) && (file->f_flags & O_SYNC))
-		status = generic_osync_inode(inode, 1); /* 1 means datasync */
+		status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
 	
 	err = written ? written : status;
 out:
@@ -2627,6 +2810,25 @@
 	ClearPageUptodate(page);
 	kunmap(page);
 	goto unlock;
+
+o_direct:
+	err = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
+	if (err > 0) {	/* err is the number of bytes written */
+		loff_t end = pos + err;
+		if (end > inode->i_size) {
+			inode->i_size = end;
+			mark_inode_dirty(inode);
+		}
+		*ppos = end;
+		invalidate_inode_pages2(mapping);
+		/* Metadata-only sync (data went to disk by direct DMA); err keeps the byte count unless the sync fails. */
+		if (file->f_flags & O_SYNC) {
+			int sync_err = generic_osync_inode(inode, OSYNC_METADATA);
+			if (sync_err < 0)
+				err = sync_err;
+		}
+	}
+	goto out;
 }
 
 void __init page_cache_init(unsigned long mempages)