diff -urN 2.4.0-test9-pre5/arch/alpha/Makefile z/arch/alpha/Makefile
--- 2.4.0-test9-pre5/arch/alpha/Makefile	Tue Sep 12 02:32:45 2000
+++ z/arch/alpha/Makefile	Fri Sep 22 17:47:44 2000
@@ -119,6 +119,10 @@
 
 archdep:
 	@$(MAKEBOOT) dep
+
+vmlinux: arch/alpha/vmlinux.lds
+
+arch/alpha/vmlinux.lds: arch/alpha/vmlinux.lds.in
 	$(CPP) $(CPPFLAGS) -xc -P arch/alpha/vmlinux.lds.in -o arch/alpha/vmlinux.lds
 
 bootpfile:
diff -urN 2.4.0-test9-pre5/arch/alpha/kernel/pci_iommu.c z/arch/alpha/kernel/pci_iommu.c
--- 2.4.0-test9-pre5/arch/alpha/kernel/pci_iommu.c	Sat Jun 24 16:02:27 2000
+++ z/arch/alpha/kernel/pci_iommu.c	Fri Sep 22 17:47:44 2000
@@ -416,7 +416,9 @@
 	ptes = &arena->ptes[dma_ofs];
 	sg = leader;
 	do {
+#if DEBUG_ALLOC > 0
 		struct scatterlist *last_sg = sg;
+#endif
 
 		size = sg->length;
 		paddr = virt_to_phys(sg->address);
diff -urN 2.4.0-test9-pre5/arch/alpha/kernel/smp.c z/arch/alpha/kernel/smp.c
--- 2.4.0-test9-pre5/arch/alpha/kernel/smp.c	Tue Sep 12 02:32:46 2000
+++ z/arch/alpha/kernel/smp.c	Fri Sep 22 17:47:44 2000
@@ -1046,8 +1046,8 @@
 	"	blbs	%0,2b\n"
 	"	br	1b\n"
 	".previous"
-	: "=r" (tmp), "=m" (__dummy_lock(lock)), "=r" (stuck)
-	: "1" (__dummy_lock(lock)), "2" (stuck));
+	: "=r" (tmp), "=m" (lock->lock), "=r" (stuck)
+	: "1" (lock->lock), "2" (stuck) : "memory");
 
 	if (stuck < 0) {
 		printk(KERN_WARNING
@@ -1124,9 +1124,9 @@
 	"	blt	%1,8b\n"
 	"	br	1b\n"
 	".previous"
-	: "=m" (__dummy_lock(lock)), "=&r" (regx), "=&r" (regy),
+	: "=m" (*(volatile int *)lock), "=&r" (regx), "=&r" (regy),
 	  "=&r" (stuck_lock), "=&r" (stuck_reader)
-	: "0" (__dummy_lock(lock)), "3" (stuck_lock), "4" (stuck_reader));
+	: "0" (*(volatile int *)lock), "3" (stuck_lock), "4" (stuck_reader) : "memory");
 
 	if (stuck_lock < 0) {
 		printk(KERN_WARNING "write_lock stuck at %p\n", inline_pc);
@@ -1163,8 +1163,8 @@
 	"	blbs	%1,6b;"
 	"	br	1b\n"
 	".previous"
-	: "=m" (__dummy_lock(lock)), "=&r" (regx), "=&r" (stuck_lock)
-	: "0" (__dummy_lock(lock)), "2" (stuck_lock));
+	: "=m" (*(volatile int *)lock), "=&r" (regx), "=&r" (stuck_lock)
+	: "0" (*(volatile int *)lock), "2" (stuck_lock) : "memory");
 
 	if (stuck_lock < 0) {
 		printk(KERN_WARNING "read_lock stuck at %p\n", inline_pc);
diff -urN 2.4.0-test9-pre5/arch/alpha/mm/extable.c z/arch/alpha/mm/extable.c
--- 2.4.0-test9-pre5/arch/alpha/mm/extable.c	Sat Jun 24 16:02:27 2000
+++ z/arch/alpha/mm/extable.c	Fri Sep 22 17:47:44 2000
@@ -88,7 +88,7 @@
 	 */
 	ret = search_exception_table_without_gp(addr);
 	if (ret) {
-		printk(KERN_ALERT, "%s: [%lx] EX_TABLE search fail with"
+		printk(KERN_ALERT "%s: [%lx] EX_TABLE search fail with "
 		       "exc frame GP, success with raw GP\n",
 		       current->comm, addr);
 		return ret;
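
KERN_ALERT expands to the string literal "<1>", which the preprocessor pastes
onto the adjacent format string; it is not a separate printk() argument, hence
the removal of the comma above.  A minimal usage sketch (message and variable
are made up):

	/* correct: the log-level macro concatenates with the format string */
	printk(KERN_ALERT "exception table lookup failed at %lx\n", addr);
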
diff -urN 2.4.0-test9-pre5/include/asm-alpha/atomic.h z/include/asm-alpha/atomic.h
--- 2.4.0-test9-pre5/include/asm-alpha/atomic.h	Sun Sep  3 23:48:31 2000
+++ z/include/asm-alpha/atomic.h	Fri Sep 22 17:47:44 2000
@@ -11,11 +11,13 @@
  * than regular operations.
  */
 
-#ifdef CONFIG_SMP
+
+/*
+ * Counter is volatile to make sure gcc doesn't try to be clever
+ * and move things around on us. We need to use _exactly_ the address
+ * the user gave us, not some alias that contains the same information.
+ */
 typedef struct { volatile int counter; } atomic_t;
-#else
-typedef struct { int counter; } atomic_t;
-#endif
 
 #define ATOMIC_INIT(i)	( (atomic_t) { (i) } )
 
@@ -23,19 +25,12 @@
 #define atomic_set(v,i)		((v)->counter = (i))
 
 /*
- * Make sure gcc doesn't try to be clever and move things around
- * on us. We need to use _exactly_ the address the user gave us,
- * not some alias that contains the same information.
- */
-#define __atomic_fool_gcc(x) (*(struct { int a[100]; } *)x)
-
-/*
  * To get proper branch prediction for the main line, we must branch
  * forward to code at the end of this object's .text section, then
  * branch back to restart the operation.
  */
 
-extern __inline__ void atomic_add(int i, atomic_t * v)
+static __inline__ void atomic_add(int i, atomic_t * v)
 {
 	unsigned long temp;
 	__asm__ __volatile__(
@@ -46,11 +41,11 @@
 	".subsection 2\n"
 	"2:	br 1b\n"
 	".previous"
-	:"=&r" (temp), "=m" (__atomic_fool_gcc(v))
-	:"Ir" (i), "m" (__atomic_fool_gcc(v)));
+	:"=&r" (temp), "=m" (v->counter)
+	:"Ir" (i), "m" (v->counter));
 }
 
-extern __inline__ void atomic_sub(int i, atomic_t * v)
+static __inline__ void atomic_sub(int i, atomic_t * v)
 {
 	unsigned long temp;
 	__asm__ __volatile__(
@@ -61,14 +56,14 @@
 	".subsection 2\n"
 	"2:	br 1b\n"
 	".previous"
-	:"=&r" (temp), "=m" (__atomic_fool_gcc(v))
-	:"Ir" (i), "m" (__atomic_fool_gcc(v)));
+	:"=&r" (temp), "=m" (v->counter)
+	:"Ir" (i), "m" (v->counter));
 }
 
 /*
  * Same as above, but return the result value
  */
-extern __inline__ long atomic_add_return(int i, atomic_t * v)
+static __inline__ long atomic_add_return(int i, atomic_t * v)
 {
 	long temp, result;
 	__asm__ __volatile__(
@@ -81,12 +76,12 @@
 	".subsection 2\n"
 	"2:	br 1b\n"
 	".previous"
-	:"=&r" (temp), "=m" (__atomic_fool_gcc(v)), "=&r" (result)
-	:"Ir" (i), "m" (__atomic_fool_gcc(v)));
+	:"=&r" (temp), "=m" (v->counter), "=&r" (result)
+	:"Ir" (i), "m" (v->counter) : "memory");
 	return result;
 }
 
-extern __inline__ long atomic_sub_return(int i, atomic_t * v)
+static __inline__ long atomic_sub_return(int i, atomic_t * v)
 {
 	long temp, result;
 	__asm__ __volatile__(
@@ -99,8 +94,8 @@
 	".subsection 2\n"
 	"2:	br 1b\n"
 	".previous"
-	:"=&r" (temp), "=m" (__atomic_fool_gcc(v)), "=&r" (result)
-	:"Ir" (i), "m" (__atomic_fool_gcc(v)));
+	:"=&r" (temp), "=m" (v->counter), "=&r" (result)
+	:"Ir" (i), "m" (v->counter) : "memory");
 	return result;
 }
 
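
The value-returning atomics above also gain a "memory" clobber: they are
typically used as acquire/release points, so gcc must not keep previously
loaded values cached in registers across them.  A minimal sketch, not part of
the patch, of what the clobber changes (all names are made up):

	static int shared_flag;

	static void sync_without_clobber(void)
	{
		__asm__ __volatile__("");	/* gcc may keep shared_flag in a register */
	}

	static void sync_with_clobber(void)
	{
		__asm__ __volatile__("" : : : "memory");	/* memory must be re-read */
	}

	int barrier_demo(void)
	{
		int a, b;

		a = shared_flag;
		sync_without_clobber();
		b = shared_flag;	/* may legally reuse the value already in 'a' */

		a = shared_flag;
		sync_with_clobber();
		b = shared_flag;	/* must be reloaded from memory */

		return a + b;
	}
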
diff -urN 2.4.0-test9-pre5/include/asm-alpha/bitops.h z/include/asm-alpha/bitops.h
--- 2.4.0-test9-pre5/include/asm-alpha/bitops.h	Tue Sep 12 02:32:47 2000
+++ z/include/asm-alpha/bitops.h	Fri Sep 22 17:47:44 2000
@@ -1,6 +1,8 @@
 #ifndef _ALPHA_BITOPS_H
 #define _ALPHA_BITOPS_H
 
+#include <linux/config.h>
+
 /*
  * Copyright 1994, Linus Torvalds.
  */
@@ -17,14 +19,19 @@
  * bit 0 is the LSB of addr; bit 64 is the LSB of (addr+1).
  */
 
+#define BITOPS_NO_BRANCH
+
 extern __inline__ void set_bit(unsigned long nr, volatile void * addr)
 {
+#ifndef BITOPS_NO_BRANCH
 	unsigned long oldbit;
+#endif
 	unsigned long temp;
 	unsigned int * m = ((unsigned int *) addr) + (nr >> 5);
 
+#ifndef BITOPS_NO_BRANCH
 	__asm__ __volatile__(
-	"1:	ldl_l %0,%1\n"
+	"1:	ldl_l %0,%4\n"
 	"	and %0,%3,%2\n"
 	"	bne %2,2f\n"
 	"	xor %0,%3,%0\n"
@@ -36,16 +43,57 @@
 	".previous"
 	:"=&r" (temp), "=m" (*m), "=&r" (oldbit)
 	:"Ir" (1UL << (nr & 31)), "m" (*m));
+#else
+	__asm__ __volatile__(
+	"1:	ldl_l %0,%3\n"
+	"	bis %0,%2,%0\n"
+	"	stl_c %0,%1\n"
+	"	beq %0,2f\n"
+	".subsection 2\n"
+	"2:	br 1b\n"
+	".previous"
+	:"=&r" (temp), "=m" (*m)
+	:"Ir" (1UL << (nr & 31)), "m" (*m));
+#endif
 }
 
+/*
+ * WARNING: non-atomic version.
+ */
+extern __inline__ void __set_bit(unsigned long nr, volatile void * addr)
+{
+	unsigned int * m = ((unsigned int *) addr) + (nr >> 5);
+	/*
+	 * Asm and C produces the same thing so let
+	 * the compiler to do its good work.
+	 */
+#if 0
+	int tmp;
+
+	__asm__ __volatile__(
+	"ldl %0,%3\n\t"
+	"bis %0,%2,%0\n\t"
+	"stl %0,%1"
+	: "=&r" (tmp), "=m" (*m)
+	: "Ir" (1UL << (nr & 31)), "m" (*m));
+#else
+	*m |= 1UL << (nr & 31);
+#endif
+}
+
+#define smp_mb__before_clear_bit()	smp_mb()
+#define smp_mb__after_clear_bit()	smp_mb()
 extern __inline__ void clear_bit(unsigned long nr, volatile void * addr)
 {
+#ifndef BITOPS_NO_BRANCH
 	unsigned long oldbit;
+#endif
 	unsigned long temp;
 	unsigned int * m = ((unsigned int *) addr) + (nr >> 5);
 
+#ifndef BITOPS_NO_BRANCH
 	__asm__ __volatile__(
-	"1:	ldl_l %0,%1\n"
+	"1:	ldl_l %0,%4\n"
 	"	and %0,%3,%2\n"
 	"	beq %2,2f\n"
 	"	xor %0,%3,%0\n"
@@ -57,6 +105,18 @@
 	".previous"
 	:"=&r" (temp), "=m" (*m), "=&r" (oldbit)
 	:"Ir" (1UL << (nr & 31)), "m" (*m));
+#else
+	__asm__ __volatile__(
+	"1:	ldl_l %0,%3\n"
+	"	and %0,%2,%0\n"
+	"	stl_c %0,%1\n"
+	"	beq %0,2f\n"
+	".subsection 2\n"
+	"2:	br 1b\n"
+	".previous"
+	:"=&r" (temp), "=m" (*m)
+	:"Ir" (~(1UL << (nr & 31))), "m" (*m));
+#endif
 }
 
 extern __inline__ void change_bit(unsigned long nr, volatile void * addr)
@@ -65,12 +125,12 @@
 	unsigned int * m = ((unsigned int *) addr) + (nr >> 5);
 
 	__asm__ __volatile__(
-	"1:	ldl_l %0,%1\n"
+	"1:	ldl_l %0,%3\n"
 	"	xor %0,%2,%0\n"
 	"	stl_c %0,%1\n"
-	"	beq %0,3f\n"
+	"	beq %0,2f\n"
 	".subsection 2\n"
-	"3:	br 1b\n"
+	"2:	br 1b\n"
 	".previous"
 	:"=&r" (temp), "=m" (*m)
 	:"Ir" (1UL << (nr & 31)), "m" (*m));
@@ -84,18 +144,43 @@
 	unsigned int * m = ((unsigned int *) addr) + (nr >> 5);
 
 	__asm__ __volatile__(
-	"1:	ldl_l %0,%1\n"
+	"1:	ldl_l %0,%4\n"
 	"	and %0,%3,%2\n"
 	"	bne %2,2f\n"
 	"	xor %0,%3,%0\n"
 	"	stl_c %0,%1\n"
 	"	beq %0,3f\n"
+#ifdef CONFIG_SMP
 	"	mb\n"
+#endif
 	"2:\n"
 	".subsection 2\n"
 	"3:	br 1b\n"
 	".previous"
 	:"=&r" (temp), "=m" (*m), "=&r" (oldbit)
+	:"Ir" (1UL << (nr & 31)), "m" (*m) : "memory");
+
+	return oldbit != 0;
+}
+
+/*
+ * WARNING: non-atomic version.
+ */
+extern __inline__ int __test_and_set_bit(unsigned long nr,
+					 volatile void * addr)
+{
+	unsigned long oldbit;
+	unsigned long temp;
+	unsigned int * m = ((unsigned int *) addr) + (nr >> 5);
+
+	__asm__ __volatile__(
+	"	ldl %0,%4\n"
+	"	and %0,%3,%2\n"
+	"	bne %2,1f\n"
+	"	xor %0,%3,%0\n"
+	"	stl %0,%1\n"
+	"1:\n"
+	:"=&r" (temp), "=m" (*m), "=&r" (oldbit)
 	:"Ir" (1UL << (nr & 31)), "m" (*m));
 
 	return oldbit != 0;
@@ -109,18 +194,43 @@
 	unsigned int * m = ((unsigned int *) addr) + (nr >> 5);
 
 	__asm__ __volatile__(
-	"1:	ldl_l %0,%1\n"
+	"1:	ldl_l %0,%4\n"
 	"	and %0,%3,%2\n"
 	"	beq %2,2f\n"
 	"	xor %0,%3,%0\n"
 	"	stl_c %0,%1\n"
 	"	beq %0,3f\n"
+#ifdef CONFIG_SMP
 	"	mb\n"
+#endif
 	"2:\n"
 	".subsection 2\n"
 	"3:	br 1b\n"
 	".previous"
 	:"=&r" (temp), "=m" (*m), "=&r" (oldbit)
+	:"Ir" (1UL << (nr & 31)), "m" (*m) : "memory");
+
+	return oldbit != 0;
+}
+
+/*
+ * WARNING: non-atomic version.
+ */
+extern __inline__ int __test_and_clear_bit(unsigned long nr,
+					   volatile void * addr)
+{
+	unsigned long oldbit;
+	unsigned long temp;
+	unsigned int * m = ((unsigned int *) addr) + (nr >> 5);
+
+	__asm__ __volatile__(
+	"	ldl %0,%4\n"
+	"	and %0,%3,%2\n"
+	"	beq %2,1f\n"
+	"	xor %0,%3,%0\n"
+	"	stl %0,%1\n"
+	"1:\n"
+	:"=&r" (temp), "=m" (*m), "=&r" (oldbit)
 	:"Ir" (1UL << (nr & 31)), "m" (*m));
 
 	return oldbit != 0;
@@ -134,17 +244,19 @@
 	unsigned int * m = ((unsigned int *) addr) + (nr >> 5);
 
 	__asm__ __volatile__(
-	"1:	ldl_l %0,%1\n"
+	"1:	ldl_l %0,%4\n"
 	"	and %0,%3,%2\n"
 	"	xor %0,%3,%0\n"
 	"	stl_c %0,%1\n"
 	"	beq %0,3f\n"
+#ifdef CONFIG_SMP
 	"	mb\n"
+#endif
 	".subsection 2\n"
 	"3:	br 1b\n"
 	".previous"
 	:"=&r" (temp), "=m" (*m), "=&r" (oldbit)
-	:"Ir" (1UL << (nr & 31)), "m" (*m));
+	:"Ir" (1UL << (nr & 31)), "m" (*m) : "memory");
 
 	return oldbit != 0;
 }
@@ -279,16 +391,16 @@
 
 #ifdef __KERNEL__
 
-#define ext2_set_bit                 test_and_set_bit
-#define ext2_clear_bit               test_and_clear_bit
+#define ext2_set_bit                 __test_and_set_bit
+#define ext2_clear_bit               __test_and_clear_bit
 #define ext2_test_bit                test_bit
 #define ext2_find_first_zero_bit     find_first_zero_bit
 #define ext2_find_next_zero_bit      find_next_zero_bit
 
 /* Bitmap functions for the minix filesystem.  */
-#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr)
-#define minix_set_bit(nr,addr) set_bit(nr,addr)
-#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr)
+#define minix_test_and_set_bit(nr,addr) __test_and_set_bit(nr,addr)
+#define minix_set_bit(nr,addr) __set_bit(nr,addr)
+#define minix_test_and_clear_bit(nr,addr) __test_and_clear_bit(nr,addr)
 #define minix_test_bit(nr,addr) test_bit(nr,addr)
 #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size)
 
diff -urN 2.4.0-test9-pre5/include/asm-alpha/elf.h z/include/asm-alpha/elf.h
--- 2.4.0-test9-pre5/include/asm-alpha/elf.h	Thu Jul 20 21:31:11 2000
+++ z/include/asm-alpha/elf.h	Fri Sep 22 17:47:44 2000
@@ -127,7 +127,7 @@
 
 #ifdef __KERNEL__
 #define SET_PERSONALITY(EX, IBCS2)				\
-	set_personality((EX).e_flags & EF_ALPHA_32BIT		\
+	set_personality(((EX).e_flags & EF_ALPHA_32BIT)		\
 	   ? PER_LINUX_32BIT : (IBCS2) ? PER_SVR4 : PER_LINUX)
 #endif
 
diff -urN 2.4.0-test9-pre5/include/asm-alpha/semaphore-helper.h z/include/asm-alpha/semaphore-helper.h
--- 2.4.0-test9-pre5/include/asm-alpha/semaphore-helper.h	Sun Feb 27 06:19:44 2000
+++ z/include/asm-alpha/semaphore-helper.h	Fri Sep 22 17:47:44 2000
@@ -37,7 +37,7 @@
 		".subsection 2\n"
 		"3:	br	1b\n"
 		".previous"
-		: "=r"(ret), "=r"(tmp), "=m"(__atomic_fool_gcc(&sem->waking))
+		: "=r"(ret), "=r"(tmp), "=m"(sem->waking.counter)
 		: "0"(0));
 
 	return ret > 0;
diff -urN 2.4.0-test9-pre5/include/asm-alpha/spinlock.h z/include/asm-alpha/spinlock.h
--- 2.4.0-test9-pre5/include/asm-alpha/spinlock.h	Tue Aug 15 22:44:25 2000
+++ z/include/asm-alpha/spinlock.h	Fri Sep 22 17:47:44 2000
@@ -5,8 +5,8 @@
 #include <linux/kernel.h>
 #include <asm/current.h>
 
-#define DEBUG_SPINLOCK 1
-#define DEBUG_RWLOCK 1
+#define DEBUG_SPINLOCK 0
+#define DEBUG_RWLOCK 0
 
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
@@ -38,9 +38,6 @@
 #define spin_is_locked(x)	((x)->lock != 0)
 #define spin_unlock_wait(x)	({ do { barrier(); } while ((x)->lock); })
 
-typedef struct { unsigned long a[100]; } __dummy_lock_t;
-#define __dummy_lock(lock) (*(__dummy_lock_t *)(lock))
-
 #if DEBUG_SPINLOCK
 extern void spin_unlock(spinlock_t * lock);
 extern void debug_spin_lock(spinlock_t * lock, const char *, int);
@@ -83,8 +80,8 @@
 	"	blbs	%0,2b\n"
 	"	br	1b\n"
 	".previous"
-	: "=r" (tmp), "=m" (__dummy_lock(lock))
-	: "m"(__dummy_lock(lock)));
+	: "=r" (tmp), "=m" (lock->lock)
+	: "m"(lock->lock) : "memory");
 }
 
 #define spin_trylock(lock) (!test_and_set_bit(0,(lock)))
@@ -119,9 +116,8 @@
 	"	bne	%1,6b\n"
 	"	br	1b\n"
 	".previous"
-	: "=m" (__dummy_lock(lock)), "=&r" (regx)
-	: "0" (__dummy_lock(lock))
-	);
+	: "=m" (*(volatile int *)lock), "=&r" (regx)
+	: "0" (*(volatile int *)lock) : "memory");
 }
 
 static inline void read_lock(rwlock_t * lock)
@@ -140,9 +136,8 @@
 	"	blbs	%1,6b\n"
 	"	br	1b\n"
 	".previous"
-	: "=m" (__dummy_lock(lock)), "=&r" (regx)
-	: "m" (__dummy_lock(lock))
-	);
+	: "=m" (*(volatile int *)lock), "=&r" (regx)
+	: "m" (*(volatile int *)lock) : "memory");
 }
 #endif /* DEBUG_RWLOCK */
 
@@ -156,6 +151,7 @@
 {
 	long regx;
 	__asm__ __volatile__(
+	"	mb\n"
 	"1:	ldl_l	%1,%0\n"
 	"	addl	%1,2,%1\n"
 	"	stl_c	%1,%0\n"
@@ -163,8 +159,8 @@
 	".subsection 2\n"
 	"6:	br	1b\n"
 	".previous"
-	: "=m" (__dummy_lock(lock)), "=&r" (regx)
-	: "m" (__dummy_lock(lock)));
+	: "=m" (*(volatile int *)lock), "=&r" (regx)
+	: "m" (*(volatile int *)lock) : "memory");
 }
 
 #endif /* _ALPHA_SPINLOCK_H */
diff -urN 2.4.0-test9-pre5/include/asm-alpha/system.h z/include/asm-alpha/system.h
--- 2.4.0-test9-pre5/include/asm-alpha/system.h	Thu Aug 10 18:14:18 2000
+++ z/include/asm-alpha/system.h	Fri Sep 22 17:47:44 2000
@@ -137,12 +137,19 @@
 #define wmb() \
 __asm__ __volatile__("wmb": : :"memory")
 
+#ifdef __SMP__
+#define smp_mb()	mb()
+#define smp_rmb()	rmb()
+#define smp_wmb()	wmb()
+#else
+#define smp_mb()	barrier()
+#define smp_rmb()	barrier()
+#define smp_wmb()	barrier()
+#endif
+
 #define set_mb(var, value) \
 do { var = value; mb(); } while (0)
 
-#define set_rmb(var, value) \
-do { var = value; rmb(); } while (0)
-
 #define set_wmb(var, value) \
 do { var = value; wmb(); } while (0)
 
@@ -284,11 +291,11 @@
 #define getipl()		(rdps() & 7)
 #define setipl(ipl)		((void) swpipl(ipl))
 
-#define __cli()			setipl(IPL_MAX)
-#define __sti()			setipl(IPL_MIN)
+#define __cli()			do { setipl(IPL_MAX); barrier(); } while(0)
+#define __sti()			do { barrier(); setipl(IPL_MIN); } while(0)
 #define __save_flags(flags)	((flags) = rdps())
-#define __save_and_cli(flags)	((flags) = swpipl(IPL_MAX))
-#define __restore_flags(flags)	setipl(flags)
+#define __save_and_cli(flags)	do { (flags) = swpipl(IPL_MAX); barrier(); } while(0)
+#define __restore_flags(flags)	do { barrier(); setipl(flags); barrier(); } while(0)
 
 #define local_irq_save(flags)		__save_and_cli(flags)
 #define local_irq_restore(flags)	__restore_flags(flags)
@@ -344,6 +351,8 @@
 
 /*
  * Atomic exchange.
+ * Since it can be used to implement critical sections,
+ * it must clobber "memory" (also for interrupts in UP).
  */
 
 extern __inline__ unsigned long
@@ -352,16 +361,18 @@
 	unsigned long dummy;
 
 	__asm__ __volatile__(
-	"1:	ldl_l %0,%2\n"
+	"1:	ldl_l %0,%4\n"
 	"	bis $31,%3,%1\n"
 	"	stl_c %1,%2\n"
 	"	beq %1,2f\n"
+#ifdef CONFIG_SMP
 	"	mb\n"
+#endif
 	".subsection 2\n"
 	"2:	br 1b\n"
 	".previous"
 	: "=&r" (val), "=&r" (dummy), "=m" (*m)
-	: "rI" (val), "m" (*m));
+	: "rI" (val), "m" (*m) : "memory");
 
 	return val;
 }
@@ -372,16 +383,18 @@
 	unsigned long dummy;
 
 	__asm__ __volatile__(
-	"1:	ldq_l %0,%2\n"
+	"1:	ldq_l %0,%4\n"
 	"	bis $31,%3,%1\n"
 	"	stq_c %1,%2\n"
 	"	beq %1,2f\n"
+#ifdef CONFIG_SMP
 	"	mb\n"
+#endif
 	".subsection 2\n"
 	"2:	br 1b\n"
 	".previous"
 	: "=&r" (val), "=&r" (dummy), "=m" (*m)
-	: "rI" (val), "m" (*m));
+	: "rI" (val), "m" (*m) : "memory");
 
 	return val;
 }
@@ -416,6 +429,11 @@
  * Atomic compare and exchange.  Compare OLD with MEM, if identical,
  * store NEW in MEM.  Return the initial value in MEM.  Success is
  * indicated by comparing RETURN with OLD.
+ *
+ * The memory barrier should be placed in SMP only when we actually
+ * make the change. If we don't change anything (i.e. the returned
+ * prev is equal to old) then we aren't acquiring anything new and
+ * we don't need any memory barrier as far as I can tell.
  */
 
 #define __HAVE_ARCH_CMPXCHG 1
@@ -426,18 +444,21 @@
 	unsigned long prev, cmp;
 
 	__asm__ __volatile__(
-	"1:	ldl_l %0,%2\n"
+	"1:	ldl_l %0,%5\n"
 	"	cmpeq %0,%3,%1\n"
 	"	beq %1,2f\n"
 	"	mov %4,%1\n"
 	"	stl_c %1,%2\n"
 	"	beq %1,3f\n"
-	"2:	mb\n"
+#ifdef CONFIG_SMP
+	"	mb\n"
+#endif
+	"2:\n"
 	".subsection 2\n"
 	"3:	br 1b\n"
 	".previous"
 	: "=&r"(prev), "=&r"(cmp), "=m"(*m)
-	: "r"((long) old), "r"(new), "m"(*m));
+	: "r"((long) old), "r"(new), "m"(*m) : "memory");
 
 	return prev;
 }
@@ -448,18 +469,21 @@
 	unsigned long prev, cmp;
 
 	__asm__ __volatile__(
-	"1:	ldq_l %0,%2\n"
+	"1:	ldq_l %0,%5\n"
 	"	cmpeq %0,%3,%1\n"
 	"	beq %1,2f\n"
 	"	mov %4,%1\n"
 	"	stq_c %1,%2\n"
 	"	beq %1,3f\n"
-	"2:	mb\n"
+#ifdef CONFIG_SMP
+	"	mb\n"
+#endif
+	"2:\n"
 	".subsection 2\n"
 	"3:	br 1b\n"
 	".previous"
 	: "=&r"(prev), "=&r"(cmp), "=m"(*m)
-	: "r"((long) old), "r"(new), "m"(*m));
+	: "r"((long) old), "r"(new), "m"(*m) : "memory");
 
 	return prev;
 }
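
The cmpxchg() comment above says the barrier matters only when the exchange
actually succeeds.  A minimal sketch of the usual pattern built on the
cmpxchg() defined in this header (struct my_lock and my_trylock are made-up
names):

	struct my_lock { unsigned long slock; };	/* 0 = free, 1 = held */

	static inline int my_trylock(struct my_lock *l)
	{
		/*
		 * cmpxchg() returns the previous value: 0 means we installed
		 * the 1 and now own the lock, and on SMP the mb run on the
		 * success path orders the critical section after the
		 * acquisition.  A non-zero return means we changed nothing,
		 * so nothing was acquired and no barrier is executed.
		 */
		return cmpxchg(&l->slock, 0, 1) == 0;
	}
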
diff -urN 2.4.0-test9-pre5/include/asm-i386/atomic.h z/include/asm-i386/atomic.h
--- 2.4.0-test9-pre5/include/asm-i386/atomic.h	Mon Sep  4 14:35:13 2000
+++ z/include/asm-i386/atomic.h	Fri Sep 22 17:47:44 2000
@@ -19,102 +19,96 @@
  * on us. We need to use _exactly_ the address the user gave us,
  * not some alias that contains the same information.
  */
-#define __atomic_fool_gcc(x) (*(volatile struct { int a[100]; } *)x)
-
-#ifdef CONFIG_SMP
 typedef struct { volatile int counter; } atomic_t;
-#else
-typedef struct { int counter; } atomic_t;
-#endif
 
 #define ATOMIC_INIT(i)	{ (i) }
 
 #define atomic_read(v)		((v)->counter)
 #define atomic_set(v,i)		(((v)->counter) = (i))
 
-static __inline__ void atomic_add(int i, volatile atomic_t *v)
+static __inline__ void atomic_add(int i, atomic_t *v)
 {
 	__asm__ __volatile__(
 		LOCK "addl %1,%0"
-		:"=m" (__atomic_fool_gcc(v))
-		:"ir" (i), "m" (__atomic_fool_gcc(v)));
+		:"=m" (v->counter)
+		:"ir" (i), "m" (v->counter));
 }
 
-static __inline__ void atomic_sub(int i, volatile atomic_t *v)
+static __inline__ void atomic_sub(int i, atomic_t *v)
 {
 	__asm__ __volatile__(
 		LOCK "subl %1,%0"
-		:"=m" (__atomic_fool_gcc(v))
-		:"ir" (i), "m" (__atomic_fool_gcc(v)));
+		:"=m" (v->counter)
+		:"ir" (i), "m" (v->counter));
 }
 
-static __inline__ int atomic_sub_and_test(int i, volatile atomic_t *v)
+static __inline__ int atomic_sub_and_test(int i, atomic_t *v)
 {
 	unsigned char c;
 
 	__asm__ __volatile__(
 		LOCK "subl %2,%0; sete %1"
-		:"=m" (__atomic_fool_gcc(v)), "=qm" (c)
-		:"ir" (i), "m" (__atomic_fool_gcc(v)));
+		:"=m" (v->counter), "=qm" (c)
+		:"ir" (i), "m" (v->counter) : "memory");
 	return c;
 }
 
-static __inline__ void atomic_inc(volatile atomic_t *v)
+static __inline__ void atomic_inc(atomic_t *v)
 {
 	__asm__ __volatile__(
 		LOCK "incl %0"
-		:"=m" (__atomic_fool_gcc(v))
-		:"m" (__atomic_fool_gcc(v)));
+		:"=m" (v->counter)
+		:"m" (v->counter));
 }
 
-static __inline__ void atomic_dec(volatile atomic_t *v)
+static __inline__ void atomic_dec(atomic_t *v)
 {
 	__asm__ __volatile__(
 		LOCK "decl %0"
-		:"=m" (__atomic_fool_gcc(v))
-		:"m" (__atomic_fool_gcc(v)));
+		:"=m" (v->counter)
+		:"m" (v->counter));
 }
 
-static __inline__ int atomic_dec_and_test(volatile atomic_t *v)
+static __inline__ int atomic_dec_and_test(atomic_t *v)
 {
 	unsigned char c;
 
 	__asm__ __volatile__(
 		LOCK "decl %0; sete %1"
-		:"=m" (__atomic_fool_gcc(v)), "=qm" (c)
-		:"m" (__atomic_fool_gcc(v)));
+		:"=m" (v->counter), "=qm" (c)
+		:"m" (v->counter) : "memory");
 	return c != 0;
 }
 
-static __inline__ int atomic_inc_and_test(volatile atomic_t *v)
+static __inline__ int atomic_inc_and_test(atomic_t *v)
 {
 	unsigned char c;
 
 	__asm__ __volatile__(
 		LOCK "incl %0; sete %1"
-		:"=m" (__atomic_fool_gcc(v)), "=qm" (c)
-		:"m" (__atomic_fool_gcc(v)));
+		:"=m" (v->counter), "=qm" (c)
+		:"m" (v->counter) : "memory");
 	return c != 0;
 }
 
-extern __inline__ int atomic_add_negative(int i, volatile atomic_t *v)
+static __inline__ int atomic_add_negative(int i, atomic_t *v)
 {
 	unsigned char c;
 
 	__asm__ __volatile__(
 		LOCK "addl %2,%0; sets %1"
-		:"=m" (__atomic_fool_gcc(v)), "=qm" (c)
-		:"ir" (i), "m" (__atomic_fool_gcc(v)));
+		:"=m" (v->counter), "=qm" (c)
+		:"ir" (i), "m" (v->counter) : "memory");
 	return c;
 }
 
 /* These are x86-specific, used by some header files */
 #define atomic_clear_mask(mask, addr) \
 __asm__ __volatile__(LOCK "andl %0,%1" \
-: : "r" (~(mask)),"m" (__atomic_fool_gcc(addr)) : "memory")
+: : "r" (~(mask)),"m" (*addr) : "memory")
 
 #define atomic_set_mask(mask, addr) \
 __asm__ __volatile__(LOCK "orl %0,%1" \
-: : "r" (mask),"m" (__atomic_fool_gcc(addr)) : "memory")
+: : "r" (mask),"m" (*addr) : "memory")
 
 #endif
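
As on alpha, the i386 atomics that return a condition (atomic_dec_and_test()
and friends) now clobber "memory", since they usually close a critical
section, for example the release side of a reference count.  A rough sketch
of that usage (struct my_obj and my_obj_free are made-up names):

	struct my_obj {
		atomic_t refcnt;
		int data;
	};

	extern void my_obj_free(struct my_obj *obj);	/* made-up destructor */

	static void my_obj_put(struct my_obj *obj)
	{
		obj->data = 0;	/* the clobber keeps this store before the decrement */
		if (atomic_dec_and_test(&obj->refcnt))
			my_obj_free(obj);	/* we dropped the last reference */
	}
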
diff -urN 2.4.0-test9-pre5/include/asm-i386/bitops.h z/include/asm-i386/bitops.h
--- 2.4.0-test9-pre5/include/asm-i386/bitops.h	Sat Aug 26 18:15:40 2000
+++ z/include/asm-i386/bitops.h	Fri Sep 22 17:47:44 2000
@@ -21,29 +21,9 @@
 #define LOCK_PREFIX ""
 #endif
 
-/*
- * Function prototypes to keep gcc -Wall happy
- */
-extern void set_bit(int nr, volatile void * addr);
-extern void clear_bit(int nr, volatile void * addr);
-extern void change_bit(int nr, volatile void * addr);
-extern int test_and_set_bit(int nr, volatile void * addr);
-extern int test_and_clear_bit(int nr, volatile void * addr);
-extern int test_and_change_bit(int nr, volatile void * addr);
-extern int __constant_test_bit(int nr, const volatile void * addr);
-extern int __test_bit(int nr, volatile void * addr);
-extern int find_first_zero_bit(void * addr, unsigned size);
-extern int find_next_zero_bit (void * addr, int size, int offset);
-extern unsigned long ffz(unsigned long word);
-
-/*
- * Some hacks to defeat gcc over-optimizations..
- */
-struct __dummy { unsigned long a[100]; };
-#define ADDR (*(volatile struct __dummy *) addr)
-#define CONST_ADDR (*(volatile const struct __dummy *) addr)
+#define ADDR (*(volatile long *) addr)
 
-extern __inline__ void set_bit(int nr, volatile void * addr)
+static __inline__ void set_bit(int nr, volatile void * addr)
 {
 	__asm__ __volatile__( LOCK_PREFIX
 		"btsl %1,%0"
@@ -51,7 +31,21 @@
 		:"Ir" (nr));
 }
 
-extern __inline__ void clear_bit(int nr, volatile void * addr)
+/* WARNING: non-atomic and it can be reordered! */
+static __inline__ void __set_bit(int nr, volatile void * addr)
+{
+	__asm__(
+		"btsl %1,%0"
+		:"=m" (ADDR)
+		:"Ir" (nr));
+}
+
+/*
+ * clear_bit() doesn't provide any barrier for the compiler.
+ */
+#define smp_mb__before_clear_bit()	barrier()
+#define smp_mb__after_clear_bit()	barrier()
+static __inline__ void clear_bit(int nr, volatile void * addr)
 {
 	__asm__ __volatile__( LOCK_PREFIX
 		"btrl %1,%0"
@@ -59,7 +53,7 @@
 		:"Ir" (nr));
 }
 
-extern __inline__ void change_bit(int nr, volatile void * addr)
+static __inline__ void change_bit(int nr, volatile void * addr)
 {
 	__asm__ __volatile__( LOCK_PREFIX
 		"btcl %1,%0"
@@ -67,48 +61,77 @@
 		:"Ir" (nr));
 }
 
-extern __inline__ int test_and_set_bit(int nr, volatile void * addr)
+/*
+ * It also implies a memory barrier, so it must clobber memory
+ * to make sure anything that was cached in registers outside
+ * _this_ critical section gets reloaded.
+ */
+static __inline__ int test_and_set_bit(int nr, volatile void * addr)
 {
 	int oldbit;
 
 	__asm__ __volatile__( LOCK_PREFIX
 		"btsl %2,%1\n\tsbbl %0,%0"
 		:"=r" (oldbit),"=m" (ADDR)
+		:"Ir" (nr) : "memory");
+	return oldbit;
+}
+
+/* WARNING: non-atomic and it can be reordered! */
+static __inline__ int __test_and_set_bit(int nr, volatile void * addr)
+{
+	int oldbit;
+
+	__asm__(
+		"btsl %2,%1\n\tsbbl %0,%0"
+		:"=r" (oldbit),"=m" (ADDR)
 		:"Ir" (nr));
 	return oldbit;
 }
 
-extern __inline__ int test_and_clear_bit(int nr, volatile void * addr)
+static __inline__ int test_and_clear_bit(int nr, volatile void * addr)
 {
 	int oldbit;
 
 	__asm__ __volatile__( LOCK_PREFIX
 		"btrl %2,%1\n\tsbbl %0,%0"
 		:"=r" (oldbit),"=m" (ADDR)
+		:"Ir" (nr) : "memory");
+	return oldbit;
+}
+
+/* WARNING: non-atomic and it can be reordered! */
+static __inline__ int __test_and_clear_bit(int nr, volatile void * addr)
+{
+	int oldbit;
+
+	__asm__(
+		"btrl %2,%1\n\tsbbl %0,%0"
+		:"=r" (oldbit),"=m" (ADDR)
 		:"Ir" (nr));
 	return oldbit;
 }
 
-extern __inline__ int test_and_change_bit(int nr, volatile void * addr)
+static __inline__ int test_and_change_bit(int nr, volatile void * addr)
 {
 	int oldbit;
 
 	__asm__ __volatile__( LOCK_PREFIX
 		"btcl %2,%1\n\tsbbl %0,%0"
 		:"=r" (oldbit),"=m" (ADDR)
-		:"Ir" (nr));
+		:"Ir" (nr) : "memory");
 	return oldbit;
 }
 
 /*
  * This routine doesn't need to be atomic.
  */
-extern __inline__ int __constant_test_bit(int nr, const volatile void * addr)
+static __inline__ int constant_test_bit(int nr, const volatile void * addr)
 {
 	return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
 }
 
-extern __inline__ int __test_bit(int nr, volatile void * addr)
+static __inline__ int variable_test_bit(int nr, volatile void * addr)
 {
 	int oldbit;
 
@@ -121,13 +144,13 @@
 
 #define test_bit(nr,addr) \
 (__builtin_constant_p(nr) ? \
- __constant_test_bit((nr),(addr)) : \
- __test_bit((nr),(addr)))
+ constant_test_bit((nr),(addr)) : \
+ variable_test_bit((nr),(addr)))
 
 /*
  * Find-bit routines..
  */
-extern __inline__ int find_first_zero_bit(void * addr, unsigned size)
+static __inline__ int find_first_zero_bit(void * addr, unsigned size)
 {
 	int d0, d1, d2;
 	int res;
@@ -151,7 +174,7 @@
 	return res;
 }
 
-extern __inline__ int find_next_zero_bit (void * addr, int size, int offset)
+static __inline__ int find_next_zero_bit (void * addr, int size, int offset)
 {
 	unsigned long * p = ((unsigned long *) addr) + (offset >> 5);
 	int set = 0, bit = offset & 31, res;
@@ -182,7 +205,7 @@
  * ffz = Find First Zero in word. Undefined if no zero exists,
  * so code should check against ~0UL first..
  */
-extern __inline__ unsigned long ffz(unsigned long word)
+static __inline__ unsigned long ffz(unsigned long word)
 {
 	__asm__("bsfl %1,%0"
 		:"=r" (word)
@@ -198,7 +221,7 @@
  * differs in spirit from the above ffz (man ffs).
  */
 
-extern __inline__ int ffs(int x)
+static __inline__ int ffs(int x)
 {
 	int r;
 
@@ -222,16 +245,16 @@
 
 #ifdef __KERNEL__
 
-#define ext2_set_bit                 test_and_set_bit
-#define ext2_clear_bit               test_and_clear_bit
+#define ext2_set_bit                 __test_and_set_bit
+#define ext2_clear_bit               __test_and_clear_bit
 #define ext2_test_bit                test_bit
 #define ext2_find_first_zero_bit     find_first_zero_bit
 #define ext2_find_next_zero_bit      find_next_zero_bit
 
 /* Bitmap functions for the minix filesystem.  */
-#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr)
-#define minix_set_bit(nr,addr) set_bit(nr,addr)
-#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr)
+#define minix_test_and_set_bit(nr,addr) __test_and_set_bit(nr,addr)
+#define minix_set_bit(nr,addr) __set_bit(nr,addr)
+#define minix_test_and_clear_bit(nr,addr) __test_and_clear_bit(nr,addr)
 #define minix_test_bit(nr,addr) test_bit(nr,addr)
 #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size)
 
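
The ext2 and minix bitmap operations switch to the new non-atomic __ variants
because those bitmaps are already serialized by higher-level locking, so the
LOCK prefix buys nothing.  A minimal sketch of that pattern (slot_bitmap,
slot_lock and reserve_slot are made-up names):

	static unsigned long slot_bitmap[4];		/* 4 * 32 = 128 slots */
	static spinlock_t slot_lock = SPIN_LOCK_UNLOCKED;

	static int reserve_slot(int nr)
	{
		int was_set;

		spin_lock(&slot_lock);
		/* the spinlock already serializes the bitmap, no LOCK prefix needed */
		was_set = __test_and_set_bit(nr, slot_bitmap);
		spin_unlock(&slot_lock);

		return was_set ? -1 : 0;	/* -1 if the slot was already taken */
	}
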
diff -urN 2.4.0-test9-pre5/include/asm-i386/rwlock.h z/include/asm-i386/rwlock.h
--- 2.4.0-test9-pre5/include/asm-i386/rwlock.h	Wed Dec  8 00:05:27 1999
+++ z/include/asm-i386/rwlock.h	Fri Sep 22 17:47:44 2000
@@ -17,9 +17,6 @@
 #ifndef _ASM_I386_RWLOCK_H
 #define _ASM_I386_RWLOCK_H
 
-typedef struct { unsigned long a[100]; } __dummy_lock_t;
-#define __dummy_lock(lock) (*(__dummy_lock_t *)(lock))
-
 #define RW_LOCK_BIAS		 0x01000000
 #define RW_LOCK_BIAS_STR	"0x01000000"
 
@@ -44,7 +41,7 @@
 		     "popl %%eax\n\t" \
 		     "jmp 1b\n" \
 		     ".previous" \
-		     :"=m" (__dummy_lock(rw)))
+		     :"=m" (*(volatile int *)rw) : : "memory")
 
 #define __build_read_lock(rw, helper)	do { \
 						if (__builtin_constant_p(rw)) \
@@ -74,7 +71,7 @@
 		     "popl %%eax\n\t" \
 		     "jmp 1b\n" \
 		     ".previous" \
-		     :"=m" (__dummy_lock(rw)))
+		     :"=m" (*(volatile int *)rw) : : "memory")
 
 #define __build_write_lock(rw, helper)	do { \
 						if (__builtin_constant_p(rw)) \
diff -urN 2.4.0-test9-pre5/include/asm-i386/spinlock.h z/include/asm-i386/spinlock.h
--- 2.4.0-test9-pre5/include/asm-i386/spinlock.h	Tue Sep 12 02:32:47 2000
+++ z/include/asm-i386/spinlock.h	Fri Sep 22 17:50:09 2000
@@ -70,13 +70,12 @@
 	char oldval;
 	__asm__ __volatile__(
 		"xchgb %b0,%1"
-		:"=q" (oldval), "=m" (__dummy_lock(lock))
-		:"0" (0)
-		:"memory");
+		:"=q" (oldval), "=m" (lock->lock)
+		:"0" (0) : "memory");
 	return oldval > 0;
 }
 
-extern inline void spin_lock(spinlock_t *lock)
+static inline void spin_lock(spinlock_t *lock)
 {
 #if SPINLOCK_DEBUG
 	__label__ here;
@@ -88,11 +87,10 @@
 #endif
 	__asm__ __volatile__(
 		spin_lock_string
-		:"=m" (__dummy_lock(lock))
-		: :"memory");
+		:"=m" (lock->lock) : : "memory");
 }
 
-extern inline void spin_unlock(spinlock_t *lock)
+static inline void spin_unlock(spinlock_t *lock)
 {
 #if SPINLOCK_DEBUG
 	if (lock->magic != SPINLOCK_MAGIC)
@@ -102,8 +100,7 @@
 #endif
 	__asm__ __volatile__(
 		spin_unlock_string
-		:"=m" (__dummy_lock(lock))
-		: :"memory");
+		:"=m" (lock->lock) : : "memory");
 }
 
 /*
@@ -146,7 +143,7 @@
  */
 /* the spinlock helpers are in arch/i386/kernel/semaphore.S */
 
-extern inline void read_lock(rwlock_t *rw)
+static inline void read_lock(rwlock_t *rw)
 {
 #if SPINLOCK_DEBUG
 	if (rw->magic != RWLOCK_MAGIC)
@@ -155,7 +152,7 @@
 	__build_read_lock(rw, "__read_lock_failed");
 }
 
-extern inline void write_lock(rwlock_t *rw)
+static inline void write_lock(rwlock_t *rw)
 {
 #if SPINLOCK_DEBUG
 	if (rw->magic != RWLOCK_MAGIC)
@@ -164,10 +161,10 @@
 	__build_write_lock(rw, "__write_lock_failed");
 }
 
-#define read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" (__dummy_lock(&(rw)->lock)))
-#define write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" (__dummy_lock(&(rw)->lock)))
+#define read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
+#define write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
 
-extern inline int write_trylock(rwlock_t *lock)
+static inline int write_trylock(rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
diff -urN 2.4.0-test9-pre5/include/asm-i386/system.h z/include/asm-i386/system.h
--- 2.4.0-test9-pre5/include/asm-i386/system.h	Tue Sep 12 02:32:47 2000
+++ z/include/asm-i386/system.h	Fri Sep 22 17:47:44 2000
@@ -278,11 +278,22 @@
 #endif
 #define rmb()	mb()
 #define wmb()	__asm__ __volatile__ ("": : :"memory")
+
+#ifdef __SMP__
+#define smp_mb()	mb()
+#define smp_rmb()	rmb()
+#define smp_wmb()	wmb()
+#else
+#define smp_mb()	barrier()
+#define smp_rmb()	barrier()
+#define smp_wmb()	barrier()
+#endif
+
 #define set_mb(var, value) do { xchg(&var, value); } while (0)
 #define set_wmb(var, value) do { var = value; wmb(); } while (0)
 
 /* interrupt control.. */
-#define __save_flags(x)		__asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */ :"memory")
+#define __save_flags(x)		__asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */)
 #define __restore_flags(x) 	__asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory")
 #define __cli() 		__asm__ __volatile__("cli": : :"memory")
 #define __sti()			__asm__ __volatile__("sti": : :"memory")
@@ -291,9 +302,9 @@
 
 /* For spinlocks etc */
 #define local_irq_save(x)	__asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory")
-#define local_irq_restore(x)	__asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory")
-#define local_irq_disable()	__asm__ __volatile__("cli": : :"memory")
-#define local_irq_enable()	__asm__ __volatile__("sti": : :"memory")
+#define local_irq_restore(x)	__restore_flags(x)
+#define local_irq_disable()	__cli()
+#define local_irq_enable()	__sti()
 
 #ifdef CONFIG_SMP
 
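
The new smp_mb()/smp_rmb()/smp_wmb() macros compile to the real barriers on
SMP and collapse to barrier() on UP, so code can state its ordering
requirements without penalizing UP kernels.  A minimal producer/consumer
sketch of the intended use (data and data_ready are made-up names):

	static int data;
	static int data_ready;

	static void publish(int v)
	{
		data = v;
		smp_wmb();	/* make the data visible before the flag */
		data_ready = 1;
	}

	static int try_consume(void)
	{
		if (!data_ready)
			return -1;
		smp_rmb();	/* order the flag read before the data read */
		return data;
	}
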
diff -urN 2.4.0-test9-pre5/include/asm-sparc64/system.h z/include/asm-sparc64/system.h
--- 2.4.0-test9-pre5/include/asm-sparc64/system.h	Thu Aug 17 19:57:41 2000
+++ z/include/asm-sparc64/system.h	Fri Sep 22 17:47:44 2000
@@ -100,8 +100,8 @@
 #define nop() 		__asm__ __volatile__ ("nop")
 
 #define membar(type)	__asm__ __volatile__ ("membar " type : : : "memory");
-#define rmb()		membar("#LoadLoad | #LoadStore")
-#define wmb()		membar("#StoreLoad | #StoreStore")
+#define rmb()		membar("#LoadLoad")
+#define wmb()		membar("#StoreStore")
 #define set_mb(__var, __value) \
 	do { __var = __value; membar("#StoreLoad | #StoreStore"); } while(0)
 #define set_wmb(__var, __value) \
diff -urN 2.4.0-test9-pre5/include/linux/brlock.h z/include/linux/brlock.h
--- 2.4.0-test9-pre5/include/linux/brlock.h	Sat Aug 26 18:15:47 2000
+++ z/include/linux/brlock.h	Fri Sep 22 17:47:44 2000
@@ -114,10 +114,23 @@
 	lock = &__br_write_locks[idx].lock;
 again:
 	(*ctr)++;
-	rmb();
+	mb();
 	if (spin_is_locked(lock)) {
 		(*ctr)--;
-		rmb();
+		wmb(); /*
+			* The release of the ctr must eventually become
+			* visible to the other CPUs, hence the wmb();
+			* we don't care if spin_is_locked is reordered
+			* before the release of the ctr.
+			* However, IMHO this wmb() is superfluous even in
+			* theory.  It would only be needed if, on the
+			* other CPUs, doing an ldl_l instead of an ldl
+			* made a difference, and I don't think that is
+			* the case.
+			* I'd like to clarify this issue further,
+			* but for now this is a slow path, so adding the
+			* wmb() keeps us on the safe side.
+			*/
 		while (spin_is_locked(lock))
 			barrier();
 		goto again;
diff -urN 2.4.0-test9-pre5/include/linux/locks.h z/include/linux/locks.h
--- 2.4.0-test9-pre5/include/linux/locks.h	Sun Aug 27 16:21:04 2000
+++ z/include/linux/locks.h	Fri Sep 22 17:47:44 2000
@@ -29,7 +29,9 @@
 extern inline void unlock_buffer(struct buffer_head *bh)
 {
 	clear_bit(BH_Lock, &bh->b_state);
-	wake_up(&bh->b_wait);
+	smp_mb__after_clear_bit();
+	if (waitqueue_active(&bh->b_wait))
+		wake_up(&bh->b_wait);
 }
 
 /*
@@ -55,7 +57,12 @@
 extern inline void unlock_super(struct super_block * sb)
 {
 	sb->s_lock = 0;
-	wake_up(&sb->s_wait);
+	/*
+	 * No need for any barrier; we're protected by
+	 * the big kernel lock here... unfortunately :)
+	 */
+	if (waitqueue_active(&sb->s_wait))
+		wake_up(&sb->s_wait);
 }
 
 #endif /* _LINUX_LOCKS_H */
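
unlock_buffer() now pairs the bit clear, a barrier and a waitqueue_active()
check.  A rough sketch, in the style of __wait_on_buffer(), of the sleeping
side it races against (wait_on_buffer_sketch is a made-up name):

	static void wait_on_buffer_sketch(struct buffer_head *bh)
	{
		DECLARE_WAITQUEUE(wait, current);

		add_wait_queue(&bh->b_wait, &wait);	/* makes waitqueue_active() true */
		for (;;) {
			current->state = TASK_UNINTERRUPTIBLE;
			if (!buffer_locked(bh))		/* re-check BH_Lock after queueing */
				break;
			schedule();
		}
		current->state = TASK_RUNNING;
		remove_wait_queue(&bh->b_wait, &wait);
	}

Without smp_mb__after_clear_bit(), the waker's waitqueue_active() read could
be reordered before the clear_bit(): it would see an empty queue while the
sleeper still sees BH_Lock set, and the wakeup would be lost.
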
diff -urN 2.4.0-test9-pre5/include/linux/mm.h z/include/linux/mm.h
--- 2.4.0-test9-pre5/include/linux/mm.h	Thu Sep 21 17:44:41 2000
+++ z/include/linux/mm.h	Fri Sep 22 17:47:44 2000
@@ -193,9 +193,18 @@
 #define PageLocked(page)	test_bit(PG_locked, &(page)->flags)
 #define LockPage(page)		set_bit(PG_locked, &(page)->flags)
 #define TryLockPage(page)	test_and_set_bit(PG_locked, &(page)->flags)
+/*
+ * The first mb is necessary to safely close the critical section opened by the
+ * TryLockPage(); the second mb is necessary to enforce ordering between
+ * the clear_bit and the read of the waitqueue (to avoid SMP races with a
+ * parallel wait_on_page).
+ */
 #define UnlockPage(page)	do { \
+					smp_mb__before_clear_bit(); \
 					clear_bit(PG_locked, &(page)->flags); \
-					wake_up(&page->wait); \
+					smp_mb__after_clear_bit(); \
+					if (waitqueue_active(&page->wait)) \
+						wake_up(&page->wait); \
 				} while (0)
 #define PageError(page)		test_bit(PG_error, &(page)->flags)
 #define SetPageError(page)	set_bit(PG_error, &(page)->flags)
diff -urN 2.4.0-test9-pre5/include/linux/tqueue.h z/include/linux/tqueue.h
--- 2.4.0-test9-pre5/include/linux/tqueue.h	Sat Aug 26 18:15:42 2000
+++ z/include/linux/tqueue.h	Fri Sep 22 17:47:44 2000
@@ -114,7 +114,7 @@
 			f      = p -> routine;
 			save_p = p;
 			p      = p -> next;
-			mb();
+			smp_mb();
 			save_p -> sync = 0;
 			if (f)
 				(*f)(arg);
diff -urN 2.4.0-test9-pre5/kernel/softirq.c z/kernel/softirq.c
--- 2.4.0-test9-pre5/kernel/softirq.c	Thu Aug 17 19:57:44 2000
+++ z/kernel/softirq.c	Fri Sep 22 17:47:44 2000
@@ -44,7 +44,7 @@
 irq_cpustat_t irq_stat[NR_CPUS];
 #endif	/* CONFIG_ARCH_S390 */
 
-static struct softirq_action softirq_vec[32];
+static struct softirq_action softirq_vec[32] __cacheline_aligned;
 
 asmlinkage void do_softirq()
 {
@@ -140,6 +140,14 @@
 				clear_bit(TASKLET_STATE_SCHED, &t->state);
 
 				t->func(t->data);
+				/*
+				 * tasklet_trylock() uses test_and_set_bit, which implies
+				 * an mb when it returns zero; thus we need the explicit
+				 * mb only here, when closing the critical section.
+				 */
+#ifdef CONFIG_SMP
+				smp_mb__before_clear_bit();
+#endif
 				tasklet_unlock(t);
 				continue;
 			}
diff -urN 2.4.0-test9-pre5/net/core/dev.c z/net/core/dev.c
--- 2.4.0-test9-pre5/net/core/dev.c	Thu Sep 21 17:44:41 2000
+++ z/net/core/dev.c	Fri Sep 22 17:47:44 2000
@@ -1141,6 +1141,7 @@
 			struct net_device *dev = head;
 			head = head->next_sched;
 
+			smp_mb__before_clear_bit();
 			clear_bit(__LINK_STATE_SCHED, &dev->state);
 
 			if (spin_trylock(&dev->queue_lock)) {