From: Ingo Molnar <mingo@elte.hu>

What i'd propose instead is the attached patch (tested against
2.6.0-test2). It unifies the functionality of add_timer() and
mod_timer(), and makes any combination of the timer API calls
completely SMP-safe. del_timer() still does not take the per-timer
lock.
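
To illustrate the unified semantics (illustration only, not part of the
patch; the function and handler names below are made up): with
add_timer() defined as __mod_timer(timer, timer->expires), adding an
already-pending timer no longer hits a BUG_ON(); it simply re-queues the
timer at its current ->expires, the way mod_timer() would.

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list example_timer;		/* hypothetical timer */

static void example_handler(unsigned long data)
{
	/* hypothetical callback, does the periodic work */
}

static void example(void)
{
	init_timer(&example_timer);
	example_timer.function = example_handler;
	example_timer.data     = 0;
	example_timer.expires  = jiffies + HZ;
	add_timer(&example_timer);

	/*
	 * Previously this second add_timer() would trigger
	 * BUG_ON(timer_pending(timer)); with the patch it is
	 * equivalent to mod_timer(&example_timer, jiffies + 2*HZ).
	 */
	example_timer.expires = jiffies + 2*HZ;
	add_timer(&example_timer);
}

(mod_timer() remains the natural interface for changing the expiry of a
possibly-pending timer in a single call; the point here is only that
mixing the two calls no longer oopses.)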

This patch fixes the only timer bug in 2.6 i'm aware of: the
del_timer_sync() + add_timer() combination in kernel/itimer.c is buggy.
That code was correct in 2.4, because there it was safe to do an
add_timer() from the timer handler itself, in parallel with a
del_timer_sync(). If we want this to be safe in 2.6 too (which i think
we do), then we have to make add_timer() almost equivalent to
mod_timer(), locking-wise. And once we are at that point, i think it's
much cleaner to actually make add_timer() a variant of mod_timer().
(There's no locking cost for add_timer(), only the cost of an extra
branch, and we've removed another commonly used function from the
icache.)
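
For reference, a hedged sketch of the pattern in question (the names
below are made up; the real instance is the timer re-arm in
kernel/itimer.c): a handler that re-adds its own timer while another
CPU may be running del_timer_sync() on it.

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list periodic_timer;	/* hypothetical timer */

static void periodic_handler(unsigned long data)
{
	/* ... periodic work ... */

	/*
	 * Re-arm from inside the handler. In 2.4 this was safe even
	 * against a concurrent del_timer_sync(); with the old 2.6
	 * add_timer() (base lock only, no timer->lock) it races, and
	 * with add_timer() routed through __mod_timer() it is
	 * SMP-safe again.
	 */
	periodic_timer.expires = jiffies + HZ;
	add_timer(&periodic_timer);
}

static void periodic_start(void)
{
	init_timer(&periodic_timer);
	periodic_timer.function = periodic_handler;
	periodic_timer.data = 0;
	periodic_timer.expires = jiffies + HZ;
	add_timer(&periodic_timer);
}

static void periodic_stop(void)
{
	/* may run on another CPU, in parallel with periodic_handler() */
	del_timer_sync(&periodic_timer);
}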




 include/linux/timer.h |   23 ++++++++
 kernel/ksyms.c        |    5 +
 kernel/timer.c        |  129 ++++++++++++++++++++------------------------------
 3 files changed, 78 insertions(+), 79 deletions(-)

diff -puN include/linux/timer.h~timer-race-fixes include/linux/timer.h
--- 25/include/linux/timer.h~timer-race-fixes	2003-08-07 15:23:46.000000000 -0700
+++ 25-akpm/include/linux/timer.h	2003-08-07 15:23:46.000000000 -0700
@@ -60,11 +60,30 @@ static inline int timer_pending(const st
 	return timer->base != NULL;
 }
 
-extern void add_timer(struct timer_list * timer);
 extern void add_timer_on(struct timer_list *timer, int cpu);
 extern int del_timer(struct timer_list * timer);
+extern int __mod_timer(struct timer_list *timer, unsigned long expires);
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
-  
+
+/***
+ * add_timer - start a timer
+ * @timer: the timer to be added
+ *
+ * The kernel will do a ->function(->data) callback from the
+ * timer interrupt at the ->expires point in the future. The
+ * current time is 'jiffies'.
+ *
+ * The timer's ->expires, ->function (and if the handler uses it, ->data)
+ * fields must be set prior to calling this function.
+ *
+ * Timers with an ->expires field in the past will be executed in the next
+ * timer tick.
+ */
+static inline void add_timer(struct timer_list * timer)
+{
+	__mod_timer(timer, timer->expires);
+}
+
 #ifdef CONFIG_SMP
   extern int del_timer_sync(struct timer_list * timer);
 #else
diff -puN kernel/timer.c~timer-race-fixes kernel/timer.c
--- 25/kernel/timer.c~timer-race-fixes	2003-08-07 15:23:46.000000000 -0700
+++ 25-akpm/kernel/timer.c	2003-08-07 15:23:46.000000000 -0700
@@ -144,34 +144,62 @@ static void internal_add_timer(tvec_base
 	list_add_tail(&timer->entry, vec);
 }
 
-/***
- * add_timer - start a timer
- * @timer: the timer to be added
- *
- * The kernel will do a ->function(->data) callback from the
- * timer interrupt at the ->expired point in the future. The
- * current time is 'jiffies'.
- *
- * The timer's ->expired, ->function (and if the handler uses it, ->data)
- * fields must be set prior calling this function.
- *
- * Timers with an ->expired field in the past will be executed in the next
- * timer tick. It's illegal to add an already pending timer.
- */
-void add_timer(struct timer_list *timer)
+int __mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	tvec_base_t *base = &get_cpu_var(tvec_bases);
-  	unsigned long flags;
-  
-  	BUG_ON(timer_pending(timer) || !timer->function);
+	tvec_base_t *old_base, *new_base;
+	unsigned long flags;
+	int ret = 0;
+
+	BUG_ON(!timer->function);
 
 	check_timer(timer);
 
-	spin_lock_irqsave(&base->lock, flags);
-	internal_add_timer(base, timer);
-	timer->base = base;
-	spin_unlock_irqrestore(&base->lock, flags);
-	put_cpu_var(tvec_bases);
+	spin_lock_irqsave(&timer->lock, flags);
+	new_base = &__get_cpu_var(tvec_bases);
+repeat:
+	old_base = timer->base;
+
+	/*
+	 * Prevent deadlocks via ordering by old_base < new_base.
+	 */
+	if (old_base && (new_base != old_base)) {
+		if (old_base < new_base) {
+			spin_lock(&new_base->lock);
+			spin_lock(&old_base->lock);
+		} else {
+			spin_lock(&old_base->lock);
+			spin_lock(&new_base->lock);
+		}
+		/*
+		 * The timer base might have been cancelled while we were
+		 * trying to take the lock(s):
+		 */
+		if (timer->base != old_base) {
+			spin_unlock(&new_base->lock);
+			spin_unlock(&old_base->lock);
+			goto repeat;
+		}
+	} else
+		spin_lock(&new_base->lock);
+
+	/*
+	 * Delete the previous timeout (if there was any), and install
+	 * the new one:
+	 */
+	if (old_base) {
+		list_del(&timer->entry);
+		ret = 1;
+	}
+	timer->expires = expires;
+	internal_add_timer(new_base, timer);
+	timer->base = new_base;
+
+	if (old_base && (new_base != old_base))
+		spin_unlock(&old_base->lock);
+	spin_unlock(&new_base->lock);
+	spin_unlock_irqrestore(&timer->lock, flags);
+
+	return ret;
 }
 
 /***
@@ -179,7 +207,7 @@ void add_timer(struct timer_list *timer)
  * @timer: the timer to be added
  * @cpu: the CPU to start it on
  *
- * This is not very scalable on SMP.
+ * This is not very scalable on SMP. Double adds are not possible.
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
@@ -217,10 +245,6 @@ void add_timer_on(struct timer_list *tim
  */
 int mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	tvec_base_t *old_base, *new_base;
-	unsigned long flags;
-	int ret = 0;
-
 	BUG_ON(!timer->function);
 
 	check_timer(timer);
@@ -233,52 +257,7 @@ int mod_timer(struct timer_list *timer, 
 	if (timer->expires == expires && timer_pending(timer))
 		return 1;
 
-	spin_lock_irqsave(&timer->lock, flags);
-	new_base = &__get_cpu_var(tvec_bases);
-repeat:
-	old_base = timer->base;
-
-	/*
-	 * Prevent deadlocks via ordering by old_base < new_base.
-	 */
-	if (old_base && (new_base != old_base)) {
-		if (old_base < new_base) {
-			spin_lock(&new_base->lock);
-			spin_lock(&old_base->lock);
-		} else {
-			spin_lock(&old_base->lock);
-			spin_lock(&new_base->lock);
-		}
-		/*
-		 * The timer base might have been cancelled while we were
-		 * trying to take the lock(s):
-		 */
-		if (timer->base != old_base) {
-			spin_unlock(&new_base->lock);
-			spin_unlock(&old_base->lock);
-			goto repeat;
-		}
-	} else
-		spin_lock(&new_base->lock);
-
-	/*
-	 * Delete the previous timeout (if there was any), and install
-	 * the new one:
-	 */
-	if (old_base) {
-		list_del(&timer->entry);
-		ret = 1;
-	}
-	timer->expires = expires;
-	internal_add_timer(new_base, timer);
-	timer->base = new_base;
-
-	if (old_base && (new_base != old_base))
-		spin_unlock(&old_base->lock);
-	spin_unlock(&new_base->lock);
-	spin_unlock_irqrestore(&timer->lock, flags);
-
-	return ret;
+	return __mod_timer(timer, expires);
 }
 
 /***
diff -puN kernel/ksyms.c~timer-race-fixes kernel/ksyms.c
--- 25/kernel/ksyms.c~timer-race-fixes	2003-08-07 15:23:46.000000000 -0700
+++ 25-akpm/kernel/ksyms.c	2003-08-07 15:23:46.000000000 -0700
@@ -405,8 +405,6 @@ EXPORT_SYMBOL(proc_doulongvec_ms_jiffies
 EXPORT_SYMBOL(proc_doulongvec_minmax);
 
 /* interrupt handling */
-EXPORT_SYMBOL(add_timer);
-EXPORT_SYMBOL(del_timer);
 EXPORT_SYMBOL(request_irq);
 EXPORT_SYMBOL(free_irq);
 
@@ -433,7 +431,10 @@ EXPORT_SYMBOL(probe_irq_off);
 #ifdef CONFIG_SMP
 EXPORT_SYMBOL(del_timer_sync);
 #endif
+EXPORT_SYMBOL(add_timer);
+EXPORT_SYMBOL(del_timer);
 EXPORT_SYMBOL(mod_timer);
+EXPORT_SYMBOL(__mod_timer);
 
 #ifdef HAVE_DISABLE_HLT
 EXPORT_SYMBOL(disable_hlt);
