From: Jack F Vogel <jfv@bluesong.net>

A bug against an xSeries system showed up recently noting that the
check_nmi_watchdog() test was failing.

I have been investigating it and discovered that, in both i386 and x86_64,
the recent change to the routine to use the cpu_callin_map has uncovered a
problem.  Prior to that change, on an SMP box, the test was trivially
passing because all CPUs were found to not yet be online.  Now that the
callin_map lets them be discovered, the test goes on to check the counters,
which have not yet begun to increment, so it announces a CPU is stuck and
bails out.

On all the systems I have access to test, the announcement of failure is
also bogus: by the time you can log in and check /proc/interrupts, the
NMI count is happily incrementing on all CPUs.  It's just that the test is
being done too early.

I have tried moving the call to the test around a bit, and it was always
too early.  I finally hit on this proposed solution: it delays the routine
via a late_initcall(), which seems like the right solution to me.

Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/arch/i386/kernel/apic.c      |    2 --
 25-akpm/arch/i386/kernel/io_apic.c   |    2 --
 25-akpm/arch/i386/kernel/nmi.c       |   11 +++++++----
 25-akpm/arch/i386/kernel/smpboot.c   |    3 ---
 25-akpm/arch/x86_64/kernel/io_apic.c |    2 --
 25-akpm/arch/x86_64/kernel/nmi.c     |    9 +++++++--
 25-akpm/include/asm-i386/apic.h      |    1 -
 7 files changed, 14 insertions(+), 16 deletions(-)

diff -puN arch/i386/kernel/apic.c~rfc-check-nmi-watchdog-is-broken arch/i386/kernel/apic.c
--- 25/arch/i386/kernel/apic.c~rfc-check-nmi-watchdog-is-broken	2005-03-30 18:28:21.000000000 -0800
+++ 25-akpm/arch/i386/kernel/apic.c	2005-03-30 18:28:21.000000000 -0800
@@ -1265,8 +1265,6 @@ int __init APIC_init_uniprocessor (void)
 
 	setup_local_APIC();
 
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		check_nmi_watchdog();
 #ifdef CONFIG_X86_IO_APIC
 	if (smp_found_config)
 		if (!skip_ioapic_setup && nr_ioapics)
diff -puN arch/i386/kernel/io_apic.c~rfc-check-nmi-watchdog-is-broken arch/i386/kernel/io_apic.c
--- 25/arch/i386/kernel/io_apic.c~rfc-check-nmi-watchdog-is-broken	2005-03-30 18:28:21.000000000 -0800
+++ 25-akpm/arch/i386/kernel/io_apic.c	2005-03-30 18:28:21.000000000 -0800
@@ -2175,7 +2175,6 @@ static inline void check_timer(void)
 				disable_8259A_irq(0);
 				setup_nmi();
 				enable_8259A_irq(0);
-				check_nmi_watchdog();
 			}
 			return;
 		}
@@ -2198,7 +2197,6 @@ static inline void check_timer(void)
 				add_pin_to_irq(0, 0, pin2);
 			if (nmi_watchdog == NMI_IO_APIC) {
 				setup_nmi();
-				check_nmi_watchdog();
 			}
 			return;
 		}
diff -puN arch/i386/kernel/nmi.c~rfc-check-nmi-watchdog-is-broken arch/i386/kernel/nmi.c
--- 25/arch/i386/kernel/nmi.c~rfc-check-nmi-watchdog-is-broken	2005-03-30 18:28:21.000000000 -0800
+++ 25-akpm/arch/i386/kernel/nmi.c	2005-03-30 18:28:21.000000000 -0800
@@ -102,20 +102,21 @@ int nmi_active;
 	(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT|	\
 	 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
 
-int __init check_nmi_watchdog (void)
+static int __init check_nmi_watchdog(void)
 {
 	unsigned int prev_nmi_count[NR_CPUS];
 	int cpu;
 
-	printk(KERN_INFO "testing NMI watchdog ... ");
+	if (nmi_watchdog == NMI_NONE)
+		return 0;
+
+	printk(KERN_INFO "Testing NMI watchdog ... ");
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++)
 		prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
 	local_irq_enable();
 	mdelay((10*1000)/nmi_hz); // wait 10 ticks
 
-	/* FIXME: Only boot CPU is online at this stage.  Check CPUs
-           as they come up. */
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
 #ifdef CONFIG_SMP
 		/* Check cpu_callin_map here because that is set
@@ -139,6 +140,8 @@ int __init check_nmi_watchdog (void)
 
 	return 0;
 }
+/* This needs to happen later in boot so counters are working */
+late_initcall(check_nmi_watchdog);
 
 static int __init setup_nmi_watchdog(char *str)
 {
diff -puN arch/i386/kernel/smpboot.c~rfc-check-nmi-watchdog-is-broken arch/i386/kernel/smpboot.c
--- 25/arch/i386/kernel/smpboot.c~rfc-check-nmi-watchdog-is-broken	2005-03-30 18:28:21.000000000 -0800
+++ 25-akpm/arch/i386/kernel/smpboot.c	2005-03-30 18:28:21.000000000 -0800
@@ -1062,9 +1062,6 @@ static void __init smp_boot_cpus(unsigne
 			printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
 	}
 
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		check_nmi_watchdog();
-
 	smpboot_setup_io_apic();
 
 	setup_boot_APIC_clock();
diff -puN arch/x86_64/kernel/io_apic.c~rfc-check-nmi-watchdog-is-broken arch/x86_64/kernel/io_apic.c
--- 25/arch/x86_64/kernel/io_apic.c~rfc-check-nmi-watchdog-is-broken	2005-03-30 18:28:21.000000000 -0800
+++ 25-akpm/arch/x86_64/kernel/io_apic.c	2005-03-30 18:28:21.000000000 -0800
@@ -1607,7 +1607,6 @@ static inline void check_timer(void)
 				disable_8259A_irq(0);
 				setup_nmi();
 				enable_8259A_irq(0);
-				check_nmi_watchdog();
 			}
 			return;
 		}
@@ -1627,7 +1626,6 @@ static inline void check_timer(void)
 			nmi_watchdog_default();
 			if (nmi_watchdog == NMI_IO_APIC) {
 				setup_nmi();
-				check_nmi_watchdog();
 			}
 			return;
 		}
diff -puN arch/x86_64/kernel/nmi.c~rfc-check-nmi-watchdog-is-broken arch/x86_64/kernel/nmi.c
--- 25/arch/x86_64/kernel/nmi.c~rfc-check-nmi-watchdog-is-broken	2005-03-30 18:28:21.000000000 -0800
+++ 25-akpm/arch/x86_64/kernel/nmi.c	2005-03-30 18:28:21.000000000 -0800
@@ -112,17 +112,20 @@ static __init int cpu_has_lapic(void)
 	} 	
 }
 
-int __init check_nmi_watchdog (void)
+static int __init check_nmi_watchdog (void)
 {
 	int counts[NR_CPUS];
 	int cpu;
 
+	if (nmi_watchdog == NMI_NONE)
+		return 0;
+
 	if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic())  {
 		nmi_watchdog = NMI_NONE;
 		return -1; 
 	}	
 
-	printk(KERN_INFO "testing NMI watchdog ... ");
+	printk(KERN_INFO "Testing NMI watchdog ... ");
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++)
 		counts[cpu] = cpu_pda[cpu].__nmi_count; 
@@ -154,6 +157,8 @@ int __init check_nmi_watchdog (void)
 
 	return 0;
 }
+/* Have this called later during boot so counters are updating */
+late_initcall(check_nmi_watchdog);
 
 int __init setup_nmi_watchdog(char *str)
 {
diff -puN include/asm-i386/apic.h~rfc-check-nmi-watchdog-is-broken include/asm-i386/apic.h
--- 25/include/asm-i386/apic.h~rfc-check-nmi-watchdog-is-broken	2005-03-30 18:29:19.000000000 -0800
+++ 25-akpm/include/asm-i386/apic.h	2005-03-30 18:29:25.000000000 -0800
@@ -109,7 +109,6 @@ extern int APIC_init_uniprocessor (void)
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
 
-extern int check_nmi_watchdog (void);
 extern void enable_NMI_through_LVT0 (void * dummy);
 
 extern unsigned int nmi_watchdog;
_