From: j-nomura@ce.jp.nec.com

The attached patch is a NUMA-aware zonelist builder patch, which sorts the
zonelist so that nearer nodes come first and farther nodes last.  On lse-tech and
linux-ia64, where most of the NUMA people reside, no objections have been raised
so far.

The patch adds a NUMA-specific version of build_zonelists which calls
find_next_best_node to select the next-nearest node to add to the zonelist.

The patch has no effect on flat NUMA platforms.


---

 25-akpm/include/asm-generic/topology.h |    7 ++
 25-akpm/include/asm-i386/topology.h    |    6 +
 25-akpm/mm/page_alloc.c                |  105 +++++++++++++++++++++++++++++++++
 3 files changed, 118 insertions(+)

diff -puN include/asm-generic/topology.h~numa-aware-zonelist-builder include/asm-generic/topology.h
--- 25/include/asm-generic/topology.h~numa-aware-zonelist-builder	Mon Mar  1 15:18:30 2004
+++ 25-akpm/include/asm-generic/topology.h	Mon Mar  1 15:18:30 2004
@@ -45,6 +45,13 @@
 #define pcibus_to_cpumask(bus)	(cpu_online_map)
 #endif
 
+#ifndef node_distance
+#define node_distance(from,to)	(from != to)
+#endif
+#ifndef PENALTY_FOR_NODE_WITH_CPUS
+#define PENALTY_FOR_NODE_WITH_CPUS	(1)
+#endif
+
 /* Cross-node load balancing interval. */
 #ifndef NODE_BALANCE_RATE
 #define NODE_BALANCE_RATE 10
diff -puN include/asm-i386/topology.h~numa-aware-zonelist-builder include/asm-i386/topology.h
--- 25/include/asm-i386/topology.h~numa-aware-zonelist-builder	Mon Mar  1 15:18:30 2004
+++ 25-akpm/include/asm-i386/topology.h	Mon Mar  1 15:18:30 2004
@@ -66,6 +66,12 @@ static inline cpumask_t pcibus_to_cpumas
 	return node_to_cpumask(mp_bus_id_to_node[bus]);
 }
 
+/* Node-to-Node distance */
+static inline int node_distance(int from, int to)
+{
+	return (from != to);
+}
+
 /* Cross-node load balancing interval. */
 #define NODE_BALANCE_RATE 100
 
diff -puN mm/page_alloc.c~numa-aware-zonelist-builder mm/page_alloc.c
--- 25/mm/page_alloc.c~numa-aware-zonelist-builder	Mon Mar  1 15:18:30 2004
+++ 25-akpm/mm/page_alloc.c	Mon Mar  1 15:18:30 2004
@@ -1128,6 +1128,109 @@ static int __init build_zonelists_node(p
 	return j;
 }
 
+#ifdef CONFIG_NUMA
+#define MAX_NODE_LOAD (numnodes)
+static int __initdata node_load[MAX_NUMNODES];
+/**
+ * find_next_best_node - find the next node that should appear in a given
+ *    node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: pointer to the bitmap of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list.  The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ * It returns -1 if no node is found.
+ */
+static int __init find_next_best_node(int node, void *used_node_mask)
+{
+	int i, n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	for (i = 0; i < numnodes; i++) {
+		/* Start from local node */
+		n = (node+i)%numnodes;
+
+		/* Don't want a node to appear more than once */
+		if (test_bit(n, used_node_mask))
+			continue;
+
+		/* Use the distance array to find the distance */
+		val = node_distance(node, n);
+
+		/* Give preference to headless and unused nodes */
+		if (node_to_cpumask(n))
+			val += PENALTY_FOR_NODE_WITH_CPUS;
+
+		/* Slight preference for less loaded node */
+		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+		val += node_load[n];
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	if (best_node >= 0)
+		set_bit(best_node, used_node_mask);
+
+	return best_node;
+}
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+	int i, j, k, node, local_node;
+	int prev_node, load;
+	struct zonelist *zonelist;
+	DECLARE_BITMAP(used_mask, MAX_NUMNODES);
+
+	/* initialize zonelists */
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		memset(zonelist, 0, sizeof(*zonelist));
+		zonelist->zones[0] = NULL;
+	}
+
+	/* NUMA-aware ordering of nodes */
+	local_node = pgdat->node_id;
+	load = numnodes;
+	prev_node = local_node;
+	CLEAR_BITMAP(used_mask, MAX_NUMNODES);
+	while ((node = find_next_best_node(local_node, used_mask)) >= 0) {
+		/*
+		 * We don't want to pressure a particular node.
+		 * So adding penalty to the first node in same
+		 * distance group to make it round-robin.
+		 */
+		if (node_distance(local_node, node) !=
+				node_distance(local_node, prev_node))
+			node_load[node] += load;
+		prev_node = node;
+		load--;
+		for (i = 0; i < MAX_NR_ZONES; i++) {
+			zonelist = pgdat->node_zonelists + i;
+			for (j = 0; zonelist->zones[j] != NULL; j++);
+
+			k = ZONE_NORMAL;
+			if (i & __GFP_HIGHMEM)
+				k = ZONE_HIGHMEM;
+			if (i & __GFP_DMA)
+				k = ZONE_DMA;
+
+	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+			zonelist->zones[j] = NULL;
+		}
+	}
+}
+
+#else	/* CONFIG_NUMA */
+
 static void __init build_zonelists(pg_data_t *pgdat)
 {
 	int i, j, k, node, local_node;
@@ -1164,6 +1267,8 @@ static void __init build_zonelists(pg_da
 	} 
 }
 
+#endif	/* CONFIG_NUMA */
+
 void __init build_all_zonelists(void)
 {
 	int i;

_