diff -Nur --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/fs/proc/proc_misc.c lazy_buddy-2.5.41/fs/proc/proc_misc.c
--- linux-2.5/fs/proc/proc_misc.c	Tue Oct  8 07:54:50 2002
+++ lazy_buddy-2.5.41/fs/proc/proc_misc.c	Sat Oct  5 16:33:49 2002
@@ -133,6 +133,7 @@
 static int meminfo_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
+	extern unsigned long nr_deferred_pages(void);
 	struct sysinfo i;
 	int len, committed;
 	struct page_state ps;
@@ -170,6 +171,7 @@
 		"SwapFree:     %8lu kB\n"
 		"Dirty:        %8lu kB\n"
 		"Writeback:    %8lu kB\n"
+		"Deferred:     %8lu kB\n"
 		"Mapped:       %8lu kB\n"
 		"Slab:         %8lu kB\n"
 		"Committed_AS: %8u kB\n"
@@ -191,6 +193,7 @@
 		K(i.freeswap),
 		K(ps.nr_dirty),
 		K(ps.nr_writeback),
+		K(nr_deferred_pages()),
 		K(ps.nr_mapped),
 		K(ps.nr_slab),
 		K(committed),
diff -Nur --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/include/linux/mmzone.h lazy_buddy-2.5.41/include/linux/mmzone.h
--- linux-2.5/include/linux/mmzone.h	Tue Oct  8 07:54:53 2002
+++ lazy_buddy-2.5.41/include/linux/mmzone.h	Sat Oct  5 16:33:54 2002
@@ -25,8 +25,9 @@
 #endif
 
 struct free_area {
-	struct list_head	free_list;
-	unsigned long		*map;
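+	/*
+	 * Per-order block counts: on the buddy free list, granted to
+	 * callers, and on the deferred coalescing queue, respectively.
+	 */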
+	unsigned long		globally_free, active, locally_free;
+	struct list_head	free_list, deferred_pages;
+	unsigned long		*map;
 };
 
 struct pglist_data;
diff -Nur --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/mm/page_alloc.c lazy_buddy-2.5.41/mm/page_alloc.c
--- linux-2.5/mm/page_alloc.c	Tue Oct  8 07:54:54 2002
+++ lazy_buddy-2.5.41/mm/page_alloc.c	Tue Oct  8 07:54:07 2002
@@ -10,6 +10,7 @@
  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ *  Lazy buddy allocation, William Irwin, IBM, May 2002
  */
 
 #include <linux/config.h>
@@ -80,15 +81,10 @@
  * -- wli
  */
 
+static void FASTCALL(low_level_free(struct page *page, unsigned int order));
 void __free_pages_ok (struct page *page, unsigned int order)
 {
-	unsigned long index, page_idx, mask, flags;
-	struct free_area *area;
-	struct page *base;
-	struct zone *zone;
-
 	mod_page_state(pgfree, 1<<order);
-
 	BUG_ON(PageLRU(page));
 	BUG_ON(PagePrivate(page));
 	BUG_ON(page->mapping != NULL);
@@ -105,12 +101,22 @@
 			list_add(&page->list, &current->local_pages);
 			page->index = order;
 			current->nr_local_pages++;
-			goto out;
+			return;
 		}
 	}
 
-	zone = page_zone(page);
+	low_level_free(page, order);
+}
 
+static void FASTCALL(buddy_free(struct page *, int order));
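+/*
+ * buddy_free() is the traditional buddy coalescing step, factored out
+ * of __free_pages_ok(). Callers must hold zone->lock; the per-order
+ * globally_free counters are updated here, while zone->free_pages is
+ * now adjusted by low_level_free() and low_level_alloc().
+ */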
+static void buddy_free(struct page *page, int order)
+{
+	unsigned long index, page_idx, mask;
+	struct free_area *area;
+	struct page *base;
+	struct zone *zone;
+
+	zone = page_zone(page);
 	mask = (~0UL) << order;
 	base = zone->zone_mem_map;
 	page_idx = page - base;
@@ -119,8 +125,6 @@
 	index = page_idx >> (1 + order);
 	area = zone->free_area + order;
 
-	spin_lock_irqsave(&zone->lock, flags);
-	zone->free_pages -= mask;
 	while (mask + (1 << (MAX_ORDER-1))) {
 		struct page *buddy1, *buddy2;
 
@@ -140,15 +144,74 @@
 		BUG_ON(bad_range(zone, buddy1));
 		BUG_ON(bad_range(zone, buddy2));
 		list_del(&buddy1->list);
+		area->globally_free--;
 		mask <<= 1;
 		area++;
 		index >>= 1;
 		page_idx &= mask;
 	}
 	list_add(&(base + page_idx)->list, &area->free_list);
+	area->globally_free++;
+}
+
+/*
+ * Deferred coalescing for the page-level allocator. Each size of memory
+ * block has 3 different variables associated with it:
+ * (1) active			-- granted to a caller
+ * (2) locally free		-- on the deferred coalescing queues
+ * (3) globally free		-- marked on the buddy bitmap
+ *
+ * The algorithm must enforce the invariant that active >= locally_free.
+ * There are three distinct regimes (states) identified by the allocator:
+ *
+ * (1) lazy regime		-- area->active > area->locally_free + 1
+ *	allocate and free from deferred queues
+ * (2) reclaiming regime	-- area->active == area->locally_free + 1
+ *	allocate and free from buddy queues
+ * (3) accelerated regime	-- area->active == area->locally_free
+ *	allocate and free from buddy queues, plus free extra deferred pages
+ */
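+/*
+ * For example, with hypothetical counters for a single free_area:
+ *	active == 5, locally_free == 2: lazy, a free queues the page
+ *		on deferred_pages
+ *	active == 3, locally_free == 2: reclaiming, a free goes
+ *		straight to buddy_free()
+ *	active == 2, locally_free == 2: accelerated, a free also pushes
+ *		one extra deferred page through buddy_free()
+ */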
+static void low_level_free(struct page *page, unsigned int order)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long offset;
+	unsigned long flags;
+	struct free_area *area;
+	struct page *deferred_page;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	offset = page - zone->zone_mem_map;
+	area = &zone->free_area[order];
+
+	switch (area->active - area->locally_free) {
+		case 0:
+			/*
+			 * Free a deferred page; this is the accelerated
+			 * regime for page coalescing.
+			 */
+			if (likely(!list_empty(&area->deferred_pages))) {
+				deferred_page = list_entry(area->deferred_pages.next, struct page, list);
+				list_del(&deferred_page->list);
+				area->locally_free--;
+				buddy_free(deferred_page, order);
+			}
+			/*
+			 * Fall through and also free the page we were
+			 * originally asked to free.
+			 */
+		case 1:
+			buddy_free(page, order);
+			break;
+		default:
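+			/*
+			 * Lazy regime: defer coalescing by queueing the
+			 * page on the deferred list.
+			 */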
+			list_add(&page->list, &area->deferred_pages);
+			area->locally_free++;
+			break;
+	}
+
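+	/*
+	 * active may already be zero here, e.g. for pages freed at boot
+	 * that were never handed out by low_level_alloc(), hence the
+	 * min() rather than a plain decrement.
+	 */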
+	area->active -= min(area->active, 1UL);
+	zone->free_pages += 1UL << order;
 	spin_unlock_irqrestore(&zone->lock, flags);
-out:
-	return;
 }
 
 #define MARK_USED(index, order, area) \
@@ -165,6 +228,7 @@
 		area--;
 		high--;
 		size >>= 1;
+		area->globally_free++;
 		list_add(&page->list, &area->free_list);
 		MARK_USED(index, high, area);
 		index += size;
@@ -193,15 +257,20 @@
 	set_page_count(page, 1);
 }
 
-static struct page *rmqueue(struct zone *zone, unsigned int order)
+/*
+ * Search upward through the free lists for a block of at least the
+ * requested order, then descend again with a single pass of expand(),
+ * marking the buddy bitmap and returning the unused fragments to the
+ * lower-order free lists along the way.
+ */
+static struct page *FASTCALL(buddy_alloc(struct zone *, unsigned int));
+static struct page *buddy_alloc(struct zone *zone, unsigned int order)
 {
 	struct free_area *area = zone->free_area + order;
 	unsigned int curr_order = order;
 	struct list_head *head, *curr;
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&zone->lock, flags);
 	do {
 		head = &area->free_list;
 		curr = head->next;
@@ -215,25 +284,140 @@
 			index = page - zone->zone_mem_map;
 			if (curr_order != MAX_ORDER-1)
 				MARK_USED(index, curr_order, area);
-			zone->free_pages -= 1UL << order;
+			area->globally_free--;
 
 			page = expand(zone, page, index, order, curr_order, area);
-			spin_unlock_irqrestore(&zone->lock, flags);
 
 			if (bad_range(zone, page))
 				BUG();
-			prep_new_page(page);
 			return page;	
 		}
 		curr_order++;
 		area++;
 	} while (curr_order < MAX_ORDER);
-	spin_unlock_irqrestore(&zone->lock, flags);
 
 	return NULL;
 }
 
+/*
+ * split_pages() is much like expand(), but operates only on the
+ * deferred queues.
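+ *
+ * E.g. splitting an order-3 block to satisfy an order-0 request
+ * (page_order == 0, deferred_order == 3) queues page[4] at order 2,
+ * page[2] at order 1 and page[1] at order 0, leaving page[0] for
+ * the caller.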
+ */
+static void FASTCALL(split_pages(struct zone *, struct page *, int, int));
+static void split_pages(struct zone *zone, struct page *page,
+			int page_order, int deferred_order)
+{
+	int split_order;
+	unsigned long split_offset;
+	struct page *split_page;
+
+	split_order = deferred_order - 1;
+	split_offset = 1UL << split_order;
+	while (split_order >= page_order) {
+		split_page = &page[split_offset];
+		list_add(&split_page->list, &zone->free_area[split_order].deferred_pages);
+		zone->free_area[split_order].locally_free++;
+		--split_order;
+		split_offset >>= 1;
+	}
+}
+
+#define COALESCE_BATCH 256
+
+/*
+ * Support for on-demand splitting and coalescing from deferred queues.
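+ *
+ * An order-N request first tries to split a page from a higher-order
+ * deferred queue; failing that, batches of deferred pages from the
+ * lower orders are pushed back into the buddy lists in the hope of
+ * creating an order-N block, retrying buddy_alloc() after each order.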
+ */
+static struct page *FASTCALL(steal_deferred_page(struct zone *, int));
+static struct page *steal_deferred_page(struct zone *zone, int order)
+{
+	struct page *page;
+	struct list_head *elem;
+	struct free_area *area = zone->free_area;
+	int found_order, k;
+
+	/*
+	 * There aren't enough pages to satisfy the allocation.
+	 * Don't bother trying.
+	 */
+	if (zone->free_pages < (1 << order))
+		return NULL;
+
+	/*
+	 * Steal a page off of a higher-order deferred queue if possible.
+	 */
+	for (found_order = order+1; found_order < MAX_ORDER; ++found_order)
+		if (!list_empty(&area[found_order].deferred_pages))
+			goto found_page;
+
+	/*
+	 * Now enter the method of last resort. The contiguous block of
+	 * memory may be split into tiny pieces. To detect contiguity,
+	 * walk the deferred queues and insert pages into the buddy
+	 * bitmaps until enough adjacencies have hopefully been created
+	 * to satisfy the allocation. About the only reason to do
+	 * something this slow is that it is still faster than I/O.
+	 */
+	for (found_order = order - 1; found_order >= 0; --found_order) {
+		for (k = 0; k < COALESCE_BATCH; ++k) {
+			if (list_empty(&area[found_order].deferred_pages))
+				break;
+
+			elem = area[found_order].deferred_pages.next;
+			page = list_entry(elem, struct page, list);
+
+			list_del(&page->list);
+			area[found_order].locally_free--;
+			buddy_free(page, found_order);
+		}
+
+		page = buddy_alloc(zone, order);
+		if (page)
+			return page;
+	}
+
+	/*
+	 * One last attempt to extract a sufficiently contiguous block
+	 * from the buddy bitmaps.
+	 */
+	return buddy_alloc(zone, order);
+
+found_page:
+	elem = area[found_order].deferred_pages.next;
+	page = list_entry(elem, struct page, list);
+	list_del(&page->list);
+	area[found_order].locally_free--;
+	split_pages(zone, page, order, found_order);
+	return page;
+}
+
 #ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * In order to satisfy high-order boot-time allocations, a further pass
+ * is made at boot time to fully coalesce all pages. Furthermore, swsusp
+ * also seems to want to detect free regions by making use of full
+ * coalescing, and so the functionality is provided here.
+ */
+static void forced_coalesce(struct zone *zone)
+{
+	int order;
+	struct page *page;
+	struct free_area *area;
+	struct list_head *save, *elem;
+
+	if (!zone->present_pages)
+		return;
+
+	for (order = MAX_ORDER - 1; order >= 0; --order) {
+		area = &zone->free_area[order];
+		list_for_each_safe(elem, save, &area->deferred_pages) {
+			page = list_entry(elem, struct page, list);
+			list_del(&page->list);
+			area->locally_free--;
+			buddy_free(page, order);
+		}
+	}
+}
+
 int is_head_of_free_region(struct page *page)
 {
         struct zone *zone = page_zone(page);
@@ -246,6 +430,7 @@
 	 * suspend anyway, but...
 	 */
 	spin_lock_irqsave(&zone->lock, flags);
+	forced_coalesce(zone);
 	for (order = MAX_ORDER - 1; order >= 0; --order)
 		list_for_each(curr, &zone->free_area[order].free_list)
 			if (page == list_entry(curr, struct page, list)) {
@@ -257,6 +442,42 @@
 }
 #endif /* CONFIG_SOFTWARE_SUSPEND */
 
+/*
+ * low_level_alloc() exports free page search functionality to the
+ * internal VM allocator. It uses the strategy outlined in the comment
+ * above low_level_free() to decide where to obtain its pages.
+ */
+static struct page *FASTCALL(low_level_alloc(struct zone *, unsigned int));
+static struct page *low_level_alloc(struct zone *zone, unsigned int order)
+{
+	struct page *page;
+	unsigned long flags;
+	struct free_area *area;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	area = &zone->free_area[order];
+
+	if (likely(!list_empty(&area->deferred_pages))) {
+		page = list_entry(area->deferred_pages.next, struct page, list);
+		list_del(&page->list);
+		area->locally_free--;
+	} else
+		page = buddy_alloc(zone, order);
+	if (unlikely(!page)) {
+		page = steal_deferred_page(zone, order);
+		if (unlikely(!page))
+			goto out;
+	}
+
+	prep_new_page(page);
+	area->active++;
+	zone->free_pages -= 1UL << order;
+out:
+	spin_unlock_irqrestore(&zone->lock, flags);
+	return page;
+}
+
 static /* inline */ struct page *
 balance_classzone(struct zone* classzone, unsigned int gfp_mask,
 			unsigned int order, int * freed)
@@ -340,7 +561,7 @@
 		/* the incremental min is allegedly to discourage fallback */
 		min += z->pages_low;
 		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
+			page = low_level_alloc(z, order);
 			if (page)
 				return page;
 		}
@@ -365,7 +586,7 @@
 			local_min >>= 2;
 		min += local_min;
 		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
+			page = low_level_alloc(z, order);
 			if (page)
 				return page;
 		}
@@ -379,7 +600,7 @@
 		for (i = 0; zones[i] != NULL; i++) {
 			struct zone *z = zones[i];
 
-			page = rmqueue(z, order);
+			page = low_level_alloc(z, order);
 			if (page)
 				return page;
 		}
@@ -408,7 +629,7 @@
 
 		min += z->pages_min;
 		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
+			page = low_level_alloc(z, order);
 			if (page)
 				return page;
 		}
@@ -546,6 +767,17 @@
 }
 #endif
 
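+/*
+ * Total amount of memory (in pages) sitting on the deferred queues,
+ * reported as "Deferred:" in /proc/meminfo. The counters are summed
+ * without taking zone locks, so the figure is approximate.
+ */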
+unsigned long nr_deferred_pages(void)
+{
+	struct zone *zone;
+	unsigned long order, pages = 0;
+
+	for_each_zone(zone)
+		for (order = 0; order < MAX_ORDER; ++order)
+			pages += zone->free_area[order].locally_free << order;
+	return pages;
+}
+
 /*
  * Accumulate the page_state information across all CPUs.
  * The result is unavoidably approximate - it can change
@@ -936,6 +1168,7 @@
 		local_offset += size;
 		for (i = 0; ; i++) {
 			unsigned long bitmap_size;
+			INIT_LIST_HEAD(&zone->free_area[i].deferred_pages);
 
 			INIT_LIST_HEAD(&zone->free_area[i].free_list);
 			if (i == MAX_ORDER-1) {
@@ -1022,33 +1255,29 @@
 {
 }
 
-/* 
- * This walks the freelist for each zone. Whilst this is slow, I'd rather 
- * be slow here than slow down the fast path by keeping stats - mjbligh
+/*
+ * Render fragmentation statistics for /proc reporting. This simply
+ * formats, into the seq_file passed in, the counters maintained by
+ * the queueing discipline.
  */
 static int frag_show(struct seq_file *m, void *arg)
 {
 	pg_data_t *pgdat = (pg_data_t *)arg;
-	struct zone *zone;
-	struct zone *node_zones = pgdat->node_zones;
+	struct zone *zone, *node_zones = pgdat->node_zones;
-	unsigned long flags;
 	int order;
 
 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-		if (!zone->present_pages)
-			continue;
-
-		spin_lock_irqsave(&zone->lock, flags);
-		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
-		for (order = 0; order < MAX_ORDER; ++order) {
-			unsigned long nr_bufs = 0;
-			struct list_head *elem;
-
-			list_for_each(elem, &(zone->free_area[order].free_list))
-				++nr_bufs;
-			seq_printf(m, "%6lu ", nr_bufs);
-		}
-		spin_unlock_irqrestore(&zone->lock, flags);
+		seq_printf(m, "Node %d, zone %8s\n", pgdat->node_id, zone->name);
+		seq_puts(m, "buddy:  ");
+		for (order = 0; order < MAX_ORDER; ++order)
+			seq_printf(m, "%6lu ", zone->free_area[order].globally_free);
+		seq_puts(m, "\ndefer:  ");
+		for (order = 0; order < MAX_ORDER; ++order)
+			seq_printf(m, "%6lu ", zone->free_area[order].locally_free);
+		seq_puts(m, "\nactive: ");
+		for (order = 0; order < MAX_ORDER; ++order)
+			seq_printf(m, "%6lu ", zone->free_area[order].active);
 		seq_putc(m, '\n');
 	}
 	return 0;