diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/i386/Kconfig linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/i386/Kconfig --- linux-2.6.17-rc4-mm3-clean/arch/i386/Kconfig 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/i386/Kconfig 2006-05-22 11:28:22.000000000 +0100 @@ -580,12 +580,10 @@ config ARCH_SELECT_MEMORY_MODEL def_bool y depends on ARCH_SPARSEMEM_ENABLE -source "mm/Kconfig" +config ARCH_POPULATES_NODE_MAP + def_bool y -config HAVE_ARCH_EARLY_PFN_TO_NID - bool - default y - depends on NUMA +source "mm/Kconfig" config HIGHPTE bool "Allocate 3rd-level pagetables from highmem" diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/i386/kernel/setup.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/i386/kernel/setup.c --- linux-2.6.17-rc4-mm3-clean/arch/i386/kernel/setup.c 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/i386/kernel/setup.c 2006-05-22 11:28:22.000000000 +0100 @@ -1206,22 +1206,15 @@ static unsigned long __init setup_memory void __init zone_sizes_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned int max_dma, low; + unsigned int max_dma; +#ifndef CONFIG_HIGHMEM + unsigned long highend_pfn = max_low_pfn; +#endif max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - low = max_low_pfn; - if (low < max_dma) - zones_size[ZONE_DMA] = low; - else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; -#ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = highend_pfn - low; -#endif - } - free_area_init(zones_size); + add_active_range(0, 0, highend_pfn); + free_area_init_nodes(max_dma, max_dma, max_low_pfn, highend_pfn); } #else extern unsigned long __init setup_memory(void); diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/i386/kernel/srat.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/i386/kernel/srat.c --- linux-2.6.17-rc4-mm3-clean/arch/i386/kernel/srat.c 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/i386/kernel/srat.c 2006-05-22 11:28:22.000000000 +0100 @@ -55,8 +55,6 @@ struct node_memory_chunk_s { static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS]; static int num_memory_chunks; /* total number of memory chunks */ -static int zholes_size_init; -static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES]; extern void * boot_ioremap(unsigned long, unsigned long); @@ -136,50 +134,6 @@ static void __init parse_memory_affinity "enabled and removable" : "enabled" ) ); } -#if MAX_NR_ZONES != 4 -#error "MAX_NR_ZONES != 4, chunk_to_zone requires review" -#endif -/* Take a chunk of pages from page frame cstart to cend and count the number - * of pages in each zone, returned via zones[]. - */ -static __init void chunk_to_zones(unsigned long cstart, unsigned long cend, - unsigned long *zones) -{ - unsigned long max_dma; - extern unsigned long max_low_pfn; - - int z; - unsigned long rend; - - /* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide - * similarly scoped information and should be handled in a consistant - * manner. - */ - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - - /* Split the hole into the zones in which it falls. Repeatedly - * take the segment in which the remaining hole starts, round it - * to the end of that zone. - */ - memset(zones, 0, MAX_NR_ZONES * sizeof(long)); - while (cstart < cend) { - if (cstart < max_dma) { - z = ZONE_DMA; - rend = (cend < max_dma)? cend : max_dma; - - } else if (cstart < max_low_pfn) { - z = ZONE_NORMAL; - rend = (cend < max_low_pfn)? cend : max_low_pfn; - - } else { - z = ZONE_HIGHMEM; - rend = cend; - } - zones[z] += rend - cstart; - cstart = rend; - } -} - /* * The SRAT table always lists ascending addresses, so can always * assume that the first "start" address that you see is the real @@ -224,7 +178,6 @@ static int __init acpi20_parse_srat(stru memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */ memset(node_memory_chunk, 0, sizeof(node_memory_chunk)); - memset(zholes_size, 0, sizeof(zholes_size)); num_memory_chunks = 0; while (p < end) { @@ -288,6 +241,7 @@ static int __init acpi20_parse_srat(stru printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", j, chunk->nid, chunk->start_pfn, chunk->end_pfn); node_read_chunk(chunk->nid, chunk); + add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn); } for_each_online_node(nid) { @@ -405,57 +359,7 @@ int __init get_memcfg_from_srat(void) return acpi20_parse_srat((struct acpi_table_srat *)header); } out_err: + remove_all_active_ranges(); printk("failed to get NUMA memory information from SRAT table\n"); return 0; } - -/* For each node run the memory list to determine whether there are - * any memory holes. For each hole determine which ZONE they fall - * into. - * - * NOTE#1: this requires knowledge of the zone boundries and so - * _cannot_ be performed before those are calculated in setup_memory. - * - * NOTE#2: we rely on the fact that the memory chunks are ordered by - * start pfn number during setup. - */ -static void __init get_zholes_init(void) -{ - int nid; - int c; - int first; - unsigned long end = 0; - - for_each_online_node(nid) { - first = 1; - for (c = 0; c < num_memory_chunks; c++){ - if (node_memory_chunk[c].nid == nid) { - if (first) { - end = node_memory_chunk[c].end_pfn; - first = 0; - - } else { - /* Record any gap between this chunk - * and the previous chunk on this node - * against the zones it spans. - */ - chunk_to_zones(end, - node_memory_chunk[c].start_pfn, - &zholes_size[nid * MAX_NR_ZONES]); - } - } - } - } -} - -unsigned long * __init get_zholes_size(int nid) -{ - if (!zholes_size_init) { - zholes_size_init++; - get_zholes_init(); - } - if (nid >= MAX_NUMNODES || !node_online(nid)) - printk("%s: nid = %d is invalid/offline. num_online_nodes = %d", - __FUNCTION__, nid, num_online_nodes()); - return &zholes_size[nid * MAX_NR_ZONES]; -} diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/i386/mm/discontig.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/i386/mm/discontig.c --- linux-2.6.17-rc4-mm3-clean/arch/i386/mm/discontig.c 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/i386/mm/discontig.c 2006-05-22 11:28:22.000000000 +0100 @@ -157,21 +157,6 @@ static void __init find_max_pfn_node(int BUG(); } -/* Find the owning node for a pfn. */ -int early_pfn_to_nid(unsigned long pfn) -{ - int nid; - - for_each_node(nid) { - if (node_end_pfn[nid] == 0) - break; - if (node_start_pfn[nid] <= pfn && node_end_pfn[nid] >= pfn) - return nid; - } - - return 0; -} - /* * Allocate memory for the pg_data_t for this node via a crude pre-bootmem * method. For node zero take this from the bottom of memory, for @@ -227,6 +212,8 @@ static unsigned long calculate_numa_rema unsigned long pfn; for_each_online_node(nid) { + unsigned old_end_pfn = node_end_pfn[nid]; + /* * The acpi/srat node info can show hot-add memroy zones * where memory could be added but not currently present. @@ -276,6 +263,7 @@ static unsigned long calculate_numa_rema node_end_pfn[nid] -= size; node_remap_start_pfn[nid] = node_end_pfn[nid]; + shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); @@ -355,45 +343,20 @@ unsigned long __init setup_memory(void) void __init zone_sizes_init(void) { int nid; + unsigned long max_dma_pfn; - - for_each_online_node(nid) { - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned long *zholes_size; - unsigned int max_dma; - - unsigned long low = max_low_pfn; - unsigned long start = node_start_pfn[nid]; - unsigned long high = node_end_pfn[nid]; - - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - - if (node_has_online_mem(nid)){ - if (start > low) { -#ifdef CONFIG_HIGHMEM - BUG_ON(start > high); - zones_size[ZONE_HIGHMEM] = high - start; -#endif - } else { - if (low < max_dma) - zones_size[ZONE_DMA] = low; - else { - BUG_ON(max_dma > low); - BUG_ON(low > high); - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; -#ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; -#endif - } - } + /* If SRAT has not registered memory, register it now */ + if (find_max_pfn_with_active_regions() == 0) { + for_each_online_node(nid) { + if (node_has_online_mem(nid)) + add_active_range(nid, node_start_pfn[nid], + node_end_pfn[nid]); } - - zholes_size = get_zholes_size(nid); - - free_area_init_node(nid, NODE_DATA(nid), zones_size, start, - zholes_size); } + + max_dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + free_area_init_nodes(max_dma_pfn, max_dma_pfn, + max_low_pfn, highend_pfn); return; } diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/ia64/Kconfig linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/ia64/Kconfig --- linux-2.6.17-rc4-mm3-clean/arch/ia64/Kconfig 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/ia64/Kconfig 2006-05-22 11:30:19.000000000 +0100 @@ -353,6 +353,9 @@ config NODES_SHIFT MAX_NUMNODES will be 2^(This value). If in doubt, use the default. +config ARCH_POPULATES_NODE_MAP + def_bool y + # VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent. # VIRTUAL_MEM_MAP has been retained for historical reasons. config VIRTUAL_MEM_MAP diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/ia64/mm/contig.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/ia64/mm/contig.c --- linux-2.6.17-rc4-mm3-clean/arch/ia64/mm/contig.c 2006-05-12 00:31:53.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/ia64/mm/contig.c 2006-05-22 11:30:19.000000000 +0100 @@ -26,10 +26,6 @@ #include #include -#ifdef CONFIG_VIRTUAL_MEM_MAP -static unsigned long num_dma_physpages; -#endif - /** * show_mem - display a memory statistics summary * @@ -212,18 +208,6 @@ count_pages (u64 start, u64 end, void *a return 0; } -#ifdef CONFIG_VIRTUAL_MEM_MAP -static int -count_dma_pages (u64 start, u64 end, void *arg) -{ - unsigned long *count = arg; - - if (start < MAX_DMA_ADDRESS) - *count += (min(end, MAX_DMA_ADDRESS) - start) >> PAGE_SHIFT; - return 0; -} -#endif - /* * Set up the page tables. */ @@ -232,47 +216,24 @@ void __init paging_init (void) { unsigned long max_dma; - unsigned long zones_size[MAX_NR_ZONES]; #ifdef CONFIG_VIRTUAL_MEM_MAP - unsigned long zholes_size[MAX_NR_ZONES]; + unsigned long nid = 0; unsigned long max_gap; #endif - /* initialize mem_map[] */ - - memset(zones_size, 0, sizeof(zones_size)); - num_physpages = 0; efi_memmap_walk(count_pages, &num_physpages); max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; #ifdef CONFIG_VIRTUAL_MEM_MAP - memset(zholes_size, 0, sizeof(zholes_size)); - - num_dma_physpages = 0; - efi_memmap_walk(count_dma_pages, &num_dma_physpages); - - if (max_low_pfn < max_dma) { - zones_size[ZONE_DMA] = max_low_pfn; - zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages; - } else { - zones_size[ZONE_DMA] = max_dma; - zholes_size[ZONE_DMA] = max_dma - num_dma_physpages; - if (num_physpages > num_dma_physpages) { - zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; - zholes_size[ZONE_NORMAL] = - ((max_low_pfn - max_dma) - - (num_physpages - num_dma_physpages)); - } - } - max_gap = 0; + efi_memmap_walk(register_active_ranges, &nid); efi_memmap_walk(find_largest_hole, (u64 *)&max_gap); if (max_gap < LARGE_GAP) { vmem_map = (struct page *) 0; - free_area_init_node(0, NODE_DATA(0), zones_size, 0, - zholes_size); + free_area_init_nodes(max_dma, max_dma, + max_low_pfn, max_low_pfn); } else { unsigned long map_size; @@ -284,19 +245,14 @@ paging_init (void) efi_memmap_walk(create_mem_map_page_table, NULL); NODE_DATA(0)->node_mem_map = vmem_map; - free_area_init_node(0, NODE_DATA(0), zones_size, - 0, zholes_size); + free_area_init_nodes(max_dma, max_dma, + max_low_pfn, max_low_pfn); printk("Virtual mem_map starts at 0x%p\n", mem_map); } #else /* !CONFIG_VIRTUAL_MEM_MAP */ - if (max_low_pfn < max_dma) - zones_size[ZONE_DMA] = max_low_pfn; - else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; - } - free_area_init(zones_size); + add_active_range(0, 0, max_low_pfn); + free_area_init_nodes(max_dma, max_dma, max_low_pfn, max_low_pfn); #endif /* !CONFIG_VIRTUAL_MEM_MAP */ zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/ia64/mm/discontig.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/ia64/mm/discontig.c --- linux-2.6.17-rc4-mm3-clean/arch/ia64/mm/discontig.c 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/ia64/mm/discontig.c 2006-05-22 11:30:19.000000000 +0100 @@ -705,6 +705,7 @@ static __init int count_node_pages(unsig { unsigned long end = start + len; + add_active_range(node, start >> PAGE_SHIFT, end >> PAGE_SHIFT); mem_data[node].num_physpages += len >> PAGE_SHIFT; if (start <= __pa(MAX_DMA_ADDRESS)) mem_data[node].num_dma_physpages += @@ -729,9 +730,8 @@ static __init int count_node_pages(unsig void __init paging_init(void) { unsigned long max_dma; - unsigned long zones_size[MAX_NR_ZONES]; - unsigned long zholes_size[MAX_NR_ZONES]; unsigned long pfn_offset = 0; + unsigned long max_pfn = 0; int node; max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; @@ -748,47 +748,18 @@ void __init paging_init(void) #endif for_each_online_node(node) { - memset(zones_size, 0, sizeof(zones_size)); - memset(zholes_size, 0, sizeof(zholes_size)); - num_physpages += mem_data[node].num_physpages; - - if (mem_data[node].min_pfn >= max_dma) { - /* All of this node's memory is above ZONE_DMA */ - zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - - mem_data[node].min_pfn; - zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn - - mem_data[node].min_pfn - - mem_data[node].num_physpages; - } else if (mem_data[node].max_pfn < max_dma) { - /* All of this node's memory is in ZONE_DMA */ - zones_size[ZONE_DMA] = mem_data[node].max_pfn - - mem_data[node].min_pfn; - zholes_size[ZONE_DMA] = mem_data[node].max_pfn - - mem_data[node].min_pfn - - mem_data[node].num_dma_physpages; - } else { - /* This node has memory in both zones */ - zones_size[ZONE_DMA] = max_dma - - mem_data[node].min_pfn; - zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - - mem_data[node].num_dma_physpages; - zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - - max_dma; - zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] - - (mem_data[node].num_physpages - - mem_data[node].num_dma_physpages); - } - pfn_offset = mem_data[node].min_pfn; #ifdef CONFIG_VIRTUAL_MEM_MAP NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset; #endif - free_area_init_node(node, NODE_DATA(node), zones_size, - pfn_offset, zholes_size); + if (mem_data[node].max_pfn > max_pfn) + max_pfn = mem_data[node].max_pfn; } + free_area_init_nodes(max_dma, max_dma, max_pfn, max_pfn); + zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/ia64/mm/init.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/ia64/mm/init.c --- linux-2.6.17-rc4-mm3-clean/arch/ia64/mm/init.c 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/ia64/mm/init.c 2006-05-22 11:30:19.000000000 +0100 @@ -539,6 +539,18 @@ find_largest_hole (u64 start, u64 end, v last_end = end; return 0; } + +int __init +register_active_ranges(u64 start, u64 end, void *nid) +{ + BUG_ON(nid == NULL); + BUG_ON(*(unsigned long *)nid >= MAX_NUMNODES); + + add_active_range(*(unsigned long *)nid, + __pa(start) >> PAGE_SHIFT, + __pa(end) >> PAGE_SHIFT); + return 0; +} #endif /* CONFIG_VIRTUAL_MEM_MAP */ static int __init diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/powerpc/Kconfig linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/powerpc/Kconfig --- linux-2.6.17-rc4-mm3-clean/arch/powerpc/Kconfig 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/powerpc/Kconfig 2006-05-22 15:42:36.000000000 +0100 @@ -684,11 +684,10 @@ config ARCH_SPARSEMEM_DEFAULT def_bool y depends on SMP && PPC_PSERIES -source "mm/Kconfig" - -config HAVE_ARCH_EARLY_PFN_TO_NID +config ARCH_POPULATES_NODE_MAP def_bool y - depends on NEED_MULTIPLE_NODES + +source "mm/Kconfig" config ARCH_MEMORY_PROBE def_bool y diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/powerpc/mm/mem.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/powerpc/mm/mem.c --- linux-2.6.17-rc4-mm3-clean/arch/powerpc/mm/mem.c 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/powerpc/mm/mem.c 2006-05-22 11:27:19.000000000 +0100 @@ -257,20 +257,22 @@ void __init do_init_bootmem(void) boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages); + /* Add active regions with valid PFNs */ + for (i = 0; i < lmb.memory.cnt; i++) { + unsigned long start_pfn, end_pfn; + start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT; + end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i); + add_active_range(0, start_pfn, end_pfn); + } + /* Add all physical memory to the bootmem map, mark each area * present. */ - for (i = 0; i < lmb.memory.cnt; i++) { - unsigned long base = lmb.memory.region[i].base; - unsigned long size = lmb_size_bytes(&lmb.memory, i); #ifdef CONFIG_HIGHMEM - if (base >= total_lowmem) - continue; - if (base + size > total_lowmem) - size = total_lowmem - base; + free_bootmem_with_active_regions(0, total_lowmem >> PAGE_SHIFT); +#else + free_bootmem_with_active_regions(0, max_pfn); #endif - free_bootmem(base, size); - } /* reserve the sections we're already using */ for (i = 0; i < lmb.reserved.cnt; i++) @@ -278,9 +280,8 @@ void __init do_init_bootmem(void) lmb_size_bytes(&lmb.reserved, i)); /* XXX need to clip this if using highmem? */ - for (i = 0; i < lmb.memory.cnt; i++) - memory_present(0, lmb_start_pfn(&lmb.memory, i), - lmb_end_pfn(&lmb.memory, i)); + sparse_memory_present_with_active_regions(0); + init_bootmem_done = 1; } @@ -289,8 +290,6 @@ void __init do_init_bootmem(void) */ void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; - unsigned long zholes_size[MAX_NR_ZONES]; unsigned long total_ram = lmb_phys_mem_size(); unsigned long top_of_ram = lmb_end_of_DRAM(); @@ -308,26 +307,18 @@ void __init paging_init(void) top_of_ram, total_ram); printk(KERN_DEBUG "Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); - /* - * All pages are DMA-able so we put them all in the DMA zone. - */ - memset(zones_size, 0, sizeof(zones_size)); - memset(zholes_size, 0, sizeof(zholes_size)); - - zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; - zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT; - #ifdef CONFIG_HIGHMEM - zones_size[ZONE_DMA] = total_lowmem >> PAGE_SHIFT; - zones_size[ZONE_HIGHMEM] = (total_memory - total_lowmem) >> PAGE_SHIFT; - zholes_size[ZONE_HIGHMEM] = (top_of_ram - total_ram) >> PAGE_SHIFT; + free_area_init_nodes(total_lowmem >> PAGE_SHIFT, + total_lowmem >> PAGE_SHIFT, + total_lowmem >> PAGE_SHIFT, + top_of_ram >> PAGE_SHIFT); #else - zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; - zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT; -#endif /* CONFIG_HIGHMEM */ + free_area_init_nodes(top_of_ram >> PAGE_SHIFT, + top_of_ram >> PAGE_SHIFT, + top_of_ram >> PAGE_SHIFT, + top_of_ram >> PAGE_SHIFT); +#endif - free_area_init_node(0, NODE_DATA(0), zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size); } #endif /* ! CONFIG_NEED_MULTIPLE_NODES */ diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/powerpc/mm/numa.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/powerpc/mm/numa.c --- linux-2.6.17-rc4-mm3-clean/arch/powerpc/mm/numa.c 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/powerpc/mm/numa.c 2006-05-22 11:27:19.000000000 +0100 @@ -39,96 +39,6 @@ static bootmem_data_t __initdata plat_no static int min_common_depth; static int n_mem_addr_cells, n_mem_size_cells; -/* - * We need somewhere to store start/end/node for each region until we have - * allocated the real node_data structures. - */ -#define MAX_REGIONS (MAX_LMB_REGIONS*2) -static struct { - unsigned long start_pfn; - unsigned long end_pfn; - int nid; -} init_node_data[MAX_REGIONS] __initdata; - -int __init early_pfn_to_nid(unsigned long pfn) -{ - unsigned int i; - - for (i = 0; init_node_data[i].end_pfn; i++) { - unsigned long start_pfn = init_node_data[i].start_pfn; - unsigned long end_pfn = init_node_data[i].end_pfn; - - if ((start_pfn <= pfn) && (pfn < end_pfn)) - return init_node_data[i].nid; - } - - return -1; -} - -void __init add_region(unsigned int nid, unsigned long start_pfn, - unsigned long pages) -{ - unsigned int i; - - dbg("add_region nid %d start_pfn 0x%lx pages 0x%lx\n", - nid, start_pfn, pages); - - for (i = 0; init_node_data[i].end_pfn; i++) { - if (init_node_data[i].nid != nid) - continue; - if (init_node_data[i].end_pfn == start_pfn) { - init_node_data[i].end_pfn += pages; - return; - } - if (init_node_data[i].start_pfn == (start_pfn + pages)) { - init_node_data[i].start_pfn -= pages; - return; - } - } - - /* - * Leave last entry NULL so we dont iterate off the end (we use - * entry.end_pfn to terminate the walk). - */ - if (i >= (MAX_REGIONS - 1)) { - printk(KERN_ERR "WARNING: too many memory regions in " - "numa code, truncating\n"); - return; - } - - init_node_data[i].start_pfn = start_pfn; - init_node_data[i].end_pfn = start_pfn + pages; - init_node_data[i].nid = nid; -} - -/* We assume init_node_data has no overlapping regions */ -void __init get_region(unsigned int nid, unsigned long *start_pfn, - unsigned long *end_pfn, unsigned long *pages_present) -{ - unsigned int i; - - *start_pfn = -1UL; - *end_pfn = *pages_present = 0; - - for (i = 0; init_node_data[i].end_pfn; i++) { - if (init_node_data[i].nid != nid) - continue; - - *pages_present += init_node_data[i].end_pfn - - init_node_data[i].start_pfn; - - if (init_node_data[i].start_pfn < *start_pfn) - *start_pfn = init_node_data[i].start_pfn; - - if (init_node_data[i].end_pfn > *end_pfn) - *end_pfn = init_node_data[i].end_pfn; - } - - /* We didnt find a matching region, return start/end as 0 */ - if (*start_pfn == -1UL) - *start_pfn = 0; -} - static void __cpuinit map_cpu_to_node(int cpu, int node) { numa_cpu_lookup_table[cpu] = node; @@ -471,8 +381,8 @@ new_range: continue; } - add_region(nid, start >> PAGE_SHIFT, - size >> PAGE_SHIFT); + add_active_range(nid, start >> PAGE_SHIFT, + (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT)); if (--ranges) goto new_range; @@ -485,6 +395,7 @@ static void __init setup_nonnuma(void) { unsigned long top_of_ram = lmb_end_of_DRAM(); unsigned long total_ram = lmb_phys_mem_size(); + unsigned long start_pfn, end_pfn; unsigned int i; printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", @@ -492,9 +403,11 @@ static void __init setup_nonnuma(void) printk(KERN_DEBUG "Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); - for (i = 0; i < lmb.memory.cnt; ++i) - add_region(0, lmb.memory.region[i].base >> PAGE_SHIFT, - lmb_size_pages(&lmb.memory, i)); + for (i = 0; i < lmb.memory.cnt; ++i) { + start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT; + end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i); + add_active_range(0, start_pfn, end_pfn); + } node_set_online(0); } @@ -632,11 +545,11 @@ void __init do_init_bootmem(void) (void *)(unsigned long)boot_cpuid); for_each_online_node(nid) { - unsigned long start_pfn, end_pfn, pages_present; + unsigned long start_pfn, end_pfn; unsigned long bootmem_paddr; unsigned long bootmap_pages; - get_region(nid, &start_pfn, &end_pfn, &pages_present); + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); /* Allocate the node structure node local if possible */ NODE_DATA(nid) = careful_allocation(nid, @@ -669,19 +582,7 @@ void __init do_init_bootmem(void) init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT, start_pfn, end_pfn); - /* Add free regions on this node */ - for (i = 0; init_node_data[i].end_pfn; i++) { - unsigned long start, end; - - if (init_node_data[i].nid != nid) - continue; - - start = init_node_data[i].start_pfn << PAGE_SHIFT; - end = init_node_data[i].end_pfn << PAGE_SHIFT; - - dbg("free_bootmem %lx %lx\n", start, end - start); - free_bootmem_node(NODE_DATA(nid), start, end - start); - } + free_bootmem_with_active_regions(nid, end_pfn); /* Mark reserved regions on this node */ for (i = 0; i < lmb.reserved.cnt; i++) { @@ -712,44 +613,14 @@ void __init do_init_bootmem(void) } } - /* Add regions into sparsemem */ - for (i = 0; init_node_data[i].end_pfn; i++) { - unsigned long start, end; - - if (init_node_data[i].nid != nid) - continue; - - start = init_node_data[i].start_pfn; - end = init_node_data[i].end_pfn; - - memory_present(nid, start, end); - } + sparse_memory_present_with_active_regions(nid); } } void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; - unsigned long zholes_size[MAX_NR_ZONES]; - int nid; - - memset(zones_size, 0, sizeof(zones_size)); - memset(zholes_size, 0, sizeof(zholes_size)); - - for_each_online_node(nid) { - unsigned long start_pfn, end_pfn, pages_present; - - get_region(nid, &start_pfn, &end_pfn, &pages_present); - - zones_size[ZONE_DMA] = end_pfn - start_pfn; - zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - pages_present; - - dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid, - zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]); - - free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn, - zholes_size); - } + unsigned long end_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT; + free_area_init_nodes(end_pfn, end_pfn, end_pfn, end_pfn); } static int __init early_numa(char *p) diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/ppc/Kconfig linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/ppc/Kconfig --- linux-2.6.17-rc4-mm3-clean/arch/ppc/Kconfig 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/ppc/Kconfig 2006-05-22 11:27:19.000000000 +0100 @@ -949,6 +949,9 @@ config NR_CPUS config HIGHMEM bool "High memory support" +config ARCH_POPULATES_NODE_MAP + def_bool y + source kernel/Kconfig.hz source kernel/Kconfig.preempt source "mm/Kconfig" diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/ppc/mm/init.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/ppc/mm/init.c --- linux-2.6.17-rc4-mm3-clean/arch/ppc/mm/init.c 2006-05-12 00:31:53.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/ppc/mm/init.c 2006-05-22 11:27:19.000000000 +0100 @@ -359,8 +359,7 @@ void __init do_init_bootmem(void) */ void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES], i; - + unsigned long start_pfn, end_pfn; #ifdef CONFIG_HIGHMEM map_page(PKMAP_BASE, 0, 0); /* XXX gross */ pkmap_page_table = pte_offset_kernel(pmd_offset(pgd_offset_k @@ -370,19 +369,22 @@ void __init paging_init(void) (KMAP_FIX_BEGIN), KMAP_FIX_BEGIN), KMAP_FIX_BEGIN); kmap_prot = PAGE_KERNEL; #endif /* CONFIG_HIGHMEM */ - - /* - * All pages are DMA-able so we put them all in the DMA zone. - */ - zones_size[ZONE_DMA] = total_lowmem >> PAGE_SHIFT; - for (i = 1; i < MAX_NR_ZONES; i++) - zones_size[i] = 0; + /* All pages are DMA-able so we put them all in the DMA zone. */ + start_pfn = __pa(PAGE_OFFSET) >> PAGE_SHIFT; + end_pfn = start_pfn + (total_memory >> PAGE_SHIFT); + add_active_range(0, start_pfn, end_pfn); #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = (total_memory - total_lowmem) >> PAGE_SHIFT; + free_area_init_nodes(total_lowmem >> PAGE_SHIFT, + total_lowmem >> PAGE_SHIFT, + total_lowmem >> PAGE_SHIFT, + total_memory >> PAGE_SHIFT); +#else + free_area_init_nodes(total_memory >> PAGE_SHIFT, + total_memory >> PAGE_SHIFT, + total_memory >> PAGE_SHIFT, + total_memory >> PAGE_SHIFT); #endif /* CONFIG_HIGHMEM */ - - free_area_init(zones_size); } void __init mem_init(void) diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/x86_64/Kconfig linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/Kconfig --- linux-2.6.17-rc4-mm3-clean/arch/x86_64/Kconfig 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/Kconfig 2006-05-22 11:29:22.000000000 +0100 @@ -73,6 +73,9 @@ config ARCH_MAY_HAVE_PC_FDC bool default y +config ARCH_POPULATES_NODE_MAP + def_bool y + config DMI bool default y diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/x86_64/kernel/e820.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/kernel/e820.c --- linux-2.6.17-rc4-mm3-clean/arch/x86_64/kernel/e820.c 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/kernel/e820.c 2006-05-24 12:19:20.000000000 +0100 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -160,58 +161,14 @@ unsigned long __init find_e820_area(unsi return -1UL; } -/* - * Free bootmem based on the e820 table for a node. - */ -void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long last, addr; - - if (ei->type != E820_RAM || - ei->addr+ei->size <= start || - ei->addr >= end) - continue; - - addr = round_up(ei->addr, PAGE_SIZE); - if (addr < start) - addr = start; - - last = round_down(ei->addr + ei->size, PAGE_SIZE); - if (last >= end) - last = end; - - if (last > addr && last-addr >= PAGE_SIZE) - free_bootmem_node(pgdat, addr, last-addr); - } -} - /* * Find the highest page frame number we have available */ unsigned long __init e820_end_of_ram(void) { - int i; unsigned long end_pfn = 0; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long start, end; - - start = round_up(ei->addr, PAGE_SIZE); - end = round_down(ei->addr + ei->size, PAGE_SIZE); - if (start >= end) - continue; - if (ei->type == E820_RAM) { - if (end > end_pfn<>PAGE_SHIFT; - } else { - if (end > end_pfn_map<>PAGE_SHIFT; - } - } + end_pfn = find_max_pfn_with_active_regions(); if (end_pfn > end_pfn_map) end_pfn_map = end_pfn; @@ -222,43 +179,10 @@ unsigned long __init e820_end_of_ram(voi if (end_pfn > end_pfn_map) end_pfn = end_pfn_map; + printk("end_pfn_map = %lu\n", end_pfn_map); return end_pfn; } -/* - * Compute how much memory is missing in a range. - * Unlike the other functions in this file the arguments are in page numbers. - */ -unsigned long __init -e820_hole_size(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long ram = 0; - unsigned long start = start_pfn << PAGE_SHIFT; - unsigned long end = end_pfn << PAGE_SHIFT; - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long last, addr; - - if (ei->type != E820_RAM || - ei->addr+ei->size <= start || - ei->addr >= end) - continue; - - addr = round_up(ei->addr, PAGE_SIZE); - if (addr < start) - addr = start; - - last = round_down(ei->addr + ei->size, PAGE_SIZE); - if (last >= end) - last = end; - - if (last > addr) - ram += last - addr; - } - return ((end - start) - ram) >> PAGE_SHIFT; -} - /* * Mark e820 reserved areas as busy for the resource manager. */ @@ -293,6 +217,43 @@ void __init e820_reserve_resources(void) } } +/* Walk the e820 map and register active regions within a node */ +void __init +e820_register_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + int i; + unsigned long ei_startpfn, ei_endpfn; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; + ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) + >> PAGE_SHIFT; + + /* Skip map entries smaller than a page */ + if (ei_startpfn > ei_endpfn) + continue; + + /* Check if end_pfn_map should be updated */ + if (ei->type != E820_RAM && ei_endpfn > end_pfn_map) + end_pfn_map = ei_endpfn; + + /* Skip if map is outside the node */ + if (ei->type != E820_RAM || + ei_endpfn <= start_pfn || + ei_startpfn >= end_pfn) + continue; + + /* Check for overlaps */ + if (ei_startpfn < start_pfn) + ei_startpfn = start_pfn; + if (ei_endpfn > end_pfn) + ei_endpfn = end_pfn; + + add_active_range(nid, ei_startpfn, ei_endpfn); + } +} + /* * Add a memory region to the kernel e820 map. */ diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/x86_64/kernel/setup.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/kernel/setup.c --- linux-2.6.17-rc4-mm3-clean/arch/x86_64/kernel/setup.c 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/kernel/setup.c 2006-05-22 11:29:22.000000000 +0100 @@ -472,7 +472,8 @@ contig_initmem_init(unsigned long start_ if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n",bootmap_size); bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); - e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); + e820_register_active_regions(0, start_pfn, end_pfn); + free_bootmem_with_active_regions(0, end_pfn); reserve_bootmem(bootmap, bootmap_size); } #endif @@ -646,6 +647,7 @@ void __init setup_arch(char **cmdline_p) early_identify_cpu(&boot_cpu_data); + e820_register_active_regions(0, 0, -1UL); /* * partially used pages are not usable - thus * we are rounding upwards: @@ -671,6 +673,9 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif + /* Remove active ranges so rediscovery with NUMA-awareness happens */ + remove_all_active_ranges(); + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/x86_64/mm/init.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/mm/init.c --- linux-2.6.17-rc4-mm3-clean/arch/x86_64/mm/init.c 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/mm/init.c 2006-05-22 11:29:22.000000000 +0100 @@ -406,69 +406,12 @@ void __cpuinit zap_low_mappings(int cpu) __flush_tlb_all(); } -/* Compute zone sizes for the DMA and DMA32 zones in a node. */ -__init void -size_zones(unsigned long *z, unsigned long *h, - unsigned long start_pfn, unsigned long end_pfn) -{ - int i; - unsigned long w; - - for (i = 0; i < MAX_NR_ZONES; i++) - z[i] = 0; - - if (start_pfn < MAX_DMA_PFN) - z[ZONE_DMA] = MAX_DMA_PFN - start_pfn; - if (start_pfn < MAX_DMA32_PFN) { - unsigned long dma32_pfn = MAX_DMA32_PFN; - if (dma32_pfn > end_pfn) - dma32_pfn = end_pfn; - z[ZONE_DMA32] = dma32_pfn - start_pfn; - } - z[ZONE_NORMAL] = end_pfn - start_pfn; - - /* Remove lower zones from higher ones. */ - w = 0; - for (i = 0; i < MAX_NR_ZONES; i++) { - if (z[i]) - z[i] -= w; - w += z[i]; - } - - /* Compute holes */ - w = start_pfn; - for (i = 0; i < MAX_NR_ZONES; i++) { - unsigned long s = w; - w += z[i]; - h[i] = e820_hole_size(s, w); - } - - /* Add the space pace needed for mem_map to the holes too. */ - for (i = 0; i < MAX_NR_ZONES; i++) - h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE; - - /* The 16MB DMA zone has the kernel and other misc mappings. - Account them too */ - if (h[ZONE_DMA]) { - h[ZONE_DMA] += dma_reserve; - if (h[ZONE_DMA] >= z[ZONE_DMA]) { - printk(KERN_WARNING - "Kernel too large and filling up ZONE_DMA?\n"); - h[ZONE_DMA] = z[ZONE_DMA]; - } - } -} - #ifndef CONFIG_NUMA void __init paging_init(void) { - unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES]; - memory_present(0, 0, end_pfn); sparse_init(); - size_zones(zones, holes, 0, end_pfn); - free_area_init_node(0, NODE_DATA(0), zones, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); + free_area_init_nodes(MAX_DMA_PFN, MAX_DMA32_PFN, end_pfn, end_pfn); } #endif @@ -620,7 +563,8 @@ void __init mem_init(void) #else totalram_pages = free_all_bootmem(); #endif - reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn); + reservedpages = end_pfn - totalram_pages - + absent_pages_in_range(0, end_pfn); after_bootmem = 1; diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/x86_64/mm/k8topology.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/mm/k8topology.c --- linux-2.6.17-rc4-mm3-clean/arch/x86_64/mm/k8topology.c 2006-05-12 00:31:53.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/mm/k8topology.c 2006-05-22 11:29:22.000000000 +0100 @@ -146,6 +146,9 @@ int __init k8_scan_nodes(unsigned long s nodes[nodeid].start = base; nodes[nodeid].end = limit; + e820_register_active_regions(nodeid, + nodes[nodeid].start >> PAGE_SHIFT, + nodes[nodeid].end >> PAGE_SHIFT); prevbase = base; diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/x86_64/mm/numa.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/mm/numa.c --- linux-2.6.17-rc4-mm3-clean/arch/x86_64/mm/numa.c 2006-05-12 00:31:53.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/mm/numa.c 2006-05-22 11:29:22.000000000 +0100 @@ -161,7 +161,7 @@ void __init setup_node_bootmem(int nodei bootmap_start >> PAGE_SHIFT, start_pfn, end_pfn); - e820_bootmem_free(NODE_DATA(nodeid), start, end); + free_bootmem_with_active_regions(nodeid, end); reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<> PAGE_SHIFT, + nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } numa_init_array(); return 0; } @@ -299,6 +296,7 @@ void __init numa_initmem_init(unsigned l for (i = 0; i < NR_CPUS; i++) numa_set_node(i, 0); node_to_cpumask[0] = cpumask_of_cpu(0); + e820_register_active_regions(0, start_pfn, end_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } @@ -346,6 +344,8 @@ void __init paging_init(void) for_each_online_node(i) { setup_node_zones(i); } + + free_area_init_nodes(MAX_DMA_PFN, MAX_DMA32_PFN, end_pfn, end_pfn); } /* [numa=off] */ diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/arch/x86_64/mm/srat.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/mm/srat.c --- linux-2.6.17-rc4-mm3-clean/arch/x86_64/mm/srat.c 2006-05-22 11:25:40.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/arch/x86_64/mm/srat.c 2006-05-22 11:29:22.000000000 +0100 @@ -91,6 +91,7 @@ static __init void bad_srat(void) apicid_to_node[i] = NUMA_NO_NODE; for (i = 0; i < MAX_NUMNODES; i++) nodes_add[i].start = nodes[i].end = 0; + remove_all_active_ranges(); } static __init inline int srat_disabled(void) @@ -173,7 +174,7 @@ static int hotadd_enough_memory(struct b if (mem < 0) return 0; - allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE; + allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE; allowed = (allowed / 100) * hotadd_percent; if (allocated + mem > allowed) { unsigned long range; @@ -223,7 +224,7 @@ static int reserve_hotadd(int node, unsi } /* This check might be a bit too strict, but I'm keeping it for now. */ - if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) { + if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { printk(KERN_ERR "SRAT: Hotplug area has existing memory\n"); return -1; } @@ -317,6 +318,8 @@ acpi_numa_memory_affinity_init(struct ac printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, nd->start, nd->end); + e820_register_active_regions(node, nd->start >> PAGE_SHIFT, + nd->end >> PAGE_SHIFT); #ifdef RESERVE_HOTADD if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) { @@ -341,13 +344,13 @@ static int nodes_cover_memory(void) unsigned long s = nodes[i].start >> PAGE_SHIFT; unsigned long e = nodes[i].end >> PAGE_SHIFT; pxmram += e - s; - pxmram -= e820_hole_size(s, e); + pxmram -= absent_pages_in_range(s, e); pxmram -= nodes_add[i].end - nodes_add[i].start; if ((long)pxmram < 0) pxmram = 0; } - e820ram = end_pfn - e820_hole_size(0, end_pfn); + e820ram = end_pfn - absent_pages_in_range(0, end_pfn); /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ if ((long)(e820ram - pxmram) >= 1*1024*1024) { printk(KERN_ERR diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/include/asm-ia64/meminit.h linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/include/asm-ia64/meminit.h --- linux-2.6.17-rc4-mm3-clean/include/asm-ia64/meminit.h 2006-05-22 11:25:42.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/include/asm-ia64/meminit.h 2006-05-22 11:30:19.000000000 +0100 @@ -55,6 +55,7 @@ extern void efi_memmap_init(unsigned lon extern unsigned long vmalloc_end; extern struct page *vmem_map; extern int find_largest_hole (u64 start, u64 end, void *arg); + extern int register_active_ranges (u64 start, u64 end, void *arg); extern int create_mem_map_page_table (u64 start, u64 end, void *arg); #endif diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/include/asm-x86_64/e820.h linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/include/asm-x86_64/e820.h --- linux-2.6.17-rc4-mm3-clean/include/asm-x86_64/e820.h 2006-05-12 00:31:53.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/include/asm-x86_64/e820.h 2006-05-22 11:29:22.000000000 +0100 @@ -50,10 +50,9 @@ extern void e820_print_map(char *who); extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type); extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type); -extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end); extern void e820_setup_gap(void); -extern unsigned long e820_hole_size(unsigned long start_pfn, - unsigned long end_pfn); +extern void e820_register_active_regions(int nid, + unsigned long start_pfn, unsigned long end_pfn); extern void __init parse_memopt(char *p, char **end); extern void __init parse_memmapopt(char *p, char **end); diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/include/asm-x86_64/proto.h linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/include/asm-x86_64/proto.h --- linux-2.6.17-rc4-mm3-clean/include/asm-x86_64/proto.h 2006-05-22 11:25:42.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/include/asm-x86_64/proto.h 2006-05-22 11:29:22.000000000 +0100 @@ -24,8 +24,6 @@ extern void mtrr_bp_init(void); #define mtrr_bp_init() do {} while (0) #endif extern void init_memory_mapping(unsigned long start, unsigned long end); -extern void size_zones(unsigned long *z, unsigned long *h, - unsigned long start_pfn, unsigned long end_pfn); extern void system_call(void); extern int kernel_syscall(void); diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/include/linux/mm.h linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/include/linux/mm.h --- linux-2.6.17-rc4-mm3-clean/include/linux/mm.h 2006-05-22 11:25:42.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/include/linux/mm.h 2006-05-24 10:43:31.000000000 +0100 @@ -922,6 +922,51 @@ extern void free_area_init(unsigned long extern void free_area_init_node(int nid, pg_data_t *pgdat, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +/* + * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its + * zones, allocate the backing mem_map and account for memory holes in an + * architecture independent manner. This is a substitute for creating the + * zone_sizes[] and zholes_size[] arrays and passing them to + * free_area_init_node() + * + * An architecture is expected to register range of page frames backed by + * physical memory with add_active_range() before calling + * free_area_init_nodes() passing in the PFN each zone ends at. At a basic + * usage, an architecture is expected to do something like + * + * for_each_valid_physical_page_range() + * add_active_range(node_id, start_pfn, end_pfn) + * free_area_init_nodes(max_dma, max_dma32, max_normal_pfn, max_highmem_pfn); + * + * If the architecture guarantees that there are no holes in the ranges + * registered with add_active_range(), free_bootmem_active_regions() + * will call free_bootmem_node() for each registered physical page range. + * Similarly sparse_memory_present_with_active_regions() calls + * memory_present() for each range when SPARSEMEM is enabled. + * + * See mm/page_alloc.c for more information on each function exposed by + * CONFIG_ARCH_POPULATES_NODE_MAP + */ +extern void free_area_init_nodes(unsigned long max_dma_pfn, + unsigned long max_dma32_pfn, + unsigned long max_low_pfn, + unsigned long max_high_pfn); +extern void add_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn); +extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn, + unsigned long new_end_pfn); +extern void remove_all_active_ranges(void); +extern unsigned long absent_pages_in_range(unsigned long start_pfn, + unsigned long end_pfn); +extern void get_pfn_range_for_nid(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn); +extern unsigned long find_min_pfn_with_active_regions(void); +extern unsigned long find_max_pfn_with_active_regions(void); +extern void free_bootmem_with_active_regions(int nid, + unsigned long max_low_pfn); +extern void sparse_memory_present_with_active_regions(int nid); +#endif extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); extern void setup_per_zone_pages_min(void); extern void mem_init(void); diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/include/linux/mmzone.h linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/include/linux/mmzone.h --- linux-2.6.17-rc4-mm3-clean/include/linux/mmzone.h 2006-05-22 11:25:42.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/include/linux/mmzone.h 2006-05-22 11:26:26.000000000 +0100 @@ -271,6 +271,13 @@ struct zonelist { struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited }; +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +struct node_active_region { + unsigned long start_pfn; + unsigned long end_pfn; + int nid; +}; +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ /* * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM @@ -477,7 +484,8 @@ extern struct zone *next_zone(struct zon #endif -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ + !defined(CONFIG_ARCH_POPULATES_NODE_MAP) #define early_pfn_to_nid(nid) (0UL) #endif diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.17-rc4-mm3-clean/mm/page_alloc.c linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/mm/page_alloc.c --- linux-2.6.17-rc4-mm3-clean/mm/page_alloc.c 2006-05-22 11:25:42.000000000 +0100 +++ linux-2.6.17-rc4-mm3-105-ia64_use_init_nodes/mm/page_alloc.c 2006-05-24 11:07:09.000000000 +0100 @@ -38,6 +38,8 @@ #include #include #include +#include +#include #include #include @@ -87,6 +89,33 @@ int min_free_kbytes = 1024; unsigned long __meminitdata nr_kernel_pages; unsigned long __meminitdata nr_all_pages; +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + /* + * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct + * ranges of memory that may be registered with add_active_range(). + * Ranges passed to add_active_range() will be merged if possible + * so the number of times add_active_range() can be called is + * related to the number of nodes and the number of holes + */ + #ifdef CONFIG_MAX_ACTIVE_REGIONS + /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ + #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS + #else + #if MAX_NUMNODES >= 128 + /* If there can be many nodes, allow up to 50 holes per node */ + #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) + #else + /* By default, allow up to 256 distinct regions */ + #define MAX_ACTIVE_REGIONS 256 + #endif + #endif + + struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; + int __initdata nr_nodemap_entries; + unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; + unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -1872,25 +1901,6 @@ static inline unsigned long wait_table_b #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) -static void __init calculate_zone_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) -{ - unsigned long realtotalpages, totalpages = 0; - int i; - - for (i = 0; i < MAX_NR_ZONES; i++) - totalpages += zones_size[i]; - pgdat->node_spanned_pages = totalpages; - - realtotalpages = totalpages; - if (zholes_size) - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= zholes_size[i]; - pgdat->node_present_pages = realtotalpages; - printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); -} - - /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is @@ -2208,6 +2218,271 @@ __meminit int init_currently_empty_zone( return 0; } +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +/* + * Basic iterator support. Return the first range of PFNs for a node + * Note: nid == MAX_NUMNODES returns first region regardless of node + */ +static int __init first_active_region_index_in_nid(int nid) +{ + int i; + for (i = 0; i < nr_nodemap_entries; i++) { + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + } + + return -1; +} + +/* + * Basic iterator support. Return the next active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns next region regardles of node + */ +static int __init next_active_region_index_in_nid(int index, int nid) +{ + for (index = index + 1; index < nr_nodemap_entries; index++) { + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + } + + return -1; +} + +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +/* + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. + * Architectures may implement their own version but if add_active_range() + * was used and there are no special requirements, this is a convenient + * alternative + */ +int __init early_pfn_to_nid(unsigned long pfn) +{ + int i; + + for (i = 0; i < nr_nodemap_entries; i++) { + unsigned long start_pfn = early_node_map[i].start_pfn; + unsigned long end_pfn = early_node_map[i].end_pfn; + + if ((start_pfn <= pfn) && (pfn < end_pfn)) + return early_node_map[i].nid; + } + + return 0; +} +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ + +/* Basic iterator support to walk early_node_map[] */ +#define for_each_active_range_index_in_nid(i, nid) \ + for (i = first_active_region_index_in_nid(nid); \ + i != -1; \ + i = next_active_region_index_in_nid(i, nid)) + +/** + * free_bootmem_with_active_regions - Call free_bootmem_node for each active range + * @nid: The node to free memory on + * @max_low_pfn: The highest PFN that till be passed to free_bootmem_node + * + * If an architecture guarantees that all ranges registered with + * add_active_ranges() contain no holes and may be freed, this + * this function may be used instead of calling free_bootmem() manually. + */ +void __init free_bootmem_with_active_regions(int nid, + unsigned long max_low_pfn) +{ + int i; + for_each_active_range_index_in_nid(i, nid) { + unsigned long size_pages = 0; + unsigned long end_pfn = early_node_map[i].end_pfn; + if (early_node_map[i].start_pfn >= max_low_pfn) + continue; + + if (end_pfn > max_low_pfn) + end_pfn = max_low_pfn; + + size_pages = end_pfn - early_node_map[i].start_pfn; + free_bootmem_node(NODE_DATA(early_node_map[i].nid), + PFN_PHYS(early_node_map[i].start_pfn), + size_pages << PAGE_SHIFT); + } +} + +/** + * sparse_memory_present_with_active_regions - Call memory_present for each active range + * @nid: The node to call memory_present for + * + * If an architecture guarantees that all ranges registered with + * add_active_ranges() contain no holes and may be freed, this + * this function may be used instead of calling memory_present() manually. + */ +void __init sparse_memory_present_with_active_regions(int nid) +{ + int i; + for_each_active_range_index_in_nid(i, nid) + memory_present(early_node_map[i].nid, + early_node_map[i].start_pfn, + early_node_map[i].end_pfn); +} + +/** + * get_pfn_range_for_nid - Return the start and end page frames for a node + * @nid: The nid to return the range for + * @start_pfn: Passed by reference. On return, it will have the node start_pfn + * @end_pfn: Passed by reference. On return, it will have the node end_pfn + * + * It returns the start and end page frame of a node based on information + * provided by an arch calling add_active_range(). If called for a node + * with no available memory, a warning is printed and the start and end + * PFNs will be 0 + */ +void __init get_pfn_range_for_nid(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + int i; + *start_pfn = -1UL; + *end_pfn = 0; + + for_each_active_range_index_in_nid(i, nid) { + *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); + *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); + } + + if (*start_pfn == -1UL) { + printk(KERN_WARNING "Node %u active with no memory\n", nid); + *start_pfn = 0; + } +} + +/* + * Return the number of pages a zone spans in a node, including holes + * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() + */ +unsigned long __init zone_spanned_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *ignored) +{ + unsigned long node_start_pfn, node_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; + + /* Get the start and end of the node and zone */ + get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; + zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + + /* Check that this node has pages within the zone's required range */ + if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) + return 0; + + /* Move the zone boundaries inside the node if necessary */ + zone_end_pfn = min(zone_end_pfn, node_end_pfn); + zone_start_pfn = max(zone_start_pfn, node_start_pfn); + + /* Return the spanned pages */ + return zone_end_pfn - zone_start_pfn; +} + +/* + * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, + * then all holes in the requested range will be accounted for + */ +unsigned long __init __absent_pages_in_range(int nid, + unsigned long range_start_pfn, + unsigned long range_end_pfn) +{ + int i = 0; + unsigned long prev_end_pfn = 0, hole_pages = 0; + unsigned long start_pfn; + + /* Find the end_pfn of the first active range of pfns in the node */ + i = first_active_region_index_in_nid(nid); + if (i == -1) + return 0; + prev_end_pfn = early_node_map[i].start_pfn; + + /* Find all holes for the zone within the node */ + for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { + + /* No need to continue if prev_end_pfn is outside the zone */ + if (prev_end_pfn >= range_end_pfn) + break; + + /* Make sure the end of the zone is not within the hole */ + start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); + prev_end_pfn = max(prev_end_pfn, range_start_pfn); + + /* Update the hole size cound and move on */ + if (start_pfn > range_start_pfn) { + BUG_ON(prev_end_pfn > start_pfn); + hole_pages += start_pfn - prev_end_pfn; + } + prev_end_pfn = early_node_map[i].end_pfn; + } + + return hole_pages; +} + +/** + * absent_pages_in_range - Return number of page frames in holes within a range + * @start_pfn: The start PFN to start searching for holes + * @end_pfn: The end PFN to stop searching for holes + * + * It returns the number of pages frames in memory holes within a range + */ +unsigned long __init absent_pages_in_range(unsigned long start_pfn, + unsigned long end_pfn) +{ + return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); +} + +/* Return the number of page frames in holes in a zone on a node */ +unsigned long __init zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *ignored) +{ + return __absent_pages_in_range(nid, + arch_zone_lowest_possible_pfn[zone_type], + arch_zone_highest_possible_pfn[zone_type]); +} +#else +static inline unsigned long zone_spanned_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *zones_size) +{ + return zones_size[zone_type]; +} + +static inline unsigned long zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *zholes_size) +{ + if (!zholes_size) + return 0; + + return zholes_size[zone_type]; +} +#endif + +static void __init calculate_node_totalpages(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) { + totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, + zones_size); + } + pgdat->node_spanned_pages = totalpages; + + realtotalpages = totalpages; + for (i = 0; i < MAX_NR_ZONES; i++) { + realtotalpages -= + zone_absent_pages_in_node(pgdat->node_id, i, zholes_size); + } + pgdat->node_present_pages = realtotalpages; + printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, + realtotalpages); +} + /* * Set up the zone data structures: * - mark all pages reserved @@ -2231,10 +2506,9 @@ static void __meminit free_area_init_cor struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize; - realsize = size = zones_size[j]; - if (zholes_size) - realsize -= zholes_size[j]; - + size = zone_spanned_pages_in_node(nid, j, zones_size); + realsize = size - zone_absent_pages_in_node(nid, j, + zholes_size); if (j < ZONE_HIGHMEM) nr_kernel_pages += realsize; nr_all_pages += realsize; @@ -2325,13 +2599,243 @@ void __meminit free_area_init_node(int n { pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - calculate_zone_totalpages(pgdat, zones_size, zholes_size); + calculate_node_totalpages(pgdat, zones_size, zholes_size); alloc_node_mem_map(pgdat); free_area_init_core(pgdat, zones_size, zholes_size); } +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +/** + * add_active_range - Register a range of PFNs backed by physical memory + * @nid: The node ID the range resides on + * @start_pfn: The start PFN of the available physical memory + * @end_pfn: The end PFN of the available physical memory + * + * These ranges are stored in an early_node_map[] and later used by + * free_area_init_nodes() to calculate zone sizes and holes. If the + * range spans a memory hole, it is up to the architecture to ensure + * the memory is not freed by the bootmem allocator. If possible + * the range being registered will be merged with existing ranges. + */ +void __init add_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + int i; + printk("Entering add_active_range(%d, %lu, %lu) " + "%d entries of %d used\n", + nid, start_pfn, end_pfn, + nr_nodemap_entries, MAX_ACTIVE_REGIONS); + + /* Merge with existing active regions if possible */ + for (i = 0; i < nr_nodemap_entries; i++) { + if (early_node_map[i].nid != nid) + continue; + + /* Skip if an existing region covers this new one */ + if (start_pfn >= early_node_map[i].start_pfn && + end_pfn <= early_node_map[i].end_pfn) { + return; + } + + /* Merge forward if suitable */ + if (start_pfn <= early_node_map[i].end_pfn && + end_pfn > early_node_map[i].end_pfn) { + early_node_map[i].end_pfn = end_pfn; + return; + } + + /* Merge backward if suitable */ + if (start_pfn < early_node_map[i].end_pfn && + end_pfn >= early_node_map[i].start_pfn) { + early_node_map[i].start_pfn = start_pfn; + return; + } + } + + /* Check that early_node_map is large enough */ + if (i >= MAX_ACTIVE_REGIONS) { + printk(KERN_CRIT "More than %d memory regions, truncating\n", + MAX_ACTIVE_REGIONS); + return; + } + + early_node_map[i].nid = nid; + early_node_map[i].start_pfn = start_pfn; + early_node_map[i].end_pfn = end_pfn; + nr_nodemap_entries = i + 1; +} + +/** + * shrink_active_range - Shrink an existing registered range of PFNs + * @nid: The node id the range is on that should be shrunk + * @old_end_pfn: The old end PFN of the range + * @new_end_pfn: The new PFN of the range + * + * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. + * The map is kept at the end physical page range that has already been + * registered with add_active_range(). This function allows an arch to shrink + * an existing registered range. + */ +void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, + unsigned long new_end_pfn) +{ + int i; + + /* Find the old active region end and shrink */ + for_each_active_range_index_in_nid(i, nid) { + if (early_node_map[i].end_pfn == old_end_pfn) { + early_node_map[i].end_pfn = new_end_pfn; + break; + } + } +} + +/** + * remove_all_active_ranges - Remove all currently registered regions + * During discovery, it may be found that a table like SRAT is invalid + * and an alternative discovery method must be used. This function removes + * all currently registered regions. + */ +void __init remove_all_active_ranges() +{ + memset(early_node_map, 0, sizeof(early_node_map)); + nr_nodemap_entries = 0; +} + +/* Compare two active node_active_regions */ +static int __init cmp_node_active_region(const void *a, const void *b) +{ + struct node_active_region *arange = (struct node_active_region *)a; + struct node_active_region *brange = (struct node_active_region *)b; + + /* Done this way to avoid overflows */ + if (arange->start_pfn > brange->start_pfn) + return 1; + if (arange->start_pfn < brange->start_pfn) + return -1; + + return 0; +} + +/* sort the node_map by start_pfn */ +static void __init sort_node_map(void) +{ + sort(early_node_map, (size_t)nr_nodemap_entries, + sizeof(struct node_active_region), + cmp_node_active_region, NULL); +} + +/* Find the lowest pfn for a node. This depends on a sorted early_node_map */ +unsigned long __init find_min_pfn_for_node(unsigned long nid) +{ + int i; + + /* Assuming a sorted map, the first range found has the starting pfn */ + for_each_active_range_index_in_nid(i, nid) + return early_node_map[i].start_pfn; + + printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid); + return 0; +} + +/** + * find_min_pfn_with_active_regions - Find the minimum PFN registered + * + * It returns the minimum PFN based on information provided via + * add_active_range() + */ +unsigned long __init find_min_pfn_with_active_regions(void) +{ + return find_min_pfn_for_node(MAX_NUMNODES); +} + +/** + * find_max_pfn_with_active_regions - Find the maximum PFN registered + * + * It returns the maximum PFN based on information provided via + * add_active_range() + */ +unsigned long __init find_max_pfn_with_active_regions(void) +{ + int i; + unsigned long max_pfn = 0; + + for (i = 0; i < nr_nodemap_entries; i++) + max_pfn = max(max_pfn, early_node_map[i].end_pfn); + + return max_pfn; +} + +/** + * free_area_init_nodes - Initialise all pg_data_t and zone data + * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA + * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32 + * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL + * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM + * + * This will call free_area_init_node() for each active node in the system. + * Using the page ranges provided by add_active_range(), the size of each + * zone in each node and their holes is calculated. If the maximum PFN + * between two adjacent zones match, it is assumed that the zone is empty. + * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed + * that arch_max_dma32_pfn has no pages. It is also assumed that a zone + * starts where the previous one ended. For example, ZONE_DMA32 starts + * at arch_max_dma_pfn. + */ +void __init free_area_init_nodes(unsigned long arch_max_dma_pfn, + unsigned long arch_max_dma32_pfn, + unsigned long arch_max_low_pfn, + unsigned long arch_max_high_pfn) +{ + unsigned long nid; + int i; + + /* Record where the zone boundaries are */ + memset(arch_zone_lowest_possible_pfn, 0, + sizeof(arch_zone_lowest_possible_pfn)); + memset(arch_zone_highest_possible_pfn, 0, + sizeof(arch_zone_highest_possible_pfn)); + arch_zone_lowest_possible_pfn[ZONE_DMA] = + find_min_pfn_with_active_regions(); + arch_zone_highest_possible_pfn[ZONE_DMA] = arch_max_dma_pfn; + arch_zone_highest_possible_pfn[ZONE_DMA32] = arch_max_dma32_pfn; + arch_zone_highest_possible_pfn[ZONE_NORMAL] = arch_max_low_pfn; + arch_zone_highest_possible_pfn[ZONE_HIGHMEM] = arch_max_high_pfn; + for (i = 1; i < MAX_NR_ZONES; i++) { + arch_zone_lowest_possible_pfn[i] = + arch_zone_highest_possible_pfn[i-1]; + } + + /* Regions in the early_node_map can be in any order */ + sort_node_map(); + + /* Print out the zone ranges */ + printk("Zone PFN ranges:\n"); + for (i = 0; i < MAX_NR_ZONES; i++) { + printk(" %-8s %8lu -> %8lu\n", + zone_names[i], + arch_zone_lowest_possible_pfn[i], + arch_zone_highest_possible_pfn[i]); + } + + /* Print out the early_node_map[] */ + printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); + for (i = 0; i < nr_nodemap_entries; i++) + printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, + early_node_map[i].start_pfn, + early_node_map[i].end_pfn); + + /* Initialise every node */ + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + free_area_init_node(nid, pgdat, NULL, + find_min_pfn_for_node(nid), NULL); + } +} +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + #ifndef CONFIG_NEED_MULTIPLE_NODES static bootmem_data_t contig_bootmem_data; struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };