LKML Archive mirror
 help / color / mirror / Atom feed
* [Patch] Move swiotlb_init early on X86_64
@ 2006-03-01  1:10 Zou Nan hai
  2006-03-02  4:15 ` Tony Luck
  2006-03-07  8:39 ` Andi Kleen
  0 siblings, 2 replies; 11+ messages in thread
From: Zou Nan hai @ 2006-03-01  1:10 UTC (permalink / raw
  To: LKML; +Cc: Andrew Morton, Andi Kleen, Venkatesh Pallipadi

On x86_64, the swiotlb buffer is allocated in mem_init, after the memmap and vfs cache allocations.

On platforms with huge physical memory,
the large memmap and vfs cache allocations may eat up all usable system memory
under 4G, leaving no room for the swiotlb bounce buffer.

Moving swiotlb_init earlier, so that it runs before memmap is allocated,
solves this issue.

Signed-off-by: Zou Nan hai <Nanhai.zou@intel.com>



diff -Nraup linux-2.6.16-rc5/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
--- linux-2.6.16-rc5/arch/ia64/mm/init.c	2006-03-01 17:43:29.000000000 +0800
+++ b/arch/ia64/mm/init.c	2006-03-01 17:40:58.000000000 +0800
@@ -585,7 +585,7 @@ mem_init (void)
 	 * any drivers that may need the PCI DMA interface are initialized or bootmem has
 	 * been freed.
 	 */
-	platform_dma_init();
+	platform_dma_init(0);
 #endif
 
 #ifdef CONFIG_FLATMEM
diff -Nraup linux-2.6.16-rc5/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c
--- linux-2.6.16-rc5/arch/x86_64/kernel/pci-swiotlb.c	2006-03-01 17:43:29.000000000 +0800
+++ b/arch/x86_64/kernel/pci-swiotlb.c	2006-03-01 17:41:01.000000000 +0800
@@ -36,7 +36,7 @@ void pci_swiotlb_init(void)
 	       swiotlb = 1;
 	if (swiotlb) {
 		printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
-		swiotlb_init();
+		swiotlb_init(__pa(MAX_DMA_ADDRESS));
 		dma_ops = &swiotlb_dma_ops;
 	}
 }
diff -Nraup linux-2.6.16-rc5/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
--- linux-2.6.16-rc5/arch/x86_64/mm/init.c	2006-03-01 17:43:29.000000000 +0800
+++ b/arch/x86_64/mm/init.c	2006-03-01 17:41:01.000000000 +0800
@@ -437,6 +437,9 @@ void __init paging_init(void)
 
 	memory_present(0, 0, end_pfn);
 	sparse_init();
+#ifdef CONFIG_SWIOTLB
+	pci_swiotlb_init();
+#endif
 	size_zones(zones, holes, 0, end_pfn);
 	free_area_init_node(0, NODE_DATA(0), zones,
 			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
@@ -528,9 +531,6 @@ void __init mem_init(void)
 {
 	long codesize, reservedpages, datasize, initsize;
 
-#ifdef CONFIG_SWIOTLB
-	pci_swiotlb_init();
-#endif
 	no_iommu_init();
 
 	/* How many end-of-memory variables you have, grandma! */
diff -Nraup linux-2.6.16-rc5/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
--- linux-2.6.16-rc5/arch/x86_64/mm/numa.c	2006-03-01 17:43:29.000000000 +0800
+++ b/arch/x86_64/mm/numa.c	2006-03-01 17:41:01.000000000 +0800
@@ -305,7 +305,9 @@ void __init paging_init(void)
 	int i;
 
 	arch_sparse_init();
-
+#ifdef CONFIG_SWIOTLB
+	pci_swiotlb_init();
+#endif
 	for_each_online_node(i) {
 		setup_node_zones(i); 
 	}
diff -Nraup linux-2.6.16-rc5/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h
--- linux-2.6.16-rc5/include/asm-ia64/machvec.h	2006-02-17 00:23:50.000000000 +0800
+++ b/include/asm-ia64/machvec.h	2006-03-01 17:41:10.000000000 +0800
@@ -36,7 +36,7 @@ typedef int ia64_mv_pci_legacy_write_t (
 					u8 size);
 
 /* DMA-mapping interface: */
-typedef void ia64_mv_dma_init (void);
+typedef void ia64_mv_dma_init (size_t);
 typedef void *ia64_mv_dma_alloc_coherent (struct device *, size_t, dma_addr_t *, gfp_t);
 typedef void ia64_mv_dma_free_coherent (struct device *, size_t, void *, dma_addr_t);
 typedef dma_addr_t ia64_mv_dma_map_single (struct device *, void *, size_t, int);
@@ -76,6 +76,11 @@ typedef unsigned int ia64_mv_readl_relax
 typedef unsigned long ia64_mv_readq_relaxed_t (const volatile void __iomem *);
 
 static inline void
+machvec_noop_size_t (size_t size)
+{
+}
+
+static inline void
 machvec_noop (void)
 {
 }
diff -Nraup linux-2.6.16-rc5/include/asm-ia64/machvec_hpzx1.h b/include/asm-ia64/machvec_hpzx1.h
--- linux-2.6.16-rc5/include/asm-ia64/machvec_hpzx1.h	2006-02-17 00:23:50.000000000 +0800
+++ b/include/asm-ia64/machvec_hpzx1.h	2006-03-01 17:41:10.000000000 +0800
@@ -20,7 +20,7 @@ extern ia64_mv_dma_mapping_error	sba_dma
  */
 #define platform_name				"hpzx1"
 #define platform_setup				dig_setup
-#define platform_dma_init			machvec_noop
+#define platform_dma_init			machvec_noop_size_t
 #define platform_dma_alloc_coherent		sba_alloc_coherent
 #define platform_dma_free_coherent		sba_free_coherent
 #define platform_dma_map_single			sba_map_single
diff -Nraup linux-2.6.16-rc5/include/asm-ia64/machvec_sn2.h b/include/asm-ia64/machvec_sn2.h
--- linux-2.6.16-rc5/include/asm-ia64/machvec_sn2.h	2006-03-01 17:43:31.000000000 +0800
+++ b/include/asm-ia64/machvec_sn2.h	2006-03-01 17:41:10.000000000 +0800
@@ -102,7 +102,7 @@ extern ia64_mv_dma_supported		sn_dma_sup
 #define platform_pci_get_legacy_mem	sn_pci_get_legacy_mem
 #define platform_pci_legacy_read	sn_pci_legacy_read
 #define platform_pci_legacy_write	sn_pci_legacy_write
-#define platform_dma_init		machvec_noop
+#define platform_dma_init		machvec_noop_size_t
 #define platform_dma_alloc_coherent	sn_dma_alloc_coherent
 #define platform_dma_free_coherent	sn_dma_free_coherent
 #define platform_dma_map_single		sn_dma_map_single
diff -Nraup linux-2.6.16-rc5/include/asm-x86_64/swiotlb.h b/include/asm-x86_64/swiotlb.h
--- linux-2.6.16-rc5/include/asm-x86_64/swiotlb.h	2006-03-01 17:43:31.000000000 +0800
+++ b/include/asm-x86_64/swiotlb.h	2006-03-01 17:41:11.000000000 +0800
@@ -41,7 +41,7 @@ extern int swiotlb_dma_mapping_error(dma
 extern void swiotlb_free_coherent (struct device *hwdev, size_t size,
 				   void *vaddr, dma_addr_t dma_handle);
 extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
-extern void swiotlb_init(void);
+extern void swiotlb_init(size_t);
 
 #ifdef CONFIG_SWIOTLB
 extern int swiotlb;
diff -Nraup linux-2.6.16-rc5/include/linux/bootmem.h b/include/linux/bootmem.h
--- linux-2.6.16-rc5/include/linux/bootmem.h	2006-03-01 17:43:31.000000000 +0800
+++ b/include/linux/bootmem.h	2006-03-01 17:41:11.000000000 +0800
@@ -57,10 +57,14 @@ extern void __init reserve_bootmem (unsi
 	__alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low(x) \
 	__alloc_bootmem_low((x), SMP_CACHE_BYTES, 0)
+#define alloc_bootmem_low_goal(x,goal) \
+	__alloc_bootmem_low((x), SMP_CACHE_BYTES, goal)
 #define alloc_bootmem_pages(x) \
 	__alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages(x) \
 	__alloc_bootmem_low((x), PAGE_SIZE, 0)
+#define alloc_bootmem_low_pages_goal(x,goal) \
+	__alloc_bootmem_low((x), PAGE_SIZE, goal)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 extern unsigned long __init free_all_bootmem (void);
 extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
diff -Nraup linux-2.6.16-rc5/lib/swiotlb.c b/lib/swiotlb.c
--- linux-2.6.16-rc5/lib/swiotlb.c	2006-03-01 17:43:31.000000000 +0800
+++ b/lib/swiotlb.c	2006-03-01 17:41:12.000000000 +0800
@@ -129,8 +129,8 @@ __setup("swiotlb=", setup_io_tlb_npages)
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
  */
-void
-swiotlb_init_with_default_size (size_t default_size)
+static void
+swiotlb_init_with_default_size (size_t default_size, size_t goal)
 {
 	unsigned long i;
 
@@ -142,7 +142,7 @@ swiotlb_init_with_default_size (size_t d
 	/*
 	 * Get IO TLB memory from the low pages
 	 */
-	io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * (1 << IO_TLB_SHIFT));
+	io_tlb_start = alloc_bootmem_low_pages_goal(io_tlb_nslabs * (1 << IO_TLB_SHIFT), goal);
 	if (!io_tlb_start)
 		panic("Cannot allocate SWIOTLB buffer");
 	io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT);
@@ -161,15 +161,15 @@ swiotlb_init_with_default_size (size_t d
 	/*
 	 * Get the overflow emergency buffer
 	 */
-	io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
+	io_tlb_overflow_buffer = alloc_bootmem_low_goal(io_tlb_overflow, goal);
 	printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n",
 	       virt_to_phys(io_tlb_start), virt_to_phys(io_tlb_end));
 }
 
 void
-swiotlb_init (void)
+swiotlb_init (size_t goal)
 {
-	swiotlb_init_with_default_size(64 * (1<<20));	/* default to 64MB */
+	swiotlb_init_with_default_size(64 * (1<<20), goal);	/* default to 64MB */
 }
 
 /*




^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Patch] Move swiotlb_init early on X86_64
  2006-03-01  1:10 [Patch] Move swiotlb_init early on X86_64 Zou Nan hai
@ 2006-03-02  4:15 ` Tony Luck
  2006-03-02  4:30   ` Andi Kleen
  2006-03-07  8:39 ` Andi Kleen
  1 sibling, 1 reply; 11+ messages in thread
From: Tony Luck @ 2006-03-02  4:15 UTC (permalink / raw
  To: Zou Nan hai; +Cc: LKML, Andrew Morton, Andi Kleen, Venkatesh Pallipadi

On 01 Mar 2006 09:10:58 +0800, Zou Nan hai <nanhai.zou@intel.com> wrote:
> on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
>
> On platforms with huge physical memory,
> large memmap and vfs cache may eat up all usable system memory
> under 4G.
>
> Move swiotlb_init early before memmap is allocated can
> solve this issue.

Shouldn't memmap be allocated from memory above 4G (if available)? Using
up lots of <4G memory on something that doesn't need to be below 4G
sounds like a poor use of resources.

-Tony

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Patch] Move swiotlb_init early on X86_64
  2006-03-02  4:15 ` Tony Luck
@ 2006-03-02  4:30   ` Andi Kleen
  2006-03-02  4:33     ` Zou Nan hai
  0 siblings, 1 reply; 11+ messages in thread
From: Andi Kleen @ 2006-03-02  4:30 UTC (permalink / raw
  To: Tony Luck; +Cc: Zou Nan hai, LKML, Andrew Morton, Venkatesh Pallipadi

On Thursday 02 March 2006 05:15, Tony Luck wrote:
> On 01 Mar 2006 09:10:58 +0800, Zou Nan hai <nanhai.zou@intel.com> wrote:
> > on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
> >
> > On platforms with huge physical memory,
> > large memmap and vfs cache may eat up all usable system memory
> > under 4G.
> >
> > Move swiotlb_init early before memmap is allocated can
> > solve this issue.
> 
> Shouldn't memmap be allocated from memory above 4G (if available)? Using
> up lots of <4G memory on something that doesn't need to be below 4G
> sounds like a poor use of resources.

On the really large machines it will be distributed over the nodes anyways.
But yes the single node SMP case should probably allocate it higher.

-Andi

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Patch] Move swiotlb_init early on X86_64
  2006-03-02  4:30   ` Andi Kleen
@ 2006-03-02  4:33     ` Zou Nan hai
  0 siblings, 0 replies; 11+ messages in thread
From: Zou Nan hai @ 2006-03-02  4:33 UTC (permalink / raw
  To: Andi Kleen; +Cc: Tony Luck, LKML, Andrew Morton, Venkatesh Pallipadi

On Thu, 2006-03-02 at 12:30, Andi Kleen wrote:
> On Thursday 02 March 2006 05:15, Tony Luck wrote:
> > On 01 Mar 2006 09:10:58 +0800, Zou Nan hai <nanhai.zou@intel.com> wrote:
> > > on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
> > >
> > > On platforms with huge physical memory,
> > > large memmap and vfs cache may eat up all usable system memory
> > > under 4G.
> > >
> > > Move swiotlb_init early before memmap is allocated can
> > > solve this issue.
> > 
> > Shouldn't memmap be allocated from memory above 4G (if available)? Using
> > up lots of <4G memory on something that doesn't need to be below 4G
> > sounds like a poor use of resources.
> 
> On the really large machines it will be distributed over the nodes anyways.
> But yes the single node SMP case should probably allocate it higher.
> 
> -Andi

Really, then how about the following patch?

Let the normal bootmem allocator try memory above 4G first.
This leaves more of the memory below 4G available.

Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>

--- linux-2.6.16-rc5/mm/bootmem.c	2006-03-03 08:31:52.000000000 +0800
+++ b/mm/bootmem.c	2006-03-03 09:05:17.000000000 +0800
@@ -381,16 +381,24 @@ unsigned long __init free_all_bootmem (v
 	return(free_all_bootmem_core(NODE_DATA(0)));
 }
 
+#define LOW32LIMIT 0xffffffff
+
 void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
 {
 	pg_data_t *pgdat = pgdat_list;
 	void *ptr;
 
+	if (goal < LOW32LIMIT) {
+		for_each_pgdat(pgdat)
+			if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+						 align, LOW32LIMIT, 0)))
+			return(ptr);
+	}
+
 	for_each_pgdat(pgdat)
 		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
 						 align, goal, 0)))
 			return(ptr);
-
 	/*
 	 * Whoops, we cannot satisfy the allocation request.
 	 */
@@ -405,6 +413,13 @@ void * __init __alloc_bootmem_node(pg_da
 {
 	void *ptr;
 
+	if (goal < LOW32LIMIT) {
+		ptr = __alloc_bootmem_core(pgdat->bdata, size, align,
+				LOW32LIMIT, 0);
+		if (ptr)
+			return (ptr);
+	}
+
 	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
 		return (ptr);
@@ -412,7 +427,6 @@ void * __init __alloc_bootmem_node(pg_da
 	return __alloc_bootmem(size, align, goal);
 }
 
-#define LOW32LIMIT 0xffffffff
 
 void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
 {






^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [Patch] Move swiotlb_init early on X86_64
@ 2006-03-02  9:09 Zhang, Yanmin
  2006-03-02 23:35 ` Zou Nan hai
  0 siblings, 1 reply; 11+ messages in thread
From: Zhang, Yanmin @ 2006-03-02  9:09 UTC (permalink / raw
  To: Zou, Nanhai, Andi Kleen
  Cc: Luck, Tony, LKML, Andrew Morton, Pallipadi, Venkatesh

>>-----Original Message-----
>>From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-owner@vger.kernel.org] On Behalf Of Zou Nan hai
>>Sent: 2006年3月2日 12:33
>>
>>Really, then how about the following patch?
>>
>>Let normal bootmem allocator go above 4G first.
>>This will save more memory with address less than 4G.
>>
>>Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
>>
>>--- linux-2.6.16-rc5/mm/bootmem.c	2006-03-03 08:31:52.000000000 +0800
>>+++ b/mm/bootmem.c	2006-03-03 09:05:17.000000000 +0800
>>@@ -381,16 +381,24 @@ unsigned long __init free_all_bootmem (v
>> 	return(free_all_bootmem_core(NODE_DATA(0)));
>> }
>>
>>+#define LOW32LIMIT 0xffffffff
>>+
>> void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
>> {
>> 	pg_data_t *pgdat = pgdat_list;
>> 	void *ptr;
>>
>>+	if (goal < LOW32LIMIT) {
On i386, the condition above is always true (unsigned long is only 32 bits there).


>>+		for_each_pgdat(pgdat)
>>+			if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
>>+						 align, LOW32LIMIT, 0)))
>>+			return(ptr);
>>+	}

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [Patch] Move swiotlb_init early on X86_64
  2006-03-02  9:09 Zhang, Yanmin
@ 2006-03-02 23:35 ` Zou Nan hai
  2006-03-03  1:32   ` Andi Kleen
  0 siblings, 1 reply; 11+ messages in thread
From: Zou Nan hai @ 2006-03-02 23:35 UTC (permalink / raw
  To: Zhang, Yanmin
  Cc: Andi Kleen, Luck, Tony, LKML, Andrew Morton, Pallipadi, Venkatesh

On Thu, 2006-03-02 at 17:09, Zhang, Yanmin wrote:
> >>-----Original Message-----
> >>From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-owner@vger.kernel.org] On Behalf Of Zou Nan hai
> >>Sent: 2006年3月2日 12:33
> >>
> >>Really, then how about the following patch?
> >>
> >>Let normal bootmem allocator go above 4G first.
> >>This will save more memory with address less than 4G.
> >>
> >>Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
> >>
> >>--- linux-2.6.16-rc5/mm/bootmem.c	2006-03-03 08:31:52.000000000 +0800
> >>+++ b/mm/bootmem.c	2006-03-03 09:05:17.000000000 +0800
> >>@@ -381,16 +381,24 @@ unsigned long __init free_all_bootmem (v
> >> 	return(free_all_bootmem_core(NODE_DATA(0)));
> >> }
> >>
> >>+#define LOW32LIMIT 0xffffffff
> >>+
> >> void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
> >> {
> >> 	pg_data_t *pgdat = pgdat_list;
> >> 	void *ptr;
> >>
> >>+	if (goal < LOW32LIMIT) {
> On i386, above is always true.
> 
> 

Ok, I modified the patch.

On a single-node SMP system with large physical memory,
bootmem allocations such as memmap and vfs_cache
may eat up the usable memory under 4G, so that the software I/O TLB is unable to allocate its bounce buffer.

This patch modifies the bootmem allocator so that
normal bootmem allocations on 64-bit systems first try
addresses above 4G.

Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>

--- linux-2.6.16-rc5/mm/bootmem.c	2006-03-03 08:31:52.000000000 +0800
+++ b/mm/bootmem.c	2006-03-04 03:48:55.000000000 +0800
@@ -381,16 +381,25 @@ unsigned long __init free_all_bootmem (v
 	return(free_all_bootmem_core(NODE_DATA(0)));
 }
 
+#define LOW32LIMIT 0xffffffff
+
 void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
 {
 	pg_data_t *pgdat = pgdat_list;
 	void *ptr;
 
+#if (BITS_PER_LONG == 64)
+	if (goal < LOW32LIMIT) {
+		for_each_pgdat(pgdat)
+			if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+						 align, LOW32LIMIT, 0)))
+			return(ptr);
+	}
+#endif
 	for_each_pgdat(pgdat)
 		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
 						 align, goal, 0)))
 			return(ptr);
-
 	/*
 	 * Whoops, we cannot satisfy the allocation request.
 	 */
@@ -404,6 +413,14 @@ void * __init __alloc_bootmem_node(pg_da
 				   unsigned long goal)
 {
 	void *ptr;
+#if (BITS_PER_LONG == 64)
+	if (goal < LOW32LIMIT) {
+		ptr = __alloc_bootmem_core(pgdat->bdata, size, align,
+				LOW32LIMIT, 0);
+		if (ptr)
+			return (ptr);
+	}
+#endif
 
 	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
@@ -412,7 +429,6 @@ void * __init __alloc_bootmem_node(pg_da
 	return __alloc_bootmem(size, align, goal);
 }
 
-#define LOW32LIMIT 0xffffffff
 
 void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
 {






 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Patch] Move swiotlb_init early on X86_64
  2006-03-02 23:35 ` Zou Nan hai
@ 2006-03-03  1:32   ` Andi Kleen
  0 siblings, 0 replies; 11+ messages in thread
From: Andi Kleen @ 2006-03-03  1:32 UTC (permalink / raw
  To: Zou Nan hai
  Cc: Zhang, Yanmin, Luck, Tony, LKML, Andrew Morton,
	Pallipadi, Venkatesh

On Friday 03 March 2006 00:35, Zou Nan hai wrote:

> This patch modify the bootmem allocator,
> let normal bootmem allocation on 64 bit system first go above 4G
> address.

That's very ugly and likely to break some architectures. Sorry,
but #ifdefs are the wrong way to do this.

Passing a limit parameter, and using that in the swiotlb
allocation, is better. If you're worried about changing too many callers
you could add a new entry point.

-Andi


^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [Patch] Move swiotlb_init early on X86_64
@ 2006-03-03  1:59 Pallipadi, Venkatesh
  0 siblings, 0 replies; 11+ messages in thread
From: Pallipadi, Venkatesh @ 2006-03-03  1:59 UTC (permalink / raw
  To: Andi Kleen, Zou, Nanhai; +Cc: Zhang, Yanmin, Luck, Tony, LKML, Andrew Morton

 

>-----Original Message-----
>From: Andi Kleen [mailto:ak@suse.de] 
>Sent: Thursday, March 02, 2006 5:32 PM
>To: Zou, Nanhai
>Cc: Zhang, Yanmin; Luck, Tony; LKML; Andrew Morton; Pallipadi, 
>Venkatesh
>Subject: Re: [Patch] Move swiotlb_init early on X86_64
>
>On Friday 03 March 2006 00:35, Zou Nan hai wrote:
>
>> This patch modify the bootmem allocator,
>> let normal bootmem allocation on 64 bit system first go above 4G
>> address.
>
>That's very ugly and likely to break some architectures. Sorry
>but #ifdefs is the wrong way to do this.
>
>Passing a limit parameter is better and use that in the swiotlb
>allocation. If you're worried about changing too many callers
>you could add a new entry point.
>

Another potential issue with this approach:
On a 64 bit system with less than 4G phys memory, we will fail
to get any memory above 4G and fall back to start from '0'.
This is different from original behaviour, where goal was 
MAX_DMA_ADDRESS (16M) and we would allocate memory starting 
from 16M. As a result, we will now eat up memory in 0-16M range 
and may break some legacy drivers as they will not get any memory.

If we go this way, then we should fallback to original goal if we 
are not able to get greater than 4G memory.

Thanks,
Venki

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Patch] Move swiotlb_init early on X86_64
  2006-03-01  1:10 [Patch] Move swiotlb_init early on X86_64 Zou Nan hai
  2006-03-02  4:15 ` Tony Luck
@ 2006-03-07  8:39 ` Andi Kleen
  2006-03-07 23:23   ` Zou Nan hai
  1 sibling, 1 reply; 11+ messages in thread
From: Andi Kleen @ 2006-03-07  8:39 UTC (permalink / raw
  To: Zou Nan hai; +Cc: LKML, Andrew Morton, Venkatesh Pallipadi

On Wednesday 01 March 2006 02:10, Zou Nan hai wrote:
> on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
> 
> On platforms with huge physical memory, 
> large memmap and vfs cache may eat up all usable system memory 
> under 4G.
> 
> Move swiotlb_init early before memmap is allocated can
> solve this issue.
> 
> Signed-off-by: Zou Nan hai <Nanhai.zou@intel.com>


I came up with a simpler change now that should fix the problem too.
It just tries to move the memmap to the end of the node. I don't have a system
big enough to test the original problem though.

It should be fairly safe because if the allocation fails we just fall back
to the normal old way of allocating it near the beginning.

Try to allocate node memmap near the end of node

This fixes problems with very large nodes (over 128GB) filling up all of 
the first 4GB with their mem_map and not leaving enough
space for the swiotlb.


Signed-off-by: Andi Kleen <ak@suse.de>

---
 arch/x86_64/mm/numa.c   |   12 +++++++++++-
 include/linux/bootmem.h |    3 +++
 mm/bootmem.c            |    2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -172,7 +172,7 @@ void __init setup_node_bootmem(int nodei
 /* Initialize final allocator for a zone */
 void __init setup_node_zones(int nodeid)
 { 
-	unsigned long start_pfn, end_pfn; 
+	unsigned long start_pfn, end_pfn, memmapsize, limit;
 	unsigned long zones[MAX_NR_ZONES];
 	unsigned long holes[MAX_NR_ZONES];
 
@@ -182,6 +182,16 @@ void __init setup_node_zones(int nodeid)
 	Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
 		nodeid, start_pfn, end_pfn);
 
+	/* Try to allocate mem_map at end to not fill up precious <4GB
+	   memory. */
+	memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
+	limit = end_pfn << PAGE_SHIFT;
+	NODE_DATA(nodeid)->node_mem_map = 
+		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
+				memmapsize, SMP_CACHE_BYTES, 
+				limit, 
+				round_down(limit - memmapsize, PAGE_SIZE));
+
 	size_zones(zones, holes, start_pfn, end_pfn);
 	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
 			    start_pfn, holes);
Index: linux/include/linux/bootmem.h
===================================================================
--- linux.orig/include/linux/bootmem.h
+++ linux/include/linux/bootmem.h
@@ -52,6 +52,9 @@ extern void * __init __alloc_bootmem_low
 					      unsigned long size,
 					      unsigned long align,
 					      unsigned long goal);
+extern void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
+		unsigned long size, unsigned long align, unsigned long goal,
+		unsigned long limit);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
Index: linux/mm/bootmem.c
===================================================================
--- linux.orig/mm/bootmem.c
+++ linux/mm/bootmem.c
@@ -152,7 +152,7 @@ static void __init free_bootmem_core(boo
  *
  * NOTE:  This function is _not_ reentrant.
  */
-static void * __init
+void * __init
 __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	      unsigned long align, unsigned long goal, unsigned long limit)
 {



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Patch] Move swiotlb_init early on X86_64
  2006-03-07  8:39 ` Andi Kleen
@ 2006-03-07 23:23   ` Zou Nan hai
  2006-03-08  9:33     ` Andi Kleen
  0 siblings, 1 reply; 11+ messages in thread
From: Zou Nan hai @ 2006-03-07 23:23 UTC (permalink / raw
  To: Andi Kleen; +Cc: LKML, Andrew Morton, Venkatesh Pallipadi

On Tue, 2006-03-07 at 16:39, Andi Kleen wrote:
> On Wednesday 01 March 2006 02:10, Zou Nan hai wrote:
> > on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
> > 
> > On platforms with huge physical memory, 
> > large memmap and vfs cache may eat up all usable system memory 
> > under 4G.
> > 
> > Move swiotlb_init early before memmap is allocated can
> > solve this issue.
> > 
> > Signed-off-by: Zou Nan hai <Nanhai.zou@intel.com>
> 
> 
> I came up with a simpler change now that should fix the problem too.
> It just try to move the memmap to the end of the node. I don't have a system
> big enough to test the original problem though.
> 
> It should be fairly safe because if the allocation fails we just fallback
> to the normal old way of allocating it near the beginning.
> 
> Try to allocate node memmap near the end of node
> 
> This fixes problems with very large nodes (over 128GB) filling up all of 
> the first 4GB with their mem_map and not leaving enough
> space for the swiotlb.
> 
> 
> Signed-off-by: Andi Kleen <ak@suse.de>
> 
> ---
>  arch/x86_64/mm/numa.c   |   12 +++++++++++-
>  include/linux/bootmem.h |    3 +++
>  mm/bootmem.c            |    2 +-
>  3 files changed, 15 insertions(+), 2 deletions(-)
> 
> Index: linux/arch/x86_64/mm/numa.c
> ===================================================================
> --- linux.orig/arch/x86_64/mm/numa.c
> +++ linux/arch/x86_64/mm/numa.c
> @@ -172,7 +172,7 @@ void __init setup_node_bootmem(int nodei
>  /* Initialize final allocator for a zone */
>  void __init setup_node_zones(int nodeid)
>  { 
> -	unsigned long start_pfn, end_pfn; 
> +	unsigned long start_pfn, end_pfn, memmapsize, limit;
>  	unsigned long zones[MAX_NR_ZONES];
>  	unsigned long holes[MAX_NR_ZONES];
>  
> @@ -182,6 +182,16 @@ void __init setup_node_zones(int nodeid)
>  	Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
>  		nodeid, start_pfn, end_pfn);
>  
> +	/* Try to allocate mem_map at end to not fill up precious <4GB
> +	   memory. */
> +	memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
> +	limit = end_pfn << PAGE_SHIFT;
> +	NODE_DATA(nodeid)->node_mem_map = 
> +		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
> +				memmapsize, SMP_CACHE_BYTES, 
> +				limit, 
> +				round_down(limit - memmapsize, PAGE_SIZE));
> +

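The last two arguments look swapped: __alloc_bootmem_core() takes the goal
before the limit, so shouldn't the call end with

, round_down(limit - memmapsize, PAGE_SIZE), limit);?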


Zou Nan hai




^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Patch] Move swiotlb_init early on X86_64
  2006-03-07 23:23   ` Zou Nan hai
@ 2006-03-08  9:33     ` Andi Kleen
  0 siblings, 0 replies; 11+ messages in thread
From: Andi Kleen @ 2006-03-08  9:33 UTC (permalink / raw
  To: Zou Nan hai; +Cc: LKML, Andrew Morton, Venkatesh Pallipadi

On Wednesday 08 March 2006 00:23, Zou Nan hai wrote:

>
> , round_down(limit - memmapsize, PAGE_SIZE), limit);?

Indeed. Thanks for catching that.

-Andi

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2006-03-08  9:40 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-03-01  1:10 [Patch] Move swiotlb_init early on X86_64 Zou Nan hai
2006-03-02  4:15 ` Tony Luck
2006-03-02  4:30   ` Andi Kleen
2006-03-02  4:33     ` Zou Nan hai
2006-03-07  8:39 ` Andi Kleen
2006-03-07 23:23   ` Zou Nan hai
2006-03-08  9:33     ` Andi Kleen
  -- strict thread matches above, loose matches on Subject: below --
2006-03-02  9:09 Zhang, Yanmin
2006-03-02 23:35 ` Zou Nan hai
2006-03-03  1:32   ` Andi Kleen
2006-03-03  1:59 Pallipadi, Venkatesh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).