* [Patch] Move swiotlb_init early on X86_64
@ 2006-03-01 1:10 Zou Nan hai
2006-03-02 4:15 ` Tony Luck
2006-03-07 8:39 ` Andi Kleen
0 siblings, 2 replies; 11+ messages in thread
From: Zou Nan hai @ 2006-03-01 1:10 UTC (permalink / raw
To: LKML; +Cc: Andrew Morton, Andi Kleen, Venkatesh Pallipadi
On X86_64, the swiotlb buffer is allocated in mem_init, after the memmap and vfs cache allocations.
On platforms with huge physical memory,
a large memmap and vfs cache may eat up all usable system memory
under 4G.
Moving swiotlb_init earlier, before the memmap is allocated, can
solve this issue.
Signed-off-by: Zou Nan hai <Nanhai.zou@intel.com>
diff -Nraup linux-2.6.16-rc5/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
--- linux-2.6.16-rc5/arch/ia64/mm/init.c 2006-03-01 17:43:29.000000000 +0800
+++ b/arch/ia64/mm/init.c 2006-03-01 17:40:58.000000000 +0800
@@ -585,7 +585,7 @@ mem_init (void)
* any drivers that may need the PCI DMA interface are initialized or bootmem has
* been freed.
*/
- platform_dma_init();
+ platform_dma_init(0);
#endif
#ifdef CONFIG_FLATMEM
diff -Nraup linux-2.6.16-rc5/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c
--- linux-2.6.16-rc5/arch/x86_64/kernel/pci-swiotlb.c 2006-03-01 17:43:29.000000000 +0800
+++ b/arch/x86_64/kernel/pci-swiotlb.c 2006-03-01 17:41:01.000000000 +0800
@@ -36,7 +36,7 @@ void pci_swiotlb_init(void)
swiotlb = 1;
if (swiotlb) {
printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
- swiotlb_init();
+ swiotlb_init(__pa(MAX_DMA_ADDRESS));
dma_ops = &swiotlb_dma_ops;
}
}
diff -Nraup linux-2.6.16-rc5/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
--- linux-2.6.16-rc5/arch/x86_64/mm/init.c 2006-03-01 17:43:29.000000000 +0800
+++ b/arch/x86_64/mm/init.c 2006-03-01 17:41:01.000000000 +0800
@@ -437,6 +437,9 @@ void __init paging_init(void)
memory_present(0, 0, end_pfn);
sparse_init();
+#ifdef CONFIG_SWIOTLB
+ pci_swiotlb_init();
+#endif
size_zones(zones, holes, 0, end_pfn);
free_area_init_node(0, NODE_DATA(0), zones,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
@@ -528,9 +531,6 @@ void __init mem_init(void)
{
long codesize, reservedpages, datasize, initsize;
-#ifdef CONFIG_SWIOTLB
- pci_swiotlb_init();
-#endif
no_iommu_init();
/* How many end-of-memory variables you have, grandma! */
diff -Nraup linux-2.6.16-rc5/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
--- linux-2.6.16-rc5/arch/x86_64/mm/numa.c 2006-03-01 17:43:29.000000000 +0800
+++ b/arch/x86_64/mm/numa.c 2006-03-01 17:41:01.000000000 +0800
@@ -305,7 +305,9 @@ void __init paging_init(void)
int i;
arch_sparse_init();
-
+#ifdef CONFIG_SWIOTLB
+ pci_swiotlb_init();
+#endif
for_each_online_node(i) {
setup_node_zones(i);
}
diff -Nraup linux-2.6.16-rc5/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h
--- linux-2.6.16-rc5/include/asm-ia64/machvec.h 2006-02-17 00:23:50.000000000 +0800
+++ b/include/asm-ia64/machvec.h 2006-03-01 17:41:10.000000000 +0800
@@ -36,7 +36,7 @@ typedef int ia64_mv_pci_legacy_write_t (
u8 size);
/* DMA-mapping interface: */
-typedef void ia64_mv_dma_init (void);
+typedef void ia64_mv_dma_init (size_t);
typedef void *ia64_mv_dma_alloc_coherent (struct device *, size_t, dma_addr_t *, gfp_t);
typedef void ia64_mv_dma_free_coherent (struct device *, size_t, void *, dma_addr_t);
typedef dma_addr_t ia64_mv_dma_map_single (struct device *, void *, size_t, int);
@@ -76,6 +76,11 @@ typedef unsigned int ia64_mv_readl_relax
typedef unsigned long ia64_mv_readq_relaxed_t (const volatile void __iomem *);
static inline void
+machvec_noop_size_t (size_t size)
+{
+}
+
+static inline void
machvec_noop (void)
{
}
diff -Nraup linux-2.6.16-rc5/include/asm-ia64/machvec_hpzx1.h b/include/asm-ia64/machvec_hpzx1.h
--- linux-2.6.16-rc5/include/asm-ia64/machvec_hpzx1.h 2006-02-17 00:23:50.000000000 +0800
+++ b/include/asm-ia64/machvec_hpzx1.h 2006-03-01 17:41:10.000000000 +0800
@@ -20,7 +20,7 @@ extern ia64_mv_dma_mapping_error sba_dma
*/
#define platform_name "hpzx1"
#define platform_setup dig_setup
-#define platform_dma_init machvec_noop
+#define platform_dma_init machvec_noop_size_t
#define platform_dma_alloc_coherent sba_alloc_coherent
#define platform_dma_free_coherent sba_free_coherent
#define platform_dma_map_single sba_map_single
diff -Nraup linux-2.6.16-rc5/include/asm-ia64/machvec_sn2.h b/include/asm-ia64/machvec_sn2.h
--- linux-2.6.16-rc5/include/asm-ia64/machvec_sn2.h 2006-03-01 17:43:31.000000000 +0800
+++ b/include/asm-ia64/machvec_sn2.h 2006-03-01 17:41:10.000000000 +0800
@@ -102,7 +102,7 @@ extern ia64_mv_dma_supported sn_dma_sup
#define platform_pci_get_legacy_mem sn_pci_get_legacy_mem
#define platform_pci_legacy_read sn_pci_legacy_read
#define platform_pci_legacy_write sn_pci_legacy_write
-#define platform_dma_init machvec_noop
+#define platform_dma_init machvec_noop_size_t
#define platform_dma_alloc_coherent sn_dma_alloc_coherent
#define platform_dma_free_coherent sn_dma_free_coherent
#define platform_dma_map_single sn_dma_map_single
diff -Nraup linux-2.6.16-rc5/include/asm-x86_64/swiotlb.h b/include/asm-x86_64/swiotlb.h
--- linux-2.6.16-rc5/include/asm-x86_64/swiotlb.h 2006-03-01 17:43:31.000000000 +0800
+++ b/include/asm-x86_64/swiotlb.h 2006-03-01 17:41:11.000000000 +0800
@@ -41,7 +41,7 @@ extern int swiotlb_dma_mapping_error(dma
extern void swiotlb_free_coherent (struct device *hwdev, size_t size,
void *vaddr, dma_addr_t dma_handle);
extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
-extern void swiotlb_init(void);
+extern void swiotlb_init(size_t);
#ifdef CONFIG_SWIOTLB
extern int swiotlb;
diff -Nraup linux-2.6.16-rc5/include/linux/bootmem.h b/include/linux/bootmem.h
--- linux-2.6.16-rc5/include/linux/bootmem.h 2006-03-01 17:43:31.000000000 +0800
+++ b/include/linux/bootmem.h 2006-03-01 17:41:11.000000000 +0800
@@ -57,10 +57,14 @@ extern void __init reserve_bootmem (unsi
__alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low(x) \
__alloc_bootmem_low((x), SMP_CACHE_BYTES, 0)
+#define alloc_bootmem_low_goal(x,goal) \
+ __alloc_bootmem_low((x), SMP_CACHE_BYTES, goal)
#define alloc_bootmem_pages(x) \
__alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low_pages(x) \
__alloc_bootmem_low((x), PAGE_SIZE, 0)
+#define alloc_bootmem_low_pages_goal(x,goal) \
+ __alloc_bootmem_low((x), PAGE_SIZE, goal)
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
extern unsigned long __init free_all_bootmem (void);
extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
diff -Nraup linux-2.6.16-rc5/lib/swiotlb.c b/lib/swiotlb.c
--- linux-2.6.16-rc5/lib/swiotlb.c 2006-03-01 17:43:31.000000000 +0800
+++ b/lib/swiotlb.c 2006-03-01 17:41:12.000000000 +0800
@@ -129,8 +129,8 @@ __setup("swiotlb=", setup_io_tlb_npages)
* Statically reserve bounce buffer space and initialize bounce buffer data
* structures for the software IO TLB used to implement the DMA API.
*/
-void
-swiotlb_init_with_default_size (size_t default_size)
+static void
+swiotlb_init_with_default_size (size_t default_size, size_t goal)
{
unsigned long i;
@@ -142,7 +142,7 @@ swiotlb_init_with_default_size (size_t d
/*
* Get IO TLB memory from the low pages
*/
- io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * (1 << IO_TLB_SHIFT));
+ io_tlb_start = alloc_bootmem_low_pages_goal(io_tlb_nslabs * (1 << IO_TLB_SHIFT), goal);
if (!io_tlb_start)
panic("Cannot allocate SWIOTLB buffer");
io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT);
@@ -161,15 +161,15 @@ swiotlb_init_with_default_size (size_t d
/*
* Get the overflow emergency buffer
*/
- io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
+ io_tlb_overflow_buffer = alloc_bootmem_low_goal(io_tlb_overflow, goal);
printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n",
virt_to_phys(io_tlb_start), virt_to_phys(io_tlb_end));
}
void
-swiotlb_init (void)
+swiotlb_init (size_t goal)
{
- swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */
+ swiotlb_init_with_default_size(64 * (1<<20), goal); /* default to 64MB */
}
/*
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Patch] Move swiotlb_init early on X86_64
2006-03-01 1:10 [Patch] Move swiotlb_init early on X86_64 Zou Nan hai
@ 2006-03-02 4:15 ` Tony Luck
2006-03-02 4:30 ` Andi Kleen
2006-03-07 8:39 ` Andi Kleen
1 sibling, 1 reply; 11+ messages in thread
From: Tony Luck @ 2006-03-02 4:15 UTC (permalink / raw
To: Zou Nan hai; +Cc: LKML, Andrew Morton, Andi Kleen, Venkatesh Pallipadi
On 01 Mar 2006 09:10:58 +0800, Zou Nan hai <nanhai.zou@intel.com> wrote:
> on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
>
> On platforms with huge physical memory,
> large memmap and vfs cache may eat up all usable system memory
> under 4G.
>
> Move swiotlb_init early before memmap is allocated can
> solve this issue.
Shouldn't memmap be allocated from memory above 4G (if available)? Using
up lots of <4G memory on something that doesn't need to be below 4G
sounds like a poor use of resources.
-Tony
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Patch] Move swiotlb_init early on X86_64
2006-03-02 4:15 ` Tony Luck
@ 2006-03-02 4:30 ` Andi Kleen
2006-03-02 4:33 ` Zou Nan hai
0 siblings, 1 reply; 11+ messages in thread
From: Andi Kleen @ 2006-03-02 4:30 UTC (permalink / raw
To: Tony Luck; +Cc: Zou Nan hai, LKML, Andrew Morton, Venkatesh Pallipadi
On Thursday 02 March 2006 05:15, Tony Luck wrote:
> On 01 Mar 2006 09:10:58 +0800, Zou Nan hai <nanhai.zou@intel.com> wrote:
> > on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
> >
> > On platforms with huge physical memory,
> > large memmap and vfs cache may eat up all usable system memory
> > under 4G.
> >
> > Move swiotlb_init early before memmap is allocated can
> > solve this issue.
>
> Shouldn't memmap be allocated from memory above 4G (if available)? Using
> up lots of <4G memory on something that doesn't need to be below 4G
> sounds like a poor use of resources.
On the really large machines it will be distributed over the nodes anyways.
But yes the single node SMP case should probably allocate it higher.
-Andi
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Patch] Move swiotlb_init early on X86_64
2006-03-02 4:30 ` Andi Kleen
@ 2006-03-02 4:33 ` Zou Nan hai
0 siblings, 0 replies; 11+ messages in thread
From: Zou Nan hai @ 2006-03-02 4:33 UTC (permalink / raw
To: Andi Kleen; +Cc: Tony Luck, LKML, Andrew Morton, Venkatesh Pallipadi
On Thu, 2006-03-02 at 12:30, Andi Kleen wrote:
> On Thursday 02 March 2006 05:15, Tony Luck wrote:
> > On 01 Mar 2006 09:10:58 +0800, Zou Nan hai <nanhai.zou@intel.com> wrote:
> > > on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
> > >
> > > On platforms with huge physical memory,
> > > large memmap and vfs cache may eat up all usable system memory
> > > under 4G.
> > >
> > > Move swiotlb_init early before memmap is allocated can
> > > solve this issue.
> >
> > Shouldn't memmap be allocated from memory above 4G (if available)? Using
> > up lots of <4G memory on something that doesn't need to be below 4G
> > sounds like a poor use of resources.
>
> On the really large machines it will be distributed over the nodes anyways.
> But yes the single node SMP case should probably allocate it higher.
>
> -Andi
Really? Then how about the following patch?
It lets the normal bootmem allocator go above 4G first.
This will save more memory at addresses below 4G.
Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
--- linux-2.6.16-rc5/mm/bootmem.c 2006-03-03 08:31:52.000000000 +0800
+++ b/mm/bootmem.c 2006-03-03 09:05:17.000000000 +0800
@@ -381,16 +381,24 @@ unsigned long __init free_all_bootmem (v
return(free_all_bootmem_core(NODE_DATA(0)));
}
+#define LOW32LIMIT 0xffffffff
+
void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
{
pg_data_t *pgdat = pgdat_list;
void *ptr;
+ if (goal < LOW32LIMIT) {
+ for_each_pgdat(pgdat)
+ if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+ align, LOW32LIMIT, 0)))
+ return(ptr);
+ }
+
for_each_pgdat(pgdat)
if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
align, goal, 0)))
return(ptr);
-
/*
* Whoops, we cannot satisfy the allocation request.
*/
@@ -405,6 +413,13 @@ void * __init __alloc_bootmem_node(pg_da
{
void *ptr;
+ if (goal < LOW32LIMIT) {
+ ptr = __alloc_bootmem_core(pgdat->bdata, size, align,
+ LOW32LIMIT, 0);
+ if (ptr)
+ return (ptr);
+ }
+
ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
if (ptr)
return (ptr);
@@ -412,7 +427,6 @@ void * __init __alloc_bootmem_node(pg_da
return __alloc_bootmem(size, align, goal);
}
-#define LOW32LIMIT 0xffffffff
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
{
^ permalink raw reply [flat|nested] 11+ messages in thread
* RE: [Patch] Move swiotlb_init early on X86_64
@ 2006-03-02 9:09 Zhang, Yanmin
2006-03-02 23:35 ` Zou Nan hai
0 siblings, 1 reply; 11+ messages in thread
From: Zhang, Yanmin @ 2006-03-02 9:09 UTC (permalink / raw
To: Zou, Nanhai, Andi Kleen
Cc: Luck, Tony, LKML, Andrew Morton, Pallipadi, Venkatesh
>>-----Original Message-----
>>From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-owner@vger.kernel.org] On Behalf Of Zou Nan hai
>>Sent: 2006年3月2日 12:33
>>
>>Really, then how about the following patch?
>>
>>Let normal bootmem allocator go above 4G first.
>>This will save more memory with address less than 4G.
>>
>>Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
>>
>>--- linux-2.6.16-rc5/mm/bootmem.c 2006-03-03 08:31:52.000000000 +0800
>>+++ b/mm/bootmem.c 2006-03-03 09:05:17.000000000 +0800
>>@@ -381,16 +381,24 @@ unsigned long __init free_all_bootmem (v
>> return(free_all_bootmem_core(NODE_DATA(0)));
>> }
>>
>>+#define LOW32LIMIT 0xffffffff
>>+
>> void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
>> {
>> pg_data_t *pgdat = pgdat_list;
>> void *ptr;
>>
>>+ if (goal < LOW32LIMIT) {
On i386, above is always true.
>>+ for_each_pgdat(pgdat)
>>+ if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
>>+ align, LOW32LIMIT, 0)))
>>+ return(ptr);
>>+ }
^ permalink raw reply [flat|nested] 11+ messages in thread
* RE: [Patch] Move swiotlb_init early on X86_64
2006-03-02 9:09 Zhang, Yanmin
@ 2006-03-02 23:35 ` Zou Nan hai
2006-03-03 1:32 ` Andi Kleen
0 siblings, 1 reply; 11+ messages in thread
From: Zou Nan hai @ 2006-03-02 23:35 UTC (permalink / raw
To: Zhang, Yanmin
Cc: Andi Kleen, Luck, Tony, LKML, Andrew Morton, Pallipadi, Venkatesh
On Thu, 2006-03-02 at 17:09, Zhang, Yanmin wrote:
> >>-----Original Message-----
> >>From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-owner@vger.kernel.org] On Behalf Of Zou Nan hai
> >>Sent: 2006年3月2日 12:33
> >>
> >>Really, then how about the following patch?
> >>
> >>Let normal bootmem allocator go above 4G first.
> >>This will save more memory with address less than 4G.
> >>
> >>Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
> >>
> >>--- linux-2.6.16-rc5/mm/bootmem.c 2006-03-03 08:31:52.000000000 +0800
> >>+++ b/mm/bootmem.c 2006-03-03 09:05:17.000000000 +0800
> >>@@ -381,16 +381,24 @@ unsigned long __init free_all_bootmem (v
> >> return(free_all_bootmem_core(NODE_DATA(0)));
> >> }
> >>
> >>+#define LOW32LIMIT 0xffffffff
> >>+
> >> void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
> >> {
> >> pg_data_t *pgdat = pgdat_list;
> >> void *ptr;
> >>
> >>+ if (goal < LOW32LIMIT) {
> On i386, above is always true.
>
>
Ok, I modified the patch.
On a single-node SMP system with large physical memory,
allocations from the bootmem allocator, such as memmap and vfs_cache,
may eat up the usable memory under 4G; the software I/O TLB will then not be able to allocate its bounce buffer.
This patch modifies the bootmem allocator so that
normal bootmem allocations on 64-bit systems first try to go above the 4G
address.
Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
--- linux-2.6.16-rc5/mm/bootmem.c 2006-03-03 08:31:52.000000000 +0800
+++ b/mm/bootmem.c 2006-03-04 03:48:55.000000000 +0800
@@ -381,16 +381,25 @@ unsigned long __init free_all_bootmem (v
return(free_all_bootmem_core(NODE_DATA(0)));
}
+#define LOW32LIMIT 0xffffffff
+
void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
{
pg_data_t *pgdat = pgdat_list;
void *ptr;
+#if (BITS_PER_LONG == 64)
+ if (goal < LOW32LIMIT) {
+ for_each_pgdat(pgdat)
+ if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+ align, LOW32LIMIT, 0)))
+ return(ptr);
+ }
+#endif
for_each_pgdat(pgdat)
if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
align, goal, 0)))
return(ptr);
-
/*
* Whoops, we cannot satisfy the allocation request.
*/
@@ -404,6 +413,14 @@ void * __init __alloc_bootmem_node(pg_da
unsigned long goal)
{
void *ptr;
+#if (BITS_PER_LONG == 64)
+ if (goal < LOW32LIMIT) {
+ ptr = __alloc_bootmem_core(pgdat->bdata, size, align,
+ LOW32LIMIT, 0);
+ if (ptr)
+ return (ptr);
+ }
+#endif
ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
if (ptr)
@@ -412,7 +429,6 @@ void * __init __alloc_bootmem_node(pg_da
return __alloc_bootmem(size, align, goal);
}
-#define LOW32LIMIT 0xffffffff
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
{
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Patch] Move swiotlb_init early on X86_64
2006-03-02 23:35 ` Zou Nan hai
@ 2006-03-03 1:32 ` Andi Kleen
0 siblings, 0 replies; 11+ messages in thread
From: Andi Kleen @ 2006-03-03 1:32 UTC (permalink / raw
To: Zou Nan hai
Cc: Zhang, Yanmin, Luck, Tony, LKML, Andrew Morton,
Pallipadi, Venkatesh
On Friday 03 March 2006 00:35, Zou Nan hai wrote:
> This patch modify the bootmem allocator,
> let normal bootmem allocation on 64 bit system first go above 4G
> address.
That's very ugly and likely to break some architectures. Sorry
but #ifdefs is the wrong way to do this.
Passing a limit parameter is better and use that in the swiotlb
allocation. If you're worried about changing too many callers
you could add a new entry point.
-Andi
^ permalink raw reply [flat|nested] 11+ messages in thread
* RE: [Patch] Move swiotlb_init early on X86_64
@ 2006-03-03 1:59 Pallipadi, Venkatesh
0 siblings, 0 replies; 11+ messages in thread
From: Pallipadi, Venkatesh @ 2006-03-03 1:59 UTC (permalink / raw
To: Andi Kleen, Zou, Nanhai; +Cc: Zhang, Yanmin, Luck, Tony, LKML, Andrew Morton
>-----Original Message-----
>From: Andi Kleen [mailto:ak@suse.de]
>Sent: Thursday, March 02, 2006 5:32 PM
>To: Zou, Nanhai
>Cc: Zhang, Yanmin; Luck, Tony; LKML; Andrew Morton; Pallipadi,
>Venkatesh
>Subject: Re: [Patch] Move swiotlb_init early on X86_64
>
>On Friday 03 March 2006 00:35, Zou Nan hai wrote:
>
>> This patch modify the bootmem allocator,
>> let normal bootmem allocation on 64 bit system first go above 4G
>> address.
>
>That's very ugly and likely to break some architectures. Sorry
>but #ifdefs is the wrong way to do this.
>
>Passing a limit parameter is better and use that in the swiotlb
>allocation. If you're worried about changing too many callers
>you could add a new entry point.
>
Another potential issue with this approach:
On a 64 bit system with less than 4G phys memory, we will fail
to get any memory above 4G and fall back to start from '0'.
This is different from original behaviour, where goal was
MAX_DMA_ADDRESS (16M) and we would allocate memory starting
from 16M. As a result, we will now eat up memory in 0-16M range
and may break some legacy drivers as they will not get any memory.
If we go this way, then we should fall back to the original goal if we
are not able to get memory above 4G.
Thanks,
Venki
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Patch] Move swiotlb_init early on X86_64
2006-03-01 1:10 [Patch] Move swiotlb_init early on X86_64 Zou Nan hai
2006-03-02 4:15 ` Tony Luck
@ 2006-03-07 8:39 ` Andi Kleen
2006-03-07 23:23 ` Zou Nan hai
1 sibling, 1 reply; 11+ messages in thread
From: Andi Kleen @ 2006-03-07 8:39 UTC (permalink / raw
To: Zou Nan hai; +Cc: LKML, Andrew Morton, Venkatesh Pallipadi
On Wednesday 01 March 2006 02:10, Zou Nan hai wrote:
> on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
>
> On platforms with huge physical memory,
> large memmap and vfs cache may eat up all usable system memory
> under 4G.
>
> Move swiotlb_init early before memmap is allocated can
> solve this issue.
>
> Signed-off-by: Zou Nan hai <Nanhai.zou@intel.com>
I came up with a simpler change now that should fix the problem too.
It just tries to move the memmap to the end of the node. I don't have a system
big enough to test the original problem though.
It should be fairly safe because if the allocation fails we just fallback
to the normal old way of allocating it near the beginning.
Try to allocate node memmap near the end of node
This fixes problems with very large nodes (over 128GB) filling up all of
the first 4GB with their mem_map and not leaving enough
space for the swiotlb.
Signed-off-by: Andi Kleen <ak@suse.de>
---
arch/x86_64/mm/numa.c | 12 +++++++++++-
include/linux/bootmem.h | 3 +++
mm/bootmem.c | 2 +-
3 files changed, 15 insertions(+), 2 deletions(-)
Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -172,7 +172,7 @@ void __init setup_node_bootmem(int nodei
/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
- unsigned long start_pfn, end_pfn;
+ unsigned long start_pfn, end_pfn, memmapsize, limit;
unsigned long zones[MAX_NR_ZONES];
unsigned long holes[MAX_NR_ZONES];
@@ -182,6 +182,16 @@ void __init setup_node_zones(int nodeid)
Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
nodeid, start_pfn, end_pfn);
+ /* Try to allocate mem_map at end to not fill up precious <4GB
+ memory. */
+ memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
+ limit = end_pfn << PAGE_SHIFT;
+ NODE_DATA(nodeid)->node_mem_map =
+ __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
+ memmapsize, SMP_CACHE_BYTES,
+ limit,
+ round_down(limit - memmapsize, PAGE_SIZE));
+
size_zones(zones, holes, start_pfn, end_pfn);
free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
start_pfn, holes);
Index: linux/include/linux/bootmem.h
===================================================================
--- linux.orig/include/linux/bootmem.h
+++ linux/include/linux/bootmem.h
@@ -52,6 +52,9 @@ extern void * __init __alloc_bootmem_low
unsigned long size,
unsigned long align,
unsigned long goal);
+extern void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
+ unsigned long size, unsigned long align, unsigned long goal,
+ unsigned long limit);
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
#define alloc_bootmem(x) \
Index: linux/mm/bootmem.c
===================================================================
--- linux.orig/mm/bootmem.c
+++ linux/mm/bootmem.c
@@ -152,7 +152,7 @@ static void __init free_bootmem_core(boo
*
* NOTE: This function is _not_ reentrant.
*/
-static void * __init
+void * __init
__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
unsigned long align, unsigned long goal, unsigned long limit)
{
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Patch] Move swiotlb_init early on X86_64
2006-03-07 8:39 ` Andi Kleen
@ 2006-03-07 23:23 ` Zou Nan hai
2006-03-08 9:33 ` Andi Kleen
0 siblings, 1 reply; 11+ messages in thread
From: Zou Nan hai @ 2006-03-07 23:23 UTC (permalink / raw
To: Andi Kleen; +Cc: LKML, Andrew Morton, Venkatesh Pallipadi
On Tue, 2006-03-07 at 16:39, Andi Kleen wrote:
> On Wednesday 01 March 2006 02:10, Zou Nan hai wrote:
> > on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
> >
> > On platforms with huge physical memory,
> > large memmap and vfs cache may eat up all usable system memory
> > under 4G.
> >
> > Move swiotlb_init early before memmap is allocated can
> > solve this issue.
> >
> > Signed-off-by: Zou Nan hai <Nanhai.zou@intel.com>
>
>
> I came up with a simpler change now that should fix the problem too.
> It just try to move the memmap to the end of the node. I don't have a system
> big enough to test the original problem though.
>
> It should be fairly safe because if the allocation fails we just fallback
> to the normal old way of allocating it near the beginning.
>
> Try to allocate node memmap near the end of node
>
> This fixes problems with very large nodes (over 128GB) filling up all of
> the first 4GB with their mem_map and not leaving enough
> space for the swiotlb.
>
>
> Signed-off-by: Andi Kleen <ak@suse.de>
>
> ---
> arch/x86_64/mm/numa.c | 12 +++++++++++-
> include/linux/bootmem.h | 3 +++
> mm/bootmem.c | 2 +-
> 3 files changed, 15 insertions(+), 2 deletions(-)
>
> Index: linux/arch/x86_64/mm/numa.c
> ===================================================================
> --- linux.orig/arch/x86_64/mm/numa.c
> +++ linux/arch/x86_64/mm/numa.c
> @@ -172,7 +172,7 @@ void __init setup_node_bootmem(int nodei
> /* Initialize final allocator for a zone */
> void __init setup_node_zones(int nodeid)
> {
> - unsigned long start_pfn, end_pfn;
> + unsigned long start_pfn, end_pfn, memmapsize, limit;
> unsigned long zones[MAX_NR_ZONES];
> unsigned long holes[MAX_NR_ZONES];
>
> @@ -182,6 +182,16 @@ void __init setup_node_zones(int nodeid)
> Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
> nodeid, start_pfn, end_pfn);
>
> + /* Try to allocate mem_map at end to not fill up precious <4GB
> + memory. */
> + memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
> + limit = end_pfn << PAGE_SHIFT;
> + NODE_DATA(nodeid)->node_mem_map =
> + __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
> + memmapsize, SMP_CACHE_BYTES,
> + limit,
> + round_down(limit - memmapsize, PAGE_SIZE));
> +
, round_down(limit - memmapsize, PAGE_SIZE), limit);?
Zou Nan hai
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Patch] Move swiotlb_init early on X86_64
2006-03-07 23:23 ` Zou Nan hai
@ 2006-03-08 9:33 ` Andi Kleen
0 siblings, 0 replies; 11+ messages in thread
From: Andi Kleen @ 2006-03-08 9:33 UTC (permalink / raw
To: Zou Nan hai; +Cc: LKML, Andrew Morton, Venkatesh Pallipadi
On Wednesday 08 March 2006 00:23, Zou Nan hai wrote:
>
> , round_down(limit - memmapsize, PAGE_SIZE), limit);?
Indeed. Thanks for catching that.
-Andi
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2006-03-08 9:40 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-03-01 1:10 [Patch] Move swiotlb_init early on X86_64 Zou Nan hai
2006-03-02 4:15 ` Tony Luck
2006-03-02 4:30 ` Andi Kleen
2006-03-02 4:33 ` Zou Nan hai
2006-03-07 8:39 ` Andi Kleen
2006-03-07 23:23 ` Zou Nan hai
2006-03-08 9:33 ` Andi Kleen
-- strict thread matches above, loose matches on Subject: below --
2006-03-02 9:09 Zhang, Yanmin
2006-03-02 23:35 ` Zou Nan hai
2006-03-03 1:32 ` Andi Kleen
2006-03-03 1:59 Pallipadi, Venkatesh
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).