All the mail mirrored from lore.kernel.org
* [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
@ 2013-09-30 12:35 Malcolm Crossley
  2013-09-30 13:26 ` Jan Beulich
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Malcolm Crossley @ 2013-09-30 12:35 UTC (permalink / raw
  To: xen-devel; +Cc: keir, andrew.cooper3, tim, JBeulich

The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
This allows the boot CPU to hold the heap_lock whilst each chunk is being
scrubbed and then release the heap_lock once all CPUs have finished scrubbing
their individual chunk. As a result the heap_lock is not held continuously and
pending softirqs are serviced periodically across all CPUs.

The page scrub memory chunks are allocated to the CPUs in a NUMA-aware
fashion to reduce socket interconnect overhead and improve performance.

This patch reduces the boot page scrub time on a 128GB, 64-core AMD Opteron
6386 machine from 49 seconds to 3 seconds.

Changes in v2
 - Reduced default chunk size to 128MB
 - Added code to scrub NUMA nodes with no active CPU linked to them
 - Be robust to boot CPU not being linked to a NUMA node

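At a high level, the lockstep dispatch introduced by the patch below boils down
to the following (a simplified sketch using the patch's own names; the real
code additionally handles the boot CPU, nodes without CPUs, and the final
partial chunk):

    /* Sketch only: scrub one chunk per CPU per iteration, in lockstep. */
    for ( mfn_off = 0; mfn_off < max_cpu_blk_size; mfn_off += chunk_size )
    {
        process_pending_softirqs();                  /* keep softirqs serviced */
        atomic_set(&bootscrub_count, total_node_cpus);

        spin_lock(&heap_lock);                       /* held for one chunk only */
        for_each_online_node ( i )
        {
            region[i].offset = mfn_off;              /* this iteration's chunk */
            on_selected_cpus(&node_to_cpumask(i),    /* NUMA-local workers */
                             smp_scrub_heap_pages, &region[i], 0);
        }
        while ( atomic_read(&bootscrub_count) > 0 )  /* lockstep barrier */
            cpu_relax();
        spin_unlock(&heap_lock);
    }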
diff -r a03cc3136759 -r ee1108d26fc5 docs/misc/xen-command-line.markdown
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -188,6 +188,16 @@ Scrub free RAM during boot.  This is a s
 accidentally leaking sensitive VM data into other VMs if Xen crashes
 and reboots.
 
+### bootscrub_blocksize
+> `= <size>`
+
+> Default: `128MiB`
+
+Maximum RAM block size to be scrubbed whilst holding the page heap lock and not
+running softirqs. Reduce this if softirqs are not being run frequently enough.
+Setting this to a high value may cause boot failure, particularly if the
+NMI watchdog is also enabled.
+
 ### cachesize
 > `= <size>`
 
diff -r a03cc3136759 -r ee1108d26fc5 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -65,6 +65,12 @@ static bool_t opt_bootscrub __initdata =
 boolean_param("bootscrub", opt_bootscrub);
 
 /*
+ * bootscrub_blocksize -> Size (bytes) of mem block to scrub with heaplock held
+ */
+static unsigned int __initdata opt_bootscrub_blocksize = 128 * 1024 * 1024;
+size_param("bootscrub_blocksize", opt_bootscrub_blocksize);
+
+/*
  * Bit width of the DMA heap -- used to override NUMA-node-first.
  * allocation strategy, which can otherwise exhaust low memory.
  */
@@ -90,6 +96,16 @@ static struct bootmem_region {
 } *__initdata bootmem_region_list;
 static unsigned int __initdata nr_bootmem_regions;
 
+static atomic_t __initdata bootscrub_count = ATOMIC_INIT(0);
+
+struct scrub_region {
+    u64 offset;
+    u64 start;
+    u64 chunk_size;
+    u64 cpu_block_size;
+};
+static struct scrub_region __initdata region[MAX_NUMNODES];
+
 static void __init boot_bug(int line)
 {
     panic("Boot BUG at %s:%d\n", __FILE__, line);
@@ -1254,28 +1270,44 @@ void __init end_boot_allocator(void)
     printk("\n");
 }
 
-/*
- * Scrub all unallocated pages in all heap zones. This function is more
- * convoluted than appears necessary because we do not want to continuously
- * hold the lock while scrubbing very large memory areas.
- */
-void __init scrub_heap_pages(void)
+void __init smp_scrub_heap_pages(void *data)
 {
-    unsigned long mfn;
+    unsigned long mfn, start_mfn, end_mfn;
     struct page_info *pg;
+    struct scrub_region *region = data;
+    unsigned int temp_cpu, local_node, local_cpu_index = 0;
+    unsigned int cpu = smp_processor_id();
 
-    if ( !opt_bootscrub )
-        return;
+    ASSERT(region != NULL);
 
-    printk("Scrubbing Free RAM: ");
+    local_node = cpu_to_node(cpu);
+    /* Determine if we are scrubbing using the boot CPU */
+    if ( region->cpu_block_size != ~0ULL )
+        /* Determine the current CPU's index into CPU's linked to this node*/
+        for_each_cpu( temp_cpu, &node_to_cpumask(local_node) )
+        {
+            if ( cpu == temp_cpu )
+                break;
+            local_cpu_index++;
+        }
 
-    for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
+    /* Calculate the starting mfn for this CPU's memory block */
+    start_mfn = region->start + (region->cpu_block_size * local_cpu_index)
+                + region->offset;
+
+    /* Calculate the end mfn into this CPU's memory block for this iteration */
+    if ( region->offset + region->chunk_size > region->cpu_block_size )
+        end_mfn = region->start + (region->cpu_block_size * local_cpu_index)
+                  + region->cpu_block_size;
+    else
+        end_mfn = start_mfn + region->chunk_size;
+
+
+    for ( mfn = start_mfn; mfn < end_mfn; mfn++ )
     {
-        process_pending_softirqs();
-
         pg = mfn_to_page(mfn);
 
-        /* Quick lock-free check. */
+        /* Check the mfn is valid and page is free. */
         if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
             continue;
 
@@ -1283,15 +1315,124 @@ void __init scrub_heap_pages(void)
         if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
             printk(".");
 
+        /* Do the scrub if possible */
+        if ( page_state_is(pg, free) )
+            scrub_one_page(pg);
+    }
+    /* Decrement count to indicate scrubbing complete on this CPU */
+    atomic_dec(&bootscrub_count);
+}
+
+/*
+ * Scrub all unallocated pages in all heap zones. This function uses all
+ * online cpu's to scrub the memory in parallel.
+ */
+void __init scrub_heap_pages(void)
+{
+    cpumask_t node_cpus, total_node_cpus_mask = {{ 0 }};
+    unsigned int i, boot_cpu_node, total_node_cpus, cpu = smp_processor_id();
+    unsigned long mfn, mfn_off, chunk_size, max_cpu_blk_size = 0;
+    unsigned long mem_start, mem_end;
+
+    if ( !opt_bootscrub )
+        return;
+
+    boot_cpu_node = cpu_to_node(cpu);
+
+    printk("Scrubbing Free RAM: ");
+
+    /* Scrub block size */
+    chunk_size = opt_bootscrub_blocksize >> PAGE_SHIFT;
+    if ( chunk_size == 0 )
+        chunk_size = 1;
+
+    /* Determine the amount of memory to scrub, per CPU on each Node */
+    for_each_online_node ( i )
+    {
+        /* Calculate Node memory start and end address */
+        mem_start = max(node_start_pfn(i), first_valid_mfn);
+        mem_end = min(mem_start + node_spanned_pages(i), max_page);
+        /* Divide by number of CPU's for this node */
+        node_cpus = node_to_cpumask(i);
+        /* It's possible a node has no CPU's */
+        if ( cpumask_empty(&node_cpus) )
+            continue;
+        cpumask_or(&total_node_cpus_mask, &total_node_cpus_mask, &node_cpus);
+
+        region[i].cpu_block_size = (mem_end - mem_start) /
+                                    cpumask_weight(&node_cpus);
+        region[i].start = mem_start;
+
+        if ( region[i].cpu_block_size > max_cpu_blk_size )
+            max_cpu_blk_size = region[i].cpu_block_size;
+    }
+
+    /* Round default chunk size down if required */
+    if ( max_cpu_blk_size && chunk_size > max_cpu_blk_size )
+        chunk_size = max_cpu_blk_size;
+
+    total_node_cpus = cpumask_weight(&total_node_cpus_mask);
+    /* Start all CPU's scrubbing memory, chunk_size at a time */
+    for ( mfn_off = 0; mfn_off < max_cpu_blk_size; mfn_off += chunk_size )
+    {
+        process_pending_softirqs();
+
+        atomic_set(&bootscrub_count, total_node_cpus);
+
         spin_lock(&heap_lock);
 
-        /* Re-check page status with lock held. */
-        if ( page_state_is(pg, free) )
-            scrub_one_page(pg);
+        /* Start all other CPU's on all nodes */
+        for_each_online_node ( i )
+        {
+            region[i].chunk_size = chunk_size;
+            region[i].offset = mfn_off;
+            node_cpus = node_to_cpumask(i);
+            /* Clear local cpu ID */
+            cpumask_clear_cpu(cpu, &node_cpus);
+            /* Start page scrubbing on all other CPU's */
+            on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 0);
+        }
+
+        /* Start scrub on local CPU if CPU linked to a memory node */
+        if ( boot_cpu_node != NUMA_NO_NODE )
+            smp_scrub_heap_pages(&region[boot_cpu_node]);
+
+        /* Wait for page scrubbing to complete on all other CPU's */
+        while ( atomic_read(&bootscrub_count) > 0 )
+            cpu_relax();
 
         spin_unlock(&heap_lock);
     }
 
+    /* Use the boot CPU to scrub any nodes which have no CPU's linked to them */
+    for_each_online_node ( i )
+    {
+        node_cpus = node_to_cpumask(i);
+
+        if ( !cpumask_empty(&node_cpus) )
+            continue;
+
+        mem_start = max(node_start_pfn(i), first_valid_mfn);
+        mem_end = min(mem_start + node_spanned_pages(i), max_page);
+
+        region[0].offset = 0;
+        region[0].cpu_block_size = ~0ULL;
+
+        for ( mfn = mem_start; mfn < mem_end; mfn += chunk_size )
+        {
+            spin_lock(&heap_lock);
+            if ( mfn + chunk_size > mem_end )
+                region[0].chunk_size = mem_end - mfn;
+            else
+                region[0].chunk_size = chunk_size;
+
+            region[0].start = mfn;
+
+            smp_scrub_heap_pages(&region[0]);
+            spin_unlock(&heap_lock);
+            process_pending_softirqs();
+        }
+    }
     printk("done.\n");
 
     /* Now that the heap is initialized, run checks and set bounds

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2013-09-30 12:35 [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's Malcolm Crossley
@ 2013-09-30 13:26 ` Jan Beulich
  2013-09-30 13:56   ` Malcolm Crossley
  2013-09-30 17:43 ` Konrad Rzeszutek Wilk
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2013-09-30 13:26 UTC (permalink / raw
  To: Malcolm Crossley; +Cc: andrew.cooper3, tim, keir, xen-devel

>>> On 30.09.13 at 14:35, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
> The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
> This allows the boot CPU to hold the heap_lock whilst each chunk is being
> scrubbed and then release the heap_lock once all CPUs have finished scrubbing
> their individual chunk. As a result the heap_lock is not held continuously and
> pending softirqs are serviced periodically across all CPUs.
>
> The page scrub memory chunks are allocated to the CPUs in a NUMA-aware
> fashion to reduce socket interconnect overhead and improve performance.
>
> This patch reduces the boot page scrub time on a 128GB, 64-core AMD Opteron
> 6386 machine from 49 seconds to 3 seconds.

And is this a NUMA system with heavily different access times
between local and remote memory?

What I'm trying to understand before reviewing the actual patch
is whether what you do is really necessary: Generally it ought to
be sufficient to have one CPU on each node scrub that node's
memory, as a CPU should be able to saturate the bus if it does
(almost) nothing but memory write. Hence having multiple cores
on the same socket (not to speak of multiple threads in a core)
do this work in parallel is likely not going to be beneficial, and
hence the logic you're adding here might be more complex than
necessary.

Jan
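
A minimal sketch of the alternative being described here, reusing the patch's
names (dispatch side only; each node's region would then cover the whole node
rather than a per-CPU share):

    /* Sketch: one scrubbing CPU per online node, rather than every core/thread. */
    for_each_online_node ( i )
    {
        unsigned int first = cpumask_first(&node_to_cpumask(i));

        if ( first < nr_cpu_ids )    /* skip nodes with no CPUs */
            on_selected_cpus(cpumask_of(first), smp_scrub_heap_pages,
                             &region[i], 0);
    }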

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2013-09-30 13:26 ` Jan Beulich
@ 2013-09-30 13:56   ` Malcolm Crossley
  2013-09-30 15:35     ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Malcolm Crossley @ 2013-09-30 13:56 UTC (permalink / raw
  To: Jan Beulich; +Cc: andrew.cooper3, tim, keir, xen-devel

On 30/09/13 14:26, Jan Beulich wrote:
>>>> On 30.09.13 at 14:35, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
>> The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
>> This allows the boot CPU to hold the heap_lock whilst each chunk is being
>> scrubbed and then release the heap_lock once all CPUs have finished scrubbing
>> their individual chunk. As a result the heap_lock is not held continuously and
>> pending softirqs are serviced periodically across all CPUs.
>>
>> The page scrub memory chunks are allocated to the CPUs in a NUMA-aware
>> fashion to reduce socket interconnect overhead and improve performance.
>>
>> This patch reduces the boot page scrub time on a 128GB, 64-core AMD Opteron
>> 6386 machine from 49 seconds to 3 seconds.
> And is this a NUMA system with heavily different access times
> between local and remote memory?
The AMD 64 core system has 8 NUMA nodes with up to 2 hops between the nodes.

This page shows some data on the round-trip bandwidth between different
cores:
http://www.cl.cam.ac.uk/research/srg/netos/ipc-bench/details/tmpn2YlFp.html

This paper also shows the difference in memory bandwidth with NUMA-aware
threads:

http://www.cs.uchicago.edu/files/tr_authentic/TR-2011-02.pdf

The unstrided results are the only ones we're interested in.
>
> What I'm trying to understand before reviewing the actual patch
> is whether what you do is really necessary: Generally it ought to
> be sufficient to have one CPU on each node scrub that node's
> memory, as a CPU should be able to saturate the bus if it does
> (almost) nothing but memory write. Hence having multiple cores
> on the same socket (not to speak of multiple threads in a core)
> do this work in parallel is likely not going to be beneficial, and
> hence the logic you're adding here might be more complex than
> necessary.
The second paper cited above shows that roughly 3 times more cores
than NUMA nodes are required to reach peak NUMA memory bandwidth.

The difference between 1 core per node bandwidth and 3 cores per node
bandwidth is:

AMD: 30000MB/s (1-core) vs 48000MB/s (3-core)
Intel: 12000MB/s (1-core) vs 38000MB/s (3-core)

So I think it's worth the extra complexity to have multiple cores per
node scrubbing memory.

Malcolm
> Jan
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2013-09-30 13:56   ` Malcolm Crossley
@ 2013-09-30 15:35     ` Jan Beulich
  2013-09-30 15:42       ` Andrew Cooper
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2013-09-30 15:35 UTC (permalink / raw
  To: Malcolm Crossley; +Cc: andrew.cooper3, tim, keir, xen-devel

>>> On 30.09.13 at 15:56, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
> The difference between 1 core per node bandwidth and 3 core's per node 
> bandwidth is:
> 
> AMD: 30000MB/s (1-core) vs 48000MB/s (3-core)
> Intel: 12000MB/s (1-core) vs 38000MB/s (3-core)
> 
> So I think it's worth the extra complexity to have multiple core's per 
> node scrubbing memory.

The numbers are convincing. Which means that the only request
I'd have is to avoid more than one hyperthread on a core to get
picked for doing the scrubbing.

Jan
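
One way to honour this, sketched with the cpumask helpers the patch already
uses plus Xen's per-CPU cpu_sibling_mask (the helper name below is made up for
illustration; this is not part of the posted patch):

    /* Sketch: drop all but the first hyperthread of each core from a mask. */
    static void __init reduce_to_one_thread_per_core(cpumask_t *mask)
    {
        unsigned int cpu;

        for_each_cpu ( cpu, mask )
            if ( cpu != cpumask_first(per_cpu(cpu_sibling_mask, cpu)) )
                cpumask_clear_cpu(cpu, mask);
    }

The RFC patch later in this thread achieves the same effect by clearing
siblings out of a node_cpus_nonsmt mask.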

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2013-09-30 15:35     ` Jan Beulich
@ 2013-09-30 15:42       ` Andrew Cooper
  2013-09-30 16:08         ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Andrew Cooper @ 2013-09-30 15:42 UTC (permalink / raw
  To: Jan Beulich; +Cc: xen-devel, Malcolm Crossley, tim, keir

On 30/09/13 16:35, Jan Beulich wrote:
>>>> On 30.09.13 at 15:56, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
>> The difference between 1 core per node bandwidth and 3 core's per node 
>> bandwidth is:
>>
>> AMD: 30000MB/s (1-core) vs 48000MB/s (3-core)
>> Intel: 12000MB/s (1-core) vs 38000MB/s (3-core)
>>
>> So I think it's worth the extra complexity to have multiple core's per 
>> node scrubbing memory.
> The numbers are convincing. Which means that the only request
> I'd have is to avoid more than one hyperthread on a core to get
> picked for doing the scrubbing.
>
> Jan
>

Why? Being independent operations, scrubbing like this will still be
sped up by hyperthreading.

As scrubbing RAM is simply a race to get it done as quickly as possible,
unless it can be demonstrated that using all cores is actively
detrimental to the overall time taken, we really should use all cores
where possible.

~Andrew

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2013-09-30 15:42       ` Andrew Cooper
@ 2013-09-30 16:08         ` Jan Beulich
  0 siblings, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2013-09-30 16:08 UTC (permalink / raw
  To: Andrew Cooper; +Cc: xen-devel, Malcolm Crossley, tim, keir

>>> On 30.09.13 at 17:42, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 30/09/13 16:35, Jan Beulich wrote:
>>>>> On 30.09.13 at 15:56, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
>>> The difference between 1 core per node bandwidth and 3 core's per node 
>>> bandwidth is:
>>>
>>> AMD: 30000MB/s (1-core) vs 48000MB/s (3-core)
>>> Intel: 12000MB/s (1-core) vs 38000MB/s (3-core)
>>>
>>> So I think it's worth the extra complexity to have multiple core's per 
>>> node scrubbing memory.
>> The numbers are convincing. Which means that the only request
>> I'd have is to avoid more than one hyperthread on a core to get
>> picked for doing the scrubbing.
> 
> Why? Being independent operations, scrubbing like this will still be
> sped up by hyperthreading.
> 
> As scrubbing RAM is simply a race to get it done as quickly as possible,
> unless it can be demonstrated that using all cores is actively
> detrimental to the overall time taken, we really should use all cores
> where possible.

As long as the actual scrubbing routine is written well enough
(to saturate the store bandwidth of a single core), having
two threads compete with one another for the store ports can
only hurt performance. _If_ multiple hyperthreads really did
turn out to be beneficial, that would rather be an indication
that clear_page() (or whatever is being used for the actual
scrubbing) needs improving.

Jan
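
For illustration, the kind of cache-bypassing store loop being alluded to looks
roughly like this (a sketch assuming x86-64 with SSE2; it is not Xen's actual
clear_page()/scrub_one_page() implementation):

    /* Sketch: clear a page with non-temporal stores so the scrubbing CPU
     * does not pollute its cache with data nobody will read back. */
    static void clear_page_nt(void *page)
    {
        unsigned int i;

        for ( i = 0; i < PAGE_SIZE / sizeof(long); i += 4 )
            asm volatile ( "movnti %1, 0(%0); movnti %1, 8(%0);"
                           "movnti %1, 16(%0); movnti %1, 24(%0)"
                           :: "r" ((long *)page + i), "r" (0L) : "memory" );
        asm volatile ( "sfence" ::: "memory" );   /* order the NT stores */
    }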

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2013-09-30 12:35 [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's Malcolm Crossley
  2013-09-30 13:26 ` Jan Beulich
@ 2013-09-30 17:43 ` Konrad Rzeszutek Wilk
  2013-10-03 11:39 ` Tim Deegan
  2014-04-01 19:29 ` Konrad Rzeszutek Wilk
  3 siblings, 0 replies; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-09-30 17:43 UTC (permalink / raw
  To: Malcolm Crossley; +Cc: keir, andrew.cooper3, tim, JBeulich, xen-devel

On Mon, Sep 30, 2013 at 01:35:17PM +0100, Malcolm Crossley wrote:
> The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
> This allows the boot CPU to hold the heap_lock whilst each chunk is being
> scrubbed and then release the heap_lock once all CPUs have finished scrubbing
> their individual chunk. As a result the heap_lock is not held continuously and
> pending softirqs are serviced periodically across all CPUs.
>
> The page scrub memory chunks are allocated to the CPUs in a NUMA-aware
> fashion to reduce socket interconnect overhead and improve performance.
>
> This patch reduces the boot page scrub time on a 128GB, 64-core AMD Opteron
> 6386 machine from 49 seconds to 3 seconds.

A slightly older version of this one cut the 1TB machine scrubbing time from
minutes (I think it was 5 or 10 - I gave up counting) to less than a minute.

> 
> Changes in v2
>  - Reduced default chunk size to 128MB
>  - Added code to scrub NUMA nodes with no active CPU linked to them
>  - Be robust to boot CPU not being linked to a NUMA node
> 
> diff -r a03cc3136759 -r ee1108d26fc5 docs/misc/xen-command-line.markdown
> --- a/docs/misc/xen-command-line.markdown
> +++ b/docs/misc/xen-command-line.markdown
> @@ -188,6 +188,16 @@ Scrub free RAM during boot.  This is a s
>  accidentally leaking sensitive VM data into other VMs if Xen crashes
>  and reboots.
>  
> +### bootscrub_blocksize
> +> `= <size>`
> +
> +> Default: `128MiB`
> +
> +Maximum RAM block size to be scrubbed whilst holding the page heap lock and not
> +running softirqs. Reduce this if softirqs are not being run frequently enough.
> +Setting this to a high value may cause cause boot failure, particularly if the
> +NMI watchdog is also enabled.
> +
>  ### cachesize
>  > `= <size>`
>  
> diff -r a03cc3136759 -r ee1108d26fc5 xen/common/page_alloc.c
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -65,6 +65,12 @@ static bool_t opt_bootscrub __initdata =
>  boolean_param("bootscrub", opt_bootscrub);
>  
>  /*
> + * bootscrub_blocksize -> Size (bytes) of mem block to scrub with heaplock held
> + */
> +static unsigned int __initdata opt_bootscrub_blocksize = 128 * 1024 * 1024;
> +size_param("bootscrub_blocksize", opt_bootscrub_blocksize);
> +
> +/*
>   * Bit width of the DMA heap -- used to override NUMA-node-first.
>   * allocation strategy, which can otherwise exhaust low memory.
>   */
> @@ -90,6 +96,16 @@ static struct bootmem_region {
>  } *__initdata bootmem_region_list;
>  static unsigned int __initdata nr_bootmem_regions;
>  
> +static atomic_t __initdata bootscrub_count = ATOMIC_INIT(0);
> +
> +struct scrub_region {
> +    u64 offset;
> +    u64 start;
> +    u64 chunk_size;
> +    u64 cpu_block_size;
> +};
> +static struct scrub_region __initdata region[MAX_NUMNODES];
> +
>  static void __init boot_bug(int line)
>  {
>      panic("Boot BUG at %s:%d\n", __FILE__, line);
> @@ -1254,28 +1270,44 @@ void __init end_boot_allocator(void)
>      printk("\n");
>  }
>  
> -/*
> - * Scrub all unallocated pages in all heap zones. This function is more
> - * convoluted than appears necessary because we do not want to continuously
> - * hold the lock while scrubbing very large memory areas.
> - */
> -void __init scrub_heap_pages(void)
> +void __init smp_scrub_heap_pages(void *data)
>  {
> -    unsigned long mfn;
> +    unsigned long mfn, start_mfn, end_mfn;
>      struct page_info *pg;
> +    struct scrub_region *region = data;
> +    unsigned int temp_cpu, local_node, local_cpu_index = 0;
> +    unsigned int cpu = smp_processor_id();
>  
> -    if ( !opt_bootscrub )
> -        return;
> +    ASSERT(region != NULL);
>  
> -    printk("Scrubbing Free RAM: ");
> +    local_node = cpu_to_node(cpu);
> +    /* Determine if we are scrubbing using the boot CPU */
> +    if ( region->cpu_block_size != ~0ULL )
> +        /* Determine the current CPU's index into CPU's linked to this node*/
> +        for_each_cpu( temp_cpu, &node_to_cpumask(local_node) )
> +        {
> +            if ( cpu == temp_cpu )
> +                break;
> +            local_cpu_index++;
> +        }
>  
> -    for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
> +    /* Calculate the starting mfn for this CPU's memory block */
> +    start_mfn = region->start + (region->cpu_block_size * local_cpu_index)
> +                + region->offset;
> +
> +    /* Calculate the end mfn into this CPU's memory block for this iteration */
> +    if ( region->offset + region->chunk_size > region->cpu_block_size )
> +        end_mfn = region->start + (region->cpu_block_size * local_cpu_index)
> +                  + region->cpu_block_size;
> +    else
> +        end_mfn = start_mfn + region->chunk_size;
> +
> +
> +    for ( mfn = start_mfn; mfn < end_mfn; mfn++ )
>      {
> -        process_pending_softirqs();
> -
>          pg = mfn_to_page(mfn);
>  
> -        /* Quick lock-free check. */
> +        /* Check the mfn is valid and page is free. */
>          if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
>              continue;
>  
> @@ -1283,15 +1315,124 @@ void __init scrub_heap_pages(void)
>          if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
>              printk(".");
>  
> +        /* Do the scrub if possible */
> +        if ( page_state_is(pg, free) )
> +            scrub_one_page(pg);
> +    }
> +    /* Increment count to indicate scrubbing complete on this CPU */
> +    atomic_dec(&bootscrub_count);
> +}
> +
> +/*
> + * Scrub all unallocated pages in all heap zones. This function uses all
> + * online cpu's to scrub the memory in parallel.
> + */
> +void __init scrub_heap_pages(void)
> +{
> +    cpumask_t node_cpus, total_node_cpus_mask = {{ 0 }};
> +    unsigned int i, boot_cpu_node, total_node_cpus, cpu = smp_processor_id();
> +    unsigned long mfn, mfn_off, chunk_size, max_cpu_blk_size = 0;
> +    unsigned long mem_start, mem_end;
> +
> +    if ( !opt_bootscrub )
> +        return;
> +
> +    boot_cpu_node = cpu_to_node(cpu);
> +
> +    printk("Scrubbing Free RAM: ");
> +
> +    /* Scrub block size */
> +    chunk_size = opt_bootscrub_blocksize >> PAGE_SHIFT;
> +    if ( chunk_size == 0 )
> +        chunk_size = 1;
> +
> +    /* Determine the amount of memory to scrub, per CPU on each Node */
> +    for_each_online_node ( i )
> +    {
> +        /* Calculate Node memory start and end address */
> +        mem_start = max(node_start_pfn(i), first_valid_mfn);
> +        mem_end = min(mem_start + node_spanned_pages(i), max_page);
> +        /* Divide by number of CPU's for this node */
> +        node_cpus = node_to_cpumask(i);
> +        /* It's possible a node has no CPU's */
> +        if ( cpumask_empty(&node_cpus) )
> +            continue;
> +        cpumask_or(&total_node_cpus_mask, &total_node_cpus_mask, &node_cpus);
> +
> +        region[i].cpu_block_size = (mem_end - mem_start) /
> +                                    cpumask_weight(&node_cpus);
> +        region[i].start = mem_start;
> +
> +        if ( region[i].cpu_block_size > max_cpu_blk_size )
> +            max_cpu_blk_size = region[i].cpu_block_size;
> +    }
> +
> +    /* Round default chunk size down if required */
> +    if ( max_cpu_blk_size && chunk_size > max_cpu_blk_size )
> +        chunk_size = max_cpu_blk_size;
> +
> +    total_node_cpus = cpumask_weight(&total_node_cpus_mask);
> +    /* Start all CPU's scrubbing memory, chunk_size at a time */
> +    for ( mfn_off = 0; mfn_off < max_cpu_blk_size; mfn_off += chunk_size )
> +    {
> +        process_pending_softirqs();
> +
> +        atomic_set(&bootscrub_count, total_node_cpus);
> +
>          spin_lock(&heap_lock);
>  
> -        /* Re-check page status with lock held. */
> -        if ( page_state_is(pg, free) )
> -            scrub_one_page(pg);
> +        /* Start all other CPU's on all nodes */
> +        for_each_online_node ( i )
> +        {
> +            region[i].chunk_size = chunk_size;
> +            region[i].offset = mfn_off;
> +            node_cpus = node_to_cpumask(i);
> +            /* Clear local cpu ID */
> +            cpumask_clear_cpu(cpu, &node_cpus);
> +            /* Start page scrubbing on all other CPU's */
> +            on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 0);
> +        }
> +
> +        /* Start scrub on local CPU if CPU linked to a memory node */
> +        if ( boot_cpu_node != NUMA_NO_NODE )
> +            smp_scrub_heap_pages(&region[boot_cpu_node]);
> +
> +        /* Wait for page scrubbing to complete on all other CPU's */
> +        while ( atomic_read(&bootscrub_count) > 0 )
> +            cpu_relax();
>  
>          spin_unlock(&heap_lock);
>      }
>  
> +    /* Use the boot CPU to scrub any nodes which have no CPU's linked to them */
> +    for_each_online_node ( i )
> +    {
> +        node_cpus = node_to_cpumask(i);
> +
> +        if ( !cpumask_empty(&node_cpus) )
> +            continue;
> +
> +        mem_start = max(node_start_pfn(i), first_valid_mfn);
> +        mem_end = min(mem_start + node_spanned_pages(i), max_page);
> +
> +        region[0].offset = 0;
> +        region[0].cpu_block_size = ~0ULL;
> +
> +        for ( mfn = mem_start; mfn < mem_end; mfn += chunk_size )
> +        {
> +            spin_lock(&heap_lock);
> +            if ( mfn + chunk_size > mem_end )
> +                region[0].chunk_size = mem_end - mfn;
> +            else
> +                region[0].chunk_size = chunk_size;
> +
> +            region[0].start = mfn;
> +
> +            smp_scrub_heap_pages(&region[0]);
> +            spin_unlock(&heap_lock);
> +            process_pending_softirqs();
> +        }
> +    }
>      printk("done.\n");
>  
>      /* Now that the heap is initialized, run checks and set bounds

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2013-09-30 12:35 [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's Malcolm Crossley
  2013-09-30 13:26 ` Jan Beulich
  2013-09-30 17:43 ` Konrad Rzeszutek Wilk
@ 2013-10-03 11:39 ` Tim Deegan
  2014-04-01 19:29 ` Konrad Rzeszutek Wilk
  3 siblings, 0 replies; 14+ messages in thread
From: Tim Deegan @ 2013-10-03 11:39 UTC (permalink / raw
  To: Malcolm Crossley; +Cc: andrew.cooper3, keir, JBeulich, xen-devel

Hi,

Again, I'm very much in favour of this in principle, but I have some
comments on implementation -- some of which I made last time as well!

At 13:35 +0100 on 30 Sep (1380548117), Malcolm Crossley wrote:
> +static atomic_t __initdata bootscrub_count = ATOMIC_INIT(0);
> +
> +struct scrub_region {
> +    u64 offset;
> +    u64 start;
> +    u64 chunk_size;
> +    u64 cpu_block_size;

Unsigned long, please: your actual scrubbing code uses unsigned long
local variables to work with these numbers anyway.
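
I.e. the struct would become (a sketch of the requested change, nothing more):

    struct scrub_region {
        unsigned long offset;
        unsigned long start;
        unsigned long chunk_size;
        unsigned long cpu_block_size;
    };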

> +    local_node = cpu_to_node(cpu);
> +    /* Determine if we are scrubbing using the boot CPU */
> +    if ( region->cpu_block_size != ~0ULL )

Please define a correctly typed macro for this constant. 
(Actually, having read the calling side I think this constant won't be
needed at all)

> @@ -1283,15 +1315,124 @@ void __init scrub_heap_pages(void)
>          if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
>              printk(".");

Since the scrub order is pretty much arbitrary this printk isn't much of
an indicator of progress.  Maybe replace it with a printk in the
dispatch loop instead?

> +        /* Do the scrub if possible */
> +        if ( page_state_is(pg, free) )

You already checked this a couple of lines above, and since you're not
taking the lock there's no need to re-check.

> +            scrub_one_page(pg);
> +    }

There should be a wmb() here, to make sure the main scrub dispatcher can't
exit while the last worker is still issuing writes.
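
I.e. roughly (a sketch of the suggested ordering, not part of the posted patch):

        /* Ensure all scrub stores are globally visible before signalling
         * completion to the dispatcher. */
        wmb();
        atomic_dec(&bootscrub_count);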

> +    /* Increment count to indicate scrubbing complete on this CPU */
> +    atomic_dec(&bootscrub_count);
> +}
> +
> +/*
> + * Scrub all unallocated pages in all heap zones. This function uses all
> + * online cpu's to scrub the memory in parallel.
> + */
> +void __init scrub_heap_pages(void)
> +{
> +    cpumask_t node_cpus, total_node_cpus_mask = {{ 0 }};
> +    unsigned int i, boot_cpu_node, total_node_cpus, cpu = smp_processor_id();
> +    unsigned long mfn, mfn_off, chunk_size, max_cpu_blk_size = 0;
> +    unsigned long mem_start, mem_end;
> +
> +    if ( !opt_bootscrub )
> +        return;
> +
> +    boot_cpu_node = cpu_to_node(cpu);
> +
> +    printk("Scrubbing Free RAM: ");
> +
> +    /* Scrub block size */
> +    chunk_size = opt_bootscrub_blocksize >> PAGE_SHIFT;

The naming is a little distracting here; in general you're using 'block'
to mean the amount that a CPU will scrub altogether, and 'chunk' for the
amount scrubbed at once -- maybe this command-line option should be
'chunk' too?

> +    if ( chunk_size == 0 )
> +        chunk_size = 1;
> +
> +    /* Determine the amount of memory to scrub, per CPU on each Node */
> +    for_each_online_node ( i )
> +    {
> +        /* Calculate Node memory start and end address */
> +        mem_start = max(node_start_pfn(i), first_valid_mfn);
> +        mem_end = min(mem_start + node_spanned_pages(i), max_page);

DYM min(node_start_pfn(i) + node_spanned_pages(i), max_page)?

> +        /* Divide by number of CPU's for this node */
> +        node_cpus = node_to_cpumask(i);

Shouldn't this be masked with cpu_online_map?
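
E.g. (a sketch of the suggested change, using the existing cpu_online_map):

        /* Only consider CPUs on this node that are actually online. */
        cpumask_and(&node_cpus, &node_to_cpumask(i), &cpu_online_map);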

> +        /* It's possible a node has no CPU's */
> +        if ( cpumask_empty(&node_cpus) )
> +            continue;
> +        cpumask_or(&total_node_cpus_mask, &total_node_cpus_mask, &node_cpus);
> +
> +        region[i].cpu_block_size = (mem_end - mem_start) /
> +                                    cpumask_weight(&node_cpus);

What if there's a remainder in this division?  I don't see anything that
will scrub the leftover frames.
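
One way to cover the remainder would be to round the per-CPU block size up
rather than down (a sketch, not a reviewed fix; any overshoot past the node's
last frame is tolerated by the worker's mfn_valid()/page-state checks):

        /* Round up so the division remainder is not left unscrubbed. */
        region[i].cpu_block_size = (mem_end - mem_start +
                                    cpumask_weight(&node_cpus) - 1) /
                                   cpumask_weight(&node_cpus);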

> +        region[i].start = mem_start;
> +
> +        if ( region[i].cpu_block_size > max_cpu_blk_size )
> +            max_cpu_blk_size = region[i].cpu_block_size;
> +    }
> +
> +    /* Round default chunk size down if required */
> +    if ( max_cpu_blk_size && chunk_size > max_cpu_blk_size )
> +        chunk_size = max_cpu_blk_size;

Is this necessary?  The worker will confine itself to the block size anyway.

> +    total_node_cpus = cpumask_weight(&total_node_cpus_mask);
> +    /* Start all CPU's scrubbing memory, chunk_size at a time */
> +    for ( mfn_off = 0; mfn_off < max_cpu_blk_size; mfn_off += chunk_size )
> +    {
> +        process_pending_softirqs();
> +
> +        atomic_set(&bootscrub_count, total_node_cpus);
> +
>          spin_lock(&heap_lock);
>  
> -        /* Re-check page status with lock held. */
> -        if ( page_state_is(pg, free) )
> -            scrub_one_page(pg);
> +        /* Start all other CPU's on all nodes */
> +        for_each_online_node ( i )
> +        {
> +            region[i].chunk_size = chunk_size;
> +            region[i].offset = mfn_off;

Not sure these need to be per-node values since they're always the same.
Also, since they're always the same, this doesn't need to be in a
for_each_online_node() loop: you could just set the shared offset
variable and use total_node_cpus_mask to trigger all the workers at once.

> +            node_cpus = node_to_cpumask(i);

Again, masked with cpu_online_map?

> +            /* Clear local cpu ID */
> +            cpumask_clear_cpu(cpu, &node_cpus);

AIUI, on_selected_cpus will DTRT here even if you're calling yourself.
You could also call it with @wait == 1 and then you don't need to
maintain your own bootscrub_count.
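
Combined with the earlier point about a single dispatch over
total_node_cpus_mask, that could look roughly like this (a sketch; the worker
would then pick its node's entry out of the region array via cpu_to_node()):

        /* One synchronous dispatch: on_selected_cpus() waits for completion
         * itself (wait == 1), so bootscrub_count and the cpu_relax() loop
         * are no longer needed. */
        on_selected_cpus(&total_node_cpus_mask, smp_scrub_heap_pages,
                         region, 1);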

> +            /* Start page scrubbing on all other CPU's */
> +            on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 0);
> +        }
> +
> +        /* Start scrub on local CPU if CPU linked to a memory node */
> +        if ( boot_cpu_node != NUMA_NO_NODE )
> +            smp_scrub_heap_pages(&region[boot_cpu_node]);
> +
> +        /* Wait for page scrubbing to complete on all other CPU's */
> +        while ( atomic_read(&bootscrub_count) > 0 )
> +            cpu_relax();
>  
>          spin_unlock(&heap_lock);
>      }
>  
> +    /* Use the boot CPU to scrub any nodes which have no CPU's linked to them */
> +    for_each_online_node ( i )
> +    {
> +        node_cpus = node_to_cpumask(i);
> +
> +        if ( !cpumask_empty(&node_cpus) )
> +            continue;
> +
> +        mem_start = max(node_start_pfn(i), first_valid_mfn);
> +        mem_end = min(mem_start + node_spanned_pages(i), max_page);

Again, ITYM node_start_pfn(i) + node_spanned_pages(i).

> +        region[0].offset = 0;
> +        region[0].cpu_block_size = ~0ULL;
> +
> +        for ( mfn = mem_start; mfn < mem_end; mfn += chunk_size )
> +        {
> +            spin_lock(&heap_lock);
> +            if ( mfn + chunk_size > mem_end )
> +                region[0].chunk_size = mem_end - mfn;
> +            else
> +                region[0].chunk_size = chunk_size;
> +
> +            region[0].start = mfn;
> +

You're struggling a little to force this information into the SMP
protocol; I think it would be cleaner if you carved the central loop of
smp_scrub_heap_pages() out into its own function and called that here
instead.
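
Roughly along these lines (a sketch; the helper name is made up):

    /* Scrub every free, valid page in [start_mfn, end_mfn). */
    static void __init scrub_mfn_range(unsigned long start_mfn,
                                       unsigned long end_mfn)
    {
        unsigned long mfn;

        for ( mfn = start_mfn; mfn < end_mfn; mfn++ )
        {
            struct page_info *pg = mfn_to_page(mfn);

            if ( mfn_valid(mfn) && page_state_is(pg, free) )
                scrub_one_page(pg);
        }
    }

smp_scrub_heap_pages() would compute its per-CPU bounds and call this, and the
CPU-less-node path could call it directly with explicit start/end values.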

> +            smp_scrub_heap_pages(&region[0]);
> +            spin_unlock(&heap_lock);
> +            process_pending_softirqs();

This belongs at the top of the loop, I think, since the last softirq
check was before the last iteration of the main loop.

Cheers,

Tim.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2013-09-30 12:35 [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's Malcolm Crossley
                   ` (2 preceding siblings ...)
  2013-10-03 11:39 ` Tim Deegan
@ 2014-04-01 19:29 ` Konrad Rzeszutek Wilk
  2014-04-03  1:19   ` Konrad Rzeszutek Wilk
  3 siblings, 1 reply; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-04-01 19:29 UTC (permalink / raw
  To: Malcolm Crossley; +Cc: andrew.cooper3, tim, keir, JBeulich, xen-devel

On Mon, Sep 30, 2013 at 01:35:17PM +0100, Malcolm Crossley wrote:
> The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
> This allows the boot CPU to hold the heap_lock whilst each chunk is being
> scrubbed and then release the heap_lock once all CPUs have finished scrubbing
> their individual chunk. As a result the heap_lock is not held continuously and
> pending softirqs are serviced periodically across all CPUs.
>
> The page scrub memory chunks are allocated to the CPUs in a NUMA-aware
> fashion to reduce socket interconnect overhead and improve performance.
>
> This patch reduces the boot page scrub time on a 128GB, 64-core AMD Opteron
> 6386 machine from 49 seconds to 3 seconds.
> 
> Changes in v2
>  - Reduced default chunk size to 128MB
>  - Added code to scrub NUMA nodes with no active CPU linked to them
>  - Be robust to boot CPU not being linked to a NUMA node

Is there an updated version of this patch?

My recollection is that Jan wanted this patch to spread the work
only on the cores and not the threads. Were there any other concerns?

Thanks!
> 
> diff -r a03cc3136759 -r ee1108d26fc5 docs/misc/xen-command-line.markdown
> --- a/docs/misc/xen-command-line.markdown
> +++ b/docs/misc/xen-command-line.markdown
> @@ -188,6 +188,16 @@ Scrub free RAM during boot.  This is a s
>  accidentally leaking sensitive VM data into other VMs if Xen crashes
>  and reboots.
>  
> +### bootscrub_blocksize
> +> `= <size>`
> +
> +> Default: `128MiB`
> +
> +Maximum RAM block size to be scrubbed whilst holding the page heap lock and not
> +running softirqs. Reduce this if softirqs are not being run frequently enough.
> +Setting this to a high value may cause cause boot failure, particularly if the
> +NMI watchdog is also enabled.
> +
>  ### cachesize
>  > `= <size>`
>  
> diff -r a03cc3136759 -r ee1108d26fc5 xen/common/page_alloc.c
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -65,6 +65,12 @@ static bool_t opt_bootscrub __initdata =
>  boolean_param("bootscrub", opt_bootscrub);
>  
>  /*
> + * bootscrub_blocksize -> Size (bytes) of mem block to scrub with heaplock held
> + */
> +static unsigned int __initdata opt_bootscrub_blocksize = 128 * 1024 * 1024;
> +size_param("bootscrub_blocksize", opt_bootscrub_blocksize);
> +
> +/*
>   * Bit width of the DMA heap -- used to override NUMA-node-first.
>   * allocation strategy, which can otherwise exhaust low memory.
>   */
> @@ -90,6 +96,16 @@ static struct bootmem_region {
>  } *__initdata bootmem_region_list;
>  static unsigned int __initdata nr_bootmem_regions;
>  
> +static atomic_t __initdata bootscrub_count = ATOMIC_INIT(0);
> +
> +struct scrub_region {
> +    u64 offset;
> +    u64 start;
> +    u64 chunk_size;
> +    u64 cpu_block_size;
> +};
> +static struct scrub_region __initdata region[MAX_NUMNODES];
> +
>  static void __init boot_bug(int line)
>  {
>      panic("Boot BUG at %s:%d\n", __FILE__, line);
> @@ -1254,28 +1270,44 @@ void __init end_boot_allocator(void)
>      printk("\n");
>  }
>  
> -/*
> - * Scrub all unallocated pages in all heap zones. This function is more
> - * convoluted than appears necessary because we do not want to continuously
> - * hold the lock while scrubbing very large memory areas.
> - */
> -void __init scrub_heap_pages(void)
> +void __init smp_scrub_heap_pages(void *data)
>  {
> -    unsigned long mfn;
> +    unsigned long mfn, start_mfn, end_mfn;
>      struct page_info *pg;
> +    struct scrub_region *region = data;
> +    unsigned int temp_cpu, local_node, local_cpu_index = 0;
> +    unsigned int cpu = smp_processor_id();
>  
> -    if ( !opt_bootscrub )
> -        return;
> +    ASSERT(region != NULL);
>  
> -    printk("Scrubbing Free RAM: ");
> +    local_node = cpu_to_node(cpu);
> +    /* Determine if we are scrubbing using the boot CPU */
> +    if ( region->cpu_block_size != ~0ULL )
> +        /* Determine the current CPU's index into CPU's linked to this node*/
> +        for_each_cpu( temp_cpu, &node_to_cpumask(local_node) )
> +        {
> +            if ( cpu == temp_cpu )
> +                break;
> +            local_cpu_index++;
> +        }
>  
> -    for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
> +    /* Calculate the starting mfn for this CPU's memory block */
> +    start_mfn = region->start + (region->cpu_block_size * local_cpu_index)
> +                + region->offset;
> +
> +    /* Calculate the end mfn into this CPU's memory block for this iteration */
> +    if ( region->offset + region->chunk_size > region->cpu_block_size )
> +        end_mfn = region->start + (region->cpu_block_size * local_cpu_index)
> +                  + region->cpu_block_size;
> +    else
> +        end_mfn = start_mfn + region->chunk_size;
> +
> +
> +    for ( mfn = start_mfn; mfn < end_mfn; mfn++ )
>      {
> -        process_pending_softirqs();
> -
>          pg = mfn_to_page(mfn);
>  
> -        /* Quick lock-free check. */
> +        /* Check the mfn is valid and page is free. */
>          if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
>              continue;
>  
> @@ -1283,15 +1315,124 @@ void __init scrub_heap_pages(void)
>          if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
>              printk(".");
>  
> +        /* Do the scrub if possible */
> +        if ( page_state_is(pg, free) )
> +            scrub_one_page(pg);
> +    }
> +    /* Increment count to indicate scrubbing complete on this CPU */
> +    atomic_dec(&bootscrub_count);
> +}
> +
> +/*
> + * Scrub all unallocated pages in all heap zones. This function uses all
> + * online cpu's to scrub the memory in parallel.
> + */
> +void __init scrub_heap_pages(void)
> +{
> +    cpumask_t node_cpus, total_node_cpus_mask = {{ 0 }};
> +    unsigned int i, boot_cpu_node, total_node_cpus, cpu = smp_processor_id();
> +    unsigned long mfn, mfn_off, chunk_size, max_cpu_blk_size = 0;
> +    unsigned long mem_start, mem_end;
> +
> +    if ( !opt_bootscrub )
> +        return;
> +
> +    boot_cpu_node = cpu_to_node(cpu);
> +
> +    printk("Scrubbing Free RAM: ");
> +
> +    /* Scrub block size */
> +    chunk_size = opt_bootscrub_blocksize >> PAGE_SHIFT;
> +    if ( chunk_size == 0 )
> +        chunk_size = 1;
> +
> +    /* Determine the amount of memory to scrub, per CPU on each Node */
> +    for_each_online_node ( i )
> +    {
> +        /* Calculate Node memory start and end address */
> +        mem_start = max(node_start_pfn(i), first_valid_mfn);
> +        mem_end = min(mem_start + node_spanned_pages(i), max_page);
> +        /* Divide by number of CPU's for this node */
> +        node_cpus = node_to_cpumask(i);
> +        /* It's possible a node has no CPU's */
> +        if ( cpumask_empty(&node_cpus) )
> +            continue;
> +        cpumask_or(&total_node_cpus_mask, &total_node_cpus_mask, &node_cpus);
> +
> +        region[i].cpu_block_size = (mem_end - mem_start) /
> +                                    cpumask_weight(&node_cpus);
> +        region[i].start = mem_start;
> +
> +        if ( region[i].cpu_block_size > max_cpu_blk_size )
> +            max_cpu_blk_size = region[i].cpu_block_size;
> +    }
> +
> +    /* Round default chunk size down if required */
> +    if ( max_cpu_blk_size && chunk_size > max_cpu_blk_size )
> +        chunk_size = max_cpu_blk_size;
> +
> +    total_node_cpus = cpumask_weight(&total_node_cpus_mask);
> +    /* Start all CPU's scrubbing memory, chunk_size at a time */
> +    for ( mfn_off = 0; mfn_off < max_cpu_blk_size; mfn_off += chunk_size )
> +    {
> +        process_pending_softirqs();
> +
> +        atomic_set(&bootscrub_count, total_node_cpus);
> +
>          spin_lock(&heap_lock);
>  
> -        /* Re-check page status with lock held. */
> -        if ( page_state_is(pg, free) )
> -            scrub_one_page(pg);
> +        /* Start all other CPU's on all nodes */
> +        for_each_online_node ( i )
> +        {
> +            region[i].chunk_size = chunk_size;
> +            region[i].offset = mfn_off;
> +            node_cpus = node_to_cpumask(i);
> +            /* Clear local cpu ID */
> +            cpumask_clear_cpu(cpu, &node_cpus);
> +            /* Start page scrubbing on all other CPU's */
> +            on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 0);
> +        }
> +
> +        /* Start scrub on local CPU if CPU linked to a memory node */
> +        if ( boot_cpu_node != NUMA_NO_NODE )
> +            smp_scrub_heap_pages(&region[boot_cpu_node]);
> +
> +        /* Wait for page scrubbing to complete on all other CPU's */
> +        while ( atomic_read(&bootscrub_count) > 0 )
> +            cpu_relax();
>  
>          spin_unlock(&heap_lock);
>      }
>  
> +    /* Use the boot CPU to scrub any nodes which have no CPU's linked to them */
> +    for_each_online_node ( i )
> +    {
> +        node_cpus = node_to_cpumask(i);
> +
> +        if ( !cpumask_empty(&node_cpus) )
> +            continue;
> +
> +        mem_start = max(node_start_pfn(i), first_valid_mfn);
> +        mem_end = min(mem_start + node_spanned_pages(i), max_page);
> +
> +        region[0].offset = 0;
> +        region[0].cpu_block_size = ~0ULL;
> +
> +        for ( mfn = mem_start; mfn < mem_end; mfn += chunk_size )
> +        {
> +            spin_lock(&heap_lock);
> +            if ( mfn + chunk_size > mem_end )
> +                region[0].chunk_size = mem_end - mfn;
> +            else
> +                region[0].chunk_size = chunk_size;
> +
> +            region[0].start = mfn;
> +
> +            smp_scrub_heap_pages(&region[0]);
> +            spin_unlock(&heap_lock);
> +            process_pending_softirqs();
> +        }
> +    }
>      printk("done.\n");
>  
>      /* Now that the heap is initialized, run checks and set bounds
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2014-04-01 19:29 ` Konrad Rzeszutek Wilk
@ 2014-04-03  1:19   ` Konrad Rzeszutek Wilk
  2014-04-03  8:35     ` Jan Beulich
  2014-04-03  9:00     ` Tim Deegan
  0 siblings, 2 replies; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-04-03  1:19 UTC (permalink / raw
  To: Malcolm Crossley, JBeulich; +Cc: andrew.cooper3, tim, keir, xen-devel

[-- Attachment #1: Type: text/plain, Size: 1683 bytes --]

On Tue, Apr 01, 2014 at 03:29:05PM -0400, Konrad Rzeszutek Wilk wrote:
> On Mon, Sep 30, 2013 at 01:35:17PM +0100, Malcolm Crossley wrote:
> > The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
> > This allows the boot CPU to hold the heap_lock whilst each chunk is being
> > scrubbed and then release the heap_lock once all CPUs have finished scrubbing
> > their individual chunk. As a result the heap_lock is not held continuously and
> > pending softirqs are serviced periodically across all CPUs.
> >
> > The page scrub memory chunks are allocated to the CPUs in a NUMA-aware
> > fashion to reduce socket interconnect overhead and improve performance.
> >
> > This patch reduces the boot page scrub time on a 128GB, 64-core AMD Opteron
> > 6386 machine from 49 seconds to 3 seconds.
> > 
> > Changes in v2
> >  - Reduced default chunk size to 128MB
> >  - Added code to scrub NUMA nodes with no active CPU linked to them
> >  - Be robust to boot CPU not being linked to a NUMA node
> 
> Is there an updated version of this patch?
> 
> My recollection is that Jan wanted this patch to spread the work
> only on the cores and not the threads. Were there any other concerns?

And for fun I tried it on a large box and it does make a difference.

With the patch (attached) where we only use the cores and skip
the SMT threads the boot time scrubbing for 1.5TB is 66 seconds
while for all of the CPUs (including threads) is 127 seconds.

Sorry about the quality of the patch - was poking at it while
on conference calls so please treat it as HACK/RFC.

Jan/Malcolm, were there any other issues that needed to be addressed?

Thanks.


[-- Attachment #2: nosmt.patch --]
[-- Type: text/plain, Size: 6613 bytes --]

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index b36a66e..0c9f12f 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -64,6 +64,8 @@ string_param("badpage", opt_badpage);
 static bool_t opt_bootscrub __initdata = 1;
 boolean_param("bootscrub", opt_bootscrub);
 
+static bool_t opt_nonsmt __initdata = 0;
+boolean_param("nonsmt", opt_nonsmt);
 /*
  * bootscrub_blocksize -> Size (bytes) of mem block to scrub with heaplock held
  */
@@ -103,6 +105,7 @@ struct scrub_region {
     u64 start;
     u64 chunk_size;
     u64 cpu_block_size;
+    cpumask_t cpu;
 };
 static struct scrub_region __initdata region[MAX_NUMNODES];
 
@@ -1286,6 +1289,7 @@ void __init smp_scrub_heap_pages(void *data)
     /* Determine if we are scrubbing using the boot CPU */
     if ( region->cpu_block_size != ~0ULL )
         /* Determine the current CPU's index into CPU's linked to this node*/
+        /* TODO :Ignore the siblings! */
         for_each_cpu( temp_cpu, &node_to_cpumask(local_node) )
         {
             if ( cpu == temp_cpu )
@@ -1304,7 +1308,6 @@ void __init smp_scrub_heap_pages(void *data)
     else
         end_mfn = start_mfn + region->chunk_size;
 
-
     for ( mfn = start_mfn; mfn < end_mfn; mfn++ )
     {
         pg = mfn_to_page(mfn);
@@ -1313,10 +1316,9 @@ void __init smp_scrub_heap_pages(void *data)
         if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
             continue;
 
-        /* Every 100MB, print a progress dot. */
-        if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
+        /* Every 1G, print a progress dot. */
+        if ( (mfn % ((1000*1024*1024)/PAGE_SIZE)) == 0 )
             printk(".");
-
         /* Do the scrub if possible */
         if ( page_state_is(pg, free) )
             scrub_one_page(pg);
@@ -1331,23 +1333,26 @@ void __init smp_scrub_heap_pages(void *data)
  */
 void __init scrub_heap_pages(void)
 {
-    cpumask_t node_cpus, total_node_cpus_mask = {{ 0 }};
-    unsigned int i, boot_cpu_node, total_node_cpus, cpu = smp_processor_id();
+    cpumask_t node_cpus, node_cpus_nonsmt, total_node_cpus_mask = {{ 0 }};
+    unsigned int i, j,boot_cpu_node, total_cpus, cpu = smp_processor_id(), sibling;
     unsigned long mfn, mfn_off, chunk_size, max_cpu_blk_size = 0;
     unsigned long mem_start, mem_end;
+    s_time_t start, end;
 
     if ( !opt_bootscrub )
         return;
 
     boot_cpu_node = cpu_to_node(cpu);
 
-    printk("Scrubbing Free RAM: ");
+    printk("Scrubbing Free RAM on %d nodes\n", num_online_nodes());
 
     /* Scrub block size */
     chunk_size = opt_bootscrub_blocksize >> PAGE_SHIFT;
     if ( chunk_size == 0 )
         chunk_size = 1;
 
+    printk("CPUs have %d threads.\n", cpumask_weight(per_cpu(cpu_sibling_mask, 0)));
+    printk("CPUs have %d cores.\n", cpumask_weight(per_cpu(cpu_core_mask, 0)) / cpumask_weight(per_cpu(cpu_sibling_mask, 0)));
     /* Determine the amount of memory to scrub, per CPU on each Node */
     for_each_online_node ( i )
     {
@@ -1359,27 +1364,49 @@ void __init scrub_heap_pages(void)
         /* It's possible a node has no CPU's */
         if ( cpumask_empty(&node_cpus) )
             continue;
-        cpumask_or(&total_node_cpus_mask, &total_node_cpus_mask, &node_cpus);
 
+        node_cpus_nonsmt = node_to_cpumask(i);
+        for_each_cpu(j, &node_cpus)
+        {
+            cpu = 0;
+            for_each_cpu(sibling, per_cpu(cpu_sibling_mask, j)) {
+                if (cpu++ == 0) /* Skip core */
+                    continue;
+                cpumask_clear_cpu(sibling, &node_cpus_nonsmt);
+            }
+        }
+        printk("node%d has %d CPUs non-SMT\n", i, cpumask_weight(&node_cpus_nonsmt));
+        for_each_cpu(j, &node_cpus_nonsmt)
+            printk("#%d,", j);
+
+        printk("\n");
+        if (opt_nonsmt)
+            cpumask_copy(&node_cpus, &node_cpus_nonsmt);
+
+        cpumask_or(&total_node_cpus_mask, &total_node_cpus_mask, &node_cpus);
         region[i].cpu_block_size = (mem_end - mem_start) /
                                     cpumask_weight(&node_cpus);
         region[i].start = mem_start;
+        cpumask_copy(&region[i].cpu, &node_cpus);
+        printk("NODE%d scrubbing %lx PFNs spread across %d CPUs\n", i, mem_end - mem_start, cpumask_weight(&node_cpus));
 
         if ( region[i].cpu_block_size > max_cpu_blk_size )
             max_cpu_blk_size = region[i].cpu_block_size;
     }
-
+    cpu = smp_processor_id(); /* We re-used it in the loop. */
     /* Round default chunk size down if required */
     if ( max_cpu_blk_size && chunk_size > max_cpu_blk_size )
         chunk_size = max_cpu_blk_size;
 
-    total_node_cpus = cpumask_weight(&total_node_cpus_mask);
+    total_cpus = cpumask_weight(&total_node_cpus_mask);
+    printk("Using a total of %d CPUS.\n", total_cpus);
+    start = NOW();
     /* Start all CPU's scrubbing memory, chunk_size at a time */
     for ( mfn_off = 0; mfn_off < max_cpu_blk_size; mfn_off += chunk_size )
     {
         process_pending_softirqs();
 
-        atomic_set(&bootscrub_count, total_node_cpus);
+        atomic_set(&bootscrub_count, total_cpus);
 
         spin_lock(&heap_lock);
 
@@ -1388,7 +1415,7 @@ void __init scrub_heap_pages(void)
         {
             region[i].chunk_size = chunk_size;
             region[i].offset = mfn_off;
-            node_cpus = node_to_cpumask(i);
+            cpumask_copy(&node_cpus, &region[i].cpu);
             /* Clear local cpu ID */
             cpumask_clear_cpu(cpu, &node_cpus);
             /* Start page scrubbing on all other CPU's */
@@ -1406,6 +1433,10 @@ void __init scrub_heap_pages(void)
         spin_unlock(&heap_lock);
     }
 
+    end = NOW();
+    printk("Done SMP scrubbing (%d seconds). Boot scrub on BSP:\n",
+           (u32)((end-start) >> 30));
+    start = NOW();
     /* Use the boot CPU to scrub any nodes which have no CPU's linked to them */
     for_each_online_node ( i )
     {
@@ -1416,6 +1447,7 @@ void __init scrub_heap_pages(void)
 
         mem_start = max(node_start_pfn(i), first_valid_mfn);
         mem_end = min(mem_start + node_spanned_pages(i), max_page);
+        printk("NODE%d scrubbing %lx->%lx\n", i, mem_start, mem_end);
 
         region[0].offset = 0;
         region[0].cpu_block_size = ~0ULL;
@@ -1435,7 +1467,8 @@ void __init scrub_heap_pages(void)
             process_pending_softirqs();
         }
     }
-    printk("done.\n");
+    end = NOW();
+    printk("done. (%d seconds)\n", (u32)(end - start) >> 30);
 
     /* Now that the heap is initialized, run checks and set bounds
      * for the low mem virq algorithm. */

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2014-04-03  1:19   ` Konrad Rzeszutek Wilk
@ 2014-04-03  8:35     ` Jan Beulich
  2014-04-03  9:00     ` Tim Deegan
  1 sibling, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2014-04-03  8:35 UTC (permalink / raw
  To: Konrad Rzeszutek Wilk
  Cc: keir, andrew.cooper3, tim, xen-devel, Malcolm Crossley

>>> On 03.04.14 at 03:19, <konrad@darnok.org> wrote:
> On Tue, Apr 01, 2014 at 03:29:05PM -0400, Konrad Rzeszutek Wilk wrote:
>> My recollection is that Jan wanted this patch to spread the work
>> only on the cores and not the threads. Were there any other concerns?
> 
> And for fun I tried it on a large box and it does make a difference.
> 
> With the patch (attached) where we only use the cores and skip
> the SMT threads the boot time scrubbing for 1.5TB is 66 seconds
> while for all of the CPUs (including threads) is 127 seconds.
> 
> Sorry about the quality of the patch - was poking at it while
> on conference calls so please treat it as HACK/RFC.
> 
> Jan/Malcolm, were there any other issues that needed to be addressed?

Iirc this was the only concern, so if you could send a proper, cleaned
up patch...

Jan
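
The cores-versus-threads point quoted above boils down to keeping only one
hardware thread per physical core in each node's scrub mask, so that SMT
siblings do not compete for the same memory bandwidth. A small standalone C
model of that filter (the toy cpu_to_core[] table stands in for the
hypervisor's sibling information; none of this is code from the thread):

#include <stdio.h>

#define NR_CPUS 8

/* Toy topology: cpu -> physical core id, two SMT threads per core. */
static const int cpu_to_core[NR_CPUS] = { 0, 0, 1, 1, 2, 2, 3, 3 };

int main(void)
{
    int in_scrub_set[NR_CPUS] = { 0 };
    int core_claimed[NR_CPUS] = { 0 };
    int cpu;

    /* Keep only the first thread seen on each core. */
    for ( cpu = 0; cpu < NR_CPUS; cpu++ )
    {
        int core = cpu_to_core[cpu];

        if ( !core_claimed[core] )
        {
            core_claimed[core] = 1;
            in_scrub_set[cpu] = 1;
        }
    }

    for ( cpu = 0; cpu < NR_CPUS; cpu++ )
        if ( in_scrub_set[cpu] )
            printf("CPU%d scrubs on behalf of core %d\n",
                   cpu, cpu_to_core[cpu]);

    return 0;
}

In the hypervisor the same filter would be applied while building each
node's node_cpus mask before handing out work.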

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2014-04-03  1:19   ` Konrad Rzeszutek Wilk
  2014-04-03  8:35     ` Jan Beulich
@ 2014-04-03  9:00     ` Tim Deegan
  2014-04-09 14:20       ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 14+ messages in thread
From: Tim Deegan @ 2014-04-03  9:00 UTC (permalink / raw
  To: Konrad Rzeszutek Wilk
  Cc: keir, andrew.cooper3, xen-devel, JBeulich, Malcolm Crossley

At 21:19 -0400 on 02 Apr (1396469958), Konrad Rzeszutek Wilk wrote:
> On Tue, Apr 01, 2014 at 03:29:05PM -0400, Konrad Rzeszutek Wilk wrote:
> > Is there an updated version of this patch?
> > 
> > My recollection is that Jan wanted this patch to spread the work
> > only on the cores and not the threads. Were there any other concerns?
> 
> And for fun I tried it on a large box and it does make a difference.
> 
> With the patch (attached) where we only use the cores and skip
> the SMT threads the boot time scrubbing for 1.5TB is 66 seconds
> while for all of the CPUs (including threads) is 127 seconds.
> 
> Sorry about the quality of the patch - was poking at it while
> on conference calls so please treat it as HACK/RFC.
> 
> Jan/Malcolm, were there any other issues that needed to be addressed?

Yes: http://lists.xen.org/archives/html/xen-devel/2013-10/msg00131.html

Tim.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2014-04-03  9:00     ` Tim Deegan
@ 2014-04-09 14:20       ` Konrad Rzeszutek Wilk
  2014-04-10 11:09         ` Dario Faggioli
  0 siblings, 1 reply; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-04-09 14:20 UTC (permalink / raw
  To: Tim Deegan
  Cc: keir, andrew.cooper3, xen-devel, JBeulich, Konrad Rzeszutek Wilk,
	Malcolm Crossley

On Thu, Apr 03, 2014 at 11:00:07AM +0200, Tim Deegan wrote:
> At 21:19 -0400 on 02 Apr (1396469958), Konrad Rzeszutek Wilk wrote:
> > On Tue, Apr 01, 2014 at 03:29:05PM -0400, Konrad Rzeszutek Wilk wrote:
> > > Is there an updated version of this patch?
> > > 
> > > My recollection is that Jan wanted this patch to spread the work
> > > only on the cores and not the threads. Were there any other concerns?
> > 
> > And for fun I tried it on a large box and it does make a difference.
> > 
> > With the patch (attached) where we only use the cores and skip
> > the SMT threads the boot time scrubbing for 1.5TB is 66 seconds
> > while for all of the CPUs (including threads) is 127 seconds.
> > 
> > Sorry about the quality of the patch - was poking at it while
> > on conference calls so please treat it as HACK/RFC.
> > 
> > Jan/Malcolm, were there any other issues that needed to be addressed?
> 
> Yes: http://lists.xen.org/archives/html/xen-devel/2013-10/msg00131.html

Pardon for pasting the comments. I can't find the email in my mailbox.

Regarding your comments:

>> +            region[i].chunk_size = chunk_size;
>> +            region[i].offset = mfn_off;

>Not sure these need to be per-node values since they're always the same.
>Also, since they're always the same, this doesn't need to be in a
>for_each_online_node() loop: you could just set the shared offset
>variable and use total_node_cpus_mask to trigger all the workers at once.

While I agree that 'region[i].offset = mfn_off' is always the same and
that making it a value visible to the workers is good, I think the other
suggestion has a potential problem. Let me explain after this comment:


>> +            cpumask_clear_cpu(cpu, &node_cpus);
>> .. snip..
>You could also call it with @wait == 1 and then you don't need to
>maintain your own bootscrub_count.

The issue with using @wait and also total_node_cpus_mask is that
the workers have no idea to which node they belong. That is, the
parameter we pass in, &region[i], won't work any more because the
'i' is a NUMA node index and not a CPU index.

An option would be for the worker to do:

void __init smp_scrub_heap_pages(void *data)
{
	bool_t scrub_bsp = (bool_t)data;
	unsigned long node_id = cpu_to_node(smp_processor_id());
	struct scrub_region *data;

	if (node_id == NUMA_NO_NODE)
		if (scrub_bsp)
			data = region[0];
		else
			goto out;
	else
		data = region[node_id];

and use that. Then the parameter passed in is just whether this
is the first loop (false) or the second loop (true).


Or perhaps:

>You're struggling a little to force this information into the SMP
>protocol; I think it would be cleaner if you carved the central loop of
>smp_scrub_heap_pages() out into its own function and called that here
>instead.

Just do that, and the workers will then simply quit if they detect
(node_id == NUMA_NO_NODE); there is no parameter parsing at all.

Let me try that.		
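
For illustration, that "no parameter parsing" variant, with the central
loop carved out as Tim suggests, could look roughly like the following
(scrub_node_chunk() is an illustrative name, not something from the patch,
and the chunk/offset bookkeeping and synchronisation are elided):

static void __init scrub_node_chunk(struct scrub_region *r)
{
    /* The central loop of smp_scrub_heap_pages(), walking and scrubbing
     * the MFNs of r's current chunk, would move here. */
}

void __init smp_scrub_heap_pages(void *unused)
{
    unsigned int node_id = cpu_to_node(smp_processor_id());

    /* Workers that belong to no node simply quit. */
    if ( node_id == NUMA_NO_NODE )
        return;

    scrub_node_chunk(&region[node_id]);
}

The second pass, where the boot CPU covers nodes with no CPUs of their own,
can then call scrub_node_chunk(&region[0]) directly instead of going through
the SMP entry point at all.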

> 
> Tim.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's
  2014-04-09 14:20       ` Konrad Rzeszutek Wilk
@ 2014-04-10 11:09         ` Dario Faggioli
  0 siblings, 0 replies; 14+ messages in thread
From: Dario Faggioli @ 2014-04-10 11:09 UTC (permalink / raw
  To: Konrad Rzeszutek Wilk
  Cc: keir, andrew.cooper3, Tim Deegan, xen-devel, JBeulich,
	Konrad Rzeszutek Wilk, Malcolm Crossley


On Wed, 2014-04-09 at 10:20 -0400, Konrad Rzeszutek Wilk wrote:
> On Thu, Apr 03, 2014 at 11:00:07AM +0200, Tim Deegan wrote:

> An option would be for the worker to do:
> 
> void __init smp_scrub_heap_pages(void *data)
> {
> 	bool_t scrub_bsp = (bool_t)data;
> 	unsigned long node_id = cpu_to_node(smp_processor_id());
> 	struct scrub_region *data;
> 
> 	if (node_id == NUMA_NO_NODE)
> 		if (scrub_bsp)
> 			data = region[0];
> 		else
> 			goto out;
> 	else
> 		data = region[node_id];
> 
> and use that. Then the parameter passed in is just whether this
> is the first loop (false) or the second loop (true).
> 
> 
> Or perhaps:
> 
> >You're struggling a little to force this information into the SMP
> >protocol; I think it would be cleaner if you carved the central loop of
> >smp_scrub_heap_pages() out into its own function and called that here
> >instead.
> 
> Just do that and then the workers will just quit if they detect
> (node_id == NUMA_NO_NODE) and there is no parameter parsing.
> 
FWIW, I think this option would look a lot better.

> Let me try that.		
> 
Let us know how it goes... I'm also interested in this patch.

Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)



^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2014-04-10 11:09 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-09-30 12:35 [PATCH v2] Xen: Spread boot time page scrubbing across all available CPU's Malcolm Crossley
2013-09-30 13:26 ` Jan Beulich
2013-09-30 13:56   ` Malcolm Crossley
2013-09-30 15:35     ` Jan Beulich
2013-09-30 15:42       ` Andrew Cooper
2013-09-30 16:08         ` Jan Beulich
2013-09-30 17:43 ` Konrad Rzeszutek Wilk
2013-10-03 11:39 ` Tim Deegan
2014-04-01 19:29 ` Konrad Rzeszutek Wilk
2014-04-03  1:19   ` Konrad Rzeszutek Wilk
2014-04-03  8:35     ` Jan Beulich
2014-04-03  9:00     ` Tim Deegan
2014-04-09 14:20       ` Konrad Rzeszutek Wilk
2014-04-10 11:09         ` Dario Faggioli
