LKML Archive mirror
* [PATCH v3] mm: swap: async free swap slot cache entries
@ 2024-02-13 23:20 Chris Li
  2024-02-14  0:08 ` Tim Chen
  0 siblings, 1 reply; 5+ messages in thread
From: Chris Li @ 2024-02-13 23:20 UTC
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Wei Xu, Yu Zhao, Greg Thelen,
	Chun-Tse Shao, Yosry Ahmed, Michal Hocko, Mel Gorman, Huang Ying,
	Nhat Pham, Kairui Song, Barry Song, Tim Chen, Chris Li

We discovered that 1% of swap page faults take 100us or more, while
50% of swap faults complete in under 20us.

Further investigation shows that for the long tail cases, a large
portion of the time is spent in the free_swap_slots() function.

The percpu cache of swap slots is freed in a batch of 64 entries
inside free_swap_slots(). These cache entries are accumulated
from previous page faults, which may not be related to the current
process.

Doing the batch free in the page fault handler causes longer
tail latencies and penalizes the current process.

Add /sys/kernel/mm/swap/swap_slot_async_free to control the
async free behavior. When enabled, a work queue is used to
asynchronously free the swap slots once the swap slot cache is full.
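With this patch applied, the behavior can be toggled at runtime, for
example (illustrative usage; the knob defaults to 0, i.e. off):

  # cat /sys/kernel/mm/swap/swap_slot_async_free
  0
  # echo 1 > /sys/kernel/mm/swap/swap_slot_async_free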

Testing:

Chun-Tse ran some benchmarks on a Chromebook, showing that
zram_wait_metrics improves by about 15% at the 80% and 95%
confidence levels.

I recently ran some experiments on about 1000 Google production
machines. They show that swapin latency in the long tail
100us - 500us bucket drops dramatically.

platform	(100-500us)	 	(0-100us)
A		1.12% -> 0.36%		98.47% -> 99.22%
B		0.65% -> 0.15%		98.96% -> 99.46%
C		0.61% -> 0.23%		98.96% -> 99.38%
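For reference, a bucketed swapin latency histogram similar to the above
can be approximated with bpftrace (a sketch, assuming do_swap_page is
kprobe-able on the running kernel; hist() uses power-of-2 buckets, so
the boundaries will not match exactly):

  # bpftrace -e 'kprobe:do_swap_page { @start[tid] = nsecs; }
      kretprobe:do_swap_page /@start[tid]/ {
          @usecs = hist((nsecs - @start[tid]) / 1000);
          delete(@start[tid]); }'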

Signed-off-by: Chris Li <chrisl@kernel.org>
---
Changes in v3:
- Address feedback from Tim Chen: the direct free path now frees all swap slots.
- Add /sys/kernel/mm/swap/swap_slot_async_free to enable async free. Default is off.
- Link to v2: https://lore.kernel.org/r/20240131-async-free-v2-1-525f03e07184@kernel.org

Changes in v2:
- Add description of the timing impact, as suggested by Ying.
- Remove create_workqueue() and use schedule_work()
- Link to v1: https://lore.kernel.org/r/20231221-async-free-v1-1-94b277992cb0@kernel.org
---
 include/linux/swap_slots.h |  2 ++
 mm/swap_slots.c            | 20 ++++++++++++++++++++
 mm/swap_state.c            | 23 +++++++++++++++++++++++
 3 files changed, 45 insertions(+)

diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h
index 15adfb8c813a..bb9a401d7cae 100644
--- a/include/linux/swap_slots.h
+++ b/include/linux/swap_slots.h
@@ -19,6 +19,7 @@ struct swap_slots_cache {
 	spinlock_t	free_lock;  /* protects slots_ret, n_ret */
 	swp_entry_t	*slots_ret;
 	int		n_ret;
+	struct work_struct async_free;
 };
 
 void disable_swap_slots_cache_lock(void);
@@ -27,5 +28,6 @@ void enable_swap_slots_cache(void);
 void free_swap_slot(swp_entry_t entry);
 
 extern bool swap_slot_cache_enabled;
+extern uint8_t slot_cache_async_free __read_mostly;
 
 #endif /* _LINUX_SWAP_SLOTS_H */
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0bec1f705f8e..9e9bc0ffb215 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -38,12 +38,15 @@
 static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
 static bool	swap_slot_cache_active;
 bool	swap_slot_cache_enabled;
+uint8_t	slot_cache_async_free;
+
 static bool	swap_slot_cache_initialized;
 static DEFINE_MUTEX(swap_slots_cache_mutex);
 /* Serialize swap slots cache enable/disable operations */
 static DEFINE_MUTEX(swap_slots_cache_enable_mutex);
 
 static void __drain_swap_slots_cache(unsigned int type);
+static void swapcache_async_free_entries(struct work_struct *data);
 
 #define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
 #define SLOTS_CACHE 0x1
@@ -149,6 +152,7 @@ static int alloc_swap_slot_cache(unsigned int cpu)
 		spin_lock_init(&cache->free_lock);
 		cache->lock_initialized = true;
 	}
+	INIT_WORK(&cache->async_free, swapcache_async_free_entries);
 	cache->nr = 0;
 	cache->cur = 0;
 	cache->n_ret = 0;
@@ -269,6 +273,20 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
 	return cache->nr;
 }
 
+static void swapcache_async_free_entries(struct work_struct *data)
+{
+	struct swap_slots_cache *cache;
+
+	cache = container_of(data, struct swap_slots_cache, async_free);
+	spin_lock_irq(&cache->free_lock);
+	/* Swap slots cache may be deactivated before acquiring lock */
+	if (cache->slots_ret && cache->n_ret) {
+		swapcache_free_entries(cache->slots_ret, cache->n_ret);
+		cache->n_ret = 0;
+	}
+	spin_unlock_irq(&cache->free_lock);
+}
+
 void free_swap_slot(swp_entry_t entry)
 {
 	struct swap_slots_cache *cache;
@@ -293,6 +311,8 @@ void free_swap_slot(swp_entry_t entry)
 		}
 		cache->slots_ret[cache->n_ret++] = entry;
 		spin_unlock_irq(&cache->free_lock);
+		if (slot_cache_async_free && cache->n_ret >= SWAP_SLOTS_CACHE_SIZE)
+			schedule_work(&cache->async_free);
 	} else {
 direct_free:
 		swapcache_free_entries(&entry, 1);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e671266ad772..e4549f33556b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -912,8 +912,31 @@ static ssize_t vma_ra_enabled_store(struct kobject *kobj,
 }
 static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
 
+static ssize_t swap_slot_async_free_show(struct kobject *kobj,
+				     struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", READ_ONCE(slot_cache_async_free));
+}
+static ssize_t swap_slot_async_free_store(struct kobject *kobj,
+				      struct kobj_attribute *attr,
+				      const char *buf, size_t count)
+{
+	ssize_t ret;
+	int val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+	WRITE_ONCE(slot_cache_async_free, !!val);
+	return count;
+}
+static struct kobj_attribute swap_slot_async_free_attr =
+	__ATTR(swap_slot_async_free, 0644, swap_slot_async_free_show,
+	       swap_slot_async_free_store);
+
 static struct attribute *swap_attrs[] = {
 	&vma_ra_enabled_attr.attr,
+	&swap_slot_async_free_attr.attr,
 	NULL,
 };
 

---
base-commit: eacce8189e28717da6f44ee492b7404c636ae0de
change-id: 20231216-async-free-bef392015432

Best regards,
-- 
Chris Li <chrisl@kernel.org>



* Re: [PATCH v3] mm: swap: async free swap slot cache entries
  2024-02-13 23:20 [PATCH v3] mm: swap: async free swap slot cache entries Chris Li
@ 2024-02-14  0:08 ` Tim Chen
  2024-02-14 18:56   ` Chris Li
  0 siblings, 1 reply; 5+ messages in thread
From: Tim Chen @ 2024-02-14  0:08 UTC
  To: Chris Li, Andrew Morton
  Cc: linux-kernel, linux-mm, Wei Xu, Yu Zhao, Greg Thelen,
	Chun-Tse Shao, Yosry Ahmed, Michal Hocko, Mel Gorman, Huang Ying,
	Nhat Pham, Kairui Song, Barry Song

On Tue, 2024-02-13 at 15:20 -0800, Chris Li wrote:
> [...]
>  extern bool swap_slot_cache_enabled;
> +extern uint8_t slot_cache_async_free __read_mostly;

Why wouldn't you enable the async_free always?
Otherwise the patch looks fine to me.

Tim


* Re: [PATCH v3] mm: swap: async free swap slot cache entries
  2024-02-14  0:08 ` Tim Chen
@ 2024-02-14 18:56   ` Chris Li
  2024-02-14 22:54     ` Tim Chen
  0 siblings, 1 reply; 5+ messages in thread
From: Chris Li @ 2024-02-14 18:56 UTC
  To: Tim Chen
  Cc: Andrew Morton, linux-kernel, linux-mm, Wei Xu, Yu Zhao,
	Greg Thelen, Chun-Tse Shao, Yosry Ahmed, Michal Hocko, Mel Gorman,
	Huang Ying, Nhat Pham, Kairui Song, Barry Song

On Tue, Feb 13, 2024 at 4:08 PM Tim Chen <tim.c.chen@linux.intel.com> wrote:
>
> On Tue, 2024-02-13 at 15:20 -0800, Chris Li wrote:
> > [...]
> >
> >  extern bool swap_slot_cache_enabled;
> > +extern uint8_t slot_cache_async_free __read_mostly;
>
> Why wouldn't you enable the async_free always?
> Otherwise the patch looks fine to me.

Thanks for the feedback.

Just in case someone doesn't care about this optimization and wants to
opt out of this behavior?
Anyway, I am happy to update the patch without the sysfs control file as well.

Chris


* Re: [PATCH v3] mm: swap: async free swap slot cache entries
  2024-02-14 18:56   ` Chris Li
@ 2024-02-14 22:54     ` Tim Chen
  2024-02-15  1:03       ` Chris Li
  0 siblings, 1 reply; 5+ messages in thread
From: Tim Chen @ 2024-02-14 22:54 UTC
  To: Chris Li
  Cc: Andrew Morton, linux-kernel, linux-mm, Wei Xu, Yu Zhao,
	Greg Thelen, Chun-Tse Shao, Yosry Ahmed, Michal Hocko, Mel Gorman,
	Huang Ying, Nhat Pham, Kairui Song, Barry Song

On Wed, 2024-02-14 at 10:56 -0800, Chris Li wrote:
> On Tue, Feb 13, 2024 at 4:08 PM Tim Chen <tim.c.chen@linux.intel.com> wrote:
> > 
> > 
> > > 
> > >  extern bool swap_slot_cache_enabled;
> > > +extern uint8_t slot_cache_async_free __read_mostly;
> > 
> > Why wouldn't you enable the async_free always?
> > Otherwise the patch looks fine to me.
> 
> Thanks for the feedback.
> 
> Just in case someone doesn't care about this optimization and wants to
> opt out of this behavior?
> Anyway, I am happy to update the patch without the sysfs control file as well.
> 

At least I couldn't see a downside to enabling it always in the latest
patch.  I think adding an extra sysfs file is unnecessary.

Tim


* Re: [PATCH v3] mm: swap: async free swap slot cache entries
  2024-02-14 22:54     ` Tim Chen
@ 2024-02-15  1:03       ` Chris Li
  0 siblings, 0 replies; 5+ messages in thread
From: Chris Li @ 2024-02-15  1:03 UTC
  To: Tim Chen
  Cc: Andrew Morton, linux-kernel, linux-mm, Wei Xu, Yu Zhao,
	Greg Thelen, Chun-Tse Shao, Yosry Ahmed, Michal Hocko, Mel Gorman,
	Huang Ying, Nhat Pham, Kairui Song, Barry Song

On Wed, Feb 14, 2024 at 2:54 PM Tim Chen <tim.c.chen@linux.intel.com> wrote:
>
> On Wed, 2024-02-14 at 10:56 -0800, Chris Li wrote:
> > On Tue, Feb 13, 2024 at 4:08 PM Tim Chen <tim.c.chen@linux.intel.com> wrote:
> > >
> > >
> > > >
> > > >  extern bool swap_slot_cache_enabled;
> > > > +extern uint8_t slot_cache_async_free __read_mostly;
> > >
> > > Why wouldn't you enable the async_free always?
> > > Otherwise the patch looks fine to me.
> >
> > Thanks for the feedback.
> >
> > Just in case someone doesn't care about this optimization and wants to
> > opt out of this behavior?
> > Anyway, I am happy to update the patch without the sysfs control file as well.
> >
>
> At least I couldn't see a downside to enabling it always in the latest
> patch.  I think adding an extra sysfs file is unnecessary.

Thanks, I just sent out V4 to remove the sysfs control file.

Chris

