Linux-mm Archive mirror
* [PATCH] VM improvements for 2.1.131
@ 1998-12-06  0:34 Rik van Riel
  1998-12-06  2:10 ` Eric W. Biederman
  1998-12-07 10:47 ` Neil Conway
  0 siblings, 2 replies; 15+ messages in thread
From: Rik van Riel @ 1998-12-06  0:34 UTC
  To: Linux MM; +Cc: Linux Kernel, Linus Torvalds

Hi,

this patch contains all that's needed to turn 2.1.131
into the Linux kernel with the fastest VM system the
world has ever known:

- fixes the auto balancing between buffer, cache and
  other memory by means of a vmscan.c fix and a swap.c
  adjustment (the borrow percentages only matter for
  obscenely large amounts of swap; see the sketch below)
- swap I/O syncing has been restored to documented behaviour
  and now again gives the possibility of increasing swap
  bandwidth by increasing pager_daemon.swap_cluster
- fixes the stats reporting for swap_cache_find_*
- swapin readahead: a much-requested feature that brings a
  huge increase in VM performance.

  The last feature is not quite ready, however, so Linus will
  probably want to remove it (the piece concerning page_alloc.c)
  before applying the patch to his tree. Note that the patch
  _is_ completely safe and has withstood 5000+ swaps/second
  without degrading interactive performance (under X) too much.

  I will be working on a more intelligent swapin readahead,
  however, so performance could become better still in the
  future. :)
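
For reference, the borrow percentages above feed the over-borrow
checks used in the vmscan.c hunk below. A sketch of the kind of
test involved (the field names match the buffer_mem_t initializers
in the patch; the surrounding globals are from memory, so treat
this as an approximation rather than the literal 2.1.131 code):

        /* Sketch only: buffermem is in bytes, num_physpages is
         * the total number of physical pages. */
        static int buffer_over_borrow(void)
        {
                return (buffermem >> PAGE_SHIFT) * 100 >
                       buffer_mem.borrow_percent * num_physpages;
        }

When such a check fires, the vmscan.c hunk forces state = 0, so
do_try_to_free_page() goes back to shrinking the cache first.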

regards,

Rik -- the flu hits, the flu hits, the flu hits -- MORE
+-------------------------------------------------------------------+
| Linux memory management tour guide.        H.H.vanRiel@phys.uu.nl |
| Scouting Vries cubscout leader.      http://www.phys.uu.nl/~riel/ |
+-------------------------------------------------------------------+

--- ./mm/vmscan.c.orig	Sat Dec  5 21:59:29 1998
+++ ./mm/vmscan.c	Sun Dec  6 00:55:11 1998
@@ -432,6 +432,8 @@
 
 	if (buffer_over_borrow() || pgcache_over_borrow())
 		state = 0;
+	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster / 2)
+		shrink_mmap(i, gfp_mask);
 
 	switch (state) {
 		do {
--- ./mm/swap.c.orig	Sun Dec  6 00:55:46 1998
+++ ./mm/swap.c	Sun Dec  6 00:56:53 1998
@@ -61,14 +61,14 @@
 swapstat_t swapstats = {0};
 
 buffer_mem_t buffer_mem = {
-	5,	/* minimum percent buffer */
-	10,	/* borrow percent buffer */
+	1,	/* minimum percent buffer */
+	20,	/* borrow percent buffer */
 	60	/* maximum percent buffer */
 };
 
 buffer_mem_t page_cache = {
-	5,	/* minimum percent page cache */
-	15,	/* borrow percent page cache */
+	1,	/* minimum percent page cache */
+	30,	/* borrow percent page cache */
 	75	/* maximum */
 };
 
--- ./mm/page_io.c.orig	Sat Dec  5 21:59:08 1998
+++ ./mm/page_io.c	Sun Dec  6 00:53:36 1998
@@ -60,7 +60,7 @@
 	}
 
 	/* Don't allow too many pending pages in flight.. */
-	if (atomic_read(&nr_async_pages) > SWAP_CLUSTER_MAX)
+	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
 		wait = 1;
 
 	p = &swap_info[type];
--- ./mm/page_alloc.c.orig	Sat Dec  5 21:59:08 1998
+++ ./mm/page_alloc.c	Sun Dec  6 00:53:36 1998
@@ -360,6 +360,35 @@
 }
 
 /*
+ * Primitive swap readahead code. We simply read the
+ * next 16 entries in the swap area. This method is
+ * chosen because it doesn't cost us any seek time.
+ * We also make sure to queue the 'original' request
+ * together with the readahead ones...
+ */
+void swapin_readahead(unsigned long entry) {
+        int i;
+        struct page *new_page;
+	unsigned long offset = SWP_OFFSET(entry);
+	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
+
+	for (i = 0; ++i < 16;) {
+	      if (offset >= swapdev->max
+		              || nr_free_pages - atomic_read(&nr_async_pages) <
+			      (freepages.high + freepages.low)/2)
+		      return;
+	      if (!swapdev->swap_map[offset] ||
+                              test_bit(offset, swapdev->swap_lockmap))
+		      continue;
+	      new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
+	      if (new_page != NULL)
+                      __free_page(new_page);
+	      offset++;
+	}
+	return;
+}
+
+/*
  * The tests may look silly, but it essentially makes sure that
  * no other process did a swap-in on us just as we were waiting.
  *
@@ -370,9 +399,15 @@
 	pte_t * page_table, unsigned long entry, int write_access)
 {
 	unsigned long page;
-	struct page *page_map;
-	
-	page_map = read_swap_cache(entry);
+	struct page *page_map = lookup_swap_cache(entry);
+
+	if (!page_map) {
+                swapin_readahead(entry);
+	        page_map = read_swap_cache(entry);
+	} else if (nr_free_pages > freepages.high || pgcache_over_borrow() ||
+                        buffer_over_borrow()) {
+                swapin_readahead(entry);
+        }
 
 	if (pte_val(*page_table) != entry) {
 		if (page_map)
--- ./mm/swap_state.c.orig	Sat Dec  5 21:59:08 1998
+++ ./mm/swap_state.c	Sun Dec  6 00:53:36 1998
@@ -258,9 +258,10 @@
  * incremented.
  */
 
-static struct page * lookup_swap_cache(unsigned long entry)
+struct page * lookup_swap_cache(unsigned long entry)
 {
 	struct page *found;
+	swap_cache_find_total++;
 	
 	while (1) {
 		found = find_page(&swapper_inode, entry);
@@ -268,8 +269,10 @@
 			return 0;
 		if (found->inode != &swapper_inode || !PageSwapCache(found))
 			goto out_bad;
-		if (!PageLocked(found))
+		if (!PageLocked(found)) {
+			swap_cache_find_success++;
 			return found;
+		}
 		__free_page(found);
 		__wait_on_page(found);
 	}
--- ./include/linux/swap.h.orig	Sat Dec  5 21:59:29 1998
+++ ./include/linux/swap.h	Sun Dec  6 00:53:36 1998
@@ -90,6 +90,7 @@
 extern struct page * read_swap_cache_async(unsigned long, int);
 #define read_swap_cache(entry) read_swap_cache_async(entry, 1);
 extern int FASTCALL(swap_count(unsigned long));
+extern struct page * lookup_swap_cache(unsigned long); 
 /*
  * Make these inline later once they are working properly.
  */


* Re: [PATCH] VM improvements for 2.1.131
  1998-12-06  2:10 ` Eric W. Biederman
@ 1998-12-06  1:59   ` Rik van Riel
  1998-12-07 17:08   ` Stephen C. Tweedie
  1 sibling, 0 replies; 15+ messages in thread
From: Rik van Riel @ 1998-12-06  1:59 UTC
  To: Eric W. Biederman; +Cc: Linux MM, Linus Torvalds

On 5 Dec 1998, Eric W. Biederman wrote:
> >>>>> "RR" == Rik van Riel <H.H.vanRiel@phys.uu.nl> writes:
> 
>  
> RR 	/* Don't allow too many pending pages in flight.. */
> RR-	if (atomic_read(&nr_async_pages) > SWAP_CLUSTER_MAX)
> RR+	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
> RR 		wait = 1;
> 
> How will this possibly work if we are using a swapfile 
> and we always swap synchronously?

It won't. But if you are using a swapfile you'll always
lose. Due to on-drive track buffers and head locality it
won't be a real performance loss, though... (I hope).

What we really need is somebody to fix swapfile I/O.

regards,

Rik -- the flu hits, the flu hits, the flu hits -- MORE
+-------------------------------------------------------------------+
| Linux memory management tour guide.        H.H.vanRiel@phys.uu.nl |
| Scouting Vries cubscout leader.      http://www.phys.uu.nl/~riel/ |
+-------------------------------------------------------------------+


* Re: [PATCH] VM improvements for 2.1.131
  1998-12-06  0:34 [PATCH] VM improvements for 2.1.131 Rik van Riel
@ 1998-12-06  2:10 ` Eric W. Biederman
  1998-12-06  1:59   ` Rik van Riel
  1998-12-07 17:08   ` Stephen C. Tweedie
  1998-12-07 10:47 ` Neil Conway
  1 sibling, 2 replies; 15+ messages in thread
From: Eric W. Biederman @ 1998-12-06  2:10 UTC
  To: Rik van Riel; +Cc: Linux MM, Linux Kernel, Linus Torvalds

>>>>> "RR" == Rik van Riel <H.H.vanRiel@phys.uu.nl> writes:

 
RR 	/* Don't allow too many pending pages in flight.. */
RR-	if (atomic_read(&nr_async_pages) > SWAP_CLUSTER_MAX)
RR+	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
RR 		wait = 1;

How will this possibly work if we are using a swapfile 
and we always swap synchronously?

Eric

* Re: [PATCH] VM improvements for 2.1.131
  1998-12-06  0:34 [PATCH] VM improvements for 2.1.131 Rik van Riel
  1998-12-06  2:10 ` Eric W. Biederman
@ 1998-12-07 10:47 ` Neil Conway
  1998-12-07 13:04   ` Rik van Riel
  1 sibling, 1 reply; 15+ messages in thread
From: Neil Conway @ 1998-12-07 10:47 UTC
  To: Rik van Riel; +Cc: Linux MM, Linux Kernel

Won't making the min_percent values (cache/buffers) equal to 1% wreck
performance on small memory machines?

Neil

* Re: [PATCH] VM improvements for 2.1.131
  1998-12-07 10:47 ` Neil Conway
@ 1998-12-07 13:04   ` Rik van Riel
  1998-12-07 18:01     ` Stephen C. Tweedie
  0 siblings, 1 reply; 15+ messages in thread
From: Rik van Riel @ 1998-12-07 13:04 UTC
  To: Neil Conway; +Cc: Linux MM, Linux Kernel

On Mon, 7 Dec 1998, Neil Conway wrote:

> Won't making the min_percent values (cache/buffers) equal to 1%
> wreck performance on small memory machines? 

No. When the caches are heavily used they will need to be
freed anyway since we need the space for new data to be
read in.

Besides, swap_out() doesn't free any memory any more,
so we need to run shrink_mmap() regardless.

What we really need is somebody to try it out on 4M and
8M machines...

cheers,

Rik -- the flu hits, the flu hits, the flu hits -- MORE
+-------------------------------------------------------------------+
| Linux memory management tour guide.        H.H.vanRiel@phys.uu.nl |
| Scouting Vries cubscout leader.      http://www.phys.uu.nl/~riel/ |
+-------------------------------------------------------------------+


* Re: [PATCH] VM improvements for 2.1.131
  1998-12-06  2:10 ` Eric W. Biederman
  1998-12-06  1:59   ` Rik van Riel
@ 1998-12-07 17:08   ` Stephen C. Tweedie
  1 sibling, 0 replies; 15+ messages in thread
From: Stephen C. Tweedie @ 1998-12-07 17:08 UTC
  To: Eric W. Biederman; +Cc: Rik van Riel, Linux MM, Linux Kernel, Linus Torvalds

Hi,

On 05 Dec 1998 20:10:01 -0600, ebiederm+eric@ccr.net (Eric W. Biederman)
said:

>>>>>> "RR" == Rik van Riel <H.H.vanRiel@phys.uu.nl> writes:
> RR 	/* Don't allow too many pending pages in flight.. */
> RR-	if (atomic_read(&nr_async_pages) > SWAP_CLUSTER_MAX)
> RR+	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
> RR 		wait = 1;

> How will this possibly work if we are using a swapfile 
> and we always swap synchronously?

It doesn't make any difference: these lines just put an upper limit on
the amount of asynchronous swapping we can have at any point in time.
If all of our swapping is already synchronous, then the upper limit has
no effect.

--Stephen

* Re: [PATCH] VM improvements for 2.1.131
  1998-12-07 13:04   ` Rik van Riel
@ 1998-12-07 18:01     ` Stephen C. Tweedie
  1998-12-07 22:04       ` Stephen C. Tweedie
  0 siblings, 1 reply; 15+ messages in thread
From: Stephen C. Tweedie @ 1998-12-07 18:01 UTC
  To: Rik van Riel; +Cc: Neil Conway, Linux MM, Linux Kernel

Hi,

On Mon, 7 Dec 1998 14:04:04 +0100 (CET), Rik van Riel
<H.H.vanRiel@phys.uu.nl> said:

> what we really need is somebody to try it out on 4M and
> 8M machines...

Been doing that.  2.1.130 is the fastest kernel ever in 8MB (using
defrag builds over NFS as a benchmark): 25% faster than 2.0.36.  2.1.131
is consistently about 10% slower than 130 at the same job (but still
faster than 2.0 ever was).

--Stephen

* Re: [PATCH] VM improvements for 2.1.131
  1998-12-07 18:01     ` Stephen C. Tweedie
@ 1998-12-07 22:04       ` Stephen C. Tweedie
  1998-12-07 22:51         ` Rik van Riel
  1998-12-09 17:43         ` Andrea Arcangeli
  0 siblings, 2 replies; 15+ messages in thread
From: Stephen C. Tweedie @ 1998-12-07 22:04 UTC
  To: Rik van Riel
  Cc: Neil Conway, Linux MM, Linux Kernel, Stephen Tweedie,
	Andrea Arcangeli, Alan Cox

Hi,

On Mon, 7 Dec 1998 18:01:31 GMT, "Stephen C. Tweedie" <sct@redhat.com>
said:

>> what we really need is somebody to try it out on 4M and
>> 8M machines...

> Been doing that.  2.1.130 is the fastest kernel ever in 8MB (using
> defrag builds over NFS as a benchmark): 25% faster than 2.0.36.  2.1.131
> is consistently about 10% slower than 130 at the same job (but still
> faster than 2.0 ever was).

Right: 2.1.131 + Rik's fixes + my fix to Rik's fixes (see below) has set
a new record for my 8MB benchmarks.  In 64MB, it is behaving much more
rationally than older kernels: still very very very fast, especially
interactively, but with no massive cache growth and swap storms when
doing filesystem intensive operations, and swap throughput when we _do_
swap is great.

I've changed your readahead stuff to look like:

	struct page *page_map = lookup_swap_cache(entry);

	if (!page_map) {
                swapin_readahead(entry);
		page_map = read_swap_cache(entry);
	}

which is the right way to do it: we don't want to start a readahead on a
swap hit, because that will try to extend the readahead "zone" one page
at a time as we hit existing pages in the cache.  That ends up with
one-page writes, with terrible performance if we have other IO activity
on the same disk.  I also tuned the readahead down to 8 pages for the
tests on 8MB; we can make this tunable later.

I also fixed the readahead logic itself to start with the correct
initial page (previously you were doing a "++i" in the for () condition,
which means we were skipping the first page in the readahead).  Now that
the readahead is being submitted before we do the wait-for-page, we need
to make absolutely sure to include the required page in the readahead
set.
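
The off-by-one is easy to check in isolation: with the original
condition the body never runs with i == 0, and it executes one
time fewer than the fixed form.

        int i, passes = 0;
        for (i = 0; ++i < 16;)                  /* i = 1..15: 15 passes */
                passes++;
        for (i = 0, passes = 0; i < 16; i++)    /* i = 0..15: 16 passes */
                passes++;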

Finally, I'll experiment with making the readahead a granularity-based
thing, so that we read an aligned block of (say) 64k from swap at a
time.  By starting the readahead on such a boundary rather than at the
current page, we can page in entire regions of swap very rapidly given a
random pattern of page hits.
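
In sketch form, reusing swapdev, offset, entry and new_page from
swapin_readahead() above (the cluster size and the rounding are
illustrative only, nothing is settled yet):

        /* Hypothetical aligned window: 16 pages, i.e. 64k with 4k
         * pages.  The size must stay a power of two for the mask. */
        unsigned long cluster = 65536 / PAGE_SIZE;
        unsigned long start = offset & ~(cluster - 1);
        unsigned long stop = start + cluster;

        for (; start < stop; start++) {
                if (start >= swapdev->max)
                        break;
                if (!swapdev->swap_map[start] ||
                    test_bit(start, swapdev->swap_lockmap))
                        continue;
                new_page = read_swap_cache_async(
                                SWP_ENTRY(SWP_TYPE(entry), start), 0);
                if (new_page != NULL)
                        __free_page(new_page);
        }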

For now, this is looking very good indeed.

--Stephen

----------------------------------------------------------------
--- include/linux/swap.h.~1~	Mon Dec  7 12:05:54 1998
+++ include/linux/swap.h	Mon Dec  7 18:55:55 1998
@@ -90,6 +90,7 @@
 extern struct page * read_swap_cache_async(unsigned long, int);
 #define read_swap_cache(entry) read_swap_cache_async(entry, 1);
 extern int FASTCALL(swap_count(unsigned long));
+extern struct page * lookup_swap_cache(unsigned long); 
 /*
  * Make these inline later once they are working properly.
  */
--- mm/page_alloc.c.~1~	Fri Nov 27 12:36:42 1998
+++ mm/page_alloc.c	Mon Dec  7 20:42:36 1998
@@ -360,6 +360,35 @@
 }
 
 /*
+ * Primitive swap readahead code. We simply read the
+ * next 8 entries in the swap area. This method is
+ * chosen because it doesn't cost us any seek time.
+ * We also make sure to queue the 'original' request
+ * together with the readahead ones...
+ */
+void swapin_readahead(unsigned long entry) {
+        int i;
+        struct page *new_page;
+	unsigned long offset = SWP_OFFSET(entry);
+	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
+
+	for (i = 0; i < 8; i++) {
+	      if (offset >= swapdev->max
+		              || nr_free_pages - atomic_read(&nr_async_pages) <
+			      (freepages.high + freepages.low)/2)
+		      return;
+	      if (!swapdev->swap_map[offset] ||
+                              test_bit(offset, swapdev->swap_lockmap))
+		      continue;
+	      new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
+	      if (new_page != NULL)
+                      __free_page(new_page);
+	      offset++;
+	}
+	return;
+}
+
+/*
  * The tests may look silly, but it essentially makes sure that
  * no other process did a swap-in on us just as we were waiting.
  *
@@ -370,10 +399,12 @@
 	pte_t * page_table, unsigned long entry, int write_access)
 {
 	unsigned long page;
-	struct page *page_map;
-	
-	page_map = read_swap_cache(entry);
+	struct page *page_map = lookup_swap_cache(entry);
 
+	if (!page_map) {
+                swapin_readahead(entry);
+		page_map = read_swap_cache(entry);
+	}
 	if (pte_val(*page_table) != entry) {
 		if (page_map)
 			free_page_and_swap_cache(page_address(page_map));
--- mm/page_io.c.~1~	Fri Nov 27 12:36:42 1998
+++ mm/page_io.c	Mon Dec  7 18:55:55 1998
@@ -60,7 +60,7 @@
 	}
 
 	/* Don't allow too many pending pages in flight.. */
-	if (atomic_read(&nr_async_pages) > SWAP_CLUSTER_MAX)
+	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
 		wait = 1;
 
 	p = &swap_info[type];
--- mm/swap.c.~1~	Mon Dec  7 12:05:54 1998
+++ mm/swap.c	Mon Dec  7 18:55:55 1998
@@ -61,14 +61,14 @@
 swapstat_t swapstats = {0};
 
 buffer_mem_t buffer_mem = {
-	5,	/* minimum percent buffer */
-	10,	/* borrow percent buffer */
+	1,	/* minimum percent buffer */
+	20,	/* borrow percent buffer */
 	60	/* maximum percent buffer */
 };
 
 buffer_mem_t page_cache = {
-	5,	/* minimum percent page cache */
-	15,	/* borrow percent page cache */
+	1,	/* minimum percent page cache */
+	30,	/* borrow percent page cache */
 	75	/* maximum */
 };
 
--- mm/swap_state.c.~1~	Fri Nov 27 12:36:42 1998
+++ mm/swap_state.c	Mon Dec  7 18:55:55 1998
@@ -258,9 +258,10 @@
  * incremented.
  */
 
-static struct page * lookup_swap_cache(unsigned long entry)
+struct page * lookup_swap_cache(unsigned long entry)
 {
 	struct page *found;
+	swap_cache_find_total++;
 	
 	while (1) {
 		found = find_page(&swapper_inode, entry);
@@ -268,8 +269,10 @@
 			return 0;
 		if (found->inode != &swapper_inode || !PageSwapCache(found))
 			goto out_bad;
-		if (!PageLocked(found))
+		if (!PageLocked(found)) {
+			swap_cache_find_success++;
 			return found;
+		}
 		__free_page(found);
 		__wait_on_page(found);
 	}
--- mm/vmscan.c.~1~	Mon Dec  7 12:05:54 1998
+++ mm/vmscan.c	Mon Dec  7 18:55:55 1998
@@ -432,6 +432,8 @@
 
 	if (buffer_over_borrow() || pgcache_over_borrow())
 		state = 0;
+	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster / 2)
+		shrink_mmap(i, gfp_mask);
 
 	switch (state) {
 		do {

* Re: [PATCH] VM improvements for 2.1.131
  1998-12-07 22:04       ` Stephen C. Tweedie
@ 1998-12-07 22:51         ` Rik van Riel
  1998-12-09 17:43         ` Andrea Arcangeli
  1 sibling, 0 replies; 15+ messages in thread
From: Rik van Riel @ 1998-12-07 22:51 UTC
  To: Stephen C. Tweedie
  Cc: Neil Conway, Linux MM, Linux Kernel, Andrea Arcangeli, Alan Cox

On Mon, 7 Dec 1998, Stephen C. Tweedie wrote:

> Right: 2.1.131 + Rik's fixes + my fix to Rik's fixes (see below) has
> set a new record for my 8MB benchmarks.  In 64MB, it is behaving
> much more rationally than older kernels: still very very very fast,
> especially interactively, but with no massive cache growth and swap
> storms when doing filesystem intensive operations, and swap
> throughput when we _do_ swap is great. 
> 
> I've changed your readahead stuff to look like:
> 
> 	struct page *page_map = lookup_swap_cache(entry);
> 
> 	if (!page_map) {
>                 swapin_readahead(entry);
> 		page_map = read_swap_cache(entry);
> 	}
> 
> which is the right way to do it: we don't want to start a readahead
> on a swap hit, because that will try to extend the readahead "zone"
> one page at a time as we hit existing pages in the cache.  That ends
> up with one-page writes,

And one-page reads too. We should probably only start reading
when there are more than swap_readahead/2 pages to read. This
will give us enough time to keep up with 'streaming' applications
while at the same time avoiding single-page I/O.
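
A minimal sketch of that gating, reusing swapdev and offset from
swapin_readahead() (swap_readahead is a hypothetical tunable; the
posted patches don't define it):

        int i, todo = 0;
        for (i = 0; i < swap_readahead; i++)
                if (offset + i < swapdev->max &&
                    swapdev->swap_map[offset + i])
                        todo++;
        /* a fuller version would also skip pages that are
         * already in the swap cache */
        if (todo <= swap_readahead / 2)
                return;         /* too little to gain from the I/O */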

Kswapd should also avoid calling run_task_queue(&tq_disk)
when (on exit) the number of async pages is less than one
quarter of pager_daemon.swap_cluster. We can always sync
those pages later...
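
In code, that would be something like this at the end of kswapd's
main loop (a sketch, but built only from names that already appear
in the patches above):

        if (atomic_read(&nr_async_pages) >=
            pager_daemon.swap_cluster / 4)
                run_task_queue(&tq_disk);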

Besides, moving the disk head away from where it is now is
simply more expensive than the temporary loss of the few
kilobytes we don't free by keeping the pages on the queue :)

> I also fixed the readahead logic itself to start with the correct
> initial page (previously you were doing a "++i" in the for ()
> condition, which means we were skipping the first page in the
> readahead).

Oops, I will fix that too in my tree...

> Finally, I'll experiment with making the readahead a
> granularity-based thing, so that we read an aligned block of (say)
> 64k from swap at a time.

This would be nice, yes. Currently we page in the most
useless rubbish because we simply don't know any better...

> For now, this is looking very good indeed.

Thanks... Always good to hear something like this :)

cheers,

Rik -- the flu hits, the flu hits, the flu hits -- MORE
+-------------------------------------------------------------------+
| Linux memory management tour guide.        H.H.vanRiel@phys.uu.nl |
| Scouting Vries cubscout leader.      http://www.phys.uu.nl/~riel/ |
+-------------------------------------------------------------------+


* Re: [PATCH] VM improvements for 2.1.131
  1998-12-07 22:04       ` Stephen C. Tweedie
  1998-12-07 22:51         ` Rik van Riel
@ 1998-12-09 17:43         ` Andrea Arcangeli
  1998-12-09 21:05           ` Rik van Riel
  1998-12-10 13:50           ` Stephen C. Tweedie
  1 sibling, 2 replies; 15+ messages in thread
From: Andrea Arcangeli @ 1998-12-09 17:43 UTC
  To: Stephen C. Tweedie
  Cc: Rik van Riel, Neil Conway, Linux MM, Linux Kernel, Alan Cox

On Mon, 7 Dec 1998, Stephen C. Tweedie wrote:

>Right: 2.1.131 + Rik's fixes + my fix to Rik's fixes (see below) has set
>a new record for my 8MB benchmarks.  In 64MB, it is behaving much more

I think my state = 0 in do_try_to_free_page() contributed a lot to
the better kernel performance.

>--- mm/vmscan.c.~1~	Mon Dec  7 12:05:54 1998
>+++ mm/vmscan.c	Mon Dec  7 18:55:55 1998
>@@ -432,6 +432,8 @@
> 
> 	if (buffer_over_borrow() || pgcache_over_borrow())
> 		state = 0;
>+	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster / 2)
>+		shrink_mmap(i, gfp_mask);
> 

Doing that, we risk shrinking the cache too much even when it is not
necessary, but this part of the patch improves swapping performance a
_lot_, even if I don't know why ;)

And why not use GFP_USER in the userspace swapping code?

Index: linux/mm/swap_state.c
diff -u linux/mm/swap_state.c:1.1.3.2 linux/mm/swap_state.c:1.1.1.1.2.4
--- linux/mm/swap_state.c:1.1.3.2	Wed Dec  9 16:11:46 1998
+++ linux/mm/swap_state.c	Wed Dec  9 18:39:03 1998
@@ -261,7 +261,9 @@
 struct page * lookup_swap_cache(unsigned long entry)
 {
 	struct page *found;
+#ifdef	SWAP_CACHE_INFO
 	swap_cache_find_total++;
+#endif
 	
 	while (1) {
 		found = find_page(&swapper_inode, entry);
@@ -270,7 +272,9 @@
 		if (found->inode != &swapper_inode || !PageSwapCache(found))
 			goto out_bad;
 		if (!PageLocked(found)) {
+#ifdef	SWAP_CACHE_INFO
 			swap_cache_find_success++;
+#endif
 			return found;
 		}
 		__free_page(found);
@@ -308,7 +336,7 @@
 	if (found_page)
 		goto out;
 
-	new_page_addr = __get_free_page(GFP_KERNEL);
+	new_page_addr = __get_free_page(GFP_USER);
 	if (!new_page_addr)
 		goto out;	/* Out of memory */
 	new_page = mem_map + MAP_NR(new_page_addr);


Andrea Arcangeli


* Re: [PATCH] VM improvements for 2.1.131
  1998-12-09 17:43         ` Andrea Arcangeli
@ 1998-12-09 21:05           ` Rik van Riel
  1998-12-09 23:15             ` Andrea Arcangeli
  1998-12-10 13:50           ` Stephen C. Tweedie
  1 sibling, 1 reply; 15+ messages in thread
From: Rik van Riel @ 1998-12-09 21:05 UTC
  To: Andrea Arcangeli; +Cc: Stephen C. Tweedie, Linux MM, Linux Kernel, Alan Cox

On Wed, 9 Dec 1998, Andrea Arcangeli wrote:
> On Mon, 7 Dec 1998, Stephen C. Tweedie wrote:
> 
> >Right: 2.1.131 + Rik's fixes + my fix to Rik's fixes (see below) has set
> >a new record for my 8MB benchmarks.  In 64MB, it is behaving much more
> 
> I think my state = 0 in do_try_to_free_page() contributed a lot to
> the better kernel performance.

It does. I wonder who the culprit was that removed the state = 0
from 2.1.129 -> 2.1.130?  We've had the state = 0 since 2.1.90
when we put it in...

> >--- mm/vmscan.c.~1~	Mon Dec  7 12:05:54 1998
> >+++ mm/vmscan.c	Mon Dec  7 18:55:55 1998
> >@@ -432,6 +432,8 @@
> > 
> > 	if (buffer_over_borrow() || pgcache_over_borrow())
> > 		state = 0;
> >+	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster / 2)
> >+		shrink_mmap(i, gfp_mask);
> > 
> 
> Doing that, we risk shrinking the cache too much even when it is
> not necessary, but this part of the patch improves swapping
> performance a _lot_, even if I don't know why ;)

This is because 'swapped' data is added to the cache. It is also
because, without it, kswapd would not free memory in swap_out().
Then, because it didn't free memory, it would continue to swap
out more and more and still more with no effect (remember the
removal of page aging?).

All this is fixed by the two little lines above :)

> And why not use GFP_USER in the userspace swapping code?

>  	if (found_page)
>  		goto out;
>  
> -	new_page_addr = __get_free_page(GFP_KERNEL);
> +	new_page_addr = __get_free_page(GFP_USER);
>  	if (!new_page_addr)
>  		goto out;	/* Out of memory */
>  	new_page = mem_map + MAP_NR(new_page_addr);

Seems like a great idea... Stephen?

cheers,

Rik -- the flu hits, the flu hits, the flu hits -- MORE
+-------------------------------------------------------------------+
| Linux memory management tour guide.        H.H.vanRiel@phys.uu.nl |
| Scouting Vries cubscout leader.      http://www.phys.uu.nl/~riel/ |
+-------------------------------------------------------------------+


* Re: [PATCH] VM improvements for 2.1.131
  1998-12-09 21:05           ` Rik van Riel
@ 1998-12-09 23:15             ` Andrea Arcangeli
  1998-12-10  1:10               ` Rik van Riel
  1998-12-10 13:52               ` Stephen C. Tweedie
  0 siblings, 2 replies; 15+ messages in thread
From: Andrea Arcangeli @ 1998-12-09 23:15 UTC
  To: Rik van Riel; +Cc: Stephen C. Tweedie, Linux MM, Linux Kernel, Alan Cox

On Wed, 9 Dec 1998, Rik van Riel wrote:

>This is because 'swapped' data is added to the cache. It also
>is because without it kswapd would not free memory in swap_out().
>Then, because it didn't free memory, it would continue to swap
>out more and more and still more with no effect (remember the
>removal of page aging?).

No, no. I reverted the vmscan changes in my tree. In my tree, when swap_out
returns 1 it has really freed a page ;). I have many other differences...
I am going to run some more interesting benchmarks right now to see
whether my choices really are the best, as I think they are...

Andrea Arcangeli

PS: to see the other changes, grab arca-51.


* Re: [PATCH] VM improvements for 2.1.131
  1998-12-09 23:15             ` Andrea Arcangeli
@ 1998-12-10  1:10               ` Rik van Riel
  1998-12-10 13:52               ` Stephen C. Tweedie
  1 sibling, 0 replies; 15+ messages in thread
From: Rik van Riel @ 1998-12-10  1:10 UTC
  To: Andrea Arcangeli; +Cc: Stephen C. Tweedie, Linux MM, Linux Kernel, Alan Cox

On Thu, 10 Dec 1998, Andrea Arcangeli wrote:
> On Wed, 9 Dec 1998, Rik van Riel wrote:
> 
> >This is because 'swapped' data is added to the cache. It also
> >is because without it kswapd would not free memory in swap_out().
> >Then, because it didn't free memory, it would continue to swap
> >out more and more and still more with no effect (remember the
> >removal of page aging?).
> 
> No, no. I reverted the vmscan changes in my tree. In my tree,
> when swap_out returns 1 it has really freed a page ;).

swap_out() _never_ frees a page any more. It pushes the
pages out to swap and dereferences them so we can free
them with shrink_mmap(). This provides free page aging
and several more benefits.

You can play with the algorithms as much as you want,
however -- I'll be interested to hear about the results...

cheers,

Rik -- the flu hits, the flu hits, the flu hits -- MORE
+-------------------------------------------------------------------+
| Linux memory management tour guide.        H.H.vanRiel@phys.uu.nl |
| Scouting Vries cubscout leader.      http://www.phys.uu.nl/~riel/ |
+-------------------------------------------------------------------+


* Re: [PATCH] VM improvements for 2.1.131
  1998-12-09 17:43         ` Andrea Arcangeli
  1998-12-09 21:05           ` Rik van Riel
@ 1998-12-10 13:50           ` Stephen C. Tweedie
  1 sibling, 0 replies; 15+ messages in thread
From: Stephen C. Tweedie @ 1998-12-10 13:50 UTC
  To: Andrea Arcangeli
  Cc: Stephen C. Tweedie, Rik van Riel, Neil Conway, Linux MM,
	Linux Kernel, Alan Cox

Hi,

On Wed, 9 Dec 1998 18:43:25 +0100 (CET), Andrea Arcangeli
<andrea@e-mind.com> said:

> I think that my state = 0 in do_try_to_free_page() helped a lot to handle
> the better kernel performance.

Have you done any benchmarking on it?  The VM is now looking pretty
good, and I'd be very reluctant to keep tweaking it now without solid
evidence as to how that will affect performance: we need to draw a line
somewhere for 2.2.  I think we're now beyond the point where it makes
sense to say "here, try THIS patch to see what happens" without at least
making some attempt to test it first.

> And why not use GFP_USER in the userspace swapping code?

Good point.


> Index: linux/mm/swap_state.c
> diff -u linux/mm/swap_state.c:1.1.3.2 linux/mm/swap_state.c:1.1.1.1.2.4
> --- linux/mm/swap_state.c:1.1.3.2	Wed Dec  9 16:11:46 1998
> +++ linux/mm/swap_state.c	Wed Dec  9 18:39:03 1998
> @@ -308,7 +336,7 @@
>  	if (found_page)
>  		goto out;
> 
> -	new_page_addr = __get_free_page(GFP_KERNEL);
> +	new_page_addr = __get_free_page(GFP_USER);
>  	if (!new_page_addr)
>  		goto out;	/* Out of memory */
>  	new_page = mem_map + MAP_NR(new_page_addr);

--Stephen

* Re: [PATCH] VM improvements for 2.1.131
  1998-12-09 23:15             ` Andrea Arcangeli
  1998-12-10  1:10               ` Rik van Riel
@ 1998-12-10 13:52               ` Stephen C. Tweedie
  1 sibling, 0 replies; 15+ messages in thread
From: Stephen C. Tweedie @ 1998-12-10 13:52 UTC
  To: Andrea Arcangeli
  Cc: Rik van Riel, Stephen C. Tweedie, Linux MM, Linux Kernel,
	Alan Cox

Hi,

On Thu, 10 Dec 1998 00:15:50 +0100 (CET), Andrea Arcangeli
<andrea@e-mind.com> said:

> No, no. I reverted the vmscan changes in my tree. In my tree, when
> swap_out returns 1 it has really freed a page ;).

There are other issues with respect to the swap_out return value: in
particular, you MUST return 1 if you block (because during the block
interval the process underneath may have been killed), no matter what
state the current page is in.  Be careful with this!
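
In sketch form (the names here are illustrative, not from the
posted patches):

        /* Inside swap_out(): any path that may have slept must
         * claim progress, so the caller revalidates its state
         * instead of touching something that died meanwhile. */
        if (PageLocked(page)) {
                __wait_on_page(page);   /* we may block here...  */
                return 1;               /* ...so must return 1   */
        }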

--Stephen

Thread overview: 15+ messages (newest: 1998-12-10 13:53 UTC)
1998-12-06  0:34 [PATCH] VM improvements for 2.1.131 Rik van Riel
1998-12-06  2:10 ` Eric W. Biederman
1998-12-06  1:59   ` Rik van Riel
1998-12-07 17:08   ` Stephen C. Tweedie
1998-12-07 10:47 ` Neil Conway
1998-12-07 13:04   ` Rik van Riel
1998-12-07 18:01     ` Stephen C. Tweedie
1998-12-07 22:04       ` Stephen C. Tweedie
1998-12-07 22:51         ` Rik van Riel
1998-12-09 17:43         ` Andrea Arcangeli
1998-12-09 21:05           ` Rik van Riel
1998-12-09 23:15             ` Andrea Arcangeli
1998-12-10  1:10               ` Rik van Riel
1998-12-10 13:52               ` Stephen C. Tweedie
1998-12-10 13:50           ` Stephen C. Tweedie
