From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <e@80x24.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net
X-Spam-Level: *
X-Spam-ASN: AS60068 185.152.64.0/23
X-Spam-Status: No, score=1.2 required=3.0 tests=AWL,BAYES_00,RCVD_IN_MSPIKE_BL,
	RCVD_IN_MSPIKE_ZBI,RCVD_IN_XBL,SPF_FAIL,SPF_HELO_FAIL shortcircuit=no
	autolearn=no autolearn_force=no version=3.4.0
Received: from 80x24.org (tor-exit-node.1.justaguy.be [185.152.65.180])
	by dcvr.yhbt.net (Postfix) with ESMTP id D4DEB1F42E
	for <spew@80x24.org>; Sun, 29 Apr 2018 03:50:16 +0000 (UTC)
From: Eric Wong <e@80x24.org>
To: spew@80x24.org
Subject: [PATCH 2/2] gc: rb_wait_for_single_fd performs GC if idle (Linux)
Date: Sun, 29 Apr 2018 03:50:07 +0000
Message-Id: <20180429035007.6499-3-e@80x24.org>
In-Reply-To: <20180429035007.6499-1-e@80x24.org>
References: <20180429035007.6499-1-e@80x24.org>
List-Id: <spew.80x24.org>

Before this patch, the entropy-dependent script below consistently
takes 95MB on my system.  Now, depending on the amount of entropy
available, it takes anywhere from 43MB to 75MB.  I'm using
/dev/urandom to simulate real-world network latency variations.
There is no improvement when using /dev/zero because the process
is never idle.

    require 'net/http'
    require 'digest/md5'
    Thread.abort_on_exception = true
    s = TCPServer.new('127.0.0.1', 0)
    len = 1024 * 1024 * 1024 # stream 1 gigabyte

    # server thread: stream random data to emulate a slow network peer
    th = Thread.new do
      c = s.accept
      c.readpartial(16384)
      c.write("HTTP/1.0 200 OK\r\nContent-Length: #{len}\r\n\r\n")
      IO.copy_stream('/dev/urandom', c, len)
      c.close
    end

    # client: read and checksum the response body
    addr = s.addr
    Net::HTTP.start(addr[3], addr[1]) do |http|
      http.request_get('/') do |res|
        dig = Digest::MD5.new
        res.read_body { |buf| dig.update(buf) }
        puts dig.hexdigest
      end
    end

The above script also depends on net/protocol using read_nonblock.
Ordinary IO objects need IO#nonblock=true to see any benefit,
because blocking reads never hit rb_wait_for_single_fd.
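For example, a plain socket can be switched to non-blocking mode so
its reads wait in rb_wait_for_single_fd.  A minimal sketch for
illustration only (not part of this patch; the host, port, and read
size are placeholders):

    require 'socket'
    require 'io/nonblock' # provides IO#nonblock=

    sock = TCPSocket.new('127.0.0.1', 8080)
    sock.nonblock = true # reads now go through rb_wait_for_single_fd
    sock.write("GET / HTTP/1.0\r\n\r\n")
    # IO#read retries on EAGAIN via rb_io_wait_readable, which calls
    # rb_wait_for_single_fd, giving the idle GC step a chance to run:
    buf = sock.read(16384)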
* gc.c (rb_gc_inprogress): new function
  (rb_gc_step): ditto
* internal.h: declare prototypes for new gc.c functions
* thread_pthread.c (gvl_contended_p): new function
* thread_win32.c (gvl_contended_p): ditto (dummy)
* thread.c (rb_wait_for_single_fd w/ ppoll): use new functions
  to perform GC while GVL is uncontended and GC is lazy sweeping
  or incremental marking
  [ruby-core:86265]
---
 gc.c             | 21 +++++++++++++++++++++
 internal.h       |  4 ++++
 thread.c         | 21 +++++++++++++++------
 thread_pthread.c |  6 ++++++
 thread_win32.c   |  6 ++++++
 5 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/gc.c b/gc.c
index b9c1305060..143ef2a2c5 100644
--- a/gc.c
+++ b/gc.c
@@ -6518,6 +6518,27 @@ gc_rest(rb_objspace_t *objspace)
     }
 }
 
+int
+rb_gc_inprogress(const rb_execution_context_t *ec)
+{
+    rb_objspace_t *objspace = rb_ec_vm_ptr(ec)->objspace;
+
+    /* TODO: should this also check is_incremental_marking() ? */
+    return is_lazy_sweeping(&objspace->eden_heap) ||
+           is_incremental_marking(objspace);
+}
+
+/* returns true if there is more work to do, false if not */
+int
+rb_gc_step(const rb_execution_context_t *ec)
+{
+    rb_objspace_t *objspace = rb_ec_vm_ptr(ec)->objspace;
+
+    gc_rest(objspace);
+
+    return rb_gc_inprogress(ec);
+}
+
 struct objspace_and_reason {
     rb_objspace_t *objspace;
     int reason;
diff --git a/internal.h b/internal.h
index 85370ec0d7..43043e6601 100644
--- a/internal.h
+++ b/internal.h
@@ -1290,6 +1290,10 @@ void rb_gc_writebarrier_remember(VALUE obj);
 void ruby_gc_set_params(int safe_level);
 void rb_copy_wb_protected_attribute(VALUE dest, VALUE obj);
 
+struct rb_execution_context_struct;
+int rb_gc_inprogress(const struct rb_execution_context_struct *);
+int rb_gc_step(const struct rb_execution_context_struct *);
+
 #if defined(HAVE_MALLOC_USABLE_SIZE) || defined(HAVE_MALLOC_SIZE) || defined(_WIN32)
 #define ruby_sized_xrealloc(ptr, new_size, old_size) ruby_xrealloc(ptr, new_size)
 #define ruby_sized_xrealloc2(ptr, new_count, element_size, old_count) ruby_xrealloc2(ptr, new_count, element_size)
diff --git a/thread.c b/thread.c
index 65844f5442..4725c809b8 100644
--- a/thread.c
+++ b/thread.c
@@ -3961,10 +3961,12 @@ ppoll(struct pollfd *fds, nfds_t nfds,
 int
 rb_wait_for_single_fd(int fd, int events, struct timeval *timeout)
 {
+    static const struct timespec zero;
     struct pollfd fds;
     int result = 0, lerrno;
     struct timespec ts, end, *tsp;
     rb_thread_t *th = GET_THREAD();
+    int do_gc = rb_gc_inprogress(th->ec);
 
     timeout_prepare(&tsp, &ts, &end, timeout);
     fds.fd = fd;
@@ -3973,13 +3975,20 @@ rb_wait_for_single_fd(int fd, int events, struct timeval *timeout)
     do {
         fds.revents = 0;
         lerrno = 0;
-        BLOCKING_REGION({
-            result = ppoll(&fds, 1, tsp, NULL);
-            if (result < 0) lerrno = errno;
-        }, ubf_select, th, FALSE);
-        RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
-    } while (result < 0 && retryable(errno = lerrno) &&
+        if (!do_gc || gvl_contended_p(th->vm)) {
+            BLOCKING_REGION({
+                result = ppoll(&fds, 1, tsp, NULL);
+                if (result < 0) lerrno = errno;
+            }, ubf_select, th, FALSE);
+            RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
+        }
+        else { /* no need to release GVL if nobody is waiting for it */
+            do_gc = rb_gc_step(th->ec);
+            result = ppoll(&fds, 1, &zero, NULL);
+            if (result < 0) lerrno = errno;
+        }
+    } while ((result == 0 || (result < 0 && retryable(errno = lerrno))) &&
             update_timespec(tsp, &end));
     if (result < 0) return -1;
diff --git a/thread_pthread.c b/thread_pthread.c
index 6337620e8a..fccac48a44 100644
--- a/thread_pthread.c
+++ b/thread_pthread.c
@@ -156,6 +156,12 @@ gvl_yield(rb_vm_t *vm, rb_thread_t *th)
     rb_native_mutex_unlock(&vm->gvl.lock);
 }
 
+static int
+gvl_contended_p(const rb_vm_t *vm)
+{
+    return vm->gvl.waiting > 0;
+}
+
 static void
 gvl_init(rb_vm_t *vm)
 {
diff --git a/thread_win32.c b/thread_win32.c
index ab308905cb..3e3a62dd59 100644
--- a/thread_win32.c
+++ b/thread_win32.c
@@ -113,6 +113,12 @@ gvl_yield(rb_vm_t *vm, rb_thread_t *th)
     gvl_acquire(vm, th);
 }
 
+static int
+gvl_contended_p(const rb_vm_t *vm)
+{
+    return 1; /* TODO for win32 maintainer */
+}
+
 static void
 gvl_init(rb_vm_t *vm)
 {
-- 
EW