[RFC PATCH v0.1 4/9] sched/umcg: implement core UMCG API

All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Oskolkov <posk@google.com>
To: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	linux-kernel@vger.kernel.org, linux-api@vger.kernel.org
Cc: Paul Turner <pjt@google.com>, Ben Segall <bsegall@google.com>,
	Peter Oskolkov <posk@google.com>, Peter Oskolkov <posk@posk.io>,
	Joel Fernandes <joel@joelfernandes.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Andrei Vagin <avagin@google.com>,
	Jim Newsome <jnewsome@torproject.org>
Subject: [RFC PATCH v0.1 4/9] sched/umcg: implement core UMCG API
Date: Thu, 20 May 2021 11:36:09 -0700	[thread overview]
Message-ID: <20210520183614.1227046-5-posk@google.com> (raw)
In-Reply-To: <20210520183614.1227046-1-posk@google.com>

Implement version 1 of core UMCG API (wait/wake/swap).

As has been outlined in
https://lore.kernel.org/lkml/20200722234538.166697-1-posk@posk.io/,
efficient and synchronous on-CPU context switching is key
to enabling two broad use cases: in-process M:N userspace scheduling
and fast X-process RPCs for security wrappers.

High-level design considerations/approaches used:
- wait & wake can race with each other;
- offload as much work as possible to libumcg in tools/lib/umcg,
  specifically:
  - most state changes, e.g. RUNNABLE <=> RUNNING, are done in
    the userspace (libumcg);
  - retries are offloaded to the userspace.

This implementation misses timeout handling in sys_umcg_wait
and sys_umcg_swap, which will be added in version 2.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 include/linux/sched.h |   7 +-
 kernel/sched/core.c   |   3 +
 kernel/sched/umcg.c   | 237 ++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/umcg.h   |  42 ++++++++
 4 files changed, 282 insertions(+), 7 deletions(-)
 create mode 100644 kernel/sched/umcg.h

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c7e7d50e2fdc..fc4b8775f514 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -66,6 +66,7 @@ struct sighand_struct;
 struct signal_struct;
 struct task_delay_info;
 struct task_group;
+struct umcg_task_data;
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -778,6 +779,10 @@ struct task_struct {
 	struct mm_struct		*mm;
 	struct mm_struct		*active_mm;
 
+#ifdef CONFIG_UMCG
+	struct umcg_task_data __rcu	*umcg_task_data;
+#endif
+
 	/* Per-thread vma caching: */
 	struct vmacache			vmacache;
 
@@ -1022,7 +1027,7 @@ struct task_struct {
 	u64				parent_exec_id;
 	u64				self_exec_id;
 
-	/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
+	/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy, umcg: */
 	spinlock_t			alloc_lock;
 
 	/* Protection of the PI data structures: */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 88506bc2617f..462104f13c28 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3964,6 +3964,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
 	p->migration_pending = NULL;
 #endif
+#ifdef CONFIG_UMCG
+	rcu_assign_pointer(p->umcg_task_data, NULL);
+#endif
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
diff --git a/kernel/sched/umcg.c b/kernel/sched/umcg.c
index b8195cfdb76a..2d718433c773 100644
--- a/kernel/sched/umcg.c
+++ b/kernel/sched/umcg.c
@@ -4,11 +4,23 @@
  * User Managed Concurrency Groups (UMCG).
  */
 
+#include <linux/freezer.h>
 #include <linux/syscalls.h>
 #include <linux/types.h>
 #include <linux/uaccess.h>
 #include <linux/umcg.h>
 
+#include "sched.h"
+#include "umcg.h"
+
+static int __api_version(u32 requested)
+{
+	if (requested == 1)
+		return 0;
+
+	return 1;
+}
+
 /**
  * sys_umcg_api_version - query UMCG API versions that are supported.
  * @api_version:          Requested API version.
@@ -26,7 +38,52 @@
  */
 SYSCALL_DEFINE2(umcg_api_version, u32, api_version, u32, flags)
 {
-	return -ENOSYS;
+	if (flags)
+		return -EINVAL;
+
+	return __api_version(api_version);
+}
+
+static int get_state(struct umcg_task __user *ut, u32 *state)
+{
+	return get_user(*state, (u32 __user *)ut);
+}
+
+static int put_state(struct umcg_task __user *ut, u32 state)
+{
+	return put_user(state, (u32 __user *)ut);
+}
+
+static int register_core_task(u32 api_version, struct umcg_task __user *umcg_task)
+{
+	struct umcg_task_data *utd;
+	u32 state;
+
+	if (get_state(umcg_task, &state))
+		return -EFAULT;
+
+	if (state != UMCG_TASK_NONE)
+		return -EINVAL;
+
+	utd = kzalloc(sizeof(struct umcg_task_data), GFP_KERNEL);
+	if (!utd)
+		return -EINVAL;
+
+	utd->self = current;
+	utd->umcg_task = umcg_task;
+	utd->task_type = UMCG_TT_CORE;
+	utd->api_version = api_version;
+
+	if (put_state(umcg_task, UMCG_TASK_RUNNING)) {
+		kfree(utd);
+		return -EFAULT;
+	}
+
+	task_lock(current);
+	rcu_assign_pointer(current->umcg_task_data, utd);
+	task_unlock(current);
+
+	return 0;
 }
 
 /**
@@ -54,7 +111,20 @@ SYSCALL_DEFINE2(umcg_api_version, u32, api_version, u32, flags)
 SYSCALL_DEFINE4(umcg_register_task, u32, api_version, u32, flags, u32, group_id,
 		struct umcg_task __user *, umcg_task)
 {
-	return -ENOSYS;
+	if (__api_version(api_version))
+		return -EOPNOTSUPP;
+
+	if (rcu_access_pointer(current->umcg_task_data) || !umcg_task)
+		return -EINVAL;
+
+	switch (flags) {
+	case UMCG_REGISTER_CORE_TASK:
+		if (group_id != UMCG_NOID)
+			return -EINVAL;
+		return register_core_task(api_version, umcg_task);
+	default:
+		return -EINVAL;
+	}
 }
 
 /**
@@ -67,7 +137,75 @@ SYSCALL_DEFINE4(umcg_register_task, u32, api_version, u32, flags, u32, group_id,
  */
 SYSCALL_DEFINE1(umcg_unregister_task, u32, flags)
 {
-	return -ENOSYS;
+	struct umcg_task_data *utd;
+	int ret = -EINVAL;
+
+	rcu_read_lock();
+	utd = rcu_dereference(current->umcg_task_data);
+
+	if (!utd || flags)
+		goto out;
+
+	task_lock(current);
+	rcu_assign_pointer(current->umcg_task_data, NULL);
+	task_unlock(current);
+
+	ret = 0;
+
+out:
+	rcu_read_unlock();
+	if (!ret && utd) {
+		synchronize_rcu();
+		kfree(utd);
+	}
+	return ret;
+}
+
+static int do_context_switch(struct task_struct *next)
+{
+	struct umcg_task_data *utd = rcu_access_pointer(current->umcg_task_data);
+
+	/*
+	 * It is important to set_current_state(TASK_INTERRUPTIBLE) before
+	 * waking @next, as @next may immediately try to wake current back
+	 * (e.g. current is a server, @next is a worker that immediately
+	 * blocks or waits), and this next wakeup must not be lost.
+	 */
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	WRITE_ONCE(utd->in_wait, true);
+
+	if (!try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU))
+		return -EAGAIN;
+
+	freezable_schedule();
+
+	WRITE_ONCE(utd->in_wait, false);
+
+	if (signal_pending(current))
+		return -EINTR;
+
+	return 0;
+}
+
+static int do_wait(void)
+{
+	struct umcg_task_data *utd = rcu_access_pointer(current->umcg_task_data);
+
+	if (!utd)
+		return -EINVAL;
+
+	WRITE_ONCE(utd->in_wait, true);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	freezable_schedule();
+
+	WRITE_ONCE(utd->in_wait, false);
+
+	if (signal_pending(current))
+		return -EINTR;
+
+	return 0;
 }
 
 /**
@@ -90,7 +228,23 @@ SYSCALL_DEFINE1(umcg_unregister_task, u32, flags)
 SYSCALL_DEFINE2(umcg_wait, u32, flags,
 		const struct __kernel_timespec __user *, timeout)
 {
-	return -ENOSYS;
+	struct umcg_task_data *utd;
+
+	if (flags)
+		return -EINVAL;
+	if (timeout)
+		return -EOPNOTSUPP;
+
+	rcu_read_lock();
+	utd = rcu_dereference(current->umcg_task_data);
+	if (!utd) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	rcu_read_unlock();
+
+	return do_wait();
 }
 
 /**
@@ -110,7 +264,39 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags,
  */
 SYSCALL_DEFINE2(umcg_wake, u32, flags, u32, next_tid)
 {
-	return -ENOSYS;
+	struct umcg_task_data *next_utd;
+	struct task_struct *next;
+	int ret = -EINVAL;
+
+	if (!next_tid)
+		return -EINVAL;
+	if (flags)
+		return -EINVAL;
+
+	next = find_get_task_by_vpid(next_tid);
+	if (!next)
+		return -ESRCH;
+
+	rcu_read_lock();
+	next_utd = rcu_dereference(next->umcg_task_data);
+	if (!next_utd)
+		goto out;
+
+	if (!READ_ONCE(next_utd->in_wait)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	ret = wake_up_process(next);
+	put_task_struct(next);
+	if (ret)
+		ret = 0;
+	else
+		ret = -EAGAIN;
+
+out:
+	rcu_read_unlock();
+	return ret;
 }
 
 /**
@@ -139,5 +325,44 @@ SYSCALL_DEFINE2(umcg_wake, u32, flags, u32, next_tid)
 SYSCALL_DEFINE4(umcg_swap, u32, wake_flags, u32, next_tid, u32, wait_flags,
 		const struct __kernel_timespec __user *, timeout)
 {
-	return -ENOSYS;
+	struct umcg_task_data *curr_utd;
+	struct umcg_task_data *next_utd;
+	struct task_struct *next;
+	int ret = -EINVAL;
+
+	rcu_read_lock();
+	curr_utd = rcu_dereference(current->umcg_task_data);
+
+	if (!next_tid || wake_flags || wait_flags || !curr_utd)
+		goto out;
+
+	if (timeout) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	next = find_get_task_by_vpid(next_tid);
+	if (!next) {
+		ret = -ESRCH;
+		goto out;
+	}
+
+	next_utd = rcu_dereference(next->umcg_task_data);
+	if (!next_utd) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!READ_ONCE(next_utd->in_wait)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	rcu_read_unlock();
+
+	return do_context_switch(next);
+
+out:
+	rcu_read_unlock();
+	return ret;
 }
diff --git a/kernel/sched/umcg.h b/kernel/sched/umcg.h
new file mode 100644
index 000000000000..6791d570f622
--- /dev/null
+++ b/kernel/sched/umcg.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_SCHED_UMCG_H
+#define _KERNEL_SCHED_UMCG_H
+
+#ifdef CONFIG_UMCG
+
+#include <linux/sched.h>
+#include <linux/umcg.h>
+
+enum umcg_task_type {
+	UMCG_TT_CORE	= 1,
+	UMCG_TT_SERVER	= 2,
+	UMCG_TT_WORKER	= 3
+};
+
+struct umcg_task_data {
+	/* umcg_task != NULL. Never changes. */
+	struct umcg_task __user		*umcg_task;
+
+	/* The task that owns this umcg_task_data. Never changes. */
+	struct task_struct		*self;
+
+	/* Core task, server, or worker. Never changes. */
+	enum umcg_task_type		task_type;
+
+	/*
+	 * The API version used to register this task. If this is a
+	 * worker or a server, must equal group->api_version.
+	 *
+	 * Never changes.
+	 */
+	u32 api_version;
+
+	/*
+	 * Used by wait/wake routines to handle races. Written only by current.
+	 */
+	bool				in_wait;
+};
+
+#endif  /* CONFIG_UMCG */
+#endif  /* _KERNEL_SCHED_UMCG_H */
-- 
2.31.1.818.g46aad6cb9e-goog

next prev parent reply	other threads:[~2021-05-20 18:36 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-20 18:36 [RFC PATCH v0.1 0/9] UMCG early preview/RFC patchset Peter Oskolkov
2021-05-20 18:36 ` [RFC PATCH v0.1 1/9] sched/umcg: add UMCG syscall stubs and CONFIG_UMCG Peter Oskolkov
2021-05-22 18:40   ` kernel test robot
2021-05-22 21:49   ` kernel test robot
2021-05-20 18:36 ` [RFC PATCH v0.1 2/9] sched/umcg: add uapi/linux/umcg.h and sched/umcg.c Peter Oskolkov
2021-05-20 18:36 ` [RFC PATCH v0.1 3/9] sched: add WF_CURRENT_CPU and externise ttwu Peter Oskolkov
2021-05-20 18:36 ` Peter Oskolkov [this message]
2021-05-21 19:06   ` [RFC PATCH v0.1 4/9] sched/umcg: implement core UMCG API Andrei Vagin
2021-05-21 21:31     ` Jann Horn
2021-05-21 22:03       ` Peter Oskolkov
2021-05-21 19:32   ` Andy Lutomirski
2021-05-21 22:01     ` Peter Oskolkov
2021-05-21 21:33   ` Jann Horn
2021-06-09 13:01     ` Peter Zijlstra
2021-05-20 18:36 ` [RFC PATCH v0.1 5/9] lib/umcg: implement UMCG core API for userspace Peter Oskolkov
2021-05-20 18:36 ` [RFC PATCH v0.1 6/9] selftests/umcg: add UMCG core API selftest Peter Oskolkov
2021-05-20 18:36 ` [RFC PATCH v0.1 7/9] sched/umcg: add UMCG server/worker API (early RFC) Peter Oskolkov
2021-05-21 20:17   ` Andrei Vagin
2021-05-22 18:29   ` kernel test robot
2021-05-22 19:34   ` kernel test robot
2021-05-22 20:19   ` kernel test robot
2021-05-20 18:36 ` [RFC PATCH v0.1 8/9] lib/umcg: " Peter Oskolkov
2021-05-20 18:36 ` [RFC PATCH v0.1 9/9] selftests/umcg: add UMCG server/worker API selftest Peter Oskolkov
2021-05-20 21:17 ` [RFC PATCH v0.1 0/9] UMCG early preview/RFC patchset Jonathan Corbet
2021-05-20 21:38   ` Peter Oskolkov
2021-05-21  0:15     ` Randy Dunlap
2021-05-21  8:04       ` Peter Zijlstra
2021-05-21 15:08     ` Jonathan Corbet
2021-05-21 16:03       ` Peter Oskolkov
2021-05-21 19:17         ` Jonathan Corbet
2021-05-27  0:06           ` Peter Oskolkov
2021-05-27 15:41             ` Jonathan Corbet
     [not found] ` <CAEWA0a72SvpcuN4ov=98T3uWtExPCr7BQePOgjkqD1ofWKEASw@mail.gmail.com>
2021-05-21 19:13   ` Peter Oskolkov
2021-05-21 23:08     ` Jann Horn
2021-06-09 12:54 ` Peter Zijlstra
2021-06-09 20:18   ` Peter Oskolkov
2021-06-10 18:02     ` Peter Zijlstra
2021-06-10 20:06       ` Peter Oskolkov
2021-07-07 17:45       ` Thierry Delisle
2021-07-08 21:44         ` Peter Oskolkov

find likely ancestor, descendant, or conflicting patches for this message:
dfblob:c7e7d50e2fd dfblob:88506bc2617 dfblob:b8195cfdb76
dfblob:fc4b8775f51 dfblob:462104f13c2 dfblob:2d718433c77
dfblob:6791d570f62
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210520183614.1227046-5-posk@google.com \
    --to=posk@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=avagin@google.com \
    --cc=bsegall@google.com \
    --cc=jnewsome@torproject.org \
    --cc=joel@joelfernandes.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=posk@posk.io \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.