[PATCH -next v7 5/5] md: protect md

All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH -next v7 5/5] md: protect md_thread with rcu
  2023-04-13 15:10 [PATCH -next v6 " Logan Gunthorpe
@ 2023-04-14  1:32 ` Yu Kuai
  2023-04-23  2:42   ` Yu Kuai
  0 siblings, 1 reply; 11+ messages in thread
From: Yu Kuai @ 2023-04-14  1:32 UTC (permalink / raw
  To: song, logang
  Cc: linux-raid, linux-kernel, yukuai3, yukuai1, yi.zhang, yangerkun

From: Yu Kuai <yukuai3@huawei.com>

Our test reports a uaf for 'mddev->sync_thread':

T1                      T2
md_start_sync
 md_register_thread
 // mddev->sync_thread is set
			raid1d
			 md_check_recovery
			  md_reap_sync_thread
			   md_unregister_thread
			    kfree

 md_wakeup_thread
  wake_up
  ->sync_thread was freed

Root cause is that there is a small windown between register thread and
wake up thread, where the thread can be freed concurrently.

Currently, a global spinlock 'pers_lock' is borrowed to protect
'mddev->thread', this problem can be fixed likewise, however, there are
similar problems elsewhere, and use a global lock for all the cases is
not good.

This patch protect all md_thread with rcu.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/md-bitmap.c    | 10 ++++--
 drivers/md/md-cluster.c   | 17 ++++++----
 drivers/md/md-multipath.c |  4 +--
 drivers/md/md.c           | 69 ++++++++++++++++++---------------------
 drivers/md/md.h           |  8 ++---
 drivers/md/raid1.c        |  7 ++--
 drivers/md/raid1.h        |  2 +-
 drivers/md/raid10.c       | 21 +++++++-----
 drivers/md/raid10.h       |  2 +-
 drivers/md/raid5-cache.c  | 22 ++++++++-----
 drivers/md/raid5.c        | 15 +++++----
 drivers/md/raid5.h        |  2 +-
 12 files changed, 98 insertions(+), 81 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 29fd41ef55a6..ab27f66dbb1f 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1221,13 +1221,19 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
 static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
 			      bool force)
 {
-	struct md_thread *thread = mddev->thread;
+	struct md_thread *thread;
+
+	rcu_read_lock();
+	thread = rcu_dereference(mddev->thread);
 
 	if (!thread)
-		return;
+		goto out;
 
 	if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT)
 		thread->timeout = timeout;
+
+out:
+	rcu_read_unlock();
 }
 
 /*
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 10e0c5381d01..bd2e0c61b2e6 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -75,14 +75,14 @@ struct md_cluster_info {
 	sector_t suspend_hi;
 	int suspend_from; /* the slot which broadcast suspend_lo/hi */
 
-	struct md_thread *recovery_thread;
+	struct md_thread __rcu *recovery_thread;
 	unsigned long recovery_map;
 	/* communication loc resources */
 	struct dlm_lock_resource *ack_lockres;
 	struct dlm_lock_resource *message_lockres;
 	struct dlm_lock_resource *token_lockres;
 	struct dlm_lock_resource *no_new_dev_lockres;
-	struct md_thread *recv_thread;
+	struct md_thread __rcu *recv_thread;
 	struct completion newdisk_completion;
 	wait_queue_head_t wait;
 	unsigned long state;
@@ -362,8 +362,8 @@ static void __recover_slot(struct mddev *mddev, int slot)
 
 	set_bit(slot, &cinfo->recovery_map);
 	if (!cinfo->recovery_thread) {
-		cinfo->recovery_thread = md_register_thread(recover_bitmaps,
-				mddev, "recover");
+		rcu_assign_pointer(cinfo->recovery_thread,
+			md_register_thread(recover_bitmaps, mddev, "recover"));
 		if (!cinfo->recovery_thread) {
 			pr_warn("md-cluster: Could not create recovery thread\n");
 			return;
@@ -526,11 +526,15 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
 {
 	int got_lock = 0;
+	struct md_thread *thread;
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 	mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
 
 	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
-	wait_event(mddev->thread->wqueue,
+
+	/* demaon thread must exist */
+	thread = rcu_dereference_protected(mddev->thread, true);
+	wait_event(thread->wqueue,
 		   (got_lock = mddev_trylock(mddev)) ||
 		    test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
 	md_reload_sb(mddev, mddev->good_device_nr);
@@ -889,7 +893,8 @@ static int join(struct mddev *mddev, int nodes)
 	}
 	/* Initiate the communication resources */
 	ret = -ENOMEM;
-	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
+	rcu_assign_pointer(cinfo->recv_thread,
+			md_register_thread(recv_daemon, mddev, "cluster_recv"));
 	if (!cinfo->recv_thread) {
 		pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
 		goto err;
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 66edf5e72bd6..92c45be203d7 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -400,8 +400,8 @@ static int multipath_run (struct mddev *mddev)
 	if (ret)
 		goto out_free_conf;
 
-	mddev->thread = md_register_thread(multipathd, mddev,
-					   "multipath");
+	rcu_assign_pointer(mddev->thread,
+			   md_register_thread(multipathd, mddev, "multipath"));
 	if (!mddev->thread)
 		goto out_free_conf;
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e3c30500bd15..f680eccf4197 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -70,11 +70,7 @@
 #include "md-bitmap.h"
 #include "md-cluster.h"
 
-/* pers_list is a list of registered personalities protected
- * by pers_lock.
- * pers_lock does extra service to protect accesses to
- * mddev->thread when the mutex cannot be held.
- */
+/* pers_list is a list of registered personalities protected by pers_lock. */
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
@@ -92,7 +88,7 @@ static struct workqueue_struct *md_rdev_misc_wq;
 static int remove_and_add_spares(struct mddev *mddev,
 				 struct md_rdev *this);
 static void mddev_detach(struct mddev *mddev);
-static void md_wakeup_thread_directly(struct md_thread *thread);
+static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
 
 enum md_ro_state {
 	MD_RDWR,
@@ -458,8 +454,10 @@ static void md_submit_bio(struct bio *bio)
  */
 void mddev_suspend(struct mddev *mddev)
 {
-	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
-	lockdep_assert_held(&mddev->reconfig_mutex);
+	struct md_thread *thread = rcu_dereference_protected(mddev->thread,
+			lockdep_is_held(&mddev->reconfig_mutex));
+
+	WARN_ON_ONCE(thread && current == thread->tsk);
 	if (mddev->suspended++)
 		return;
 	wake_up(&mddev->sb_wait);
@@ -801,13 +799,8 @@ void mddev_unlock(struct mddev *mddev)
 	} else
 		mutex_unlock(&mddev->reconfig_mutex);
 
-	/* As we've dropped the mutex we need a spinlock to
-	 * make sure the thread doesn't disappear
-	 */
-	spin_lock(&pers_lock);
 	md_wakeup_thread(mddev->thread);
 	wake_up(&mddev->sb_wait);
-	spin_unlock(&pers_lock);
 }
 EXPORT_SYMBOL_GPL(mddev_unlock);
 
@@ -7891,19 +7884,29 @@ static int md_thread(void *arg)
 	return 0;
 }
 
-static void md_wakeup_thread_directly(struct md_thread *thread)
+static void md_wakeup_thread_directly(struct md_thread __rcu *thread)
 {
-	if (thread)
-		wake_up_process(thread->tsk);
+	struct md_thread *t;
+
+	rcu_read_lock();
+	t = rcu_dereference(thread);
+	if (t)
+		wake_up_process(t->tsk);
+	rcu_read_unlock();
 }
 
-void md_wakeup_thread(struct md_thread *thread)
+void md_wakeup_thread(struct md_thread __rcu *thread)
 {
-	if (thread) {
-		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
-		set_bit(THREAD_WAKEUP, &thread->flags);
-		wake_up(&thread->wqueue);
+	struct md_thread *t;
+
+	rcu_read_lock();
+	t = rcu_dereference(thread);
+	if (t) {
+		pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
+		set_bit(THREAD_WAKEUP, &t->flags);
+		wake_up(&t->wqueue);
 	}
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(md_wakeup_thread);
 
@@ -7933,22 +7936,15 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
 }
 EXPORT_SYMBOL(md_register_thread);
 
-void md_unregister_thread(struct md_thread **threadp)
+void md_unregister_thread(struct md_thread __rcu **threadp)
 {
-	struct md_thread *thread;
+	struct md_thread *thread = rcu_dereference_protected(*threadp, true);
 
-	/*
-	 * Locking ensures that mddev_unlock does not wake_up a
-	 * non-existent thread
-	 */
-	spin_lock(&pers_lock);
-	thread = *threadp;
-	if (!thread) {
-		spin_unlock(&pers_lock);
+	if (!thread)
 		return;
-	}
-	*threadp = NULL;
-	spin_unlock(&pers_lock);
+
+	rcu_assign_pointer(*threadp, NULL);
+	synchronize_rcu();
 
 	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
 	kthread_stop(thread->tsk);
@@ -9210,9 +9206,8 @@ static void md_start_sync(struct work_struct *ws)
 {
 	struct mddev *mddev = container_of(ws, struct mddev, del_work);
 
-	mddev->sync_thread = md_register_thread(md_do_sync,
-						mddev,
-						"resync");
+	rcu_assign_pointer(mddev->sync_thread,
+			   md_register_thread(md_do_sync, mddev, "resync"));
 	if (!mddev->sync_thread) {
 		pr_warn("%s: could not start resync thread...\n",
 			mdname(mddev));
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e148e3c83b0d..324558c3fa06 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -367,8 +367,8 @@ struct mddev {
 	int				new_chunk_sectors;
 	int				reshape_backwards;
 
-	struct md_thread		*thread;	/* management thread */
-	struct md_thread		*sync_thread;	/* doing resync or reconstruct */
+	struct md_thread __rcu		*thread;	/* management thread */
+	struct md_thread __rcu		*sync_thread;	/* doing resync or reconstruct */
 
 	/* 'last_sync_action' is initialized to "none".  It is set when a
 	 * sync operation (i.e "data-check", "requested-resync", "resync",
@@ -734,8 +734,8 @@ extern struct md_thread *md_register_thread(
 	void (*run)(struct md_thread *thread),
 	struct mddev *mddev,
 	const char *name);
-extern void md_unregister_thread(struct md_thread **threadp);
-extern void md_wakeup_thread(struct md_thread *thread);
+extern void md_unregister_thread(struct md_thread __rcu **threadp);
+extern void md_wakeup_thread(struct md_thread __rcu *thread);
 extern void md_check_recovery(struct mddev *mddev);
 extern void md_reap_sync_thread(struct mddev *mddev);
 extern int mddev_init_writes_pending(struct mddev *mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 68a9e2d9985b..2f1011ffdf09 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3084,7 +3084,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	}
 
 	err = -ENOMEM;
-	conf->thread = md_register_thread(raid1d, mddev, "raid1");
+	rcu_assign_pointer(conf->thread,
+			   md_register_thread(raid1d, mddev, "raid1"));
 	if (!conf->thread)
 		goto abort;
 
@@ -3177,8 +3178,8 @@ static int raid1_run(struct mddev *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->thread = conf->thread;
-	conf->thread = NULL;
+	rcu_assign_pointer(mddev->thread, conf->thread);
+	rcu_assign_pointer(conf->thread, NULL);
 	mddev->private = conf;
 	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
 
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index ebb6788820e7..468f189da7a0 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -130,7 +130,7 @@ struct r1conf {
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
 	 */
-	struct md_thread	*thread;
+	struct md_thread __rcu	*thread;
 
 	/* Keep track of cluster resync window to send to other
 	 * nodes.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6c66357f92f5..6590aa49598c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -980,6 +980,7 @@ static void lower_barrier(struct r10conf *conf)
 static bool stop_waiting_barrier(struct r10conf *conf)
 {
 	struct bio_list *bio_list = current->bio_list;
+	struct md_thread *thread;
 
 	/* barrier is dropped */
 	if (!conf->barrier)
@@ -995,8 +996,11 @@ static bool stop_waiting_barrier(struct r10conf *conf)
 	    (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
 		return true;
 
+	/* daemon thread must exist while handling io */
+	thread = rcu_dereference_protected(conf->mddev->thread, true);
+
 	/* move on if recovery thread is blocked by us */
-	if (conf->mddev->thread->tsk == current &&
+	if (thread->tsk == current &&
 	    test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) &&
 	    conf->nr_queued > 0)
 		return true;
@@ -4078,7 +4082,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	atomic_set(&conf->nr_pending, 0);
 
 	err = -ENOMEM;
-	conf->thread = md_register_thread(raid10d, mddev, "raid10");
+	rcu_assign_pointer(conf->thread,
+			   md_register_thread(raid10d, mddev, "raid10"));
 	if (!conf->thread)
 		goto out;
 
@@ -4141,8 +4146,8 @@ static int raid10_run(struct mddev *mddev)
 		}
 	}
 
-	mddev->thread = conf->thread;
-	conf->thread = NULL;
+	rcu_assign_pointer(mddev->thread, conf->thread);
+	rcu_assign_pointer(conf->thread, NULL);
 
 	if (mddev->queue) {
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
@@ -4273,8 +4278,8 @@ static int raid10_run(struct mddev *mddev)
 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-							"reshape");
+		rcu_assign_pointer(mddev->sync_thread,
+			md_register_thread(md_do_sync, mddev, "reshape"));
 		if (!mddev->sync_thread)
 			goto out_free_conf;
 	}
@@ -4686,8 +4691,8 @@ static int raid10_start_reshape(struct mddev *mddev)
 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 
-	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-						"reshape");
+	rcu_assign_pointer(mddev->sync_thread,
+			   md_register_thread(md_do_sync, mddev, "reshape"));
 	if (!mddev->sync_thread) {
 		ret = -EAGAIN;
 		goto abort;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 8c072ce0bc54..63e48b11b552 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -100,7 +100,7 @@ struct r10conf {
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
 	 */
-	struct md_thread	*thread;
+	struct md_thread __rcu	*thread;
 
 	/*
 	 * Keep track of cluster resync window to send to other nodes.
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 46182b955aef..040f5d6e1298 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -120,7 +120,7 @@ struct r5l_log {
 	struct bio_set bs;
 	mempool_t meta_pool;
 
-	struct md_thread *reclaim_thread;
+	struct md_thread __rcu *reclaim_thread;
 	unsigned long reclaim_target;	/* number of space that need to be
 					 * reclaimed.  if it's 0, reclaim spaces
 					 * used by io_units which are in
@@ -1576,17 +1576,18 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 
 void r5l_quiesce(struct r5l_log *log, int quiesce)
 {
-	struct mddev *mddev;
+	struct mddev *mddev = log->rdev->mddev;
+	struct md_thread *thread = rcu_dereference_protected(
+		log->reclaim_thread, lockdep_is_held(&mddev->reconfig_mutex));
 
 	if (quiesce) {
 		/* make sure r5l_write_super_and_discard_space exits */
-		mddev = log->rdev->mddev;
 		wake_up(&mddev->sb_wait);
-		kthread_park(log->reclaim_thread->tsk);
+		kthread_park(thread->tsk);
 		r5l_wake_reclaim(log, MaxSector);
 		r5l_do_reclaim(log);
 	} else
-		kthread_unpark(log->reclaim_thread->tsk);
+		kthread_unpark(thread->tsk);
 }
 
 bool r5l_log_disk_error(struct r5conf *conf)
@@ -3063,6 +3064,7 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 {
 	struct r5l_log *log;
+	struct md_thread *thread;
 	int ret;
 
 	pr_debug("md/raid:%s: using device %pg as journal\n",
@@ -3121,11 +3123,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	spin_lock_init(&log->tree_lock);
 	INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
 
-	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
-						 log->rdev->mddev, "reclaim");
-	if (!log->reclaim_thread)
+	thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev,
+				    "reclaim");
+	if (!thread)
 		goto reclaim_thread;
-	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
+
+	thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
+	rcu_assign_pointer(log->reclaim_thread, thread);
 
 	init_waitqueue_head(&log->iounit_wait);
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7b820b81d8c2..1f68bba9d0b9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7665,7 +7665,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	}
 
 	sprintf(pers_name, "raid%d", mddev->new_level);
-	conf->thread = md_register_thread(raid5d, mddev, pers_name);
+	rcu_assign_pointer(conf->thread,
+			   md_register_thread(raid5d, mddev, pers_name));
 	if (!conf->thread) {
 		pr_warn("md/raid:%s: couldn't allocate thread.\n",
 			mdname(mddev));
@@ -7889,8 +7890,8 @@ static int raid5_run(struct mddev *mddev)
 	}
 
 	conf->min_offset_diff = min_offset_diff;
-	mddev->thread = conf->thread;
-	conf->thread = NULL;
+	rcu_assign_pointer(mddev->thread, conf->thread);
+	rcu_assign_pointer(conf->thread, NULL);
 	mddev->private = conf;
 
 	for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
@@ -7989,8 +7990,8 @@ static int raid5_run(struct mddev *mddev)
 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-							"reshape");
+		rcu_assign_pointer(mddev->sync_thread,
+			md_register_thread(md_do_sync, mddev, "reshape"));
 		if (!mddev->sync_thread)
 			goto abort;
 	}
@@ -8567,8 +8568,8 @@ static int raid5_start_reshape(struct mddev *mddev)
 	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-						"reshape");
+	rcu_assign_pointer(mddev->sync_thread,
+			   md_register_thread(md_do_sync, mddev, "reshape"));
 	if (!mddev->sync_thread) {
 		mddev->recovery = 0;
 		spin_lock_irq(&conf->device_lock);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index e873938a6125..f19707189a7b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -679,7 +679,7 @@ struct r5conf {
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
 	 */
-	struct md_thread	*thread;
+	struct md_thread __rcu	*thread;
 	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
 	struct r5worker_group	*worker_groups;
 	int			group_cnt;
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH -next v7 5/5] md: protect md_thread with rcu
  2023-04-14  1:32 ` [PATCH -next v7 " Yu Kuai
@ 2023-04-23  2:42   ` Yu Kuai
  2023-04-24 22:38     ` Song Liu
  0 siblings, 1 reply; 11+ messages in thread
From: Yu Kuai @ 2023-04-23  2:42 UTC (permalink / raw
  To: Yu Kuai, song, logang
  Cc: linux-raid, linux-kernel, yi.zhang, yangerkun, yukuai (C)

Hi,

在 2023/04/14 9:32, Yu Kuai 写道:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> Our test reports a uaf for 'mddev->sync_thread':
> 
> T1                      T2
> md_start_sync
>   md_register_thread
>   // mddev->sync_thread is set
> 			raid1d
> 			 md_check_recovery
> 			  md_reap_sync_thread
> 			   md_unregister_thread
> 			    kfree
> 
>   md_wakeup_thread
>    wake_up
>    ->sync_thread was freed
> 
> Root cause is that there is a small windown between register thread and
> wake up thread, where the thread can be freed concurrently.
> 
> Currently, a global spinlock 'pers_lock' is borrowed to protect
> 'mddev->thread', this problem can be fixed likewise, however, there are
> similar problems elsewhere, and use a global lock for all the cases is
> not good.
> 
> This patch protect all md_thread with rcu.

Friendly ping... Or do I need to resend the whole patchset for v7?

Thanks,
Kuai
> 
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>   drivers/md/md-bitmap.c    | 10 ++++--
>   drivers/md/md-cluster.c   | 17 ++++++----
>   drivers/md/md-multipath.c |  4 +--
>   drivers/md/md.c           | 69 ++++++++++++++++++---------------------
>   drivers/md/md.h           |  8 ++---
>   drivers/md/raid1.c        |  7 ++--
>   drivers/md/raid1.h        |  2 +-
>   drivers/md/raid10.c       | 21 +++++++-----
>   drivers/md/raid10.h       |  2 +-
>   drivers/md/raid5-cache.c  | 22 ++++++++-----
>   drivers/md/raid5.c        | 15 +++++----
>   drivers/md/raid5.h        |  2 +-
>   12 files changed, 98 insertions(+), 81 deletions(-)
> 
> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
> index 29fd41ef55a6..ab27f66dbb1f 100644
> --- a/drivers/md/md-bitmap.c
> +++ b/drivers/md/md-bitmap.c
> @@ -1221,13 +1221,19 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
>   static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
>   			      bool force)
>   {
> -	struct md_thread *thread = mddev->thread;
> +	struct md_thread *thread;
> +
> +	rcu_read_lock();
> +	thread = rcu_dereference(mddev->thread);
>   
>   	if (!thread)
> -		return;
> +		goto out;
>   
>   	if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT)
>   		thread->timeout = timeout;
> +
> +out:
> +	rcu_read_unlock();
>   }
>   
>   /*
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index 10e0c5381d01..bd2e0c61b2e6 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -75,14 +75,14 @@ struct md_cluster_info {
>   	sector_t suspend_hi;
>   	int suspend_from; /* the slot which broadcast suspend_lo/hi */
>   
> -	struct md_thread *recovery_thread;
> +	struct md_thread __rcu *recovery_thread;
>   	unsigned long recovery_map;
>   	/* communication loc resources */
>   	struct dlm_lock_resource *ack_lockres;
>   	struct dlm_lock_resource *message_lockres;
>   	struct dlm_lock_resource *token_lockres;
>   	struct dlm_lock_resource *no_new_dev_lockres;
> -	struct md_thread *recv_thread;
> +	struct md_thread __rcu *recv_thread;
>   	struct completion newdisk_completion;
>   	wait_queue_head_t wait;
>   	unsigned long state;
> @@ -362,8 +362,8 @@ static void __recover_slot(struct mddev *mddev, int slot)
>   
>   	set_bit(slot, &cinfo->recovery_map);
>   	if (!cinfo->recovery_thread) {
> -		cinfo->recovery_thread = md_register_thread(recover_bitmaps,
> -				mddev, "recover");
> +		rcu_assign_pointer(cinfo->recovery_thread,
> +			md_register_thread(recover_bitmaps, mddev, "recover"));
>   		if (!cinfo->recovery_thread) {
>   			pr_warn("md-cluster: Could not create recovery thread\n");
>   			return;
> @@ -526,11 +526,15 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
>   static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
>   {
>   	int got_lock = 0;
> +	struct md_thread *thread;
>   	struct md_cluster_info *cinfo = mddev->cluster_info;
>   	mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
>   
>   	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
> -	wait_event(mddev->thread->wqueue,
> +
> +	/* demaon thread must exist */
> +	thread = rcu_dereference_protected(mddev->thread, true);
> +	wait_event(thread->wqueue,
>   		   (got_lock = mddev_trylock(mddev)) ||
>   		    test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
>   	md_reload_sb(mddev, mddev->good_device_nr);
> @@ -889,7 +893,8 @@ static int join(struct mddev *mddev, int nodes)
>   	}
>   	/* Initiate the communication resources */
>   	ret = -ENOMEM;
> -	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
> +	rcu_assign_pointer(cinfo->recv_thread,
> +			md_register_thread(recv_daemon, mddev, "cluster_recv"));
>   	if (!cinfo->recv_thread) {
>   		pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
>   		goto err;
> diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
> index 66edf5e72bd6..92c45be203d7 100644
> --- a/drivers/md/md-multipath.c
> +++ b/drivers/md/md-multipath.c
> @@ -400,8 +400,8 @@ static int multipath_run (struct mddev *mddev)
>   	if (ret)
>   		goto out_free_conf;
>   
> -	mddev->thread = md_register_thread(multipathd, mddev,
> -					   "multipath");
> +	rcu_assign_pointer(mddev->thread,
> +			   md_register_thread(multipathd, mddev, "multipath"));
>   	if (!mddev->thread)
>   		goto out_free_conf;
>   
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index e3c30500bd15..f680eccf4197 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -70,11 +70,7 @@
>   #include "md-bitmap.h"
>   #include "md-cluster.h"
>   
> -/* pers_list is a list of registered personalities protected
> - * by pers_lock.
> - * pers_lock does extra service to protect accesses to
> - * mddev->thread when the mutex cannot be held.
> - */
> +/* pers_list is a list of registered personalities protected by pers_lock. */
>   static LIST_HEAD(pers_list);
>   static DEFINE_SPINLOCK(pers_lock);
>   
> @@ -92,7 +88,7 @@ static struct workqueue_struct *md_rdev_misc_wq;
>   static int remove_and_add_spares(struct mddev *mddev,
>   				 struct md_rdev *this);
>   static void mddev_detach(struct mddev *mddev);
> -static void md_wakeup_thread_directly(struct md_thread *thread);
> +static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
>   
>   enum md_ro_state {
>   	MD_RDWR,
> @@ -458,8 +454,10 @@ static void md_submit_bio(struct bio *bio)
>    */
>   void mddev_suspend(struct mddev *mddev)
>   {
> -	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
> -	lockdep_assert_held(&mddev->reconfig_mutex);
> +	struct md_thread *thread = rcu_dereference_protected(mddev->thread,
> +			lockdep_is_held(&mddev->reconfig_mutex));
> +
> +	WARN_ON_ONCE(thread && current == thread->tsk);
>   	if (mddev->suspended++)
>   		return;
>   	wake_up(&mddev->sb_wait);
> @@ -801,13 +799,8 @@ void mddev_unlock(struct mddev *mddev)
>   	} else
>   		mutex_unlock(&mddev->reconfig_mutex);
>   
> -	/* As we've dropped the mutex we need a spinlock to
> -	 * make sure the thread doesn't disappear
> -	 */
> -	spin_lock(&pers_lock);
>   	md_wakeup_thread(mddev->thread);
>   	wake_up(&mddev->sb_wait);
> -	spin_unlock(&pers_lock);
>   }
>   EXPORT_SYMBOL_GPL(mddev_unlock);
>   
> @@ -7891,19 +7884,29 @@ static int md_thread(void *arg)
>   	return 0;
>   }
>   
> -static void md_wakeup_thread_directly(struct md_thread *thread)
> +static void md_wakeup_thread_directly(struct md_thread __rcu *thread)
>   {
> -	if (thread)
> -		wake_up_process(thread->tsk);
> +	struct md_thread *t;
> +
> +	rcu_read_lock();
> +	t = rcu_dereference(thread);
> +	if (t)
> +		wake_up_process(t->tsk);
> +	rcu_read_unlock();
>   }
>   
> -void md_wakeup_thread(struct md_thread *thread)
> +void md_wakeup_thread(struct md_thread __rcu *thread)
>   {
> -	if (thread) {
> -		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
> -		set_bit(THREAD_WAKEUP, &thread->flags);
> -		wake_up(&thread->wqueue);
> +	struct md_thread *t;
> +
> +	rcu_read_lock();
> +	t = rcu_dereference(thread);
> +	if (t) {
> +		pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
> +		set_bit(THREAD_WAKEUP, &t->flags);
> +		wake_up(&t->wqueue);
>   	}
> +	rcu_read_unlock();
>   }
>   EXPORT_SYMBOL(md_wakeup_thread);
>   
> @@ -7933,22 +7936,15 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
>   }
>   EXPORT_SYMBOL(md_register_thread);
>   
> -void md_unregister_thread(struct md_thread **threadp)
> +void md_unregister_thread(struct md_thread __rcu **threadp)
>   {
> -	struct md_thread *thread;
> +	struct md_thread *thread = rcu_dereference_protected(*threadp, true);
>   
> -	/*
> -	 * Locking ensures that mddev_unlock does not wake_up a
> -	 * non-existent thread
> -	 */
> -	spin_lock(&pers_lock);
> -	thread = *threadp;
> -	if (!thread) {
> -		spin_unlock(&pers_lock);
> +	if (!thread)
>   		return;
> -	}
> -	*threadp = NULL;
> -	spin_unlock(&pers_lock);
> +
> +	rcu_assign_pointer(*threadp, NULL);
> +	synchronize_rcu();
>   
>   	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
>   	kthread_stop(thread->tsk);
> @@ -9210,9 +9206,8 @@ static void md_start_sync(struct work_struct *ws)
>   {
>   	struct mddev *mddev = container_of(ws, struct mddev, del_work);
>   
> -	mddev->sync_thread = md_register_thread(md_do_sync,
> -						mddev,
> -						"resync");
> +	rcu_assign_pointer(mddev->sync_thread,
> +			   md_register_thread(md_do_sync, mddev, "resync"));
>   	if (!mddev->sync_thread) {
>   		pr_warn("%s: could not start resync thread...\n",
>   			mdname(mddev));
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index e148e3c83b0d..324558c3fa06 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -367,8 +367,8 @@ struct mddev {
>   	int				new_chunk_sectors;
>   	int				reshape_backwards;
>   
> -	struct md_thread		*thread;	/* management thread */
> -	struct md_thread		*sync_thread;	/* doing resync or reconstruct */
> +	struct md_thread __rcu		*thread;	/* management thread */
> +	struct md_thread __rcu		*sync_thread;	/* doing resync or reconstruct */
>   
>   	/* 'last_sync_action' is initialized to "none".  It is set when a
>   	 * sync operation (i.e "data-check", "requested-resync", "resync",
> @@ -734,8 +734,8 @@ extern struct md_thread *md_register_thread(
>   	void (*run)(struct md_thread *thread),
>   	struct mddev *mddev,
>   	const char *name);
> -extern void md_unregister_thread(struct md_thread **threadp);
> -extern void md_wakeup_thread(struct md_thread *thread);
> +extern void md_unregister_thread(struct md_thread __rcu **threadp);
> +extern void md_wakeup_thread(struct md_thread __rcu *thread);
>   extern void md_check_recovery(struct mddev *mddev);
>   extern void md_reap_sync_thread(struct mddev *mddev);
>   extern int mddev_init_writes_pending(struct mddev *mddev);
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 68a9e2d9985b..2f1011ffdf09 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -3084,7 +3084,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
>   	}
>   
>   	err = -ENOMEM;
> -	conf->thread = md_register_thread(raid1d, mddev, "raid1");
> +	rcu_assign_pointer(conf->thread,
> +			   md_register_thread(raid1d, mddev, "raid1"));
>   	if (!conf->thread)
>   		goto abort;
>   
> @@ -3177,8 +3178,8 @@ static int raid1_run(struct mddev *mddev)
>   	/*
>   	 * Ok, everything is just fine now
>   	 */
> -	mddev->thread = conf->thread;
> -	conf->thread = NULL;
> +	rcu_assign_pointer(mddev->thread, conf->thread);
> +	rcu_assign_pointer(conf->thread, NULL);
>   	mddev->private = conf;
>   	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
>   
> diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
> index ebb6788820e7..468f189da7a0 100644
> --- a/drivers/md/raid1.h
> +++ b/drivers/md/raid1.h
> @@ -130,7 +130,7 @@ struct r1conf {
>   	/* When taking over an array from a different personality, we store
>   	 * the new thread here until we fully activate the array.
>   	 */
> -	struct md_thread	*thread;
> +	struct md_thread __rcu	*thread;
>   
>   	/* Keep track of cluster resync window to send to other
>   	 * nodes.
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 6c66357f92f5..6590aa49598c 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -980,6 +980,7 @@ static void lower_barrier(struct r10conf *conf)
>   static bool stop_waiting_barrier(struct r10conf *conf)
>   {
>   	struct bio_list *bio_list = current->bio_list;
> +	struct md_thread *thread;
>   
>   	/* barrier is dropped */
>   	if (!conf->barrier)
> @@ -995,8 +996,11 @@ static bool stop_waiting_barrier(struct r10conf *conf)
>   	    (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
>   		return true;
>   
> +	/* daemon thread must exist while handling io */
> +	thread = rcu_dereference_protected(conf->mddev->thread, true);
> +
>   	/* move on if recovery thread is blocked by us */
> -	if (conf->mddev->thread->tsk == current &&
> +	if (thread->tsk == current &&
>   	    test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) &&
>   	    conf->nr_queued > 0)
>   		return true;
> @@ -4078,7 +4082,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
>   	atomic_set(&conf->nr_pending, 0);
>   
>   	err = -ENOMEM;
> -	conf->thread = md_register_thread(raid10d, mddev, "raid10");
> +	rcu_assign_pointer(conf->thread,
> +			   md_register_thread(raid10d, mddev, "raid10"));
>   	if (!conf->thread)
>   		goto out;
>   
> @@ -4141,8 +4146,8 @@ static int raid10_run(struct mddev *mddev)
>   		}
>   	}
>   
> -	mddev->thread = conf->thread;
> -	conf->thread = NULL;
> +	rcu_assign_pointer(mddev->thread, conf->thread);
> +	rcu_assign_pointer(conf->thread, NULL);
>   
>   	if (mddev->queue) {
>   		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
> @@ -4273,8 +4278,8 @@ static int raid10_run(struct mddev *mddev)
>   		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
>   		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
>   		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> -		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
> -							"reshape");
> +		rcu_assign_pointer(mddev->sync_thread,
> +			md_register_thread(md_do_sync, mddev, "reshape"));
>   		if (!mddev->sync_thread)
>   			goto out_free_conf;
>   	}
> @@ -4686,8 +4691,8 @@ static int raid10_start_reshape(struct mddev *mddev)
>   	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
>   	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
>   
> -	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
> -						"reshape");
> +	rcu_assign_pointer(mddev->sync_thread,
> +			   md_register_thread(md_do_sync, mddev, "reshape"));
>   	if (!mddev->sync_thread) {
>   		ret = -EAGAIN;
>   		goto abort;
> diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
> index 8c072ce0bc54..63e48b11b552 100644
> --- a/drivers/md/raid10.h
> +++ b/drivers/md/raid10.h
> @@ -100,7 +100,7 @@ struct r10conf {
>   	/* When taking over an array from a different personality, we store
>   	 * the new thread here until we fully activate the array.
>   	 */
> -	struct md_thread	*thread;
> +	struct md_thread __rcu	*thread;
>   
>   	/*
>   	 * Keep track of cluster resync window to send to other nodes.
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 46182b955aef..040f5d6e1298 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -120,7 +120,7 @@ struct r5l_log {
>   	struct bio_set bs;
>   	mempool_t meta_pool;
>   
> -	struct md_thread *reclaim_thread;
> +	struct md_thread __rcu *reclaim_thread;
>   	unsigned long reclaim_target;	/* number of space that need to be
>   					 * reclaimed.  if it's 0, reclaim spaces
>   					 * used by io_units which are in
> @@ -1576,17 +1576,18 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
>   
>   void r5l_quiesce(struct r5l_log *log, int quiesce)
>   {
> -	struct mddev *mddev;
> +	struct mddev *mddev = log->rdev->mddev;
> +	struct md_thread *thread = rcu_dereference_protected(
> +		log->reclaim_thread, lockdep_is_held(&mddev->reconfig_mutex));
>   
>   	if (quiesce) {
>   		/* make sure r5l_write_super_and_discard_space exits */
> -		mddev = log->rdev->mddev;
>   		wake_up(&mddev->sb_wait);
> -		kthread_park(log->reclaim_thread->tsk);
> +		kthread_park(thread->tsk);
>   		r5l_wake_reclaim(log, MaxSector);
>   		r5l_do_reclaim(log);
>   	} else
> -		kthread_unpark(log->reclaim_thread->tsk);
> +		kthread_unpark(thread->tsk);
>   }
>   
>   bool r5l_log_disk_error(struct r5conf *conf)
> @@ -3063,6 +3064,7 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
>   int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
>   {
>   	struct r5l_log *log;
> +	struct md_thread *thread;
>   	int ret;
>   
>   	pr_debug("md/raid:%s: using device %pg as journal\n",
> @@ -3121,11 +3123,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
>   	spin_lock_init(&log->tree_lock);
>   	INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
>   
> -	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
> -						 log->rdev->mddev, "reclaim");
> -	if (!log->reclaim_thread)
> +	thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev,
> +				    "reclaim");
> +	if (!thread)
>   		goto reclaim_thread;
> -	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
> +
> +	thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
> +	rcu_assign_pointer(log->reclaim_thread, thread);
>   
>   	init_waitqueue_head(&log->iounit_wait);
>   
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 7b820b81d8c2..1f68bba9d0b9 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -7665,7 +7665,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
>   	}
>   
>   	sprintf(pers_name, "raid%d", mddev->new_level);
> -	conf->thread = md_register_thread(raid5d, mddev, pers_name);
> +	rcu_assign_pointer(conf->thread,
> +			   md_register_thread(raid5d, mddev, pers_name));
>   	if (!conf->thread) {
>   		pr_warn("md/raid:%s: couldn't allocate thread.\n",
>   			mdname(mddev));
> @@ -7889,8 +7890,8 @@ static int raid5_run(struct mddev *mddev)
>   	}
>   
>   	conf->min_offset_diff = min_offset_diff;
> -	mddev->thread = conf->thread;
> -	conf->thread = NULL;
> +	rcu_assign_pointer(mddev->thread, conf->thread);
> +	rcu_assign_pointer(conf->thread, NULL);
>   	mddev->private = conf;
>   
>   	for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
> @@ -7989,8 +7990,8 @@ static int raid5_run(struct mddev *mddev)
>   		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
>   		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
>   		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> -		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
> -							"reshape");
> +		rcu_assign_pointer(mddev->sync_thread,
> +			md_register_thread(md_do_sync, mddev, "reshape"));
>   		if (!mddev->sync_thread)
>   			goto abort;
>   	}
> @@ -8567,8 +8568,8 @@ static int raid5_start_reshape(struct mddev *mddev)
>   	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
>   	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
>   	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> -	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
> -						"reshape");
> +	rcu_assign_pointer(mddev->sync_thread,
> +			   md_register_thread(md_do_sync, mddev, "reshape"));
>   	if (!mddev->sync_thread) {
>   		mddev->recovery = 0;
>   		spin_lock_irq(&conf->device_lock);
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index e873938a6125..f19707189a7b 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -679,7 +679,7 @@ struct r5conf {
>   	/* When taking over an array from a different personality, we store
>   	 * the new thread here until we fully activate the array.
>   	 */
> -	struct md_thread	*thread;
> +	struct md_thread __rcu	*thread;
>   	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
>   	struct r5worker_group	*worker_groups;
>   	int			group_cnt;
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH -next v7 5/5] md: protect md_thread with rcu
  2023-04-23  2:42   ` Yu Kuai
@ 2023-04-24 22:38     ` Song Liu
  0 siblings, 0 replies; 11+ messages in thread
From: Song Liu @ 2023-04-24 22:38 UTC (permalink / raw
  To: Yu Kuai; +Cc: logang, linux-raid, linux-kernel, yi.zhang, yangerkun, yukuai (C)

On Sat, Apr 22, 2023 at 7:42 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> 在 2023/04/14 9:32, Yu Kuai 写道:
> > From: Yu Kuai <yukuai3@huawei.com>
> >
> > Our test reports a uaf for 'mddev->sync_thread':
> >
> > T1                      T2
> > md_start_sync
> >   md_register_thread
> >   // mddev->sync_thread is set
> >                       raid1d
> >                        md_check_recovery
> >                         md_reap_sync_thread
> >                          md_unregister_thread
> >                           kfree
> >
> >   md_wakeup_thread
> >    wake_up
> >    ->sync_thread was freed
> >
> > Root cause is that there is a small windown between register thread and
> > wake up thread, where the thread can be freed concurrently.
> >
> > Currently, a global spinlock 'pers_lock' is borrowed to protect
> > 'mddev->thread', this problem can be fixed likewise, however, there are
> > similar problems elsewhere, and use a global lock for all the cases is
> > not good.
> >
> > This patch protect all md_thread with rcu.
>
> Friendly ping... Or do I need to resend the whole patchset for v7?

Sorry for the delay. But yes, please resend the whole patchset.

Song

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH -next v7 0/5] md: protect md_thread with rcu
@ 2023-04-25 11:52 Yu Kuai
  2023-04-25 11:52 ` [PATCH -next v7 1/5] md: factor out a helper to wake up md_thread directly Yu Kuai
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Yu Kuai @ 2023-04-25 11:52 UTC (permalink / raw
  To: logang, axboe, song
  Cc: linux-block, linux-kernel, linux-raid, yukuai3, yukuai1, yi.zhang,
	yangerkun

From: Yu Kuai <yukuai3@huawei.com>

Changes in v7:
 - add missing changes for md-cluster
 - rebase for the latest md-next branch

Changes in v6:
 - remove first patch from v5, and use rcu_asign_pointer() directly from
 caller.
 - always use rcu_read_lock/unlock to protect mddev_set_timeout().

Changes in v5:
 - use rcu_dereference_protected() instead of rcu_access_pointer() where
 rcu_read_lock/unlock is not required.
 - add patch 4,5 to handle that bitmap timeout is set multiple times.

Changes in v4:
 - remove patch 2 from v3
 - fix sparse errors and warnings from v3, in order to do that, all access
 to md_thread need to be modified, patch 2-4 is splited to avoid a huge
 patch.

Changes in v3:
 - remove patch 3 from v2
 - use rcu instead of a new lock

Changes in v2:
 - fix a compile error for md-cluster in patch 2
 - replace spin_lock/unlock with spin_lock/unlock_irq in patch 5
 - don't wake up inside the new lock in md wakeup_thread in patch 5

Yu Kuai (5):
  md: factor out a helper to wake up md_thread directly
  dm-raid: remove useless checking in raid_message()
  md/bitmap: always wake up md_thread in timeout_store
  md/bitmap: factor out a helper to set timeout
  md: protect md_thread with rcu

 block/blk-cgroup.c        |  3 ++
 drivers/md/dm-raid.c      |  4 +-
 drivers/md/md-bitmap.c    | 43 +++++++++++--------
 drivers/md/md-cluster.c   | 17 +++++---
 drivers/md/md-multipath.c |  4 +-
 drivers/md/md.c           | 88 +++++++++++++++++++++------------------
 drivers/md/md.h           |  8 ++--
 drivers/md/raid1.c        |  7 ++--
 drivers/md/raid1.h        |  2 +-
 drivers/md/raid10.c       | 20 +++++----
 drivers/md/raid10.h       |  2 +-
 drivers/md/raid5-cache.c  | 22 ++++++----
 drivers/md/raid5.c        | 15 +++----
 drivers/md/raid5.h        |  2 +-
 14 files changed, 135 insertions(+), 102 deletions(-)

-- 
2.39.2


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH -next v7 1/5] md: factor out a helper to wake up md_thread directly
  2023-04-25 11:52 [PATCH -next v7 0/5] md: protect md_thread with rcu Yu Kuai
@ 2023-04-25 11:52 ` Yu Kuai
  2023-04-25 11:52 ` [PATCH -next v7 2/5] dm-raid: remove useless checking in raid_message() Yu Kuai
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Yu Kuai @ 2023-04-25 11:52 UTC (permalink / raw
  To: logang, axboe, song
  Cc: linux-block, linux-kernel, linux-raid, yukuai3, yukuai1, yi.zhang,
	yangerkun

From: Yu Kuai <yukuai3@huawei.com>

md_wakeup_thread() can't wakeup md_thread->tsk if md_thread->run is
still in progress, and in some cases md_thread->tsk need to be woke up
directly, like md_set_readonly() and do_md_stop().

Commit 9dfbdafda3b3 ("md: unlock mddev before reap sync_thread in
action_store") introduce a new scenario where unregister sync_thread is
not protected by 'reconfig_mutex', this can cause null-ptr-deference in
theroy:

t1: md_set_readonly		t2: action_store
				md_unregister_thread
				// 'reconfig_mutex' is not held
// 'reconfig_mutex' is held by caller
if (mddev->sync_thread)
				 thread = *threadp
				 *threadp = NULL
 wake_up_process(mddev->sync_thread->tsk)
 // null-ptr-deference

This patch factor out a helper to wake up md_thread directly, so that
'sync_thread' won't be accessed multiple times from the reader side. And
perhaps this helper will be used later to fix action_store().

This patch also prepare to protect md_thread with rcu.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/md.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8e344b4b3444..b4f482d69de3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -92,6 +92,7 @@ static struct workqueue_struct *md_rdev_misc_wq;
 static int remove_and_add_spares(struct mddev *mddev,
 				 struct md_rdev *this);
 static void mddev_detach(struct mddev *mddev);
+static void md_wakeup_thread_directly(struct md_thread *thread);
 
 enum md_ro_state {
 	MD_RDWR,
@@ -6269,10 +6270,12 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 	}
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-	if (mddev->sync_thread)
-		/* Thread might be blocked waiting for metadata update
-		 * which will now never happen */
-		wake_up_process(mddev->sync_thread->tsk);
+
+	/*
+	 * Thread might be blocked waiting for metadata update which will now
+	 * never happen
+	 */
+	md_wakeup_thread_directly(mddev->sync_thread);
 
 	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
 		return -EBUSY;
@@ -6333,10 +6336,12 @@ static int do_md_stop(struct mddev *mddev, int mode,
 	}
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-	if (mddev->sync_thread)
-		/* Thread might be blocked waiting for metadata update
-		 * which will now never happen */
-		wake_up_process(mddev->sync_thread->tsk);
+
+	/*
+	 * Thread might be blocked waiting for metadata update which will now
+	 * never happen
+	 */
+	md_wakeup_thread_directly(mddev->sync_thread);
 
 	mddev_unlock(mddev);
 	wait_event(resync_wait, (mddev->sync_thread == NULL &&
@@ -7886,6 +7891,12 @@ static int md_thread(void *arg)
 	return 0;
 }
 
+static void md_wakeup_thread_directly(struct md_thread *thread)
+{
+	if (thread)
+		wake_up_process(thread->tsk);
+}
+
 void md_wakeup_thread(struct md_thread *thread)
 {
 	if (thread) {
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH -next v7 2/5] dm-raid: remove useless checking in raid_message()
  2023-04-25 11:52 [PATCH -next v7 0/5] md: protect md_thread with rcu Yu Kuai
  2023-04-25 11:52 ` [PATCH -next v7 1/5] md: factor out a helper to wake up md_thread directly Yu Kuai
@ 2023-04-25 11:52 ` Yu Kuai
  2023-04-25 11:52 ` [PATCH -next v7 3/5] md/bitmap: always wake up md_thread in timeout_store Yu Kuai
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Yu Kuai @ 2023-04-25 11:52 UTC (permalink / raw
  To: logang, axboe, song
  Cc: linux-block, linux-kernel, linux-raid, yukuai3, yukuai1, yi.zhang,
	yangerkun

From: Yu Kuai <yukuai3@huawei.com>

md_wakeup_thread() handle the case that pass in md_thread is NULL, there
is no need to check this. Prepare to protect md_thread with rcu.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/dm-raid.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index c8821fcb8299..8846bf510a35 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3750,11 +3750,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
 		 * canceling read-auto mode
 		 */
 		mddev->ro = 0;
-		if (!mddev->suspended && mddev->sync_thread)
+		if (!mddev->suspended)
 			md_wakeup_thread(mddev->sync_thread);
 	}
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	if (!mddev->suspended && mddev->thread)
+	if (!mddev->suspended)
 		md_wakeup_thread(mddev->thread);
 
 	return 0;
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH -next v7 3/5] md/bitmap: always wake up md_thread in timeout_store
  2023-04-25 11:52 [PATCH -next v7 0/5] md: protect md_thread with rcu Yu Kuai
  2023-04-25 11:52 ` [PATCH -next v7 1/5] md: factor out a helper to wake up md_thread directly Yu Kuai
  2023-04-25 11:52 ` [PATCH -next v7 2/5] dm-raid: remove useless checking in raid_message() Yu Kuai
@ 2023-04-25 11:52 ` Yu Kuai
  2023-04-25 11:52 ` [PATCH -next v7 4/5] md/bitmap: factor out a helper to set timeout Yu Kuai
  2023-04-25 11:52 ` [PATCH -next v7 5/5] md: protect md_thread with rcu Yu Kuai
  4 siblings, 0 replies; 11+ messages in thread
From: Yu Kuai @ 2023-04-25 11:52 UTC (permalink / raw
  To: logang, axboe, song
  Cc: linux-block, linux-kernel, linux-raid, yukuai3, yukuai1, yi.zhang,
	yangerkun

From: Yu Kuai <yukuai3@huawei.com>

md_wakeup_thread() can handle the case that pass in md_thread is NULL,
the only difference is that md_wakeup_thread() will be called when
current timeout is 'MAX_SCHEDULE_TIMEOUT', this should not matter
because timeout_store() is not hot path, and the daemon process is
woke up more than demand from other context already.

This patch prepare to factor out a helper to set timeout.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/md-bitmap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 920bb68156d2..06b980f71a6b 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2475,11 +2475,11 @@ timeout_store(struct mddev *mddev, const char *buf, size_t len)
 		 * the bitmap is all clean and we don't need to
 		 * adjust the timeout right now
 		 */
-		if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) {
+		if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT)
 			mddev->thread->timeout = timeout;
-			md_wakeup_thread(mddev->thread);
-		}
 	}
+
+	md_wakeup_thread(mddev->thread);
 	return len;
 }
 
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH -next v7 4/5] md/bitmap: factor out a helper to set timeout
  2023-04-25 11:52 [PATCH -next v7 0/5] md: protect md_thread with rcu Yu Kuai
                   ` (2 preceding siblings ...)
  2023-04-25 11:52 ` [PATCH -next v7 3/5] md/bitmap: always wake up md_thread in timeout_store Yu Kuai
@ 2023-04-25 11:52 ` Yu Kuai
  2023-04-25 11:52 ` [PATCH -next v7 5/5] md: protect md_thread with rcu Yu Kuai
  4 siblings, 0 replies; 11+ messages in thread
From: Yu Kuai @ 2023-04-25 11:52 UTC (permalink / raw
  To: logang, axboe, song
  Cc: linux-block, linux-kernel, linux-raid, yukuai3, yukuai1, yi.zhang,
	yangerkun

From: Yu Kuai <yukuai3@huawei.com>

Register/unregister 'mddev->thread' are both under 'reconfig_mutex',
however, some context didn't hold the mutex to access mddev->thread,
which can cause null-ptr-deference:

1) md_bitmap_daemon_work() can be called from md_check_recovery() where
'reconfig_mutex' is not held, deference 'mddev->thread' might cause
null-ptr-deference, because md_unregister_thread() reset the pointer
before stopping the thread.

2) timeout_store() access 'mddev->thread' multiple times,
null-ptr-deference can be triggered if 'mddev->thread' is reset in the
middle.

This patch factor out a helper to set timeout, the new helper always
check if 'mddev->thread' is null first, so that problem 1 can be fixed.

Now that this helper only access 'mddev->thread' once, but it's possible
that 'mddev->thread' is freed while this helper is still in progress,
hence the problem is not fixed yet. Follow up patches will fix this by
protecting md_thread with rcu.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/md-bitmap.c | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 06b980f71a6b..9e9fccc772e1 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1241,11 +1241,22 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
 					       sector_t offset, sector_t *blocks,
 					       int create);
 
+static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
+			      bool force)
+{
+	struct md_thread *thread = mddev->thread;
+
+	if (!thread)
+		return;
+
+	if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT)
+		thread->timeout = timeout;
+}
+
 /*
  * bitmap daemon -- periodically wakes up to clean bits and flush pages
  *			out to disk
  */
-
 void md_bitmap_daemon_work(struct mddev *mddev)
 {
 	struct bitmap *bitmap;
@@ -1269,7 +1280,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
 
 	bitmap->daemon_lastrun = jiffies;
 	if (bitmap->allclean) {
-		mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+		mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true);
 		goto done;
 	}
 	bitmap->allclean = 1;
@@ -1366,8 +1377,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
 
  done:
 	if (bitmap->allclean == 0)
-		mddev->thread->timeout =
-			mddev->bitmap_info.daemon_sleep;
+		mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
 	mutex_unlock(&mddev->bitmap_info.mutex);
 }
 
@@ -1820,8 +1830,7 @@ void md_bitmap_destroy(struct mddev *mddev)
 	mddev->bitmap = NULL; /* disconnect from the md device */
 	spin_unlock(&mddev->lock);
 	mutex_unlock(&mddev->bitmap_info.mutex);
-	if (mddev->thread)
-		mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+	mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true);
 
 	md_bitmap_free(bitmap);
 }
@@ -1964,7 +1973,7 @@ int md_bitmap_load(struct mddev *mddev)
 	/* Kick recovery in case any bits were set */
 	set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
 
-	mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
+	mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
 	md_wakeup_thread(mddev->thread);
 
 	md_bitmap_update_sb(bitmap);
@@ -2469,17 +2478,11 @@ timeout_store(struct mddev *mddev, const char *buf, size_t len)
 		timeout = MAX_SCHEDULE_TIMEOUT-1;
 	if (timeout < 1)
 		timeout = 1;
-	mddev->bitmap_info.daemon_sleep = timeout;
-	if (mddev->thread) {
-		/* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then
-		 * the bitmap is all clean and we don't need to
-		 * adjust the timeout right now
-		 */
-		if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT)
-			mddev->thread->timeout = timeout;
-	}
 
+	mddev->bitmap_info.daemon_sleep = timeout;
+	mddev_set_timeout(mddev, timeout, false);
 	md_wakeup_thread(mddev->thread);
+
 	return len;
 }
 
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH -next v7 5/5] md: protect md_thread with rcu
  2023-04-25 11:52 [PATCH -next v7 0/5] md: protect md_thread with rcu Yu Kuai
                   ` (3 preceding siblings ...)
  2023-04-25 11:52 ` [PATCH -next v7 4/5] md/bitmap: factor out a helper to set timeout Yu Kuai
@ 2023-04-25 11:52 ` Yu Kuai
  2023-04-26  3:20   ` Song Liu
  4 siblings, 1 reply; 11+ messages in thread
From: Yu Kuai @ 2023-04-25 11:52 UTC (permalink / raw
  To: logang, axboe, song
  Cc: linux-block, linux-kernel, linux-raid, yukuai3, yukuai1, yi.zhang,
	yangerkun

From: Yu Kuai <yukuai3@huawei.com>

Our test reports a uaf for 'mddev->sync_thread':

T1                      T2
md_start_sync
 md_register_thread
 // mddev->sync_thread is set
			raid1d
			 md_check_recovery
			  md_reap_sync_thread
			   md_unregister_thread
			    kfree

 md_wakeup_thread
  wake_up
  ->sync_thread was freed

Root cause is that there is a small windown between register thread and
wake up thread, where the thread can be freed concurrently.

Currently, a global spinlock 'pers_lock' is borrowed to protect
'mddev->thread', this problem can be fixed likewise, however, there are
similar problems elsewhere, and use a global lock for all the cases is
not good.

This patch protect all md_thread with rcu.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup.c        |  3 ++
 drivers/md/md-bitmap.c    | 10 ++++--
 drivers/md/md-cluster.c   | 17 ++++++----
 drivers/md/md-multipath.c |  4 +--
 drivers/md/md.c           | 69 ++++++++++++++++++---------------------
 drivers/md/md.h           |  8 ++---
 drivers/md/raid1.c        |  7 ++--
 drivers/md/raid1.h        |  2 +-
 drivers/md/raid10.c       | 20 +++++++-----
 drivers/md/raid10.h       |  2 +-
 drivers/md/raid5-cache.c  | 22 ++++++++-----
 drivers/md/raid5.c        | 15 +++++----
 drivers/md/raid5.h        |  2 +-
 13 files changed, 100 insertions(+), 81 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1c1ebeb51003..0ecb4cce8af2 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -527,6 +527,9 @@ static void blkg_destroy_all(struct gendisk *disk)
 	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
 		struct blkcg *blkcg = blkg->blkcg;
 
+		if (hlist_unhashed(&blkg->blkcg_node))
+			continue;
+
 		spin_lock(&blkcg->lock);
 		blkg_destroy(blkg);
 		spin_unlock(&blkcg->lock);
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 9e9fccc772e1..2a21633c5917 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1244,13 +1244,19 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
 static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
 			      bool force)
 {
-	struct md_thread *thread = mddev->thread;
+	struct md_thread *thread;
+
+	rcu_read_lock();
+	thread = rcu_dereference(mddev->thread);
 
 	if (!thread)
-		return;
+		goto out;
 
 	if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT)
 		thread->timeout = timeout;
+
+out:
+	rcu_read_unlock();
 }
 
 /*
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 10e0c5381d01..3d9fd74233df 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -75,14 +75,14 @@ struct md_cluster_info {
 	sector_t suspend_hi;
 	int suspend_from; /* the slot which broadcast suspend_lo/hi */
 
-	struct md_thread *recovery_thread;
+	struct md_thread __rcu *recovery_thread;
 	unsigned long recovery_map;
 	/* communication loc resources */
 	struct dlm_lock_resource *ack_lockres;
 	struct dlm_lock_resource *message_lockres;
 	struct dlm_lock_resource *token_lockres;
 	struct dlm_lock_resource *no_new_dev_lockres;
-	struct md_thread *recv_thread;
+	struct md_thread __rcu *recv_thread;
 	struct completion newdisk_completion;
 	wait_queue_head_t wait;
 	unsigned long state;
@@ -362,8 +362,8 @@ static void __recover_slot(struct mddev *mddev, int slot)
 
 	set_bit(slot, &cinfo->recovery_map);
 	if (!cinfo->recovery_thread) {
-		cinfo->recovery_thread = md_register_thread(recover_bitmaps,
-				mddev, "recover");
+		rcu_assign_pointer(cinfo->recovery_thread,
+			md_register_thread(recover_bitmaps, mddev, "recover"));
 		if (!cinfo->recovery_thread) {
 			pr_warn("md-cluster: Could not create recovery thread\n");
 			return;
@@ -526,11 +526,15 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
 {
 	int got_lock = 0;
+	struct md_thread *thread;
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 	mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
 
 	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
-	wait_event(mddev->thread->wqueue,
+
+	/* daemaon thread must exist */
+	thread = rcu_dereference_protected(mddev->thread, true);
+	wait_event(thread->wqueue,
 		   (got_lock = mddev_trylock(mddev)) ||
 		    test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
 	md_reload_sb(mddev, mddev->good_device_nr);
@@ -889,7 +893,8 @@ static int join(struct mddev *mddev, int nodes)
 	}
 	/* Initiate the communication resources */
 	ret = -ENOMEM;
-	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
+	rcu_assign_pointer(cinfo->recv_thread,
+			md_register_thread(recv_daemon, mddev, "cluster_recv"));
 	if (!cinfo->recv_thread) {
 		pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
 		goto err;
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 66edf5e72bd6..92c45be203d7 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -400,8 +400,8 @@ static int multipath_run (struct mddev *mddev)
 	if (ret)
 		goto out_free_conf;
 
-	mddev->thread = md_register_thread(multipathd, mddev,
-					   "multipath");
+	rcu_assign_pointer(mddev->thread,
+			   md_register_thread(multipathd, mddev, "multipath"));
 	if (!mddev->thread)
 		goto out_free_conf;
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b4f482d69de3..a907bc168652 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -70,11 +70,7 @@
 #include "md-bitmap.h"
 #include "md-cluster.h"
 
-/* pers_list is a list of registered personalities protected
- * by pers_lock.
- * pers_lock does extra service to protect accesses to
- * mddev->thread when the mutex cannot be held.
- */
+/* pers_list is a list of registered personalities protected by pers_lock. */
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
@@ -92,7 +88,7 @@ static struct workqueue_struct *md_rdev_misc_wq;
 static int remove_and_add_spares(struct mddev *mddev,
 				 struct md_rdev *this);
 static void mddev_detach(struct mddev *mddev);
-static void md_wakeup_thread_directly(struct md_thread *thread);
+static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
 
 enum md_ro_state {
 	MD_RDWR,
@@ -458,8 +454,10 @@ static void md_submit_bio(struct bio *bio)
  */
 void mddev_suspend(struct mddev *mddev)
 {
-	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
-	lockdep_assert_held(&mddev->reconfig_mutex);
+	struct md_thread *thread = rcu_dereference_protected(mddev->thread,
+			lockdep_is_held(&mddev->reconfig_mutex));
+
+	WARN_ON_ONCE(thread && current == thread->tsk);
 	if (mddev->suspended++)
 		return;
 	wake_up(&mddev->sb_wait);
@@ -801,13 +799,8 @@ void mddev_unlock(struct mddev *mddev)
 	} else
 		mutex_unlock(&mddev->reconfig_mutex);
 
-	/* As we've dropped the mutex we need a spinlock to
-	 * make sure the thread doesn't disappear
-	 */
-	spin_lock(&pers_lock);
 	md_wakeup_thread(mddev->thread);
 	wake_up(&mddev->sb_wait);
-	spin_unlock(&pers_lock);
 }
 EXPORT_SYMBOL_GPL(mddev_unlock);
 
@@ -7891,19 +7884,29 @@ static int md_thread(void *arg)
 	return 0;
 }
 
-static void md_wakeup_thread_directly(struct md_thread *thread)
+static void md_wakeup_thread_directly(struct md_thread __rcu *thread)
 {
-	if (thread)
-		wake_up_process(thread->tsk);
+	struct md_thread *t;
+
+	rcu_read_lock();
+	t = rcu_dereference(thread);
+	if (t)
+		wake_up_process(t->tsk);
+	rcu_read_unlock();
 }
 
-void md_wakeup_thread(struct md_thread *thread)
+void md_wakeup_thread(struct md_thread __rcu *thread)
 {
-	if (thread) {
-		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
-		set_bit(THREAD_WAKEUP, &thread->flags);
-		wake_up(&thread->wqueue);
+	struct md_thread *t;
+
+	rcu_read_lock();
+	t = rcu_dereference(thread);
+	if (t) {
+		pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
+		set_bit(THREAD_WAKEUP, &t->flags);
+		wake_up(&t->wqueue);
 	}
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(md_wakeup_thread);
 
@@ -7933,22 +7936,15 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
 }
 EXPORT_SYMBOL(md_register_thread);
 
-void md_unregister_thread(struct md_thread **threadp)
+void md_unregister_thread(struct md_thread __rcu **threadp)
 {
-	struct md_thread *thread;
+	struct md_thread *thread = rcu_dereference_protected(*threadp, true);
 
-	/*
-	 * Locking ensures that mddev_unlock does not wake_up a
-	 * non-existent thread
-	 */
-	spin_lock(&pers_lock);
-	thread = *threadp;
-	if (!thread) {
-		spin_unlock(&pers_lock);
+	if (!thread)
 		return;
-	}
-	*threadp = NULL;
-	spin_unlock(&pers_lock);
+
+	rcu_assign_pointer(*threadp, NULL);
+	synchronize_rcu();
 
 	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
 	kthread_stop(thread->tsk);
@@ -9213,9 +9209,8 @@ static void md_start_sync(struct work_struct *ws)
 {
 	struct mddev *mddev = container_of(ws, struct mddev, del_work);
 
-	mddev->sync_thread = md_register_thread(md_do_sync,
-						mddev,
-						"resync");
+	rcu_assign_pointer(mddev->sync_thread,
+			   md_register_thread(md_do_sync, mddev, "resync"));
 	if (!mddev->sync_thread) {
 		pr_warn("%s: could not start resync thread...\n",
 			mdname(mddev));
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fd8f260ed5f8..6fa71d90fb4d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -367,8 +367,8 @@ struct mddev {
 	int				new_chunk_sectors;
 	int				reshape_backwards;
 
-	struct md_thread		*thread;	/* management thread */
-	struct md_thread		*sync_thread;	/* doing resync or reconstruct */
+	struct md_thread __rcu		*thread;	/* management thread */
+	struct md_thread __rcu		*sync_thread;	/* doing resync or reconstruct */
 
 	/* 'last_sync_action' is initialized to "none".  It is set when a
 	 * sync operation (i.e "data-check", "requested-resync", "resync",
@@ -734,8 +734,8 @@ extern struct md_thread *md_register_thread(
 	void (*run)(struct md_thread *thread),
 	struct mddev *mddev,
 	const char *name);
-extern void md_unregister_thread(struct md_thread **threadp);
-extern void md_wakeup_thread(struct md_thread *thread);
+extern void md_unregister_thread(struct md_thread __rcu **threadp);
+extern void md_wakeup_thread(struct md_thread __rcu *thread);
 extern void md_check_recovery(struct mddev *mddev);
 extern void md_reap_sync_thread(struct mddev *mddev);
 extern int mddev_init_writes_pending(struct mddev *mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 68a9e2d9985b..2f1011ffdf09 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3084,7 +3084,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	}
 
 	err = -ENOMEM;
-	conf->thread = md_register_thread(raid1d, mddev, "raid1");
+	rcu_assign_pointer(conf->thread,
+			   md_register_thread(raid1d, mddev, "raid1"));
 	if (!conf->thread)
 		goto abort;
 
@@ -3177,8 +3178,8 @@ static int raid1_run(struct mddev *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->thread = conf->thread;
-	conf->thread = NULL;
+	rcu_assign_pointer(mddev->thread, conf->thread);
+	rcu_assign_pointer(conf->thread, NULL);
 	mddev->private = conf;
 	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
 
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index ebb6788820e7..468f189da7a0 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -130,7 +130,7 @@ struct r1conf {
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
 	 */
-	struct md_thread	*thread;
+	struct md_thread __rcu	*thread;
 
 	/* Keep track of cluster resync window to send to other
 	 * nodes.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 4fcfcb350d2b..32fb4ff0acdb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -982,6 +982,7 @@ static void lower_barrier(struct r10conf *conf)
 static bool stop_waiting_barrier(struct r10conf *conf)
 {
 	struct bio_list *bio_list = current->bio_list;
+	struct md_thread *thread;
 
 	/* barrier is dropped */
 	if (!conf->barrier)
@@ -997,12 +998,14 @@ static bool stop_waiting_barrier(struct r10conf *conf)
 	    (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
 		return true;
 
+	/* daemon thread must exist while handling io */
+	thread = rcu_dereference_protected(conf->mddev->thread, true);
 	/*
 	 * move on if io is issued from raid10d(), nr_pending is not released
 	 * from original io(see handle_read_error()). All raise barrier is
 	 * blocked until this io is done.
 	 */
-	if (conf->mddev->thread->tsk == current) {
+	if (thread->tsk == current) {
 		WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0);
 		return true;
 	}
@@ -4107,7 +4110,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	atomic_set(&conf->nr_pending, 0);
 
 	err = -ENOMEM;
-	conf->thread = md_register_thread(raid10d, mddev, "raid10");
+	rcu_assign_pointer(conf->thread,
+			   md_register_thread(raid10d, mddev, "raid10"));
 	if (!conf->thread)
 		goto out;
 
@@ -4152,8 +4156,8 @@ static int raid10_run(struct mddev *mddev)
 	if (!conf)
 		goto out;
 
-	mddev->thread = conf->thread;
-	conf->thread = NULL;
+	rcu_assign_pointer(mddev->thread, conf->thread);
+	rcu_assign_pointer(conf->thread, NULL);
 
 	if (mddev_is_clustered(conf->mddev)) {
 		int fc, fo;
@@ -4296,8 +4300,8 @@ static int raid10_run(struct mddev *mddev)
 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-							"reshape");
+		rcu_assign_pointer(mddev->sync_thread,
+			md_register_thread(md_do_sync, mddev, "reshape"));
 		if (!mddev->sync_thread)
 			goto out_free_conf;
 	}
@@ -4698,8 +4702,8 @@ static int raid10_start_reshape(struct mddev *mddev)
 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 
-	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-						"reshape");
+	rcu_assign_pointer(mddev->sync_thread,
+			   md_register_thread(md_do_sync, mddev, "reshape"));
 	if (!mddev->sync_thread) {
 		ret = -EAGAIN;
 		goto abort;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 8c072ce0bc54..63e48b11b552 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -100,7 +100,7 @@ struct r10conf {
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
 	 */
-	struct md_thread	*thread;
+	struct md_thread __rcu	*thread;
 
 	/*
 	 * Keep track of cluster resync window to send to other nodes.
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 46182b955aef..040f5d6e1298 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -120,7 +120,7 @@ struct r5l_log {
 	struct bio_set bs;
 	mempool_t meta_pool;
 
-	struct md_thread *reclaim_thread;
+	struct md_thread __rcu *reclaim_thread;
 	unsigned long reclaim_target;	/* number of space that need to be
 					 * reclaimed.  if it's 0, reclaim spaces
 					 * used by io_units which are in
@@ -1576,17 +1576,18 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 
 void r5l_quiesce(struct r5l_log *log, int quiesce)
 {
-	struct mddev *mddev;
+	struct mddev *mddev = log->rdev->mddev;
+	struct md_thread *thread = rcu_dereference_protected(
+		log->reclaim_thread, lockdep_is_held(&mddev->reconfig_mutex));
 
 	if (quiesce) {
 		/* make sure r5l_write_super_and_discard_space exits */
-		mddev = log->rdev->mddev;
 		wake_up(&mddev->sb_wait);
-		kthread_park(log->reclaim_thread->tsk);
+		kthread_park(thread->tsk);
 		r5l_wake_reclaim(log, MaxSector);
 		r5l_do_reclaim(log);
 	} else
-		kthread_unpark(log->reclaim_thread->tsk);
+		kthread_unpark(thread->tsk);
 }
 
 bool r5l_log_disk_error(struct r5conf *conf)
@@ -3063,6 +3064,7 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 {
 	struct r5l_log *log;
+	struct md_thread *thread;
 	int ret;
 
 	pr_debug("md/raid:%s: using device %pg as journal\n",
@@ -3121,11 +3123,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	spin_lock_init(&log->tree_lock);
 	INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
 
-	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
-						 log->rdev->mddev, "reclaim");
-	if (!log->reclaim_thread)
+	thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev,
+				    "reclaim");
+	if (!thread)
 		goto reclaim_thread;
-	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
+
+	thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
+	rcu_assign_pointer(log->reclaim_thread, thread);
 
 	init_waitqueue_head(&log->iounit_wait);
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 812a12e3e41a..395b1166c1e9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7665,7 +7665,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	}
 
 	sprintf(pers_name, "raid%d", mddev->new_level);
-	conf->thread = md_register_thread(raid5d, mddev, pers_name);
+	rcu_assign_pointer(conf->thread,
+			   md_register_thread(raid5d, mddev, pers_name));
 	if (!conf->thread) {
 		pr_warn("md/raid:%s: couldn't allocate thread.\n",
 			mdname(mddev));
@@ -7888,8 +7889,8 @@ static int raid5_run(struct mddev *mddev)
 	}
 
 	conf->min_offset_diff = min_offset_diff;
-	mddev->thread = conf->thread;
-	conf->thread = NULL;
+	rcu_assign_pointer(mddev->thread, conf->thread);
+	rcu_assign_pointer(conf->thread, NULL);
 	mddev->private = conf;
 
 	for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
@@ -7986,8 +7987,8 @@ static int raid5_run(struct mddev *mddev)
 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-							"reshape");
+		rcu_assign_pointer(mddev->sync_thread,
+			md_register_thread(md_do_sync, mddev, "reshape"));
 		if (!mddev->sync_thread)
 			goto abort;
 	}
@@ -8564,8 +8565,8 @@ static int raid5_start_reshape(struct mddev *mddev)
 	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-						"reshape");
+	rcu_assign_pointer(mddev->sync_thread,
+			   md_register_thread(md_do_sync, mddev, "reshape"));
 	if (!mddev->sync_thread) {
 		mddev->recovery = 0;
 		spin_lock_irq(&conf->device_lock);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index e873938a6125..f19707189a7b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -679,7 +679,7 @@ struct r5conf {
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
 	 */
-	struct md_thread	*thread;
+	struct md_thread __rcu	*thread;
 	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
 	struct r5worker_group	*worker_groups;
 	int			group_cnt;
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH -next v7 5/5] md: protect md_thread with rcu
  2023-04-25 11:52 ` [PATCH -next v7 5/5] md: protect md_thread with rcu Yu Kuai
@ 2023-04-26  3:20   ` Song Liu
  2023-04-26  3:57     ` Yu Kuai
  0 siblings, 1 reply; 11+ messages in thread
From: Song Liu @ 2023-04-26  3:20 UTC (permalink / raw
  To: Yu Kuai
  Cc: logang, axboe, linux-block, linux-kernel, linux-raid, yukuai3,
	yi.zhang, yangerkun

On Tue, Apr 25, 2023 at 4:54 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> Our test reports a uaf for 'mddev->sync_thread':
>
> T1                      T2
> md_start_sync
>  md_register_thread
>  // mddev->sync_thread is set
>                         raid1d
>                          md_check_recovery
>                           md_reap_sync_thread
>                            md_unregister_thread
>                             kfree
>
>  md_wakeup_thread
>   wake_up
>   ->sync_thread was freed
>
> Root cause is that there is a small windown between register thread and
> wake up thread, where the thread can be freed concurrently.
>
> Currently, a global spinlock 'pers_lock' is borrowed to protect
> 'mddev->thread', this problem can be fixed likewise, however, there are
> similar problems elsewhere, and use a global lock for all the cases is
> not good.
>
> This patch protect all md_thread with rcu.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  block/blk-cgroup.c        |  3 ++
>  drivers/md/md-bitmap.c    | 10 ++++--
>  drivers/md/md-cluster.c   | 17 ++++++----
>  drivers/md/md-multipath.c |  4 +--
>  drivers/md/md.c           | 69 ++++++++++++++++++---------------------
>  drivers/md/md.h           |  8 ++---
>  drivers/md/raid1.c        |  7 ++--
>  drivers/md/raid1.h        |  2 +-
>  drivers/md/raid10.c       | 20 +++++++-----
>  drivers/md/raid10.h       |  2 +-
>  drivers/md/raid5-cache.c  | 22 ++++++++-----
>  drivers/md/raid5.c        | 15 +++++----
>  drivers/md/raid5.h        |  2 +-
>  13 files changed, 100 insertions(+), 81 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 1c1ebeb51003..0ecb4cce8af2 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -527,6 +527,9 @@ static void blkg_destroy_all(struct gendisk *disk)
>         list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
>                 struct blkcg *blkcg = blkg->blkcg;
>
> +               if (hlist_unhashed(&blkg->blkcg_node))
> +                       continue;
> +

This change is not related, right?

I don't think we can rush this change in the 6.4 merge window. Let's
test it more thoroughly and ship it in the next merge window.

Thanks for working on this!

Song
[...]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH -next v7 5/5] md: protect md_thread with rcu
  2023-04-26  3:20   ` Song Liu
@ 2023-04-26  3:57     ` Yu Kuai
  0 siblings, 0 replies; 11+ messages in thread
From: Yu Kuai @ 2023-04-26  3:57 UTC (permalink / raw
  To: Song Liu, Yu Kuai
  Cc: logang, axboe, linux-block, linux-kernel, linux-raid, yi.zhang,
	yangerkun, yukuai (C)

Hi,

在 2023/04/26 11:20, Song Liu 写道:
> On Tue, Apr 25, 2023 at 4:54 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Our test reports a uaf for 'mddev->sync_thread':
>>
>> T1                      T2
>> md_start_sync
>>   md_register_thread
>>   // mddev->sync_thread is set
>>                          raid1d
>>                           md_check_recovery
>>                            md_reap_sync_thread
>>                             md_unregister_thread
>>                              kfree
>>
>>   md_wakeup_thread
>>    wake_up
>>    ->sync_thread was freed
>>
>> Root cause is that there is a small windown between register thread and
>> wake up thread, where the thread can be freed concurrently.
>>
>> Currently, a global spinlock 'pers_lock' is borrowed to protect
>> 'mddev->thread', this problem can be fixed likewise, however, there are
>> similar problems elsewhere, and use a global lock for all the cases is
>> not good.
>>
>> This patch protect all md_thread with rcu.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>>   block/blk-cgroup.c        |  3 ++
>>   drivers/md/md-bitmap.c    | 10 ++++--
>>   drivers/md/md-cluster.c   | 17 ++++++----
>>   drivers/md/md-multipath.c |  4 +--
>>   drivers/md/md.c           | 69 ++++++++++++++++++---------------------
>>   drivers/md/md.h           |  8 ++---
>>   drivers/md/raid1.c        |  7 ++--
>>   drivers/md/raid1.h        |  2 +-
>>   drivers/md/raid10.c       | 20 +++++++-----
>>   drivers/md/raid10.h       |  2 +-
>>   drivers/md/raid5-cache.c  | 22 ++++++++-----
>>   drivers/md/raid5.c        | 15 +++++----
>>   drivers/md/raid5.h        |  2 +-
>>   13 files changed, 100 insertions(+), 81 deletions(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index 1c1ebeb51003..0ecb4cce8af2 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -527,6 +527,9 @@ static void blkg_destroy_all(struct gendisk *disk)
>>          list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
>>                  struct blkcg *blkcg = blkg->blkcg;
>>
>> +               if (hlist_unhashed(&blkg->blkcg_node))
>> +                       continue;
>> +
> 
> This change is not related, right?

Yes, it's not related. Sorry that I missed another fix patch into
this...

> 
> I don't think we can rush this change in the 6.4 merge window. Let's
> test it more thoroughly and ship it in the next merge window.

Of course.

Thanks,
Kuai


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2023-04-26  3:57 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-04-25 11:52 [PATCH -next v7 0/5] md: protect md_thread with rcu Yu Kuai
2023-04-25 11:52 ` [PATCH -next v7 1/5] md: factor out a helper to wake up md_thread directly Yu Kuai
2023-04-25 11:52 ` [PATCH -next v7 2/5] dm-raid: remove useless checking in raid_message() Yu Kuai
2023-04-25 11:52 ` [PATCH -next v7 3/5] md/bitmap: always wake up md_thread in timeout_store Yu Kuai
2023-04-25 11:52 ` [PATCH -next v7 4/5] md/bitmap: factor out a helper to set timeout Yu Kuai
2023-04-25 11:52 ` [PATCH -next v7 5/5] md: protect md_thread with rcu Yu Kuai
2023-04-26  3:20   ` Song Liu
2023-04-26  3:57     ` Yu Kuai
  -- strict thread matches above, loose matches on Subject: below --
2023-04-13 15:10 [PATCH -next v6 " Logan Gunthorpe
2023-04-14  1:32 ` [PATCH -next v7 " Yu Kuai
2023-04-23  2:42   ` Yu Kuai
2023-04-24 22:38     ` Song Liu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.