* [PATCH RFC net-next] net: cache the __dev_alloc_name()
From: William Tu @ 2024-05-06 20:32 UTC
  To: netdev; +Cc: jiri, bodong, kuba, witu

When a system has around 1000 netdevs, adding the 1001st device becomes
very slow. The devlink command to create an SF
  $ devlink port add pci/0000:03:00.0 flavour pcisf \
    pfnum 0 sfnum 1001
takes around 5 seconds, and Linux perf and flamegraph show 19% of time
spent on __dev_alloc_name() [1].

The reason is that devlink first requests the next available "eth%d"
name. __dev_alloc_name() then scans all existing netdevs to match on
"ethN", sets bit N in an 'inuse' bitmap, and finds/returns the next
available number, in our case eth0.

Later on, based on a udev rule, the device is renamed from eth0 to
"en3f0pf0sf1001", with the altname below:
  14: en3f0pf0sf1001: <BROADCAST,MULTICAST,UP,LOWER_UP> ...
      altname enp3s0f0npf0sf1001

So eth0 is never actually used, but since we have 1k "en3f0pf0sfN"
devices + 1k altnames, __dev_alloc_name() spends lots of time going
through all existing netdevs to build the 'inuse' bitmap for the
pattern 'eth%d'. The bitmap barely has any bits set, yet it is rebuilt
from scratch every time.
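
To make the repeated cost concrete, the per-allocation work is roughly
the following (a minimal userspace C sketch for illustration only; the
toy name list and one-word bitmap are made up, while the kernel uses a
page-sized bitmap and walks every netdev and altname):

#include <stdio.h>

int main(void)
{
	/* stand-ins for the ~2k names __dev_alloc_name() walks */
	const char *names[] = { "eth0", "eth3", "en3f0pf0sf1001", "veth2" };
	unsigned long inuse = 0;	/* toy one-word 'inuse' bitmap */
	int i, n;

	for (n = 0; n < 4; n++) {
		/* the same sscanf() trick the kernel uses to match "eth%d" */
		if (sscanf(names[n], "eth%d", &i) != 1)
			continue;
		if (i >= 0 && i < 64)
			inuse |= 1UL << i;	/* mark suffix i as taken */
	}

	/* the first zero bit is the next free suffix: prints "eth1" */
	for (i = 0; i < 64 && (inuse & (1UL << i)); i++)
		;
	printf("next free name: eth%d\n", i);
	return 0;
}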

I want to see if it makes sense to save/cache the result, or if there is
any way to avoid the 'eth%d' pattern search. The RFC patch adds a
name_pat (name pattern) hlist and saves the 'inuse' bitmap. It stores
patterns, e.g. "eth%d", "veth%d", together with their bitmaps, and looks
them up before scanning all existing netdevs.

Note: the code works just well enough for a quick performance benchmark
and is still missing lots of things. Using an hlist seems to be overkill,
as I think we only have a few patterns:
$ git grep alloc_netdev drivers/ net/ | grep %d

1. https://github.com/williamtu/net-next/issues/1

Signed-off-by: William Tu <witu@nvidia.com>
---
 include/net/net_namespace.h |   1 +
 net/core/dev.c              | 103 ++++++++++++++++++++++++++++++++++--
 net/core/dev.h              |   6 +++
 3 files changed, 106 insertions(+), 4 deletions(-)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 20c34bd7a077..82c15020916b 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -109,6 +109,7 @@ struct net {
 
 	struct hlist_head 	*dev_name_head;
 	struct hlist_head	*dev_index_head;
+	struct hlist_head 	*dev_name_pat_head;
 	struct xarray		dev_by_index;
 	struct raw_notifier_head	netdev_chain;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 331848eca7d3..08c988cac7a2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -197,6 +197,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 }
 
+static inline struct hlist_head *
+dev_name_pat_hash(struct net *net, const char *pat)
+{
+	unsigned int hash = full_name_hash(net, pat, strnlen(pat, IFNAMSIZ));
+
+	return &net->dev_name_pat_head[hash_32(hash, NETDEV_HASHBITS)];
+}
+
 static inline void rps_lock_irqsave(struct softnet_data *sd,
 				    unsigned long *flags)
 {
@@ -231,6 +239,64 @@ static inline void rps_unlock_irq_enable(struct softnet_data *sd)
 		local_irq_enable();
 }
 
+static struct netdev_name_pat_node *
+netdev_name_pat_node_alloc(const char *pattern)
+{
+	struct netdev_name_pat_node *pat_node;
+	const int max_netdevices = 8*PAGE_SIZE;
+
+	pat_node = kmalloc(sizeof(*pat_node), GFP_KERNEL);
+	if (!pat_node)
+		return NULL;
+
+	pat_node->inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
+	if (!pat_node->inuse) {
+		kfree(pat_node);
+		return NULL;
+	}
+
+	INIT_HLIST_NODE(&pat_node->hlist);
+	strscpy(pat_node->name_pat, pattern, IFNAMSIZ);
+
+	return pat_node;
+}
+
+static void netdev_name_pat_node_free(struct netdev_name_pat_node *pat_node)
+{
+	bitmap_free(pat_node->inuse);
+	kfree(pat_node);
+}
+
+static void netdev_name_pat_node_add(struct net *net,
+				     struct netdev_name_pat_node *pat_node)
+{
+	hlist_add_head(&pat_node->hlist,
+		       dev_name_pat_hash(net, pat_node->name_pat));
+}
+
+static void netdev_name_pat_node_del(struct netdev_name_pat_node *pat_node)
+{
+	hlist_del_rcu(&pat_node->hlist);
+}
+
+static struct netdev_name_pat_node *
+netdev_name_pat_node_lookup(struct net *net, const char *pat)
+{
+	struct hlist_head *head = dev_name_pat_hash(net, pat);
+	struct netdev_name_pat_node *pat_node;
+
+	hlist_for_each_entry(pat_node, head, hlist) {
+		if (!strcmp(pat_node->name_pat, pat))
+			return pat_node;
+	}
+	return NULL;
+}
+
+static bool netdev_name_pat_in_use(struct net *net, const char *pat) // eth%d
+{
+	return netdev_name_pat_node_lookup(net, pat);
+}
+
 static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
 						       const char *name)
 {
@@ -1057,6 +1123,7 @@ EXPORT_SYMBOL(dev_valid_name);
 
 static int __dev_alloc_name(struct net *net, const char *name, char *res)
 {
+	struct netdev_name_pat_node *pat_node;
 	int i = 0;
 	const char *p;
 	const int max_netdevices = 8*PAGE_SIZE;
@@ -1071,10 +1138,21 @@ static int __dev_alloc_name(struct net *net, const char *name, char *res)
 	if (!p || p[1] != 'd' || strchr(p + 2, '%'))
 		return -EINVAL;
 
-	/* Use one page as a bit array of possible slots */
-	inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
-	if (!inuse)
+	pat_node = netdev_name_pat_node_lookup(net, name);
+	if (pat_node) {
+		i = find_first_zero_bit(pat_node->inuse, max_netdevices);
+		__set_bit(i, pat_node->inuse);
+		strscpy(buf, name, IFNAMSIZ);
+		snprintf(res, IFNAMSIZ, buf, i);
+
+		return i;
+	}
+
+	pat_node = netdev_name_pat_node_alloc(name);
+	if (!pat_node)
 		return -ENOMEM;
+	netdev_name_pat_node_add(net, pat_node);
+	inuse = pat_node->inuse;
 
 	for_each_netdev(net, d) {
 		struct netdev_name_node *name_node;
@@ -1082,6 +1160,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *res)
 		netdev_for_each_altname(d, name_node) {
 			if (!sscanf(name_node->name, name, &i))
 				continue;
+
 			if (i < 0 || i >= max_netdevices)
 				continue;
 
@@ -1102,13 +1181,14 @@ static int __dev_alloc_name(struct net *net, const char *name, char *res)
 	}
 
 	i = find_first_zero_bit(inuse, max_netdevices);
-	bitmap_free(inuse);
+	__set_bit(i, inuse);
 	if (i == max_netdevices)
 		return -ENFILE;
 
 	/* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
 	strscpy(buf, name, IFNAMSIZ);
 	snprintf(res, IFNAMSIZ, buf, i);
+
 	return i;
 }
 
@@ -11464,12 +11544,20 @@ static int __net_init netdev_init(struct net *net)
 	if (net->dev_index_head == NULL)
 		goto err_idx;
 
+	net->dev_name_pat_head = netdev_create_hash();
+	if (net->dev_name_pat_head == NULL)
+		goto err_name_pat;
+
+	printk("%s head %px\n", __func__, net->dev_name_pat_head);
+
 	xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
 
 	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
 
 	return 0;
 
+err_name_pat:
+	kfree(net->dev_index_head);
 err_idx:
 	kfree(net->dev_name_head);
 err_name:
@@ -11563,6 +11651,7 @@ static void __net_exit netdev_exit(struct net *net)
 {
 	kfree(net->dev_name_head);
 	kfree(net->dev_index_head);
+	kfree(net->dev_name_pat_head);
 	xa_destroy(&net->dev_by_index);
 	if (net != &init_net)
 		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
@@ -11610,6 +11699,12 @@ static void __net_exit default_device_exit_net(struct net *net)
 			BUG();
 		}
 	}
+/*
+	hlist_for_each(pat_node, head, hlist) {
+		netdev_name_pat_node_del(pat_node);
+		netdev_name_pat_node_free(pat_node);
+	}
+*/
 }
 
 static void __net_exit default_device_exit_batch(struct list_head *net_list)
diff --git a/net/core/dev.h b/net/core/dev.h
index 2bcaf8eee50c..62133584dc14 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -59,6 +59,12 @@ struct netdev_name_node {
 	struct rcu_head rcu;
 };
 
+struct netdev_name_pat_node {
+	struct hlist_node hlist;
+	char name_pat[IFNAMSIZ];
+	unsigned long *inuse;
+};
+
 int netdev_get_name(struct net *net, char *name, int ifindex);
 int dev_change_name(struct net_device *dev, const char *newname);
 
-- 
2.34.1



* Re: [PATCH RFC net-next] net: cache the __dev_alloc_name()
From: Paolo Abeni @ 2024-05-07  7:26 UTC
  To: William Tu, netdev; +Cc: jiri, bodong, kuba

On Mon, 2024-05-06 at 20:32 +0000, William Tu wrote:
> When a system has around 1000 netdevs, adding the 1001st device becomes
> very slow. The devlink command to create an SF
>   $ devlink port add pci/0000:03:00.0 flavour pcisf \
>     pfnum 0 sfnum 1001
> takes around 5 seconds, and Linux perf and flamegraph show 19% of time
> spent on __dev_alloc_name() [1].
> 
> The reason is that devlink first requests the next available "eth%d"
> name. __dev_alloc_name() then scans all existing netdevs to match on
> "ethN", sets bit N in an 'inuse' bitmap, and finds/returns the next
> available number, in our case eth0.
> 
> Later on, based on a udev rule, the device is renamed from eth0 to
> "en3f0pf0sf1001", with the altname below:
>   14: en3f0pf0sf1001: <BROADCAST,MULTICAST,UP,LOWER_UP> ...
>       altname enp3s0f0npf0sf1001
> 
> So eth0 is never actually used, but since we have 1k "en3f0pf0sfN"
> devices + 1k altnames, __dev_alloc_name() spends lots of time going
> through all existing netdevs to build the 'inuse' bitmap for the
> pattern 'eth%d'. The bitmap barely has any bits set, yet it is rebuilt
> from scratch every time.
> 
> I want to see if it makes sense to save/cache the result, or if there is
> any way to avoid the 'eth%d' pattern search. The RFC patch adds a
> name_pat (name pattern) hlist and saves the 'inuse' bitmap. It stores
> patterns, e.g. "eth%d", "veth%d", together with their bitmaps, and looks
> them up before scanning all existing netdevs.

An alternative heuristic that should be cheap and possibly reasonable
could be to optimistically check for <name>0..<name><very small int>
availability, possibly restricting such an attempt to scenarios where
the total number of hashed netdevice names is somewhat high.

WDYT?

Cheers,

Paolo



* Re: [PATCH RFC net-next] net: cache the __dev_alloc_name()
From: William Tu @ 2024-05-07 18:55 UTC
  To: Paolo Abeni, netdev; +Cc: jiri, bodong, kuba



On 5/7/24 12:26 AM, Paolo Abeni wrote:
> On Mon, 2024-05-06 at 20:32 +0000, William Tu wrote:
>> When a system has around 1000 netdevs, adding the 1001st device becomes
>> very slow. The devlink command to create an SF
>>    $ devlink port add pci/0000:03:00.0 flavour pcisf \
>>      pfnum 0 sfnum 1001
>> takes around 5 seconds, and Linux perf and flamegraph show 19% of time
>> spent on __dev_alloc_name() [1].
>>
>> The reason is that devlink first requests the next available "eth%d"
>> name. __dev_alloc_name() then scans all existing netdevs to match on
>> "ethN", sets bit N in an 'inuse' bitmap, and finds/returns the next
>> available number, in our case eth0.
>>
>> Later on, based on a udev rule, the device is renamed from eth0 to
>> "en3f0pf0sf1001", with the altname below:
>>    14: en3f0pf0sf1001: <BROADCAST,MULTICAST,UP,LOWER_UP> ...
>>        altname enp3s0f0npf0sf1001
>>
>> So eth0 is never actually used, but since we have 1k "en3f0pf0sfN"
>> devices + 1k altnames, __dev_alloc_name() spends lots of time going
>> through all existing netdevs to build the 'inuse' bitmap for the
>> pattern 'eth%d'. The bitmap barely has any bits set, yet it is rebuilt
>> from scratch every time.
>>
>> I want to see if it makes sense to save/cache the result, or if there is
>> any way to avoid the 'eth%d' pattern search. The RFC patch adds a
>> name_pat (name pattern) hlist and saves the 'inuse' bitmap. It stores
>> patterns, e.g. "eth%d", "veth%d", together with their bitmaps, and looks
>> them up before scanning all existing netdevs.
> An alternative heuristic that should be cheap and possibly reasonable
> could be to optimistically check for <name>0..<name><very small int>
> availability, possibly restricting such an attempt to scenarios where
> the total number of hashed netdevice names is somewhat high.
>
> WDYT?
>
> Cheers,
>
> Paolo
Hi Paolo,

Thanks for your suggestion!
I'm not fully clear on that idea.

The current code has to do a full scan of all netdevs in a list, and the
name list is not sorted / ordered. So to learn, e.g., that eth0 .. eth10
exist, we still need to do a full scan, find every netdev with prefix
"eth", and get the next available bit, 11 (10+1).
And in another use case, where users don't install a udev rule to
rename, the system can actually create eth998, eth999, eth1000....

What if we create a prefix map (maybe using an xarray)?
idx   entry=(prefix, bitmap)
--------------------
0      eth, 1111000000...
1      veth, 1000000...
2      can, 11100000...
3      firewire, 00000...

But then we need to unset the bit when a device is removed.
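
A rough, completely untested sketch of that prefix-map idea (everything
below is hypothetical and not part of the posted patch; the entry
struct, the helpers, and the xarray are all made-up names):

#include <linux/bitmap.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/xarray.h>

struct name_prefix_entry {
	char prefix[IFNAMSIZ];
	unsigned long *inuse;	/* bitmap of taken numeric suffixes */
};

static DEFINE_XARRAY_ALLOC(name_prefix_map);

static struct name_prefix_entry *prefix_lookup(const char *prefix)
{
	struct name_prefix_entry *e;
	unsigned long idx;

	/* few patterns exist, so a linear walk of the map is fine */
	xa_for_each(&name_prefix_map, idx, e)
		if (!strcmp(e->prefix, prefix))
			return e;
	return NULL;
}

/* mark suffix n of e.g. "eth" as used, creating the entry on first use */
static int prefix_map_set(const char *prefix, int n)
{
	struct name_prefix_entry *e = prefix_lookup(prefix);
	u32 id;

	if (!e) {
		e = kzalloc(sizeof(*e), GFP_KERNEL);
		if (!e)
			return -ENOMEM;
		strscpy(e->prefix, prefix, IFNAMSIZ);
		e->inuse = bitmap_zalloc(8 * PAGE_SIZE, GFP_KERNEL);
		if (!e->inuse || xa_alloc(&name_prefix_map, &id, e,
					  xa_limit_32b, GFP_KERNEL)) {
			bitmap_free(e->inuse);
			kfree(e);
			return -ENOMEM;
		}
	}
	__set_bit(n, e->inuse);
	return 0;
}

/* the new part compared to the RFC: on removal/rename, give the bit back */
static void prefix_map_clear(const char *prefix, int n)
{
	struct name_prefix_entry *e = prefix_lookup(prefix);

	if (e)
		__clear_bit(n, e->inuse);
}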
William




* Re: [PATCH RFC net-next] net: cache the __dev_alloc_name()
From: Stephen Hemminger @ 2024-05-08  4:24 UTC
  To: William Tu; +Cc: netdev, jiri, bodong, kuba

On Mon, 6 May 2024 20:32:07 +0000
William Tu <witu@nvidia.com> wrote:

> When a system has around 1000 netdevs, adding the 1001st device becomes
> very slow. The devlink command to create an SF
>   $ devlink port add pci/0000:03:00.0 flavour pcisf \
>     pfnum 0 sfnum 1001
> takes around 5 seconds, and Linux perf and flamegraph show 19% of time
> spent on __dev_alloc_name() [1].
> 
> The reason is that devlink first requests the next available "eth%d"
> name. __dev_alloc_name() then scans all existing netdevs to match on
> "ethN", sets bit N in an 'inuse' bitmap, and finds/returns the next
> available number, in our case eth0.
> 
> Later on, based on a udev rule, the device is renamed from eth0 to
> "en3f0pf0sf1001", with the altname below:
>   14: en3f0pf0sf1001: <BROADCAST,MULTICAST,UP,LOWER_UP> ...
>       altname enp3s0f0npf0sf1001
> 
> So eth0 is never actually used, but since we have 1k "en3f0pf0sfN"
> devices + 1k altnames, __dev_alloc_name() spends lots of time going
> through all existing netdevs to build the 'inuse' bitmap for the
> pattern 'eth%d'. The bitmap barely has any bits set, yet it is rebuilt
> from scratch every time.
> 
> I want to see if it makes sense to save/cache the result, or if there is
> any way to avoid the 'eth%d' pattern search. The RFC patch adds a
> name_pat (name pattern) hlist and saves the 'inuse' bitmap. It stores
> patterns, e.g. "eth%d", "veth%d", together with their bitmaps, and looks
> them up before scanning all existing netdevs.
> 
> Note: the code works just well enough for a quick performance benchmark
> and is still missing lots of things. Using an hlist seems to be overkill,
> as I think we only have a few patterns:
> $ git grep alloc_netdev drivers/ net/ | grep %d
> 
> 1. https://github.com/williamtu/net-next/issues/1
> 
> Signed-off-by: William Tu <witu@nvidia.com>

The actual patch is a bit of a mess, with commented-out code, leftover
printks, and random whitespace changes. Please fix that.

The issue is that the bitmap gets to be large and adds bloat to embedded devices.

Perhaps you could force devlink to use the same device name each time
(eth0) if it is going to be renamed anyway.
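
For what it's worth, the reason a fixed name avoids the cost: only a
'%' pattern takes the full scan. A hedged sketch, where request_name()
is a hypothetical helper loosely modeled on the dev_get_valid_name()
split, not an existing function:

#include <linux/netdevice.h>
#include <linux/string.h>

static int request_name(struct net *net, struct net_device *dev,
			const char *want)
{
	/* "eth%d" and friends take the O(all netdevs) bitmap scan */
	if (strchr(want, '%'))
		return dev_alloc_name(dev, want);

	/* a literal "eth0" is just a lookup in the name hash */
	if (netdev_name_in_use(net, want))
		return -EEXIST;

	strscpy(dev->name, want, IFNAMSIZ);
	return 0;
}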


* Re: [PATCH RFC net-next] net: cache the __dev_alloc_name()
From: William Tu @ 2024-05-09  3:27 UTC
  To: Stephen Hemminger; +Cc: netdev, jiri, bodong, kuba



On 5/7/24 9:24 PM, Stephen Hemminger wrote:
> On Mon, 6 May 2024 20:32:07 +0000
> William Tu <witu@nvidia.com> wrote:
>
>> When a system has around 1000 netdevs, adding the 1001st device becomes
>> very slow. The devlink command to create an SF
>>    $ devlink port add pci/0000:03:00.0 flavour pcisf \
>>      pfnum 0 sfnum 1001
>> takes around 5 seconds, and Linux perf and flamegraph show 19% of time
>> spent on __dev_alloc_name() [1].
>>
>> The reason is that devlink first requests the next available "eth%d"
>> name. __dev_alloc_name() then scans all existing netdevs to match on
>> "ethN", sets bit N in an 'inuse' bitmap, and finds/returns the next
>> available number, in our case eth0.
>>
>> Later on, based on a udev rule, the device is renamed from eth0 to
>> "en3f0pf0sf1001", with the altname below:
>>    14: en3f0pf0sf1001: <BROADCAST,MULTICAST,UP,LOWER_UP> ...
>>        altname enp3s0f0npf0sf1001
>>
>> So eth0 is never actually used, but since we have 1k "en3f0pf0sfN"
>> devices + 1k altnames, __dev_alloc_name() spends lots of time going
>> through all existing netdevs to build the 'inuse' bitmap for the
>> pattern 'eth%d'. The bitmap barely has any bits set, yet it is rebuilt
>> from scratch every time.
>>
>> I want to see if it makes sense to save/cache the result, or if there is
>> any way to avoid the 'eth%d' pattern search. The RFC patch adds a
>> name_pat (name pattern) hlist and saves the 'inuse' bitmap. It stores
>> patterns, e.g. "eth%d", "veth%d", together with their bitmaps, and looks
>> them up before scanning all existing netdevs.
>>
>> Note: the code works just well enough for a quick performance benchmark
>> and is still missing lots of things. Using an hlist seems to be overkill,
>> as I think we only have a few patterns:
>> $ git grep alloc_netdev drivers/ net/ | grep %d
>>
>> 1. https://github.com/williamtu/net-next/issues/1
>>
>> Signed-off-by: William Tu <witu@nvidia.com>
Hi Stephen,
Thanks for your feedback.
> The actual patch is a bit of a mess, with commented-out code, leftover
> printks, and random whitespace changes. Please fix that.
Yes, working on it.
>
> The issue is that the bitmap gets to be large and adds bloat to embedded devices.
The bitmap size is fixed (8*PAGE_SIZE bits, i.e. one page of memory), and
set_bit() is also fast. It's just that for each new device we always
re-scan all existing netdevs, set the bitmap, and then free it.
>
> Perhaps you could force devlink to use the same device name each time
> (eth0) if it is going to be renamed anyway.
It is working like that now (with udev) in my slow environment. The
device always gets eth0 (because the bitmap is all 0s), and udev renames
it to enp0xxx. The next allocation rescans, and since eth0 is still
available, __dev_alloc_name() still returns eth0, and udev renames it
again. Each subsequent device creation follows the same path, and the
time to rescan gets longer and longer.

Regards,
William



* Re: [PATCH RFC net-next] net: cache the __dev_alloc_name()
From: kernel test robot @ 2024-05-09  6:11 UTC
  To: William Tu; +Cc: oe-kbuild-all

Hi William,

[This is a private test report for your RFC patch.]
kernel test robot noticed the following build warnings:

[auto build test WARNING on net/main]
[also build test WARNING on linus/master v6.9-rc7]
[cannot apply to net-next/main horms-ipvs/master next-20240508]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/William-Tu/net-cache-the-__dev_alloc_name/20240507-043305
base:   net/main
patch link:    https://lore.kernel.org/r/20240506203207.1307971-1-witu%40nvidia.com
patch subject: [PATCH RFC net-next] net: cache the __dev_alloc_name()
config: alpha-defconfig (https://download.01.org/0day-ci/archive/20240509/202405091354.2Bdd0B0X-lkp@intel.com/config)
compiler: alpha-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240509/202405091354.2Bdd0B0X-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202405091354.2Bdd0B0X-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> net/core/dev.c:295:13: warning: 'netdev_name_pat_in_use' defined but not used [-Wunused-function]
     295 | static bool netdev_name_pat_in_use(struct net *net, const char *pat) // eth%d
         |             ^~~~~~~~~~~~~~~~~~~~~~
>> net/core/dev.c:277:13: warning: 'netdev_name_pat_node_del' defined but not used [-Wunused-function]
     277 | static void netdev_name_pat_node_del(struct netdev_name_pat_node *pat_node)
         |             ^~~~~~~~~~~~~~~~~~~~~~~~
>> net/core/dev.c:264:13: warning: 'netdev_name_pat_node_free' defined but not used [-Wunused-function]
     264 | static void netdev_name_pat_node_free(struct netdev_name_pat_node *pat_node)
         |             ^~~~~~~~~~~~~~~~~~~~~~~~~


vim +/netdev_name_pat_in_use +295 net/core/dev.c

   263	
 > 264	static void netdev_name_pat_node_free(struct netdev_name_pat_node *pat_node)
   265	{
   266		bitmap_free(pat_node->inuse);
   267		kfree(pat_node);
   268	}
   269	
   270	static void netdev_name_pat_node_add(struct net *net,
   271					     struct netdev_name_pat_node *pat_node)
   272	{
   273		hlist_add_head(&pat_node->hlist,
   274			       dev_name_pat_hash(net, pat_node->name_pat));
   275	}
   276	
 > 277	static void netdev_name_pat_node_del(struct netdev_name_pat_node *pat_node)
   278	{
   279		hlist_del_rcu(&pat_node->hlist);
   280	}
   281	
   282	static struct netdev_name_pat_node *
   283	netdev_name_pat_node_lookup(struct net *net, const char *pat)
   284	{
   285		struct hlist_head *head = dev_name_pat_hash(net, pat);
   286		struct netdev_name_pat_node *pat_node;
   287	
   288		hlist_for_each_entry(pat_node, head, hlist) {
   289			if (!strcmp(pat_node->name_pat, pat))
   290				return pat_node;
   291		}
   292		return NULL;
   293	}
   294	
 > 295	static bool netdev_name_pat_in_use(struct net *net, const char *pat) // eth%d
   296	{
   297		return netdev_name_pat_node_lookup(net, pat);
   298	}
   299	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


* Re: [PATCH RFC net-next] net: cache the __dev_alloc_name()
From: Paolo Abeni @ 2024-05-09  7:46 UTC
  To: William Tu, netdev; +Cc: jiri, bodong, kuba

On Tue, 2024-05-07 at 11:55 -0700, William Tu wrote:
> 
> On 5/7/24 12:26 AM, Paolo Abeni wrote:
> > On Mon, 2024-05-06 at 20:32 +0000, William Tu wrote:
> > > When a system has around 1000 netdevs, adding the 1001st device becomes
> > > very slow. The devlink command to create an SF
> > >    $ devlink port add pci/0000:03:00.0 flavour pcisf \
> > >      pfnum 0 sfnum 1001
> > > takes around 5 seconds, and Linux perf and flamegraph show 19% of time
> > > spent on __dev_alloc_name() [1].
> > > 
> > > The reason is that devlink first requests the next available "eth%d"
> > > name. __dev_alloc_name() then scans all existing netdevs to match on
> > > "ethN", sets bit N in an 'inuse' bitmap, and finds/returns the next
> > > available number, in our case eth0.
> > > 
> > > Later on, based on a udev rule, the device is renamed from eth0 to
> > > "en3f0pf0sf1001", with the altname below:
> > >    14: en3f0pf0sf1001: <BROADCAST,MULTICAST,UP,LOWER_UP> ...
> > >        altname enp3s0f0npf0sf1001
> > > 
> > > So eth0 is never actually used, but since we have 1k "en3f0pf0sfN"
> > > devices + 1k altnames, __dev_alloc_name() spends lots of time going
> > > through all existing netdevs to build the 'inuse' bitmap for the
> > > pattern 'eth%d'. The bitmap barely has any bits set, yet it is rebuilt
> > > from scratch every time.
> > > 
> > > I want to see if it makes sense to save/cache the result, or if there is
> > > any way to avoid the 'eth%d' pattern search. The RFC patch adds a
> > > name_pat (name pattern) hlist and saves the 'inuse' bitmap. It stores
> > > patterns, e.g. "eth%d", "veth%d", together with their bitmaps, and looks
> > > them up before scanning all existing netdevs.
> > An alternative heuristic that should be cheap and possibly reasonable
> > could be to optimistically check for <name>0..<name><very small int>
> > availability, possibly restricting such an attempt to scenarios where
> > the total number of hashed netdevice names is somewhat high.
> > 
> > WDYT?
> > 
> > Cheers,
> > 
> > Paolo
> Hi Paolo,
> 
> Thanks for your suggestion!
> I'm not fully clear on that idea.
> 
> The current code has to do a full scan of all netdevs in a list, and the
> name list is not sorted / ordered. So to learn, e.g., that eth0 .. eth10
> exist, we still need to do a full scan, find every netdev with prefix
> "eth", and get the next available bit, 11 (10+1).
> And in another use case, where users don't install a udev rule to
> rename, the system can actually create eth998, eth999, eth1000....
> 
> What if we create a prefix map (maybe using an xarray)?
> idx   entry=(prefix, bitmap)
> --------------------
> 0      eth, 1111000000...
> 1      veth, 1000000...
> 2      can, 11100000...
> 3      firewire, 00000...
> 
> But then we need to unset the bit when a device is removed.
> William

Sorry for the late reply. I mean something like the following
(completely untested!!!):
---
diff --git a/net/core/dev.c b/net/core/dev.c
index d2ce91a334c1..0d428825f88a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1109,6 +1109,12 @@ static int __dev_alloc_name(struct net *net, const char *name, char *res)
 	if (!p || p[1] != 'd' || strchr(p + 2, '%'))
 		return -EINVAL;
 
+	for (i = 0; i < 4; ++i) {
+		snprintf(buf, IFNAMSIZ, name, i);
+		if (!__dev_get_by_name(net, buf))
+			goto found;
+	}
+
 	/* Use one page as a bit array of possible slots */
 	inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
 	if (!inuse)
@@ -1144,6 +1150,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *res)
 	if (i == max_netdevices)
 		return -ENFILE;
 
+found:
 	/* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
 	strscpy(buf, name, IFNAMSIZ);
 	snprintf(res, IFNAMSIZ, buf, i);

---
plus possibly some additional check to use such a heuristic only if the
total number of devices is significantly high. That would need some
additional book-keeping, not added here.
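
The book-keeping could be as small as a per-namespace device counter; a
hedged sketch (the 'dev_count' field and the threshold are hypothetical,
not existing kernel state):

/* hypothetical: 'dev_count' would be a new 'struct net' field,
 * updated where devices are added to / removed from the name hash
 */
static void netdev_count_inc(struct net *net)
{
	WRITE_ONCE(net->dev_count, READ_ONCE(net->dev_count) + 1);
}

static void netdev_count_dec(struct net *net)
{
	WRITE_ONCE(net->dev_count, READ_ONCE(net->dev_count) - 1);
}

/* probe <name>0..<name>3 only when the namespace is crowded enough
 * that the full scan would hurt
 */
static bool dev_name_probe_worthwhile(const struct net *net)
{
	return READ_ONCE(net->dev_count) > 256;	/* arbitrary threshold */
}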

Cheers,

Paolo



* Re: [PATCH RFC net-next] net: cache the __dev_alloc_name()
From: William Tu @ 2024-05-09 13:06 UTC
  To: Paolo Abeni, netdev; +Cc: jiri, bodong, kuba



On 5/9/24 12:46 AM, Paolo Abeni wrote:
> On Tue, 2024-05-07 at 11:55 -0700, William Tu wrote:
>> On 5/7/24 12:26 AM, Paolo Abeni wrote:
>>> On Mon, 2024-05-06 at 20:32 +0000, William Tu wrote:
>>>> When a system has around 1000 netdevs, adding the 1001st device becomes
>>>> very slow. The devlink command to create an SF
>>>>     $ devlink port add pci/0000:03:00.0 flavour pcisf \
>>>>       pfnum 0 sfnum 1001
>>>> takes around 5 seconds, and Linux perf and flamegraph show 19% of time
>>>> spent on __dev_alloc_name() [1].
>>>>
>>>> The reason is that devlink first requests the next available "eth%d"
>>>> name. __dev_alloc_name() then scans all existing netdevs to match on
>>>> "ethN", sets bit N in an 'inuse' bitmap, and finds/returns the next
>>>> available number, in our case eth0.
>>>>
>>>> Later on, based on a udev rule, the device is renamed from eth0 to
>>>> "en3f0pf0sf1001", with the altname below:
>>>>     14: en3f0pf0sf1001: <BROADCAST,MULTICAST,UP,LOWER_UP> ...
>>>>         altname enp3s0f0npf0sf1001
>>>>
>>>> So eth0 is never actually used, but since we have 1k "en3f0pf0sfN"
>>>> devices + 1k altnames, __dev_alloc_name() spends lots of time going
>>>> through all existing netdevs to build the 'inuse' bitmap for the
>>>> pattern 'eth%d'. The bitmap barely has any bits set, yet it is rebuilt
>>>> from scratch every time.
>>>>
>>>> I want to see if it makes sense to save/cache the result, or if there is
>>>> any way to avoid the 'eth%d' pattern search. The RFC patch adds a
>>>> name_pat (name pattern) hlist and saves the 'inuse' bitmap. It stores
>>>> patterns, e.g. "eth%d", "veth%d", together with their bitmaps, and looks
>>>> them up before scanning all existing netdevs.
>>> An alternative heuristic that should be cheap and possibly reasonable
>>> could be to optimistically check for <name>0..<name><very small int>
>>> availability, possibly restricting such an attempt to scenarios where
>>> the total number of hashed netdevice names is somewhat high.
>>>
>>> WDYT?
>>>
>>> Cheers,
>>>
>>> Paolo
>> Hi Paolo,
>>
>> Thanks for your suggestion!
>> I'm not fully clear on that idea.
>>
>> The current code has to do a full scan of all netdevs in a list, and the
>> name list is not sorted / ordered. So to learn, e.g., that eth0 .. eth10
>> exist, we still need to do a full scan, find every netdev with prefix
>> "eth", and get the next available bit, 11 (10+1).
>> And in another use case, where users don't install a udev rule to
>> rename, the system can actually create eth998, eth999, eth1000....
>>
>> What if we create a prefix map (maybe using an xarray)?
>> idx   entry=(prefix, bitmap)
>> --------------------
>> 0      eth, 1111000000...
>> 1      veth, 1000000...
>> 2      can, 11100000...
>> 3      firewire, 00000...
>>
>> But then we need to unset the bit when a device is removed.
>> William
> Sorry for the late reply. I mean something like the following
> (completely untested!!!):
> ---
> diff --git a/net/core/dev.c b/net/core/dev.c
> index d2ce91a334c1..0d428825f88a 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1109,6 +1109,12 @@ static int __dev_alloc_name(struct net *net, const char *name, char *res)
>          if (!p || p[1] != 'd' || strchr(p + 2, '%'))
>                  return -EINVAL;
>
> +       for (i = 0; i < 4; ++i) {
> +               snprintf(buf, IFNAMSIZ, name, i);
> +               if (!__dev_get_by_name(net, buf))
> +                       goto found;
> +       }
> +
>          /* Use one page as a bit array of possible slots */
>          inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
>          if (!inuse)
> @@ -1144,6 +1150,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *res)
>          if (i == max_netdevices)
>                  return -ENFILE;
>
> +found:
>          /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
>          strscpy(buf, name, IFNAMSIZ);
>          snprintf(res, IFNAMSIZ, buf, i);
>
> ---
> plus possibly some additional check to use such a heuristic only if the
> total number of devices is significantly high. That would need some
> additional book-keeping, not added here.
>
> Cheers,
>
> Paolo
Hi Paolo,
Thanks, now I understand the idea.
Will give it a try.
William



* Re: [PATCH RFC net-next] net: cache the __dev_alloc_name()
From: William Tu @ 2024-05-10 21:30 UTC
  To: Stephen Hemminger; +Cc: netdev, jiri, bodong, kuba, Paolo Abeni



On 5/8/24 8:27 PM, William Tu wrote:
> On 5/7/24 9:24 PM, Stephen Hemminger wrote:
>> On Mon, 6 May 2024 20:32:07 +0000
>> William Tu <witu@nvidia.com> wrote:
>>
>>> When a system has around 1000 netdevs, adding the 1001st device becomes
>>> very slow. The devlink command to create an SF
>>>    $ devlink port add pci/0000:03:00.0 flavour pcisf \
>>>      pfnum 0 sfnum 1001
>>> takes around 5 seconds, and Linux perf and flamegraph show 19% of time
>>> spent on __dev_alloc_name() [1].
>>>
>>> The reason is that devlink first requests the next available "eth%d"
>>> name. __dev_alloc_name() then scans all existing netdevs to match on
>>> "ethN", sets bit N in an 'inuse' bitmap, and finds/returns the next
>>> available number, in our case eth0.
>>>
>>> Later on, based on a udev rule, the device is renamed from eth0 to
>>> "en3f0pf0sf1001", with the altname below:
>>>    14: en3f0pf0sf1001: <BROADCAST,MULTICAST,UP,LOWER_UP> ...
>>>        altname enp3s0f0npf0sf1001
>>>
>>> So eth0 is never actually used, but since we have 1k "en3f0pf0sfN"
>>> devices + 1k altnames, __dev_alloc_name() spends lots of time going
>>> through all existing netdevs to build the 'inuse' bitmap for the
>>> pattern 'eth%d'. The bitmap barely has any bits set, yet it is rebuilt
>>> from scratch every time.
>>>
>>> I want to see if it makes sense to save/cache the result, or if there is
>>> any way to avoid the 'eth%d' pattern search. The RFC patch adds a
>>> name_pat (name pattern) hlist and saves the 'inuse' bitmap. It stores
>>> patterns, e.g. "eth%d", "veth%d", together with their bitmaps, and looks
>>> them up before scanning all existing netdevs.
>>>
>>> Note: the code works just well enough for a quick performance benchmark
>>> and is still missing lots of things. Using an hlist seems to be overkill,
>>> as I think we only have a few patterns:
>>> $ git grep alloc_netdev drivers/ net/ | grep %d
>>>
>>> 1. https://github.com/williamtu/net-next/issues/1
>>>
>>> Signed-off-by: William Tu <witu@nvidia.com>
> Hi Stephen,
> Thanks for your feedback.
>> The actual patch is a bit of a mess, with commented-out code, leftover
>> printks, and random whitespace changes. Please fix that.
> Yes, working on it.
>>
>> The issue is that the bitmap gets to be large and adds bloat to
>> embedded devices.
> The bitmap size is fixed (8*PAGE_SIZE bits, i.e. one page of memory), and
> set_bit() is also fast. It's just that for each new device we always
> re-scan all existing netdevs, set the bitmap, and then free it.
>>
>> Perhaps you could force devlink to use the same device name each time
>> (eth0) if it is going to be renamed anyway.
> It is working like that now (with udev) in my slow environment. The
> device always gets eth0 (because the bitmap is all 0s), and udev renames
> it to enp0xxx. The next allocation rescans, and since eth0 is still
> available, __dev_alloc_name() still returns eth0, and udev renames it
> again. Each subsequent device creation follows the same path, and the
> time to rescan gets longer and longer.
>
> Regards,
> William
>
>
Hi Stephen and Paolo,

Today I realized this isn't really an issue.
Basically my perf result didn't give the full picture. The 19% of time
spent on __dev_alloc_name() turns out to be OK, because:
$ time devlink port add pci/0000:03:00.0 flavour pcisf \
pfnum 0 sfnum 1001
real 0m1.440s
user 0m0.000s
sys 0m0.004s
It's 19% of the 'sys' time, not of the real time.

Thanks for your suggestions
William



