On 2024/10/19 06:22, Bjorn Helgaas wrote:
On Fri, Oct 18, 2024 at 01:47:28PM +0800, Guixin Liu wrote:
PCI driver will traverse pci device list in pci_seq_start in every
sequential file reading, use xarry to store all pci devices to
accelerate finding the start.
Use "time cat /proc/bus/pci/devices" to test on a machine with 13k
pci devices, get an improvement of about 90%.
s/pci/PCI/ (several times)
s/pci_seq_start/pci_seq_start()/
s/xarry/xarray/
s/,  /, / (remove extra space)
OK.
Without this patch:
real 0m0.917s
user 0m0.000s
sys 0m0.913s
With this patch:
real 0m0.093s
user 0m0.000s
sys 0m0.093s
Nice speedup, for sure!
Signed-off-by: Guixin Liu <kanie@xxxxxxxxxxxxxxxxx>
---
drivers/pci/pci.h | 3 +++
drivers/pci/probe.c | 1 +
drivers/pci/proc.c | 64 +++++++++++++++++++++++++++++++++++++++-----
drivers/pci/remove.c | 1 +
include/linux/pci.h | 2 ++
5 files changed, 64 insertions(+), 7 deletions(-)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 14d00ce45bfa..1a7da91eeb80 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -962,4 +962,7 @@ void pcim_release_region(struct pci_dev *pdev, int bar);
(PCI_CONF1_ADDRESS(bus, dev, func, reg) | \
PCI_CONF1_EXT_REG(reg))
+void pci_seq_tree_add_dev(struct pci_dev *dev);
+void pci_seq_tree_remove_dev(struct pci_dev *dev);
+
#endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 4f68414c3086..1fd9e9022f70 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2592,6 +2592,7 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
WARN_ON(ret < 0);
pci_npem_create(dev);
+ pci_seq_tree_add_dev(dev);
}
struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn)
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index f967709082d6..30ca071ccad5 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -19,6 +19,9 @@
static int proc_initialized; /* = 0 */
+DEFINE_XARRAY_FLAGS(pci_seq_tree, 0);
+static unsigned long pci_max_idx;
+
static loff_t proc_bus_pci_lseek(struct file *file, loff_t off, int whence)
{
struct pci_dev *dev = pde_data(file_inode(file));
@@ -334,25 +337,72 @@ static const struct proc_ops proc_bus_pci_ops = {
#endif /* HAVE_PCI_MMAP */
};
+void pci_seq_tree_add_dev(struct pci_dev *dev)
+{
+ int ret;
+
+ if (dev) {
I don't think we should test "dev" for NULL here. If it's NULL, I
think we have bigger problems and we should oops.
Sure.
+ xa_lock(&pci_seq_tree);
+ pci_dev_get(dev);
+ ret = __xa_insert(&pci_seq_tree, pci_max_idx, dev, GFP_KERNEL);
+ if (!ret) {
+ dev->proc_seq_idx = pci_max_idx;
+ pci_max_idx++;
+ } else {
+ pci_dev_put(dev);
+ WARN_ON(ret);
+ }
+ xa_unlock(&pci_seq_tree);
+ }
+}
+
+void pci_seq_tree_remove_dev(struct pci_dev *dev)
+{
+ unsigned long idx = dev->proc_seq_idx;
+ struct pci_dev *latest_dev = NULL;
+ struct pci_dev *ret;
+
+ if (!dev)
+ return;
Same comment about testing "dev" for NULL.
Ok.
+ xa_lock(&pci_seq_tree);
+ __xa_erase(&pci_seq_tree, idx);
+ pci_dev_put(dev);
+ /*
+ * Move the latest pci_dev to this idx to keep the continuity.
+ */
+ if (idx != pci_max_idx - 1) {
+ latest_dev = __xa_erase(&pci_seq_tree, pci_max_idx - 1);
+ ret = __xa_cmpxchg(&pci_seq_tree, idx, NULL, latest_dev,
+ GFP_KERNEL);
+ if (!ret)
+ latest_dev->proc_seq_idx = idx;
+ WARN_ON(ret);
+ }
+ pci_max_idx--;
+ xa_unlock(&pci_seq_tree);
+}
+
/* iterator */
static void *pci_seq_start(struct seq_file *m, loff_t *pos)
{
- struct pci_dev *dev = NULL;
+ struct pci_dev *dev;
loff_t n = *pos;
- for_each_pci_dev(dev) {
- if (!n--)
- break;
- }
+ dev = xa_load(&pci_seq_tree, n);
+ if (dev)
+ pci_dev_get(dev);
return dev;
I'm a little hesitant to add another place that keeps track of every
PCI device. It's a fair bit of code here, and it's redundant
information, which means more work to keep them all synchronized.
This proc interface feels inherently racy. We keep track of the
current item (n) in a struct seq_file, but I don't think there's
anything to prevent a pci_dev hot-add or -remove between calls to
pci_seq_start().
Yes, a reader may miss or repeat some devices in that case.
Is the proc interface the only place to get this information? If
there's a way to get it from sysfs, maybe that is better and we don't
need to spend effort optimizing the less-desirable path?
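(As an aside, the same inventory is exposed under /sys/bus/pci/devices.
Below is a minimal userspace sketch of walking that directory; it
assumes only the standard sysfs layout, elides error handling, and
nothing in it comes from the patch itself.)

/* Hypothetical userspace sketch: enumerate PCI devices from sysfs
 * instead of /proc/bus/pci/devices.  Each entry in
 * /sys/bus/pci/devices/ is a domain:bus:dev.fn directory containing
 * attribute files such as "vendor" and "device".
 */
#include <dirent.h>
#include <stdio.h>

int main(void)
{
	DIR *dir = opendir("/sys/bus/pci/devices");
	struct dirent *de;
	char path[512], vendor[16];
	FILE *f;

	if (!dir)
		return 1;
	while ((de = readdir(dir))) {
		if (de->d_name[0] == '.')	/* skip "." and ".." */
			continue;
		snprintf(path, sizeof(path),
			 "/sys/bus/pci/devices/%s/vendor", de->d_name);
		f = fopen(path, "r");
		if (!f)
			continue;
		if (fgets(vendor, sizeof(vendor), f))
			printf("%s vendor %s", de->d_name, vendor);
		fclose(f);
	}
	closedir(dir);
	return 0;
}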
This is the situation I encountered: during rapid container scaling,
each container being started executes lspci, which traverses the
/proc/bus/pci/devices file, or the container process reads that file
directly. When many containers start at once, they all contend on the
klist locks used for traversing pci_dev, so numerous containers end up
waiting, which greatly reduces the efficiency of container scaling and
also leaves other CPUs unresponsive.
User-space programs, including Docker, are clients that we cannot easily
modify.
Therefore, I attempted to accelerate pci_seq_start() within the kernel.
This indeed resulted in the need for more code to maintain, as we must
ensure both fast access and ordering. Initially, I considered directly
modifying the klist in the driver base module, but such changes would
impact other drivers as well.
Do you have any other good suggestions?
Best Regards,
Guixin Liu
}
static void *pci_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct pci_dev *dev = v;
+ struct pci_dev *dev;
(*pos)++;
- dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
+ dev = xa_load(&pci_seq_tree, *pos);
+ if (dev)
+ pci_dev_get(dev);
Where is the pci_dev_put() that corresponds with this new
pci_dev_get()?
pci_seq_stop() calls the matching pci_dev_put(); see the sketch after this hunk.
return dev;
}
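(For context, the stop callback is not shown in the quoted hunks; the
existing one in drivers/pci/proc.c already drops the reference taken by
the start/next callbacks, roughly as follows.)

#include <linux/pci.h>
#include <linux/seq_file.h>

/* Drop the reference taken in pci_seq_start()/pci_seq_next(). */
static void pci_seq_stop(struct seq_file *m, void *v)
{
	if (v) {
		struct pci_dev *dev = v;

		pci_dev_put(dev);
	}
}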
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index e4ce1145aa3e..257ea46233a3 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -53,6 +53,7 @@ static void pci_destroy_dev(struct pci_dev *dev)
pci_npem_remove(dev);
device_del(&dev->dev);
+ pci_seq_tree_remove_dev(dev);
down_write(&pci_bus_sem);
list_del(&dev->bus_list);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 573b4c4c2be6..aeb3d4cce06a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -534,6 +534,8 @@ struct pci_dev {
/* These methods index pci_reset_fn_methods[] */
u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */
+
+ unsigned long long proc_seq_idx;
};
static inline struct pci_dev *pci_physfn(struct pci_dev *dev)
--
2.43.0