Re: [PATCH v3 kvmtool 29/32] pci: Implement reassignable BARs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

On 4/6/20 3:05 PM, André Przywara wrote:
> On 26/03/2020 15:24, Alexandru Elisei wrote:
>> BARs are used by the guest to configure the access to the PCI device by
>> writing the address to which the device will respond. The basic idea for
>> adding support for reassignable BARs is straightforward: deactivate
>> emulation for the memory region described by the old BAR value, and
>> activate emulation for the new region.
>>
>> BAR reassignement can be done while device access is enabled and memory
>> regions for different devices can overlap as long as no access is made
>> to the overlapping memory regions. This means that it is legal for the
>> BARs of two distinct devices to point to an overlapping memory region,
>> and indeed, this is how Linux does resource assignment at boot. To
>> account for this situation, the simple algorithm described above is
>> enhanced to scan for all devices and:
>>
>> - Deactivate emulation for any BARs that might overlap with the new BAR
>>   value.
>>
>> - Enable emulation for any BARs that were overlapping with the old value
>>   after the BAR has been updated.
>>
>> Activating/deactivating emulation of a memory region has side effects.
>> In order to prevent the execution of the same callback twice we now keep
>> track of the state of the region emulation. For example, this can happen
>> if we program a BAR with an address that overlaps a second BAR, thus
>> deactivating emulation for the second BAR, and then we disable all
>> region accesses to the second BAR by writing to the command register.
>>
>> Signed-off-by: Alexandru Elisei <alexandru.elisei@xxxxxxx>
>> ---
>>  include/kvm/pci.h |  14 ++-
>>  pci.c             | 273 +++++++++++++++++++++++++++++++++++++---------
>>  vfio/pci.c        |  12 ++
>>  3 files changed, 244 insertions(+), 55 deletions(-)
>>
>> diff --git a/include/kvm/pci.h b/include/kvm/pci.h
>> index 1d7d4c0cea5a..be75f77fd2cb 100644
>> --- a/include/kvm/pci.h
>> +++ b/include/kvm/pci.h
>> @@ -11,6 +11,17 @@
>>  #include "kvm/msi.h"
>>  #include "kvm/fdt.h"
>>  
>> +#define pci_dev_err(pci_hdr, fmt, ...) \
>> +	pr_err("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
>> +#define pci_dev_warn(pci_hdr, fmt, ...) \
>> +	pr_warning("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
>> +#define pci_dev_info(pci_hdr, fmt, ...) \
>> +	pr_info("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
>> +#define pci_dev_dbg(pci_hdr, fmt, ...) \
>> +	pr_debug("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
>> +#define pci_dev_die(pci_hdr, fmt, ...) \
>> +	die("[%04x:%04x] " fmt, pci_hdr->vendor_id, pci_hdr->device_id, ##__VA_ARGS__)
>> +
>>  /*
>>   * PCI Configuration Mechanism #1 I/O ports. See Section 3.7.4.1.
>>   * ("Configuration Mechanism #1") of the PCI Local Bus Specification 2.1 for
>> @@ -142,7 +153,8 @@ struct pci_device_header {
>>  	};
>>  
>>  	/* Private to lkvm */
>> -	u32		bar_size[6];
>> +	u32			bar_size[6];
>> +	bool			bar_active[6];
>>  	bar_activate_fn_t	bar_activate_fn;
>>  	bar_deactivate_fn_t	bar_deactivate_fn;
>>  	void *data;
>> diff --git a/pci.c b/pci.c
>> index c2860e6707fe..68ece65441a6 100644
>> --- a/pci.c
>> +++ b/pci.c
>> @@ -71,6 +71,11 @@ static bool pci_bar_is_implemented(struct pci_device_header *pci_hdr, int bar_nu
>>  	return pci__bar_size(pci_hdr, bar_num);
>>  }
>>  
>> +static bool pci_bar_is_active(struct pci_device_header *pci_hdr, int bar_num)
>> +{
>> +	return  pci_hdr->bar_active[bar_num];
>> +}
>> +
>>  static void *pci_config_address_ptr(u16 port)
>>  {
>>  	unsigned long offset;
>> @@ -163,6 +168,46 @@ static struct ioport_operations pci_config_data_ops = {
>>  	.io_out	= pci_config_data_out,
>>  };
>>  
>> +static int pci_activate_bar(struct kvm *kvm, struct pci_device_header *pci_hdr,
>> +			    int bar_num)
>> +{
>> +	int r = 0;
>> +
>> +	if (pci_bar_is_active(pci_hdr, bar_num))
>> +		goto out;
>> +
>> +	r = pci_hdr->bar_activate_fn(kvm, pci_hdr, bar_num, pci_hdr->data);
>> +	if (r < 0) {
>> +		pci_dev_warn(pci_hdr, "Error activating emulation for BAR %d",
>> +			    bar_num);
>> +		goto out;
>> +	}
>> +	pci_hdr->bar_active[bar_num] = true;
>> +
>> +out:
>> +	return r;
>> +}
>> +
>> +static int pci_deactivate_bar(struct kvm *kvm, struct pci_device_header *pci_hdr,
>> +			      int bar_num)
>> +{
>> +	int r = 0;
>> +
>> +	if (!pci_bar_is_active(pci_hdr, bar_num))
>> +		goto out;
>> +
>> +	r = pci_hdr->bar_deactivate_fn(kvm, pci_hdr, bar_num, pci_hdr->data);
>> +	if (r < 0) {
>> +		pci_dev_warn(pci_hdr, "Error deactivating emulation for BAR %d",
>> +			    bar_num);
>> +		goto out;
>> +	}
>> +	pci_hdr->bar_active[bar_num] = false;
>> +
>> +out:
>> +	return r;
>> +}
>> +
>>  static void pci_config_command_wr(struct kvm *kvm,
>>  				  struct pci_device_header *pci_hdr,
>>  				  u16 new_command)
>> @@ -179,26 +224,179 @@ static void pci_config_command_wr(struct kvm *kvm,
>>  
>>  		if (toggle_io && pci__bar_is_io(pci_hdr, i)) {
>>  			if (__pci__io_space_enabled(new_command))
>> -				pci_hdr->bar_activate_fn(kvm, pci_hdr, i,
>> -							 pci_hdr->data);
>> +				pci_activate_bar(kvm, pci_hdr, i);
>>  			else
>> -				pci_hdr->bar_deactivate_fn(kvm, pci_hdr, i,
>> -							   pci_hdr->data);
>> +				pci_deactivate_bar(kvm, pci_hdr, i);
>>  		}
>>  
>>  		if (toggle_mem && pci__bar_is_memory(pci_hdr, i)) {
>>  			if (__pci__memory_space_enabled(new_command))
>> -				pci_hdr->bar_activate_fn(kvm, pci_hdr, i,
>> -							 pci_hdr->data);
>> +				pci_activate_bar(kvm, pci_hdr, i);
>>  			else
>> -				pci_hdr->bar_deactivate_fn(kvm, pci_hdr, i,
>> -							   pci_hdr->data);
>> +				pci_deactivate_bar(kvm, pci_hdr, i);
>>  		}
>>  	}
>>  
>>  	pci_hdr->command = new_command;
>>  }
>>  
>> +static int pci_deactivate_bar_regions(struct kvm *kvm,
>> +				      struct pci_device_header *pci_hdr,
> What is this argument for? I don't see it used, also this function is
> not used as a callback. A leftover from some refactoring?

Nicely spotted, it's a leftover from refactoring, I'll remove it.

>
>> +				      u32 start, u32 size)
>> +{
>> +	struct device_header *dev_hdr;
>> +	struct pci_device_header *tmp_hdr;
>> +	u32 tmp_addr, tmp_size;
>> +	int i, r;
>> +
>> +	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
>> +	while (dev_hdr) {
>> +		tmp_hdr = dev_hdr->data;
>> +		for (i = 0; i < 6; i++) {
>> +			if (!pci_bar_is_implemented(tmp_hdr, i))
>> +				continue;
>> +
>> +			tmp_addr = pci__bar_address(tmp_hdr, i);
>> +			tmp_size = pci__bar_size(tmp_hdr, i);
>> +
>> +			if (tmp_addr + tmp_size <= start ||
>> +			    tmp_addr >= start + size)
>> +				continue;
>> +
>> +			r = pci_deactivate_bar(kvm, tmp_hdr, i);
> This and the function below are identical, apart from this line here.
> Any chance we can merge this?

Good idea, I'll merge them.

>
>> +			if (r < 0)
>> +				return r;
>> +		}
>> +		dev_hdr = device__next_dev(dev_hdr);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int pci_activate_bar_regions(struct kvm *kvm,
>> +				    struct pci_device_header *pci_hdr,
> Same question about this parameter, we don't seem to need it. But you
> should merge those two functions anyway.
>
> The rest seems to look alright. The pci_config_bar_wr() looks a bit
> lengthy, but I guess there is no way around it, since this is the gut of
> the algorithm explained in the commit message. Can you maybe add a
> condensed version of that explanation to the code?

I can try adding a comment, but it will make the function even more lengthy. I'll
give it a go and see how it looks.

Thanks,
Alex
>
> Thanks,
> Andre
>
>
>> +				    u32 start, u32 size)
>> +{
>> +	struct device_header *dev_hdr;
>> +	struct pci_device_header *tmp_hdr;
>> +	u32 tmp_addr, tmp_size;
>> +	int i, r;
>> +
>> +	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
>> +	while (dev_hdr) {
>> +		tmp_hdr = dev_hdr->data;
>> +		for (i = 0; i < 6; i++) {
>> +			if (!pci_bar_is_implemented(tmp_hdr, i))
>> +				continue;
>> +
>> +			tmp_addr = pci__bar_address(tmp_hdr, i);
>> +			tmp_size = pci__bar_size(tmp_hdr, i);
>> +
>> +			if (tmp_addr + tmp_size <= start ||
>> +			    tmp_addr >= start + size)
>> +				continue;
>> +
>> +			r = pci_activate_bar(kvm, tmp_hdr, i);
>> +			if (r < 0)
>> +				return r;
>> +		}
>> +		dev_hdr = device__next_dev(dev_hdr);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static void pci_config_bar_wr(struct kvm *kvm,
>> +			      struct pci_device_header *pci_hdr, int bar_num,
>> +			      u32 value)
>> +{
>> +	u32 old_addr, new_addr, bar_size;
>> +	u32 mask;
>> +	int r;
>> +
>> +	if (pci__bar_is_io(pci_hdr, bar_num))
>> +		mask = (u32)PCI_BASE_ADDRESS_IO_MASK;
>> +	else
>> +		mask = (u32)PCI_BASE_ADDRESS_MEM_MASK;
>> +
>> +	/*
>> +	 * If the kernel masks the BAR, it will expect to find the size of the
>> +	 * BAR there next time it reads from it. After the kernel reads the
>> +	 * size, it will write the address back.
>> +	 *
>> +	 * According to the PCI local bus specification REV 3.0: The number of
>> +	 * upper bits that a device actually implements depends on how much of
>> +	 * the address space the device will respond to. A device that wants a 1
>> +	 * MB memory address space (using a 32-bit base address register) would
>> +	 * build the top 12 bits of the address register, hardwiring the other
>> +	 * bits to 0.
>> +	 *
>> +	 * Furthermore, software can determine how much address space the device
>> +	 * requires by writing a value of all 1's to the register and then
>> +	 * reading the value back. The device will return 0's in all don't-care
>> +	 * address bits, effectively specifying the address space required.
>> +	 *
>> +	 * Software computes the size of the address space with the formula
>> +	 * S =  ~B + 1, where S is the memory size and B is the value read from
>> +	 * the BAR. This means that the BAR value that kvmtool should return is
>> +	 * B = ~(S - 1).
>> +	 */
>> +	if (value == 0xffffffff) {
>> +		value = ~(pci__bar_size(pci_hdr, bar_num) - 1);
>> +		/* Preserve the special bits. */
>> +		value = (value & mask) | (pci_hdr->bar[bar_num] & ~mask);
>> +		pci_hdr->bar[bar_num] = value;
>> +		return;
>> +	}
>> +
>> +	value = (value & mask) | (pci_hdr->bar[bar_num] & ~mask);
>> +
>> +	/* Don't toggle emulation when region type access is disbled. */
>> +	if (pci__bar_is_io(pci_hdr, bar_num) &&
>> +	    !pci__io_space_enabled(pci_hdr)) {
>> +		pci_hdr->bar[bar_num] = value;
>> +		return;
>> +	}
>> +
>> +	if (pci__bar_is_memory(pci_hdr, bar_num) &&
>> +	    !pci__memory_space_enabled(pci_hdr)) {
>> +		pci_hdr->bar[bar_num] = value;
>> +		return;
>> +	}
>> +
>> +	old_addr = pci__bar_address(pci_hdr, bar_num);
>> +	new_addr = __pci__bar_address(value);
>> +	bar_size = pci__bar_size(pci_hdr, bar_num);
>> +
>> +	r = pci_deactivate_bar(kvm, pci_hdr, bar_num);
>> +	if (r < 0)
>> +		return;
>> +
>> +	r = pci_deactivate_bar_regions(kvm, pci_hdr, new_addr, bar_size);
>> +	if (r < 0) {
>> +		/*
>> +		 * We cannot update the BAR because of an overlapping region
>> +		 * that failed to deactivate emulation, so keep the old BAR
>> +		 * value and re-activate emulation for it.
>> +		 */
>> +		pci_activate_bar(kvm, pci_hdr, bar_num);
>> +		return;
>> +	}
>> +
>> +	pci_hdr->bar[bar_num] = value;
>> +	r = pci_activate_bar(kvm, pci_hdr, bar_num);
>> +	if (r < 0) {
>> +		/*
>> +		 * New region cannot be emulated, re-enable the regions that
>> +		 * were overlapping.
>> +		 */
>> +		pci_activate_bar_regions(kvm, pci_hdr, new_addr, bar_size);
>> +		return;
>> +	}
>> +
>> +	pci_activate_bar_regions(kvm, pci_hdr, old_addr, bar_size);
>> +}
>> +
>>  void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
>>  {
>>  	void *base;
>> @@ -206,7 +404,6 @@ void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data,
>>  	struct pci_device_header *pci_hdr;
>>  	u8 dev_num = addr.device_number;
>>  	u32 value = 0;
>> -	u32 mask;
>>  
>>  	if (!pci_device_exists(addr.bus_number, dev_num, 0))
>>  		return;
>> @@ -231,46 +428,13 @@ void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data,
>>  	}
>>  
>>  	bar = (offset - PCI_BAR_OFFSET(0)) / sizeof(u32);
>> -
>> -	/*
>> -	 * If the kernel masks the BAR, it will expect to find the size of the
>> -	 * BAR there next time it reads from it. After the kernel reads the
>> -	 * size, it will write the address back.
>> -	 */
>>  	if (bar < 6) {
>> -		if (pci__bar_is_io(pci_hdr, bar))
>> -			mask = (u32)PCI_BASE_ADDRESS_IO_MASK;
>> -		else
>> -			mask = (u32)PCI_BASE_ADDRESS_MEM_MASK;
>> -		/*
>> -		 * According to the PCI local bus specification REV 3.0:
>> -		 * The number of upper bits that a device actually implements
>> -		 * depends on how much of the address space the device will
>> -		 * respond to. A device that wants a 1 MB memory address space
>> -		 * (using a 32-bit base address register) would build the top
>> -		 * 12 bits of the address register, hardwiring the other bits
>> -		 * to 0.
>> -		 *
>> -		 * Furthermore, software can determine how much address space
>> -		 * the device requires by writing a value of all 1's to the
>> -		 * register and then reading the value back. The device will
>> -		 * return 0's in all don't-care address bits, effectively
>> -		 * specifying the address space required.
>> -		 *
>> -		 * Software computes the size of the address space with the
>> -		 * formula S = ~B + 1, where S is the memory size and B is the
>> -		 * value read from the BAR. This means that the BAR value that
>> -		 * kvmtool should return is B = ~(S - 1).
>> -		 */
>>  		memcpy(&value, data, size);
>> -		if (value == 0xffffffff)
>> -			value = ~(pci__bar_size(pci_hdr, bar) - 1);
>> -		/* Preserve the special bits. */
>> -		value = (value & mask) | (pci_hdr->bar[bar] & ~mask);
>> -		memcpy(base + offset, &value, size);
>> -	} else {
>> -		memcpy(base + offset, data, size);
>> +		pci_config_bar_wr(kvm, pci_hdr, bar, value);
>> +		return;
>>  	}
>> +
>> +	memcpy(base + offset, data, size);
>>  }
>>  
>>  void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
>> @@ -338,20 +502,21 @@ int pci__register_bar_regions(struct kvm *kvm, struct pci_device_header *pci_hdr
>>  			continue;
>>  
>>  		has_bar_regions = true;
>> +		assert(!pci_bar_is_active(pci_hdr, i));
>>  
>>  		if (pci__bar_is_io(pci_hdr, i) &&
>>  		    pci__io_space_enabled(pci_hdr)) {
>> -				r = bar_activate_fn(kvm, pci_hdr, i, data);
>> -				if (r < 0)
>> -					return r;
>> -			}
>> +			r = pci_activate_bar(kvm, pci_hdr, i);
>> +			if (r < 0)
>> +				return r;
>> +		}
>>  
>>  		if (pci__bar_is_memory(pci_hdr, i) &&
>>  		    pci__memory_space_enabled(pci_hdr)) {
>> -				r = bar_activate_fn(kvm, pci_hdr, i, data);
>> -				if (r < 0)
>> -					return r;
>> -			}
>> +			r = pci_activate_bar(kvm, pci_hdr, i);
>> +			if (r < 0)
>> +				return r;
>> +		}
>>  	}
>>  
>>  	assert(has_bar_regions);
>> diff --git a/vfio/pci.c b/vfio/pci.c
>> index 18e22a8c5320..2b891496547d 100644
>> --- a/vfio/pci.c
>> +++ b/vfio/pci.c
>> @@ -457,6 +457,7 @@ static int vfio_pci_bar_activate(struct kvm *kvm,
>>  	struct vfio_pci_msix_pba *pba = &pdev->msix_pba;
>>  	struct vfio_pci_msix_table *table = &pdev->msix_table;
>>  	struct vfio_region *region;
>> +	u32 bar_addr;
>>  	bool has_msix;
>>  	int ret;
>>  
>> @@ -465,7 +466,14 @@ static int vfio_pci_bar_activate(struct kvm *kvm,
>>  	region = &vdev->regions[bar_num];
>>  	has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX;
>>  
>> +	bar_addr = pci__bar_address(pci_hdr, bar_num);
>> +	if (pci__bar_is_io(pci_hdr, bar_num))
>> +		region->port_base = bar_addr;
>> +	else
>> +		region->guest_phys_addr = bar_addr;
>> +
>>  	if (has_msix && (u32)bar_num == table->bar) {
>> +		table->guest_phys_addr = region->guest_phys_addr;
>>  		ret = kvm__register_mmio(kvm, table->guest_phys_addr,
>>  					 table->size, false,
>>  					 vfio_pci_msix_table_access, pdev);
>> @@ -474,6 +482,10 @@ static int vfio_pci_bar_activate(struct kvm *kvm,
>>  	}
>>  
>>  	if (has_msix && (u32)bar_num == pba->bar) {
>> +		if (pba->bar == table->bar)
>> +			pba->guest_phys_addr = table->guest_phys_addr + table->size;
>> +		else
>> +			pba->guest_phys_addr = region->guest_phys_addr;
>>  		ret = kvm__register_mmio(kvm, pba->guest_phys_addr,
>>  					 pba->size, false,
>>  					 vfio_pci_msix_pba_access, pdev);
>>



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux