在 KVMTOOL 中对于 PCI 设备的模拟,每个 virtio 后端驱动都会有类似如下的调用来初始化后端,例如 virtio-vsock:virtio_vsock_init_one -> virtio_init -> virtio_pci__init(vdev->ops->init)。virtio_pci__init 函数就是在模拟一个 PCI 设备的配置空间。
/*
 * virtio_pci__init - build the emulated PCI configuration space of a
 * virtio device and register the device.
 *
 * @kvm:       VM instance; stored in vpci->kvm.
 * @dev:       backend-private device pointer; stored in vpci->dev.
 * @vdev:      virtio device whose ->virtio member is the struct virtio_pci
 *             being initialised.
 * @device_id: value written to the PCI device ID field.
 * @subsys_id: value written to the PCI subsystem ID field.
 * @class:     24-bit PCI class code, split byte-wise into class[0..2].
 *
 * Steps, in order: allocate one I/O port block and two MMIO blocks, fill
 * vpci->pci_hdr, register the BAR activate/deactivate callbacks, fill the
 * MSI-X capability, record the IRQ returned by pci__assign_irq() and
 * finally register the device header.
 *
 * Returns 0 on success, otherwise the negative value returned by
 * pci__register_bar_regions() or device__register().
 */
int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
int device_id, int subsys_id, int class)
{
struct virtio_pci *vpci = vdev->virtio;
u32 mmio_addr, msix_io_block;
u16 port_addr;
int r;
vpci->kvm = kvm;
vpci->dev = dev;
BUILD_BUG_ON(!is_power_of_two(PCI_IO_SIZE)); // size of the PCI device I/O space (0x100 per the original note)
port_addr = pci_get_io_port_block(PCI_IO_SIZE);// allocate the device's I/O port range; per the
// original note, allocation starts at PCI_IOPORT_START (0x6200 on x86)
mmio_addr = pci_get_mmio_block(PCI_IO_SIZE); // per the original note, on x86 MMIO space for PCI
// devices is handed out starting at 0xD2000000
// Separate MMIO block that backs BAR 2 (see .bar[2] below).
msix_io_block = pci_get_mmio_block(VIRTIO_MSIX_BAR_SIZE);
// NOTE: this assignment replaces the whole header, so the MSI-X capability
// fields are filled in only afterwards, further down.
vpci->pci_hdr = (struct pci_device_header) {
.vendor_id = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET), // 0x1af4
/*
A virtio-pci device emulated on the QEMU/LKVM side uses vendor ID 0x1af4 and
a device ID in the range 0x1000..0x103F. The Subsystem Device ID tells the
device type: 1 means network, 2 means block device. The configuration space
of a virtio-pci device is the same as that of a conventional PCI device and
reuses the interrupt, DMA, etc. mechanisms of the PCI standard.
When the guest OS finds the device during its bus scan it adds it to the
system, which causes virtio_pci_driver to be loaded. As a result a PCI
device with vendor id = 0x1af4 shows up in /sys/bus/pci/devices, bound to a
driver named "virtio-pci". A key step in this sequence is the driver's
.probe member, i.e. virtio_pci_probe().
*/
.device_id = cpu_to_le16(device_id),
// e.g. PCI_DEVICE_ID_VIRTIO_VSOCK (0x1012), defined in
// include/kvm/virtio-pci-dev.h
.command = PCI_COMMAND_IO | PCI_COMMAND_MEMORY,
.header_type = PCI_HEADER_TYPE_NORMAL,
// marks this as an endpoint (EP) device; PCI_HEADER_TYPE_BRIDGE would mean a bridge
.revision_id = 0,
.class[0] = class & 0xff,
.class[1] = (class >> 8) & 0xff,
.class[2] = (class >> 16) & 0xff,
.subsys_vendor_id = cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET),
.subsys_id = cpu_to_le16(subsys_id), // VIRTIO device id,
// e.g. VIRTIO_ID_VSOCK
/*
A standard PCIe device may have several internal regions (possibly with
different attributes, e.g. prefetchable or not) that must be mapped into
memory space. The size and attributes of these regions are fixed in the
configuration BAR registers when the device is manufactured. After power-on,
system software reads the BARs, allocates system address space for each and
writes the chosen base address back into the BAR. (A BAR holds a PCI
bus-domain address while the CPU uses memory-domain addresses, so a CPU
access to a PCIe device requires translating between the two domains.)
*/
.bar[0] = cpu_to_le32(port_addr | PCI_BASE_ADDRESS_SPACE_IO),
// PCI_BASE_ADDRESS_SPACE_IO = 1: I/O BAR, starting at 0x6200
.bar[1] = cpu_to_le32(mmio_addr | PCI_BASE_ADDRESS_SPACE_MEMORY),
// PCI_BASE_ADDRESS_SPACE_MEMORY = 0: memory BAR, starting at 0xD2000000
.bar[2] = cpu_to_le32(msix_io_block | PCI_BASE_ADDRESS_SPACE_MEMORY),
.status = cpu_to_le16(PCI_STATUS_CAP_LIST),
// Capability pointer: byte offset of the msix member inside pci_hdr.
.capabilities = (void *)&vpci->pci_hdr.msix - (void *)&vpci->pci_hdr,
.bar_size[0] = cpu_to_le32(PCI_IO_SIZE),
.bar_size[1] = cpu_to_le32(PCI_IO_SIZE),
.bar_size[2] = cpu_to_le32(VIRTIO_MSIX_BAR_SIZE),
};
r = pci__register_bar_regions(kvm, &vpci->pci_hdr,
virtio_pci__bar_activate, // activate a bar
virtio_pci__bar_deactivate, vdev); // deactivate a bar
if (r < 0)
return r;
vpci->dev_hdr = (struct device_header) {
.bus_type = DEVICE_BUS_PCI,
.data = &vpci->pci_hdr,
};
// MSI-X is the only capability in the list (next = 0 terminates it).
vpci->pci_hdr.msix.cap = PCI_CAP_ID_MSIX;
vpci->pci_hdr.msix.next = 0;
/*
* We at most have VIRTIO_NR_MSIX entries (VIRTIO_PCI_MAX_VQ
* entries for virt queue, VIRTIO_PCI_MAX_CONFIG entries for
* config).
*
* To quote the PCI spec:
*
* System software reads this field to determine the
* MSI-X Table Size N, which is encoded as N-1.
* For example, a returned value of "00000000011"
* indicates a table size of 4.
*/
vpci->pci_hdr.msix.ctrl = cpu_to_le16(VIRTIO_NR_MSIX - 1);
/* Both table and PBA are mapped to the same BAR (2) */
vpci->pci_hdr.msix.table_offset = cpu_to_le32(2);
// The PBA value combines the same BAR number (2) with VIRTIO_MSIX_TABLE_SIZE.
vpci->pci_hdr.msix.pba_offset = cpu_to_le32(2 | VIRTIO_MSIX_TABLE_SIZE);
vpci->config_vector = 0;
// Advertise the MSI-signalling feature only when irq__can_signal_msi() says so.
if (irq__can_signal_msi(kvm))
vpci->features |= VIRTIO_PCI_F_SIGNAL_MSI;
vpci->legacy_irq_line = pci__assign_irq(&vpci->pci_hdr);
r = device__register(&vpci->dev_hdr);
if (r < 0)
return r;
return 0;
}
/*
 * x86 guest physical memory layout
 * (taken from x86/include/kvm/kvm-arch.h)
 *
 * The hole includes VESA framebuffer and PCI memory.
 */
#define KVM_32BIT_MAX_MEM_SIZE	(1ULL << 32)	/* 0x100000000 (4 GiB) */
#define KVM_32BIT_GAP_SIZE	(768 << 20)	/* 0x30000000 (768 MiB) */
#define KVM_32BIT_GAP_START	(KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE)	/* 0xD0000000 */
#define KVM_MMIO_START		KVM_32BIT_GAP_START	/* 0xD0000000 */
/* This is the address that pci_get_io_port_block() starts allocating
 * from. Note that this is a PCI bus address (though same on x86).
 */
#define KVM_IOPORT_AREA		0x0
#define KVM_PCI_CFG_AREA	(KVM_MMIO_START + 0x1000000)	/* 0xD1000000 */
#define KVM_PCI_MMIO_AREA	(KVM_MMIO_START + 0x2000000)	/* 0xD2000000 */
#define KVM_VIRTIO_MMIO_AREA	(KVM_MMIO_START + 0x3000000)	/* 0xD3000000 */
Guest 因 KVM_EXIT_MMIO/KVM_EXIT_IO 原因退出时,根据退出时访问的 addr 判断所属的 bar 空间,然后进行 IN/OUT 操作,调用链如下:
kvm_cpu_thread (vcpu thread)
  -> kvm_cpu__start
    -> KVM_EXIT_{MMIO,IO}: kvm_cpu__emulate_{mmio,io}
      -> kvm__emulate_{mmio,io}
        -> mmio_fn (virtio_pci__io_mmio_callback)
          -> virtio_pci__data_{in,out}
/*
 * Shared trap handler for accesses to a virtio-pci device's BAR0 / BAR1.
 *
 * @vcpu:     vCPU that performed the access; forwarded to the data handlers.
 * @addr:     address that was accessed.
 * @data:     buffer holding the value written, or receiving the value read.
 * @len:      access width in bytes.
 * @is_write: non-zero for a write, zero for a read.
 * @ptr:      the struct virtio_device this handler was registered for.
 *
 * The access is converted to an offset relative to the BAR it falls in and
 * dispatched to virtio_pci__data_out() (write) or virtio_pci__data_in() (read).
 */
static void virtio_pci__io_mmio_callback(struct kvm_cpu *vcpu,
					 u64 addr, u8 *data, u32 len,
					 u8 is_write, void *ptr)
{
	struct virtio_device *vdev = ptr;
	struct virtio_pci *vpci = vdev->virtio;
	u32 io_bar = virtio_pci__port_addr(vpci);	/* base held in BAR0 */
	u32 mem_bar = virtio_pci__mmio_addr(vpci);	/* base held in BAR1 */
	u32 bar_base;

	/*
	 * An address inside the BAR0 window is relative to BAR0; every
	 * other address is treated as belonging to BAR1.
	 */
	bar_base = (addr >= io_bar &&
		    addr < io_bar + pci__bar_size(&vpci->pci_hdr, 0))
			? io_bar : mem_bar;

	/* Hand the handlers the offset within the BAR, not the raw address. */
	if (is_write)
		virtio_pci__data_out(vcpu, vdev, addr - bar_base, data, len);
	else
		virtio_pci__data_in(vcpu, vdev, addr - bar_base, data, len);
}
在 virtio_pci__data_{out,in} 函数中,bar 空间被用作 guest 与 host 之间的 virtio feature 协商空间,例如 QUEUE 的配置、PCI 设备状态、QUEUE NOTIFY 等:
/*
 * Handle a guest write to the virtio-pci register block.
 *
 * @vcpu:   vCPU that performed the write.
 * @vdev:   virtio device being accessed.
 * @offset: offset of the write relative to the start of the BAR.
 * @data:   pointer to the value written by the guest.
 * @size:   width of the write in bytes; only forwarded to the
 *          device-specific handler.
 *
 * Returns true for every register handled here; for any other offset the
 * result of virtio_pci__specific_data_out() is returned.
 */
static bool virtio_pci__data_out(struct kvm_cpu *vcpu, struct virtio_device *vdev,
				 unsigned long offset, void *data, int size)
{
	struct kvm *kvm = vcpu->kvm;
	struct virtio_pci *vpci = vdev->virtio;
	u32 val;

	switch (offset) {
	case VIRTIO_PCI_GUEST_FEATURES:
		/* Feature bits the guest driver has accepted. */
		val = ioport__read32(data);
		virtio_set_guest_features(kvm, vdev, vpci->dev, val);
		return true;

	case VIRTIO_PCI_QUEUE_PFN:
		/* Configure (non-zero PFN) or tear down (zero PFN) the selected queue. */
		val = ioport__read32(data);
		if (!val) {
			virtio_pci_exit_vq(kvm, vdev, vpci->queue_selector);
			return true;
		}
		/* Set up the ioeventfd for this queue index first ... */
		virtio_pci__init_ioeventfd(kvm, vdev, vpci->queue_selector);
		/* ... then let the backend initialise its user-space virtqueue. */
		vdev->ops->init_vq(kvm, vpci->dev, vpci->queue_selector,
				   1 << VIRTIO_PCI_QUEUE_ADDR_SHIFT,
				   VIRTIO_PCI_VRING_ALIGN, val);
		return true;

	case VIRTIO_PCI_QUEUE_SEL:
		/* Index of the queue that subsequent queue registers refer to. */
		vpci->queue_selector = ioport__read16(data);
		return true;

	case VIRTIO_PCI_QUEUE_NOTIFY:
		/*
		 * The guest kicks the host backend by writing the queue index
		 * here. (Original author's note: in vhost mode the kick is
		 * handled in the kernel and never reaches this path.)
		 */
		val = ioport__read16(data);
		vdev->ops->notify_vq(kvm, vpci->dev, val);
		return true;

	case VIRTIO_PCI_STATUS:
		/* Device status as written by the guest's virtio_pci driver. */
		vpci->status = ioport__read8(data);
		if (!vpci->status)	/* Sample endianness on reset */
			vdev->endian = kvm_cpu__get_endianness(vcpu);
		virtio_notify_status(kvm, vdev, vpci->dev, vpci->status);
		return true;

	default:
		/* Not a common register: defer to the device-specific handler. */
		return virtio_pci__specific_data_out(kvm, vdev, data, size, offset);
	}
}
Guest 侧 virtio-pci 设备驱动的探测路径及加载打印如下:
virtio_pci_driver
  -> virtio_pci_probe
    -> virtio_pci_modern_probe
[ 0.372195] PCI: Probing PCI hardware
[ 0.372519] PCI host bridge to bus 0000:00
[ 0.372881] pci_bus 0000:00: root bus resource [io 0x0000-0xffff]
[ 0.373418] pci_bus 0000:00: root bus resource [mem 0x00000000-0x7fffffffff]
[ 0.374028] pci_bus 0000:00: No busn resource found for root bus, will use [bus 00-ff]
[ 0.374767] pci 0000:00:00.0: [1af4:1009] type 00 class 0xff0000
[ 0.375389] pci 0000:00:00.0: reg 0x10: [io 0x6200-0x62ff] //根据上述virtio_pci__init的设置,可知这是virtio-pci设备x的bar0空间
[ 0.375913] pci 0000:00:00.0: reg 0x14: [mem 0xd2000000-0xd20000ff] //根据上述virtio_pci__init的设置,可知这是virtio-pci设备x的bar1空间
[ 0.376214] pci 0000:00:00.0: reg 0x18: [mem 0xd2000400-0xd20007ff]
[ 0.377300] pci 0000:00:01.0: [1af4:1009] type 00 class 0xff0000
[ 0.377920] pci 0000:00:01.0: reg 0x10: [io 0x6300-0x63ff] //根据上述virtio_pci__init的设置,可知这是virtio-pci设备y的bar0空间
[ 0.378466] pci 0000:00:01.0: reg 0x14: [mem 0xd2000800-0xd20008ff] //根据上述virtio_pci__init的设置,可知这是virtio-pci设备y的bar1空间
[ 0.379047] pci 0000:00:01.0: reg 0x18: [mem 0xd2000c00-0xd2000fff]
[ 0.380589] pci 0000:00:02.0: [1af4:1000] type 00 class 0x020000
[ 0.381209] pci 0000:00:02.0: reg 0x10: [io 0x6400-0x64ff] //根据上述virtio_pci__init的设置,可知这是virtio-pci设备z的bar0空间
[ 0.381736] pci 0000:00:02.0: reg 0x14: [mem 0xd2001000-0xd20010ff] //根据上述virtio_pci__init的设置,可知这是virtio-pci设备z的bar1空间
[ 0.382322] pci 0000:00:02.0: reg 0x18: [mem 0xd2001400-0xd20017ff]
[ 0.387756] pci_bus 0000:00: busn_res: [bus 00-ff] end is updated to 00
[ 0.413901] pci_bus 0000:00: resource 4 [io 0x0000-0xffff]
[ 0.414399] pci_bus 0000:00: resource 5 [mem 0x00000000-0x7fffffffff]
[ 0.433793] virtio-pci 0000:00:00.0: virtio_pci: leaving for legacy driver
[ 0.434506] virtio-pci 0000:00:01.0: virtio_pci: leaving for legacy driver
[ 0.435215] virtio-pci 0000:00:02.0: virtio_pci: leaving for legacy driver
References:
1, virtio spec
2, 老男孩读PCIe之六:配置和地址空间
|