内核热插拔驱动
PCI设备在BIOS启动阶段枚举,之后传给linux内核,由linux内核将每个PCI设备注册到pci_bus_type总线上,进而匹配pci驱动,如下图所示。 PCI桥设备也会匹配pci驱动,这个驱动名称为pcieport,该驱动在portdrv_pci.c文件中注册,来看一下它的匹配规则,如下所示。
/*
 * pcie_portdrv_init() - module init for the PCIe port bus driver.
 *
 * Bails out with -EACCES when PCIe port services are disabled; otherwise
 * initialises the port services, applies the DMI quirk table and registers
 * the "pcieport" PCI driver.
 *
 * Return: 0 on success, or a negative errno (from pci_register_driver()
 * or -EACCES).
 */
static int __init pcie_portdrv_init(void)
{
	if (pcie_ports_disabled)
		return -EACCES;

	pcie_init_services();
	dmi_check_system(pcie_portdrv_dmi_table);

	return pci_register_driver(&pcie_portdriver);
}
device_initcall(pcie_portdrv_init);
static struct pci_driver pcie_portdriver = {
.name = "pcieport",
.id_table = &port_pci_ids[0],
.probe = pcie_portdrv_probe,
.remove = pcie_portdrv_remove,
.shutdown = pcie_portdrv_remove,
.err_handler = &pcie_portdrv_err_handler,
.driver.pm = PCIE_PORTDRV_PM_OPS,
};
/*
 * Match table for the pcieport driver: PCI-to-PCI bridge class codes with a
 * low byte of 0x00 or 0x01, compared under a full (~0) class mask.
 */
static const struct pci_device_id port_pci_ids[] = {
	/* bridge class, low byte 0x00 */
	{ PCI_DEVICE_CLASS((PCI_CLASS_BRIDGE_PCI << 8) | 0x00, ~0) },
	/* bridge class, low byte 0x01 */
	{ PCI_DEVICE_CLASS((PCI_CLASS_BRIDGE_PCI << 8) | 0x01, ~0) },
	/* terminating empty entry */
	{ },
};
/*
 * PCI_DEVICE_CLASS - designated-initializer fragment for a pci_device_id
 * entry that matches on class code only.
 * @dev_class: class value to match
 * @dev_class_mask: mask applied to the class value
 *
 * Vendor, device, subvendor and subdevice are all set to PCI_ANY_ID.
 */
#define PCI_DEVICE_CLASS(dev_class, dev_class_mask) \
	.vendor = PCI_ANY_ID, .device = PCI_ANY_ID, \
	.subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, \
	.class = (dev_class), .class_mask = (dev_class_mask)
pcieport驱动会匹配所有的桥设备,然后执行驱动的probe函数,执行过程如下。
----->probe
pcie_portdrv_probe
pcie_port_device_register
get_port_device_capability
if (dev->is_hotplug_bridge &&
(pcie_ports_native || host->native_pcie_hotplug)) {
services |= PCIE_PORT_SERVICE_HP;
pcie_init_service_irqs
pcie_init_service_irqs
pci_alloc_irq_vectors(dev, 1, PCIE_PORT_MAX_MSI_ENTRIES,PCI_IRQ_MSIX | PCI_IRQ_MSI);
pcie_device_init
device = &pcie->device;
device->bus = &pcie_port_bus_type;
device->release = release_pcie_device;
dev_set_name(device, "%s:pcie%03x",
pci_name(pdev),
get_descriptor_id(pci_pcie_type(pdev), service));
device->parent = &pdev->dev;
device_enable_async_suspend(device);
retval = device_register(device);
首先检查设备所支持的服务,对于支持的服务置上标志位,这里主要关心PCIE_PORT_SERVICE_HP服务;然后为每个服务申请中断号保存,供后续服务驱动使用;最后创建一个设备注册到pcie_port_bus_type总线上。
对于有PCIE_PORT_SERVICE_HP服务的设备,将会匹配pciehp驱动,该驱动在pciehp_core.c文件中注册,注册过程如下。
/*
 * pcie_hp_init() - register the pciehp port service driver.
 *
 * Logs the registration result at debug level, plus an extra debug line on
 * failure.
 *
 * Return: the value returned by pcie_port_service_register().
 */
int __init pcie_hp_init(void)
{
	int rc = pcie_port_service_register(&hpdriver_portdrv);

	pr_debug("pcie_port_service_register = %d\n", rc);
	if (rc != 0)
		pr_debug("Failure to register service\n");

	return rc;
}
int pcie_port_service_register(struct pcie_port_service_driver *new)
{
if (pcie_ports_disabled)
return -ENODEV;
new->driver.name = new->name;
new->driver.bus = &pcie_port_bus_type;
new->driver.probe = pcie_port_probe_service;
new->driver.remove = pcie_port_remove_service;
new->driver.shutdown = pcie_port_shutdown_service;
return driver_register(&new->driver);
}
/*
 * Port service driver descriptor for pciehp: binds to the hotplug service
 * (PCIE_PORT_SERVICE_HP) on any port type. Power-management callbacks are
 * only present when the corresponding config options are enabled.
 */
static struct pcie_port_service_driver hpdriver_portdrv = {
	.name		= "pciehp",
	.service	= PCIE_PORT_SERVICE_HP,
	.port_type	= PCIE_ANY_PORT,

	.probe		= pciehp_probe,
	.remove		= pciehp_remove,

#ifdef CONFIG_PM
#ifdef CONFIG_PM_SLEEP
	.suspend	= pciehp_suspend,
	.resume_noirq	= pciehp_resume_noirq,
	.resume		= pciehp_resume,
#endif /* CONFIG_PM_SLEEP */
	.runtime_suspend = pciehp_runtime_suspend,
	.runtime_resume	= pciehp_runtime_resume,
#endif /* CONFIG_PM */
};
pciehp驱动也会注册在pcie_port_bus_type总线上,那么这个总线的匹配规则是什么呢?
/*
 * The "pci_express" bus on which port service devices and port service
 * drivers are registered; pairing is decided by pcie_port_bus_match().
 */
struct bus_type pcie_port_bus_type = {
	.match	= pcie_port_bus_match,
	.name	= "pci_express",
};
/*
 * pcie_port_bus_match() - decide whether a service driver fits a service device.
 * @dev: candidate device
 * @drv: candidate driver
 *
 * Both must live on pcie_port_bus_type, their service values must be equal,
 * and the driver's port_type must be PCIE_ANY_PORT or equal to the PCIe type
 * of the device's port.
 *
 * Return: 1 on match, 0 otherwise.
 */
static int pcie_port_bus_match(struct device *dev, struct device_driver *drv)
{
	struct pcie_port_service_driver *svc_drv;
	struct pcie_device *svc_dev;

	if (dev->bus != &pcie_port_bus_type || drv->bus != &pcie_port_bus_type)
		return 0;

	svc_dev = to_pcie_device(dev);
	svc_drv = to_service_driver(drv);

	if (svc_drv->service != svc_dev->service)
		return 0;

	/* The port type is only looked up when the driver is type-specific. */
	return svc_drv->port_type == PCIE_ANY_PORT ||
	       svc_drv->port_type == pci_pcie_type(svc_dev->port);
}
匹配规则是服务类型相等,并且驱动的port_type为PCIE_ANY_PORT或与设备所在端口的类型一致。pciehp驱动与设备匹配成功后执行pciehp_probe函数,如下。
----->pciehp_probe
pcie_init
init_slot
ops->enable_slot = pciehp_sysfs_enable_slot;
ops->disable_slot = pciehp_sysfs_disable_slot;
ops->get_power_status = get_power_status;
ops->get_adapter_status = get_adapter_status;
ops->reset_slot = pciehp_reset_slot;
pcie_init_notification
pciehp_request_irq
kthread_run(&pciehp_poll, ctrl,"pciehp_poll-%s", slot_name(ctrl));
or
request_threaded_irq(irq, pciehp_isr, pciehp_ist, IRQF_SHARED, "pciehp", ctrl);
pci_hp_add
fs_add_slot
if (has_power_file(pci_slot)) {
retval = sysfs_create_file(&pci_slot->kobj,
&hotplug_slot_attr_power.attr);
if (retval)
goto exit_power;
}
if (has_attention_file(pci_slot)) {
retval = sysfs_create_file(&pci_slot->kobj,
&hotplug_slot_attr_attention.attr);
if (retval)
goto exit_attention;
}
先初始化槽位真正的操作函数,然后创建内核查询槽位状态的线程或注册中断,最后注册sysfs属性给用户使用。
综上,整个过程如下图所示。
触发方法
触发方法有两种,分别为中断方式和POLL方式。
中断方式
硬中断
当SLOT上报中断给host时,将调用中断号对应的上半部中断处理函数pciehp_isr,由该函数检查状态确认有中断事件产生,然后将唤醒pciehp_ist中断处理线程执行真正处理动作。
pciehp_isr中断的上半部处理函数如下。
/*
 * pciehp_isr() - hard-IRQ (top half) handler for PCIe hotplug events.
 * @irq: interrupt number; masked via disable_irq_nosync() when handling is
 *       deferred to the IRQ thread
 * @dev_id: the struct controller this interrupt was requested for
 *
 * Reads the Slot Status register, writes the observed event bits back to it
 * and accumulates them in ctrl->pending_events for pciehp_ist().
 *
 * Return: IRQ_NONE when there is nothing to handle, IRQ_HANDLED when the
 * event was fully dealt with here, IRQ_WAKE_THREAD when pciehp_ist() has to
 * continue the work.
 */
static irqreturn_t pciehp_isr(int irq, void *dev_id)
{
struct controller *ctrl = (struct controller *)dev_id;
struct pci_dev *pdev = ctrl_dev(ctrl);
struct device *parent = pdev->dev.parent;
u16 status, events = 0;
/*
 * Not ours if the port is in D3cold, or if the hotplug interrupt enable
 * bit is clear in ctrl->slot_ctrl and we are not in poll mode.
 */
if (pdev->current_state == PCI_D3cold ||
(!(ctrl->slot_ctrl & PCI_EXP_SLTCTL_HPIE) && !pciehp_poll_mode))
return IRQ_NONE;
/*
 * Take a runtime-PM reference on the parent without resuming it. If the
 * parent is not active, drop the reference again, mask this IRQ and set
 * RERUN_ISR so that pciehp_ist() re-invokes this handler and unmasks
 * the IRQ.
 */
if (parent) {
pm_runtime_get_noresume(parent);
if (!pm_runtime_active(parent)) {
pm_runtime_put(parent);
disable_irq_nosync(irq);
atomic_or(RERUN_ISR, &ctrl->pending_events);
return IRQ_WAKE_THREAD;
}
}
read_status:
pcie_capability_read_word(pdev, PCI_EXP_SLTSTA, &status);
/* An all-ones value is treated as "device did not respond". */
if (status == (u16) ~0) {
ctrl_info(ctrl, "%s: no response from device\n", __func__);
if (parent)
pm_runtime_put(parent);
return IRQ_NONE;
}
/* Keep only the event bits this driver handles. */
status &= PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD |
PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_CC |
PCI_EXP_SLTSTA_DLLSC;
/* A power fault that has already been recorded is not reported again. */
if (ctrl->power_fault_detected)
status &= ~PCI_EXP_SLTSTA_PFD;
/* events accumulates across iterations of the read_status loop. */
events |= status;
if (!events) {
if (parent)
pm_runtime_put(parent);
return IRQ_NONE;
}
/*
 * Write the collected event bits back to Slot Status
 * (NOTE(review): presumably write-1-to-clear per the PCIe spec - confirm).
 * With MSI enabled and not polling, re-read the register to pick up bits
 * that were set between the read and the write.
 */
if (status) {
pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, events);
if (pci_dev_msi_enabled(pdev) && !pciehp_poll_mode)
goto read_status;
}
ctrl_dbg(ctrl, "pending interrupts %#06x from Slot Status\n", events);
if (parent)
pm_runtime_put(parent);
/*
 * Command Completed: clear cmd_busy and wake waiters on ctrl->queue.
 * If that was the only event there is nothing left for the IRQ thread.
 */
if (events & PCI_EXP_SLTSTA_CC) {
ctrl->cmd_busy = 0;
smp_mb();
wake_up(&ctrl->queue);
if (events == PCI_EXP_SLTSTA_CC)
return IRQ_HANDLED;
events &= ~PCI_EXP_SLTSTA_CC;
}
/* The device asked for hotplug events to be ignored: drop them. */
if (pdev->ignore_hotplug) {
ctrl_dbg(ctrl, "ignoring hotplug event %#06x\n", events);
return IRQ_HANDLED;
}
/* Hand the remaining events over to the IRQ thread. */
atomic_or(events, &ctrl->pending_events);
return IRQ_WAKE_THREAD;
}
假设SLOT上报了中断,并置位了SLOT状态寄存器的PCI_EXP_SLTSTA_DLLSC位(链路状态已变化),则将唤醒pciehp_ist中断线程(中断下半部),代码如下。
/*
 * pciehp_ist() - threaded (bottom half) handler for PCIe hotplug events.
 * @irq: interrupt number
 * @dev_id: the struct controller this interrupt was requested for
 *
 * Consumes ctrl->pending_events and dispatches to the button-press,
 * power-fault, disable-request and presence/link-change handlers.
 *
 * Return: IRQ_HANDLED when events were processed, IRQ_NONE when no events
 * were pending, or the result of the re-run pciehp_isr() when that result is
 * not IRQ_WAKE_THREAD.
 */
static irqreturn_t pciehp_ist(int irq, void *dev_id)
{
struct controller *ctrl = (struct controller *)dev_id;
struct pci_dev *pdev = ctrl_dev(ctrl);
irqreturn_t ret;
u32 events;
/* ist_running is part of the wait condition of the sysfs enable/disable paths. */
ctrl->ist_running = true;
pci_config_pm_runtime_get(pdev);
/*
 * If pciehp_isr() deferred itself (RERUN_ISR), clear the flag, run the
 * handler now and re-enable the IRQ it disabled.
 */
if (atomic_fetch_and(~RERUN_ISR, &ctrl->pending_events) & RERUN_ISR) {
ret = pciehp_isr(irq, dev_id);
enable_irq(irq);
if (ret != IRQ_WAKE_THREAD)
goto out;
}
/*
 * Wait for a hard-IRQ handler that may still be running, then atomically
 * fetch and clear the pending events.
 */
synchronize_hardirq(irq);
events = atomic_xchg(&ctrl->pending_events, 0);
if (!events) {
ret = IRQ_NONE;
goto out;
}
if (events & PCI_EXP_SLTSTA_ABP) {
ctrl_info(ctrl, "Slot(%s): Attention button pressed\n",
slot_name(ctrl));
pciehp_handle_button_press(ctrl);
}
/* Record a power fault only once and update the slot indicators. */
if ((events & PCI_EXP_SLTSTA_PFD) && !ctrl->power_fault_detected) {
ctrl->power_fault_detected = 1;
ctrl_err(ctrl, "Slot(%s): Power fault\n", slot_name(ctrl));
pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF,
PCI_EXP_SLTCTL_ATTN_IND_ON);
}
/*
 * reset_lock is held for reading while the slot is brought down or up.
 * A disable request takes precedence over presence/link change events.
 */
down_read(&ctrl->reset_lock);
if (events & DISABLE_SLOT)
pciehp_handle_disable_request(ctrl);
else if (events & (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC))
pciehp_handle_presence_or_link_change(ctrl, events);
up_read(&ctrl->reset_lock);
ret = IRQ_HANDLED;
out:
/* Drop the PM reference, clear ist_running and wake waiters on ctrl->requester. */
pci_config_pm_runtime_put(pdev);
ctrl->ist_running = false;
wake_up(&ctrl->requester);
return ret;
}
上面假设产生的是PCI_EXP_SLTSTA_DLLSC事件,因此执行pciehp_handle_presence_or_link_change函数,代码如下。
/*
 * pciehp_handle_presence_or_link_change() - react to a presence-detect or
 * data-link-layer state change.
 * @ctrl: hotplug controller of the slot
 * @events: event bits (PCI_EXP_SLTSTA_PDC and/or PCI_EXP_SLTSTA_DLLSC)
 *
 * Phase 1: if the slot is on (or blinking before power-off), bring it down
 * as a surprise removal.
 * Phase 2: if a card is present or the link is active and the slot is off
 * (or blinking before power-on), bring it up.
 * ctrl->state_lock is released before the disable/enable calls are made.
 */
void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
{
int present, link_active;
/* Phase 1: the slot is currently occupied - take it down. */
mutex_lock(&ctrl->state_lock);
switch (ctrl->state) {
case BLINKINGOFF_STATE:
/* A pending button-triggered action is cancelled. */
cancel_delayed_work(&ctrl->button_work);
fallthrough;
case ON_STATE:
ctrl->state = POWEROFF_STATE;
mutex_unlock(&ctrl->state_lock);
if (events & PCI_EXP_SLTSTA_DLLSC)
ctrl_info(ctrl, "Slot(%s): Link Down\n",
slot_name(ctrl));
if (events & PCI_EXP_SLTSTA_PDC)
ctrl_info(ctrl, "Slot(%s): Card not present\n",
slot_name(ctrl));
pciehp_disable_slot(ctrl, SURPRISE_REMOVAL);
break;
default:
mutex_unlock(&ctrl->state_lock);
break;
}
/* Phase 2: re-check occupancy; stop if no card is present and the link is down. */
mutex_lock(&ctrl->state_lock);
present = pciehp_card_present(ctrl);
link_active = pciehp_check_link_active(ctrl);
if (present <= 0 && link_active <= 0) {
mutex_unlock(&ctrl->state_lock);
return;
}
switch (ctrl->state) {
case BLINKINGON_STATE:
cancel_delayed_work(&ctrl->button_work);
fallthrough;
case OFF_STATE:
ctrl->state = POWERON_STATE;
mutex_unlock(&ctrl->state_lock);
if (present)
ctrl_info(ctrl, "Slot(%s): Card present\n",
slot_name(ctrl));
if (link_active)
ctrl_info(ctrl, "Slot(%s): Link Up\n",
slot_name(ctrl));
/* The result is stored for the sysfs enable path, which returns it to the user. */
ctrl->request_result = pciehp_enable_slot(ctrl);
break;
default:
mutex_unlock(&ctrl->state_lock);
break;
}
}
pciehp_enable_slot
__pciehp_enable_slot
board_added
pciehp_power_on_slot(ctrl);
pciehp_check_link_status(ctrl);
pciehp_configure_device(ctrl);
pci_scan_slot(parent, PCI_DEVFN(0, 0));
若SLOT之前是打开状态,则中断函数中会关闭槽位,移除资源。若SLOT之前是关闭状态,则中断函数中会打开槽位,重新枚举设备分配资源并注册到总线。
注意:SLOT单独上报一个PCI_EXP_SLTSTA_DLLSC事件是无效的,必须要和链路状态结合使用,比如,置位PCI_EXP_LNKSTA_DLLLA和置位PCI_EXP_SLTSTA_DLLSC且槽位上一次状态为OFF_STATE,则SLOT上报中断后将热插pcie设备。
硬中断触发流程: 硬件(pcie switch或slot)改变slot status、link status 状态,比如置位Data Link Layer State Changed 位和清除Data Link Layer Link Active位。硬件(pcie switch或slot)上报中断给hot-plug system driver,由hot-plug system driver读取slot status、link status 状态确认事件,如上面两位变化(热拔),则停用slot上所插pcie设备的驱动,然后移除设备和资源。
软中断
与热插拔相关的属性是power,操作函数如下。
/*
 * sysfs "power" attribute of a hotplug slot: readable by everyone, writable
 * by the owner. Reads go to power_read_file(), writes to power_write_file().
 */
static struct pci_slot_attribute hotplug_slot_attr_power = {
	.attr = {
		.name = "power",
		.mode = S_IFREG | S_IRUGO | S_IWUSR,
	},
	.show = power_read_file,
	.store = power_write_file,
};
/*
 * power_write_file() - store handler of the slot's "power" sysfs attribute.
 * @pci_slot: slot whose attribute is written
 * @buf: user-supplied text, parsed as a decimal number
 * @count: number of bytes written
 *
 * The parsed value is truncated to its low 8 bits; 0 calls the slot's
 * disable_slot op, 1 calls its enable_slot op (each only if the op is set),
 * anything else is rejected.
 *
 * Return: @count on success, -ENODEV if the owning module cannot be pinned,
 * -EINVAL for an unsupported value, or the error returned by the slot op.
 */
static ssize_t power_write_file(struct pci_slot *pci_slot, const char *buf,
				size_t count)
{
	struct hotplug_slot *slot = pci_slot->hotplug;
	int retval = 0;
	u8 power;

	power = (u8)(simple_strtoul(buf, NULL, 10) & 0xff);
	dbg("power = %d\n", power);

	/* Keep the owning module loaded while one of its ops runs. */
	if (!try_module_get(slot->owner))
		return -ENODEV;

	if (power == 0) {
		if (slot->ops->disable_slot)
			retval = slot->ops->disable_slot(slot);
	} else if (power == 1) {
		if (slot->ops->enable_slot)
			retval = slot->ops->enable_slot(slot);
	} else {
		err("Illegal value specified for power\n");
		retval = -EINVAL;
	}

	module_put(slot->owner);

	if (retval)
		return retval;

	return count;
}
/*
 * pciehp_sysfs_enable_slot() - enable a slot on behalf of a sysfs request.
 * @hotplug_slot: slot to enable
 *
 * When the slot is off (or blinking before power-on), posts a
 * PCI_EXP_SLTSTA_PDC event via pciehp_request() and waits until no events
 * are pending and the IRQ thread is no longer running.
 *
 * Return: ctrl->request_result after the wait, or -ENODEV if the slot is in
 * any other state.
 */
int pciehp_sysfs_enable_slot(struct hotplug_slot *hotplug_slot)
{
struct controller *ctrl = to_ctrl(hotplug_slot);
mutex_lock(&ctrl->state_lock);
switch (ctrl->state) {
case BLINKINGON_STATE:
case OFF_STATE:
/* state_lock is released before the request is posted and waited for. */
mutex_unlock(&ctrl->state_lock);
/* Default result, overwritten when the enable path stores its own. */
ctrl->request_result = -ENODEV;
pciehp_request(ctrl, PCI_EXP_SLTSTA_PDC);
wait_event(ctrl->requester,
!atomic_read(&ctrl->pending_events) &&
!ctrl->ist_running);
return ctrl->request_result;
case POWERON_STATE:
ctrl_info(ctrl, "Slot(%s): Already in powering on state\n",
slot_name(ctrl));
break;
case BLINKINGOFF_STATE:
case ON_STATE:
case POWEROFF_STATE:
ctrl_info(ctrl, "Slot(%s): Already enabled\n",
slot_name(ctrl));
break;
default:
ctrl_err(ctrl, "Slot(%s): Invalid state %#x\n",
slot_name(ctrl), ctrl->state);
break;
}
/* Every arm that breaks out of the switch still holds state_lock. */
mutex_unlock(&ctrl->state_lock);
return -ENODEV;
}
/*
 * pciehp_sysfs_disable_slot() - disable a slot on behalf of a sysfs request.
 * @hotplug_slot: slot to disable
 *
 * When the slot is on (or blinking before power-off), posts a DISABLE_SLOT
 * event via pciehp_request() and waits until no events are pending and the
 * IRQ thread is no longer running.
 *
 * Return: ctrl->request_result after the wait, or -ENODEV if the slot is in
 * any other state.
 */
int pciehp_sysfs_disable_slot(struct hotplug_slot *hotplug_slot)
{
struct controller *ctrl = to_ctrl(hotplug_slot);
mutex_lock(&ctrl->state_lock);
switch (ctrl->state) {
case BLINKINGOFF_STATE:
case ON_STATE:
/* state_lock is released before the request is posted and waited for. */
mutex_unlock(&ctrl->state_lock);
pciehp_request(ctrl, DISABLE_SLOT);
wait_event(ctrl->requester,
!atomic_read(&ctrl->pending_events) &&
!ctrl->ist_running);
return ctrl->request_result;
case POWEROFF_STATE:
ctrl_info(ctrl, "Slot(%s): Already in powering off state\n",
slot_name(ctrl));
break;
case BLINKINGON_STATE:
case OFF_STATE:
case POWERON_STATE:
ctrl_info(ctrl, "Slot(%s): Already disabled\n",
slot_name(ctrl));
break;
default:
ctrl_err(ctrl, "Slot(%s): Invalid state %#x\n",
slot_name(ctrl), ctrl->state);
break;
}
/* Every arm that breaks out of the switch still holds state_lock. */
mutex_unlock(&ctrl->state_lock);
return -ENODEV;
}
/*
 * pciehp_request() - post a synthetic event for the hotplug IRQ thread.
 * @ctrl: hotplug controller
 * @action: event bit(s) to OR into ctrl->pending_events
 *
 * Outside poll mode the IRQ thread is woken explicitly; in poll mode no
 * wake-up is issued here.
 */
void pciehp_request(struct controller *ctrl, int action)
{
	atomic_or(action, &ctrl->pending_events);

	if (pciehp_poll_mode)
		return;

	irq_wake_thread(ctrl->pcie->irq, ctrl);
}
即用户可以在host主机使用以下方法
echo 0 > /sys/bus/pci/slots/<SLOT_NUM>/power //通知linux内核需要热移除SLOT槽位下的pcie设备
echo 1 > /sys/bus/pci/slots/<SLOT_NUM>/power //通知linux内核需要热添加SLOT槽位下的pcie设备
用户操作后将唤醒pciehp驱动的中断线程(下半部中断),即pciehp_ist。
在pciehp_ist中断线程中查询ctrl->pending_events标志(上面置位了DISABLE_SLOT或PCI_EXP_SLTSTA_PDC),根据标志执行处理函数。
if (events & DISABLE_SLOT)
pciehp_handle_disable_request(ctrl);
else if (events & (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC))
pciehp_handle_presence_or_link_change(ctrl, events);
对于DISABLE_SLOT标志(echo 0 > power),执行pciehp_handle_disable_request函数,过程如下。
pciehp_handle_disable_request(struct controller *ctrl)
pciehp_disable_slot(ctrl, SAFE_REMOVAL);
pciehp_disable_slot
remove_board
/*
 * remove_board() - tear down the devices in a slot and power it off.
 * @ctrl: hotplug controller of the slot
 * @safe_removal: forwarded to pciehp_unconfigure_device()
 *
 * On slots with power control the slot is powered off, followed by a one
 * second sleep, after which any link/presence change events that accumulated
 * in ctrl->pending_events are discarded. Finally the power indicator is
 * switched off.
 */
static void remove_board(struct controller *ctrl, bool safe_removal)
{
	const int stale_events = PCI_EXP_SLTSTA_DLLSC | PCI_EXP_SLTSTA_PDC;

	pciehp_unconfigure_device(ctrl, safe_removal);

	if (POWER_CTRL(ctrl)) {
		pciehp_power_off_slot(ctrl);
		msleep(1000);

		/* Drop link/presence events recorded up to this point. */
		atomic_and(~stale_events, &ctrl->pending_events);
	}

	pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF,
			      INDICATOR_NOOP);
}
/*
 * pciehp_unconfigure_device() - remove all PCI devices below a hotplug port.
 * @ctrl: hotplug controller of the slot
 * @presence: true if the card is still present (safe removal), false if it
 *            is already gone
 *
 * Operates on the port's subordinate bus, under the PCI rescan/remove lock.
 */
void pciehp_unconfigure_device(struct controller *ctrl, bool presence)
{
struct pci_dev *dev, *temp;
struct pci_bus *parent = ctrl->pcie->port->subordinate;
u16 command;
ctrl_dbg(ctrl, "%s: domain:bus:dev = %04x:%02x:00\n",
__func__, pci_domain_nr(parent), parent->number);
/* Card already gone: run pci_dev_set_disconnected on every device below the port. */
if (!presence)
pci_walk_bus(parent, pci_dev_set_disconnected, NULL);
pci_lock_rescan_remove();
/*
 * Walk the bus's device list in reverse. A reference is held on each
 * device across its removal because dev is still used afterwards.
 */
list_for_each_entry_safe_reverse(dev, temp, &parent->devices,
bus_list) {
pci_dev_get(dev);
pci_stop_and_remove_bus_device(dev);
/*
 * Card still present: clear the bus-master and SERR bits and set the
 * INTx-disable bit in the device's command register.
 */
if (presence) {
pci_read_config_word(dev, PCI_COMMAND, &command);
command &= ~(PCI_COMMAND_MASTER | PCI_COMMAND_SERR);
command |= PCI_COMMAND_INTX_DISABLE;
pci_write_config_word(dev, PCI_COMMAND, command);
}
pci_dev_put(dev);
}
pci_unlock_rescan_remove();
}
/*
 * pci_stop_and_remove_bus_device() - stop a PCI device, then remove it.
 * @dev: device to take down
 *
 * Calls pci_stop_bus_device() followed by pci_remove_bus_device().
 * NOTE(review): locking requirements are not visible here; the caller shown
 * in this article holds the rescan/remove lock - confirm for other callers.
 */
void pci_stop_and_remove_bus_device(struct pci_dev *dev)
{
pci_stop_bus_device(dev);
pci_remove_bus_device(dev);
}
这样就将SLOT下的所有设备移除了。
对于PCI_EXP_SLTSTA_PDC标志(echo 1 > power),执行pciehp_handle_presence_or_link_change函数,若SLOT是关闭状态,则重新打开SLOT,给SLOT上电使能,然后枚举SLOT下所有pci设备,给设备分配资源并注册到总线上。
触发流程: host用户层写指定slot的power属性,将触发软中断,比如echo 0 > power。触发软中断将强制唤醒hot-plug system driver下半部中断(pciehp_ist),并传入关闭slot事件码(DISABLE_SLOT)。pciehp_ist函数中根据事件码执行相应动作,比如DISABLE_SLOT事件码将停用slot上所插pcie设备的驱动,然后移除设备资源,最后关闭slot电源。
POLL方式
内核也提供POLL方式不断检查SLOT的状态,但这个功能需要将模块参数pciehp_poll_mode置位,POLL函数为pciehp_poll,代码如下。
/*
 * pciehp_poll() - kthread body used instead of an interrupt in poll mode.
 * @data: the struct controller to poll
 *
 * After an initial 10 second idle sleep, repeatedly runs pciehp_isr() and,
 * as long as it requests the thread or events are pending, pciehp_ist().
 * Between rounds it sleeps pciehp_poll_time seconds; a value outside 1..60
 * is reset to 2.
 *
 * Return: 0 once the kthread is asked to stop.
 */
static int pciehp_poll(void *data)
{
	struct controller *ctrl = data;

	schedule_timeout_idle(10 * HZ);

	while (!kthread_should_stop()) {
		/* Drain hardware events and user requests. */
		for (;;) {
			if (pciehp_isr(IRQ_NOTCONNECTED, ctrl) != IRQ_WAKE_THREAD &&
			    !atomic_read(&ctrl->pending_events))
				break;

			pciehp_ist(IRQ_NOTCONNECTED, ctrl);
		}

		/* Clamp an out-of-range poll interval to the 2 second default. */
		if (!(pciehp_poll_time > 0 && pciehp_poll_time <= 60))
			pciehp_poll_time = 2;

		schedule_timeout_idle(pciehp_poll_time * HZ);
	}

	return 0;
}
默认情况,每2s调用一下pciehp驱动的中断上半部函数pciehp_isr检查SLOT状态变化,若有事件产生,则调用pciehp_ist函数执行对应动作,例如slot置位Data Link Layer State Changed 位和清除Data Link Layer Link Active位,将热移除slot上所插pcie设备。
**注意:**热插拔技术实现需要给slot槽位预留出足够的资源(bus号、memory空间、I/O空间)!!!
可选方案:
- 对于主板上slot槽位,由BIOS提前预留足够资源。
- 对于PCIe switch的port口,由switch预留足够资源。
- 如果自己测试热插拔,请将pcie设备先插入slot上带电启动,由BIOS分配足够资源后进行热插拔测试。
|