AER及linux內核驅動簡介:https://blog.csdn.net/u010443710/article/details/104649179
1. AER 中斷
首先AER驅動作爲錯誤上報和處理的機制,必須有一個錯誤上報的入口。
這個入口就是AER中斷。linux的AER驅動只針對RC,也就是說軟件需要處理RC的AER中斷請求。
並在中斷處理函數中通過AER 寄存器來判斷錯誤類型並作出相應處理。
1.1 AER中斷產生
在PCIe spec中定義了2種AER中斷產生方式,類似於ep設備,可以選擇legacy的INTx或者MSI/MSIx的方式來產生中斷。
但對於RC而言,無論是INTx還是MSI/MSIx,都不需要像ep那樣真的來觸發INTx邊帶信號或發送MSI tlp來告知RC。
因爲RC作爲根節點,內部的中斷就是報給自己,可以直接在chip內部處理,不需要在PCIe協議上走一圈。
這就涉及RC內部中斷上報的機制,由於RC內部中斷不僅限於AER中斷,所以這部分單獨開一篇進行闡述。
1.2 如何使能AER中斷?
AER Capability -> Root Error Command Register (Offset 2Ch)
打開相應的報告使能bit位,當錯誤發生後,就會有中斷產生。
2. AER驅動
AER驅動與PME、pciehp、pcie-dpc一樣是作爲pcie port的可選service。
service掛載在pcie port驅動上,由portdrv_core進行管理,service通過pcie_port_service_register進行註冊。
具體來說,service的數據結構如下:
/*
 * Descriptor for one optional service (e.g. AER) that attaches to a PCIe
 * port. It bundles the service's identity, its lifecycle callbacks and its
 * error-recovery hooks together with the embedded generic driver object.
 */
struct pcie_port_service_driver {
	const char *name;

	/* Lifecycle callbacks, each invoked on the service's pcie_device. */
	int (*probe)(struct pcie_device *dev);
	void (*remove)(struct pcie_device *dev);
	int (*suspend)(struct pcie_device *dev);
	int (*resume)(struct pcie_device *dev);

	/* Service error recovery handler. */
	const struct pci_error_handlers *err_handler;

	/* Link reset capability; specific to the AER service driver. */
	pci_ers_result_t (*reset_link)(struct pci_dev *dev);

	int port_type;	/* type of port this driver can handle */
	u32 service;	/* port service this device represents */

	struct device_driver driver;
};
以下是AER的service結構
/*
 * AER port service descriptor: binds the AER service to Root Ports and
 * wires up its probe/remove callbacks, error handlers and link reset hook.
 */
static struct pcie_port_service_driver aerdriver = {
	/* Identity: which service this is and which port type it serves. */
	.name		= "aer",
	.service	= PCIE_PORT_SERVICE_AER,
	.port_type	= PCI_EXP_TYPE_ROOT_PORT,

	/* Lifecycle callbacks. */
	.probe		= aer_probe,
	.remove		= aer_remove,

	/* Error recovery hooks. */
	.err_handler	= &aer_error_handlers,
	.reset_link	= aer_root_reset,
};
2.1 初始化(aer_probe)
AER初始化主要完成2件事情:
- 爲錯誤處理入口aer_irq,申請中斷;request_irq(dev->irq, aer_irq, IRQF_SHARED, "aerdrv", dev);
- 配置AER功能相關的cap寄存器,打開AER使能、中斷上報使能等;aer_enable_rootport(rpc);
/**
* aer_probe - initialize resources
* @dev: pointer to the pcie_dev data structure
*
* Invoked when PCI Express bus loads AER service driver.
*/
static int aer_probe(struct pcie_device *dev)
{
int status;
struct aer_rpc *rpc;
struct device *device = &dev->device;
/* Alloc rpc data structure */
rpc = aer_alloc_rpc(dev);
if (!rpc) {
dev_printk(KERN_DEBUG, device, "alloc rpc failed\n");
aer_remove(dev);
return -ENOMEM;
}
/* Request IRQ ISR */
status = request_irq(dev->irq, aer_irq, IRQF_SHARED, "aerdrv", dev);
if (status) {
dev_printk(KERN_DEBUG, device, "request IRQ failed\n");
aer_remove(dev);
return status;
}
rpc->isr = 1;
aer_enable_rootport(rpc);
return status;
}
這裏的中斷向量dev->irq,是pcie port驅動初始化時已經申請好的,可能是legacy的,也可能是MSI/MSIx的,AER只需要再註冊一個share中斷上去。
看一下aer_enable_rootport,先清除了Root Port自身的各個錯誤狀態位,再把Root Port及其下游設備的錯誤上報全部使能,最後打開Root Port的錯誤中斷。
/**
 * aer_enable_rootport - enable Root Port's interrupts when receiving messages
 * @rpc: pointer to a Root Port data structure
 *
 * Invoked when PCIe bus loads AER service driver.
 *
 * Sequence: clear stale status on the root port, disable system error
 * generation for error messages, enable error reporting on the root port
 * and the devices below it, and finally set the interrupt-enable bits in
 * the Root Error Command register.
 */
static void aer_enable_rootport(struct aer_rpc *rpc)
{
	struct pci_dev *pdev = rpc->rpd->port;
	int aer_pos;
	u16 reg16;
	u32 reg32;

	/*
	 * Clear PCIe Capability's Device Status by writing back the value
	 * that was just read.
	 */
	pcie_capability_read_word(pdev, PCI_EXP_DEVSTA, &reg16);
	pcie_capability_write_word(pdev, PCI_EXP_DEVSTA, reg16);

	/* Disable system error generation in response to error messages */
	pcie_capability_clear_word(pdev, PCI_EXP_RTCTL,
				   SYSTEM_ERROR_INTR_ON_MESG_MASK);

	aer_pos = pdev->aer_cap;

	/* Clear error status: root, correctable, then uncorrectable */
	pci_read_config_dword(pdev, aer_pos + PCI_ERR_ROOT_STATUS, &reg32);
	pci_write_config_dword(pdev, aer_pos + PCI_ERR_ROOT_STATUS, reg32);
	pci_read_config_dword(pdev, aer_pos + PCI_ERR_COR_STATUS, &reg32);
	pci_write_config_dword(pdev, aer_pos + PCI_ERR_COR_STATUS, reg32);
	pci_read_config_dword(pdev, aer_pos + PCI_ERR_UNCOR_STATUS, &reg32);
	pci_write_config_dword(pdev, aer_pos + PCI_ERR_UNCOR_STATUS, reg32);

	/*
	 * Enable error reporting for the root port device and downstream port
	 * devices.
	 */
	set_downstream_devices_error_reporting(pdev, true);

	/* Enable Root Port's interrupt in response to error messages */
	pci_read_config_dword(pdev, aer_pos + PCI_ERR_ROOT_COMMAND, &reg32);
	reg32 |= ROOT_PORT_INTR_ON_MESG_MASK;
	pci_write_config_dword(pdev, aer_pos + PCI_ERR_ROOT_COMMAND, reg32);
}
2.2 中斷處理
AER中斷分爲上下半部,上半部aer_irq:
/**
 * aer_irq - top-half handler for the root port's AER interrupt
 * @irq: interrupt number (unused here)
 * @context: the pcie_device passed to request_irq()
 *
 * Reads the Root Error Status register. If neither a correctable nor an
 * uncorrectable error has been received, returns IRQ_NONE. Otherwise it
 * reads the error source ID, clears the status, queues the
 * (status, id) pair in rpc->e_sources and schedules the bottom half.
 * When the queue is full the error is dropped and no work is scheduled.
 */
irqreturn_t aer_irq(int irq, void *context)
{
	struct pcie_device *svc_dev = (struct pcie_device *)context;
	struct aer_rpc *rpc = get_service_data(svc_dev);
	int aer_off = svc_dev->port->aer_cap;
	unsigned int root_status, source_id;
	unsigned long irq_flags;
	irqreturn_t ret = IRQ_HANDLED;
	int queued = 0;
	int next_slot;

	/*
	 * Root Error Status, Root Error ID and the producer/consumer
	 * indices must all be accessed under the lock.
	 */
	spin_lock_irqsave(&rpc->e_lock, irq_flags);

	pci_read_config_dword(svc_dev->port, aer_off + PCI_ERR_ROOT_STATUS,
			      &root_status);
	if ((root_status & (PCI_ERR_ROOT_UNCOR_RCV | PCI_ERR_ROOT_COR_RCV)) == 0) {
		/* No error received on this port. */
		ret = IRQ_NONE;
		goto out_unlock;
	}

	/* Latch the error source, then clear the status that was read. */
	pci_read_config_dword(svc_dev->port, aer_off + PCI_ERR_ROOT_ERR_SRC,
			      &source_id);
	pci_write_config_dword(svc_dev->port, aer_off + PCI_ERR_ROOT_STATUS,
			       root_status);

	/* Advance the producer index, wrapping at the end of the array. */
	next_slot = rpc->prod_idx + 1;
	if (next_slot == AER_ERROR_SOURCES_MAX)
		next_slot = 0;

	/*
	 * Error storm: the queue is full, possibly with the same error
	 * repeating. Drop this one.
	 */
	if (next_slot == rpc->cons_idx)
		goto out_unlock;

	rpc->e_sources[rpc->prod_idx].status = root_status;
	rpc->e_sources[rpc->prod_idx].id = source_id;
	rpc->prod_idx = next_slot;
	queued = 1;

out_unlock:
	spin_unlock_irqrestore(&rpc->e_lock, irq_flags);

	/* Hand the queued error over to the deferred handler. */
	if (queued)
		schedule_work(&rpc->dpc_handler);

	return ret;
}
irq中首先讀取 Root Error Status Register,看看是不是有錯誤產生了。(PCI_ERR_ROOT_STATUS)
獲取錯誤源ID,Error Source Identification Register,PCI_ERR_ROOT_ERR_SRC
並保存在rpc->e_sources數組裏面。
中斷處理下半部 :aer_isr,這是一個worker
/**
* aer_isr - consume errors detected by root port
* @work: definition of this work item
*
* Invoked, as DPC, when root port records new detected error
*/
void aer_isr(struct work_struct *work)
{
struct aer_rpc *rpc = container_of(work, struct aer_rpc, dpc_handler);
struct pcie_device *p_device = rpc->rpd;
struct aer_err_source uninitialized_var(e_src);
mutex_lock(&rpc->rpc_mutex);
while (get_e_source(rpc, &e_src))
aer_isr_one_error(p_device, &e_src);
mutex_unlock(&rpc->rpc_mutex);
}
把剛纔上半部保存在rpc->e_sources裏面的錯誤源取出來,一個一個調用aer_isr_one_error進行處理
/**
 * aer_isr_one_error - handle one queued error source
 * @p_device: pointer to error root port service device
 * @e_src: pointer to an error source
 *
 * A single source entry may carry both a correctable and an uncorrectable
 * error. Each one present is decoded into rpc->e_info, reported, and, if
 * the originating device can be found, processed. The correctable error
 * is handled first.
 */
static void aer_isr_one_error(struct pcie_device *p_device,
			      struct aer_err_source *e_src)
{
	struct aer_rpc *rpc = get_service_data(p_device);
	struct aer_err_info *info = &rpc->e_info;
	const unsigned int root_status = e_src->status;

	/* Correctable error, if one was received. */
	if (root_status & PCI_ERR_ROOT_COR_RCV) {
		info->id = ERR_COR_ID(e_src->id);
		info->severity = AER_CORRECTABLE;
		info->multi_error_valid =
			(root_status & PCI_ERR_ROOT_MULTI_COR_RCV) ? 1 : 0;

		aer_print_port_info(p_device->port, info);

		if (find_source_device(p_device->port, info))
			aer_process_err_devices(p_device, info);
	}

	/* Uncorrectable error, classified as fatal or non-fatal. */
	if (root_status & PCI_ERR_ROOT_UNCOR_RCV) {
		info->id = ERR_UNCOR_ID(e_src->id);
		info->severity = (root_status & PCI_ERR_ROOT_FATAL_RCV) ?
				 AER_FATAL : AER_NONFATAL;
		info->multi_error_valid =
			(root_status & PCI_ERR_ROOT_MULTI_UNCOR_RCV) ? 1 : 0;

		aer_print_port_info(p_device->port, info);

		if (find_source_device(p_device->port, info))
			aer_process_err_devices(p_device, info);
	}
}
aer_isr_one_error就是處理錯誤的具體實現了,這裏按照AER的不同類型進行錯誤報告或者恢復處理。