內核中有若干個iommu的模塊,通過IOMMU_INIT組成一個依賴樹。detect_intel_iommu最先被調用,它被用來做sanity檢查並設置好下面兩個回調:
x86_init.iommu.iommu_init = intel_iommu_init;
x86_platform.iommu_shutdown = intel_iommu_shutdown;
intel_iommu_init
|-> dmar_table_init -> parse_dmar_table -> dmar_walk_dmar_table
|-> dmar_dev_scope_init
|-> dmar_acpi_dev_scope_init -> dmar_acpi_insert_dev_scope
|-> dmar_pci_bus_add_dev -> dmar_insert_dev_scope
|-> bus_register_notifier
|-> dmar_init_reserved_ranges
|-> init_no_remapping_devices
|-> init_dmars
|-> dma_ops = &intel_dma_ops
|-> init_iommu_pm_ops
|-> iommu_device_sysfs_add, iommu_device_set_ops, iommu_device_register
|-> bus_set_iommu(&pci_bus_type, &intel_iommu_ops)
|-> bus_register_notifier(&pci_bus_type, &iommu_bus_notifier)
intel_iommu_init是intel iommu初始化的主函數,主要做下面的事:
1.建立三個cache,iova,dmar_domain,device_domain_info。
2.遍歷dmar表下的5個子表,建立dmar_drhd_units,dmar_rmrr_units,dmar_atsr_units鏈表。設置drhd的node親和性。
3.把ANDD聲明的acpi namespace類型的設備添加到device scope數組中。數組位於dmar_drhd_units鏈表中。
4.遍歷整個pci設備樹並調用dmar_pci_bus_add_dev添加設備到dmar_drhd_units,dmar_rmrr_units,dmar_atsr_units鏈表的device scope數組。
5.註冊pci熱插拔回調函數dmar_pci_bus_notifier
6.初始化reserved_iova_list,裏面放的是IOAPIC的空間,所有pci設備的MMIO空間。(LAPIC的空間不用考慮嗎?如果iova建立到LAPIC空間是不是中斷都發不出去了?)
7.init_no_remapping_devices忽略沒有掛載任何設備的drhd;如果設置了igfx_off,還會忽略只有gfx設備的drhd。
8.init_dmars首先對每一個沒有被忽略的drhd初始化qi,分配domain_ids和domains指針表和root_entry頁,初始化si_domain(si_domain中一一映射了內存頁與usb,圖形設備的rmrr保留頁)。最後設置了dmar中斷
9.設置dma_ops
10.設置suspend/resume回調函數,iommu硬件在喚醒時需要恢復某些寄存器
11.建立sysfs接口下iommu的屬性文件
12.註冊pci總線的iommu調用函數,iommu_bus_init設置熱插拔設備時的回調函數(這個不同於5中的回調,5中的是dev scope的),並遍歷當前已有設備調用iommu_probe_device,iommu_probe_device調用剛纔註冊的pci總線下iommu回調函數ops->add_device(intel_iommu_add_device)。intel_iommu_add_device調用了iommu_group_get_for_dev,iommu_group_get_for_dev具體是調用ops->device_group建立iommu_group,並給iommu_group分配default_domain,初始group->domain也指向default_domain,最後調用iommu_group_add_device建立device與device_domain_info與dmar_domain與intel_iommu的綁定關係。
這個default_domain有兩種,passthrough的和dma的,通過命令行iommu=pt/nopt來設置。默認情況下是根據CONFIG_IOMMU_DEFAULT_PASSTHROUGH的配置來決定的。在passthrough模式下si_domain派上了用場,初始所有pci設備都會歸入si_domain除了那些有rmrr配置的設備(usb,graphic除外),rmrr配置的設備會從si_domain切換爲單獨的dma domain並且給當前設備rmrr指定的iova建立一一映射。對於dma模式初始每個iommu_group配置一個dma domain,然後碰到gfx設備,普通的pcie設備或根目錄下的pci設備會切換回si_domain,隔離性更強但是開銷也大。(不理解非根下的pci設備在pt模式不是si_domain的嗎,爲什麼在dma模式下就得是單獨的dma domain?)
13.probe_acpi_namespace_devices把acpi namespace設備也建立iommu_group。
14.最後使能每個drhd硬件的translation功能
這麼一堆初始化的用途在dma_alloc_coherent等一系列dma ops中體現出來。dma_alloc_coherent->intel_alloc_coherent->iommu_need_mapping,對於處於si_domain中的設備,如果設備dma支持的地址小於系統內存的地址範圍就需要切換到dma domain(這樣就不需要bounce buffer),否則的話就調用dma_direct_alloc直接分配物理內存,此時物理內存就是iova,用的還是si_domain。
這裏有好多容易混淆的結構:iommu_group,device,device_domain_info,dmar_domain,iommu_domain,以及dmar_drhd_unit,intel_iommu,iommu_device。
/*
 * IOMMU group: holds the list of member devices together with the group's
 * default_domain and domain pointers. Per the notes in this file, the group
 * is created through ops->device_group and is then given a default_domain.
 */
struct iommu_group {
struct kobject kobj;
struct kobject *devices_kobj;
struct list_head devices; /* list of group_device->list entries; group_device->dev points to the device */
struct mutex mutex;
struct blocking_notifier_head notifier;
void *iommu_data;
void (*iommu_data_release)(void *iommu_data);
char *name;
int id;
struct iommu_domain *default_domain; /* passthrough or dma domain, chosen via iommu=pt/nopt (per the notes in this file) */
struct iommu_domain *domain; /* initially points to default_domain as well (per the notes in this file) */
};
/*
 * NOTE(review): abbreviated sketch, not compilable C - only the two
 * IOMMU-related members are listed, by name and without their types.
 */
struct device {
iommu_group /* pointer to the iommu_group */
archdata.iommu /* pointer to the device_domain_info */
}
/*
 * PCI domain-device relationship: ties one device to the intel_iommu that
 * serves it and to the dmar_domain it currently belongs to.
 */
struct device_domain_info {
struct list_head link; /* link to domain siblings */ /* entry in the dmar_domain->devices list */
struct list_head global; /* link to global list */ /* entry in the global device_domain_list list */
struct list_head table; /* link to pasid table */
struct list_head auxiliary_domains; /* auxiliary domains
* attached to this device
*/
u8 bus; /* PCI bus number */
u8 devfn; /* PCI devfn number */
u16 pfsid; /* SRIOV physical function source ID */
u8 pasid_supported:3;
u8 pasid_enabled:1;
u8 pri_supported:1;
u8 pri_enabled:1;
u8 ats_supported:1;
u8 ats_enabled:1;
u8 auxd_enabled:1; /* Multiple domains per device */
u8 ats_qdep;
struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ /* pointer to the device */
struct intel_iommu *iommu; /* IOMMU used by this device */
struct dmar_domain *domain; /* pointer to domain */ /* points to the dmar_domain */
struct pasid_table *pasid_table; /* pasid table */
};
/*
 * Intel IOMMU driver's per-domain state: the devices attached to the domain,
 * its iova space, per-IOMMU domain ids and refcounts, and the embedded
 * generic struct iommu_domain used by the iommu core.
 */
struct dmar_domain {
int nid; /* node id */ /* NOTE(review): could a dmar_domain span more than one node? */
unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED]; /* device count within the corresponding iommu */
/* Refcount of devices per iommu */
u16 iommu_did[DMAR_UNITS_SUPPORTED]; /* domain id within the corresponding iommu */
/* Domain ids per IOMMU. Use u16 since
 * domain ids are 16 bit wide according
 * to VT-d spec, section 9.3 */
unsigned int auxd_refcnt; /* Refcount of auxiliary attaching */
bool has_iotlb_device;
struct list_head devices; /* all devices' list */ /* every device in this dmar_domain; list of device_domain_info->link */
struct list_head auxd; /* link to device's auxiliary list */
struct iova_domain iovad; /* iova's that belong to this domain */ /* reserved and already-allocated iova space */
struct dma_pte *pgd; /* virtual address */
int gaw; /* max guest address width */
/* adjusted guest address width, 0 is level 2 30-bit */
int agaw;
int flags; /* flags to find out type of domain */
int iommu_coherency;/* indicate coherency of iommu access */
int iommu_snooping; /* indicate snooping control feature*/
int iommu_count; /* reference count of iommu */
int iommu_superpage;/* Level of superpages supported:
 0 == 4KiB (no superpages), 1 == 2MiB,
 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
u64 max_addr; /* maximum mapped address */
int default_pasid; /*
 * The default pasid used for non-SVM
 * traffic on mediated devices.
 */
/*
 * The annotation below used to sit inside the open comment, which closed
 * it early and left "iommu core" as stray tokens; both are merged here.
 */
struct iommu_domain domain; /* generic domain data structure for
 * iommu core; this is the hardware
 * iommu_domain (in vfio it is held
 * as a pointer) */
};
/*
 * Generic domain data structure for the iommu core; struct dmar_domain
 * embeds one as its "domain" member.
 */
struct iommu_domain {
unsigned type; /* IOMMU_DOMAIN_IDENTITY, IOMMU_DOMAIN_DMA, IOMMU_DOMAIN_UNMANAGED */
const struct iommu_ops *ops;
unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */
iommu_fault_handler_t handler;
void *handler_token;
struct iommu_domain_geometry geometry;
void *iova_cookie;
};
/*
 * Per-IOMMU state: register mapping, capability registers, domain-id
 * bookkeeping, invalidation queue, and the embedded iommu_device that acts
 * as the handle for the IOMMU core code.
 */
struct intel_iommu {
void __iomem *reg; /* Pointer to hardware regs, virtual addr */
u64 reg_phys; /* physical address of hw register set */
u64 reg_size; /* size of hw register set */
u64 cap;
u64 ecap;
u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
raw_spinlock_t register_lock; /* protect register handling */
int seq_id; /* sequence id of the iommu */ /* ordinal number of this iommu */
int agaw; /* agaw of this iommu */
int msagaw; /* max sagaw of this iommu */
unsigned int irq, pr_irq;
u16 segment; /* PCI segment# */
unsigned char name[13]; /* Device Name */
#ifdef CONFIG_INTEL_IOMMU
unsigned long *domain_ids; /* bitmap of domains */ /* used to allocate domain ids within this iommu */
struct dmar_domain ***domains; /* ptr to domains */ /* pointer array used to look up dmar_domain */
spinlock_t lock; /* protect context, domain ids */
struct root_entry *root_entry; /* virtual address */
struct iommu_flush flush;
#endif
#ifdef CONFIG_INTEL_IOMMU_SVM
struct page_req_dsc *prq;
unsigned char prq_name[16]; /* Name for PRQ interrupt */
#endif
struct q_inval *qi; /* Queued invalidation info */
u32 *iommu_state; /* Store iommu states between suspend and resume.*/
#ifdef CONFIG_IRQ_REMAP
struct ir_table *ir_table; /* Interrupt remapping info */
struct irq_domain *ir_domain;
struct irq_domain *ir_msi_domain;
#endif
struct iommu_device iommu; /* IOMMU core code handle */
int node;
u32 flags; /* Software defined flags */
};
/*
 * Handle for the IOMMU core code; struct intel_iommu embeds one as its
 * "iommu" member.
 */
struct iommu_device {
struct list_head list; /* linked onto the global iommu_device_list, which represents every iommu whose drhd is not ignored (!drhd->ignored) */
const struct iommu_ops *ops;
struct fwnode_handle *fwnode;
struct device *dev;
};