内核中有若干个iommu的模块,通过IOMMU_INIT组成一个依赖树。detect_intel_iommu最先被调用,它被用来做sanity检查并设置好以下两个回调:
x86_init.iommu.iommu_init = intel_iommu_init;
x86_platform.iommu_shutdown = intel_iommu_shutdown;
intel_iommu_init
|-> dmar_table_init -> parse_dmar_table -> dmar_walk_dmar_table
|-> dmar_dev_scope_init
|-> dmar_acpi_dev_scope_init -> dmar_acpi_insert_dev_scope
|-> dmar_pci_bus_add_dev -> dmar_insert_dev_scope
|-> bus_register_notifier
|-> dmar_init_reserved_ranges
|-> init_no_remapping_devices
|-> init_dmars
|-> dma_ops = &intel_dma_ops
|-> init_iommu_pm_ops
|-> iommu_device_sysfs_add, iommu_device_set_ops, iommu_device_register
|-> bus_set_iommu(&pci_bus_type, &intel_iommu_ops)
|-> bus_register_notifier(&pci_bus_type, &iommu_bus_notifier)
intel_iommu_init是intel iommu初始化的主函数,主要做下面的事:
1.建立三个cache,iova,dmar_domain,device_domain_info。
2.遍历dmar表下的5个子表,建立dmar_drhd_units,dmar_rmrr_units,dmar_atsr_units链表。设置drhd的node亲和性。
3.对andd声明的acpi namespace类型的设备添加到device scope数组中。数组位于dmar_drhd_units链表中。
4.遍历整个pci设备树并调用dmar_pci_bus_add_dev添加设备到dmar_drhd_units,dmar_rmrr_units,dmar_atsr_units链表的device scope数组。
5.注册pci热插拔回调函数dmar_pci_bus_notifier
6.初始化reserved_iova_list,里面放的是IOAPIC的空间,所有pci设备的MMIO空间。(LAPIC的空间不用考虑吗?如果iova建立到LAPIC空间是不是中断都发不出去了?)
7.init_no_remapping_devices忽略没有挂载任何设备的drhd;如果设置了igfx_off,还会忽略只挂载了gfx设备的drhd。
8.init_dmars首先对每一个没有被忽略的drhd初始化qi,分配domain_ids和domains指针表和root_entry页,初始化si_domain(si_domain中一一映射了内存页与usb,图形设备的rmrr保留页)。最后设置了dmar中断
9.设置dma_ops
10.设置suspend/resume回调函数,iommu硬件在唤醒时需要恢复某些寄存器
11.建立sysfs接口下iommu的属性文件
12.注册pci总线的iommu调用函数,iommu_bus_init设置热插拔设备时的回调函数(这个不同于5中的回调,5中的是dev scope的),并遍历当前已有设备调用iommu_probe_device,iommu_probe_device调用刚才注册的pci总线下iommu回调函数ops->add_device(intel_iommu_add_device)。intel_iommu_add_device调用了iommu_group_get_for_dev,iommu_group_get_for_dev具体是调用ops->device_group建立iommu_group,并给iommu_group分配default_domain,初始group->domain也指向default_domain,最后调用iommu_group_add_device建立device与device_domain_info与dmar_domain与intel_iommu的绑定关系。
这个default_domain有两种,passthrough的和dma的,通过命令行iommu=pt/nopt来设置。默认情况下是根据CONFIG_IOMMU_DEFAULT_PASSTHROUGH的配置来决定的。在passthrough模式下si_domain派上了用场,初始所有pci设备都会归入si_domain除了那些有rmrr配置的设备(usb,graphic除外),rmrr配置的设备会从si_domain切换为单独的dma domain并且给当前设备rmrr指定的iova建立一一映射。对于dma模式初始每个iommu_group配置一个dma domain,然后碰到gfx设备,普通的pcie设备或根目录下的pci设备会切换回si_domain,隔离性更强但是开销也大。(不理解非根下的pci设备在pt模式不是si_domain的吗,为什么在dma模式下就得是单独的dma domain?)
13.probe_acpi_namespace_devices把acpi namespace设备也建立iommu_group。
14.最后使能每个drhd硬件的translation功能
这么一堆初始化的用途在dma_alloc_coherent等一系列dma ops中体现出来。dma_alloc_coherent->intel_alloc_coherent->iommu_need_mapping,对于处于si_domain中的设备,如果设备dma支持的地址小于系统内存的地址范围就需要切换到dma domain(这样就不需要bounce buffer),否则的话就调用dma_direct_alloc直接分配物理内存,此时物理内存就是iova,用的还是si_domain。
这里有好多容易混淆的结构:iommu_group,device,device_domain_info,dmar_domain,iommu_domain,以及dmar_drhd_unit,intel_iommu,iommu_device。
/* An IOMMU group: the devices linked on `devices` plus the domains the group uses. */
struct iommu_group {
struct kobject kobj;
struct kobject *devices_kobj;
struct list_head devices; /* list of group_device->list entries; each group_device->dev points to a struct device */
struct mutex mutex;
struct blocking_notifier_head notifier;
void *iommu_data;
void (*iommu_data_release)(void *iommu_data);
char *name;
int id;
struct iommu_domain *default_domain; /* per the notes in this file: assigned when the group is set up by iommu_group_get_for_dev */
struct iommu_domain *domain; /* per the notes in this file: initially points to default_domain as well */
};
/* Abridged sketch (not compilable C) showing only the IOMMU-related members of struct device. */
struct device {
iommu_group /* pointer to the iommu_group */
archdata.iommu /* pointer to the device_domain_info */
}
/* PCI domain-device relationship */
/* Ties a device to its intel_iommu and dmar_domain (see the dev, iommu and domain members). */
struct device_domain_info {
struct list_head link; /* link to domain siblings */ /* entry in the dmar_domain->devices list */
struct list_head global; /* link to global list */ /* entry in the global device_domain_list list */
struct list_head table; /* link to pasid table */
struct list_head auxiliary_domains; /* auxiliary domains
* attached to this device
*/
u8 bus; /* PCI bus number */
u8 devfn; /* PCI devfn number */
u16 pfsid; /* SRIOV physical function source ID */
u8 pasid_supported:3;
u8 pasid_enabled:1;
u8 pri_supported:1;
u8 pri_enabled:1;
u8 ats_supported:1;
u8 ats_enabled:1;
u8 auxd_enabled:1; /* Multiple domains per device */
u8 ats_qdep;
struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ /* pointer to the struct device */
struct intel_iommu *iommu; /* IOMMU used by this device */
struct dmar_domain *domain; /* pointer to domain */ /* points to the dmar_domain */
struct pasid_table *pasid_table; /* pasid table */
};
/*
 * Intel-specific DMA remapping domain. Embeds the generic struct iommu_domain
 * and tracks, per IOMMU unit, the device refcount and domain id in use.
 */
struct dmar_domain {
int nid; /* node id */ /* NOTE(review): original annotation asks whether a dmar_domain can span more than one node -- unconfirmed */
unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED]; /* device count in the corresponding iommu */
/* Refcount of devices per iommu */
u16 iommu_did[DMAR_UNITS_SUPPORTED]; /* domain id in the corresponding iommu */
/* Domain ids per IOMMU. Use u16 since
 * domain ids are 16 bit wide according
 * to VT-d spec, section 9.3 */
unsigned int auxd_refcnt; /* Refcount of auxiliary attaching */
bool has_iotlb_device;
struct list_head devices; /* all devices' list */ /* every device in this dmar_domain, linked through device_domain_info->link */
struct list_head auxd; /* link to device's auxiliary list */
struct iova_domain iovad; /* iova's that belong to this domain */ /* reserved and already-allocated iova space */
struct dma_pte *pgd; /* virtual address */
int gaw; /* max guest address width */
/* adjusted guest address width, 0 is level 2 30-bit */
int agaw;
int flags; /* flags to find out type of domain */
int iommu_coherency;/* indicate coherency of iommu access */
int iommu_snooping; /* indicate snooping control feature*/
int iommu_count; /* reference count of iommu */
int iommu_superpage;/* Level of superpages supported:
 0 == 4KiB (no superpages), 1 == 2MiB,
 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
u64 max_addr; /* maximum mapped address */
int default_pasid; /*
 * The default pasid used for non-SVM
 * traffic on mediated devices.
 */
/*
 * The annotation on the next member used to open a second comment inside
 * the first; C comments do not nest, which left "iommu core" and a stray
 * comment terminator as code. Both remarks are now one comment.
 */
struct iommu_domain domain; /* generic domain data structure for
 * iommu core; the hardware iommu_domain,
 * embedded here (the original note says
 * it is a pointer in vfio) */
};
/* Generic IOMMU domain descriptor; struct dmar_domain in this file embeds one as its `domain` member. */
struct iommu_domain {
unsigned type; /* e.g. IOMMU_DOMAIN_IDENTITY, IOMMU_DOMAIN_DMA, IOMMU_DOMAIN_UNMANAGED */
const struct iommu_ops *ops;
unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */
iommu_fault_handler_t handler;
void *handler_token;
struct iommu_domain_geometry geometry;
void *iova_cookie;
};
/* Per-unit state for one Intel IOMMU: register mapping, capabilities, domain-id bookkeeping and the core handle. */
struct intel_iommu {
void __iomem *reg; /* Pointer to hardware regs, virtual addr */
u64 reg_phys; /* physical address of hw register set */
u64 reg_size; /* size of hw register set */
u64 cap;
u64 ecap;
u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
raw_spinlock_t register_lock; /* protect register handling */
int seq_id; /* sequence id of the iommu */ /* ordinal number of this iommu */
int agaw; /* agaw of this iommu */
int msagaw; /* max sagaw of this iommu */
unsigned int irq, pr_irq;
u16 segment; /* PCI segment# */
unsigned char name[13]; /* Device Name */
#ifdef CONFIG_INTEL_IOMMU
unsigned long *domain_ids; /* bitmap of domains */ /* used to allocate domain ids within this iommu */
struct dmar_domain ***domains; /* ptr to domains */ /* pointer array used to look up a dmar_domain */
spinlock_t lock; /* protect context, domain ids */
struct root_entry *root_entry; /* virtual address */
struct iommu_flush flush;
#endif
#ifdef CONFIG_INTEL_IOMMU_SVM
struct page_req_dsc *prq;
unsigned char prq_name[16]; /* Name for PRQ interrupt */
#endif
struct q_inval *qi; /* Queued invalidation info */
u32 *iommu_state; /* Store iommu states between suspend and resume.*/
#ifdef CONFIG_IRQ_REMAP
struct ir_table *ir_table; /* Interrupt remapping info */
struct irq_domain *ir_domain;
struct irq_domain *ir_msi_domain;
#endif
struct iommu_device iommu; /* IOMMU core code handle */
int node;
u32 flags; /* Software defined flags */
};
/* Handle used by the IOMMU core for one IOMMU instance; struct intel_iommu in this file embeds one as `iommu`. */
struct iommu_device {
struct list_head list; /* linked onto the global iommu_device_list, which holds every iommu whose drhd is not ignored (!drhd->ignored) */
const struct iommu_ops *ops;
struct fwnode_handle *fwnode;
struct device *dev;
};