kvm 源代碼雜篇

分析KVM,對我這種菜鳥來說確實難度太大。下面先簡單地從虛擬機的創建和運行過程中調用的函數開始分析……


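首先聲明一個kvm_context_t 變量用以描述用戶態虛擬機上下文信息,然後調用kvm_init()函數初始化虛擬機上下文信息;函數kvm_create()創建虛擬機實例,該函數通過ioctl系統調用創建虛擬機相關的內核數據結構,並且將虛擬機文件描述符返回給用戶態的kvm_context_t數據結構。注意:用戶態的kvm_init()與內核模塊的初始化函數kvm_init()同名,但並不是同一個函數;下面列出的代碼是內核側的kvm_init(),它在KVM模塊加載時執行,負責註冊/dev/kvm設備等初始化工作: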

<span style="font-size:18px;">2587 int kvm_init(void *opaque, unsigned int vcpu_size,
2588                   struct module *module)
2589 {
2590         int r;
2591         int cpu;
2592 
2593         r = kvm_arch_init(opaque);
2594         if (r)
2595                 goto out_fail;
2596 
2597         bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2598 
2599         if (bad_page == NULL) {
2600                 r = -ENOMEM;
2601                 goto out;
2602         }
2603 
2604         bad_pfn = page_to_pfn(bad_page);
2605 
2606         if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2607                 r = -ENOMEM;
2608                 goto out_free_0;
2609         }
2610 
2611         r = kvm_arch_hardware_setup();
2612         if (r < 0)
2613                 goto out_free_0a;
2614 
2615         for_each_online_cpu(cpu) {
2616                 smp_call_function_single(cpu,
2617                                 kvm_arch_check_processor_compat,
2618                                 &r, 1);
2619                 if (r < 0)
2620                         goto out_free_1;
2621         }
2622 
2623         on_each_cpu(hardware_enable, NULL, 1);
2624         r = register_cpu_notifier(&kvm_cpu_notifier);
2625         if (r)
2626                 goto out_free_2;
2627         register_reboot_notifier(&kvm_reboot_notifier);
2628 
2629         r = sysdev_class_register(&kvm_sysdev_class);
2630         if (r)
2631                 goto out_free_3;
2632 
2633         r = sysdev_register(&kvm_sysdev);
2634         if (r)
2635                 goto out_free_4;
2636 
2637         /* A kmem cache lets us meet the alignment requirements of fx_save. */
2638         kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
2639                                            __alignof__(struct kvm_vcpu),
2640                                            0, NULL);
2641         if (!kvm_vcpu_cache) {
2642                 r = -ENOMEM;
2643                 goto out_free_5;
2644         }
2645 
2646         kvm_chardev_ops.owner = module;
2647         kvm_vm_fops.owner = module;
2648         kvm_vcpu_fops.owner = module;
2649 
2650         r = misc_register(&kvm_dev);
2651         if (r) {
2652                 printk(KERN_ERR "kvm: misc device register failed\n");
2653                 goto out_free;
2654         }
2655 
2656         kvm_preempt_ops.sched_in = kvm_sched_in;
2657         kvm_preempt_ops.sched_out = kvm_sched_out;
2658 
2659         kvm_init_debug();
2660 
2661         return 0;
2662 
2663 out_free:
2664         kmem_cache_destroy(kvm_vcpu_cache);
2665 out_free_5:
2666         sysdev_unregister(&kvm_sysdev);
2667 out_free_4:
2668         sysdev_class_unregister(&kvm_sysdev_class);
2669 out_free_3:
2670         unregister_reboot_notifier(&kvm_reboot_notifier);
2671         unregister_cpu_notifier(&kvm_cpu_notifier);
2672 out_free_2:
2673         on_each_cpu(hardware_disable, NULL, 1);
2674 out_free_1:
2675         kvm_arch_hardware_unsetup();
2676 out_free_0a:
2677         free_cpumask_var(cpus_hardware_enabled);
2678 out_free_0:
2679         __free_page(bad_page);
2680 out:
2681         kvm_arch_exit();
2682 out_fail:
2683         return r;
2684 }</span>
下面稍微詳細地分析一下整個流程:

首先,用戶態的Qemu代碼調用kvm_init函數,kvm_init通過qemu_open("/dev/kvm")檢查內核驅動插入情況,通過kvm_ioctl(s, KVM_GET_API_VERSION, 0)獲取API接口版本,最後調用kvm_ioctl(s, KVM_CREATE_VM, 0)創建KVM虛擬機,獲取虛擬機句柄。

簡單點說,就是在用戶態調用了kvm_init(),然後用戶態的Qemu調用kvm_ioctl(s, KVM_CREATE_VM, 0)來獲取KVM虛擬機句柄。那我們還必須知道發出這個ioctl之後內核中會發生什麼,也就是KVM是如何由這個調用展開,然後創建虛擬機的。

內核對應的入口代碼在此:

<span style="font-size:18px;">static int kvm_dev_ioctl_create_vm(void)
2271 {
2272         int fd;
2273         struct kvm *kvm;
2274 
2275         kvm = kvm_create_vm();
2276         if (IS_ERR(kvm))
2277                 return PTR_ERR(kvm);
2278         fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0);
2279         if (fd < 0)
2280                 kvm_put_kvm(kvm);
2281 
2282         return fd;
2283 }</span>

從上面可以看出,是通過kvm_create_vm來進一步調用,創建成功後再用anon_inode_getfd()爲虛擬機生成文件描述符。找到kvm_create_vm:

<span style="font-size:18px;">945 static struct kvm *kvm_create_vm(void)
946 {
947         struct kvm *kvm = kvm_arch_create_vm();
948 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
949         struct page *page;
950 #endif
951 
952         if (IS_ERR(kvm))
953                 goto out;
954 #ifdef CONFIG_HAVE_KVM_IRQCHIP
955         INIT_LIST_HEAD(&kvm->irq_routing);
956         INIT_HLIST_HEAD(&kvm->mask_notifier_list);
957 #endif
958 
959 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
960         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
961         if (!page) {
962                 kfree(kvm);
963                 return ERR_PTR(-ENOMEM);
964         }
965         kvm->coalesced_mmio_ring =
966                         (struct kvm_coalesced_mmio_ring *)page_address(page);
967 #endif
968 
969 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
970         {
971                 int err;
972                 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
973                 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
974                 if (err) {
975 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
976                         put_page(page);
977 #endif
978                         kfree(kvm);
979                         return ERR_PTR(err);
980                 }
981         }
982 #endif
983 
984         kvm->mm = current->mm;
985         atomic_inc(&kvm->mm->mm_count);
986         spin_lock_init(&kvm->mmu_lock);
987         spin_lock_init(&kvm->requests_lock);
988         kvm_io_bus_init(&kvm->pio_bus);
989         mutex_init(&kvm->lock);
990         kvm_io_bus_init(&kvm->mmio_bus);
991         init_rwsem(&kvm->slots_lock);
992         atomic_set(&kvm->users_count, 1);
993         spin_lock(&kvm_lock);
994         list_add(&kvm->vm_list, &vm_list);
995         spin_unlock(&kvm_lock);
996 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
997         kvm_coalesced_mmio_init(kvm);
998 #endif
999 out:
1000         return kvm;
1001 }</span>

這裏的kvm_arch_create_vm()用來創建並初始化體系結構相關的KVM結構體(struct kvm)信息。

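總結這個函數吧,kvm_create_vm事實上主要就是做初始化:得到struct kvm之後,初始化其中的鎖、I/O總線、MMU notifier等成員,並把它加入全局的vm_list鏈表。至於啓動硬件特性,在上面列出的這個版本中是在kvm_init()裏通過on_each_cpu(hardware_enable, NULL, 1)完成的。最後由kvm_dev_ioctl_create_vm()通過anon_inode_getfd()將相應的句柄返回給用戶態。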


創建完內核虛擬機數據結構後,再創建內核pit以及mmio等基本外設模擬設備,然後調用kvm_create_vcpu()函數來創建虛擬處理器。用戶態的kvm_create_vcpu()通過ioctl進入內核,內核側對應的處理函數是kvm_vm_ioctl_create_vcpu()。

下面看下內核側的kvm_vm_ioctl_create_vcpu()函數,以及它所調用的kvm_arch_vcpu_setup():

<span style="font-size:18px;">1726 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
1727 {
1728         int r;
1729         struct kvm_vcpu *vcpu;
1730 
1731         if (!valid_vcpu(n))
1732                 return -EINVAL;
1733 
1734         vcpu = kvm_arch_vcpu_create(kvm, n);
1735         if (IS_ERR(vcpu))
1736                 return PTR_ERR(vcpu);
1737 
1738         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1739 
1740         r = kvm_arch_vcpu_setup(vcpu);
1741         if (r)
1742                 return r;
1743 
1744         mutex_lock(&kvm->lock);
1745         if (kvm->vcpus[n]) {
1746                 r = -EEXIST;
1747                 goto vcpu_destroy;
1748         }
1749         kvm->vcpus[n] = vcpu;
1750         mutex_unlock(&kvm->lock);
1751 
1752         /* Now it's all set up, let userspace reach it */
1753         kvm_get_kvm(kvm);
1754         r = create_vcpu_fd(vcpu);
1755         if (r < 0)
1756                 goto unlink;
1757         return r;
1758 
1759 unlink:
1760         mutex_lock(&kvm->lock);
1761         kvm->vcpus[n] = NULL;
1762 vcpu_destroy:
1763         mutex_unlock(&kvm->lock);
1764         kvm_arch_vcpu_destroy(vcpu);
1765         return r;
1766 }
1767 </span>

<span style="font-size:18px;">4365 
4366 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4367 {
4368         int r;
4369 
4370         /* We do fxsave: this must be aligned. */
4371         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4372 
4373         vcpu->arch.mtrr_state.have_fixed = 1;
4374         vcpu_load(vcpu);
4375         r = kvm_arch_vcpu_reset(vcpu);
4376         if (r == 0)
4377                 r = kvm_mmu_setup(vcpu);
4378         vcpu_put(vcpu);
4379         if (r < 0)
4380                 goto free_vcpu;
4381 
4382         return 0;
4383 free_vcpu:
4384         kvm_x86_ops->vcpu_free(vcpu);
4385         return r;
4386 }</span>

繼續流程分析:

kvm_create_vcpu()函數通過ioctl()系統調用,請求vm_fd文件描述符所指向的虛擬機文件創建虛擬處理器,並將虛擬處理器的文件描述符返回給用戶態程序,供以後調度虛擬處理器時使用;

好,CPU的初始化和創建暫時完成:下面是內存,即影子頁表的初始化:


創建完虛擬處理器後,由用戶態的QEMU程序申請客戶機用戶空間,用以加載和運行客戶機代碼;爲了使客戶虛擬機正確執行,必須在內核中爲客戶機建立正確的內存映射關係,即影子頁表信息。因此,申請客戶機內存地址空間後,要調用函數kvm_create_phys_mem()創建客戶機內存映射關係。該函數主要通過ioctl系統調用,對vm_fd指向的虛擬機文件設置內核數據結構中客戶機內存域的相關信息,主要是建立影子頁表信息;當創建好虛擬處理器和影子頁表後,即可將客戶機鏡像讀取到所分配的空間中,然後調度虛擬處理器運行。

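kvm_create_phys_mem():代碼在此(注意:下面貼出的其實仍是前面的kvm_create_vm(),並不是設置客戶機內存域的代碼;用戶態的kvm_create_phys_mem()是通過KVM_SET_USER_MEMORY_REGION這個ioctl把內存域信息傳給內核的)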

945 static struct kvm *kvm_create_vm(void)
946 {
947         struct kvm *kvm = kvm_arch_create_vm();
948 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
949         struct page *page;
950 #endif
951 
952         if (IS_ERR(kvm))
953                 goto out;
954 #ifdef CONFIG_HAVE_KVM_IRQCHIP
955         INIT_LIST_HEAD(&kvm->irq_routing);
956         INIT_HLIST_HEAD(&kvm->mask_notifier_list);
957 #endif
958 
959 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
960         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
961         if (!page) {
962                 kfree(kvm);
963                 return ERR_PTR(-ENOMEM);
964         }
965         kvm->coalesced_mmio_ring =
966                         (struct kvm_coalesced_mmio_ring *)page_address(page);
967 #endif
968 
969 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
970         {
971                 int err;
972                 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
973                 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
974                 if (err) {
975 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
976                         put_page(page);
977 #endif
978                         kfree(kvm);
979                         return ERR_PTR(err);
980                 }
981         }
982 #endif
983 
984         kvm->mm = current->mm;
985         atomic_inc(&kvm->mm->mm_count);
986         spin_lock_init(&kvm->mmu_lock);
987         spin_lock_init(&kvm->requests_lock);
988         kvm_io_bus_init(&kvm->pio_bus);
989         mutex_init(&kvm->lock);
990         kvm_io_bus_init(&kvm->mmio_bus);
991         init_rwsem(&kvm->slots_lock);
992         atomic_set(&kvm->users_count, 1);
993         spin_lock(&kvm_lock);
994         list_add(&kvm->vm_list, &vm_list);
995         spin_unlock(&kvm_lock);
996 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
997         kvm_coalesced_mmio_init(kvm);
998 #endif
999 out:
1000         return kvm;
1001 }
1002 

內存創建之後,即可以運行虛擬機了。


用戶態調度虛擬機的函數爲kvm_run(),它通過KVM_RUN ioctl進入內核,x86上對應的內核處理函數是kvm_arch_vcpu_ioctl_run(),代碼如下:

3466 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3467 {
3468         int r;
3469         sigset_t sigsaved;
3470 
3471         vcpu_load(vcpu);
3472 
3473         if (vcpu->sigset_active)
3474                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3475 
3476         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
3477                 kvm_vcpu_block(vcpu);
3478                 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
3479                 r = -EAGAIN;
3480                 goto out;
3481         }
3482 
3483         /* re-sync apic's tpr */
3484         if (!irqchip_in_kernel(vcpu->kvm))
3485                 kvm_set_cr8(vcpu, kvm_run->cr8);
3486 
3487         if (vcpu->arch.pio.cur_count) {
3488                 r = complete_pio(vcpu);
3489                 if (r)
3490                         goto out;
3491         }
3492 #if CONFIG_HAS_IOMEM
3493         if (vcpu->mmio_needed) {
3494                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3495                 vcpu->mmio_read_completed = 1;
3496                 vcpu->mmio_needed = 0;
3497 
3498                 down_read(&vcpu->kvm->slots_lock);
3499                 r = emulate_instruction(vcpu, kvm_run,
3500                                         vcpu->arch.mmio_fault_cr2, 0,
3501                                         EMULTYPE_NO_DECODE);
3502                 up_read(&vcpu->kvm->slots_lock);
3503                 if (r == EMULATE_DO_MMIO) {
3504                         /*
3505                          * Read-modify-write.  Back to userspace.
3506                          */
3507                         r = 0;
3508                         goto out;
3509                 }
3510         }
3511 #endif
3512         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3513                 kvm_register_write(vcpu, VCPU_REGS_RAX,
3514                                      kvm_run->hypercall.ret);
3515 
3516         r = __vcpu_run(vcpu, kvm_run);
3517 
3518 out:
3519         if (vcpu->sigset_active)
3520                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3521 
3522         vcpu_put(vcpu);
3523         return r;
3524 }

用戶態的kvm_run()通過ioctl系統調用,請求虛擬處理器文件描述符所指向的虛擬文件調度虛擬處理器執行;內核中的處理函數(即上面的kvm_arch_vcpu_ioctl_run())將虛擬處理器vcpu的信息加載到物理處理器中(vcpu_load),再通過__vcpu_run()經由VM entry進入客戶機執行。


接下來就是陷入和捕獲以及上下文切換了,留待後面分析……

在客戶機正常運行期間kvm_run()函數不返回,只有發生以下兩種情況時,函數才返回:1,發生了I/O事件,如客戶機發出讀寫I/O的指令;2,產生了客戶機和內核KVM都無法處理的異常。I/O事件處理完畢後,通過重新調用kvm_run()函數繼續調度客戶機的執行。

大致流程就是如此,還得繼續細細分析。

發佈了111 篇原創文章 · 獲贊 8 · 訪問量 8萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章