xywuyiba6 发表于 2015-10-10 12:28:42

7.2 Qemu/KVM 直接IO框架



  7.2.1 qemu pci-assign模块
  虚拟机上的设备是由qemu创建出来的,对于直接io也是如此。 区别在于直接io时,qemu直接调用vm host上的硬件设备完成相应功能;而不需要更多的软件处理。
  static const TypeInfo assign_info = { (pci-assign.c)
  .name               = "kvm-pci-assign",
  .parent             = TYPE_PCI_DEVICE,
  .instance_size      = sizeof(AssignedDevice),
  .class_init         = assign_class_init,
  };
  (1) 初始化
  static int assigned_initfn(struct PCIDevice *pci_dev)
  {
  AssignedDevice *dev =DO_UPCAST(AssignedDevice, dev, pci_dev);
  //对config空间的虚拟寄存器做初始化, 将寄存器的值存在软件变量dev 的emulate_config_read 和emulate_config_write中
  assigned_dev_emulate_config_read(dev, 0,PCI_CONFIG_SPACE_SIZE);
  assigned_dev_direct_config_read(dev, PCI_STATUS,2);
  。。。。。。。。。。
  //和真实的pci设备关联, 由于启动时会输入pci bus,device,func号,所以依据这些信息能得到pci device对应在vm host上的设备文件
  get_real_device(dev, dev->host.domain,dev->host.bus,
  dev->host.slot,dev->host.function)
  
  assigned_device_pci_cap_init(pci_dev)为pci_dev添加capability
  //增加msix的mmio处理回调assigned_dev_msix_mmio_ops
  assigned_dev_register_msix_mmio(dev);
  //为pci device的memory空间建立mmap
  assigned_dev_register_regions(dev->real_device.regions,
  dev->real_device.region_number, dev);
  r = assign_device(dev); //调用kvm的KVM_ASSIGN_PCI_DEVICE,
  r = assign_intx(dev);//调用kvm 的KVM_ASSIGN_DEV_IRQ,管理中断
  ....
  }
  下面分析其中的关键函数:
  get_real_device ==》
  a. snprintf(dir, sizeof(dir),
  "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",r_seg, r_bus, r_dev, r_func);
  dev->config_fd = open(name, O_RDWR); //打开真实设备的config,并读出内容
  read(dev->config_fd,pci_dev->dev.config, pci_config_size(&pci_dev->dev));
  但对bar地址做特殊处理
  memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24);
  memset(&pci_dev->dev.config[PCI_ROM_ADDRESS], 0, 4);
  b. 记录mmio信息到PCIRegion *rp;结构
  snprintf(name, sizeof(name),"%sresource", dir);
  f = fopen(name, "r");
  对每个bar做:
  fscanf(f, "%" SCNi64 "%" SCNi64 " %" SCNi64 "\n",&start, &end,&flags) ;
  rp = dev->regions + r;rp->valid = 0;
  rp->resource_fd = -1;
  size = end - start + 1;
  snprintf(name, sizeof(name),"%sresource%d", dir, r);
  fd = open(name, O_RDWR);
  rp->resource_fd = fd;
  rp->type = flags; rp->valid = 1;rp->base_addr = start; rp->size = size;
  pci_dev->v_addrs[r].region = rp;
  
  assigned_dev_register_regions==》
  a.pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL,cur_region->size,
  PROT_WRITE |PROT_READ, MAP_SHARED,
  cur_region->resource_fd, (off_t)0);
  b.分为mmio和pio的case 分开处理(下面仅分析mmio)关联mmio gpa到真实设备的hva:
  若mmio size < 0x1000(没有到一个内存page大小)
  则   memory_region_init_io(&pci_dev->v_addrs[i].real_iomem,
  &slow_bar_ops,&pci_dev->v_addrs[i],
  "assigned-dev-slow-bar", cur_region->size);
  否则用:void *virtbase = pci_dev->v_addrs[i].u.r_virtbase;
  memory_region_init_ram_ptr(&pci_dev->v_addrs[i].real_iomem,
  name, cur_region->size,virtbase);
  //当EPT建立好后,guest os访问gpa时就直接访问真实设备了,不会有vm-exit发生
  c.         assigned_dev_iomem_setup(&pci_dev->dev, i, cur_region->size);
  pci_register_bar((PCIDevice *)pci_dev, i, t,
  &pci_dev->v_addrs[i].container);
  
  assign_device ==>kvm_device_pci_assign ==>
  kvm_vm_ioctl(s, KVM_ASSIGN_PCI_DEVICE, &dev_data); (注意assign时同时设置了host 与guest)
  
  assign_intx ==>
  a.   intx_route = pci_device_route_intx_to_irq(&dev->dev,dev->intpin); ==》
  pci_device_route_intx_to_irq(call piix3_route_intx_pin_to_irq)得到当前dev的irq信息
  b. deassign当前irq==》kvm_vm_ioctl(s, KVM_DEASSIGN_DEV_IRQ,&assigned_irq);
  c.重新assign当前设置kvm_device_intx_assign(kvm_state,dev->dev_id, intx_host_msi,
  intx_route.irq);==》
  static int kvm_assign_irq_internal(KVMState *s, uint32_t dev_id,
  uint32_t irq_type, uint32_t guest_irq)
  {
  struct kvm_assigned_irq assigned_irq = {
  .assigned_dev_id = dev_id,
  .guest_irq = guest_irq,
  .flags = irq_type,
  };
  
  if (kvm_check_extension(s,KVM_CAP_ASSIGN_DEV_IRQ)) {
  return kvm_vm_ioctl(s,KVM_ASSIGN_DEV_IRQ, &assigned_irq);
  } else {
  return kvm_vm_ioctl(s, KVM_ASSIGN_IRQ,&assigned_irq);
  }
  }
  
  kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ,&assigned_irq);
  
  (2) Reset
  reset_assigned_device:
  a. 对于msix设备调用assigned_dev_update_msix(pci_dev);
  b. 真实设备reset
  snprintf(reset_file, sizeof(reset_file), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset",
  adev->host.domain,adev->host.bus, adev->host.slot, adev->host.function);
  
  fd = open(reset_file, O_WRONLY);
  ret = write(fd, reset, strlen(reset));
  c. assigned_dev_pci_write_config(pci_dev,PCI_COMMAND, 0, 1);
  
  assigned_dev_update_msi==>分msix和intx的case
  msix case: 1. virq =kvm_irqchip_add_msi_route(kvm_state, msg);
  2. kvm_device_msi_assign(kvm_state,assigned_dev->dev_id, virq);
  最终调用kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ,&assigned_irq);
  intx case:   assign_intx(assigned_dev);
  
  int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
  {
  ......
  virq = kvm_irqchip_get_virq(s); //软件分配一个空闲irq号
  
  kroute.gsi = virq;
  kroute.type = KVM_IRQ_ROUTING_MSI;
  kroute.flags = 0;
  kroute.u.msi.address_lo =(uint32_t)msg.address;
  kroute.u.msi.address_hi = msg.address>> 32;
  kroute.u.msi.data = msg.data;
  //调用kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING,s->irq_routes);
  kvm_add_routing_entry(s, &kroute);
  return virq;
  
  7.2.2 kvm pci assign
  源码位于virt\assigned-dev.c:
  kvm_vm_ioctl_assigned_device==> case KVM_ASSIGN_PCI_DEVICE ==>
  kvm_vm_ioctl_assign_device==>
  a)kvm_find_assigned_dev查看dev是否assigned,若已assigned,直接返回
  b)pci_get_domain_bus_and_slot根据设备地址得到该设备的pci_device
  c)probe_sysfs_permissions打开设备sysfs的访问权限,这样qemu能访问
  d)pcidevice的相关初始化
  pci_enable_device(dev);
  pci_request_regions(dev, "kvm_assigned_device");
  pci_reset_function(dev);
  pci_save_state(dev);
  match->pci_saved_state =pci_store_saved_state(dev);
  e)加入设备到assignedlist list_add(&match->list, &kvm->arch.assigned_dev_head);
  f)若vm的iommu domain未建立则kvm_iommu_map_guest(kvm);
  g)r =kvm_assign_device(kvm, match); 将设备关联到iommu
  
  kvm_iommu_map_guest ==》
  kvm->arch.iommu_domain= iommu_domain_alloc(&pci_bus_type);
  kvm_iommu_map_memslots(kvm);
  
  kvm_assign_device(virt/iommu.c)==>
  a. r = iommu_attach_device(domain,&pdev->dev); //调用iommu关联设备
  
  kvm_iommu_map_memslots ==》
  slots =kvm_memslots(kvm);
  kvm_for_each_memslot(memslot, slots) {
  r = kvm_iommu_map_pages(kvm, memslot);
  if (r)
  break;
  }
  
  kvm_iommu_map_pages ==》
  对slot中的每个gfn
  a. iommu_iova_to_phys(domain,gfn_to_gpa(gfn)) 检查是否已建立iommu映射
  b. iommu_map(domain, gfn_to_gpa(gfn),pfn_to_hpa(pfn), page_size, flags);建立映射
  
  同时在qemu新增加memory映射时该函数也会被调用:
  __kvm_set_memory_region==》
  if((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
  r = kvm_iommu_map_pages(kvm, &new);
  return r;
  }
  
  7.2.3 kvm interrupt assign
  (1) 中断assign
  kvm_vm_ioctl_assigned_device==》 case KVM_ASSIGN_DEV_IRQ==>
  kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); ==>
  
  if (host_irq_type)
  r = assign_host_irq(kvm, match, host_irq_type);
  if (guest_irq_type)
  r = assign_guest_irq(kvm, match, assigned_irq,guest_irq_type);
  
  assign_host_irq ==> 分为了intx, msi,和msix三种case,我们以后仅分析msix的case:
  ==> assigned_device_enable_host_msix==>
  a. pci_enable_msix_exact(dev->dev, dev->host_msix_entries,dev->entries_nr);
  b. request_threaded_irq(dev->host_msix_entries[i].vector,
  kvm_assigned_dev_msix,
  kvm_assigned_dev_thread_msix,
  0,dev->irq_name, dev); //注册了中断处理函数
  
  assign_guest_irq ==》分为了intx, msi,和msix三种case,我们以后仅分析msix的case:
  a .id =kvm_request_irq_source_id(kvm); ==》
  b. assigned_device_enable_guest_msix
  
  assigned_device_enable_guest_msix(struct kvm *kvm,
  struct kvm_assigned_dev_kernel *dev,
  struct kvm_assigned_irq *irq)
  {
  dev->guest_irq = irq->guest_irq;//guest_irq为guest os 的中断号
  dev->ack_notifier.gsi = -1;
  return 0;
  }
  
  
  (2) MSIX中断管理
  对于msix的guest irq号由assigned_dev_update_msi ==》kvm_irqchip_add_msi_route分配
  对应内核态为:
  kvm_vm_ioctl ==》case KVM_SET_GSI_ROUTING==> kvm_set_irq_routing (virt\irqchip.c) ==》
  setup_routing_entry ==》 kvm_set_routing_entry (irq_comm.c) ==>
  case KVM_IRQ_ROUTING_MSI
  e->set = kvm_set_msi;//中断注入回调函数
  e->msi.address_lo = ue->u.msi.address_lo;
  e->msi.address_hi = ue->u.msi.address_hi;
  e->msi.data = ue->u.msi.data;
  
  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
  struct kvm *kvm, int irq_source_id, int level, bool line_status)
  {
  struct kvm_lapic_irq irq;
  kvm_set_msi_irq(e, &irq);
  return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
  }
  
  下面来看看host端中断处理:
  kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
  int vector)
  {
  if (unlikely(assigned_dev->irq_requested_type &
  KVM_DEV_IRQ_GUEST_INTX)) {
  spin_lock(&assigned_dev->intx_mask_lock);
  if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
  kvm_set_irq(assigned_dev->kvm,
  assigned_dev->irq_source_id, vector, 1,
  false);
  spin_unlock(&assigned_dev->intx_mask_lock);
  } else
  kvm_set_irq(assigned_dev->kvm,assigned_dev->irq_source_id,
  vector, 1,false);
  }
  所以kvm_set_irq ==> kvm_set_msi
  
  static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
  {
  struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
  int index = find_index_from_host_irq(assigned_dev, irq);
  u32 vector;
  int ret = 0;
  
  if (index >= 0) {
  vector = assigned_dev->guest_msix_entries[index].vector;
  ret = kvm_set_irq_inatomic(assigned_dev->kvm,
  assigned_dev->irq_source_id,
  vector,1);
  }
  
  return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD :IRQ_HANDLED;
  }
  
  kvm_set_irq_inatomic==》kvm_set_msi_inatomic ==》 kvm_irq_delivery_to_apic_fast
  当真实中断发生时,向guest assigned device注入中断
  
  (3) intx中断管理
  assign_host_irq ==》 request_threaded_irq(dev->host_irq,irq_handler,
  kvm_assigned_dev_thread_intx, flags,
  dev->irq_name, dev);
  assign_guest_irq ==> case intx
  static int assigned_device_enable_guest_intx(struct kvm *kvm,
  struct kvm_assigned_dev_kernel *dev,
  struct kvm_assigned_irq *irq)
  {
  dev->guest_irq = irq->guest_irq;
  dev->ack_notifier.gsi = irq->guest_irq;
  return 0;
  }
  
  kvm_assigned_dev_thread_intx==》kvm_assigned_dev_raise_guest_irq ==> kvm_set_irq
  
  由此可知,kvm的中断虚拟化流程如下:
  (1) 在host上为真实设备注册中断处理函数
  (2) 当中断发生时,根据真实设备的中断号找到对应的虚拟设备中断号,向guest注入中断
  如果系统采用了irq remap机制,则host的中断不会产生,直接在guest os上产生中断。
  下一节将讨论iommu.
  
  除pci-assign外,另一种直接io方法为vfio。它与pci-assign的区别在于,vfio把更多的虚拟化实现放在了qemu用户空间中。但其底层仍然会使用iommu;本文就不详细分析vfio了。其源代码位于:
  Qemu:hw\vfio_pci.c
  Host driver:drivers/vfio/pci/
         版权声明:本文为博主原创文章,未经博主允许不得转载。
页: [1]
查看完整版本: 7.2 Qemu/KVM 直接IO框架