VT-d Interrupt Posting代码分析

VT-d Posted Interrupt 代码分析

Posted Interrupt是基于Interrupt Remapping机制实现的，关于VT-d Posted Interrupt的原理可以参考VT-d Posted Interrupt，建议先了解原理再来看代码分析。

分析VT-d Posted Interrupt的代码需要从vCPU调度入手：为了实现中断的直接投递和中断迁移，在vCPU调度的时候VMM需要为Posted Interrupt做一些额外的工作，但这些额外的工作带来的中断实时性提升是可观的。

per-vCPU Posted Interrupt Descriptor

为了支持VT-d Posted Interrupt，Intel为vCPU引入了Posted Interrupt Descriptor数据结构，其中有pir、on、sn、nv、ndst等几个关键域。

PIR：记录了要给虚拟机vCPU投递的vector号（由硬件自动写入并由VMM软件读取）；
ON：当中断到来时ON标志位自动置位告知guest我有中断要投递给你了；
SN：是VMM软件用来告知VT-d硬件当前vCPU不在Running状态你不要给我投中断了我收不到；
NV：是主机上配合Posted Interrupt工作的一个中断vector（它的值只能是wakeup vector或者notification vector）；
NDST：存放当前vCPU所在PCPU的apicid（由VMM负责刷新，确保中断可以自动迁移到目的pCPU上）。

/*
 * Posted-Interrupt Descriptor.
 *
 * As declared below, the descriptor starts with 256 bits of posted
 * interrupt requests (pir), followed by a 64-bit control word holding
 * ON, SN, NV and NDST plus reserved bits, and ends with reserved space.
 * The named control fields are overlaid with 'control' so that all of
 * them can be accessed as a single 64-bit value.
 */
struct pi_desc {
    u32 pir[8];     /* Posted interrupt requested */
    union {
        struct {
            /* bit 256 - Outstanding Notification */
            u16 on  : 1,
            /* bit 257 - Suppress Notification */
                sn  : 1,
            /* bit 271:258 - Reserved */
                rsvd_1  : 14;
            /* bit 279:272 - Notification Vector */
            u8  nv;
            /* bit 287:280 - Reserved */
            u8  rsvd_2;
            /* bit 319:288 - Notification Destination */
            u32 ndst;
        };
        /* The same 64 bits (ON .. NDST) viewed as one word. */
        u64 control;
    };
    u32 rsvd[6];
} __aligned(64);

首先要明确pi_desc是per-vcpu的，所以在每个vCPU的数据结构struct vcpu_vmx里面会包含一个pi_desc数据结构。

/*
 * Per-vCPU VMX state. Only the member relevant to posted interrupts is
 * shown in this excerpt.
 */
struct vcpu_vmx {
    /* ... other members omitted ... */

    /* Posted interrupt descriptor */
    struct pi_desc pi_desc;
};

vCPU创建的时候会将NV置成POSTED_INTR_VECTOR也就是notification event的中断号，同时把SN置1（因为这时候vCPU还没有运行）。调用路径为 kvm_vm_ioctl_create_vcpu => kvm_arch_vcpu_create => vmx_create_vcpu，这里会注册vCPU的preempt notifier，当调度器选中vCPU线程的时候VMM会收到通知，VMM调用回调函数进行处理。

static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
    preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); #注册vcpu的preempt notifier
    /*
     * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
     * or POSTED_INTR_WAKEUP_VECTOR.
     */
    vmx->pi_desc.nv = POSTED_INTR_VECTOR;
    vmx->pi_desc.sn = 1;
}

同时kvm_vm_ioctl_create_vcpu => kvm_arch_vcpu_setup => vcpu_load, vcpu_put会对pi_desc做一些修改，后面结合虚拟机vCPU调度进行代码分析。

vCPU调度与VT-d Posted Interrupt

vCPU的运行状态主要有3种：

Running 状态：vCPU正处于非根模式下运行
Runnable 状态：vCPU线程被抢占或者时间片到期，等待OS的下一次调度
Blocked 状态： vCPU执行hlt指令后从非根模式block出来准备休眠的状态

vCPU调度就是指在VMM的管理下虚拟机的vCPU线程在这几种状态之间切换的场景，针对不同的状态转变VMM会干预进来为Posted Interrupt做一些额外的工作以确保中断自动迁移可以顺利进行。

posted interrupt scheduler

vCPU 从 Runnable => Running

当vCPU被调度器选中运行之前会调用VMM的回调函数，在kvm中这个函数是kvm_sched_in。

/*
 * Preempt-notifier callback for a vCPU thread being scheduled in on 'cpu'.
 *
 * Clears the vCPU's preempted flag, then hands over to the arch hooks
 * kvm_arch_sched_in() and kvm_arch_vcpu_load() for that CPU.
 */
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
    struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

    /* Clear the "vCPU was preempted" flag. */
    if (vcpu->preempted)
        vcpu->preempted = false;

    /* Per the original annotation: adjusts the PLE window. */
    kvm_arch_sched_in(vcpu, cpu);

    /*
     * Per the original annotation: loads the VMCS onto the pCPU so the
     * vCPU can run. The target may be a different pCPU than before, or
     * the same one.
     */
    kvm_arch_vcpu_load(vcpu, cpu);
}

kvm_sched_in => kvm_arch_vcpu_load => vmx_vcpu_load => vmx_vcpu_pi_load， vCPU要从Runnable状态切换到Running状态了，这时候要:刷新NDST为vCPU要运行到的pCPU的apic id，并设置SN=0（告知硬件我现在可以接收Posted Interrupt了）。

/*
 * Update the vCPU's posted-interrupt descriptor when the vCPU is loaded
 * on 'cpu'.
 *
 * Three cases, in order:
 *  - SN already clear and the vCPU stays on the same CPU: nothing to do.
 *  - NV is the wakeup vector, or the CPU is unchanged: only clear SN.
 *  - Otherwise: atomically rewrite NDST for 'cpu' and clear SN.
 */
static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
    struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
    struct pi_desc old, new;
    unsigned int dest;

    /*
     * In case of hot-plug or hot-unplug, we may have to undo
     * vmx_vcpu_pi_put even if there is no assigned device.  And we
     * always keep PI.NDST up to date for simplicity: it makes the
     * code easier, and CPU migration is not a fast path.
     */
    if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
        return;

    /*
     * First handle the simple case where no cmpxchg is necessary; just
     * allow posting non-urgent interrupts.
     *
     * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
     * PI.NDST: pi_post_block will do it for us and the wakeup_handler
     * expects the VCPU to be on the blocked_vcpu_list that matches
     * PI.NDST.
     */
    if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
        vcpu->cpu == cpu) {
        pi_clear_sn(pi_desc);
        return;
    }

    /*
     * The full case: build the new control word from a snapshot and
     * retry until the compare-and-exchange succeeds, i.e. until the
     * descriptor's control word did not change underneath us.
     */
    do {
        old.control = new.control = pi_desc->control;

        dest = cpu_physical_id(cpu);

        /* Outside x2APIC mode the destination is placed in bits 15:8. */
        if (x2apic_enabled())
            new.ndst = dest;
        else
            new.ndst = (dest << 8) & 0xFF00;

        new.sn = 0;
    } while (cmpxchg64(&pi_desc->control, old.control,
               new.control) != old.control);
}

vCPU 从 Running => Runnable

当vCPU被抢占或者时间片到期的时候vCPU被调度出来，这时候会触发回调函数kvm_sched_out。

/*
 * Preempt-notifier callback for a vCPU thread being scheduled out.
 *
 * Marks the vCPU as preempted if the current task is still in
 * TASK_RUNNING state, then calls kvm_arch_vcpu_put().
 */
static void kvm_sched_out(struct preempt_notifier *pn,
              struct task_struct *next)
{
    struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

    /* Set the "vCPU was preempted" flag. */
    if (current->state == TASK_RUNNING)
        vcpu->preempted = true;

    /*
     * Per the original annotation: takes the vCPU's VMCS off the current
     * pCPU and saves the vCPU's state into the VMCS.
     */
    kvm_arch_vcpu_put(vcpu);
}

kvm_sched_out => kvm_arch_vcpu_put => vmx_vcpu_put => vmx_vcpu_pi_put，这里vCPU要被调度出来了，如果vCPU是被抢占的（vcpu->preempted为真），那么要把SN bit置位（中断抑制），告诉硬件我不在运行了，先别给我发通知中断，我暂时无法处理。

/*
 * Posted-interrupt handling for a vCPU being put (scheduled out).
 *
 * Does nothing unless the VM has an assigned device, interrupt posting is
 * supported by the remapping hardware and APICv is active for the vCPU.
 * In that case SN is set if the vCPU was preempted.
 */
static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
    struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

    if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
        !irq_remapping_cap(IRQ_POSTING_CAP)  ||
        !kvm_vcpu_apicv_active(vcpu))
        return;

    /* Set the SN bit when the vCPU is preempted */
    if (vcpu->preempted)
        pi_set_sn(pi_desc);
}

vCPU 从 Running => Blocked

当vCPU在Running状态下非根模式执行hlt指令后会被VMM截获发生VM Exit（肯定不能让vCPU在非根模式下中止，这样会浪费CPU资源），这时候会调用vcpu_block函数来处理。

/*
 * Block the vCPU if it is not runnable, then settle its mp_state.
 *
 * Blocking happens in three steps: the optional pre_block hook (blocking
 * is skipped unless it returns 0), kvm_vcpu_block() with the SRCU read
 * lock dropped, and the optional post_block hook.
 *
 * Returns 1, or -EINTR if mp_state is none of HALTED, RUNNABLE or
 * INIT_RECEIVED after pending APIC events have been accepted.
 */
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
    if (!kvm_arch_vcpu_runnable(vcpu) &&
        (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
        /* Drop the SRCU read lock across the block and re-take it after. */
        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
        kvm_vcpu_block(vcpu);
        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);

        if (kvm_x86_ops->post_block)
            kvm_x86_ops->post_block(vcpu);

        /* No unhalt request pending: return without touching mp_state. */
        if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
            return 1;
    }

    kvm_apic_accept_events(vcpu);
    switch(vcpu->arch.mp_state) {
    case KVM_MP_STATE_HALTED:
        vcpu->arch.pv.pv_unhalted = false;
        vcpu->arch.mp_state =
            KVM_MP_STATE_RUNNABLE;
        /* fall through */
    case KVM_MP_STATE_RUNNABLE:
        vcpu->arch.apf.halted = false;
        break;
    case KVM_MP_STATE_INIT_RECEIVED:
        break;
    default:
        return -EINTR;
        break;  /* not reached */
    }
    return 1;
}

vcpu_block细分为3个阶段Pre Block, Block 和 Post Block。Pre Block阶段会调用pi_pre_block，这里会将vCPU添加到一个per pCPU的等待链表（waiting list）上，这个链表记录了所有在这个pCPU上休眠的vCPU列表，然后更新NDST域。

/*
 * Prepare the posted-interrupt descriptor before the vCPU blocks.
 *
 * Queues the vCPU on the blocked-vCPU list of vcpu->cpu (remembered in
 * vcpu->pre_pcpu), then atomically points PI.NDST at that CPU and
 * switches PI.NV to the wakeup vector.
 *
 * Returns 0 if the vCPU may go on to block (also when posted interrupts
 * are not in use for this vCPU), or 1 if an interrupt was already posted
 * and the setup has been undone via __pi_post_block().
 */
static int pi_pre_block(struct kvm_vcpu *vcpu)
{
    unsigned int dest;
    struct pi_desc old, new;
    struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

    /*
     * Nothing to do if the VM has no assigned (passthrough) device or
     * posted interrupts are not supported/active.
     */
    if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
        !irq_remapping_cap(IRQ_POSTING_CAP)  ||
        !kvm_vcpu_apicv_active(vcpu))
        return 0;

    /*
     * With interrupts disabled, add this vCPU to the blocked list of
     * the pCPU it last ran on.
     */
    WARN_ON(irqs_disabled());
    local_irq_disable();
    if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
        vcpu->pre_pcpu = vcpu->cpu;
        spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
        list_add_tail(&vcpu->blocked_vcpu_list,
                  &per_cpu(blocked_vcpu_on_cpu,
                       vcpu->pre_pcpu));
        spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
    }

    /* Refresh NDST and switch NV to the wakeup vector, atomically. */
    do {
        old.control = new.control = pi_desc->control;

        WARN((pi_desc->sn == 1),
             "Warning: SN field of posted-interrupts "
             "is set before blocking\n");

        /*
         * Since vCPU can be preempted during this process,
         * vcpu->cpu could be different with pre_pcpu, we
         * need to set pre_pcpu as the destination of wakeup
         * notification event, then we can find the right vCPU
         * to wakeup in wakeup handler if interrupts happen
         * when the vCPU is in blocked state.
         */
        dest = cpu_physical_id(vcpu->pre_pcpu);

        /* Outside x2APIC mode the destination is placed in bits 15:8. */
        if (x2apic_enabled())
            new.ndst = dest;
        else
            new.ndst = (dest << 8) & 0xFF00;

        /* set 'NV' to 'wakeup vector' */
        new.nv = POSTED_INTR_WAKEUP_VECTOR;
    } while (cmpxchg64(&pi_desc->control, old.control,
               new.control) != old.control);

    /*
     * We should not block the vCPU if an interrupt is posted for it:
     * if one arrived during the pre-block phase, undo the setup so the
     * caller skips blocking.
     */
    if (pi_test_on(pi_desc) == 1)
        __pi_post_block(vcpu);

    local_irq_enable();
    return (vcpu->pre_pcpu == -1);
}

Pre Block阶段过后会调用kvm_vcpu_block，在这个函数中会调用schedule()主动把vCPU调度出去（休眠），让出pCPU执行其他vCPU的代码。

vCPU 从 Blocked => Runnable

可以从这么一种场景理解：如果vcpu0和vcpu1都在同一个物理CPU上运行，某一时刻vcpu0正在运行，vcpu1还处于休眠状态，这时外部设备产生了一个中断需要注入到vcpu1上：

Device会按照初始化配置的MSI-X中断格式提交一个Interrupt Request，由于提交的是Remapping格式的中断，会被IOMMU截获。
IOMMU查询IRTE解析出vcpu1对应的PD和NV（notification vector），但此时vcpu1还在睡觉，因此NV是被设置成wakeup vector的。
物理CPU接收到wakeup interrupt，导致正在运行的vcpu0被kick到root模式下，在wakeup interrupt handler中遍历blocked_vcpu_on_cpu链表，得知vcpu1上有个中断需要处理，将vcpu1扔到运行队列中，将vcpu1从Blocked状态变为Runnable状态。

/*
 * Handler for POSTED_INTR_WAKEUP_VECTOR.
 *
 * Walks the blocked-vCPU list of the CPU it runs on, holding that list's
 * lock, and kicks every vCPU whose posted-interrupt descriptor has ON set.
 */
void pi_wakeup_handler(void)
{
    struct kvm_vcpu *vcpu;
    // ID of the physical CPU this handler is running on
    int cpu = smp_processor_id();

    // Walk this physical CPU's blocked-vCPU list
    spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
    list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
            blocked_vcpu_list) {
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

        // Check whether ON is set in this vCPU's posted-interrupt descriptor
        if (pi_test_on(pi_desc) == 1)
            kvm_vcpu_kick(vcpu);  // wake up the sleeping vCPU
    }
    spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
}

当vCPU休眠结束之后会调用vmx_post_block => __pi_post_block，这时候vCPU结束睡眠被重新调度。注意这里会更新NDST并将vCPU从pCPU等待链表上删除，并且把NV置为POSTED_INTR_VECTOR。

/*
 * Undo the posted-interrupt setup done before blocking.
 *
 * Atomically points PI.NDST at vcpu->cpu and restores PI.NV to
 * POSTED_INTR_VECTOR, then removes the vCPU from the blocked-vCPU list
 * of vcpu->pre_pcpu and resets pre_pcpu to -1.
 */
static void __pi_post_block(struct kvm_vcpu *vcpu)
{
    struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
    struct pi_desc old, new;
    unsigned int dest;

    /*
     * Refresh NDST again: after sleeping in the blocked state the vCPU
     * may be scheduled on a different pCPU than before.
     */
    do {
        old.control = new.control = pi_desc->control;
        WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
             "Wakeup handler not enabled while the VCPU is blocked\n");

        dest = cpu_physical_id(vcpu->cpu);

        /* Outside x2APIC mode the destination is placed in bits 15:8. */
        if (x2apic_enabled())
            new.ndst = dest;
        else
            new.ndst = (dest << 8) & 0xFF00;

        /* set 'NV' to 'notification vector' */
        new.nv = POSTED_INTR_VECTOR;
    } while (cmpxchg64(&pi_desc->control, old.control,
               new.control) != old.control);

    /* Remove the vCPU from the blocked list it was queued on. */
    if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
        spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
        list_del(&vcpu->blocked_vcpu_list);
        spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
        vcpu->pre_pcpu = -1;
    }
}

剩下一种状态转换路径是vCPU从Runnable => Blocked状态，这和从Running状态切换成Blocked状态一致，这里不再赘述！

整个VT-d Posted Interrupt 工作原理如下图所示：

vtd posted interrupt

VT-d Interrupt Posting代码分析

VT-d Posted Interrupt 代码分析

per-vCPU Posted Interrupt Descriptor

vCPU调度与VT-d Posted Interrupt

vCPU 从 Runnable => Running

vCPU 从 Running => Runnable

vCPU 从 Running => Blocked

vCPU 从 Blocked => Runnable

Related Posts

Published

Category

Tags

Contact