// openvmm · GitVita

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Interface to `mshv_vtl` driver.

mod deferred;

pub mod aarch64;
pub mod snp;
pub mod tdx;
pub mod x64;

use self::deferred::DeferredActionSlots;
use self::ioctls::*;
use crate::GuestVtl;
use crate::ioctl::deferred::DeferredAction;
use crate::mapped_page::MappedPage;
use crate::protocol;
use crate::protocol::EnterModes;
use crate::protocol::HCL_REG_PAGE_OFFSET;
use crate::protocol::HCL_VMSA_GUEST_VSM_PAGE_OFFSET;
use crate::protocol::HCL_VMSA_PAGE_OFFSET;
use crate::protocol::MSHV_APIC_PAGE_OFFSET;
use crate::protocol::hcl_intr_offload_flags;
use crate::protocol::hcl_run;
use deferred::RegisteredDeferredActions;
use deferred::push_deferred_action;
use deferred::register_deferred_actions;
use hv1_structs::ProcessorSet;
use hv1_structs::VtlArray;
use hvdef::HV_PAGE_SIZE;
use hvdef::HV_PARTITION_ID_SELF;
use hvdef::HV_VP_INDEX_SELF;
use hvdef::HvAllArchRegisterName;
#[cfg(guest_arch = "aarch64")]
use hvdef::HvArm64RegisterName;
use hvdef::HvError;
use hvdef::HvMapGpaFlags;
use hvdef::HvMessage;
use hvdef::HvRegisterName;
use hvdef::HvRegisterValue;
use hvdef::HvRegisterVsmPartitionConfig;
use hvdef::HvStatus;
use hvdef::HvX64RegisterName;
use hvdef::HvX64RegisterPage;
use hvdef::HypercallCode;
use hvdef::Vtl;
use hvdef::hypercall::AssertVirtualInterrupt;
use hvdef::hypercall::HostVisibilityType;
use hvdef::hypercall::HvGpaRange;
use hvdef::hypercall::HvGpaRangeExtended;
use hvdef::hypercall::HvInputVtl;
use hvdef::hypercall::HvInterceptParameters;
use hvdef::hypercall::HvInterceptType;
use hvdef::hypercall::HvRegisterAssoc;
use hvdef::hypercall::HypercallOutput;
use hvdef::hypercall::InitialVpContextX64;
use hvdef::hypercall::ModifyHostVisibility;
use memory_range::MemoryRange;
use pal::unix::pthread::*;
use parking_lot::Mutex;
use private::BackingPrivate;
use sidecar_client::NewSidecarClientError;
use sidecar_client::SidecarClient;
use sidecar_client::SidecarRun;
use sidecar_client::SidecarVp;
use std::cell::UnsafeCell;
use std::fmt::Debug;
use std::fs::File;
use std::io;
use std::os::unix::prelude::*;
use std::sync::Arc;
use std::sync::Once;
use std::sync::atomic::AtomicU8;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::Ordering;
use thiserror::Error;
use user_driver::DmaClient;
use user_driver::memory::MemoryBlock;
use x86defs::snp::SevVmsa;
use x86defs::tdx::TdCallResultCode;
use x86defs::vmx::ApicPage;
use zerocopy::FromBytes;
use zerocopy::FromZeros;
use zerocopy::Immutable;
use zerocopy::IntoBytes;
use zerocopy::KnownLayout;

/// Error returned by HCL operations.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum Error {
    #[error("failed to open mshv device")]
    OpenMshv(#[source] io::Error),
    #[error("failed to open hvcall device")]
    OpenHvcall(#[source] io::Error),
    #[error("failed to open lower VTL memory device")]
    OpenGpa(#[source] io::Error),
    #[error("ReturnToLowerVtl")]
    ReturnToLowerVtl(#[source] nix::Error),
    #[error("AddVtl0Memory")]
    AddVtl0Memory(#[source] nix::Error),
    #[error("hcl_set_vp_register")]
    SetVpRegister(#[source] nix::Error),
    #[error("hcl_get_vp_register")]
    GetVpRegister(#[source] nix::Error),
    #[error("failed to get VP register {reg:#x?} from hypercall")]
    GetVpRegisterHypercall {
        #[cfg(guest_arch = "x86_64")]
        reg: HvX64RegisterName,
        #[cfg(guest_arch = "aarch64")]
        reg: HvArm64RegisterName,
        #[source]
        err: HvError,
    },
    #[error("hcl_request_interrupt")]
    RequestInterrupt(#[source] HvError),
    #[error("hcl_cancel_vp failed")]
    CancelVp(#[source] nix::Error),
    #[error("failed to signal event")]
    SignalEvent(#[source] HvError),
    #[error("failed to post message")]
    PostMessage(#[source] HvError),
    #[error("failed to mmap the vp context {:?}", .1.map(|vtl| format!("for VTL {:?}", vtl)).unwrap_or("".to_string()))]
    MmapVp(#[source] io::Error, Option<Vtl>),
    #[error("failed to set the poll file")]
    SetPollFile(#[source] nix::Error),
    #[error("failed to check hcl capabilities")]
    CheckExtensions(#[source] nix::Error),
    #[error("failed to mmap the register page")]
    MmapRegPage(#[source] io::Error),
    #[error("invalid num signal events")]
    NumSignalEvent(#[source] io::Error),
    #[error("failed to create vtl")]
    CreateVTL(#[source] nix::Error),
    #[error("Gva to gpa translation failed")]
    TranslateGvaToGpa(#[source] TranslateGvaToGpaError),
    #[error("gpa failed vtl access check")]
    CheckVtlAccess(#[source] HvError),
    #[error("failed to set registers using set_vp_registers hypercall")]
    SetRegisters(#[source] HvError),
    #[error("Unknown register name: {0:x}")]
    UnknownRegisterName(u32),
    #[error("Invalid register value")]
    InvalidRegisterValue,
    #[error("failed to set host visibility")]
    SetHostVisibility(#[source] nix::Error),
    #[error("failed to allocate host overlay page")]
    HostOverlayPageExhausted,
    #[error("sidecar error")]
    Sidecar(#[source] sidecar_client::SidecarError),
    #[error("failed to open sidecar")]
    OpenSidecar(#[source] NewSidecarClientError),
    #[error(
        "mismatch between requested isolation type {requested:?} and supported isolation type {supported:?}"
    )]
    MismatchedIsolation {
        supported: IsolationType,
        requested: IsolationType,
    },
    #[error("private page pool allocator missing, required for requested isolation type")]
    MissingPrivateMemory,
    #[error("failed to allocate pages for vp")]
    AllocVp(#[source] anyhow::Error),
}

/// Error for IOCTL errors specifically.
///
/// Wraps the `nix::Error` returned by a failed ioctl and exposes it as the
/// error source; the display message is always `hcl request failed`.
#[derive(Debug, Error)]
#[error("hcl request failed")]
pub struct IoctlError(#[source] pub(crate) nix::Error);

/// Error returned when issuing hypercalls.
///
/// Distinguishes a failure reported by the hypervisor itself (`Hypervisor`)
/// from a failure of the ioctl used to reach it (`Ioctl`).
#[derive(Debug, Error)]
#[expect(missing_docs)]
pub enum HypercallError {
    // The hypervisor status is included in the message via `{0:?}`; it is not
    // attached as a `#[source]`.
    #[error("hypercall failed with {0:?}")]
    Hypervisor(HvError),
    #[error("ioctl failed")]
    Ioctl(#[source] IoctlError),
}

impl HypercallError {
    /// Converts the raw result of a hypercall ioctl into a `Result`.
    ///
    /// An ioctl-level failure becomes [`HypercallError::Ioctl`]. Otherwise the
    /// returned integer is interpreted as an [`HvStatus`], and a non-success
    /// status becomes [`HypercallError::Hypervisor`].
    ///
    /// # Panics
    ///
    /// Panics if the ioctl's return value does not fit in the integer type
    /// wrapped by [`HvStatus`].
    pub(crate) fn check(r: Result<i32, nix::Error>) -> Result<(), Self> {
        // Surface ioctl failures first; only a successful ioctl carries a
        // hypervisor status code.
        let raw_status = r.map_err(|err| Self::Ioctl(IoctlError(err)))?;
        let status = HvStatus(
            raw_status
                .try_into()
                .expect("hypervisor result out of range"),
        );
        status.result().map_err(Self::Hypervisor)
    }
}

/// Errors when issuing hypercalls via the kernel direct interface.
///
/// Only `HypercallIoctlFailed` carries an underlying error; the remaining
/// variants describe invalid input or output parameter layouts.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum HvcallError {
    #[error(
        "kernel rejected the hypercall, most likely due to the hypercall code not being allowed via set_allowed_hypercalls"
    )]
    HypercallIoctlFailed(#[source] nix::Error),
    #[error("input parameters are larger than a page")]
    InputParametersTooLarge,
    #[error("output parameters are larger than a page")]
    OutputParametersTooLarge,
    #[error("output and input list lengths do not match")]
    InputOutputRepListMismatch,
}

/// Error applying VTL protections.
///
/// There is one variant per mechanism named in the messages (hypervisor call,
/// SNP page operation, TDX tdcall), each recording the affected range and
/// target VTL, plus `InvalidVtl` for a VTL that has no valid protections.
// TODO: move to `underhill_mem`.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum ApplyVtlProtectionsError {
    #[error(
        "hypervisor returned {output:?} error {hv_error:?} when protecting pages {range} for vtl {vtl:?}"
    )]
    Hypervisor {
        range: MemoryRange,
        output: HypercallOutput,
        #[source]
        hv_error: HvError,
        vtl: HvInputVtl,
    },
    #[error(
        "{failed_operation} when protecting pages {range} with {permissions:x?} for vtl {vtl:?}"
    )]
    Snp {
        #[source]
        failed_operation: snp::SnpPageError,
        range: MemoryRange,
        permissions: x86defs::snp::SevRmpAdjust,
        vtl: HvInputVtl,
    },
    #[error(
        "tdcall failed with {error:?} when protecting pages {range} with permissions {permissions:x?} for vtl {vtl:?}"
    )]
    Tdx {
        error: TdCallResultCode,
        range: MemoryRange,
        permissions: x86defs::tdx::TdgMemPageGpaAttr,
        vtl: HvInputVtl,
    },
    #[error("no valid protections for vtl {0:?}")]
    InvalidVtl(Vtl),
}

/// Error setting guest VSM configuration.
///
/// Records the requested `enable_guest_vsm` value alongside the error the
/// hypervisor returned.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum SetGuestVsmConfigError {
    #[error(
        "hypervisor returned error {hv_error:?} when configuring guest vsm {enable_guest_vsm:?}"
    )]
    Hypervisor {
        enable_guest_vsm: bool,
        hv_error: HvError,
    },
}

/// Error getting the VP index from an APIC ID.
///
/// Records the APIC ID that was queried alongside the error the hypervisor
/// returned.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum GetVpIndexFromApicIdError {
    #[error("hypervisor returned error {hv_error:?} when querying vp index for {apic_id}")]
    Hypervisor { hv_error: HvError, apic_id: u32 },
}

/// Error setting VSM partition configuration.
///
/// Records the configuration value that was being applied alongside the error
/// the hypervisor returned.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum SetVsmPartitionConfigError {
    #[error(
        "hypervisor returned error {hv_error:?} when configuring vsm partition config {config:?}"
    )]
    Hypervisor {
        config: HvRegisterVsmPartitionConfig,
        hv_error: HvError,
    },
}

/// Error translating a GVA to a GPA.
///
/// Both variants record the guest virtual address that failed to translate.
/// `Hypervisor` carries the hypervisor's error code; `Sidecar` carries the
/// sidecar client error as its source.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum TranslateGvaToGpaError {
    #[error("hypervisor returned error {hv_error:?} on gva {gva:x}")]
    Hypervisor { gva: u64, hv_error: HvError },
    #[error("sidecar kernel failed on gva {gva:x}")]
    Sidecar {
        gva: u64,
        #[source]
        error: sidecar_client::SidecarError,
    },
}

/// Result from [`Hcl::check_vtl_access`] if VTL permissions were violated.
///
/// Identifies which VTL intercepted the access and which of the requested
/// access flags were denied.
#[derive(Debug)]
pub struct CheckVtlAccessResult {
    /// The intercepting VTL.
    pub vtl: Vtl,
    /// The flags that were denied.
    pub denied_flags: HvMapGpaFlags,
}

/// Error accepting pages.
// TODO: move to `underhill_mem`.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum AcceptPagesError {
    #[error("hypervisor returned {output:?} error {hv_error:?} when accepting pages {range}")]
    Hypervisor {
        range: MemoryRange,
        output: HypercallOutput,
        hv_error: HvError,
    },
    #[error("{failed_operation} when protecting pages {range}")]
    Snp {
        failed_operation: snp::SnpPageError,
        range: MemoryRange,
    },
    #[error("tdcall failed with {error:?} when accepting pages {range}")]
    Tdx {
        error: tdcall::AcceptPagesError,
        range: MemoryRange,
    },
}

/// Action to perform on a GPA range; translated to the corresponding HVCALL
/// for pinning or unpinning.
#[derive(Debug, Copy, Clone)]
enum GpaPinUnpinAction {
    /// Pin the GPA range.
    PinGpaRange,
    /// Unpin the GPA range.
    UnpinGpaRange,
}

/// Error pinning a GPA.
#[derive(Error, Debug)]
#[error("partial success: {ranges_processed} operations succeeded, but encountered an error")]
struct PinUnpinError {
    ranges_processed: usize,
    error: HvError,
}

/// Result of the translate GVA hypercall from [`Hcl`].
pub struct TranslateResult {
    /// The GPA that the GVA translated to.
    // NOTE(review): the field name suggests this is a page number rather than
    // a byte address — confirm against the producer of this value.
    pub gpa_page: u64,
    /// Whether the page was an overlay page.
    pub overlay_page: bool, // Note: hardcoded to false on WHP
}

/// Possible types for rep hypercalls.
///
/// A rep hypercall either iterates over an explicit list of elements of type
/// `T`, or only needs to know how many repetitions to perform.
enum HvcallRepInput<'a, T> {
    /// The actual elements to rep over
    Elements(&'a [T]),
    /// The elements for the rep are implied and only a count is needed
    Count(u16),
}

// Raw ioctl bindings for the `mshv` kernel module: ioctl numbers, the
// `repr(C)` argument structures, and the `unsafe` wrapper functions generated
// by the `nix` ioctl macros.
mod ioctls {
    // The structure and function names below intentionally keep their
    // kernel-style lowercase spelling.
    #![allow(non_camel_case_types)]

    use crate::protocol;
    use hvdef::hypercall::HvRegisterAssoc;
    use nix::ioctl_none;
    use nix::ioctl_read;
    use nix::ioctl_readwrite;
    use nix::ioctl_write_ptr;

    // The unsafe interface to the `mshv` kernel module comprises
    // the following IOCTLs.
    // `MSHV_IOCTL` is the ioctl identifier shared by every request below; the
    // remaining constants are the per-request sequence numbers handed to the
    // `nix` macros.
    const MSHV_IOCTL: u8 = 0xb8;
    const MSHV_VTL_RETURN_TO_LOWER_VTL: u16 = 0x27;
    const MSHV_SET_VP_REGISTERS: u16 = 0x6;
    const MSHV_GET_VP_REGISTERS: u16 = 0x5;
    const MSHV_HVCALL_SETUP: u16 = 0x1E;
    const MSHV_HVCALL: u16 = 0x1F;
    const MSHV_VTL_ADD_VTL0_MEMORY: u16 = 0x21;
    const MSHV_VTL_SET_POLL_FILE: u16 = 0x25;
    const MSHV_CREATE_VTL: u16 = 0x1D;
    const MSHV_CHECK_EXTENSION: u16 = 0x00;
    const MSHV_VTL_PVALIDATE: u16 = 0x28;
    const MSHV_VTL_RMPADJUST: u16 = 0x29;
    const MSHV_VTL_TDCALL: u16 = 0x32;
    const MSHV_VTL_READ_VMX_CR4_FIXED1: u16 = 0x33;
    const MSHV_VTL_GUEST_VSM_VMSA_PFN: u16 = 0x34;
    const MSHV_VTL_RMPQUERY: u16 = 0x35;
    const MSHV_INVLPGB: u16 = 0x36;
    const MSHV_TLBSYNC: u16 = 0x37;

    /// Argument for the get/set VP register ioctls: a count plus a pointer to
    /// an array of register name/value associations.
    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct mshv_vp_registers {
        // NOTE(review): presumably the number of entries `regs` points to —
        // confirm against the kernel definition.
        pub count: ::std::os::raw::c_int,
        pub regs: *mut HvRegisterAssoc,
    }

    /// Argument for the pvalidate ioctl: execute the pvalidate instruction on
    /// the set of memory pages specified.
    #[repr(C, packed)]
    #[derive(Copy, Clone)]
    pub struct mshv_pvalidate {
        pub start_pfn: ::std::os::raw::c_ulonglong,
        pub page_count: ::std::os::raw::c_ulonglong,
        pub validate: ::std::os::raw::c_uchar,
        pub terminate_on_failure: ::std::os::raw::c_uchar,
        /// Set to 1 if the page is RAM (from the kernel's perspective), 0 if
        /// it's device memory.
        pub ram: u8,
        pub padding: [::std::os::raw::c_uchar; 1],
    }

    /// Argument for the rmpadjust ioctl: execute the rmpadjust instruction on
    /// the set of memory pages specified.
    #[repr(C, packed)]
    #[derive(Copy, Clone)]
    pub struct mshv_rmpadjust {
        pub start_pfn: ::std::os::raw::c_ulonglong,
        pub page_count: ::std::os::raw::c_ulonglong,
        pub value: ::std::os::raw::c_ulonglong,
        pub terminate_on_failure: ::std::os::raw::c_uchar,
        /// Set to 1 if the page is RAM (from the kernel's perspective), 0 if
        /// it's device memory.
        pub ram: u8,
        pub padding: [::std::os::raw::c_uchar; 6],
    }

    /// Argument for the rmpquery ioctl: execute the rmpquery instruction on
    /// the set of memory pages specified. Results are written through the
    /// three output pointers.
    #[repr(C, packed)]
    #[derive(Copy, Clone)]
    pub struct mshv_rmpquery {
        pub start_pfn: ::std::os::raw::c_ulonglong,
        pub page_count: ::std::os::raw::c_ulonglong,
        pub terminate_on_failure: ::std::os::raw::c_uchar,
        /// Set to 1 if the page is RAM (from the kernel's perspective), 0 if
        /// it's device memory.
        pub ram: u8,
        pub padding: [::std::os::raw::c_uchar; 6],
        /// Output array for the flags, must have at least `page_count` entries.
        pub flags: *mut ::std::os::raw::c_ulonglong,
        /// Output array for the page sizes, must have at least `page_count` entries.
        pub page_size: *mut ::std::os::raw::c_ulonglong,
        /// Output for the amount of pages processed, a scalar.
        pub pages_processed: *mut ::std::os::raw::c_ulonglong,
    }

    /// Argument for the tdcall ioctl: the register values passed to and
    /// returned from the call.
    #[repr(C, packed)]
    #[derive(Copy, Clone)]
    pub struct mshv_tdcall {
        pub rax: u64, // Call code and returned status
        pub rcx: u64,
        pub rdx: u64,
        pub r8: u64,
        pub r9: u64,
        pub r10_out: u64, // only supported as output
        pub r11_out: u64, // only supported as output
    }

    ioctl_none!(
        /// Relinquish the processor to VTL0.
        hcl_return_to_lower_vtl,
        MSHV_IOCTL,
        MSHV_VTL_RETURN_TO_LOWER_VTL
    );

    ioctl_write_ptr!(
        /// Set a VTL0 register for the current processor of the current
        /// partition.
        /// It is not allowed to set registers for other processors or
        /// other partitions for the security and coherency reasons.
        hcl_set_vp_register,
        MSHV_IOCTL,
        MSHV_SET_VP_REGISTERS,
        mshv_vp_registers
    );

    ioctl_readwrite!(
        /// Get a VTL0 register for the current processor of the current
        /// partition.
        /// It is not allowed to get registers of other processors or
        /// other partitions for the security and coherency reasons.
        hcl_get_vp_register,
        MSHV_IOCTL,
        MSHV_GET_VP_REGISTERS,
        mshv_vp_registers
    );

    ioctl_write_ptr!(
        /// Adds the VTL0 memory as a ZONE_DEVICE memory (I/O) to support
        /// DMA from the guest.
        hcl_add_vtl0_memory,
        MSHV_IOCTL,
        MSHV_VTL_ADD_VTL0_MEMORY,
        protocol::hcl_pfn_range_t
    );

    ioctl_write_ptr!(
        /// Sets the file to be polled while running a VP in VTL0. If the file
        /// becomes readable, then the VP run will be cancelled.
        hcl_set_poll_file,
        MSHV_IOCTL,
        MSHV_VTL_SET_POLL_FILE,
        protocol::hcl_set_poll_file
    );

    ioctl_write_ptr!(
        /// Sets up the hypercall allow map. Allowed once
        /// per fd.
        hcl_hvcall_setup,
        MSHV_IOCTL,
        MSHV_HVCALL_SETUP,
        protocol::hcl_hvcall_setup
    );

    ioctl_readwrite!(
        /// Performs a hypercall from the user mode.
        hcl_hvcall,
        MSHV_IOCTL,
        MSHV_HVCALL,
        protocol::hcl_hvcall
    );

    ioctl_write_ptr!(
        /// Executes the pvalidate instruction on a page range.
        hcl_pvalidate_pages,
        MSHV_IOCTL,
        MSHV_VTL_PVALIDATE,
        mshv_pvalidate
    );

    ioctl_write_ptr!(
        /// Executes the rmpadjust instruction on a page range.
        hcl_rmpadjust_pages,
        MSHV_IOCTL,
        MSHV_VTL_RMPADJUST,
        mshv_rmpadjust
    );

    ioctl_write_ptr!(
        /// Executes the rmpquery instruction on a page range.
        hcl_rmpquery_pages,
        MSHV_IOCTL,
        MSHV_VTL_RMPQUERY,
        mshv_rmpquery
    );

    ioctl_readwrite!(
        /// Executes a tdcall.
        hcl_tdcall,
        MSHV_IOCTL,
        MSHV_VTL_TDCALL,
        mshv_tdcall
    );

    // NOTE(review): undocumented in the original; judging by the name this
    // reads the VMX CR4 fixed-1 value into the supplied `u64` — confirm
    // against the kernel driver.
    ioctl_read!(
        hcl_read_vmx_cr4_fixed1,
        MSHV_IOCTL,
        MSHV_VTL_READ_VMX_CR4_FIXED1,
        u64
    );

    // NOTE(review): undocumented in the original; judging by the name this
    // exchanges a `u64` to obtain the PFN of the guest VSM VMSA page — confirm
    // what the input value means against the kernel driver.
    ioctl_readwrite!(
        hcl_read_guest_vsm_page_pfn,
        MSHV_IOCTL,
        MSHV_VTL_GUEST_VSM_VMSA_PFN,
        u64
    );

    // Capability identifiers.
    // NOTE(review): presumably the values passed to `hcl_check_extension` —
    // confirm at the call sites.
    pub const HCL_CAP_REGISTER_PAGE: u32 = 1;
    pub const HCL_CAP_VTL_RETURN_ACTION: u32 = 2;
    pub const HCL_CAP_DR6_SHARED: u32 = 3;

    ioctl_write_ptr!(
        /// Check for the presence of an extension capability.
        hcl_check_extension,
        MSHV_IOCTL,
        MSHV_CHECK_EXTENSION,
        u32
    );

    // Creates a VTL device; the ioctl's integer return value is used by the
    // caller in this file as the new file descriptor.
    ioctl_read!(mshv_create_vtl, MSHV_IOCTL, MSHV_CREATE_VTL, u8);

    /// Argument for the INVLPGB ioctl.
    // NOTE(review): the fields are presumably the register operands of the
    // instruction (rax/edx/ecx) with explicit padding — confirm the layout
    // against the kernel definition.
    #[repr(C)]
    pub struct mshv_invlpgb {
        pub rax: u64,
        pub _pad0: u32,
        pub edx: u32,
        pub _pad1: u32,
        pub ecx: u32,
    }

    ioctl_write_ptr!(
        /// Issue an INVLPGB instruction.
        hcl_invlpgb,
        MSHV_IOCTL,
        MSHV_INVLPGB,
        mshv_invlpgb
    );

    ioctl_none!(
        /// Issue a TLBSYNC instruction.
        hcl_tlbsync,
        MSHV_IOCTL,
        MSHV_TLBSYNC
    );
}

/// The `/dev/mshv_vtl_low` device for accessing VTL0 memory.
pub struct MshvVtlLow {
    file: File,
}

impl MshvVtlLow {
    /// The flag to set in the file offset to map guest memory as shared instead
    /// of private.
    pub const SHARED_MEMORY_FLAG: u64 = 1 << 63;

    /// Opens `/dev/mshv_vtl_low` for reading and writing.
    ///
    /// # Errors
    ///
    /// Returns [`Error::OpenGpa`] if the device cannot be opened.
    pub fn new() -> Result<Self, Error> {
        let mut options = fs_err::OpenOptions::new();
        options.read(true).write(true);
        let device = options
            .open("/dev/mshv_vtl_low")
            .map_err(Error::OpenGpa)?;

        Ok(Self {
            file: device.into(),
        })
    }

    /// Gets the device file.
    pub fn get(&self) -> &File {
        let Self { file } = self;
        file
    }
}

/// An open `/dev/mshv` device file.
pub struct Mshv {
    file: File,
}

impl Mshv {
    /// Opens the mshv device.
    pub fn new() -> Result<Self, Error> {
        let file = fs_err::OpenOptions::new()
            .read(true)
            .write(true)
            .open("/dev/mshv")
            .map_err(Error::OpenMshv)?;

        Ok(Self { file: file.into() })
    }

    fn check_extension(&self, cap: u32) -> Result<bool, Error> {
        // SAFETY: calling IOCTL as documented, with no special requirements.
        let supported = unsafe {
            hcl_check_extension(self.file.as_raw_fd(), &cap).map_err(Error::CheckExtensions)?
        };
        Ok(supported != 0)
    }

    /// Opens an mshv_vtl device file.
    pub fn create_vtl(&self) -> Result<MshvVtl, Error> {
        let cap = &mut 0_u8;
        // SAFETY: calling IOCTL as documented, with no special requirements.
        let supported =
            unsafe { mshv_create_vtl(self.file.as_raw_fd(), cap).map_err(Error::CreateVTL)? };
        // SAFETY: calling IOCTL as documented, with no special requirements.
        let vtl_file = unsafe { File::from_raw_fd(supported) };
        Ok(MshvVtl { file: vtl_file })
    }
}

/// An open mshv_vtl device file.
#[derive(Debug)]
pub struct MshvVtl {
    file: File,
}

impl MshvVtl {
    /// Adds the VTL0 memory as a ZONE_DEVICE memory (I/O) to support DMA from the guest.
    pub fn add_vtl0_memory(&self, mem_range: MemoryRange, shared: bool) -> Result<(), Error> {
        let flags = if shared {
            MshvVtlLow::SHARED_MEMORY_FLAG / HV_PAGE_SIZE
        } else {
            0
        };
        let ram_disposition = protocol::hcl_pfn_range_t {
            start_pfn: mem_range.start_4k_gpn() | flags,
            last_pfn: mem_range.end_4k_gpn(),
        };

        // SAFETY: calling IOCTL as documented, with no special requirements.
        unsafe {
            hcl_add_vtl0_memory(self.file.as_raw_fd(), &ram_disposition)
                .map_err(Error::AddVtl0Memory)?;
        }

        Ok(())
    }
}

/// Indicate whether `reg` is one of the MTRR MSRs treated as shared across
/// VTLs: the capability and default-type registers, the variable-range
/// base/mask pairs 0 through F, and the fixed-range registers.
#[cfg(guest_arch = "x86_64")]
fn is_vtl_shared_mtrr(reg: HvX64RegisterName) -> bool {
    let is_control = matches!(
        reg,
        HvX64RegisterName::MsrMtrrCap | HvX64RegisterName::MsrMtrrDefType
    );

    let is_phys_base = matches!(
        reg,
        HvX64RegisterName::MsrMtrrPhysBase0
            | HvX64RegisterName::MsrMtrrPhysBase1
            | HvX64RegisterName::MsrMtrrPhysBase2
            | HvX64RegisterName::MsrMtrrPhysBase3
            | HvX64RegisterName::MsrMtrrPhysBase4
            | HvX64RegisterName::MsrMtrrPhysBase5
            | HvX64RegisterName::MsrMtrrPhysBase6
            | HvX64RegisterName::MsrMtrrPhysBase7
            | HvX64RegisterName::MsrMtrrPhysBase8
            | HvX64RegisterName::MsrMtrrPhysBase9
            | HvX64RegisterName::MsrMtrrPhysBaseA
            | HvX64RegisterName::MsrMtrrPhysBaseB
            | HvX64RegisterName::MsrMtrrPhysBaseC
            | HvX64RegisterName::MsrMtrrPhysBaseD
            | HvX64RegisterName::MsrMtrrPhysBaseE
            | HvX64RegisterName::MsrMtrrPhysBaseF
    );

    let is_phys_mask = matches!(
        reg,
        HvX64RegisterName::MsrMtrrPhysMask0
            | HvX64RegisterName::MsrMtrrPhysMask1
            | HvX64RegisterName::MsrMtrrPhysMask2
            | HvX64RegisterName::MsrMtrrPhysMask3
            | HvX64RegisterName::MsrMtrrPhysMask4
            | HvX64RegisterName::MsrMtrrPhysMask5
            | HvX64RegisterName::MsrMtrrPhysMask6
            | HvX64RegisterName::MsrMtrrPhysMask7
            | HvX64RegisterName::MsrMtrrPhysMask8
            | HvX64RegisterName::MsrMtrrPhysMask9
            | HvX64RegisterName::MsrMtrrPhysMaskA
            | HvX64RegisterName::MsrMtrrPhysMaskB
            | HvX64RegisterName::MsrMtrrPhysMaskC
            | HvX64RegisterName::MsrMtrrPhysMaskD
            | HvX64RegisterName::MsrMtrrPhysMaskE
            | HvX64RegisterName::MsrMtrrPhysMaskF
    );

    let is_fixed_range = matches!(
        reg,
        HvX64RegisterName::MsrMtrrFix64k00000
            | HvX64RegisterName::MsrMtrrFix16k80000
            | HvX64RegisterName::MsrMtrrFix16kA0000
            | HvX64RegisterName::MsrMtrrFix4kC0000
            | HvX64RegisterName::MsrMtrrFix4kC8000
            | HvX64RegisterName::MsrMtrrFix4kD0000
            | HvX64RegisterName::MsrMtrrFix4kD8000
            | HvX64RegisterName::MsrMtrrFix4kE0000
            | HvX64RegisterName::MsrMtrrFix4kE8000
            | HvX64RegisterName::MsrMtrrFix4kF0000
            | HvX64RegisterName::MsrMtrrFix4kF8000
    );

    is_control || is_phys_base || is_phys_mask || is_fixed_range
}

/// Indicate whether reg is shared across VTLs.
///
/// This function is not complete: DR6 may or may not be shared, depending on
/// the processor type; the caller needs to check HvRegisterVsmCapabilities.
/// Some MSRs are not included here as they are not represented in
/// HvX64RegisterName, including MSR_TSC_FREQUENCY, MSR_MCG_CAP,
/// MSR_MCG_STATUS, MSR_RESET, MSR_GUEST_IDLE, and MSR_DEBUG_DEVICE_OPTIONS.
#[cfg(guest_arch = "x86_64")]
fn is_vtl_shared_reg(reg: HvX64RegisterName) -> bool {
    // MTRR MSRs are classified by the dedicated helper.
    if is_vtl_shared_mtrr(reg) {
        return true;
    }

    let is_vp_info_or_time = matches!(
        reg,
        HvX64RegisterName::VpIndex
            | HvX64RegisterName::VpRuntime
            | HvX64RegisterName::TimeRefCount
    );

    // General-purpose registers plus CR2. Rsp is not part of this list.
    let is_general_purpose = matches!(
        reg,
        HvX64RegisterName::Rax
            | HvX64RegisterName::Rbx
            | HvX64RegisterName::Rcx
            | HvX64RegisterName::Rdx
            | HvX64RegisterName::Rsi
            | HvX64RegisterName::Rdi
            | HvX64RegisterName::Rbp
            | HvX64RegisterName::Cr2
            | HvX64RegisterName::R8
            | HvX64RegisterName::R9
            | HvX64RegisterName::R10
            | HvX64RegisterName::R11
            | HvX64RegisterName::R12
            | HvX64RegisterName::R13
            | HvX64RegisterName::R14
            | HvX64RegisterName::R15
    );

    let is_debug = matches!(
        reg,
        HvX64RegisterName::Dr0
            | HvX64RegisterName::Dr1
            | HvX64RegisterName::Dr2
            | HvX64RegisterName::Dr3
    );

    let is_xmm = matches!(
        reg,
        HvX64RegisterName::Xmm0
            | HvX64RegisterName::Xmm1
            | HvX64RegisterName::Xmm2
            | HvX64RegisterName::Xmm3
            | HvX64RegisterName::Xmm4
            | HvX64RegisterName::Xmm5
            | HvX64RegisterName::Xmm6
            | HvX64RegisterName::Xmm7
            | HvX64RegisterName::Xmm8
            | HvX64RegisterName::Xmm9
            | HvX64RegisterName::Xmm10
            | HvX64RegisterName::Xmm11
            | HvX64RegisterName::Xmm12
            | HvX64RegisterName::Xmm13
            | HvX64RegisterName::Xmm14
            | HvX64RegisterName::Xmm15
    );

    let is_fp_state = matches!(
        reg,
        HvX64RegisterName::FpMmx0
            | HvX64RegisterName::FpMmx1
            | HvX64RegisterName::FpMmx2
            | HvX64RegisterName::FpMmx3
            | HvX64RegisterName::FpMmx4
            | HvX64RegisterName::FpMmx5
            | HvX64RegisterName::FpMmx6
            | HvX64RegisterName::FpMmx7
            | HvX64RegisterName::FpControlStatus
            | HvX64RegisterName::XmmControlStatus
            | HvX64RegisterName::Xfem
    );

    is_vp_info_or_time || is_general_purpose || is_debug || is_xmm || is_fp_state
}

/// Indicate whether reg is shared across VTLs.
///
/// The shared set is X0 through X17, X19 through X28, the frame pointer and
/// the link register.
// NOTE(review): X18 is absent from the list — presumably intentional; confirm
// against the hypervisor's definition of VTL-shared registers.
#[cfg(guest_arch = "aarch64")]
fn is_vtl_shared_reg(reg: HvArm64RegisterName) -> bool {
    use hvdef::HvArm64RegisterName;

    let is_x0_to_x17 = matches!(
        reg,
        HvArm64RegisterName::X0
            | HvArm64RegisterName::X1
            | HvArm64RegisterName::X2
            | HvArm64RegisterName::X3
            | HvArm64RegisterName::X4
            | HvArm64RegisterName::X5
            | HvArm64RegisterName::X6
            | HvArm64RegisterName::X7
            | HvArm64RegisterName::X8
            | HvArm64RegisterName::X9
            | HvArm64RegisterName::X10
            | HvArm64RegisterName::X11
            | HvArm64RegisterName::X12
            | HvArm64RegisterName::X13
            | HvArm64RegisterName::X14
            | HvArm64RegisterName::X15
            | HvArm64RegisterName::X16
            | HvArm64RegisterName::X17
    );

    let is_x19_to_x28 = matches!(
        reg,
        HvArm64RegisterName::X19
            | HvArm64RegisterName::X20
            | HvArm64RegisterName::X21
            | HvArm64RegisterName::X22
            | HvArm64RegisterName::X23
            | HvArm64RegisterName::X24
            | HvArm64RegisterName::X25
            | HvArm64RegisterName::X26
            | HvArm64RegisterName::X27
            | HvArm64RegisterName::X28
    );

    let is_frame_or_link = matches!(reg, HvArm64RegisterName::XFp | HvArm64RegisterName::XLr);

    is_x0_to_x17 || is_x19_to_x28 || is_frame_or_link
}

/// The `/dev/mshv_hvcall` device for issuing hypercalls directly to the
/// hypervisor.
///
/// Wraps the open device file; the kernel rejects hypercalls whose codes have
/// not been allowed via `set_allowed_hypercalls`.
#[derive(Debug)]
pub struct MshvHvcall(File);

impl MshvHvcall {
    /// Opens the device.
    pub fn new() -> Result<Self, Error> {
        let file = fs_err::OpenOptions::new()
            .read(true)
            .write(true)
            .open("/dev/mshv_hvcall")
            .map_err(Error::OpenHvcall)?;

        Ok(Self(file.into()))
    }

    /// Set allowed hypercalls.
    ///
    /// Builds a bitmap of `u64` words in which bit number `code.0` is set for
    /// every entry of `codes`, then passes it to the driver through the
    /// hypercall setup ioctl.
    ///
    /// # Panics
    /// Panics if the setup ioctl fails.
    pub fn set_allowed_hypercalls(&self, codes: &[HypercallCode]) {
        const WORD_BYTES: usize = size_of::<u64>();
        const WORD_BITS: usize = WORD_BYTES * 8;

        // The bitmap grows on demand so that it always covers the largest code
        // seen so far.
        let mut bitmap: Vec<u64> = Vec::new();
        for code in codes.iter().map(|code| code.0) {
            let word = code as usize / WORD_BITS;
            if bitmap.len() <= word {
                bitmap.resize(word + 1, 0);
            }
            bitmap[word] |= 1u64 << (code % WORD_BITS as u16);
        }

        let setup = protocol::hcl_hvcall_setup {
            allow_bitmap_size: (bitmap.len() * WORD_BYTES) as u64,
            allow_bitmap_ptr: bitmap.as_ptr(),
        };

        // SAFETY: following the IOCTL definition. `bitmap` is not dropped until
        // after the call returns, so the pointer in `setup` stays valid.
        unsafe {
            hcl_hvcall_setup(self.0.as_raw_fd(), &setup)
                .expect("Hypercall setup IOCTL must be supported");
        }
    }

    /// Accepts VTL 0 pages with no host visibility.
    ///
    /// [`HypercallCode::HvCallAcceptGpaPages`] must be allowed.
    ///
    /// The page range is walked in batches; each batch is submitted as one rep
    /// hypercall whose header names the first page of the batch and whose rep
    /// count is the number of pages in it.
    ///
    /// # Errors
    /// Returns [`AcceptPagesError::Hypervisor`] describing the failing batch if
    /// the hypervisor rejects a call.
    ///
    /// # Panics
    /// Panics if the hypercall cannot be submitted to the kernel, or if a
    /// successful call reports a processed count different from the batch size.
    pub fn accept_gpa_pages(
        &self,
        range: MemoryRange,
        memory_type: hvdef::hypercall::AcceptMemoryType,
    ) -> Result<(), AcceptPagesError> {
        use hvdef::hypercall::AcceptGpaPages;
        use hvdef::hypercall::AcceptPagesAttributes;
        use hvdef::hypercall::VtlPermissionSet;

        const MAX_INPUT_ELEMENTS: usize =
            (HV_PAGE_SIZE as usize - size_of::<AcceptGpaPages>()) / size_of::<u64>();

        let span = tracing::info_span!("accept_pages", ?range);
        let _enter = span.enter();

        let last_page = range.end() / HV_PAGE_SIZE;
        let mut next_page = range.start() / HV_PAGE_SIZE;

        while next_page < last_page {
            let batch = (last_page - next_page).min(MAX_INPUT_ELEMENTS as u64);

            let header = AcceptGpaPages {
                partition_id: HV_PARTITION_ID_SELF,
                page_attributes: AcceptPagesAttributes::new()
                    .with_memory_type(memory_type.0)
                    .with_host_visibility(HostVisibilityType::PRIVATE)
                    .with_vtl_set(0), // vtl protections cannot be applied for VTL 0 memory
                vtl_permission_set: VtlPermissionSet {
                    vtl_permission_from_1: [0; hvdef::hypercall::HV_VTL_PERMISSION_SET_SIZE],
                },
                gpa_page_base: next_page,
            };

            // SAFETY: The input header is the correct type for this hypercall.
            //         `u8` is a placeholder for the input and output rep
            //         types, since only a rep count is passed and no output
            //         list is requested. `header` is a valid reference for
            //         the duration of the call.
            //
            //         The hypercall output is validated right after the hypercall is issued.
            let output = unsafe {
                self.hvcall_rep::<AcceptGpaPages, u8, u8>(
                    HypercallCode::HvCallAcceptGpaPages,
                    &header,
                    HvcallRepInput::Count(batch as u16),
                    None,
                )
                .expect("kernel hypercall submission should always succeed")
            };

            if let Err(hv_error) = output.result() {
                return Err(AcceptPagesError::Hypervisor {
                    range: MemoryRange::from_4k_gpn_range(next_page..next_page + batch),
                    output,
                    hv_error,
                });
            }

            assert_eq!(output.elements_processed() as u64, batch);
            next_page += batch;
        }
        Ok(())
    }

    /// Modifies the host visibility of the given pages.
    ///
    /// [`HypercallCode::HvCallModifySparseGpaPageHostVisibility`] must be allowed.
    ///
    /// `gpns` is consumed in batches that fit in a single hypercall input
    /// page. After each call the slice is advanced by the number of elements
    /// the hypervisor reports as processed.
    ///
    /// # Errors
    /// Returns the hypervisor error of the first call that fails with anything
    /// other than a timeout.
    //
    // TODO SNP: this isn't really safe. Probably this should be an IOCTL in the
    // kernel so that it can validate the page ranges are VTL0 memory.
    pub fn modify_gpa_visibility(
        &self,
        host_visibility: HostVisibilityType,
        mut gpns: &[u64],
    ) -> Result<(), HvError> {
        const GPNS_PER_CALL: usize = (HV_PAGE_SIZE as usize
            - size_of::<hvdef::hypercall::ModifySparsePageVisibility>())
            / size_of::<u64>();

        // The header does not depend on the batch, so build it once.
        let header = hvdef::hypercall::ModifySparsePageVisibility {
            partition_id: HV_PARTITION_ID_SELF,
            host_visibility: ModifyHostVisibility::new().with_host_visibility(host_visibility),
            reserved: 0,
        };

        while !gpns.is_empty() {
            let batch_len = gpns.len().min(GPNS_PER_CALL);
            let (batch, _) = gpns.split_at(batch_len);

            // SAFETY: The input header and rep slice are the correct types for this hypercall.
            //         The hypercall output is validated right after the hypercall is issued.
            let result = unsafe {
                self.hvcall_rep(
                    HypercallCode::HvCallModifySparseGpaPageHostVisibility,
                    &header,
                    HvcallRepInput::Elements(batch),
                    None::<&mut [u8]>,
                )
                .unwrap()
            };

            let processed = result.elements_processed();
            match result.result() {
                Ok(()) => assert_eq!(processed, batch_len),
                // On a timeout, resume from wherever the hypervisor stopped.
                Err(HvError::Timeout) => {}
                Err(e) => return Err(e),
            }
            gpns = &gpns[processed..];
        }
        Ok(())
    }

    /// Given a constructed hcl_hvcall protocol object, issues an IOCTL to invoke a hypercall via
    /// the direct hypercall kernel interface. This function will retry hypercalls if the hypervisor
    /// times out the hypercall.
    ///
    /// Input and output data are referenced as pointers in the call object.
    ///
    /// `Ok(HypercallOutput)` is returned if the kernel was successful in issuing the hypercall. A
    /// caller must check the return value for the result of the hypercall.
    ///
    /// Before invoking hypercalls, a list of hypercalls that are allowed
    /// has to be set with `Hcl::set_allowed_hypercalls`:
    /// ```ignore
    /// set_allowed_hypercalls(&[
    ///     hvdef::HypercallCode::HvCallCheckForIoIntercept,
    ///     hvdef::HypercallCode::HvCallInstallIntercept,
    /// ]);
    /// ```
    ///
    /// # Errors
    /// Returns [`HvcallError::HypercallIoctlFailed`] if the ioctl itself fails.
    ///
    /// # Panics
    /// Panics if the final (non-timeout) status is inconsistent with the requested rep count:
    /// a non-rep call must report zero elements processed, a successful rep call must report
    /// exactly the rep count, and a failed rep call must report fewer than the rep count.
    ///
    /// # Safety
    /// This function makes no guarantees that the given input header, input and output types are
    /// valid for the given hypercall. It is the caller's responsibility to use the correct types
    /// with the specified hypercall.
    ///
    /// The caller must ensure that the input and output data are valid for the lifetime of this
    /// call.
    ///
    /// A caller must check the returned [HypercallOutput] for success or failure from the
    /// hypervisor.
    ///
    /// Hardware isolated VMs cannot trust the output from the hypervisor and so it must be
    /// validated by the caller if needed.
    unsafe fn invoke_hvcall_ioctl(
        &self,
        mut call_object: protocol::hcl_hvcall,
    ) -> Result<HypercallOutput, HvcallError> {
        loop {
            // SAFETY: following the IOCTL definition. Per this function's safety contract, the
            // caller guarantees that the data referenced by the call object stays valid for
            // the duration of this call.
            //
            // The hypervisor is trusted to fill out the output page with a valid
            // representation of an instance of the output type, except in the case of hardware
            // isolated VMs where the caller must validate output as needed.
            unsafe {
                hcl_hvcall(self.0.as_raw_fd(), &mut call_object)
                    .map_err(HvcallError::HypercallIoctlFailed)?;
            }

            if call_object.status.call_status() == Err(HvError::Timeout).into() {
                // Any hypercall can timeout, even one that doesn't have reps. Continue processing
                // from wherever the hypervisor left off.  The rep start index isn't checked for
                // validity, since it is only being used as an input to the untrusted hypervisor.
                // This applies to both simple and rep hypercalls.
                call_object
                    .control
                    .set_rep_start(call_object.status.elements_processed());
            } else {
                if call_object.control.rep_count() == 0 {
                    // For non-rep hypercalls, the elements processed field should be 0.
                    assert_eq!(call_object.status.elements_processed(), 0);
                } else {
                    // Hardware isolated VMs cannot trust output from the hypervisor, but check for
                    // consistency between the number of elements processed and the expected count. A
                    // violation of this assertion indicates a buggy or malicious hypervisor.
                    assert!(
                        (call_object.status.result().is_ok()
                            && call_object.control.rep_count()
                                == call_object.status.elements_processed())
                            || (call_object.status.result().is_err()
                                && call_object.control.rep_count()
                                    > call_object.status.elements_processed())
                    );
                }

                return Ok(call_object.status);
            }
        }
    }

    /// Issues a non-rep hypercall to the hypervisor via the direct hypercall kernel interface.
    /// This is not intended to be used directly by external callers, but rather via safe hypercall
    /// wrappers. This call constructs the appropriate hypercall input control from the described
    /// parameters.
    ///
    /// `Ok(HypercallOutput)` is returned if the kernel was successful in issuing the hypercall. A caller must check the
    /// return value for the result of the hypercall.
    ///
    /// `code` is the hypercall code.
    /// `input` is the input type required by the hypercall.
    /// `output` is the output type required by the hypercall.
    ///
    /// Before invoking hypercalls, a list of hypercalls that are allowed
    /// has to be set with `Hcl::set_allowed_hypercalls`:
    /// ```ignore
    /// set_allowed_hypercalls(&[
    ///     hvdef::HypercallCode::HvCallCheckForIoIntercept,
    ///     hvdef::HypercallCode::HvCallInstallIntercept,
    /// ]);
    /// ```
    ///
    /// # Panics
    /// Panics if `I` or `O` is larger than one hypervisor page.
    ///
    /// # Safety
    /// This function makes no guarantees that the given input header, input and output types are valid for the
    /// given hypercall. It is the caller's responsibility to use the correct types with the specified hypercall.
    ///
    /// A caller must check the returned [HypercallOutput] for success or failure from the hypervisor.
    ///
    /// Hardware isolated VMs cannot trust the output from the hypervisor and so it must be validated by the
    /// caller if needed.
    unsafe fn hvcall<I, O>(
        &self,
        code: HypercallCode,
        input: &I,
        output: &mut O,
    ) -> Result<HypercallOutput, HvcallError>
    where
        I: IntoBytes + Sized + Immutable + KnownLayout,
        O: IntoBytes + FromBytes + Sized + Immutable + KnownLayout,
    {
        // Both the input and the output must fit in a single page. The helper is a
        // `const fn`, but it is invoked as an ordinary call here.
        const fn assert_size<I, O>()
        where
            I: Sized,
            O: Sized,
        {
            assert!(size_of::<I>() <= HV_PAGE_SIZE as usize);
            assert!(size_of::<O>() <= HV_PAGE_SIZE as usize);
        }
        assert_size::<I, O>();

        let control = hvdef::hypercall::Control::new().with_code(code.0);

        // NOTE(review): `output_data` is derived from a shared byte view (`as_bytes`) of
        // `output`, although the output is presumably written during the call — confirm
        // that this is intended.
        let call_object = protocol::hcl_hvcall {
            control,
            input_data: input.as_bytes().as_ptr().cast(),
            input_size: size_of::<I>(),
            status: FromZeros::new_zeroed(),
            output_data: output.as_bytes().as_ptr().cast(),
            output_size: size_of::<O>(),
        };

        // SAFETY: The data referenced in the call is borrowed from `input` and `output`, which
        // remain valid for the duration of this call.
        unsafe { self.invoke_hvcall_ioctl(call_object) }
    }

    /// Issues a rep hypercall to the hypervisor via the direct hypercall kernel
    /// interface. Like the non-rep version, this is not intended to be used
    /// externally other than to construct safe wrappers. This call constructs
    /// the appropriate hypercall input control from the described parameters.
    ///
    /// `Ok(HypercallOutput)` is returned if the kernel was successful in
    /// issuing the hypercall. A caller must check the return value for the
    /// result of the hypercall.
    ///
    /// `code` is the hypercall code. `input_header` is the hypercall fixed
    /// length input header. Variable length headers are not supported.
    /// `input_rep` is either a list of input elements
    /// (`HvcallRepInput::Elements`), in which case the elements are appended
    /// to the header and the length of the slice is used as the rep count, or
    /// a bare rep count (`HvcallRepInput::Count`), in which case only the
    /// header is sent.
    ///
    /// `output_rep` is the optional output rep list. A caller must check the
    /// returned [HypercallOutput] for the number of valid elements in this
    /// list.
    ///
    /// # Errors
    /// Returns [`HvcallError::InputParametersTooLarge`] if the header plus
    /// input elements exceed one page,
    /// [`HvcallError::OutputParametersTooLarge`] if the output list exceeds
    /// one page, and [`HvcallError::InputOutputRepListMismatch`] if an output
    /// list is given whose length differs from the rep count. Errors from
    /// issuing the ioctl are passed through.
    ///
    /// # Safety
    /// This function makes no guarantees that the given input header, input rep
    /// and output rep types are valid for the given hypercall. It is the
    /// caller's responsibility to use the correct types with the specified
    /// hypercall.
    ///
    /// A caller must check the returned [HypercallOutput] for success or
    /// failure from the hypervisor and processed rep count.
    ///
    /// Hardware isolated VMs cannot trust output from the hypervisor. This
    /// routine will ensure that the hypervisor either returns success with all
    /// elements processed, or returns failure with an incomplete number of
    /// elements processed. Actual validation of the output elements is the
    /// responsibility of the caller.
    unsafe fn hvcall_rep<InputHeader, InputRep, O>(
        &self,
        code: HypercallCode,
        input_header: &InputHeader,
        input_rep: HvcallRepInput<'_, InputRep>,
        output_rep: Option<&mut [O]>,
    ) -> Result<HypercallOutput, HvcallError>
    where
        InputHeader: IntoBytes + Sized + Immutable + KnownLayout,
        InputRep: IntoBytes + Sized + Immutable + KnownLayout,
        O: IntoBytes + FromBytes + Sized + Immutable + KnownLayout,
    {
        // Construct input buffer. The header (and elements, if any) are copied
        // into an owned buffer that lives until this function returns.
        let (input, count) = match input_rep {
            HvcallRepInput::Elements(e) => {
                ([input_header.as_bytes(), e.as_bytes()].concat(), e.len())
            }
            HvcallRepInput::Count(c) => (input_header.as_bytes().to_vec(), c.into()),
        };

        if input.len() > HV_PAGE_SIZE as usize {
            return Err(HvcallError::InputParametersTooLarge);
        }

        if let Some(output_rep) = &output_rep {
            if output_rep.as_bytes().len() > HV_PAGE_SIZE as usize {
                return Err(HvcallError::OutputParametersTooLarge);
            }

            if count != output_rep.len() {
                return Err(HvcallError::InputOutputRepListMismatch);
            }
        }

        // With no output list, a null pointer and a zero size are passed.
        // NOTE(review): the output pointer is derived from a shared byte view
        // (`as_bytes`) of the mutable slice — confirm that this is intended.
        let (output_data, output_size) = match output_rep {
            Some(output_rep) => (
                output_rep.as_bytes().as_ptr().cast(),
                output_rep.as_bytes().len(),
            ),
            None => (std::ptr::null(), 0),
        };

        let control = hvdef::hypercall::Control::new()
            .with_code(code.0)
            .with_rep_count(count);

        let call_object = protocol::hcl_hvcall {
            control,
            input_data: input.as_ptr().cast(),
            input_size: input.len(),
            status: HypercallOutput::new(),
            output_data,
            output_size,
        };

        // SAFETY: The input data is owned by the local `input` buffer and the
        // output data is borrowed from `output_rep`; both remain valid for the
        // duration of this call.
        unsafe { self.invoke_hvcall_ioctl(call_object) }
    }

    /// Issues a non-rep hypercall with variable input to the hypervisor via the direct hypercall kernel interface.
    /// This is not intended to be used directly by external callers, but rather via safe hypercall wrappers.
    /// This call constructs the appropriate hypercall input control from the described parameters.
    ///
    /// `Ok(HypercallOutput)` is returned if the kernel was successful in issuing the hypercall. A caller must check the
    /// return value for the result of the hypercall.
    ///
    /// `code` is the hypercall code.
    /// `input` is the input type required by the hypercall.
    /// `output` is the output type required by the hypercall.
    /// `variable_input` is the contents of the variable input to the hypercall. The length must be a multiple of 8 bytes.
    ///
    /// # Errors
    /// Returns [`HvcallError::InputParametersTooLarge`] if `input` followed by `variable_input` exceeds one page.
    /// Errors from issuing the ioctl are passed through.
    ///
    /// # Panics
    /// Panics if `I` or `O` is larger than one hypervisor page, or if the length of `variable_input` is not a
    /// multiple of 8 bytes.
    ///
    /// # Safety
    /// This function makes no guarantees that the given input header, input and output types are valid for the
    /// given hypercall. It is the caller's responsibility to use the correct types with the specified hypercall.
    ///
    /// A caller must check the returned [HypercallOutput] for success or failure from the hypervisor.
    ///
    /// Hardware isolated VMs cannot trust the output from the hypervisor and so it must be validated by the
    /// caller if needed.
    unsafe fn hvcall_var<I, O>(
        &self,
        code: HypercallCode,
        input: &I,
        variable_input: &[u8],
        output: &mut O,
    ) -> Result<HypercallOutput, HvcallError>
    where
        I: IntoBytes + Sized + Immutable + KnownLayout,
        O: IntoBytes + FromBytes + Sized + Immutable + KnownLayout,
    {
        const fn assert_size<I, O>()
        where
            I: Sized,
            O: Sized,
        {
            assert!(size_of::<I>() <= HV_PAGE_SIZE as usize);
            assert!(size_of::<O>() <= HV_PAGE_SIZE as usize);
        }
        assert_size::<I, O>();
        assert!(variable_input.len() % 8 == 0);

        // The fixed input and the variable input are sent as one contiguous buffer.
        let input = [input.as_bytes(), variable_input].concat();
        if input.len() > HV_PAGE_SIZE as usize {
            return Err(HvcallError::InputParametersTooLarge);
        }

        // The variable header size is passed in units of 8 bytes.
        let control = hvdef::hypercall::Control::new()
            .with_code(code.0)
            .with_variable_header_size(variable_input.len() / 8);

        let call_object = protocol::hcl_hvcall {
            control,
            input_data: input.as_bytes().as_ptr().cast(),
            input_size: input.len(),
            status: FromZeros::new_zeroed(),
            output_data: output.as_bytes().as_ptr().cast(),
            output_size: size_of::<O>(),
        };

        // SAFETY: The input data is owned by the local `input` buffer and the output data is
        // borrowed from `output`; both remain valid for the duration of this call.
        unsafe { self.invoke_hvcall_ioctl(call_object) }
    }

    /// Sets the VTL protection mask for the specified memory range.
    ///
    /// [`HypercallCode::HvCallModifyVtlProtectionMask`] must be allowed.
    ///
    /// The range is split into batches of page numbers that fit in a single
    /// hypercall input page; one rep hypercall is issued per batch.
    ///
    /// # Errors
    /// Returns [`ApplyVtlProtectionsError::Hypervisor`] describing the failing
    /// batch if the hypervisor rejects a call.
    ///
    /// # Panics
    /// Panics if the hypercall cannot be submitted to the kernel, or if a
    /// successful call reports a processed count different from the batch size.
    pub fn modify_vtl_protection_mask(
        &self,
        range: MemoryRange,
        map_flags: HvMapGpaFlags,
        target_vtl: HvInputVtl,
    ) -> Result<(), ApplyVtlProtectionsError> {
        use hvdef::hypercall::ModifyVtlProtectionMask;

        const MAX_INPUT_ELEMENTS: usize =
            (HV_PAGE_SIZE as usize - size_of::<ModifyVtlProtectionMask>()) / size_of::<u64>();

        let header = ModifyVtlProtectionMask {
            partition_id: HV_PARTITION_ID_SELF,
            map_flags,
            target_vtl,
            reserved: [0; 3],
        };

        let span = tracing::info_span!("modify_vtl_protection_mask", ?range);
        let _enter = span.enter();

        let end_page = range.end() / HV_PAGE_SIZE;
        let mut batch_start = range.start() / HV_PAGE_SIZE;

        // One vector is reused as the rep list of every hypercall.
        let mut gpns: Vec<u64> = Vec::new();
        while batch_start < end_page {
            let batch_end = end_page.min(batch_start + MAX_INPUT_ELEMENTS as u64);
            gpns.clear();
            gpns.extend(batch_start..batch_end);

            // SAFETY: The input header and rep slice are the correct types for this hypercall. A dummy type of u8 is
            //         provided to satisfy the compiler for output rep type. The given input and slices are valid
            //         references while this function is called.
            //
            //         The hypercall output is validated right after the hypercall is issued.
            let output = unsafe {
                self.hvcall_rep::<ModifyVtlProtectionMask, u64, u8>(
                    HypercallCode::HvCallModifyVtlProtectionMask,
                    &header,
                    HvcallRepInput::Elements(gpns.as_slice()),
                    None,
                )
                .expect("kernel hypercall submission should always succeed")
            };

            if let Err(hv_error) = output.result() {
                return Err(ApplyVtlProtectionsError::Hypervisor {
                    range: MemoryRange::from_4k_gpn_range(batch_start..batch_end),
                    output,
                    hv_error,
                    vtl: target_vtl,
                });
            }

            assert_eq!(output.elements_processed() as u64, batch_end - batch_start);
            batch_start = batch_end;
        }

        Ok(())
    }

    /// Get a single VP register for the given VTL via hypercall.
    fn get_vp_register_for_vtl_inner(
        &self,
        target_vtl: HvInputVtl,
        name: HvRegisterName,
    ) -> Result<HvRegisterValue, Error> {
        let header = hvdef::hypercall::GetSetVpRegisters {
            partition_id: HV_PARTITION_ID_SELF,
            vp_index: HV_VP_INDEX_SELF,
            target_vtl,
            rsvd: [0; 3],
        };
        let mut output = [HvRegisterValue::new_zeroed()];

        // SAFETY: The input header and rep slice are the correct types for this hypercall.
        //         The hypercall output is validated right after the hypercall is issued.
        let status = unsafe {
            self.hvcall_rep(
                HypercallCode::HvCallGetVpRegisters,
                &header,
                HvcallRepInput::Elements(&[name]),
                Some(&mut output),
            )
            .expect("get_vp_register hypercall should not fail")
        };

        // Status must be success with 1 rep completed
        status
            .result()
            .map_err(|err| Error::GetVpRegisterHypercall {
                reg: name.into(),
                err,
            })?;
        assert_eq!(status.elements_processed(), 1);

        Ok(output[0])
    }

    /// Get a single VP register for the given VTL via hypercall. Only a select
    /// set of registers are supported; others will cause a panic.
    ///
    /// # Panics
    /// Panics if `vtl.target_vtl()` returns an error, if `name` is not in the
    /// allowed list when the target is VTL 2 or unspecified, if the target is
    /// VTL 1 (not yet implemented), or if `name` is a VTL-shared register
    /// when the target is VTL 0.
    #[cfg(guest_arch = "x86_64")]
    pub fn get_vp_register_for_vtl(
        &self,
        vtl: HvInputVtl,
        name: HvX64RegisterName,
    ) -> Result<HvRegisterValue, Error> {
        match vtl.target_vtl().unwrap() {
            // No explicit target, or VTL 2: only this fixed list is accepted.
            None | Some(Vtl::Vtl2) => {
                assert!(matches!(
                    name,
                    HvX64RegisterName::GuestVsmPartitionConfig
                        | HvX64RegisterName::VsmPartitionConfig
                        | HvX64RegisterName::VsmPartitionStatus
                        | HvX64RegisterName::VsmCapabilities
                        | HvX64RegisterName::TimeRefCount
                        | HvX64RegisterName::VsmVpSecureConfigVtl0
                        | HvX64RegisterName::VsmVpSecureConfigVtl1
                ));
            }
            Some(Vtl::Vtl1) => {
                todo!("TODO: allowed registers for VTL1");
            }
            Some(Vtl::Vtl0) => {
                // Only VTL-private registers can go through this path.
                // VTL-shared registers have to go through the kernel (either
                // via the CPU context page or via the dedicated ioctl), as
                // they may require special handling there.
                //
                // Register access should go through the register page if
                // possible (as a performance optimization). In practice,
                // registers that are normally available on the register page
                // are handled here only when it is unavailable (e.g., running
                // in WHP).
                assert!(!is_vtl_shared_reg(name));
            }
        }

        self.get_vp_register_for_vtl_inner(vtl, name.into())
    }

    /// Get a single VP register for the given VTL via hypercall. Only a select
    /// set of registers are supported; others will cause a panic.
    ///
    /// # Panics
    /// Panics if `vtl.target_vtl()` returns an error, if `name` is not in the
    /// allowed list when the target is VTL 2 or unspecified, if the target is
    /// VTL 1 (not yet implemented), or if `name` is a VTL-shared register
    /// when the target is VTL 0.
    #[cfg(guest_arch = "aarch64")]
    pub fn get_vp_register_for_vtl(
        &self,
        vtl: HvInputVtl,
        name: HvArm64RegisterName,
    ) -> Result<HvRegisterValue, Error> {
        match vtl.target_vtl().unwrap() {
            // No explicit target, or VTL 2: only this fixed list is accepted.
            None | Some(Vtl::Vtl2) => {
                assert!(matches!(
                    name,
                    HvArm64RegisterName::GuestVsmPartitionConfig
                        | HvArm64RegisterName::VsmPartitionConfig
                        | HvArm64RegisterName::VsmPartitionStatus
                        | HvArm64RegisterName::VsmCapabilities
                        | HvArm64RegisterName::TimeRefCount
                        | HvArm64RegisterName::VsmVpSecureConfigVtl0
                        | HvArm64RegisterName::VsmVpSecureConfigVtl1
                        | HvArm64RegisterName::PrivilegesAndFeaturesInfo
                ));
            }
            Some(Vtl::Vtl1) => {
                // TODO: allowed registers for VTL1
                todo!();
            }
            Some(Vtl::Vtl0) => {
                // Only VTL-private registers can go through this path.
                // VTL-shared registers have to go through the kernel (either
                // via the CPU context page or via the dedicated ioctl), as
                // they may require special handling there.
                assert!(!is_vtl_shared_reg(name));
            }
        }

        self.get_vp_register_for_vtl_inner(vtl, name.into())
    }
}

/// The HCL device and collection of fds.
#[derive(Debug)]
pub struct Hcl {
    /// Handle to the `/dev/mshv_hvcall` device, used for direct hypercalls.
    mshv_hvcall: MshvHvcall,
    /// Handle to the `mshv_vtl` driver; its file is used to map the per-VP
    /// pages when an `HclVp` is created.
    mshv_vtl: MshvVtl,
    /// Per-VP state.
    // NOTE(review): presumably indexed by VP index — confirm.
    vps: Vec<HclVp>,
    // NOTE(review): presumably records whether the driver supports VTL return
    // actions — confirm against where this is initialized.
    supports_vtl_ret_action: bool,
    // NOTE(review): presumably records whether the register page is
    // supported — confirm against where this is initialized.
    supports_register_page: bool,
    /// Whether DR6 is a shared register on this processor; exposed through
    /// `Hcl::dr6_shared`.
    dr6_shared: bool,
    /// The isolation type of the partition.
    isolation: IsolationType,
    // NOTE(review): 64-byte bitmap; its meaning is not visible in this part of
    // the file — confirm and document.
    snp_register_bitmap: [u8; 64],
    /// The sidecar client, if there is one.
    sidecar: Option<SidecarClient>,
}

/// The isolation type for a partition.
// TODO: Add guest_arch cfgs.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum IsolationType {
    /// No isolation.
    None,
    /// Hyper-V software isolation.
    Vbs,
    /// AMD SNP.
    Snp,
    /// Intel TDX.
    Tdx,
}

impl IsolationType {
    /// Returns true for every isolation type other than `None`.
    pub fn is_isolated(&self) -> bool {
        *self != Self::None
    }

    /// Returns true when the isolation is hardware-backed, i.e. for `Snp` and
    /// `Tdx`.
    pub fn is_hardware_isolated(&self) -> bool {
        match self {
            Self::Snp | Self::Tdx => true,
            Self::None | Self::Vbs => false,
        }
    }
}

impl Hcl {
    /// Returns true if DR6 is a shared register on this processor.
    ///
    /// This simply reports the `dr6_shared` flag stored on this `Hcl`.
    pub fn dr6_shared(&self) -> bool {
        self.dr6_shared
    }
}

/// Per-VP state held by [`Hcl`].
#[derive(Debug)]
struct HclVp {
    /// Whether a runner currently owns this VP. It is reset to
    /// `VpState::NotRunning` when a `ProcessorRunner` is dropped.
    state: Mutex<VpState>,
    /// The VP's `hcl_run` page, mapped from the `mshv_vtl` file.
    run: MappedPage<hcl_run>,
    /// The isolation-type-specific pages for this VP.
    backing: BackingState,
}

/// The isolation-type-specific pages of a VP, created in `HclVp::new`.
#[derive(Debug)]
enum BackingState {
    /// Used when the isolation type is `None` or `Vbs`.
    Mshv {
        /// The register page; mapped only when requested at VP creation.
        reg_page: Option<MappedPage<HvX64RegisterPage>>,
    },
    /// Used when the isolation type is `Snp`.
    Snp {
        /// The mapped VMSA pages, built from the VTL 0 mapping followed by
        /// the VTL 1 mapping.
        vmsa: VtlArray<MappedPage<SevVmsa>, 2>,
    },
    /// Used when the isolation type is `Tdx`.
    Tdx {
        /// The VTL 0 APIC page, mapped from the `mshv_vtl` file.
        vtl0_apic_page: MappedPage<ApicPage>,
        /// The VTL 1 APIC page, allocated from the private DMA client.
        vtl1_apic_page: MemoryBlock,
    },
}

/// Whether a VP is currently owned by a runner.
#[derive(Debug)]
enum VpState {
    /// A runner exists for the VP. When the runner is dropped, it asserts
    /// that the recorded thread is the current thread.
    Running(Pthread),
    /// No runner exists for the VP.
    NotRunning,
}

impl HclVp {
    fn new(
        hcl: &Hcl,
        vp: u32,
        map_reg_page: bool,
        isolation_type: IsolationType,
        private_dma_client: Option<&Arc<dyn DmaClient>>,
    ) -> Result<Self, Error> {
        let fd = &hcl.mshv_vtl.file;
        let run: MappedPage<hcl_run> =
            MappedPage::new(fd, vp as i64).map_err(|e| Error::MmapVp(e, None))?;
        // Block proxied interrupts on all vectors by default. The mask will be
        // relaxed as the guest runs.
        //
        // This is only used on CVMs. Skip it otherwise, since run page accesses
        // will fault on VPs that are still in the sidecar kernel.
        if isolation_type.is_hardware_isolated() {
            // SAFETY: `proxy_irr_blocked` is not accessed by any other VPs/kernel at this point (`HclVp` creation)
            // so we know we have exclusive access.
            let proxy_irr_blocked = unsafe { &mut (*run.as_ptr()).proxy_irr_blocked };
            proxy_irr_blocked.fill(!0);
        }

        let backing = match isolation_type {
            IsolationType::None | IsolationType::Vbs => BackingState::Mshv {
                reg_page: if map_reg_page {
                    Some(
                        MappedPage::new(fd, HCL_REG_PAGE_OFFSET | vp as i64)
                            .map_err(Error::MmapRegPage)?,
                    )
                } else {
                    None
                },
            },
            IsolationType::Snp => {
                let vmsa_vtl0 = MappedPage::new(fd, HCL_VMSA_PAGE_OFFSET | vp as i64)
                    .map_err(|e| Error::MmapVp(e, Some(Vtl::Vtl0)))?;
                let vmsa_vtl1 = MappedPage::new(fd, HCL_VMSA_GUEST_VSM_PAGE_OFFSET | vp as i64)
                    .map_err(|e| Error::MmapVp(e, Some(Vtl::Vtl1)))?;
                BackingState::Snp {
                    vmsa: [vmsa_vtl0, vmsa_vtl1].into(),
                }
            }
            IsolationType::Tdx => BackingState::Tdx {
                vtl0_apic_page: MappedPage::new(fd, MSHV_APIC_PAGE_OFFSET | vp as i64)
                    .map_err(|e| Error::MmapVp(e, Some(Vtl::Vtl0)))?,
                vtl1_apic_page: private_dma_client
                    .ok_or(Error::MissingPrivateMemory)?
                    .allocate_dma_buffer(HV_PAGE_SIZE as usize)
                    .map_err(Error::AllocVp)?,
            },
        };

        Ok(Self {
            state: Mutex::new(VpState::NotRunning),
            run,
            backing,
        })
    }
}

/// Object used to run and to access state for a specific VP.
///
/// Dropping the runner flushes deferred state and marks the VP as not running.
pub struct ProcessorRunner<'a, T: Backing<'a>> {
    /// The `Hcl` this runner belongs to.
    hcl: &'a Hcl,
    /// The state of the VP being run.
    vp: &'a HclVp,
    /// The sidecar handle for this VP, if any.
    sidecar: Option<SidecarVp<'a>>,
    /// The registered deferred actions, if any. They are flushed by
    /// `flush_deferred_state` and released when the runner is dropped.
    deferred_actions: Option<RegisteredDeferredActions<'a>>,
    /// The `hcl_run` structure for this VP.
    // NOTE(review): presumably refers to the VP's mapped run page (or the
    // sidecar's equivalent) — confirm where this is initialized.
    run: &'a UnsafeCell<hcl_run>,
    /// The intercept message for this VP.
    // NOTE(review): the backing storage is not visible in this part of the
    // file — confirm.
    intercept_message: &'a UnsafeCell<HvMessage>,
    /// The isolation-type-specific backing state.
    state: T,
}

/// An error returned by [`Hcl::runner`].
///
/// It is also the error type returned when constructing a processor backing.
#[derive(Debug, Error)]
pub enum NoRunner {
    /// The partition is for a different isolation type.
    #[error("mismatched isolation type")]
    MismatchedIsolation,
    /// A sidecar VP was requested, but no sidecar was provided.
    #[error("missing sidecar")]
    MissingSidecar,
    /// The sidecar VP could not be contacted. The underlying sidecar error is
    /// available as the error source.
    #[error("sidecar communication error")]
    Sidecar(#[source] sidecar_client::SidecarError),
}

/// An isolation-type-specific backing for a processor runner.
///
/// This is the public face of the `BackingPrivate` trait, which is declared
/// `pub(super)` in a private module so that its methods are only visible
/// within this module.
#[expect(private_bounds)]
pub trait Backing<'a>: BackingPrivate<'a> {}

// Every `BackingPrivate` implementor is automatically a `Backing`.
impl<'a, T: BackingPrivate<'a>> Backing<'a> for T {}

// Holds the implementation side of `Backing` so that it can be used as a
// bound on public items without exposing its methods.
mod private {
    use super::Error;
    use super::Hcl;
    use super::HclVp;
    use super::NoRunner;
    use super::ProcessorRunner;
    use crate::GuestVtl;
    use hvdef::HvRegisterName;
    use hvdef::HvRegisterValue;
    use sidecar_client::SidecarVp;

    /// The operations a backing provides to [`ProcessorRunner`].
    pub(super) trait BackingPrivate<'a>: Sized {
        /// Creates the backing state for `vp`.
        ///
        /// Called by `Hcl::runner` before the VP is marked as running; an
        /// error here is returned to the caller of `Hcl::runner`.
        fn new(vp: &'a HclVp, sidecar: Option<&SidecarVp<'a>>, hcl: &Hcl)
        -> Result<Self, NoRunner>;

        /// Attempts to set register `name` directly in the backing.
        ///
        /// Returns `Ok(true)` if the backing handled the register. On
        /// `Ok(false)` the caller queues the register and sets it through
        /// `ProcessorRunner::set_reg` instead.
        fn try_set_reg(
            runner: &mut ProcessorRunner<'a, Self>,
            vtl: GuestVtl,
            name: HvRegisterName,
            value: HvRegisterValue,
        ) -> Result<bool, Error>;

        /// Returns whether registers that are already queued must be written
        /// out (via `set_reg`) before `name` is processed.
        fn must_flush_regs_on(runner: &ProcessorRunner<'a, Self>, name: HvRegisterName) -> bool;

        /// Attempts to read register `name` directly from the backing.
        ///
        /// Returns `Ok(Some(value))` if the backing supplied the value. On
        /// `Ok(None)` the caller falls back to `ProcessorRunner::get_reg`.
        fn try_get_reg(
            runner: &ProcessorRunner<'a, Self>,
            vtl: GuestVtl,
            name: HvRegisterName,
        ) -> Result<Option<HvRegisterValue>, Error>;

        /// Invoked by `ProcessorRunner::flush_deferred_state` before the
        /// deferred actions are flushed.
        // NOTE(review): presumably writes back any register state the backing
        // caches in the register page -- confirm against the implementations.
        fn flush_register_page(runner: &mut ProcessorRunner<'a, Self>);
    }
}

impl<'a, T: Backing<'a>> Drop for ProcessorRunner<'a, T> {
    /// Flushes outstanding deferred state, releases the deferred action
    /// registration, and marks the VP as no longer running.
    ///
    /// # Panics
    /// Panics if the VP was not recorded as running on the current thread.
    fn drop(&mut self) {
        self.flush_deferred_state();
        // Release the registration before giving up ownership of the VP.
        self.deferred_actions = None;
        let previous = {
            let mut state = self.vp.state.lock();
            std::mem::replace(&mut *state, VpState::NotRunning)
        };
        assert!(matches!(previous, VpState::Running(owner) if owner == Pthread::current()));
    }
}

impl<'a, T: Backing<'a>> ProcessorRunner<'a, T> {
    /// Flushes any deferred state: first the backing's register page, then
    /// any registered deferred actions.
    ///
    /// Must be called if preparing the partition for save/restore
    /// (servicing).
    pub fn flush_deferred_state(&mut self) {
        T::flush_register_page(self);
        if let Some(pending) = self.deferred_actions.as_mut() {
            pending.flush();
        }
    }
}

impl<'a, T: Backing<'a>> ProcessorRunner<'a, T> {
    // Registers that are shared between VTLs need to be handled by the kernel
    // as they may require special handling there. set_reg and get_reg will
    // handle these registers using a dedicated ioctl, instead of the general-
    // purpose Set/GetVpRegisters hypercalls.
    //
    // DR6 is special-cased: whether it is kernel managed is reported by the
    // `Hcl` instance rather than by `is_vtl_shared_reg`.
    #[cfg(guest_arch = "x86_64")]
    fn is_kernel_managed(&self, name: HvX64RegisterName) -> bool {
        match name {
            HvX64RegisterName::Dr6 => self.hcl.dr6_shared(),
            _ => is_vtl_shared_reg(name),
        }
    }

    // aarch64 variant: a register is kernel managed exactly when
    // `is_vtl_shared_reg` reports it as shared between VTLs.
    #[cfg(guest_arch = "aarch64")]
    fn is_kernel_managed(&self, name: HvArm64RegisterName) -> bool {
        is_vtl_shared_reg(name)
    }

    fn set_reg(&mut self, vtl: GuestVtl, regs: &[HvRegisterAssoc]) -> Result<(), Error> {
        if regs.is_empty() {
            return Ok(());
        }

        if let Some(sidecar) = &mut self.sidecar {
            sidecar
                .set_vp_registers(vtl.into(), regs)
                .map_err(Error::Sidecar)?;
        } else {
            // TODO: group up to MSHV_VP_MAX_REGISTERS regs. The kernel
            // currently has a bug where it only supports one register at a
            // time. Once that's fixed, this code could set a group of
            // registers in one ioctl.
            for reg in regs {
                let hc_regs = &mut [HvRegisterAssoc {
                    name: reg.name,
                    pad: [0; 3],
                    value: reg.value,
                }];

                if self.is_kernel_managed(reg.name.into()) {
                    let hv_vp_register_args = mshv_vp_registers {
                        count: 1,
                        regs: hc_regs.as_mut_ptr(),
                    };
                    // SAFETY: ioctl call with correct types.
                    unsafe {
                        hcl_set_vp_register(
                            self.hcl.mshv_vtl.file.as_raw_fd(),
                            &hv_vp_register_args,
                        )
                        .map_err(Error::SetVpRegister)?;
                    }
                } else {
                    let hc_regs = [HvRegisterAssoc {
                        name: reg.name,
                        pad: [0; 3],
                        value: reg.value,
                    }];
                    self.set_vp_registers_hvcall_inner(vtl.into(), &hc_regs)
                        .map_err(Error::SetRegisters)?;
                }
            }
        }
        Ok(())
    }

    /// Reads the registers named in `regs` for `vtl`, storing each result in
    /// the corresponding entry's `value`.
    ///
    /// Sidecar runners forward the whole slice to the sidecar. Otherwise each
    /// register is read individually: kernel-managed registers go through the
    /// dedicated ioctl, all others through a hypercall.
    fn get_reg(&mut self, vtl: GuestVtl, regs: &mut [HvRegisterAssoc]) -> Result<(), Error> {
        if regs.is_empty() {
            return Ok(());
        }

        if let Some(sidecar) = &mut self.sidecar {
            sidecar
                .get_vp_registers(vtl.into(), regs)
                .map_err(Error::Sidecar)?;
        } else {
            // TODO: group up to MSHV_VP_MAX_REGISTERS regs. The kernel
            // currently has a bug where it only supports one register at a
            // time. Once that's fixed, this code could get a group of
            // registers in one ioctl.
            for reg in regs {
                if self.is_kernel_managed(reg.name.into()) {
                    // `regs` points at the caller's entry, so the result is
                    // written in place.
                    let mut mshv_vp_register_args = mshv_vp_registers {
                        count: 1,
                        regs: reg,
                    };
                    // SAFETY: we know that our file is a vCPU fd, we know the kernel will only
                    // access the single register entry our pointer refers to, and we verify the
                    // return result.
                    unsafe {
                        hcl_get_vp_register(
                            self.hcl.mshv_vtl.file.as_raw_fd(),
                            &mut mshv_vp_register_args,
                        )
                        .map_err(Error::GetVpRegister)?;
                    }
                } else {
                    reg.value = self
                        .hcl
                        .mshv_hvcall
                        .get_vp_register_for_vtl(vtl.into(), reg.name.into())?;
                }
            }
        }
        Ok(())
    }

    /// Clears the cancel flag so that the VP can be run again.
    ///
    /// This is a no-op for sidecar VPs.
    pub fn clear_cancel(&mut self) {
        if !self.is_sidecar() {
            // SAFETY: self.run is mapped, and the cancel field is atomically
            // accessed by everyone.
            let cancel = unsafe { &*(&raw mut (*self.run.get()).cancel).cast::<AtomicU32>() };
            cancel.store(0, Ordering::SeqCst);
        }
    }

    /// Sets or clears the halted flag in the run page.
    ///
    /// While the VP is halted, `run()` will not actually run the VP but will
    /// just wait for a cancel request or signal.
    pub fn set_halted(&mut self, halted: bool) {
        let run_page = self.run.get();
        // SAFETY: the `flags` field of the run page will not be concurrently
        // updated.
        let flags = unsafe { &mut (*run_page).flags };
        if !halted {
            *flags &= !protocol::MSHV_VTL_RUN_FLAG_HALTED;
            return;
        }
        *flags |= protocol::MSHV_VTL_RUN_FLAG_HALTED;
    }

    /// Gets the proxied interrupt request bitmap for VTL 0 from the hypervisor.
    ///
    /// Returns `None` if the run page's `scan_proxy_irr` flag is clear.
    /// Otherwise the flag is cleared and every non-zero word of `proxy_irr` is
    /// atomically swapped with zero and returned; words that were zero are
    /// returned as zero.
    pub fn proxy_irr_vtl0(&mut self) -> Option<[u32; 8]> {
        // SAFETY: the `scan_proxy_irr` and `proxy_irr` fields of the run page
        // are concurrently updated by the kernel on multiple processors. They
        // are accessed atomically everywhere.
        unsafe {
            let scan_proxy_irr = &*((&raw mut (*self.run.get()).scan_proxy_irr).cast::<AtomicU8>());
            let proxy_irr = &*((&raw mut (*self.run.get()).proxy_irr).cast::<[AtomicU32; 8]>());
            if scan_proxy_irr.load(Ordering::Acquire) == 0 {
                return None;
            }

            // Clear the flag before draining the words.
            scan_proxy_irr.store(0, Ordering::SeqCst);
            let mut r = [0; 8];
            for (irr, r) in proxy_irr.iter().zip(r.iter_mut()) {
                // Only perform the swap (a write) on words that have bits set.
                if irr.load(Ordering::Relaxed) != 0 {
                    *r = irr.swap(0, Ordering::Relaxed);
                }
            }
            Some(r)
        }
    }

    /// Update the `proxy_irr_blocked` for VTL 0 in the run page.
    ///
    /// `irr_filter` has a bit set for every allowed vector; the bitwise
    /// complement of each word is stored into the corresponding word of
    /// `proxy_irr_blocked`.
    pub fn update_proxy_irr_filter_vtl0(&mut self, irr_filter: &[u32; 8]) {
        // SAFETY: `proxy_irr_blocked` is accessed by current VP only, but could
        // be concurrently accessed by kernel too, hence accessing as Atomic.
        // Only a shared reference is created: atomic stores do not need
        // exclusive access, and an exclusive reference would wrongly claim
        // that nothing else touches this memory.
        let proxy_irr_blocked = unsafe {
            &*((&raw mut (*self.run.get()).proxy_irr_blocked).cast::<[AtomicU32; 8]>())
        };

        // `irr_filter` bitmap has bits set for all allowed vectors (i.e. SINT and device interrupts)
        // Replace current `proxy_irr_blocked` with the given `irr_filter` bitmap.
        // By default block all (i.e. set all), and only allow (unset) given vectors from `irr_filter`.
        for (filter, irr) in proxy_irr_blocked.iter().zip(irr_filter.iter()) {
            filter.store(!irr, Ordering::Relaxed);
            tracing::debug!(irr, "update_proxy_irr_filter");
        }
    }

    /// Gets the proxy_irr_exit bitmask for VTL 0. This mask ensures that
    /// the masked interrupts always exit to user-space, and cannot
    /// be injected in the kernel. Interrupts matching this condition
    /// will be left on the proxy_irr field.
    ///
    /// The returned reference points directly into the run page, so writes
    /// through it modify the run page in place.
    pub fn proxy_irr_exit_mut_vtl0(&mut self) -> &mut [u32; 8] {
        // SAFETY: The `proxy_irr_exit` field of the run page will not be concurrently updated.
        unsafe { &mut (*self.run.get()).proxy_irr_exit }
    }

    /// Gets a mutable reference to the current offload_flags in the run page.
    ///
    /// The reference points directly into the run page, so writes through it
    /// modify the run page in place.
    pub fn offload_flags_mut(&mut self) -> &mut hcl_intr_offload_flags {
        // SAFETY: The `offload_flags` field of the run page will not be concurrently updated.
        unsafe { &mut (*self.run.get()).offload_flags }
    }

    /// Runs the VP via the sidecar kernel.
    ///
    /// # Panics
    /// Panics if this runner is not a sidecar runner (see
    /// [`Self::is_sidecar`]).
    pub fn run_sidecar(&mut self) -> Result<SidecarRun<'_, 'a>, Error> {
        self.sidecar.as_mut().unwrap().run().map_err(Error::Sidecar)
    }

    /// Run the following VP until an exit, error, or interrupt (cancel or
    /// signal) occurs.
    ///
    /// Returns `Ok(true)` if there is an exit to process, `Ok(false)` if there
    /// was a signal or cancel request.
    pub fn run(&mut self) -> Result<bool, Error> {
        assert!(self.sidecar.is_none());
        // Apply any deferred actions to the run page.
        if let Some(actions) = &mut self.deferred_actions {
            debug_assert!(self.hcl.supports_vtl_ret_action);
            // SAFETY: there are no concurrent accesses to the deferred action
            // slots.
            let mut slots = unsafe { DeferredActionSlots::new(self.run) };
            actions.move_to_slots(&mut slots);
        };

        // N.B. cpu_context and exit_context are mutated by this call.
        //
        // SAFETY: no safety requirements for this ioctl.
        let r = unsafe { hcl_return_to_lower_vtl(self.hcl.mshv_vtl.file.as_raw_fd()) };

        let has_intercept = match r {
            Ok(_) => true,
            Err(nix::errno::Errno::EINTR) => false,
            Err(err) => return Err(Error::ReturnToLowerVtl(err)),
        };
        Ok(has_intercept)
    }

    /// Gets a mutable reference to the enter mode value in the run page, used
    /// by the kernel to specify the mode used when entering a lower VTL.
    ///
    /// Returns `None` for sidecar runners.
    pub fn enter_mode(&mut self) -> Option<&mut EnterModes> {
        if self.is_sidecar() {
            return None;
        }
        // SAFETY: self.run is mapped, and the mode field can only be mutated or accessed by
        // this object (or the kernel while `run` is called).
        let mode = unsafe { &mut (*self.run.get()).mode };
        Some(mode)
    }

    /// Returns a reference to the exit message from the last exit.
    ///
    /// The message is read from the location chosen when the runner was
    /// created: the run page, or the sidecar's intercept message for sidecar
    /// runners.
    pub fn exit_message(&self) -> &HvMessage {
        // SAFETY: the exit message will not be concurrently accessed by the
        // kernel while this VP is in VTL2.
        unsafe { &*self.intercept_message.get() }
    }

    /// Returns whether this is a sidecar VP, i.e. whether the runner holds a
    /// sidecar VP handle.
    pub fn is_sidecar(&self) -> bool {
        self.sidecar.is_some()
    }
}

impl<'a, T: Backing<'a>> ProcessorRunner<'a, T> {
    fn get_vp_registers_inner<R: Copy + Into<HvRegisterName>>(
        &mut self,
        vtl: GuestVtl,
        names: &[R],
        values: &mut [HvRegisterValue],
    ) -> Result<(), Error> {
        assert_eq!(names.len(), values.len());
        let mut assoc = Vec::new();
        let mut offset = Vec::new();
        for (i, (&name, value)) in names.iter().zip(values.iter_mut()).enumerate() {
            if let Some(v) = T::try_get_reg(self, vtl, name.into())? {
                *value = v;
            } else {
                assoc.push(HvRegisterAssoc {
                    name: name.into(),
                    pad: Default::default(),
                    value: FromZeros::new_zeroed(),
                });
                offset.push(i);
            }
        }

        self.get_reg(vtl, &mut assoc)?;
        for (&i, assoc) in offset.iter().zip(&assoc) {
            values[i] = assoc.value;
        }
        Ok(())
    }

    /// Get the following register on the current VP.
    ///
    /// This will fail for registers that are in the mmapped CPU context, i.e.
    /// registers that are shared between VTL0 and VTL2.
    ///
    /// This is a single-register wrapper around the same path used by
    /// [`Self::get_vp_registers`].
    pub fn get_vp_register(
        &mut self,
        vtl: GuestVtl,
        #[cfg(guest_arch = "x86_64")] name: HvX64RegisterName,
        #[cfg(guest_arch = "aarch64")] name: HvArm64RegisterName,
    ) -> Result<HvRegisterValue, Error> {
        // One-element output buffer, initialized to zero.
        let mut value = [0u64.into(); 1];
        self.get_vp_registers_inner(vtl, &[name], &mut value)?;
        Ok(value[0])
    }

    /// Get the following VP registers on the current VP.
    ///
    /// On success, `values[i]` holds the value of register `names[i]`.
    ///
    /// # Panics
    /// Panics if `names.len() != values.len()`.
    pub fn get_vp_registers(
        &mut self,
        vtl: GuestVtl,
        #[cfg(guest_arch = "x86_64")] names: &[HvX64RegisterName],
        #[cfg(guest_arch = "aarch64")] names: &[HvArm64RegisterName],
        values: &mut [HvRegisterValue],
    ) -> Result<(), Error> {
        self.get_vp_registers_inner(vtl, names, values)
    }

    /// Set the following register on the current VP.
    ///
    /// This will fail for registers that are in the mmapped CPU context, i.e.
    /// registers that are shared between VTL0 and VTL2.
    ///
    /// This is a single-register wrapper around [`Self::set_vp_registers`].
    pub fn set_vp_register(
        &mut self,
        vtl: GuestVtl,
        #[cfg(guest_arch = "x86_64")] name: HvX64RegisterName,
        #[cfg(guest_arch = "aarch64")] name: HvArm64RegisterName,
        value: HvRegisterValue,
    ) -> Result<(), Error> {
        self.set_vp_registers(vtl, [(name, value)])
    }

    /// Sets a set of VP registers, in iteration order.
    ///
    /// Each register is first offered to the backing; registers the backing
    /// does not handle are queued and written through `set_reg`. The queue is
    /// written out early whenever the backing reports that the next register
    /// requires the queued ones to be flushed first.
    pub fn set_vp_registers<I>(&mut self, vtl: GuestVtl, values: I) -> Result<(), Error>
    where
        I: IntoIterator,
        I::Item: Into<HvRegisterAssoc> + Clone,
    {
        let mut queued: Vec<HvRegisterAssoc> = Vec::new();
        for item in values {
            let HvRegisterAssoc { name, value, .. } = item.into();

            if !queued.is_empty() && T::must_flush_regs_on(self, name) {
                self.set_reg(vtl, &queued)?;
                queued.clear();
            }

            let handled_by_backing = T::try_set_reg(self, vtl, name, value)?;
            if !handled_by_backing {
                queued.push(HvRegisterAssoc {
                    name,
                    pad: Default::default(),
                    value,
                });
            }
        }

        if queued.is_empty() {
            return Ok(());
        }
        self.set_reg(vtl, &queued)
    }

    /// Issues the `HvCallSetVpRegisters` rep hypercall for `registers`,
    /// targeting the current VP of this partition at `vtl`.
    ///
    /// Returns the hypervisor's error status if the hypercall does not report
    /// success.
    ///
    /// # Panics
    /// Panics if issuing the hypercall itself fails.
    fn set_vp_registers_hvcall_inner(
        &mut self,
        vtl: Vtl,
        registers: &[HvRegisterAssoc],
    ) -> Result<(), HvError> {
        let header = hvdef::hypercall::GetSetVpRegisters {
            partition_id: HV_PARTITION_ID_SELF,
            vp_index: HV_VP_INDEX_SELF,
            target_vtl: vtl.into(),
            rsvd: [0; 3],
        };

        tracing::trace!(?registers, "HvCallSetVpRegisters rep");

        // SAFETY: The input header and rep slice are the correct types for this hypercall.
        //         The hypercall output is validated right after the hypercall is issued.
        let status = unsafe {
            self.hcl
                .mshv_hvcall
                .hvcall_rep::<hvdef::hypercall::GetSetVpRegisters, HvRegisterAssoc, u8>(
                    HypercallCode::HvCallSetVpRegisters,
                    &header,
                    HvcallRepInput::Elements(registers),
                    None,
                )
                .expect("set_vp_registers hypercall should not fail")
        };

        // Status must be success
        status.result()?;
        Ok(())
    }

    /// Sets the following registers on the current VP and given VTL using a
    /// direct hypercall.
    ///
    /// This should not be used on the fast path. Therefore only a select set of
    /// registers are supported, and others will cause a panic.
    ///
    /// This function can be used with VTL2 as a target.
    ///
    /// # Panics
    /// Panics if any register in `values` is not in the allow list checked
    /// below.
    pub fn set_vp_registers_hvcall<I>(&mut self, vtl: Vtl, values: I) -> Result<(), HvError>
    where
        I: IntoIterator,
        I::Item: Into<HvRegisterAssoc> + Clone,
    {
        let registers: Vec<HvRegisterAssoc> = values.into_iter().map(Into::into).collect();

        // Every requested register must be one of the explicitly allowed
        // names.
        assert!(registers.iter().all(
            |HvRegisterAssoc {
                 name,
                 pad: _,
                 value: _,
             }| matches!(
                (*name).into(),
                HvX64RegisterName::PendingEvent0
                    | HvX64RegisterName::PendingEvent1
                    | HvX64RegisterName::Sipp
                    | HvX64RegisterName::Sifp
                    | HvX64RegisterName::Ghcb
                    | HvX64RegisterName::VsmPartitionConfig
                    | HvX64RegisterName::VsmVpWaitForTlbLock
                    | HvX64RegisterName::VsmVpSecureConfigVtl0
                    | HvX64RegisterName::VsmVpSecureConfigVtl1
                    | HvX64RegisterName::CrInterceptControl
            )
        ));
        self.set_vp_registers_hvcall_inner(vtl, &registers)
    }

    /// Sets the VTL that should be returned to when underhill exits.
    ///
    /// This writes the `target_vtl` field of the run page.
    pub fn set_exit_vtl(&mut self, vtl: GuestVtl) {
        // SAFETY: self.run is mapped, and the target_vtl field can only be
        // mutated or accessed by this object and only before the kernel is
        // invoked during `run`
        unsafe { (*self.run.get()).target_vtl = vtl.into() }
    }
}

impl Hcl {
    /// Returns a new HCL instance.
    ///
    /// The instance starts with no VPs; see [`Self::add_vps`].
    ///
    /// # Errors
    /// Returns [`Error::MismatchedIsolation`] if `isolation` differs from the
    /// isolation type advertised by the hypervisor, and propagates any error
    /// from opening the devices or querying their capabilities.
    ///
    /// # Panics
    /// Panics if the signal handler cannot be registered, or (on x86_64) if
    /// the hypervisor advertises an unknown isolation type.
    pub fn new(isolation: IsolationType, sidecar: Option<SidecarClient>) -> Result<Hcl, Error> {
        // The handler is process-wide, so it is only registered once no matter
        // how many instances are created.
        static SIGNAL_HANDLER_INIT: Once = Once::new();
        // SAFETY: The signal handler does not perform any actions that are forbidden
        // for signal handlers to perform, as it performs nothing.
        SIGNAL_HANDLER_INIT.call_once(|| unsafe {
            signal_hook::low_level::register(libc::SIGRTMIN(), || {
                // Do nothing, the ioctl will now return with EINTR.
            })
            .unwrap();
        });

        // Open the main mshv fd; the VTL fd is created from it below.
        let mshv_fd = Mshv::new()?;

        // Validate the hypervisor's advertised isolation type matches the
        // requested isolation type. In CVM scenarios, this is not trusted, so
        // we still need the isolation type from the caller.
        //
        // FUTURE: the kernel driver should probably tell us this, especially
        // since the kernel ABI is different for different isolation types.
        let supported_isolation = if cfg!(guest_arch = "x86_64") {
            // xtask-fmt allow-target-arch cpu-intrinsic
            #[cfg(target_arch = "x86_64")]
            {
                let result = safe_intrinsics::cpuid(
                    hvdef::HV_CPUID_FUNCTION_MS_HV_ISOLATION_CONFIGURATION,
                    0,
                );
                // The isolation type is the low four bits of EBX.
                match result.ebx & 0xF {
                    0 => IsolationType::None,
                    1 => IsolationType::Vbs,
                    2 => IsolationType::Snp,
                    3 => IsolationType::Tdx,
                    ty => panic!("unknown isolation type {ty:#x}"),
                }
            }
            // xtask-fmt allow-target-arch cpu-intrinsic
            #[cfg(not(target_arch = "x86_64"))]
            {
                unreachable!()
            }
        } else {
            IsolationType::None
        };

        if isolation != supported_isolation {
            return Err(Error::MismatchedIsolation {
                supported: supported_isolation,
                requested: isolation,
            });
        }

        let supports_vtl_ret_action = mshv_fd.check_extension(HCL_CAP_VTL_RETURN_ACTION)?;
        let supports_register_page = mshv_fd.check_extension(HCL_CAP_REGISTER_PAGE)?;
        let dr6_shared = mshv_fd.check_extension(HCL_CAP_DR6_SHARED)?;
        tracing::debug!(
            supports_vtl_ret_action,
            supports_register_page,
            "HCL capabilities",
        );

        let vtl_fd = mshv_fd.create_vtl()?;

        // Open the hypercall pseudo-device
        let mshv_hvcall = MshvHvcall::new()?;

        // Override certain features for hardware isolated VMs.
        // TODO: vtl return actions are inhibited for hardware isolated VMs because they currently
        // are a pessimization since interrupt handling (and synic handling) are all done from
        // within VTL2. Future vtl return actions may be different, requiring granular handling.
        let supports_vtl_ret_action = supports_vtl_ret_action && !isolation.is_hardware_isolated();
        let supports_register_page = supports_register_page && !isolation.is_hardware_isolated();
        // Starts out all-zero; replaced via `set_snp_register_bitmap`.
        let snp_register_bitmap = [0u8; 64];

        Ok(Hcl {
            mshv_hvcall,
            mshv_vtl: vtl_fd,
            vps: Vec::new(),
            supports_vtl_ret_action,
            supports_register_page,
            dr6_shared,
            isolation,
            snp_register_bitmap,
            sidecar,
        })
    }

    /// Set allowed hypercalls.
    ///
    /// Forwards `codes` to the hypercall device handle.
    pub fn set_allowed_hypercalls(&self, codes: &[HypercallCode]) {
        self.mshv_hvcall.set_allowed_hypercalls(codes)
    }

    /// Initializes the SNP register tweak bitmap, replacing the all-zero
    /// bitmap that a new instance starts with.
    pub fn set_snp_register_bitmap(&mut self, register_bitmap: [u8; 64]) {
        self.snp_register_bitmap = register_bitmap;
    }

    /// Creates `vp_count` VPs, with indices `0..vp_count`, and installs them
    /// as this instance's VP list.
    ///
    /// Any previously stored VP list is replaced. If creating any VP fails,
    /// the error is returned and the existing list is left untouched.
    pub fn add_vps(
        &mut self,
        vp_count: u32,
        private_pool: Option<&Arc<dyn DmaClient>>,
    ) -> Result<(), Error> {
        let mut created = Vec::new();
        for vp in 0..vp_count {
            let new_vp = HclVp::new(
                self,
                vp,
                self.supports_register_page,
                self.isolation,
                private_pool,
            )?;
            created.push(new_vp);
        }
        self.vps = created;

        Ok(())
    }

    /// Registers with the hypervisor for an intercept.
    ///
    /// Issues `HvCallInstallIntercept` for this partition and returns the
    /// hypervisor's status.
    ///
    /// # Panics
    /// Panics if issuing the hypercall itself fails.
    pub fn register_intercept(
        &self,
        intercept_type: HvInterceptType,
        access_type_mask: u32,
        intercept_parameters: HvInterceptParameters,
    ) -> Result<(), HvError> {
        let intercept_info = hvdef::hypercall::InstallIntercept {
            partition_id: HV_PARTITION_ID_SELF,
            access_type_mask,
            intercept_type,
            intercept_parameters,
        };

        // SAFETY: calling hypercall with appropriate input and output.
        unsafe {
            self.mshv_hvcall
                .hvcall(
                    HypercallCode::HvCallInstallIntercept,
                    &intercept_info,
                    &mut (),
                )
                .unwrap()
                .result()
        }
    }

    /// Returns the base CPU that manages the given sidecar VP, or `None` if
    /// this instance has no sidecar client.
    pub fn sidecar_base_cpu(&self, vp_index: u32) -> Option<u32> {
        self.sidecar
            .as_ref()
            .map(|client| client.base_cpu(vp_index))
    }

    /// Create a VP runner for the given partition.
    ///
    /// The calling thread is recorded as the VP's runner until the returned
    /// [`ProcessorRunner`] is dropped.
    ///
    /// # Errors
    /// Returns [`NoRunner::MissingSidecar`] if `use_sidecar` is set but this
    /// instance has no sidecar client, and propagates any error from creating
    /// the backing state.
    ///
    /// # Panics
    /// Panics if `vp_index` is out of range of the VP list, or if another
    /// runner already exists for the VP.
    pub fn runner<'a, T: Backing<'a>>(
        &'a self,
        vp_index: u32,
        use_sidecar: bool,
    ) -> Result<ProcessorRunner<'a, T>, NoRunner> {
        let vp = &self.vps[vp_index as usize];

        let sidecar = if use_sidecar {
            Some(
                self.sidecar
                    .as_ref()
                    .ok_or(NoRunner::MissingSidecar)?
                    .vp(vp_index),
            )
        } else {
            None
        };

        // Create the backing state first so that a failure here leaves the VP
        // state untouched.
        let state = T::new(vp, sidecar.as_ref(), self)?;

        // Set this thread as the runner.
        let VpState::NotRunning =
            std::mem::replace(&mut *vp.state.lock(), VpState::Running(Pthread::current()))
        else {
            panic!("another runner already exists")
        };

        // Deferred actions are only used for non-sidecar runners, and only
        // when VTL return actions are supported.
        let actions = if sidecar.is_none() && self.supports_vtl_ret_action {
            Some(register_deferred_actions(self))
        } else {
            None
        };

        // SAFETY: The run page is guaranteed to be mapped and valid.
        // While the exit message might not be filled in yet we're only computing its address.
        let intercept_message = unsafe {
            &*sidecar.as_ref().map_or(
                std::ptr::addr_of!((*vp.run.as_ptr()).exit_message).cast(),
                |s| s.intercept_message().cast(),
            )
        };

        Ok(ProcessorRunner {
            hcl: self,
            vp,
            deferred_actions: actions,
            run: vp.run.as_ref(),
            intercept_message,
            state,
            sidecar,
        })
    }

    /// Trigger the following interrupt request.
    ///
    /// Issues `HvCallAssertVirtualInterrupt` for this partition, targeting
    /// `target_vtl`.
    ///
    /// # Errors
    /// Returns [`Error::RequestInterrupt`] if the hypervisor reports a failure
    /// status.
    ///
    /// # Panics
    /// Panics if the partition is hardware isolated, or if issuing the
    /// hypercall itself fails.
    pub fn request_interrupt(
        &self,
        interrupt_control: hvdef::HvInterruptControl,
        destination_address: u64,
        requested_vector: u32,
        target_vtl: GuestVtl,
    ) -> Result<(), Error> {
        tracing::trace!(
            ?interrupt_control,
            destination_address,
            requested_vector,
            "requesting interrupt"
        );

        assert!(!self.isolation.is_hardware_isolated());

        let request = AssertVirtualInterrupt {
            partition_id: HV_PARTITION_ID_SELF,
            interrupt_control,
            destination_address,
            requested_vector,
            target_vtl: target_vtl as u8,
            rsvd0: 0,
            rsvd1: 0,
        };

        // SAFETY: calling the hypercall with correct input buffer.
        let output = unsafe {
            self.mshv_hvcall.hvcall(
                HypercallCode::HvCallAssertVirtualInterrupt,
                &request,
                &mut (),
            )
        }
        .unwrap();

        output.result().map_err(Error::RequestInterrupt)
    }

    /// Attempts to signal a given vp/sint/flag combo using HvSignalEventDirect.
    ///
    /// No result is returned because this request may be deferred until the
    /// hypervisor is returning to a lower VTL.
    ///
    /// The request is handed to the deferred action machinery as a
    /// `DeferredAction::SignalEvent`.
    pub fn signal_event_direct(&self, vp: u32, sint: u8, flag: u16) {
        tracing::trace!(vp, sint, flag, "signaling event");
        push_deferred_action(self, DeferredAction::SignalEvent { vp, sint, flag });
    }

    /// Issues `HvCallSignalEventDirect` for the given vp/sint/flag combo,
    /// always targeting VTL0 of this partition.
    ///
    /// Returns `Ok(true)` if the hypervisor reports the event as newly
    /// signaled and `Ok(false)` otherwise.
    ///
    /// # Errors
    /// Returns [`Error::SignalEvent`] if the hypervisor reports a failure
    /// status.
    ///
    /// # Panics
    /// Panics if issuing the hypercall itself fails.
    fn hvcall_signal_event_direct(&self, vp: u32, sint: u8, flag: u16) -> Result<bool, Error> {
        let signal_event_input = hvdef::hypercall::SignalEventDirect {
            target_partition: HV_PARTITION_ID_SELF,
            target_vp: vp,
            target_vtl: Vtl::Vtl0 as u8,
            target_sint: sint,
            flag_number: flag,
        };
        let mut signal_event_output = hvdef::hypercall::SignalEventDirectOutput {
            newly_signaled: 0,
            rsvd: [0; 7],
        };

        // SAFETY: calling the hypercall with correct input buffer.
        let output = unsafe {
            self.mshv_hvcall.hvcall(
                HypercallCode::HvCallSignalEventDirect,
                &signal_event_input,
                &mut signal_event_output,
            )
        }
        .unwrap();

        output
            .result()
            .map(|_| signal_event_output.newly_signaled != 0)
            .map_err(Error::SignalEvent)
    }

    /// Attempts to post a given message to a vp/sint combo using HvPostMessageDirect.
    ///
    /// The message is always posted to VTL0 of this partition. The
    /// hypervisor's status is returned as is.
    ///
    /// # Panics
    /// Panics if issuing the hypercall itself fails.
    pub fn post_message_direct(
        &self,
        vp: u32,
        sint: u8,
        message: &HvMessage,
    ) -> Result<(), HvError> {
        tracing::trace!(vp, sint, "posting message");

        let post_message = hvdef::hypercall::PostMessageDirect {
            partition_id: HV_PARTITION_ID_SELF,
            vp_index: vp,
            vtl: Vtl::Vtl0 as u8,
            padding0: [0; 3],
            sint,
            padding1: [0; 3],
            message: zerocopy::Unalign::new(*message),
            padding2: 0,
        };

        // SAFETY: calling the hypercall with correct input buffer.
        let output = unsafe {
            self.mshv_hvcall.hvcall(
                HypercallCode::HvCallPostMessageDirect,
                &post_message,
                &mut (),
            )
        }
        .unwrap();

        output.result()
    }

    /// Sets a file to poll during run. When the file's poll state changes, the
    /// run will be automatically cancelled.
    ///
    /// # Errors
    /// Returns [`Error::SetPollFile`] if the ioctl fails.
    pub fn set_poll_file(&self, vp: u32, file: RawFd) -> Result<(), Error> {
        // SAFETY: calling the IOCTL as defined. This is safe even if the caller
        // does not own `file` since all this does is register the file for
        // polling.
        unsafe {
            hcl_set_poll_file(
                self.mshv_vtl.file.as_raw_fd(),
                &protocol::hcl_set_poll_file {
                    cpu: vp as i32,
                    fd: file,
                },
            )
            .map_err(Error::SetPollFile)?;
        }
        Ok(())
    }

    /// Gets the current hypervisor reference time by reading the
    /// `TimeRefCount` register for the current VTL.
    pub fn reference_time(&self) -> Result<u64, Error> {
        let ref_count =
            self.get_vp_register(HvAllArchRegisterName::TimeRefCount, HvInputVtl::CURRENT_VTL)?;
        Ok(ref_count.as_u64())
    }

    /// Get a single VP register for the given VTL via hypercall. Only a select
    /// set of registers are supported; others will cause a panic.
    ///
    /// This is the x86_64 variant; `name` accepts anything convertible into
    /// an `HvX64RegisterName`.
    #[cfg(guest_arch = "x86_64")]
    pub fn get_vp_register(
        &self,
        name: impl Into<HvX64RegisterName>,
        vtl: HvInputVtl,
    ) -> Result<HvRegisterValue, Error> {
        self.mshv_hvcall.get_vp_register_for_vtl(vtl, name.into())
    }

    /// Get a single VP register for the given VTL via hypercall. Only a select
    /// set of registers are supported; others will cause a panic.
    ///
    /// This is the aarch64 variant; `name` accepts anything convertible into
    /// an `HvArm64RegisterName`.
    #[cfg(guest_arch = "aarch64")]
    pub fn get_vp_register(
        &self,
        name: impl Into<HvArm64RegisterName>,
        vtl: HvInputVtl,
    ) -> Result<HvRegisterValue, Error> {
        self.mshv_hvcall.get_vp_register_for_vtl(vtl, name.into())
    }

    /// Set a single VP register via hypercall as VTL2. Only a select set of registers are
    /// supported, others will cause a panic.
    ///
    /// `vtl` is only used to validate the request: the hypercall itself is
    /// always issued against the current VTL.
    ///
    /// # Panics
    /// Panics if `vtl` does not decode to a valid target, if `vtl` targets
    /// VTL0 or VTL1 (not implemented yet), if `name` is not in the allow
    /// list, if issuing the hypercall itself fails, or if the hypercall
    /// succeeds without processing exactly one element.
    fn set_vp_register(
        &self,
        name: HvRegisterName,
        value: HvRegisterValue,
        vtl: HvInputVtl,
    ) -> Result<(), HvError> {
        match vtl.target_vtl().unwrap() {
            // No explicit target (current VTL) or VTL2.
            None | Some(Vtl::Vtl2) => {
                #[cfg(guest_arch = "x86_64")]
                assert!(matches!(
                    name.into(),
                    HvX64RegisterName::GuestVsmPartitionConfig
                        | HvX64RegisterName::VsmPartitionConfig
                        | HvX64RegisterName::PmTimerAssist
                ));

                #[cfg(guest_arch = "aarch64")]
                assert!(matches!(
                    name.into(),
                    HvArm64RegisterName::GuestVsmPartitionConfig
                        | HvArm64RegisterName::VsmPartitionConfig
                ));
            }
            Some(Vtl::Vtl1) => {
                // TODO: allowed registers for VTL1
                todo!();
            }
            Some(Vtl::Vtl0) => {
                // TODO: allowed registers for VTL0
                todo!();
            }
        }

        // Only the current-VTL/VTL2 case reaches this point, so the header
        // targets the current VTL regardless of how `vtl` was spelled.
        let header = hvdef::hypercall::GetSetVpRegisters {
            partition_id: HV_PARTITION_ID_SELF,
            vp_index: HV_VP_INDEX_SELF,
            target_vtl: HvInputVtl::CURRENT_VTL,
            rsvd: [0; 3],
        };

        let input = HvRegisterAssoc {
            name,
            pad: Default::default(),
            value,
        };

        tracing::trace!(?name, register = ?value, "HvCallSetVpRegisters");

        // SAFETY: The input header and rep slice are the correct types for this hypercall.
        //         The hypercall output is validated right after the hypercall is issued.
        let output = unsafe {
            self.mshv_hvcall
                .hvcall_rep::<hvdef::hypercall::GetSetVpRegisters, HvRegisterAssoc, u8>(
                    HypercallCode::HvCallSetVpRegisters,
                    &header,
                    HvcallRepInput::Elements(&[input]),
                    None,
                )
                .expect("set_vp_registers hypercall should not fail")
        };

        output.result()?;

        // hypercall must succeed with 1 rep completed
        assert_eq!(output.elements_processed(), 1);
        Ok(())
    }

    /// Translates a guest virtual address into a guest physical page via the
    /// `HvCallTranslateVirtualAddressEx` hypercall.
    ///
    /// The caller must ensure `control_flags.input_vtl()` is set to a specific
    /// VTL.
    ///
    /// The outer `Result` reports a failed hypercall status; the inner `Result`
    /// carries either the translated page or the translation result code
    /// returned by the hypervisor.
    ///
    /// # Panics
    /// Panics if the partition is hardware isolated, if no target VTL was
    /// specified, or if the hypercall could not be submitted.
    #[cfg(guest_arch = "aarch64")]
    pub fn translate_gva_to_gpa(
        &self,
        gva: u64,
        control_flags: hvdef::hypercall::TranslateGvaControlFlagsArm64,
    ) -> Result<Result<TranslateResult, aarch64::TranslateErrorAarch64>, TranslateGvaToGpaError>
    {
        use hvdef::hypercall;

        assert!(!self.isolation.is_hardware_isolated());
        assert!(
            control_flags.input_vtl().use_target_vtl(),
            "did not specify a target VTL"
        );

        // The hypercall operates on page numbers, not byte addresses.
        let gva_page = gva >> hvdef::HV_PAGE_SHIFT;
        let header = hypercall::TranslateVirtualAddressArm64 {
            partition_id: HV_PARTITION_ID_SELF,
            vp_index: HV_VP_INDEX_SELF,
            reserved: 0,
            control_flags,
            gva_page,
        };

        let mut output: hypercall::TranslateVirtualAddressExOutputArm64 = FromZeros::new_zeroed();

        // SAFETY: The input header and slice are the correct types for this hypercall.
        //         The hypercall output is validated right after the hypercall is issued.
        let submitted = unsafe {
            self.mshv_hvcall.hvcall(
                HypercallCode::HvCallTranslateVirtualAddressEx,
                &header,
                &mut output,
            )
        };
        let status = submitted.expect("translate can never fail");

        if let Err(hv_error) = status.result() {
            return Err(TranslateGvaToGpaError::Hypervisor { gva, hv_error });
        }

        // Note: WHP doesn't currently support TranslateVirtualAddressEx, so overlay_page, cache_type,
        // event_info aren't trustworthy values if the results came from WHP.
        let code = output.translation_result.result.result_code();
        if code == hypercall::TranslateGvaResultCode::SUCCESS.0 {
            Ok(Ok(TranslateResult {
                gpa_page: output.gpa_page,
                overlay_page: output.translation_result.result.overlay_page(),
            }))
        } else {
            Ok(Err(aarch64::TranslateErrorAarch64 { code }))
        }
    }

    /// Converts memory ranges into the `HvGpaRange` entries used by the
    /// pin/unpin hypercalls.
    ///
    /// Each memory range is split into consecutive entries covering at most
    /// `PAGES_PER_ENTRY` pages. Every entry is encoded in the extended format
    /// with the large-page flag cleared.
    fn to_hv_gpa_range_array(gpa_memory_ranges: &[MemoryRange]) -> Vec<HvGpaRange> {
        const PAGES_PER_ENTRY: u64 = 2048;
        const PAGE_SIZE: u64 = HV_PAGE_SIZE;

        // Number of pages spanned by one memory range, rounded up.
        let page_count = |range: &MemoryRange| (range.end() - range.start()).div_ceil(PAGE_SIZE);

        // Work out how many entries will be produced so the vector is allocated once.
        let mut entry_count = 0u64;
        for range in gpa_memory_ranges {
            entry_count += page_count(range).div_ceil(PAGES_PER_ENTRY);
        }
        let mut hv_gpa_ranges = Vec::with_capacity(entry_count as usize);

        for range in gpa_memory_ranges {
            let total_pages = page_count(range);
            let first_gpn = range.start_4k_gpn();

            // Walk the range in strides of PAGES_PER_ENTRY pages; the last
            // entry may cover fewer pages.
            let mut offset = 0u64;
            while offset < total_pages {
                let pages_in_entry = (total_pages - offset).min(PAGES_PER_ENTRY);

                // The entry stores the count of pages beyond the first one.
                let extended = HvGpaRangeExtended::new()
                    .with_additional_pages(pages_in_entry - 1)
                    .with_large_page(false)
                    .with_gpa_page_number(first_gpn + offset);

                hv_gpa_ranges.push(HvGpaRange(extended.into_bits()));
                offset += PAGES_PER_ENTRY;
            }
        }

        hv_gpa_ranges
    }

    /// Issues the pin or unpin rep hypercall over `gpa_ranges`, submitting the
    /// ranges in chunks of at most `MAX_INPUT_ELEMENTS` entries.
    ///
    /// On failure the returned [`PinUnpinError`] records how many ranges had
    /// been processed up to and including the failing call. If every call
    /// succeeded but the processed count does not match the input length,
    /// `HvError::OperationFailed` is reported.
    fn pin_unpin_gpa_ranges_internal(
        &self,
        gpa_ranges: &[HvGpaRange],
        action: GpaPinUnpinAction,
    ) -> Result<(), PinUnpinError> {
        const PIN_REQUEST_HEADER_SIZE: usize =
            size_of::<hvdef::hypercall::PinUnpinGpaPageRangesHeader>();
        const MAX_INPUT_ELEMENTS: usize =
            (HV_PAGE_SIZE as usize - PIN_REQUEST_HEADER_SIZE) / size_of::<u64>();

        let header = hvdef::hypercall::PinUnpinGpaPageRangesHeader { reserved: 0 };
        let mut ranges_processed = 0;

        for chunk in gpa_ranges.chunks(MAX_INPUT_ELEMENTS) {
            let code = match action {
                GpaPinUnpinAction::UnpinGpaRange => HypercallCode::HvCallUnpinGpaPageRanges,
                GpaPinUnpinAction::PinGpaRange => HypercallCode::HvCallPinGpaPageRanges,
            };

            // SAFETY: This unsafe block is valid because:
            // 1. The code and header match the expected input for the hypercall.
            //
            // 2. Hypercall result is checked right after the hypercall is issued.
            //
            let submitted = unsafe {
                self.mshv_hvcall.hvcall_rep(
                    code,
                    &header,
                    HvcallRepInput::Elements(chunk),
                    None::<&mut [u8]>,
                )
            };
            let output = submitted.expect("submitting pin/unpin hypercall should not fail");

            // Count the elements first so a failure reports partial progress too.
            ranges_processed += output.elements_processed();

            if let Err(error) = output.result() {
                return Err(PinUnpinError {
                    ranges_processed,
                    error,
                });
            }
        }

        // At end all the ranges should be processed
        if ranges_processed != gpa_ranges.len() {
            return Err(PinUnpinError {
                ranges_processed,
                error: HvError::OperationFailed,
            });
        }

        Ok(())
    }

    /// Applies `action` to every range in `gpa_ranges`, rolling back on failure.
    ///
    /// The memory ranges are first converted into hypervisor range entries. If
    /// applying `action` fails part-way, `rollback_action` is applied to the
    /// entries that had already been processed and the original error is
    /// returned to the caller.
    ///
    /// # Errors
    /// Returns the hypervisor error reported for the failed `action`.
    ///
    /// # Panics
    /// Panics if the rollback itself fails.
    fn perform_pin_unpin_gpa_ranges(
        &self,
        gpa_ranges: &[MemoryRange],
        action: GpaPinUnpinAction,
        rollback_action: GpaPinUnpinAction,
    ) -> Result<(), HvError> {
        let hv_gpa_ranges: Vec<HvGpaRange> = Self::to_hv_gpa_range_array(gpa_ranges);

        // Attempt to pin/unpin the ranges
        match self.pin_unpin_gpa_ranges_internal(&hv_gpa_ranges, action) {
            Ok(_) => Ok(()),
            Err(PinUnpinError {
                error,
                ranges_processed,
            }) => {
                // Undo the action on the entries that were already processed.
                let processed_ranges = &hv_gpa_ranges[..ranges_processed];
                if let Err(rollback_error) =
                    self.pin_unpin_gpa_ranges_internal(processed_ranges, rollback_action)
                {
                    // Panic if rollback is failing. `ranges_processed` counts
                    // hypervisor range entries, so the total must be reported
                    // in the same unit (not the number of input memory ranges).
                    panic!(
                        "Failed to perform action {:?} on ranges. Error : {:?}. \
                        Attempted to rollback {:?} ranges out of {:?}.\n rollback error: {:?}",
                        action,
                        error,
                        ranges_processed,
                        hv_gpa_ranges.len(),
                        rollback_error
                    );
                }
                // Surface the original error
                Err(error)
            }
        }
    }

    /// Pins the given guest physical address ranges in the hypervisor.
    ///
    /// The memory ranges passed to this function must be VA backed memory.
    ///
    /// If only some of the ranges could be pinned, the ones that were pinned
    /// are unpinned again before the error is returned, so no partially pinned
    /// state is left behind.
    pub fn pin_gpa_ranges(&self, ranges: &[MemoryRange]) -> Result<(), HvError> {
        let (action, rollback_action) = (
            GpaPinUnpinAction::PinGpaRange,
            GpaPinUnpinAction::UnpinGpaRange,
        );
        self.perform_pin_unpin_gpa_ranges(ranges, action, rollback_action)
    }

    /// Unpins the given guest physical address ranges in the hypervisor.
    ///
    /// The memory ranges passed to this function must be VA backed memory.
    ///
    /// If only some of the ranges could be unpinned, the ones that were
    /// unpinned are pinned again before the error is returned, so no partially
    /// unpinned state is left behind.
    pub fn unpin_gpa_ranges(&self, ranges: &[MemoryRange]) -> Result<(), HvError> {
        let (action, rollback_action) = (
            GpaPinUnpinAction::UnpinGpaRange,
            GpaPinUnpinAction::PinGpaRange,
        );
        self.perform_pin_unpin_gpa_ranges(ranges, action, rollback_action)
    }

    /// Reads the VSM capabilities register for VTL2.
    ///
    /// For non-isolated and VBS partitions the register value is returned as
    /// reported. For SNP and TDX only the `deny_lower_vtl_startup` and
    /// `intercept_page_available` capabilities are passed through; TDX
    /// additionally reports `dr6_shared`.
    ///
    /// # Panics
    /// Panics if the resulting `dr6_shared` capability disagrees with
    /// `self.dr6_shared()`.
    pub fn get_vsm_capabilities(&self) -> Result<hvdef::HvRegisterVsmCapabilities, Error> {
        let raw = self
            .get_vp_register(
                HvAllArchRegisterName::VsmCapabilities,
                HvInputVtl::CURRENT_VTL,
            )?
            .as_u64();
        let reported = hvdef::HvRegisterVsmCapabilities::from(raw);

        // TODO SNP: Return actions may be useful, but with alternate injection many of these
        // cannot actually be processed by the hypervisor without returning to VTL2.
        // Filter them out for now.
        let filtered = hvdef::HvRegisterVsmCapabilities::new()
            .with_deny_lower_vtl_startup(reported.deny_lower_vtl_startup())
            .with_intercept_page_available(reported.intercept_page_available());

        let caps = match self.isolation {
            IsolationType::None | IsolationType::Vbs => reported,
            IsolationType::Snp => filtered,
            IsolationType::Tdx => filtered.with_dr6_shared(true),
        };

        assert_eq!(caps.dr6_shared(), self.dr6_shared());

        Ok(caps)
    }

    /// Set the [`hvdef::HvRegisterVsmPartitionConfig`] register.
    pub fn set_vtl2_vsm_partition_config(
        &self,
        vsm_config: HvRegisterVsmPartitionConfig,
    ) -> Result<(), SetVsmPartitionConfigError> {
        self.set_vp_register(
            HvAllArchRegisterName::VsmPartitionConfig.into(),
            HvRegisterValue::from(u64::from(vsm_config)),
            HvInputVtl::CURRENT_VTL,
        )
        .map_err(|e| SetVsmPartitionConfigError::Hypervisor {
            config: vsm_config,
            hv_error: e,
        })
    }

    /// Reads the [`hvdef::HvRegisterGuestVsmPartitionConfig`] register for the current VTL.
    pub fn get_guest_vsm_partition_config(
        &self,
    ) -> Result<hvdef::HvRegisterGuestVsmPartitionConfig, Error> {
        let raw = self
            .get_vp_register(
                HvAllArchRegisterName::GuestVsmPartitionConfig,
                HvInputVtl::CURRENT_VTL,
            )?
            .as_u64();

        Ok(hvdef::HvRegisterGuestVsmPartitionConfig::from(raw))
    }

    /// Configure guest VSM.
    /// The only configuration attribute currently supported is changing the maximum number of
    /// guest-visible virtual trust levels for the partition. (VTL 1)
    pub fn set_guest_vsm_partition_config(
        &self,
        enable_guest_vsm: bool,
    ) -> Result<(), SetGuestVsmConfigError> {
        let register_value = hvdef::HvRegisterGuestVsmPartitionConfig::new()
            .with_maximum_vtl(if enable_guest_vsm { 1 } else { 0 })
            .with_reserved(0);

        tracing::trace!(enable_guest_vsm, "set_guest_vsm_partition_config");
        if self.isolation.is_hardware_isolated() {
            unimplemented!("set_guest_vsm_partition_config");
        }

        self.set_vp_register(
            HvAllArchRegisterName::GuestVsmPartitionConfig.into(),
            HvRegisterValue::from(u64::from(register_value)),
            HvInputVtl::CURRENT_VTL,
        )
        .map_err(|e| SetGuestVsmConfigError::Hypervisor {
            enable_guest_vsm,
            hv_error: e,
        })
    }

    /// Sets the Power Management Timer assist in the hypervisor.
    ///
    /// `Some(port)` enables the assist on that port (with the 24-bit width
    /// flag cleared); `None` writes a zeroed register value.
    ///
    /// # Panics
    /// Enabling the assist (`Some(port)`) is not implemented for
    /// hardware-isolated partitions.
    #[cfg(guest_arch = "x86_64")]
    pub fn set_pm_timer_assist(&self, port: Option<u16>) -> Result<(), HvError> {
        tracing::debug!(?port, "set_pm_timer_assist");
        if self.isolation.is_hardware_isolated() && port.is_some() {
            unimplemented!("set_pm_timer_assist");
        }

        let timer_info: hvdef::HvPmTimerInfo = match port {
            None => 0.into(),
            Some(p) => hvdef::HvPmTimerInfo::new()
                .with_port(p)
                .with_enabled(true)
                .with_width_24(false),
        };
        let val = HvRegisterValue::from(u64::from(timer_info));

        let name: HvRegisterName = HvX64RegisterName::PmTimerAssist.into();
        self.set_vp_register(name, val, HvInputVtl::CURRENT_VTL)
    }

    /// Sets the Power Management Timer assist in the hypervisor.
    ///
    /// Not implemented on aarch64: the request is only logged and
    /// `HvError::UnknownRegisterName` is always returned.
    #[cfg(guest_arch = "aarch64")]
    pub fn set_pm_timer_assist(&self, port: Option<u16>) -> Result<(), HvError> {
        tracing::debug!(?port, "set_pm_timer_assist unimplemented on aarch64");
        Err(HvError::UnknownRegisterName)
    }

    /// Sets the VTL protection mask for the specified memory range.
    ///
    /// # Panics
    /// Not yet implemented for hardware-isolated partitions.
    pub fn modify_vtl_protection_mask(
        &self,
        range: MemoryRange,
        map_flags: HvMapGpaFlags,
        target_vtl: HvInputVtl,
    ) -> Result<(), ApplyVtlProtectionsError> {
        // TODO SNP TODO TDX - required for vmbus relay monitor page support
        if self.isolation.is_hardware_isolated() {
            todo!();
        }

        let hvcall = &self.mshv_hvcall;
        hvcall.modify_vtl_protection_mask(range, map_flags, target_vtl)
    }

    /// Checks whether the target vtl has vtl permissions for the given gpa
    pub fn check_vtl_access(
        &self,
        gpa: u64,
        target_vtl: GuestVtl,
        flags: HvMapGpaFlags,
    ) -> Result<Option<CheckVtlAccessResult>, Error> {
        assert!(!self.isolation.is_hardware_isolated());

        let header = hvdef::hypercall::CheckSparseGpaPageVtlAccess {
            partition_id: HV_PARTITION_ID_SELF,
            target_vtl: HvInputVtl::from(target_vtl),
            desired_access: u32::from(flags) as u8,
            reserved0: 0,
            reserved1: 0,
        };

        let mut output = [hvdef::hypercall::CheckSparseGpaPageVtlAccessOutput::new()];

        // SAFETY: The input header and rep slice are the correct types for this hypercall.
        //         The hypercall output is validated right after the hypercall is issued.
        let status = unsafe {
            self.mshv_hvcall.hvcall_rep::<hvdef::hypercall::CheckSparseGpaPageVtlAccess, u64, hvdef::hypercall::CheckSparseGpaPageVtlAccessOutput>(
                HypercallCode::HvCallCheckSparseGpaPageVtlAccess,
                &header,
                HvcallRepInput::Elements(&[gpa >> hvdef::HV_PAGE_SHIFT]),
                Some(&mut output),
            )
            .expect("check_vtl_access hypercall should not fail")
        };

        // TODO GUEST_VSM: for isolated VMs, if the status is operation denied,
        // return memory unaccepted?
        status.result().map_err(Error::CheckVtlAccess)?;

        let access_result = output[0];

        if access_result.result_code() as u32
            != hvdef::hypercall::CheckGpaPageVtlAccessResultCode::SUCCESS.0
        {
            return Ok(Some(CheckVtlAccessResult {
                vtl: (access_result.intercepting_vtl() as u8)
                    .try_into()
                    .expect("checking vtl permissions failure should return valid vtl"),
                denied_flags: (access_result.denied_access() as u32).into(),
            }));
        }

        assert_eq!(status.elements_processed(), 1);
        Ok(None)
    }

    /// Enables a vtl for the partition via the `HvCallEnablePartitionVtl` hypercall.
    ///
    /// # Errors
    /// Returns the hypervisor error if the hypercall completes with a failure status.
    pub fn enable_partition_vtl(
        &self,
        vtl: GuestVtl,
        flags: hvdef::hypercall::EnablePartitionVtlFlags,
    ) -> Result<(), HvError> {
        let header = hvdef::hypercall::EnablePartitionVtl {
            partition_id: HV_PARTITION_ID_SELF,
            target_vtl: vtl.into(),
            flags,
            reserved_z0: 0,
            reserved_z1: 0,
        };

        // SAFETY: The input header and slice are the correct types for this hypercall.
        //         The hypercall output is validated right after the hypercall is issued.
        let submitted = unsafe {
            self.mshv_hvcall
                .hvcall(HypercallCode::HvCallEnablePartitionVtl, &header, &mut ())
        };

        submitted
            .expect("submitting hypercall should not fail")
            .result()
    }

    /// Enables a vtl on a vp
    pub fn enable_vp_vtl(
        &self,
        vp_index: u32,
        vtl: GuestVtl,
        hv_vp_context: InitialVpContextX64,
    ) -> Result<(), HvError> {
        use hvdef::hypercall;

        let header = hypercall::EnableVpVtlX64 {
            partition_id: HV_PARTITION_ID_SELF,
            vp_index,
            target_vtl: vtl.into(),
            reserved: [0; 3],
            vp_vtl_context: hv_vp_context,
        };

        // SAFETY: The input header and slice are the correct types for this hypercall.
        //         The hypercall output is validated right after the hypercall is issued.
        let status = unsafe {
            self.mshv_hvcall
                .hvcall(HypercallCode::HvCallEnableVpVtl, &header, &mut ())
                .expect("submitting hypercall should not fail")
        };

        status.result()
    }

    /// Gets the PFN for the VTL 1 VMSA of the given CPU.
    ///
    /// # Panics
    /// Panics if the underlying ioctl fails.
    pub fn vtl1_vmsa_pfn(&self, cpu_index: u32) -> u64 {
        // The same slot carries the VP index in and the PFN out.
        let mut in_out = u64::from(cpu_index);
        let fd = self.mshv_vtl.file.as_raw_fd();

        // SAFETY: The ioctl requires no prerequisites other than the VTL 1 VMSA
        // should be mapped. This ioctl should never fail as long as the vtl 1
        // VMSA was mapped.
        let outcome = unsafe { hcl_read_guest_vsm_page_pfn(fd, &mut in_out) };
        outcome.expect("should always succeed");

        in_out
    }

    /// Returns the isolation type for the partition.
    pub fn isolation(&self) -> IsolationType {
        // Hands back a copy of the stored isolation type.
        self.isolation
    }

    /// Reads MSR_IA32_VMX_CR4_FIXED1 in kernel mode and returns its value.
    ///
    /// # Panics
    /// Panics if the underlying ioctl fails.
    pub fn read_vmx_cr4_fixed1(&self) -> u64 {
        let mut value = 0;
        let fd = self.mshv_vtl.file.as_raw_fd();

        // SAFETY: The ioctl requires no prerequisites other than a location to
        // write the read MSR. This ioctl should never fail.
        let outcome = unsafe { hcl_read_vmx_cr4_fixed1(fd, &mut value) };
        outcome.expect("should always succeed");

        value
    }

    /// Invokes the HvCallMemoryMappedIoRead hypercall.
    ///
    /// Reads `data.len()` bytes from `gpa` into `data`. `data` is only written
    /// when the hypercall succeeds.
    ///
    /// # Errors
    /// Returns the hypervisor error if the hypercall completes with a failure status.
    ///
    /// # Panics
    /// Panics if `data` is longer than `HV_HYPERCALL_MMIO_MAX_DATA_LENGTH`.
    pub fn memory_mapped_io_read(&self, gpa: u64, data: &mut [u8]) -> Result<(), HvError> {
        let len = data.len();
        assert!(len <= hvdef::hypercall::HV_HYPERCALL_MMIO_MAX_DATA_LENGTH);

        let header = hvdef::hypercall::MemoryMappedIoRead {
            gpa,
            access_width: len as u32,
            reserved_z0: 0,
        };

        let mut output: hvdef::hypercall::MemoryMappedIoReadOutput = FromZeros::new_zeroed();

        // SAFETY: The input header and slice are the correct types for this hypercall.
        //         The hypercall output is validated right after the hypercall is issued.
        let submitted = unsafe {
            self.mshv_hvcall.hvcall(
                HypercallCode::HvCallMemoryMappedIoRead,
                &header,
                &mut output,
            )
        };
        let status = submitted.expect("submitting hypercall should not fail");

        // Bail out before touching `data` if the hypercall was not successful.
        status.result()?;

        data.copy_from_slice(&output.data[..len]);
        Ok(())
    }

    /// Invokes the HvCallMemoryMappedIoWrite hypercall.
    ///
    /// Writes the bytes in `data` to `gpa`.
    ///
    /// # Errors
    /// Returns the hypervisor error if the hypercall completes with a failure status.
    ///
    /// # Panics
    /// Panics if `data` is longer than `HV_HYPERCALL_MMIO_MAX_DATA_LENGTH`.
    pub fn memory_mapped_io_write(&self, gpa: u64, data: &[u8]) -> Result<(), HvError> {
        let len = data.len();
        assert!(len <= hvdef::hypercall::HV_HYPERCALL_MMIO_MAX_DATA_LENGTH);

        // Fixed-size payload buffer; bytes beyond `len` stay zero.
        let mut payload = [0u8; hvdef::hypercall::HV_HYPERCALL_MMIO_MAX_DATA_LENGTH];
        payload[..len].copy_from_slice(data);

        let header = hvdef::hypercall::MemoryMappedIoWrite {
            gpa,
            access_width: len as u32,
            reserved_z0: 0,
            data: payload,
        };

        // SAFETY: The input header and slice are the correct types for this hypercall.
        //         The hypercall output is validated right after the hypercall is issued.
        let submitted = unsafe {
            self.mshv_hvcall
                .hvcall(HypercallCode::HvCallMemoryMappedIoWrite, &header, &mut ())
        };

        submitted
            .expect("submitting hypercall should not fail")
            .result()
    }

    /// Invokes the HvCallRetargetDeviceInterrupt hypercall.
    ///
    /// `target_processors` must be sorted in ascending order.
    ///
    /// # Errors
    /// Returns the hypervisor error if the hypercall completes with a failure status.
    pub fn retarget_device_interrupt(
        &self,
        device_id: u64,
        entry: hvdef::hypercall::InterruptEntry,
        vector: u32,
        multicast: bool,
        target_processors: ProcessorSet<'_>,
    ) -> Result<(), HvError> {
        let target_flags = hvdef::hypercall::HvInterruptTargetFlags::default()
            .with_multicast(multicast)
            .with_processor_set(true);

        let target_header = hvdef::hypercall::InterruptTarget {
            vector,
            flags: target_flags,
            // Always use a generic processor set to simplify construction. This hypercall is
            // invoked relatively infrequently, the overhead should be acceptable.
            mask_or_format: hvdef::hypercall::HV_GENERIC_SET_SPARSE_4K,
        };

        let header = hvdef::hypercall::RetargetDeviceInterrupt {
            partition_id: HV_PARTITION_ID_SELF,
            device_id,
            entry,
            rsvd: 0,
            target_header,
        };

        // The processor set is passed as the variable-sized part of the input.
        let processor_set = target_processors
            .as_generic_set()
            .into_iter()
            .collect::<Vec<_>>();

        // SAFETY: The input header and slice are the correct types for this hypercall.
        //         The hypercall output is validated right after the hypercall is issued.
        let submitted = unsafe {
            self.mshv_hvcall.hvcall_var(
                HypercallCode::HvCallRetargetDeviceInterrupt,
                &header,
                processor_set.as_bytes(),
                &mut (),
            )
        };

        submitted
            .expect("submitting hypercall should not fail")
            .result()
    }

    /// Gets the permissions for a vtl.
    /// Currently unused, but available for debugging purposes
    ///
    /// Queries a single page (the one containing `gpa`) through the RMP query
    /// ioctl. The target VMPL is 2 for VTL0 and 1 for VTL1. The returned value
    /// is decoded from the flags slot after the ioctl returns.
    ///
    /// # Panics
    /// Panics if the ioctl fails.
    #[cfg(debug_assertions)]
    pub fn rmp_query(&self, gpa: u64, vtl: GuestVtl) -> x86defs::snp::SevRmpAdjust {
        use x86defs::snp::SevRmpAdjust;

        let page_count = 1u64;
        // These three locals are handed to the driver as output pointers, so they
        // must be mutable bindings: writing through a pointer obtained from an
        // immutable binding is undefined behavior and would allow the compiler
        // to assume the values never change across the ioctl.
        let mut flags = [u64::from(SevRmpAdjust::new().with_target_vmpl(match vtl {
            GuestVtl::Vtl0 => 2,
            GuestVtl::Vtl1 => 1,
        }))];
        let mut page_size = [0u64];
        let mut pages_processed = 0u64;

        debug_assert!(flags.len() == page_count as usize);
        debug_assert!(page_size.len() == page_count as usize);

        let query = mshv_rmpquery {
            start_pfn: gpa / HV_PAGE_SIZE,
            page_count,
            terminate_on_failure: 0,
            ram: 0,
            padding: Default::default(),
            flags: flags.as_mut_ptr(),
            page_size: page_size.as_mut_ptr(),
            pages_processed: core::ptr::from_mut(&mut pages_processed),
        };

        // SAFETY: the input query is the correct type for this ioctl, and the
        // pointers it carries refer to live, mutable locals that outlive the call.
        unsafe {
            hcl_rmpquery_pages(self.mshv_vtl.file.as_raw_fd(), &query)
                .expect("should always succeed");
        }
        debug_assert!(pages_processed <= page_count);

        SevRmpAdjust::from(flags[0])
    }

    /// Issues an INVLPGB instruction with the given register operands.
    ///
    /// # Panics
    /// Panics if the underlying ioctl fails.
    pub fn invlpgb(&self, rax: u64, edx: u32, ecx: u32) {
        let fd = self.mshv_vtl.file.as_raw_fd();
        let request = mshv_invlpgb {
            rax,
            edx,
            ecx,
            _pad0: 0,
            _pad1: 0,
        };

        // SAFETY: ioctl has no prerequisites.
        let outcome = unsafe { hcl_invlpgb(fd, &request) };
        outcome.expect("should always succeed");
    }

    /// Issues a TLBSYNC instruction.
    ///
    /// # Panics
    /// Panics if the underlying ioctl fails.
    pub fn tlbsync(&self) {
        let fd = self.mshv_vtl.file.as_raw_fd();

        // SAFETY: ioctl has no prerequisites.
        let outcome = unsafe { hcl_tlbsync(fd) };
        outcome.expect("should always succeed");
    }
}
microsoft/openvmm

Branches

Tags

Clone

openhcl/hcl/src/ioctl.rs