From ce17a5015cf3e4d2533375a65554697ef8c8403a Mon Sep 17 00:00:00 2001
From: GnoCiYeH
Date: Mon, 15 Apr 2024 23:19:23 +0800
Subject: [PATCH 01/10] =?UTF-8?q?=E5=87=A0=E4=B8=AA=E7=BB=93=E6=9E=84?=
 =?UTF-8?q?=E4=BD=93?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 kernel/src/arch/x86_64/vm/kvm_host/mod.rs  | 40 ++++++++++
 kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs | 31 ++++++++
 kernel/src/arch/x86_64/vm/mod.rs           | 85 ++++++++++++++++++++
 kernel/src/arch/x86_64/vm/vmx/mod.rs       | 26 +++++++
 kernel/src/virt/vm/kvm_host/mod.rs         | 28 +++++++
 kernel/src/virt/vm/kvm_host/vcpu.rs        | 51 +++++++++++
 kernel/src/virt/vm/mod.rs                  |  1 +
 7 files changed, 262 insertions(+)
 create mode 100644 kernel/src/arch/x86_64/vm/kvm_host/mod.rs
 create mode 100644 kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs
 create mode 100644 kernel/src/arch/x86_64/vm/mod.rs
 create mode 100644 kernel/src/arch/x86_64/vm/vmx/mod.rs
 create mode 100644 kernel/src/virt/vm/kvm_host/mod.rs
 create mode 100644 kernel/src/virt/vm/kvm_host/vcpu.rs
 create mode 100644 kernel/src/virt/vm/mod.rs

diff --git a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs
new file mode 100644
index 000000000..dfd7501d4
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs
@@ -0,0 +1,40 @@
+use system_error::SystemError;
+
+pub mod vcpu;
+
+type SysResult<T> = Result<T, SystemError>;
+
+pub struct X86KvmArch {
+    /// Interrupt chip mode
+    irqchip_mode: KvmIrqChipMode,
+    /// id of the vCPU responsible for bootstrapping kvm (the BSP)
+    bsp_vcpu_id: usize,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum KvmIrqChipMode {
+    None,
+    Kernel,
+    Split,
+}
+
+pub trait KvmFunc: Send + Sync {
+    /// Returns the name of this hardware backend, e.g. "Vmx"
+    fn name(&self) -> &'static str;
+
+    /// Enable hardware support
+    /// (Note: only the dummy implementation may return ENOSYS, meaning "unspecified")
+    fn hardware_enable(&self) -> SysResult<()>;
+}
+
+pub struct DummyKvmFunc;
+
+impl KvmFunc for DummyKvmFunc {
+    fn name(&self) -> &'static str {
+        "kvm_dummy_ops"
+    }
+
+    fn hardware_enable(&self) -> SysResult<()> {
+        Err(SystemError::ENOSYS)
+    }
+}
diff --git a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs
new file mode 100644
index 000000000..f06bed31e
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs
@@ -0,0 +1,31 @@
+use crate::{
+    smp::cpu::ProcessorId,
+    virt::vm::kvm_host::vcpu::{MutilProcessorState, VirtCpu},
+};
+
+#[derive(Debug)]
+pub struct X86VcpuArch {
+    /// Host cpu on which the last VM entry was attempted
+    last_vmentry_cpu: ProcessorId,
+    /// Available registers (bitmask)
+    regs_avail: u32,
+    /// Dirty registers (bitmask)
+    regs_dirty: u32,
+    /// Multi-processor state
+    mp_state: MutilProcessorState,
+}
+
+impl VirtCpu {
+    pub fn init_arch(&mut self) {}
+}
+
+impl Default for X86VcpuArch {
+    fn default() -> Self {
+        Self {
+            last_vmentry_cpu: ProcessorId::INVALID,
+            regs_avail: !0,
+            regs_dirty: !0,
+            mp_state: MutilProcessorState::Runnable,
+        }
+    }
+}
diff --git a/kernel/src/arch/x86_64/vm/mod.rs b/kernel/src/arch/x86_64/vm/mod.rs
new file mode 100644
index 000000000..6233199d9
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/mod.rs
@@ -0,0 +1,85 @@
+use core::{
+    arch::x86_64::{_xgetbv, _XCR_XFEATURE_ENABLED_MASK},
+    sync::atomic::{AtomicU64, Ordering},
+};
+
+use alloc::{boxed::Box, sync::Arc};
+use raw_cpuid::CpuId;
+use system_error::SystemError;
+use x86::msr::{rdmsr, IA32_CSTAR, IA32_PAT};
+
+use crate::{
+    kerror,
+    libs::{lazy_init::Lazy, rwlock::RwLock},
+};
+
+use self::kvm_host::KvmFunc;
+
+pub mod kvm_host;
+pub mod vmx;
+
+static KVM_X86_MANAGER: Lazy<KvmArchManager> = Lazy::new();
+
+pub fn kvm_x86_ops() -> Option<&'static dyn KvmFunc> {
+    *KVM_X86_MANAGER.funcs.read()
+}
+
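+// Intended call pattern (a sketch; `VMX_FUNC` is a hypothetical vendor
+// registrar, not defined anywhere in this series): a vendor module registers
+// itself once, and callers then probe for it before touching hardware.
+//
+//     *KVM_X86_MANAGER.funcs.write() = Some(&VMX_FUNC);
+//     if let Some(ops) = kvm_x86_ops() {
+//         ops.hardware_enable()?;
+//     }
+//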
+pub struct KvmArchManager {
+    funcs: RwLock<Option<&'static dyn KvmFunc>>,
+    host_xcr0: AtomicU64,
+}
+
+impl KvmArchManager {
+    pub const KVM_MAX_VCPUS: usize = 1024;
+
+    /// Vendor-specific init work
+    pub fn vendor_init(&self) -> Result<(), SystemError> {
+        let cpuid = CpuId::new();
+        let cpu_feature = cpuid.get_feature_info().ok_or(SystemError::ENOSYS)?;
+
+        let kvm_x86_ops = kvm_x86_ops();
+
+        // Has a vendor module already been registered?
+        if let Some(ops) = kvm_x86_ops {
+            kerror!("[KVM] already loaded vendor module {}", ops.name());
+            return Err(SystemError::EEXIST);
+        }
+
+        // Make sure the cpu supports the FPU and FXSAVE/FXRSTOR
+        if !cpu_feature.has_fpu() || !cpu_feature.has_fxsave_fxstor() {
+            kerror!("[KVM] inadequate fpu");
+            return Err(SystemError::ENOSYS);
+        }
+
+        // TODO: a realtime kernel would also have to validate the TSC here
+        // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9472
+
+        // Read the host page attribute table (PAT)
+        let host_pat = unsafe { rdmsr(IA32_PAT) };
+        // PAT[0] must be of write-back type, i.e. the low three bits must be 0b110 (0x06)
+        if host_pat & 0b111 != 0b110 {
+            kerror!("[KVM] host PAT[0] is not WB");
+            return Err(SystemError::EIO);
+        }
+
+        // TODO: mmu vendor init
+
+        if cpu_feature.has_xsave() {
+            self.host_xcr0.store(
+                unsafe { _xgetbv(_XCR_XFEATURE_ENABLED_MASK) },
+                Ordering::SeqCst,
+            );
+        }
+
+        Ok(())
+    }
+}
+
+/// ### KVM feature capabilities
+#[derive(Debug)]
+pub struct KvmCapabilities {
+    has_tsc_control: bool,
+    max_guest_tsc_khz: u32,
+    tsc_scaling_ratio_frac_bits: u8,
+
+}
diff --git a/kernel/src/arch/x86_64/vm/vmx/mod.rs b/kernel/src/arch/x86_64/vm/vmx/mod.rs
new file mode 100644
index 000000000..e9921ec71
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/vmx/mod.rs
@@ -0,0 +1,26 @@
+use raw_cpuid::CpuId;
+
+pub struct Vmx;
+
+impl Vmx {
+    /// @brief Check whether the CPU supports virtualization
+    pub fn kvm_arch_cpu_supports_vm() -> bool {
+        let cpuid = CpuId::new();
+        // Check to see if CPU is Intel ("GenuineIntel").
+        if let Some(vi) = cpuid.get_vendor_info() {
+            if vi.as_str() != "GenuineIntel" {
+                return false;
+            }
+        }
+        // Check processor support for Virtual Machine Extension (VMX) technology:
+        // CPUID.1:ECX.VMX[bit 5] = 1 (Intel Manual: 24.6 Discovering Support for VMX)
+        if let Some(fi) = cpuid.get_feature_info() {
+            if !fi.has_vmx() {
+                return false;
+            }
+        }
+        return true;
+    }
+}
+
+pub fn vmx_init() {}
diff --git a/kernel/src/virt/vm/kvm_host/mod.rs b/kernel/src/virt/vm/kvm_host/mod.rs
new file mode 100644
index 000000000..92e915644
--- /dev/null
+++ b/kernel/src/virt/vm/kvm_host/mod.rs
@@ -0,0 +1,28 @@
+use core::sync::atomic::AtomicUsize;
+
+use alloc::string::String;
+
+use crate::mm::ucontext::AddressSpace;
+
+pub mod vcpu;
+
+const KVM_ADDRESS_SPACE_NUM: usize = 1;
+
+pub struct KvmMemSlots {
+    /// Most recently used memory slot
+    last_use: AtomicUsize,
+    /// Maps host virtual addresses (hva) to memory slots
+    // Rbt
+    /// Maps guest page frame numbers (gfn) to memory slots
+    // Rbt
+    /// Maps a memory slot id to the corresponding slot
+    // HashMap
+    /// Node index
+    node_idx: usize,
+}
+
+pub struct Vm {
+    mm: AddressSpace,
+    max_vcpus: usize,
+    name: String,
+}
diff --git a/kernel/src/virt/vm/kvm_host/vcpu.rs b/kernel/src/virt/vm/kvm_host/vcpu.rs
new file mode 100644
index 000000000..fc54f11a4
--- /dev/null
+++ b/kernel/src/virt/vm/kvm_host/vcpu.rs
@@ -0,0 +1,51 @@
+use alloc::{string::String, sync::Arc};
+
+use crate::{
+    process::{Pid, ProcessManager},
+    smp::cpu::ProcessorId,
+};
+
+use super::{KvmMemSlots, Vm};
+
+pub struct VirtCpu {
+    cpu: ProcessorId,
+    kvm: Arc<Vm>,
+    vcpu_id: usize,
+    pid: Option<Pid>,
+    preempted: bool,
+    ready: bool,
+    last_used_slot: Option<Arc<KvmMemSlots>>,
+    stats_id: String,
+}
+
+impl VirtCpu {
+    /// ### Create a vCPU and initialize part of its data
+    pub fn create(vm: Arc<Vm>, id: usize) -> Self {
+        Self {
+            cpu: ProcessorId::INVALID,
+            kvm: vm,
+            vcpu_id: id,
+            pid:
None, + preempted: false, + ready: false, + last_used_slot: None, + stats_id: format!("kvm-{}/vcpu-{}", ProcessManager::current_pid().data(), id), + } + } +} + +/// ## 多处理器状态(有些状态在某些架构并不合法) +#[derive(Debug, Clone, Copy)] +pub enum MutilProcessorState { + Runnable, + Uninitialized, + InitReceived, + Halted, + SipiReceived, + Stopped, + CheckStop, + Operating, + Load, + ApResetHold, + Suspended, +} diff --git a/kernel/src/virt/vm/mod.rs b/kernel/src/virt/vm/mod.rs new file mode 100644 index 000000000..cf82f0060 --- /dev/null +++ b/kernel/src/virt/vm/mod.rs @@ -0,0 +1 @@ +pub mod kvm_host; \ No newline at end of file From 074b3a9b06b2a3a447ecf90e0a9100f14e9700d0 Mon Sep 17 00:00:00 2001 From: GnoCiYeH Date: Sat, 4 May 2024 15:40:00 +0800 Subject: [PATCH 02/10] =?UTF-8?q?=E9=80=9A=E8=BF=87vmx=5Finit=E4=BB=A5?= =?UTF-8?q?=E5=8F=8Acreate=5Fvm=EF=BC=8Ccreate=5Fvcpu=E9=83=A8=E5=88=86TOD?= =?UTF-8?q?O?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/Cargo.toml | 2 +- kernel/crates/bitmap/src/alloc_bitmap.rs | 2 +- kernel/src/arch/x86_64/kvm/vmx/vcpu.rs | 2 +- kernel/src/arch/x86_64/mod.rs | 9 + kernel/src/arch/x86_64/vm/asm.rs | 436 +++++++ kernel/src/arch/x86_64/vm/cpuid.rs | 58 + kernel/src/arch/x86_64/vm/kvm_host/lapic.rs | 43 + kernel/src/arch/x86_64/vm/kvm_host/mod.rs | 284 ++++- kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs | 554 ++++++++- kernel/src/arch/x86_64/vm/mem.rs | 21 + kernel/src/arch/x86_64/vm/mmu.rs | 356 ++++++ kernel/src/arch/x86_64/vm/mod.rs | 547 ++++++++- kernel/src/arch/x86_64/vm/vmx/capabilities.rs | 509 ++++++++ kernel/src/arch/x86_64/vm/vmx/mod.rs | 1066 ++++++++++++++++- kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs | 158 +++ kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs | 265 ++++ kernel/src/init/init.rs | 5 +- kernel/src/lib.rs | 2 + kernel/src/libs/rbtree.rs | 9 + kernel/src/mm/mod.rs | 2 +- kernel/src/virt/mod.rs | 1 + kernel/src/virt/vm/kvm_dev.rs | 428 +++++++ kernel/src/virt/vm/kvm_host/mem.rs | 502 ++++++++ kernel/src/virt/vm/kvm_host/mod.rs | 236 +++- kernel/src/virt/vm/kvm_host/vcpu.rs | 115 +- kernel/src/virt/vm/mod.rs | 4 +- kernel/src/virt/vm/user_api.rs | 430 +++++++ user/apps/test_kvm/main.c | 20 +- 28 files changed, 5946 insertions(+), 120 deletions(-) create mode 100644 kernel/src/arch/x86_64/vm/asm.rs create mode 100644 kernel/src/arch/x86_64/vm/cpuid.rs create mode 100644 kernel/src/arch/x86_64/vm/kvm_host/lapic.rs create mode 100644 kernel/src/arch/x86_64/vm/mem.rs create mode 100644 kernel/src/arch/x86_64/vm/mmu.rs create mode 100644 kernel/src/arch/x86_64/vm/vmx/capabilities.rs create mode 100644 kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs create mode 100644 kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs create mode 100644 kernel/src/virt/vm/kvm_dev.rs create mode 100644 kernel/src/virt/vm/kvm_host/mem.rs create mode 100644 kernel/src/virt/vm/user_api.rs diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml index 941fe9f8a..a62c492be 100644 --- a/kernel/Cargo.toml +++ b/kernel/Cargo.toml @@ -82,4 +82,4 @@ debug = true # Controls whether the compiler passes `-g` # The release profile, used for `cargo build --release` [profile.release] -debug = false +debug = true diff --git a/kernel/crates/bitmap/src/alloc_bitmap.rs b/kernel/crates/bitmap/src/alloc_bitmap.rs index d4c6c5107..36ee33e3d 100644 --- a/kernel/crates/bitmap/src/alloc_bitmap.rs +++ b/kernel/crates/bitmap/src/alloc_bitmap.rs @@ -2,7 +2,7 @@ use alloc::vec::Vec; use crate::{bitmap_core::BitMapCore, traits::BitMapOps}; -#[derive(Clone)] +#[derive(Debug, 
Clone)] pub struct AllocBitmap { elements: usize, data: Vec, diff --git a/kernel/src/arch/x86_64/kvm/vmx/vcpu.rs b/kernel/src/arch/x86_64/kvm/vmx/vcpu.rs index f940069f8..4b5e813fc 100644 --- a/kernel/src/arch/x86_64/kvm/vmx/vcpu.rs +++ b/kernel/src/arch/x86_64/kvm/vmx/vcpu.rs @@ -503,7 +503,7 @@ pub fn get_segment_base(gdt_base: *const u64, gdt_size: u16, segment_selector: u // } pub fn adjust_vmx_controls(ctl_min: u32, ctl_opt: u32, msr: u32, result: &mut u32) { let vmx_msr_low: u32 = unsafe { (msr::rdmsr(msr) & 0x0000_0000_FFFF_FFFF) as u32 }; - let vmx_msr_high: u32 = unsafe { (msr::rdmsr(msr) << 32) as u32 }; + let vmx_msr_high: u32 = unsafe { (msr::rdmsr(msr) >> 32) as u32 }; let mut ctl: u32 = ctl_min | ctl_opt; ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ diff --git a/kernel/src/arch/x86_64/mod.rs b/kernel/src/arch/x86_64/mod.rs index b2069f087..11c7ff27b 100644 --- a/kernel/src/arch/x86_64/mod.rs +++ b/kernel/src/arch/x86_64/mod.rs @@ -19,6 +19,7 @@ pub mod sched; pub mod smp; pub mod syscall; pub mod time; +pub mod vm; pub use self::pci::pci::X86_64PciArch as PciArch; @@ -38,3 +39,11 @@ pub use crate::arch::elf::X86_64ElfArch as CurrentElfArch; pub use crate::arch::smp::X86_64SMPArch as CurrentSMPArch; pub use crate::arch::sched::X86_64SchedArch as CurrentSchedArch; + +pub use crate::arch::vm::KvmArchManager as CurrentKvmManager; + +pub use crate::arch::vm::kvm_host::X86KvmArch as KvmArch; + +pub use crate::arch::vm::x86_kvm_ops as kvm_arch_ops; + +pub use crate::arch::vm::kvm_host::vcpu::X86VcpuArch as VirtCpuArch; diff --git a/kernel/src/arch/x86_64/vm/asm.rs b/kernel/src/arch/x86_64/vm/asm.rs new file mode 100644 index 000000000..92867c616 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/asm.rs @@ -0,0 +1,436 @@ +use core::arch::asm; + +use alloc::slice; +use raw_cpuid::CpuId; +use system_error::SystemError; +use x86::{ + bits64::vmx::vmxon, + controlregs::{cr0, cr0_write, cr4, cr4_write, Cr0, Cr4}, + msr::{ + rdmsr, wrmsr, IA32_FEATURE_CONTROL, IA32_VMX_CR0_FIXED0, IA32_VMX_CR0_FIXED1, + IA32_VMX_CR4_FIXED0, IA32_VMX_CR4_FIXED1, + }, +}; + +use crate::{ + arch::mm::barrier, + kdebug, + mm::{phys_2_virt, PhysAddr}, +}; + +pub struct KvmX86Asm; + +impl KvmX86Asm { + pub fn read_pkru() -> u32 { + let cpuid = CpuId::new(); + if let Some(feat) = cpuid.get_extended_feature_info() { + if feat.has_ospke() { + return Self::rdpkru(); + } + } + return 0; + } + + fn rdpkru() -> u32 { + let ecx: u32 = 0; + let pkru: u32; + let edx: u32; + + unsafe { + asm!( + "rdpkru", + out("eax") pkru, + out("edx") edx, + in("ecx") ecx, + ); + } + + pkru + } + + pub fn get_segment_base(gdt_base: *const u64, gdt_size: u16, segment_selector: u16) -> u64 { + let table = segment_selector & 0x0004; // get table indicator in selector + let index = (segment_selector >> 3) as usize; // get index in selector + if table == 0 && index == 0 { + return 0; + } + let descriptor_table = unsafe { slice::from_raw_parts(gdt_base, gdt_size.into()) }; + let descriptor = descriptor_table[index]; + + let base_high = (descriptor & 0xFF00_0000_0000_0000) >> 32; + let base_mid = (descriptor & 0x0000_00FF_0000_0000) >> 16; + let base_low = (descriptor & 0x0000_0000_FFFF_0000) >> 16; + let segment_base = (base_high | base_mid | base_low) & 0xFFFFFFFF; + let virtaddr = phys_2_virt(segment_base.try_into().unwrap()) + .try_into() + .unwrap(); + kdebug!( + "segment_base={:x}", + phys_2_virt(segment_base.try_into().unwrap()) + ); + return virtaddr; + } +} 
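+
+// Note on `get_segment_base` above: in a GDT descriptor the 32-bit segment
+// base is scattered as base[15:0] in descriptor bits 31:16, base[23:16] in
+// bits 39:32, and base[31:24] in bits 63:56 (Intel SDM Vol. 3A, "Segment
+// Descriptors"); the three mask-and-shift steps simply reassemble those
+// pieces into a single 32-bit value before it is converted to a virtual
+// address.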
+ +pub struct VmxAsm; + +impl VmxAsm { + pub fn vmclear(phys_addr: PhysAddr) { + kdebug!("vmclear addr {phys_addr:?}"); + match unsafe { x86::bits64::vmx::vmclear(phys_addr.data() as u64) } { + Ok(_) => {} + Err(e) => { + panic!("[VMX] vmclear failed! reason: {e:?}"); + } + } + } + + pub fn vmcs_load(phys_addr: PhysAddr) { + match unsafe { x86::bits64::vmx::vmptrld(phys_addr.data() as u64) } { + Ok(_) => {} + Err(e) => { + panic!("[VMX] vmptrld failed! reason: {e:?}"); + } + } + } + + /// vmrite the current VMCS. + pub fn vmx_vmwrite(vmcs_field: u32, value: u64) { + unsafe { + x86::bits64::vmx::vmwrite(vmcs_field, value) + .expect(&format!("vmcs_field: {:x} vmx_write fail", vmcs_field)) + } + } + + pub fn kvm_cpu_vmxon(phys_addr: PhysAddr) -> Result<(), SystemError> { + unsafe { + let mut cr4 = cr4(); + cr4.insert(Cr4::CR4_ENABLE_VMX); + cr4_write(cr4); + + Self::vmx_set_lock_bit()?; + Self::vmx_set_cr0_bits(); + Self::vmx_set_cr4_bits(); + kdebug!("vmxon addr {phys_addr:?}"); + + vmxon(phys_addr.data() as u64).expect("[VMX] vmxon failed! reason"); + + barrier::mfence(); + + Ok(()) + } + } + + /// Set the mandatory bits in CR4 and clear bits that are mandatory zero + /// (Intel Manual: 24.8 Restrictions on VMX Operation) + fn vmx_set_cr4_bits() { + let ia32_vmx_cr4_fixed0 = unsafe { rdmsr(IA32_VMX_CR4_FIXED0) }; + let ia32_vmx_cr4_fixed1 = unsafe { rdmsr(IA32_VMX_CR4_FIXED1) }; + + let mut cr4 = unsafe { cr4() }; + + cr4 |= Cr4::from_bits_truncate(ia32_vmx_cr4_fixed0 as usize); + cr4 &= Cr4::from_bits_truncate(ia32_vmx_cr4_fixed1 as usize); + + unsafe { cr4_write(cr4) }; + } + + /// Check if we need to set bits in IA32_FEATURE_CONTROL + // (Intel Manual: 24.7 Enabling and Entering VMX Operation) + fn vmx_set_lock_bit() -> Result<(), SystemError> { + const VMX_LOCK_BIT: u64 = 1 << 0; + const VMXON_OUTSIDE_SMX: u64 = 1 << 2; + + let ia32_feature_control = unsafe { rdmsr(IA32_FEATURE_CONTROL) }; + + if (ia32_feature_control & VMX_LOCK_BIT) == 0 { + unsafe { + wrmsr( + IA32_FEATURE_CONTROL, + VMXON_OUTSIDE_SMX | VMX_LOCK_BIT | ia32_feature_control, + ) + }; + } else if (ia32_feature_control & VMXON_OUTSIDE_SMX) == 0 { + return Err(SystemError::EPERM); + } + + Ok(()) + } + + /// Set the mandatory bits in CR0 and clear bits that are mandatory zero + /// (Intel Manual: 24.8 Restrictions on VMX Operation) + fn vmx_set_cr0_bits() { + let ia32_vmx_cr0_fixed0 = unsafe { rdmsr(IA32_VMX_CR0_FIXED0) }; + let ia32_vmx_cr0_fixed1 = unsafe { rdmsr(IA32_VMX_CR0_FIXED1) }; + + let mut cr0 = unsafe { cr0() }; + + cr0 |= Cr0::from_bits_truncate(ia32_vmx_cr0_fixed0 as usize); + cr0 &= Cr0::from_bits_truncate(ia32_vmx_cr0_fixed1 as usize); + + unsafe { cr0_write(cr0) }; + } +} + +bitflags! 
{ + pub struct MiscEnable: u64 { + const MSR_IA32_MISC_ENABLE_FAST_STRING = 1 << 0; + const MSR_IA32_MISC_ENABLE_TCC = 1 << 1; + const MSR_IA32_MISC_ENABLE_EMON = 1 << 7; + const MSR_IA32_MISC_ENABLE_BTS_UNAVAIL = 1 << 11; + const MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL = 1 << 12; + const MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP = 1 << 16; + const MSR_IA32_MISC_ENABLE_MWAIT = 1 << 18; + const MSR_IA32_MISC_ENABLE_LIMIT_CPUID= 1 << 22; + const MSR_IA32_MISC_ENABLE_XTPR_DISABLE = 1 << 23; + const MSR_IA32_MISC_ENABLE_XD_DISABLE = 1 << 34; + } + + pub struct ArchCapabilities: u64 { + /// Not susceptible to Meltdown + const ARCH_CAP_RDCL_NO = 1 << 0; + /// Enhanced IBRS support + const ARCH_CAP_IBRS_ALL = 1 << 1; + /// RET may use alternative branch predictors + const ARCH_CAP_RSBA = 1 << 2; + /// Skip L1D flush on vmentry + const ARCH_CAP_SKIP_VMENTRY_L1DFLUSH = 1 << 3; + /// + /// Not susceptible to Speculative Store Bypass + /// attack, so no Speculative Store Bypass + /// control required. + /// + const ARCH_CAP_SSB_NO = 1 << 4; + /// Not susceptible to + /// Microarchitectural Data + /// Sampling (MDS) vulnerabilities. + const ARCH_CAP_MDS_NO = 1 << 5; + /// The processor is not susceptible to a + /// machine check error due to modifying the + /// code page size along with either the + /// physical address or cache type + /// without TLB invalidation. + const ARCH_CAP_PSCHANGE_MC_NO = 1 << 6; + /// MSR for TSX control is available. + const ARCH_CAP_TSX_CTRL_MSR = 1 << 7; + /// Not susceptible to + /// TSX Async Abort (TAA) vulnerabilities. + const ARCH_CAP_TAA_NO = 1 << 8; + /// Not susceptible to SBDR and SSDP + /// variants of Processor MMIO stale data + /// vulnerabilities. + const ARCH_CAP_SBDR_SSDP_NO = 1 << 13; + /// Not susceptible to FBSDP variant of + /// Processor MMIO stale data + /// vulnerabilities. + const ARCH_CAP_FBSDP_NO = 1 << 14; + /// Not susceptible to PSDP variant of + /// Processor MMIO stale data + /// vulnerabilities. + const ARCH_CAP_PSDP_NO = 1 << 15; + /// VERW clears CPU fill buffer + /// even on MDS_NO CPUs. + const ARCH_CAP_FB_CLEAR = 1 << 17; + /// MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS] + /// bit available to control VERW + /// behavior. + const ARCH_CAP_FB_CLEAR_CTRL = 1 << 18; + /// Indicates RET may use predictors + /// other than the RSB. With eIBRS + /// enabled predictions in kernel mode + /// are restricted to targets in + /// kernel. + const ARCH_CAP_RRSBA = 1 << 19; + /// Not susceptible to Post-Barrier + /// Return Stack Buffer Predictions. + const ARCH_CAP_PBRSB_NO = 1 << 24; + /// CPU is vulnerable to Gather + /// Data Sampling (GDS) and + /// has controls for mitigation. + const ARCH_CAP_GDS_CTRL = 1 << 25; + /// CPU is not vulnerable to Gather + /// Data Sampling (GDS). 
+ const ARCH_CAP_GDS_NO = 1 << 26; + /// IA32_XAPIC_DISABLE_STATUS MSR + /// supported + const ARCH_CAP_XAPIC_DISABLE = 1 << 21; + + const KVM_SUPPORTED_ARCH_CAP = ArchCapabilities::ARCH_CAP_RDCL_NO.bits + | ArchCapabilities::ARCH_CAP_IBRS_ALL.bits + | ArchCapabilities::ARCH_CAP_RSBA.bits + | ArchCapabilities::ARCH_CAP_SKIP_VMENTRY_L1DFLUSH.bits + | ArchCapabilities::ARCH_CAP_SSB_NO.bits + | ArchCapabilities::ARCH_CAP_MDS_NO.bits + | ArchCapabilities::ARCH_CAP_PSCHANGE_MC_NO.bits + | ArchCapabilities::ARCH_CAP_TSX_CTRL_MSR.bits + | ArchCapabilities::ARCH_CAP_TAA_NO.bits + | ArchCapabilities::ARCH_CAP_SBDR_SSDP_NO.bits + | ArchCapabilities::ARCH_CAP_FBSDP_NO.bits + | ArchCapabilities::ARCH_CAP_PSDP_NO.bits + | ArchCapabilities::ARCH_CAP_FB_CLEAR.bits + | ArchCapabilities::ARCH_CAP_RRSBA.bits + | ArchCapabilities::ARCH_CAP_PBRSB_NO.bits + | ArchCapabilities::ARCH_CAP_GDS_NO.bits; + } +} + +#[derive(Debug, Default, Copy, Clone)] +pub struct MsrData { + pub host_initiated: bool, + pub index: u32, + pub data: u64, +} + +#[derive(Debug, Default, Copy, Clone)] +pub struct KvmMsrEntry { + pub index: u32, + pub reserved: u32, + pub data: u64, +} + +pub mod hyperv { + /* Hyper-V specific model specific registers (MSRs) */ + + /* MSR used to identify the guest OS. */ + pub const HV_X64_MSR_GUEST_OS_ID: u32 = 0x40000000; + + /* MSR used to setup pages used to communicate with the hypervisor. */ + pub const HV_X64_MSR_HYPERCALL: u32 = 0x40000001; + + /* MSR used to provide vcpu index */ + pub const HV_REGISTER_VP_INDEX: u32 = 0x40000002; + + /* MSR used to reset the guest OS. */ + pub const HV_X64_MSR_RESET: u32 = 0x40000003; + + /* MSR used to provide vcpu runtime in 100ns units */ + pub const HV_X64_MSR_VP_RUNTIME: u32 = 0x40000010; + + /* MSR used to read the per-partition time reference counter */ + pub const HV_REGISTER_TIME_REF_COUNT: u32 = 0x40000020; + + /* A partition's reference time stamp counter (TSC) page */ + pub const HV_REGISTER_REFERENCE_TSC: u32 = 0x40000021; + + /* MSR used to retrieve the TSC frequency */ + pub const HV_X64_MSR_TSC_FREQUENCY: u32 = 0x40000022; + + /* MSR used to retrieve the local APIC timer frequency */ + pub const HV_X64_MSR_APIC_FREQUENCY: u32 = 0x40000023; + + /* Define the virtual APIC registers */ + pub const HV_X64_MSR_EOI: u32 = 0x40000070; + pub const HV_X64_MSR_ICR: u32 = 0x40000071; + pub const HV_X64_MSR_TPR: u32 = 0x40000072; + pub const HV_X64_MSR_VP_ASSIST_PAGE: u32 = 0x40000073; + + /* Define synthetic interrupt controller model specific registers. 
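+     * (Per the Hyper-V TLFS: SCONTROL enables the SynIC, SIEFP and SIMP hold
+     * the event-flags and message-page addresses, EOM signals end of message,
+     * and SINT0..SINT15 configure the sixteen synthetic interrupt sources.)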
*/ + pub const HV_REGISTER_SCONTROL: u32 = 0x40000080; + pub const HV_REGISTER_SVERSION: u32 = 0x40000081; + pub const HV_REGISTER_SIEFP: u32 = 0x40000082; + pub const HV_REGISTER_SIMP: u32 = 0x40000083; + pub const HV_REGISTER_EOM: u32 = 0x40000084; + pub const HV_REGISTER_SINT0: u32 = 0x40000090; + pub const HV_REGISTER_SINT1: u32 = 0x40000091; + pub const HV_REGISTER_SINT2: u32 = 0x40000092; + pub const HV_REGISTER_SINT3: u32 = 0x40000093; + pub const HV_REGISTER_SINT4: u32 = 0x40000094; + pub const HV_REGISTER_SINT5: u32 = 0x40000095; + pub const HV_REGISTER_SINT6: u32 = 0x40000096; + pub const HV_REGISTER_SINT7: u32 = 0x40000097; + pub const HV_REGISTER_SINT8: u32 = 0x40000098; + pub const HV_REGISTER_SINT9: u32 = 0x40000099; + pub const HV_REGISTER_SINT10: u32 = 0x4000009A; + pub const HV_REGISTER_SINT11: u32 = 0x4000009B; + pub const HV_REGISTER_SINT12: u32 = 0x4000009C; + pub const HV_REGISTER_SINT13: u32 = 0x4000009D; + pub const HV_REGISTER_SINT14: u32 = 0x4000009E; + pub const HV_REGISTER_SINT15: u32 = 0x4000009F; + + /* + * Define synthetic interrupt controller model specific registers for + * nested hypervisor. + */ + pub const HV_REGISTER_NESTED_SCONTROL: u32 = 0x40001080; + pub const HV_REGISTER_NESTED_SVERSION: u32 = 0x40001081; + pub const HV_REGISTER_NESTED_SIEFP: u32 = 0x40001082; + pub const HV_REGISTER_NESTED_SIMP: u32 = 0x40001083; + pub const HV_REGISTER_NESTED_EOM: u32 = 0x40001084; + pub const HV_REGISTER_NESTED_SINT0: u32 = 0x40001090; + + /* + * Synthetic Timer MSRs. Four timers per vcpu. + */ + pub const HV_REGISTER_STIMER0_CONFIG: u32 = 0x400000B0; + pub const HV_REGISTER_STIMER0_COUNT: u32 = 0x400000B1; + pub const HV_REGISTER_STIMER1_CONFIG: u32 = 0x400000B2; + pub const HV_REGISTER_STIMER1_COUNT: u32 = 0x400000B3; + pub const HV_REGISTER_STIMER2_CONFIG: u32 = 0x400000B4; + pub const HV_REGISTER_STIMER2_COUNT: u32 = 0x400000B5; + pub const HV_REGISTER_STIMER3_CONFIG: u32 = 0x400000B6; + pub const HV_REGISTER_STIMER3_COUNT: u32 = 0x400000B7; + + /* Hyper-V guest idle MSR */ + pub const HV_X64_MSR_GUEST_IDLE: u32 = 0x400000F0; + + /* Hyper-V guest crash notification MSR's */ + pub const HV_REGISTER_CRASH_P0: u32 = 0x40000100; + pub const HV_REGISTER_CRASH_P1: u32 = 0x40000101; + pub const HV_REGISTER_CRASH_P2: u32 = 0x40000102; + pub const HV_REGISTER_CRASH_P3: u32 = 0x40000103; + pub const HV_REGISTER_CRASH_P4: u32 = 0x40000104; + pub const HV_REGISTER_CRASH_CTL: u32 = 0x40000105; + + /* TSC emulation after migration */ + pub const HV_X64_MSR_REENLIGHTENMENT_CONTROL: u32 = 0x40000106; + pub const HV_X64_MSR_TSC_EMULATION_CONTROL: u32 = 0x40000107; + pub const HV_X64_MSR_TSC_EMULATION_STATUS: u32 = 0x40000108; + + /* TSC invariant control */ + pub const HV_X64_MSR_TSC_INVARIANT_CONTROL: u32 = 0x40000118; + + /* + * The defines related to the synthetic debugger are required by KDNet, but + * they are not documented in the Hyper-V TLFS because the synthetic debugger + * functionality has been deprecated and is subject to removal in future + * versions of Windows. + */ + pub const HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: u32 = 0x40000080; + pub const HYPERV_CPUID_SYNDBG_INTERFACE: u32 = 0x40000081; + pub const HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: u32 = 0x40000082; + + /* + * Hyper-V synthetic debugger platform capabilities + * These are HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX bits. 
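+     * (Of these, only bit 1, ALLOW_KERNEL_DEBUGGING, is defined below.)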
+ */ + pub const HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING: u32 = 1 << 1; + + /* Hyper-V Synthetic debug options MSR */ + pub const HV_X64_MSR_SYNDBG_CONTROL: u32 = 0x400000F1; + pub const HV_X64_MSR_SYNDBG_STATUS: u32 = 0x400000F2; + pub const HV_X64_MSR_SYNDBG_SEND_BUFFER: u32 = 0x400000F3; + pub const HV_X64_MSR_SYNDBG_RECV_BUFFER: u32 = 0x400000F4; + pub const HV_X64_MSR_SYNDBG_PENDING_BUFFER: u32 = 0x400000F5; + pub const HV_X64_MSR_SYNDBG_OPTIONS: u32 = 0x400000FF; +} + +pub mod kvm_msr { + pub const MSR_KVM_WALL_CLOCK: u32 = 0x11; + pub const MSR_KVM_SYSTEM_TIME: u32 = 0x12; + + /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ + pub const MSR_KVM_WALL_CLOCK_NEW: u32 = 0x4b564d00; + pub const MSR_KVM_SYSTEM_TIME_NEW: u32 = 0x4b564d01; + pub const MSR_KVM_ASYNC_PF_EN: u32 = 0x4b564d02; + pub const MSR_KVM_STEAL_TIME: u32 = 0x4b564d03; + pub const MSR_KVM_PV_EOI_EN: u32 = 0x4b564d04; + pub const MSR_KVM_POLL_CONTROL: u32 = 0x4b564d05; + pub const MSR_KVM_ASYNC_PF_INT: u32 = 0x4b564d06; + pub const MSR_KVM_ASYNC_PF_ACK: u32 = 0x4b564d07; + pub const MSR_KVM_MIGRATION_CONTROL: u32 = 0x4b564d08; + + pub const PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x00000016; + pub const CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x0401e172; + pub const VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x00036dff; + pub const VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x000011ff; +} diff --git a/kernel/src/arch/x86_64/vm/cpuid.rs b/kernel/src/arch/x86_64/vm/cpuid.rs new file mode 100644 index 000000000..28e91d6a9 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/cpuid.rs @@ -0,0 +1,58 @@ +use alloc::vec::Vec; + +#[derive(Debug, Default, Clone, Copy)] +pub struct KvmCpuidEntry2 { + pub function: u32, + pub index: u32, + pub flags: KvmCpuidFlag, + pub eax: u32, + pub ebx: u32, + pub ecx: u32, + pub edx: u32, + padding: [u32; 3], +} + +impl KvmCpuidEntry2 { + pub fn find( + entries: &Vec, + function: u32, + index: Option, + ) -> Option { + for e in entries { + if e.function != function { + continue; + } + + if !e + .flags + .contains(KvmCpuidFlag::KVM_CPUID_FLAG_SIGNIFCANT_INDEX) + || Some(e.index) == index + { + return Some(*e); + } + + if index.is_none() { + return Some(*e); + } + } + + None + } +} + +bitflags! 
{ + pub struct KvmCpuidFlag: u32 { + /// 表示CPUID函数的输入索引值是重要的,它会影响CPUID函数的行为或返回值 + const KVM_CPUID_FLAG_SIGNIFCANT_INDEX = 1 << 0; + /// 表示CPUID函数是有状态的,即它的行为可能受到先前CPUID函数调用的影响 + const KVM_CPUID_FLAG_STATEFUL_FUNC = 1 << 1; + /// 表示CPUID函数的状态应该在下一次CPUID函数调用中读取 + const KVM_CPUID_FLAG_STATE_READ_NEXT = 1 << 2; + } +} + +impl Default for KvmCpuidFlag { + fn default() -> Self { + Self::empty() + } +} diff --git a/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs b/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs new file mode 100644 index 000000000..dc41c2371 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs @@ -0,0 +1,43 @@ +use crate::{arch::kvm_arch_ops, virt::vm::kvm_host::vcpu::VirtCpu}; + +const APIC_DEFAULT_PHYS_BASE: u64 = 0xfee00000; +const MSR_IA32_APICBASE: u64 = 0x0000001b; +const MSR_IA32_APICBASE_BSP: u64 = (1 << 8); +const MSR_IA32_APICBASE_ENABLE: u64 = (1 << 11); +const MSR_IA32_APICBASE_BASE: u64 = (0xfffff << 12); + +impl VirtCpu { + pub fn lapic_reset(&mut self, init_event: bool) { + let apic = self.arch.apic; + + kvm_arch_ops().apicv_pre_state_restore(self); + + if !init_event { + let mut msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; + if self.kvm().lock().arch.bsp_vcpu_id == self.vcpu_id { + msr_val |= MSR_IA32_APICBASE_BSP; + } + } + } + + fn lapic_set_base(&mut self, value: u64) { + let old_val = self.arch.apic_base; + let apic = self.arch.apic; + + self.arch.apic_base = value; + + if (old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 { + // TODO: kvm_update_cpuid_runtime(vcpu); + } + + if apic.is_none() { + return; + } + + if (old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 { + if value & MSR_IA32_APICBASE_ENABLE != 0 {} + } + + todo!() + } +} diff --git a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs index dfd7501d4..5b0150e90 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs @@ -1,40 +1,298 @@ +use core::fmt::Debug; + +use alloc::{boxed::Box, vec::Vec}; +use bit_field::BitField; +use bitmap::{traits::BitMapOps, AllocBitmap}; use system_error::SystemError; +use x86::{ + bits64::rflags::RFlags, + controlregs::{Cr0, Cr4}, +}; +use x86_64::registers::control::EferFlags; -pub mod vcpu; +use crate::{ + smp::cpu::ProcessorId, + virt::vm::kvm_host::{ + vcpu::VirtCpu, Vm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, KVM_USERSAPCE_IRQ_SOURCE_ID, + }, +}; + +use crate::arch::VirtCpuArch; -type SysResult = Result; +use super::{ + asm::{KvmMsrEntry, MsrData}, + vmx::vmx_info, + x86_kvm_manager, x86_kvm_ops, +}; + +pub mod lapic; +pub mod vcpu; +#[derive(Debug, Default)] pub struct X86KvmArch { /// 中断芯片模式 irqchip_mode: KvmIrqChipMode, - /// 负责引导(bootstrap)kvm的vcpu——id + /// 负责引导(bootstrap)kvm的vcpu_id bsp_vcpu_id: usize, + pub pause_in_guest: bool, + pub cstate_in_guest: bool, + irq_sources_bitmap: u64, + default_tsc_khz: u64, + guest_can_read_msr_platform_info: bool, + apicv_inhibit_reasons: usize, + + msr_fliter: Option>, } -#[derive(Debug, Clone, Copy)] +impl X86KvmArch { + pub fn init(kvm_type: usize) -> Result { + if kvm_type != 0 { + return Err(SystemError::EINVAL); + } + let mut arch = x86_kvm_ops().vm_init(); + + // 设置中断源位图 + arch.irq_sources_bitmap + .set_bit(KVM_USERSAPCE_IRQ_SOURCE_ID, true) + .set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, true); + + arch.default_tsc_khz = x86_kvm_manager().max_tsc_khz; + arch.guest_can_read_msr_platform_info = true; + + arch.apicv_init(); + Ok(arch) + } + + fn apicv_init(&mut self) { + self.apicv_inhibit_reasons + 
.set_bit(KvmApicvInhibit::ABSENT, true); + + if !vmx_info().enable_apicv { + self.apicv_inhibit_reasons + .set_bit(KvmApicvInhibit::DISABLE, true); + } + } + + pub fn msr_allowed(&self, msr: u32, ftype: MsrFilterType) -> bool { + // x2APIC MSRs + if msr >= 0x800 && msr <= 0x8ff { + return true; + } + + if let Some(msr_filter) = &self.msr_fliter { + let mut allowed = msr_filter.default_allow; + + for i in 0..msr_filter.count as usize { + let range = &msr_filter.ranges[i]; + let start = range.base; + let end = start + range.nmsrs; + let flags = range.flags; + let bitmap = &range.bitmap; + if msr >= start && msr < end && flags.contains(ftype) { + allowed = bitmap.get((msr - start) as usize).unwrap_or(false); + break; + } + } + + return allowed; + } else { + return true; + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq)] pub enum KvmIrqChipMode { None, Kernel, Split, } -pub trait KvmFunc: Send + Sync { +impl Default for KvmIrqChipMode { + fn default() -> Self { + Self::None + } +} + +pub trait KvmInitFunc { + fn hardware_setup(&self) -> Result<(), SystemError>; + fn handle_intel_pt_intr(&self) -> u32; + fn runtime_funcs(&self) -> &'static dyn KvmFunc; +} + +pub trait KvmFunc: Send + Sync + Debug { /// 返回该硬件支持的名字,例如“Vmx” fn name(&self) -> &'static str; /// 启用硬件支持 - /// (注:只有dummy实现能够返回ENOSYS错误码,表示未指定) - fn hardware_enable(&self) -> SysResult<()>; + fn hardware_enable(&self) -> Result<(), SystemError>; + + fn vm_init(&self) -> X86KvmArch; + + fn vcpu_create(&self, vcpu: &mut VirtCpu, vm: &Vm); + + fn vcpu_load(&self, vcpu: &mut VirtCpu, cpu: ProcessorId); + + fn cache_reg(&self, vcpu: &VirtCpuArch, reg: KvmReg); + + fn apicv_pre_state_restore(&self, vcpu: &mut VirtCpu); + + fn set_msr(&self, vcpu: &mut VirtCpuArch, msr: MsrData); + + fn set_rflags(&self, vcpu: &mut VirtCpu, rflags: RFlags); + + fn get_rflags(&self, vcpu: &VirtCpu) -> RFlags; + + fn set_cr0(&self, vcpu: &mut VirtCpu, cr0: Cr0); + + fn set_cr4(&self, vcpu: &mut VirtCpu, cr4: Cr4); + + fn set_efer(&self, vcpu: &mut VirtCpu, efer: EferFlags); + + fn update_exception_bitmap(&self, vcpu: &mut VirtCpu); + + fn vcpu_reset(&self, vcpu: &mut VirtCpu, init_event: bool); + + fn has_emulated_msr(&self, msr: u32) -> bool; + + fn get_msr_feature(&self, msr: &mut KvmMsrEntry) -> bool; } -pub struct DummyKvmFunc; +/// ## 中断抑制的原因位 +#[derive(Debug)] +pub struct KvmApicvInhibit; + +impl KvmApicvInhibit { + // Intel与AMD共用 + + /// APIC 加速功能被模块参数禁用,或者硬件不支持 + pub const DISABLE: usize = 0; + + /// Hyper-V 客户机正在使用 AutoEOI 功能,导致 APIC 加速被禁用。 + pub const HYPERV: usize = 1; + + /// 因为用户空间尚未启用内核或分裂的中断控制器,导致 APIC 加速被禁用。 + pub const ABSENT: usize = 2; + + /// KVM_GUESTDBG_BLOCKIRQ(一种调试措施,用于阻止该 vCPU 上的所有中断)被启用,以避免 AVIC/APICv 绕过此功能。 + pub const BLOCKIRQ: usize = 3; + + /// 当所有 vCPU 的 APIC ID 和 vCPU 的 1:1 映射被更改且 KVM 未应用其 x2APIC 热插拔修补程序时,APIC 加速被禁用。 + pub const PHYSICAL_ID_ALIASED: usize = 4; + + /// 当 vCPU 的 APIC ID 或 APIC 基址从其复位值更改时,首次禁用 APIC 加速。 + pub const APIC_ID_MODIFIED: usize = 5; + /// 当 vCPU 的 APIC ID 或 APIC 基址从其复位值更改时,首次禁用 APIC 加速。 + pub const APIC_BASE_MODIFIED: usize = 6; + + // 仅仅对AMD适用 + + /// 当 vCPU 运行嵌套客户机时,AVIC 被禁用。因为与 APICv 不同,当 vCPU 运行嵌套时,该 vCPU 的同级无法使用门铃机制通过 AVIC 信号中断。 + pub const NESTED: usize = 7; + + /// 在 SVM 上,等待 IRQ 窗口的实现使用挂起的虚拟中断,而在 KVM 等待 IRQ 窗口时无法注入这些虚拟中断,因此在等待 IRQ 窗口时 AVIC 被禁用。 + pub const IRQWIN: usize = 8; + + /// PIT(i8254)的“重新注入”模式依赖于 EOI 拦截,而 AVIC 不支持边沿触发中断的 EOI 拦截。 + pub const PIT_REINJ: usize = 9; + + /// SEV 不支持 AVIC,因此 AVIC 被禁用。 + pub const SEV: usize = 10; -impl KvmFunc for DummyKvmFunc { - fn name(&self) -> &'static str 
{ - "kvm_dummy_ops" + /// 当所有带有有效 LDR 的 vCPU 之间的逻辑 ID 和 vCPU 的 1:1 映射被更改时,AVIC 被禁用。 + pub const LOGICAL_ID_ALIASED: usize = 11; +} + +#[derive(Debug)] +pub struct KvmX86MsrFilter { + count: u8, + default_allow: bool, + ranges: Vec, +} + +#[derive(Debug)] +pub struct KernelMsrRange { + pub flags: MsrFilterType, + pub nmsrs: u32, + pub base: u32, + pub bitmap: AllocBitmap, +} + +#[repr(C)] +pub struct PosixMsrFilterRange { + pub flags: u32, + pub nmsrs: u32, + pub base: u32, + pub bitmap: *const u8, +} + +bitflags! { + pub struct MsrFilterType: u8 { + const KVM_MSR_FILTER_READ = 1 << 0; + const KVM_MSR_FILTER_WRITE = 1 << 1; } +} - fn hardware_enable(&self) -> SysResult<()> { - Err(SystemError::ENOSYS) +#[derive(Debug, Clone, Copy)] +pub enum KvmReg { + VcpuRegsRax = 0, + VcpuRegsRcx = 1, + VcpuRegsRdx = 2, + VcpuRegsRbx = 3, + VcpuRegsRsp = 4, + VcpuRegsRbp = 5, + VcpuRegsRsi = 6, + VcpuRegsRdi = 7, + + VcpuRegsR8 = 8, + VcpuRegsR9 = 9, + VcpuRegsR10 = 10, + VcpuRegsR11 = 11, + VcpuRegsR12 = 12, + VcpuRegsR13 = 13, + VcpuRegsR14 = 14, + VcpuRegsR15 = 15, + + VcpuRegsRip = 16, + NrVcpuRegs = 17, + + VcpuExregCr0, + VcpuExregCr3, + VcpuExregCr4, + VcpuExregRflags, + VcpuExregSegments, + VcpuExregExitInfo1, + VcpuExregExitInfo2, +} + +bitflags! { + pub struct HFlags: u8 { + const HF_GUEST_MASK = 1 << 0; /* VCPU is in guest-mode */ + const HF_SMM_MASK = 1 << 1; + const HF_SMM_INSIDE_NMI_MASK = 1 << 2; } } + +/// ### 虚拟机的通用寄存器 +#[derive(Debug, Default, Clone, Copy)] +#[repr(C)] +pub struct KvmCommonRegs { + rax: u64, + rbx: u64, + rcx: u64, + rdx: u64, + rsi: u64, + rdi: u64, + rsp: u64, + rbp: u64, + r8: u64, + r9: u64, + r10: u64, + r11: u64, + r12: u64, + r13: u64, + r14: u64, + r15: u64, + rip: u64, + rflags: u64, +} diff --git a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs index f06bed31e..9d7be8976 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs @@ -1,31 +1,559 @@ +use core::intrinsics::unlikely; + +use alloc::{boxed::Box, sync::Arc, vec::Vec}; +use bitmap::{traits::BitMapOps, AllocBitmap, StaticBitmap}; +use raw_cpuid::CpuId; +use system_error::SystemError; +use x86::{ + bits64::rflags::RFlags, + controlregs::{Cr0, Cr4}, + msr::{ + IA32_APIC_BASE, IA32_CSTAR, IA32_FS_BASE, IA32_GS_BASE, IA32_KERNEL_GSBASE, IA32_LSTAR, + IA32_SYSENTER_EIP, IA32_SYSENTER_ESP, IA32_TSC_AUX, + }, +}; +use x86_64::registers::control::EferFlags; + use crate::{ - smp::cpu::ProcessorId, - virt::vm::kvm_host::vcpu::{MutilProcessorState, VirtCpu}, + arch::{ + kvm_arch_ops, + vm::{ + asm::{KvmX86Asm, MiscEnable, MsrData}, + cpuid::KvmCpuidEntry2, + kvm_host::KvmReg, + mmu::{KvmMmu, LockedKvmMmu}, + vmx::vmcs::LoadedVmcs, + x86_kvm_manager, x86_kvm_manager_mut, x86_kvm_ops, + }, + }, + kdebug, kerror, + mm::{PhysAddr, VirtAddr}, + smp::{core::smp_get_processor_id, cpu::ProcessorId}, + virt::vm::kvm_host::{ + mem::GfnToHvaCache, + vcpu::{GuestDebug, VirtCpu}, + LockedVm, MutilProcessorState, Vm, + }, }; +use super::{HFlags, KvmCommonRegs, KvmIrqChipMode}; + #[derive(Debug)] pub struct X86VcpuArch { /// 最近一次尝试进入虚拟机的主机cpu last_vmentry_cpu: ProcessorId, /// 可用寄存器数量 - regs_avail: u32, + regs_avail: AllocBitmap, /// 脏寄存器数量 - regs_dirty: u32, + regs_dirty: AllocBitmap, /// 多处理器状态 mp_state: MutilProcessorState, + pub apic_base: u64, + /// apic + pub apic: Option<()>, + /// 主机pkru寄存器 + host_pkru: u32, + /// hflag + hflags: HFlags, + + pub guest_state_protected: bool, + + pub cpuid_entries: Vec, + + pub exception: 
KvmQueuedException,
+    pub exception_vmexit: KvmQueuedException,
+    pub apf: KvmAsyncPageFault,
+
+    pub smbase: u64,
+
+    pub interrupt: KvmQueuedInterrupt,
+
+    pub tsc_offset_adjustment: u64,
+
+    pub mmu: Option<Arc<LockedKvmMmu>>,
+    pub root_mmu: Option<Arc<LockedKvmMmu>>,
+    pub guset_mmu: Option<Arc<LockedKvmMmu>>,
+    pub walk_mmu: Option<Arc<LockedKvmMmu>>,
+    pub nested_mmu: Option<Arc<LockedKvmMmu>>,
+
+    pub max_phyaddr: usize,
+
+    pub regs: [u64; KvmReg::NrVcpuRegs as usize],
+
+    pub cr0: Cr0,
+    pub cr0_guest_owned_bits: Cr0,
+    pub cr2: usize,
+    pub cr3: usize,
+    pub cr4: Cr4,
+    pub cr4_guest_owned_bits: Cr4,
+    pub cr4_guest_rsvd_bits: usize,
+    pub cr8: usize,
+    pub efer: EferFlags,
+
+    pub dr6: usize,
+    pub dr7: usize,
+
+    pub single_step_rip: usize,
+
+    pub msr_misc_features_enables: u64,
+    pub ia32_misc_enable_msr: MiscEnable,
+
+    pub smi_pending: bool,
+    pub smi_count: u64,
+    pub nmi_queued: usize,
+    /// Number of NMIs pending injection, not counting hardware vNMIs.
+    pub nmi_pending: u32,
+    pub nmi_injected: bool,
+
+    pub db: [usize; Self::KVM_NR_DB_REGS],
+}
+
+impl X86VcpuArch {
+    const KVM_NR_DB_REGS: usize = 4;
+
+    #[inline(never)]
+    pub fn new() -> Self {
+        let mut ret: Box<Self> = unsafe { Box::new_zeroed().assume_init() };
+        ret.last_vmentry_cpu = ProcessorId::INVALID;
+        ret.regs_avail = AllocBitmap::new(32);
+        ret.regs_dirty = AllocBitmap::new(32);
+        ret.mp_state = MutilProcessorState::Runnable;
+        *ret
+    }
+
+    pub fn lapic_in_kernel(&self) -> bool {
+        if x86_kvm_manager().has_noapic_vcpu {
+            return self.apic.is_some();
+        }
+        true
+    }
+
+    pub fn read_cr0_bits(&mut self, mask: Cr0) -> Cr0 {
+        let tmask = mask & (Cr0::CR0_TASK_SWITCHED | Cr0::CR0_WRITE_PROTECT);
+        if tmask.contains(self.cr0_guest_owned_bits)
+            && !self
+                .regs_avail
+                .get(KvmReg::VcpuExregCr0 as usize)
+                .unwrap_or_default()
+        {
+            x86_kvm_ops().cache_reg(self, KvmReg::VcpuExregCr0);
+        }
+
+        return self.cr0 & mask;
+    }
+
+    pub fn read_cr4_bits(&mut self, mask: Cr4) -> Cr4 {
+        let tmask = mask
+            & (Cr4::CR4_VIRTUAL_INTERRUPTS
+                | Cr4::CR4_DEBUGGING_EXTENSIONS
+                | Cr4::CR4_ENABLE_PPMC
+                | Cr4::CR4_ENABLE_SSE
+                | Cr4::CR4_UNMASKED_SSE
+                | Cr4::CR4_ENABLE_GLOBAL_PAGES
+                | Cr4::CR4_TIME_STAMP_DISABLE
+                | Cr4::CR4_ENABLE_FSGSBASE);
+
+        if tmask.contains(self.cr4_guest_owned_bits)
+            && !self
+                .regs_avail
+                .get(KvmReg::VcpuExregCr4 as usize)
+                .unwrap_or_default()
+        {
+            x86_kvm_ops().cache_reg(self, KvmReg::VcpuExregCr4)
+        }
+
+        return self.cr4 & mask;
+    }
+
+    #[inline]
+    pub fn is_smm(&self) -> bool {
+        self.hflags.contains(HFlags::HF_SMM_MASK)
+    }
+
+    #[inline]
+    pub fn is_guest_mode(&self) -> bool {
+        self.hflags.contains(HFlags::HF_GUEST_MASK)
+    }
+
+    #[inline]
+    fn clear_interrupt_queue(&mut self) {
+        self.interrupt.injected = false;
+    }
+
+    #[inline]
+    fn clear_exception_queue(&mut self) {
+        self.exception.pending = false;
+        self.exception.injected = false;
+        self.exception_vmexit.pending = false;
+    }
+
+    pub fn set_msr(&mut self, index: u32, data: u64, host_initiated: bool) {
+        match index {
+            IA32_FS_BASE | IA32_GS_BASE | IA32_KERNEL_GSBASE | IA32_CSTAR | IA32_LSTAR => {
+                // Bail out on non-canonical addresses (Linux rejects these writes).
+                if !VirtAddr::new(data as usize).is_canonical() {
+                    return;
+                }
+            }
+
+            IA32_SYSENTER_EIP | IA32_SYSENTER_ESP => {
+                // Data should be coerced into a canonical address; assert for now.
+                assert!(VirtAddr::new(data as usize).is_canonical());
+            }
+            IA32_TSC_AUX => {
+                if x86_kvm_manager()
+                    .find_user_return_msr_idx(IA32_TSC_AUX)
+                    .is_none()
+                {
+                    return;
+                }
+
+                todo!()
+            }
+            _ => {}
+        }
+
+        let msr_data = MsrData {
+            host_initiated,
+            index,
+            data,
+        };
+
+        return kvm_arch_ops().set_msr(self, msr_data);
+    }
+
+    pub fn update_cpuid_runtime(&mut self, entries: &Vec<KvmCpuidEntry2>) {
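+        // Mirrors kvm_update_cpuid_runtime() in Linux: refresh the dynamic
+        // CPUID bits (e.g. CPUID.01H:ECX.OSXSAVE tracking CR4.OSXSAVE) so the
+        // cached entries stay consistent with current vCPU state; only the
+        // leaf-1/XSAVE probe is sketched so far.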
let cpuid = CpuId::new(); + let feat = cpuid.get_feature_info().unwrap(); + let base = KvmCpuidEntry2::find(entries, 1, None); + if let Some(base) = base { + if feat.has_xsave() {} + } + + todo!() + } + + #[inline] + fn mark_register_dirty(&mut self, reg: KvmReg) { + self.regs_avail.set(reg as usize, true); + self.regs_dirty.set(reg as usize, true); + } + + #[inline] + fn write_reg(&mut self, reg: KvmReg, data: u64) { + self.regs[reg as usize] = data; + } + + #[inline] + fn write_reg_raw(&mut self, reg: KvmReg, data: u64) { + self.regs[reg as usize] = data; + self.mark_register_dirty(reg); + } + + #[inline] + fn read_reg(&self, reg: KvmReg) -> u64 { + return self.regs[reg as usize]; + } + + #[inline] + fn read_reg_raw(&self, reg: KvmReg) -> u64 { + if self.regs_avail.get(reg as usize) == Some(true) { + kvm_arch_ops().cache_reg(self, reg); + } + + return self.regs[reg as usize]; + } + + #[inline] + fn get_linear_rip(&mut self) -> u64 { + if self.guest_state_protected { + return 0; + } + return self.read_reg_raw(KvmReg::VcpuRegsRip); + } } -impl Default for X86VcpuArch { - fn default() -> Self { - Self { - last_vmentry_cpu: ProcessorId::INVALID, - regs_avail: !0, - regs_dirty: !0, - mp_state: MutilProcessorState::Runnable, +impl VirtCpu { + pub fn init_arch(&mut self, vm: &Vm) { + self.arch.last_vmentry_cpu = ProcessorId::INVALID; + self.arch.regs_avail.set_all(true); + self.arch.regs_dirty.set_all(true); + + if vm.arch.irqchip_mode != KvmIrqChipMode::None || vm.arch.bsp_vcpu_id == self.vcpu_id { + self.arch.mp_state = MutilProcessorState::Runnable; + } else { + self.arch.mp_state = MutilProcessorState::Uninitialized; } + + self.arch.vcpu_arch_mmu_create(); + + if vm.arch.irqchip_mode != KvmIrqChipMode::None { + todo!() + } else { + x86_kvm_manager_mut().has_noapic_vcpu = true; + } + + x86_kvm_ops().vcpu_create(self, vm); + + self.load(); + self.vcpu_reset(false); + self.arch.kvm_init_mmu(); + } + + pub fn run(&mut self) -> Result { + self.load(); + todo!() + } + + #[inline] + fn load(&mut self) { + self.arch_vcpu_load(smp_get_processor_id()) } + + fn arch_vcpu_load(&mut self, cpu: ProcessorId) { + x86_kvm_ops().vcpu_load(self, cpu); + + self.arch.host_pkru = KvmX86Asm::read_pkru(); + + // 下列两个TODO为处理时钟信息 + if unlikely(self.arch.tsc_offset_adjustment != 0) { + todo!() + } + + if unlikely(self.cpu != cpu) { + // TODO: 设置tsc + self.cpu = cpu; + } + + self.request(VirCpuRequest::KVM_REQ_STEAL_UPDATE) + } + + pub fn request(&mut self, req: VirCpuRequest) { + self.request.insert(req); + } + + pub fn vcpu_reset(&mut self, init_event: bool) { + let old_cr0 = self.arch.read_cr0_bits(Cr0::all()); + + if self.arch.is_guest_mode() { + todo!() + } + + // :TODO + // self.lapic_reset(init_event); + + self.arch.hflags = HFlags::empty(); + + self.arch.smi_pending = false; + self.arch.smi_count = 0; + self.arch.nmi_queued = 0; + self.arch.nmi_pending = 0; + self.arch.nmi_injected = false; + + self.arch.clear_exception_queue(); + self.arch.clear_interrupt_queue(); + + for i in &mut self.arch.db { + *i = 0; + } + + // TODO: kvm_update_dr0123(vcpu); + + // DR6_ACTIVE_LOW + self.arch.dr6 = 0xffff0ff0; + // DR7_FIXED_1 + self.arch.dr7 = 0x00000400; + + // TODO: kvm_update_dr7(vcpu); + + self.arch.cr2 = 0; + + self.request(VirCpuRequest::KVM_REQ_EVENT); + + self.arch.apf.msr_en_val = 0; + self.arch.apf.msr_int_val = 0; + // TODO:st + + // TODO: kvmclock_reset(vcpu); + + // TODO: kvm_clear_async_pf_completion_queue(vcpu); + + for i in &mut self.arch.apf.gfns { + *i = u64::MAX; + } + + self.arch.apf.halted = false; 
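+        // Writing u64::MAX marks every async-PF GFN slot as free, as in
+        // Linux's kvm_async_pf_hash_reset().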
+ + // TODO: fpu + + if !init_event { + // TODO:pmu + self.arch.smbase = 0x30000; + + self.arch.msr_misc_features_enables = 0; + self.arch.ia32_misc_enable_msr = MiscEnable::MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL + | MiscEnable::MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; + + // TODO: __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); + // 0xda0: MSR_IA32_XSS + self.arch.set_msr(0xda0, 0, true); + } + + for reg in &mut self.arch.regs { + *reg = 0; + } + + self.arch.mark_register_dirty(KvmReg::VcpuRegsRsp); + + let cpuid_0x1 = KvmCpuidEntry2::find(&self.arch.cpuid_entries, 1, None); + let val = if let Some(cpuid) = cpuid_0x1 { + cpuid.eax + } else { + 0x600 + }; + self.arch.write_reg(KvmReg::VcpuRegsRdx, val as u64); + + kvm_arch_ops().vcpu_reset(self, init_event); + + self.set_rflags(RFlags::FLAGS_A1); + self.arch.write_reg_raw(KvmReg::VcpuRegsRip, 0xfff0); + + self.arch.cr3 = 0; + self.arch.mark_register_dirty(KvmReg::VcpuExregCr3); + + let mut new_cr0 = Cr0::CR0_EXTENSION_TYPE; + if init_event { + new_cr0.insert(old_cr0 & (Cr0::CR0_NOT_WRITE_THROUGH | Cr0::CR0_CACHE_DISABLE)); + } else { + new_cr0.insert(Cr0::CR0_NOT_WRITE_THROUGH | Cr0::CR0_CACHE_DISABLE); + } + + kvm_arch_ops().set_cr0(self, new_cr0); + kvm_arch_ops().set_cr4(self, Cr4::empty()); + kvm_arch_ops().set_efer(self, EferFlags::empty()); + kvm_arch_ops().update_exception_bitmap(self); + + if old_cr0.contains(Cr0::CR0_ENABLE_PAGING) { + self.request(VirCpuRequest::KVM_REQ_TLB_FLUSH_GUEST); + self.arch.reset_mmu_context(); + } + + if init_event { + self.request(VirCpuRequest::KVM_REQ_TLB_FLUSH_GUEST); + } + } + + fn set_rflags(&mut self, rflags: RFlags) { + self._set_rflags(rflags); + self.request(VirCpuRequest::KVM_REQ_EVENT); + } + + fn _set_rflags(&mut self, mut rflags: RFlags) { + if self.guest_debug.contains(GuestDebug::SINGLESTEP) + && self.is_linear_rip(self.arch.single_step_rip) + { + rflags.insert(RFlags::FLAGS_TF); + } + + kvm_arch_ops().set_rflags(self, rflags); + } + + fn get_rflags(&self) -> RFlags { + let mut rflags = kvm_arch_ops().get_rflags(self); + if self.guest_debug.contains(GuestDebug::SINGLESTEP) { + rflags.insert(RFlags::FLAGS_TF); + } + return rflags; + } + + fn is_linear_rip(&mut self, linear_rip: usize) -> bool { + return self.arch.get_linear_rip() == linear_rip as u64; + } + + pub fn get_regs(&mut self) -> KvmCommonRegs { + self.load(); + return self._get_regs(); + } + + fn _get_regs(&self) -> KvmCommonRegs { + KvmCommonRegs { + rax: self.arch.read_reg(KvmReg::VcpuRegsRax), + rbx: self.arch.read_reg(KvmReg::VcpuRegsRbx), + rcx: self.arch.read_reg(KvmReg::VcpuRegsRcx), + rdx: self.arch.read_reg(KvmReg::VcpuRegsRdx), + rsi: self.arch.read_reg(KvmReg::VcpuRegsRsi), + rdi: self.arch.read_reg(KvmReg::VcpuRegsRdi), + rsp: self.arch.read_reg(KvmReg::VcpuRegsRsp), + rbp: self.arch.read_reg(KvmReg::VcpuRegsRbp), + r8: self.arch.read_reg(KvmReg::VcpuRegsR8), + r9: self.arch.read_reg(KvmReg::VcpuRegsR9), + r10: self.arch.read_reg(KvmReg::VcpuRegsR10), + r11: self.arch.read_reg(KvmReg::VcpuRegsR11), + r12: self.arch.read_reg(KvmReg::VcpuRegsR12), + r13: self.arch.read_reg(KvmReg::VcpuRegsR13), + r14: self.arch.read_reg(KvmReg::VcpuRegsR14), + r15: self.arch.read_reg(KvmReg::VcpuRegsR15), + rip: self.arch.read_reg_raw(KvmReg::VcpuRegsRip), + rflags: self.get_rflags().bits(), + } + } +} + +bitflags! 
{
+    pub struct VirCpuRequest: u32 {
+        const KVM_REQUEST_NO_WAKEUP = 1 << 0;
+        const KVM_REQUEST_WAIT = 1 << 1;
+        const KVM_REQUEST_NO_ACTION = 1 << 2;
+        const KVM_REQ_EVENT = 1 << 6;
+        const KVM_REQ_STEAL_UPDATE = 1 << 8;
+        const KVM_REQ_TLB_FLUSH_GUEST = 1 << 27 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits;
+        const KVM_REQ_TLB_FLUSH = 1 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits;
+    }
+}
+
+#[derive(Debug, Default)]
+pub struct KvmQueuedInterrupt {
+    pub injected: bool,
+    pub soft: bool,
+    pub nr: u8,
+}
+
+#[derive(Debug, Default)]
+pub struct KvmQueuedException {
+    pending: bool,
+    injected: bool,
+    has_error_code: bool,
+    vector: u8,
+    error_code: u32,
+    payload: usize,
+    has_payload: bool,
+}
+
+#[derive(Debug)]
+pub struct KvmAsyncPageFault {
+    /// Whether in a halted state
+    halted: bool,
+    /// GFNs (guest frame numbers) of outstanding async page faults
+    gfns: [u64; Self::ASYNC_PF_PER_VCPU],
+    /// Cache for GFN to HVA (host virtual address) translation
+    data: GfnToHvaCache,
+    /// Value of the MSR_KVM_ASYNC_PF_EN register
+    msr_en_val: u64,
+    /// Value of the MSR_KVM_ASYNC_PF_INT register
+    msr_int_val: u64,
+    /// Async PF vector
+    vec: u16,
+    /// Async PF id
+    id: u32,
+    /// Whether to deliver to user space only
+    send_user_only: bool,
+    /// Host APF flags
+    host_apf_flags: u32,
+    /// Whether delivered as a page-fault VM exit
+    delivery_as_pf_vmexit: bool,
+    /// Whether a page-ready notification is pending
+    pageready_pending: bool,
+}
+
+impl KvmAsyncPageFault {
+    pub const ASYNC_PF_PER_VCPU: usize = 64;
+}
diff --git a/kernel/src/arch/x86_64/vm/mem.rs b/kernel/src/arch/x86_64/vm/mem.rs
new file mode 100644
index 000000000..b3ac5f0f4
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/mem.rs
@@ -0,0 +1,21 @@
+use alloc::sync::Arc;
+use system_error::SystemError;
+
+use crate::virt::vm::kvm_host::{
+    mem::{KvmMemoryChangeMode, LockedKvmMemSlot},
+    Vm,
+};
+
+pub struct KvmArchMemorySlot {}
+
+impl Vm {
+    pub fn arch_prepare_memory_region(
+        &self,
+        old: Option<&Arc<LockedKvmMemSlot>>,
+        new: Option<&Arc<LockedKvmMemSlot>>,
+        change: KvmMemoryChangeMode,
+    ) -> Result<(), SystemError> {
+        // todo
+        Ok(())
+    }
+}
diff --git a/kernel/src/arch/x86_64/vm/mmu.rs b/kernel/src/arch/x86_64/vm/mmu.rs
new file mode 100644
index 000000000..852f7cfe8
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/mmu.rs
@@ -0,0 +1,356 @@
+use alloc::{sync::Arc, vec::Vec};
+use bitfield_struct::bitfield;
+use x86::controlregs::{Cr0, Cr4};
+use x86_64::registers::control::EferFlags;
+
+use crate::{
+    arch::{MMArch, VirtCpuArch},
+    libs::spinlock::{SpinLock, SpinLockGuard},
+    mm::MemoryManagementArch,
+};
+
+use super::vmx::vmx_info;
+
+const PT64_ROOT_5LEVEL: usize = 5;
+const PT64_ROOT_4LEVEL: usize = 4;
+const PT32_ROOT_LEVEL: usize = 2;
+const PT32E_ROOT_LEVEL: usize = 3;
+
+static mut TDP_ENABLED: bool = false;
+static mut TDP_ROOT_LEVEL: usize = 0;
+static mut MAX_TDP_LEVEL: usize = 0;
+static mut SHADOW_ACCESSED_MASK: usize = 0;
+
+#[derive(Debug)]
+pub struct LockedKvmMmu {
+    inner: SpinLock<KvmMmu>,
+}
+
+impl LockedKvmMmu {
+    pub fn new(mmu: KvmMmu) -> Arc<Self> {
+        Arc::new(Self {
+            inner: SpinLock::new(mmu),
+        })
+    }
+
+    pub fn lock(&self) -> SpinLockGuard<KvmMmu> {
+        self.inner.lock()
+    }
+}
+
+#[derive(Debug, Default)]
+pub struct KvmMmu {
+    root: KvmMmuRootInfo,
+    cpu_role: KvmCpuRole,
+    root_role: KvmMmuPageRole,
+
+    pkru_mask: u32,
+
+    prev_roots: [KvmMmuRootInfo; Self::KVM_MMU_NUM_PREV_ROOTS],
+
+    pae_root: Vec<u64>,
+}
+
+impl KvmMmu {
+    const KVM_MMU_NUM_PREV_ROOTS: usize = 3;
+    const INVALID_PAGE: u64 = u64::MAX;
+
+    #[inline]
+    pub fn tdp_enabled() -> bool {
+        unsafe { TDP_ENABLED }
+    }
+
+    #[inline]
+    pub fn tdp_root_level() -> usize {
+        unsafe { TDP_ROOT_LEVEL }
+    }
+
+    #[inline]
+    pub fn
max_tdp_level() -> usize { + unsafe { MAX_TDP_LEVEL } + } + + #[inline] + pub fn ad_enabled() -> bool { + unsafe { SHADOW_ACCESSED_MASK != 0 } + } +} + +#[derive(Debug, Default)] +pub struct KvmMmuRootInfo { + pgd: u64, + hpa: u64, +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct KvmCpuRole { + base: KvmMmuPageRole, + extend: KvmMmuExtenedRole, +} + +impl PartialEq for KvmCpuRole { + fn eq(&self, other: &Self) -> bool { + self.base.0 == other.base.0 && self.extend.0 == other.extend.0 + } +} + +/// ### 用于跟踪影子页(包括 TDP 页)的属性,以确定页面是否可以在给定的 MMU 上下文中使用。 +#[bitfield(u32)] +pub struct KvmMmuPageRole { + /// 表示页表级别,占用 4 位。对于普通的页表,取值是 2(二级页表)、3(三级页表)、4(四级页表)和 5(五级页表) + #[bits(4)] + level: u32, + /// 页表项是否为 4 字节,占用 1 位。在非 PAE 分页模式下,该值为 1 + has_4_byte_gpte: bool, + /// 表示页表项所在的象限,占用 2 位。该字段仅在 has_4_byte_gpte 为 1 时有效。 + #[bits(2)] + quadrant: u32, + /// 页面是否直接映射 + direct: bool, + /// 页面的访问权限 + #[bits(3)] + access: u32, + /// 页面是否无效 + invalid: bool, + /// 页面是否启用 NX(不可执行)位 + efer_nx: bool, + /// CR0 寄存器中的写保护位(WP)是否被置位 + cr0_wp: bool, + /// SMEP(Supervisor Mode Execution Protection)和非写保护位的组合 + smep_andnot_wp: bool, + /// SMAP(Supervisor Mode Access Prevention)和非写保护位的组合 + smap_andnot_wp: bool, + /// 页面是否禁用访问位(Accessed Bit) + ad_disabled: bool, + /// 当前页是否处于客户机模式 + guest_mode: bool, + /// 是否将此页透传给客户机 + passthrough: bool, + /// 未使用位域 + #[bits(5)] + unused: u32, + /// 表示 SMM(System Management Mode)模式 + #[bits(8)] + smm: u32, +} + +impl KvmMmuPageRole { + pub fn is_cr0_pg(&self) -> bool { + self.level() > 0 + } + + pub fn is_cr4_pae(&self) -> bool { + !self.has_4_byte_gpte() + } +} + +#[bitfield(u32)] +pub struct KvmMmuExtenedRole { + valid: bool, + execonly: bool, + cr4_pse: bool, + cr4_pke: bool, + cr4_smap: bool, + cr4_smep: bool, + cr4_la57: bool, + efer_lma: bool, + #[bits(24)] + unused: u32, +} + +pub struct KvmMmuRoleRegs { + pub cr0: Cr0, + pub cr4: Cr4, + pub efer: EferFlags, +} + +impl VirtCpuArch { + pub fn kvm_init_mmu(&mut self) { + let regs = self.role_regs(); + let cpu_role = self.calc_cpu_role(®s); + + if self.walk_mmu.is_some() + && self.nested_mmu.is_some() + && Arc::ptr_eq( + self.walk_mmu.as_ref().unwrap(), + self.nested_mmu.as_ref().unwrap(), + ) + { + todo!() + } else if KvmMmu::tdp_enabled() { + self.init_tdp_mmu(cpu_role); + } else { + todo!() + } + } + + fn unload_mmu(&mut self) { + // TODO + } + + pub fn reset_mmu_context(&mut self) { + self.unload_mmu(); + self.kvm_init_mmu(); + } + + fn role_regs(&mut self) -> KvmMmuRoleRegs { + KvmMmuRoleRegs { + cr0: self.read_cr0_bits(Cr0::CR0_ENABLE_PAGING | Cr0::CR0_WRITE_PROTECT), + cr4: self.read_cr4_bits( + Cr4::CR4_ENABLE_PSE + | Cr4::CR4_ENABLE_PAE + | Cr4::CR4_ENABLE_LA57 + | Cr4::CR4_ENABLE_SMEP + | Cr4::CR4_ENABLE_SMAP + | Cr4::CR4_ENABLE_PROTECTION_KEY, + ), + efer: self.efer, + } + } + + fn calc_cpu_role(&self, regs: &KvmMmuRoleRegs) -> KvmCpuRole { + let mut role = KvmCpuRole::default(); + let base = &mut role.base; + let ext = &mut role.extend; + base.set_access(0b111); + base.set_smm(self.is_smm() as u32); + base.set_guest_mode(self.is_guest_mode()); + ext.set_valid(true); + + if !regs.cr0.contains(Cr0::CR0_ENABLE_PAGING) { + base.set_direct(true); + return role; + } + + base.set_efer_nx(regs.efer.contains(EferFlags::NO_EXECUTE_ENABLE)); + base.set_cr0_wp(regs.cr0.contains(Cr0::CR0_WRITE_PROTECT)); + base.set_smep_andnot_wp( + regs.cr4.contains(Cr4::CR4_ENABLE_SMEP) && !regs.cr0.contains(Cr0::CR0_WRITE_PROTECT), + ); + base.set_smap_andnot_wp( + regs.cr4.contains(Cr4::CR4_ENABLE_SMAP) && 
!regs.cr0.contains(Cr0::CR0_WRITE_PROTECT), + ); + + base.set_has_4_byte_gpte(!regs.cr4.contains(Cr4::CR4_ENABLE_PAE)); + + if regs.efer.contains(EferFlags::LONG_MODE_ACTIVE) { + let level = if regs.cr4.contains(Cr4::CR4_ENABLE_LA57) { + PT64_ROOT_5LEVEL as u32 + } else { + PT64_ROOT_4LEVEL as u32 + }; + base.set_level(level); + } else if regs.cr4.contains(Cr4::CR4_ENABLE_PAE) { + base.set_level(PT32E_ROOT_LEVEL as u32); + } else { + base.set_level(PT32_ROOT_LEVEL as u32); + } + + ext.set_cr4_smep(regs.cr4.contains(Cr4::CR4_ENABLE_SMEP)); + ext.set_cr4_smap(regs.cr4.contains(Cr4::CR4_ENABLE_SMAP)); + ext.set_cr4_pse(regs.cr4.contains(Cr4::CR4_ENABLE_PSE)); + ext.set_cr4_pke( + regs.efer.contains(EferFlags::LONG_MODE_ACTIVE) + && regs.cr4.contains(Cr4::CR4_ENABLE_PROTECTION_KEY), + ); + ext.set_cr4_la57( + regs.efer.contains(EferFlags::LONG_MODE_ACTIVE) + && regs.cr4.contains(Cr4::CR4_ENABLE_LA57), + ); + ext.set_efer_lma(regs.efer.contains(EferFlags::LONG_MODE_ACTIVE)); + + role + } + + /// https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/mmu/mmu.c#6019 + pub fn vcpu_arch_mmu_create(&mut self) { + if vmx_info().tdp_enabled() { + self.guset_mmu = Some(self._mmu_create()); + } + + self.root_mmu = Some(self._mmu_create()); + self.mmu = self.root_mmu.clone(); + self.walk_mmu = self.root_mmu.clone(); + } + + fn _mmu_create(&self) -> Arc { + let mut mmu = KvmMmu::default(); + + mmu.root.hpa = KvmMmu::INVALID_PAGE; + mmu.root.pgd = 0; + + for role in &mut mmu.prev_roots { + role.hpa = KvmMmu::INVALID_PAGE; + role.pgd = KvmMmu::INVALID_PAGE; + } + + if KvmMmu::tdp_enabled() && self.mmu_get_tdp_level() > PT32E_ROOT_LEVEL { + return LockedKvmMmu::new(mmu); + } + + mmu.pae_root + .resize(MMArch::PAGE_SIZE / core::mem::size_of::(), 0); + + return LockedKvmMmu::new(mmu); + } + + fn mmu_get_tdp_level(&self) -> usize { + if KvmMmu::tdp_root_level() != 0 { + return KvmMmu::tdp_root_level(); + } + + if KvmMmu::max_tdp_level() == 5 && self.max_phyaddr <= 48 { + return 4; + } + + return KvmMmu::max_tdp_level(); + } + + pub fn init_tdp_mmu(&mut self, cpu_role: KvmCpuRole) { + let context = self.root_mmu(); + let mut context = context.lock(); + let root_role = self.calc_tdp_mmu_root_page_role(cpu_role); + + if cpu_role == context.cpu_role && root_role.0 == context.root_role.0 { + return; + } + + context.cpu_role = cpu_role; + context.root_role = root_role; + + // todo 设置函数集 + + if !context.cpu_role.base.is_cr0_pg() { + // todo: context->gva_to_gpa = nonpaging_gva_to_gpa; + } else if context.cpu_role.base.is_cr4_pae() { + // todo: context->gva_to_gpa = paging64_gva_to_gpa; + } else { + // todo: context->gva_to_gpa = paging32_gva_to_gpa; + } + + // todo: + // reset_guest_paging_metadata(vcpu, context); + // reset_tdp_shadow_zero_bits_mask(context); + } + + #[inline] + pub fn root_mmu(&self) -> &Arc { + self.root_mmu.as_ref().unwrap() + } + + fn calc_tdp_mmu_root_page_role(&self, cpu_role: KvmCpuRole) -> KvmMmuPageRole { + let mut role = KvmMmuPageRole::default(); + + role.set_access(0b111); + role.set_cr0_wp(true); + role.set_efer_nx(true); + role.set_smm(cpu_role.base.smm()); + role.set_guest_mode(cpu_role.base.guest_mode()); + role.set_ad_disabled(!KvmMmu::ad_enabled()); + role.set_level(self.mmu_get_tdp_level() as u32); + role.set_direct(true); + role.set_has_4_byte_gpte(false); + + role + } +} diff --git a/kernel/src/arch/x86_64/vm/mod.rs b/kernel/src/arch/x86_64/vm/mod.rs index 6233199d9..87ad42fcd 100644 --- a/kernel/src/arch/x86_64/vm/mod.rs +++ b/kernel/src/arch/x86_64/vm/mod.rs @@ -1,47 
+1,273 @@ -use core::{ - arch::x86_64::{_xgetbv, _XCR_XFEATURE_ENABLED_MASK}, - sync::atomic::{AtomicU64, Ordering}, -}; - -use alloc::{boxed::Box, sync::Arc}; +use alloc::vec::Vec; use raw_cpuid::CpuId; use system_error::SystemError; -use x86::msr::{rdmsr, IA32_CSTAR, IA32_PAT}; +use x86::{ + controlregs::Xcr0, + msr::{ + rdmsr, IA32_BIOS_SIGN_ID, IA32_CSTAR, IA32_EFER, IA32_FEATURE_CONTROL, IA32_FMASK, + IA32_KERNEL_GSBASE, IA32_LSTAR, IA32_MCG_CTL, IA32_MCG_STATUS, IA32_MISC_ENABLE, IA32_PAT, + IA32_PERFEVTSEL0, IA32_PERFEVTSEL7, IA32_PERF_CAPABILITIES, IA32_PMC0, IA32_PMC7, + IA32_SMBASE, IA32_STAR, IA32_SYSENTER_CS, IA32_SYSENTER_EIP, IA32_SYSENTER_ESP, + IA32_TIME_STAMP_COUNTER, IA32_TSC_ADJUST, IA32_TSC_AUX, IA32_TSC_DEADLINE, IA32_VMX_BASIC, + IA32_VMX_CR0_FIXED0, IA32_VMX_CR4_FIXED0, IA32_VMX_EPT_VPID_CAP, IA32_VMX_MISC, + IA32_VMX_PROCBASED_CTLS2, IA32_VMX_TRUE_ENTRY_CTLS, IA32_VMX_TRUE_EXIT_CTLS, + IA32_VMX_TRUE_PINBASED_CTLS, IA32_VMX_TRUE_PROCBASED_CTLS, IA32_VMX_VMCS_ENUM, + IA32_VMX_VMFUNC, MSR_C1_PMON_EVNT_SEL0, MSR_C5_PMON_BOX_CTRL, MSR_IA32_ADDR0_END, + MSR_IA32_ADDR0_START, MSR_IA32_ADDR1_END, MSR_IA32_ADDR1_START, MSR_IA32_ADDR2_END, + MSR_IA32_ADDR2_START, MSR_IA32_ADDR3_END, MSR_IA32_ADDR3_START, MSR_IA32_CR3_MATCH, + MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK_PTRS, + MSR_IA32_RTIT_STATUS, MSR_IA32_TSX_CTRL, MSR_PERF_FIXED_CTR0, MSR_PERF_FIXED_CTR2, + MSR_PLATFORM_INFO, MSR_POWER_CTL, MSR_SMI_COUNT, + }, +}; use crate::{ - kerror, - libs::{lazy_init::Lazy, rwlock::RwLock}, + arch::vm::vmx::{VmxL1dFlushState, L1TF_VMX_MITIGATION}, + kdebug, kerror, + libs::once::Once, + mm::percpu::{PerCpu, PerCpuVar}, }; -use self::kvm_host::KvmFunc; +use self::{ + asm::{hyperv::*, kvm_msr::*, ArchCapabilities, KvmMsrEntry}, + kvm_host::{KvmFunc, KvmInitFunc}, +}; +use super::driver::tsc::TSCManager; + +mod asm; +mod cpuid; pub mod kvm_host; +pub mod mem; +mod mmu; pub mod vmx; -static KVM_X86_MANAGER: Lazy = Lazy::new(); +static mut KVM_X86_MANAGER: Option = None; + +pub fn x86_kvm_ops() -> &'static dyn KvmFunc { + unsafe { KVM_X86_MANAGER.as_ref().unwrap().funcs() } +} + +pub fn x86_kvm_manager() -> &'static KvmArchManager { + unsafe { KVM_X86_MANAGER.as_ref().unwrap() } +} -pub fn kvm_x86_ops() -> Option<&'static dyn KvmFunc> { - *KVM_X86_MANAGER.funcs.read() +pub fn x86_kvm_manager_mut() -> &'static mut KvmArchManager { + unsafe { KVM_X86_MANAGER.as_mut().unwrap() } } +pub fn init_kvm_arch() { + static ONCE: Once = Once::new(); + ONCE.call_once(|| unsafe { + KVM_X86_MANAGER = Some(KvmArchManager::default()); + + let mut user_return_msrs = Vec::new(); + user_return_msrs.resize(PerCpu::MAX_CPU_NUM as usize, KvmUserReturnMsrs::default()); + USER_RETURN_MSRS = Some(PerCpuVar::new(user_return_msrs).unwrap()); + }) +} + +/// fixme:这些成员是否需要加锁呢?? 
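+/// (Assumption, not from the original author: the unlocked fields look safe
+/// for now because mutation happens only on the single-threaded bring-up path
+/// via vendor_init()/hardware_setup(); if vendor modules ever become swappable
+/// at runtime, `funcs` and the MSR lists will need a lock or an atomic swap.)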
+#[derive(Debug, Default)] pub struct KvmArchManager { - funcs: RwLock>, - host_xcr0: AtomicU64, + funcs: Option<&'static dyn KvmFunc>, + host_xcr0: u64, + host_efer: u64, + host_xss: u64, + host_arch_capabilities: u64, + kvm_uret_msrs_list: Vec, + kvm_caps: KvmCapabilities, + max_tsc_khz: u64, + msrs_to_save: Vec, + emulated_msrs: Vec, + msr_based_features: Vec, + + has_noapic_vcpu: bool, + + enable_pmu: bool, } impl KvmArchManager { + #[inline] + pub fn set_runtime_func(&mut self, funcs: &'static dyn KvmFunc) { + self.funcs = Some(funcs); + } + + #[inline] + pub fn funcs(&self) -> &'static dyn KvmFunc { + self.funcs.unwrap() + } + + pub fn find_user_return_msr_idx(&self, msr: u32) -> Option { + for (i, val) in self.kvm_uret_msrs_list.iter().enumerate() { + if *val == msr { + return Some(i); + } + } + + None + } + pub const KVM_MAX_VCPUS: usize = 1024; + pub const KVM_MAX_NR_USER_RETURN_MSRS: usize = 7; + + const MSRS_TO_SAVE_BASE: &[u32] = &[ + IA32_SYSENTER_CS, + IA32_SYSENTER_ESP, + IA32_SYSENTER_EIP, + IA32_STAR, + IA32_CSTAR, + IA32_KERNEL_GSBASE, + IA32_FMASK, + IA32_LSTAR, + IA32_TIME_STAMP_COUNTER, + IA32_PAT, + 0xc0010117, // MSR_VM_HSAVE_PA? + IA32_FEATURE_CONTROL, + MSR_C1_PMON_EVNT_SEL0, + IA32_TSC_AUX, + 0x48, // MSR_IA32_SPEC_CTRL + MSR_IA32_TSX_CTRL, + MSR_IA32_RTIT_CTL, + MSR_IA32_RTIT_STATUS, + MSR_IA32_CR3_MATCH, + MSR_IA32_RTIT_OUTPUT_BASE, + MSR_IA32_RTIT_OUTPUT_MASK_PTRS, + MSR_IA32_ADDR0_START, + MSR_IA32_ADDR0_END, + MSR_IA32_ADDR1_START, + MSR_IA32_ADDR1_END, + MSR_IA32_ADDR2_START, + MSR_IA32_ADDR2_END, + MSR_IA32_ADDR3_START, + MSR_IA32_ADDR3_END, + 0xe1, // MSR_IA32_UMWAIT_CONTROL + 0x1c4, // MSR_IA32_XFD + 0x1c5, // MSR_IA32_XFD_ERR + ]; + + const EMULATED_MSRS_ALL: &[u32] = &[ + MSR_KVM_SYSTEM_TIME, + MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, + MSR_KVM_WALL_CLOCK_NEW, + HV_X64_MSR_GUEST_OS_ID, + HV_X64_MSR_HYPERCALL, + HV_REGISTER_TIME_REF_COUNT, + HV_REGISTER_REFERENCE_TSC, + HV_X64_MSR_TSC_FREQUENCY, + HV_X64_MSR_APIC_FREQUENCY, + HV_REGISTER_CRASH_P0, + HV_REGISTER_CRASH_P1, + HV_REGISTER_CRASH_P2, + HV_REGISTER_CRASH_P3, + HV_REGISTER_CRASH_P4, + HV_REGISTER_CRASH_CTL, + HV_X64_MSR_RESET, + HV_REGISTER_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, + HV_REGISTER_SCONTROL, + HV_REGISTER_STIMER0_CONFIG, + HV_X64_MSR_VP_ASSIST_PAGE, + HV_X64_MSR_REENLIGHTENMENT_CONTROL, + HV_X64_MSR_TSC_EMULATION_CONTROL, + HV_X64_MSR_TSC_EMULATION_STATUS, + HV_X64_MSR_TSC_INVARIANT_CONTROL, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, + HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, + HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, + MSR_KVM_ASYNC_PF_EN, + MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, + MSR_KVM_ASYNC_PF_INT, + MSR_KVM_ASYNC_PF_ACK, + IA32_TSC_ADJUST, + IA32_TSC_DEADLINE, + IA32_PERF_CAPABILITIES, + 0x10a, // MSR_IA32_ARCH_CAPABILITIES, + IA32_MISC_ENABLE, + IA32_MCG_STATUS, + IA32_MCG_CTL, + 0x4d0, // MSR_IA32_MCG_EXT_CTL, + IA32_SMBASE, + MSR_SMI_COUNT, + MSR_PLATFORM_INFO, + 0x140, // MSR_MISC_FEATURES_ENABLES, + 0xc001011f, // MSR_AMD64_VIRT_SPEC_CTRL, + 0xc0000104, // MSR_AMD64_TSC_RATIO, + MSR_POWER_CTL, + IA32_BIOS_SIGN_ID, // MSR_IA32_UCODE_REV, + /* + * KVM always supports the "true" VMX control MSRs, even if the host + * does not. The VMX MSRs as a whole are considered "emulated" as KVM + * doesn't strictly require them to exist in the host (ignoring that + * KVM would refuse to load in the first place if the core set of MSRs + * aren't supported). 
+ */ + IA32_VMX_BASIC, + IA32_VMX_TRUE_PINBASED_CTLS, + IA32_VMX_TRUE_PROCBASED_CTLS, + IA32_VMX_TRUE_EXIT_CTLS, + IA32_VMX_TRUE_ENTRY_CTLS, + IA32_VMX_MISC, + IA32_VMX_CR0_FIXED0, + IA32_VMX_CR4_FIXED0, + IA32_VMX_VMCS_ENUM, + IA32_VMX_PROCBASED_CTLS2, + IA32_VMX_EPT_VPID_CAP, + IA32_VMX_VMFUNC, + 0xc0010015, // MSR_K7_HWCR, + MSR_KVM_POLL_CONTROL, + ]; + + const MSR_BASED_FEATURES_ALL_EXCEPT_VMX: &[u32] = &[ + 0xc0011029, // MSR_AMD64_DE_CFG + IA32_BIOS_SIGN_ID, // MSR_IA32_UCODE_REV + 0x10a, // MSR_IA32_ARCH_CAPABILITIES, + IA32_PERF_CAPABILITIES, + ]; + + pub fn arch_hardware_enable(&self) -> Result<(), SystemError> { + self.online_user_return_msr(); + + x86_kvm_ops().hardware_enable()?; + + // TODO: 这里需要对TSC进行一系列检测 + + Ok(()) + } + + /// ## 初始化当前cpu的kvm msr寄存器 + fn online_user_return_msr(&self) { + let user_return_msrs = user_return_msrs().get_mut(); + + for (idx, msr) in self.kvm_uret_msrs_list.iter().enumerate() { + let val = unsafe { rdmsr(*msr) }; + user_return_msrs.values[idx].host = val; + user_return_msrs.values[idx].curr = val; + } + } /// 厂商相关的init工作 - pub fn vendor_init(&self) -> Result<(), SystemError> { + pub fn vendor_init(&mut self, init_ops: &'static dyn KvmInitFunc) -> Result<(), SystemError> { let cpuid = CpuId::new(); let cpu_feature = cpuid.get_feature_info().ok_or(SystemError::ENOSYS)?; + let cpu_extend = cpuid.get_extended_state_info().ok_or(SystemError::ENOSYS)?; + let extend_features = cpuid + .get_extended_feature_info() + .ok_or(SystemError::ENOSYS)?; - let kvm_x86_ops = kvm_x86_ops(); + let kvm_x86_ops = &self.funcs; // 是否已经设置过 - if let Some(ops) = kvm_x86_ops { - kerror!("[KVM] already loaded vendor module {}", ops.name()); + if kvm_x86_ops.is_some() { + kerror!( + "[KVM] already loaded vendor module {}", + kvm_x86_ops.unwrap().name() + ); return Err(SystemError::EEXIST); } @@ -63,14 +289,219 @@ impl KvmArchManager { } // TODO:mmu vendor init - if cpu_feature.has_xsave() { - self.host_xcr0.store( - unsafe { _xgetbv(_XCR_XFEATURE_ENABLED_MASK) }, - Ordering::SeqCst, + // fixme:这里会UD,后续再修 + // self.host_xcr0 = unsafe { _xgetbv(_XCR_XFEATURE_ENABLED_MASK) }; + } + // 保存efer + self.host_efer = unsafe { rdmsr(IA32_EFER) }; + + // 保存xss + if cpu_extend.has_xsaves_xrstors() { + self.host_xss = unsafe { rdmsr(MSR_C5_PMON_BOX_CTRL) }; + } + + // TODO: 初始化性能监视单元(PMU) + // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9518 + if extend_features.has_sha() { + self.host_arch_capabilities = unsafe { + // MSR_IA32_ARCH_CAPABILITIES + rdmsr(0x10a) + } + } + + init_ops.hardware_setup()?; + + self.set_runtime_func(init_ops.runtime_funcs()); + + self.kvm_timer_init()?; + + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9544 + + let kvm_caps = &mut self.kvm_caps; + if !cpu_extend.has_xsaves_xrstors() { + kvm_caps.supported_xss = 0; + } + + if kvm_caps.has_tsc_control { + kvm_caps.max_guest_tsc_khz = 0x7fffffff.min( + ((kvm_caps.max_tsc_scaling_ratio as i128 * TSCManager::tsc_khz() as i128) + >> kvm_caps.tsc_scaling_ratio_frac_bits) as u32, ); } + kvm_caps.default_tsc_scaling_ratio = 1 << kvm_caps.tsc_scaling_ratio_frac_bits; + self.kvm_init_msr_lists(); + Ok(()) + } + + fn kvm_init_msr_lists(&mut self) { + self.msrs_to_save.clear(); + self.emulated_msrs.clear(); + self.msr_based_features.clear(); + + for msr in Self::MSRS_TO_SAVE_BASE { + self.kvm_probe_msr_to_save(*msr); + } + + if self.enable_pmu { + todo!() + } + + for msr in Self::EMULATED_MSRS_ALL { + if !x86_kvm_ops().has_emulated_msr(*msr) { + continue; + } + 
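+            // The vendor module vetoes MSRs it cannot emulate at this point;
+            // e.g. VmxKvmFunc::has_emulated_msr() reports the IA32_VMX_* MSRs
+            // only when nested virtualization is enabled.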
self.emulated_msrs.push(*msr);
+        }
+
+        for msr in IA32_VMX_BASIC..=IA32_VMX_VMFUNC {
+            self.kvm_probe_feature_msr(msr)
+        }
+
+        for msr in Self::MSR_BASED_FEATURES_ALL_EXCEPT_VMX {
+            self.kvm_probe_feature_msr(*msr);
+        }
+    }
+
+    fn kvm_probe_msr_to_save(&mut self, msr: u32) {
+        let cpuid = CpuId::new();
+        let cpu_feat = cpuid.get_feature_info().unwrap();
+        let cpu_extend = cpuid.get_extended_feature_info().unwrap();
+
+        match msr {
+            MSR_C1_PMON_EVNT_SEL0 => {
+                if !cpu_extend.has_mpx() {
+                    return;
+                }
+            }
+
+            IA32_TSC_AUX => {
+                if !cpu_feat.has_tsc() {
+                    return;
+                }
+            }
+            // MSR_IA32_UMWAIT_CONTROL
+            0xe1 => {
+                if !cpu_extend.has_waitpkg() {
+                    return;
+                }
+            }
+            MSR_IA32_RTIT_CTL | MSR_IA32_RTIT_STATUS => {
+                if !cpu_extend.has_processor_trace() {
+                    return;
+                }
+            }
+            MSR_IA32_CR3_MATCH => {
+                // TODO: 判断intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)
+                if !cpu_extend.has_processor_trace() {
+                    return;
+                }
+            }
+            MSR_IA32_RTIT_OUTPUT_BASE | MSR_IA32_RTIT_OUTPUT_MASK_PTRS => {
+                // TODO: 判断!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&!intel_pt_validate_hw_cap(PT_CAP_single_range_output)
+                if !cpu_extend.has_processor_trace() {
+                    return;
+                }
+            }
+            MSR_IA32_ADDR0_START..=MSR_IA32_ADDR3_END => {
+                // TODO: 判断msr_index - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
+                if !cpu_extend.has_processor_trace() {
+                    return;
+                }
+            }
+            IA32_PMC0..=IA32_PMC7 => {
+                // TODO: 判断msr是否符合配置
+            }
+            IA32_PERFEVTSEL0..=IA32_PERFEVTSEL7 => {
+                // TODO: 判断msr是否符合配置
+            }
+            MSR_PERF_FIXED_CTR0..=MSR_PERF_FIXED_CTR2 => {
+                // TODO: 判断msr是否符合配置
+            }
+            MSR_IA32_TSX_CTRL => {
+                // TODO: !(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)
+                // 这个寄存器目前不支持,现在先return
+                // return;
+            }
+            _ => {}
+        }
+
+        self.msrs_to_save.push(msr);
+    }
+
+    fn kvm_probe_feature_msr(&mut self, index: u32) {
+        let mut msr = KvmMsrEntry {
+            index,
+            reserved: Default::default(),
+            data: Default::default(),
+        };
+
+        // Only advertise the MSR when its value can actually be obtained,
+        // either emulated here or provided by the vendor module.
+        if !self.get_msr_feature(&mut msr) {
+            return;
+        }
+
+        self.msr_based_features.push(index);
+    }
+
+    fn get_msr_feature(&self, msr: &mut KvmMsrEntry) -> bool {
+        match msr.index {
+            0x10a => {
+                // MSR_IA32_ARCH_CAPABILITIES,
+                msr.data = self.get_arch_capabilities();
+            }
+            IA32_PERF_CAPABILITIES => {
+                msr.data = self.kvm_caps.supported_perf_cap;
+            }
+            IA32_BIOS_SIGN_ID => {
+                // MSR_IA32_UCODE_REV
+                msr.data = unsafe { rdmsr(msr.index) };
+            }
+            _ => {
+                return x86_kvm_ops().get_msr_feature(msr);
+            }
+        }
+
+        return true;
+    }
+
+    fn get_arch_capabilities(&self) -> u64 {
+        let cpuid = CpuId::new();
+        let _extend_feat = cpuid.get_extended_feature_info().unwrap();
+
+        let mut data = ArchCapabilities::from_bits_truncate(self.host_arch_capabilities)
+            & ArchCapabilities::KVM_SUPPORTED_ARCH_CAP;
+        data.insert(ArchCapabilities::ARCH_CAP_PSCHANGE_MC_NO);
+
+        if *L1TF_VMX_MITIGATION.read() != VmxL1dFlushState::FlushNever {
+            data.insert(ArchCapabilities::ARCH_CAP_SKIP_VMENTRY_L1DFLUSH);
+        }
+
+        // fixme:这里是直接赋值,这里应该是需要判断cpu是否存在某些bug
+
+        data.insert(
+            ArchCapabilities::ARCH_CAP_RDCL_NO
+                | ArchCapabilities::ARCH_CAP_SSB_NO
+                | ArchCapabilities::ARCH_CAP_MDS_NO
+                | ArchCapabilities::ARCH_CAP_GDS_NO,
+        );
+
+        return data.bits();
+    }
+
+    pub fn add_user_return_msr(&mut self, msr: u32) {
+        assert!(self.kvm_uret_msrs_list.len() < Self::KVM_MAX_NR_USER_RETURN_MSRS);
+        self.kvm_uret_msrs_list.push(msr)
+    }
+
+    fn kvm_timer_init(&mut self) -> Result<(), SystemError> {
+        let cpuid = CpuId::new();
+        let cpu_feature = cpuid.get_feature_info().ok_or(SystemError::ENOSYS)?;
+        if cpu_feature.has_tsc() {
+            self.max_tsc_khz = 
TSCManager::tsc_khz(); + } + + // TODO:此处未完成 Ok(()) } } @@ -78,8 +509,76 @@ impl KvmArchManager { /// ### Kvm的功能特性 #[derive(Debug)] pub struct KvmCapabilities { + /// 是否支持控制客户机的 TSC(时间戳计数器)速率 has_tsc_control: bool, + /// 客户机可以使用的 TSC 的最大速率,以khz为单位 max_guest_tsc_khz: u32, + /// TSC 缩放比例的小数部分的位数 tsc_scaling_ratio_frac_bits: u8, - + /// TSC 缩放比例的最大允许值 + max_tsc_scaling_ratio: u64, + /// 默认的 TSC 缩放比例,其值为 1ull << tsc_scaling_ratio_frac_bits + default_tsc_scaling_ratio: u64, + /// 是否支持总线锁定的退出 + has_bus_lock_exit: bool, + /// 是否支持 VM 退出通知 + has_notify_vmexit: bool, + /// 支持的 MCE(机器检查异常)功能的位掩码 + supported_mce_cap: McgCap, + /// 支持的 XCR0 寄存器的位掩码 + supported_xcr0: Xcr0, + /// 支持的 XSS(XSAVE Extended State)寄存器的位掩码 + supported_xss: u64, + /// 支持的性能监控功能的位掩码 + supported_perf_cap: u64, +} + +impl Default for KvmCapabilities { + fn default() -> Self { + Self { + has_tsc_control: Default::default(), + max_guest_tsc_khz: Default::default(), + tsc_scaling_ratio_frac_bits: Default::default(), + max_tsc_scaling_ratio: Default::default(), + default_tsc_scaling_ratio: Default::default(), + has_bus_lock_exit: Default::default(), + has_notify_vmexit: Default::default(), + supported_mce_cap: McgCap::MCG_CTL_P | McgCap::MCG_SER_P, + supported_xcr0: Xcr0::empty(), + supported_xss: Default::default(), + supported_perf_cap: Default::default(), + } + } +} + +bitflags! { + pub struct McgCap: u64 { + const MCG_BANKCNT_MASK = 0xff; /* Number of Banks */ + const MCG_CTL_P = 1 << 8; /* MCG_CTL register available */ + const MCG_EXT_P = 1 << 9; /* Extended registers available */ + const MCG_CMCI_P = 1 << 10; /* CMCI supported */ + const MCG_EXT_CNT_MASK = 0xff0000; /* Number of Extended registers */ + const MCG_EXT_CNT_SHIFT = 16; + const MCG_SER_P = 1 << 24; /* MCA recovery/new status bits */ + const MCG_ELOG_P = 1 << 26; /* Extended error log supported */ + const MCG_LMCE_P = 1 << 27; /* Local machine check supported */ + } +} + +static mut USER_RETURN_MSRS: Option> = None; + +fn user_return_msrs() -> &'static PerCpuVar { + unsafe { USER_RETURN_MSRS.as_ref().unwrap() } +} + +#[derive(Debug, Default, Clone)] +struct KvmUserReturnMsrs { + pub registered: bool, + pub values: [KvmUserReturnMsrsValues; KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS], +} + +#[derive(Debug, Default, Clone)] +struct KvmUserReturnMsrsValues { + pub host: u64, + pub curr: u64, } diff --git a/kernel/src/arch/x86_64/vm/vmx/capabilities.rs b/kernel/src/arch/x86_64/vm/vmx/capabilities.rs new file mode 100644 index 000000000..2cc81520d --- /dev/null +++ b/kernel/src/arch/x86_64/vm/vmx/capabilities.rs @@ -0,0 +1,509 @@ +use x86::{ + msr::{ + IA32_VMX_BASIC, IA32_VMX_CR0_FIXED0, IA32_VMX_CR0_FIXED1, IA32_VMX_CR4_FIXED0, + IA32_VMX_CR4_FIXED1, IA32_VMX_ENTRY_CTLS, IA32_VMX_EPT_VPID_CAP, IA32_VMX_EXIT_CTLS, + IA32_VMX_MISC, IA32_VMX_PINBASED_CTLS, IA32_VMX_PROCBASED_CTLS, IA32_VMX_PROCBASED_CTLS2, + IA32_VMX_TRUE_ENTRY_CTLS, IA32_VMX_TRUE_EXIT_CTLS, IA32_VMX_TRUE_PINBASED_CTLS, + IA32_VMX_TRUE_PROCBASED_CTLS, IA32_VMX_VMCS_ENUM, IA32_VMX_VMFUNC, + }, + vmx::vmcs::control::{ + EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls, + }, +}; + +use crate::arch::vm::{ + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR, PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR, + VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR, VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR, +}; + +use super::{vmcs::feat::VmxFeat, Vmx}; + +#[derive(Debug)] +pub struct VmcsConfig { + pub size: u32, + pub basic_cap: u32, + pub revision_id: u32, + pub pin_based_exec_ctrl: PinbasedControls, + pub cpu_based_exec_ctrl: 
PrimaryControls, + pub cpu_based_2nd_exec_ctrl: SecondaryControls, + pub cpu_based_3rd_exec_ctrl: u32, + pub vmexit_ctrl: ExitControls, + pub vmentry_ctrl: EntryControls, + pub misc: u64, + pub nested: NestedVmxMsrs, +} + +impl Default for VmcsConfig { + fn default() -> Self { + Self { + size: Default::default(), + basic_cap: Default::default(), + revision_id: Default::default(), + pin_based_exec_ctrl: PinbasedControls::empty(), + cpu_based_exec_ctrl: PrimaryControls::empty(), + cpu_based_2nd_exec_ctrl: SecondaryControls::empty(), + cpu_based_3rd_exec_ctrl: Default::default(), + vmexit_ctrl: ExitControls::empty(), + vmentry_ctrl: EntryControls::empty(), + misc: Default::default(), + nested: Default::default(), + } + } +} + +#[derive(Debug, Default)] +pub struct NestedVmxMsrs { + /// 主处理器基于控制,分为低32位和高32位 + pub procbased_ctls_low: u32, + /// 主处理器基于控制,分为低32位和高32位 + pub procbased_ctls_high: u32, + /// 次要处理器控制,分为低32位和高32位 + pub secondary_ctls_low: u32, + /// 次要处理器控制,分为低32位和高32位 + pub secondary_ctls_high: u32, + /// VMX 的针脚基于控制,分为低32位和高32位 + pub pinbased_ctls_low: u32, + /// VMX 的针脚基于控制,分为低32位和高32位 + pub pinbased_ctls_high: u32, + /// VM退出控制,分为低32位和高32位 + pub exit_ctls_low: u32, + /// VM退出控制,分为低32位和高32位 + pub exit_ctls_high: u32, + /// VM进入控制,分为低32位和高32位 + pub entry_ctls_low: u32, + /// VM进入控制,分为低32位和高32位 + pub entry_ctls_high: u32, + /// VMX 的其他杂项控制,分为低32位和高32位 + pub misc_low: u32, + /// VMX 的其他杂项控制,分为低32位和高32位 + pub misc_high: u32, + /// 扩展页表(EPT)的能力信息 + pub ept_caps: u32, + /// 虚拟处理器标识(VPID)的能力信息 + pub vpid_caps: u32, + /// 基本能力 + pub basic: u64, + /// VMX 控制的CR0寄存器的固定位 + pub cr0_fixed0: u64, + /// VMX 控制的CR0寄存器的固定位 + pub cr0_fixed1: u64, + /// VMX 控制的CR4寄存器的固定位 + pub cr4_fixed0: u64, + /// VMX 控制的CR4寄存器的固定位 + pub cr4_fixed1: u64, + /// VMX 控制的VMCS寄存器的编码 + pub vmcs_enum: u64, + /// VM功能控制 + pub vmfunc_controls: u64, +} + +impl NestedVmxMsrs { + pub fn control_msr(low: u32, high: u32) -> u64 { + (high as u64) << 32 | low as u64 + } + + pub fn get_vmx_msr(&self, msr_index: u32) -> Option { + match msr_index { + IA32_VMX_BASIC => { + return Some(self.basic); + } + IA32_VMX_TRUE_PINBASED_CTLS | IA32_VMX_PINBASED_CTLS => { + let mut data = + NestedVmxMsrs::control_msr(self.pinbased_ctls_low, self.pinbased_ctls_high); + if msr_index == IA32_VMX_PINBASED_CTLS { + data |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + } + return Some(data); + } + IA32_VMX_TRUE_PROCBASED_CTLS | IA32_VMX_PROCBASED_CTLS => { + let mut data = + NestedVmxMsrs::control_msr(self.procbased_ctls_low, self.procbased_ctls_high); + if msr_index == IA32_VMX_PROCBASED_CTLS { + data |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + } + return Some(data); + } + IA32_VMX_TRUE_EXIT_CTLS | IA32_VMX_EXIT_CTLS => { + let mut data = NestedVmxMsrs::control_msr(self.exit_ctls_low, self.exit_ctls_high); + if msr_index == IA32_VMX_EXIT_CTLS { + data |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + } + return Some(data); + } + IA32_VMX_TRUE_ENTRY_CTLS | IA32_VMX_ENTRY_CTLS => { + let mut data = + NestedVmxMsrs::control_msr(self.entry_ctls_low, self.entry_ctls_high); + if msr_index == IA32_VMX_ENTRY_CTLS { + data |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + } + return Some(data); + } + IA32_VMX_MISC => { + return Some(NestedVmxMsrs::control_msr(self.misc_low, self.misc_high)); + } + IA32_VMX_CR0_FIXED0 => { + return Some(self.cr0_fixed0); + } + IA32_VMX_CR0_FIXED1 => { + return Some(self.cr0_fixed1); + } + IA32_VMX_CR4_FIXED0 => { + return Some(self.cr4_fixed0); + } + IA32_VMX_CR4_FIXED1 => { + return Some(self.cr4_fixed1); + } + IA32_VMX_VMCS_ENUM => { + return 
Some(self.vmcs_enum); + } + IA32_VMX_PROCBASED_CTLS2 => { + return Some(NestedVmxMsrs::control_msr( + self.secondary_ctls_low, + self.secondary_ctls_high, + )); + } + IA32_VMX_EPT_VPID_CAP => { + return Some(self.ept_caps as u64 | ((self.vpid_caps as u64) << 32)); + } + IA32_VMX_VMFUNC => { + return Some(self.vmfunc_controls); + } + _ => { + return None; + } + } + } +} + +#[derive(Debug, Default)] +pub struct VmxCapability { + pub ept: EptFlag, + pub vpid: VpidFlag, +} + +#[derive(Debug)] +pub enum ProcessorTraceMode { + System, + HostGuest, +} + +bitflags! { + #[derive(Default)] + pub struct VpidFlag: u32 { + /// 表示处理器支持 INVVPID 指令 + const INVVPID = 1 << 0; /* (32 - 32) */ + /// 表示 VPID 支持以单独地址方式进行范围 + const EXTENT_INDIVIDUAL_ADDR = 1 << 8; /* (40 - 32) */ + /// 表示 VPID 支持以单个上下文方式进行范围 + const EXTENT_SINGLE_CONTEXT = 1 << 9; /* (41 - 32) */ + /// 表示 VPID 支持以全局上下文方式进行范围 + const EXTENT_GLOBAL_CONTEXT = 1 << 10; /* (42 - 32) */ + /// 表示 VPID 支持以单个非全局方式进行范围 + const EXTENT_SINGLE_NON_GLOBAL = 1 << 11; /* (43 - 32) */ + } + + #[derive(Default)] + pub struct EptFlag: u32 { + /// EPT 条目是否允许执行 + const EPT_EXECUTE_ONLY = 1; + /// 处理器是否支持 4 级页表 + const EPT_PAGE_WALK_4 = 1 << 6; + /// 处理器是否支持 5 级页表 + const EPT_PAGE_WALK_5 = 1 << 7; + /// EPT 表的内存类型是否为不可缓存(uncached) + const EPTP_UC = 1 << 8; + /// EPT 表的内存类型是否为写回(write-back) + const EPTP_WB = 1 << 14; + /// 处理器是否支持 2MB 大页 + const EPT_2MB_PAGE = 1 << 16; + /// 处理器是否支持 1GB 大页 + const EPT_1GB_PAGE = 1 << 17; + /// 处理器是否支持 INV-EPT 指令,用于刷新 EPT TLB + const EPT_INVEPT = 1 << 20; + /// EPT 表是否支持访问位(Access-Dirty) + const EPT_AD = 1 << 21; + /// 处理器是否支持上下文扩展 + const EPT_EXTENT_CONTEXT = 1 << 25; + /// 处理器是否支持全局扩展 + const EPT_EXTENT_GLOBAL = 1 << 26; + } +} + +impl VmxCapability { + pub fn set_val_from_msr_val(&mut self, val: u64) { + self.ept = EptFlag::from_bits_truncate(val as u32); + self.vpid = VpidFlag::from_bits_truncate((val >> 32) as u32); + } +} + +impl Vmx { + /// 检查处理器是否支持VMX基本控制结构的输入输出功能 + #[inline] + pub fn has_basic_inout(&self) -> bool { + return ((self.vmcs_config.basic_cap as u64) << 32) & VmxFeat::VMX_BASIC_INOUT != 0; + } + + /// 检查处理器是否支持虚拟的非屏蔽中断(NMI) + #[inline] + pub fn has_virtual_nmis(&self) -> bool { + return self + .vmcs_config + .pin_based_exec_ctrl + .contains(PinbasedControls::VIRTUAL_NMIS) + && self + .vmcs_config + .cpu_based_exec_ctrl + .contains(PrimaryControls::NMI_WINDOW_EXITING); + } + + /// 检查处理器是否支持VMX的抢占计时器功能 + #[inline] + pub fn has_preemption_timer(&self) -> bool { + return self + .vmcs_config + .pin_based_exec_ctrl + .contains(PinbasedControls::VMX_PREEMPTION_TIMER); + } + + /// 检查处理器是否支持VMX的posted interrupt功能 + #[inline] + pub fn has_posted_intr(&self) -> bool { + return self + .vmcs_config + .pin_based_exec_ctrl + .contains(PinbasedControls::POSTED_INTERRUPTS); + } + + /// 是否支持加载IA32_EFER寄存器 + #[inline] + pub fn has_load_ia32_efer(&self) -> bool { + return self + .vmcs_config + .vmentry_ctrl + .contains(EntryControls::LOAD_IA32_EFER); + } + + /// 是否支持加载IA32_PERF_GLOBAL_CTRL寄存器 + #[inline] + pub fn has_load_perf_global_ctrl(&self) -> bool { + return self + .vmcs_config + .vmentry_ctrl + .contains(EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL); + } + + /// 是否支持加载边界检查配置寄存器(MPX) + #[inline] + pub fn has_mpx(&self) -> bool { + return self + .vmcs_config + .vmentry_ctrl + .contains(EntryControls::LOAD_IA32_BNDCFGS); + } + + /// 是否支持虚拟处理器的任务优先级(TPR)影子 + #[inline] + pub fn has_tpr_shadow(&self) -> bool { + return self + .vmcs_config + .cpu_based_exec_ctrl + .contains(PrimaryControls::USE_TPR_SHADOW); + } + + /// 检查处理器是否支持 VMX中的 
VPID(Virtual Processor ID)功能 + /// + /// VPID 允许虚拟机监视器为每个虚拟处理器分配唯一的标识符,从而使得在不同的虚拟机之间进行快速的上下文切换和恢复成为可能。 + /// + /// 通过使用 VPID,VMM 可以更快速地识别和恢复之前保存的虚拟处理器的状态,从而提高了虚拟化性能和效率。 + #[inline] + pub fn has_vpid(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENABLE_VPID); + } + + /// 是否支持invvpid + /// + /// INVVPID 指令用于通知处理器无效化指定虚拟处理器标识符(VPID)相关的 TLB(Translation Lookaside Buffer)条目 + #[inline] + pub fn has_invvpid(&self) -> bool { + return self.vmx_cap.vpid.contains(VpidFlag::INVVPID); + } + + /// VPID 是否支持以单独地址方式进行范围 + #[inline] + pub fn has_invvpid_individual_addr(&self) -> bool { + return self.vmx_cap.vpid.contains(VpidFlag::EXTENT_INDIVIDUAL_ADDR); + } + + /// VPID 是否支持以单个上下文方式进行范围 + #[inline] + pub fn has_invvpid_single(&self) -> bool { + return self.vmx_cap.vpid.contains(VpidFlag::EXTENT_SINGLE_CONTEXT); + } + + /// VPID 是否支持以全局上下文方式进行范围 + #[inline] + pub fn has_invvpid_global(&self) -> bool { + return self.vmx_cap.vpid.contains(VpidFlag::EXTENT_GLOBAL_CONTEXT); + } + + /// 是否启用EPT(Extended Page Tables) + /// + /// EPT:EPT 是一种硬件虚拟化技术,允许虚拟机管理程序(例如 Hypervisor) 控制客户操作系统中虚拟地址和物理地址之间的映射。 + /// + /// 通过启用 EPT,处理器可以将虚拟地址直接映射到物理地址,从而提高虚拟机的性能和安全性。 + #[inline] + pub fn has_ept(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENABLE_EPT); + } + + /// 是否支持4级页表 + #[inline] + pub fn has_ept_4levels(&self) -> bool { + return self.vmx_cap.ept.contains(EptFlag::EPT_PAGE_WALK_4); + } + + /// 判断mt(Memory type)是否为write back + #[inline] + pub fn has_ept_mt_wb(&self) -> bool { + return self.vmx_cap.ept.contains(EptFlag::EPTP_WB); + } + + /// EPT是否支持全局拓展 + #[inline] + pub fn has_invept_global(&self) -> bool { + return self.vmx_cap.ept.contains(EptFlag::EPT_EXTENT_GLOBAL); + } + + /// EPT是否支持访问位 + #[inline] + pub fn has_ept_ad_bits(&self) -> bool { + return self.vmx_cap.ept.contains(EptFlag::EPT_AD); + } + + /// 是否支持 VMX 中的无限制客户(unrestricted guest)功能 + /// + /// 无限制客户功能允许客户操作系统在未受到主机操作系统干预的情况下运行 + #[inline] + pub fn has_unrestricted_guest(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::UNRESTRICTED_GUEST); + } + + /// 是否支持 VMX 中的 FlexPriority 功能 + /// + /// FlexPriority 是一种功能,可以在 TPR shadow 和虚拟化 APIC 访问同时可用时启用。 + /// + /// TPR shadow 允许虚拟机管理程序(VMM)跟踪虚拟机中处理器的 TPR 值,并在需要时拦截和修改。 + /// + /// 虚拟化 APIC 访问允许 VMM 控制虚拟机中的 APIC 寄存器访问。 + #[inline] + pub fn has_flexproirity(&self) -> bool { + return self.has_tpr_shadow() && self.has_virtualize_apic_accesses(); + } + + /// 是否支持 VMX 中的虚拟化 APIC 访问功能。 + /// + /// 当启用此功能时,虚拟机管理程序(VMM)可以控制虚拟机中的 APIC 寄存器访问。 + #[inline] + pub fn has_virtualize_apic_accesses(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::VIRTUALIZE_APIC); + } + + /// 是否支持 VMX 中的 ENCLS 指令导致的 VM 退出功能 + #[inline] + pub fn has_encls_vmexit(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENCLS_EXITING); + } + + /// 是否支持 VMX 中的 PLE (Pause Loop Exiting) 功能。 + #[inline] + pub fn has_ple(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::PAUSE_LOOP_EXITING); + } + + /// 是否支持 VMX 中的 APICv 功能 + #[inline] + pub fn has_apicv(&self) -> bool { + return self.has_apic_register_virt() + && self.has_posted_intr() + && self.has_virtual_intr_delivery(); + } + + /// 是否支持虚拟化的 APIC 寄存器功能 + #[inline] + pub fn has_apic_register_virt(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + 
.contains(SecondaryControls::VIRTUALIZE_APIC_REGISTER); + } + + /// 是否支持虚拟化的中断传递功能 + #[inline] + pub fn has_virtual_intr_delivery(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY); + } + + /// 是否支持虚拟化的中断注入(Inter-Processor Interrupt Virtualization,IPIV) + #[inline] + pub fn has_ipiv(&self) -> bool { + return false; + } + + /// 是否支持虚拟化的 TSC 缩放功能 + #[inline] + pub fn has_tsc_scaling(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::USE_TSC_SCALING); + } + + /// 是否支持虚拟化的页修改日志(Page Modification Logging) + #[inline] + pub fn has_pml(&self) -> bool { + return self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENABLE_PML); + } + + /// 检查 CPU 是否支持使用 MSR 位图来控制 VMX + #[inline] + pub fn has_msr_bitmap(&self) -> bool { + return self + .vmcs_config + .cpu_based_exec_ctrl + .contains(PrimaryControls::USE_MSR_BITMAPS); + } + + #[inline] + pub fn has_bus_lock_detection(&self) -> bool { + false + } + + #[inline] + pub fn has_notify_vmexit(&self) -> bool { + false + } +} diff --git a/kernel/src/arch/x86_64/vm/vmx/mod.rs b/kernel/src/arch/x86_64/vm/vmx/mod.rs index e9921ec71..b6c4b4349 100644 --- a/kernel/src/arch/x86_64/vm/vmx/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/mod.rs @@ -1,10 +1,614 @@ +use core::{ + mem::MaybeUninit, + sync::atomic::{AtomicBool, Ordering}, +}; + +use crate::{ + arch::{ + vm::{ + asm::KvmX86Asm, + kvm_host::{vcpu::VirCpuRequest, X86KvmArch}, + vmx::vmcs::vmx_area, + }, + CurrentIrqArch, VirtCpuArch, + }, + exception::InterruptArch, + kdebug, + libs::{once::Once, spinlock::SpinLock}, + mm::{ + percpu::{PerCpu, PerCpuVar}, + virt_2_phys, PhysAddr, + }, + smp::{core::smp_get_processor_id, cpu::ProcessorId}, + virt::vm::{kvm_dev::kvm_init, kvm_host::vcpu::VirtCpu}, +}; +use alloc::{alloc::Global, boxed::Box, collections::LinkedList, sync::Arc, vec::Vec}; +use bitmap::{traits::BitMapOps, AllocBitmap, StaticBitmap}; use raw_cpuid::CpuId; +use system_error::SystemError; +use x86::{ + controlregs::Xcr0, + msr::{ + rdmsr, IA32_CSTAR, IA32_EFER, IA32_FMASK, IA32_FS_BASE, IA32_GS_BASE, IA32_KERNEL_GSBASE, + IA32_LSTAR, IA32_SMBASE, IA32_STAR, IA32_SYSENTER_CS, IA32_SYSENTER_EIP, IA32_SYSENTER_ESP, + IA32_TIME_STAMP_COUNTER, IA32_TSC_AUX, IA32_VMX_BASIC, IA32_VMX_CR0_FIXED0, + IA32_VMX_CR0_FIXED1, IA32_VMX_CR4_FIXED0, IA32_VMX_CR4_FIXED1, IA32_VMX_ENTRY_CTLS, + IA32_VMX_EPT_VPID_CAP, IA32_VMX_EXIT_CTLS, IA32_VMX_MISC, IA32_VMX_PINBASED_CTLS, + IA32_VMX_PROCBASED_CTLS, IA32_VMX_PROCBASED_CTLS2, IA32_VMX_TRUE_ENTRY_CTLS, + IA32_VMX_TRUE_EXIT_CTLS, IA32_VMX_TRUE_PINBASED_CTLS, IA32_VMX_TRUE_PROCBASED_CTLS, + IA32_VMX_VMCS_ENUM, IA32_VMX_VMFUNC, MSR_CORE_C1_RESIDENCY, MSR_CORE_C3_RESIDENCY, + MSR_CORE_C6_RESIDENCY, MSR_CORE_C7_RESIDENCY, MSR_IA32_ADDR0_START, MSR_IA32_ADDR3_END, + MSR_IA32_CR3_MATCH, MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK_PTRS, + MSR_IA32_RTIT_STATUS, MSR_IA32_TSX_CTRL, MSR_LASTBRANCH_TOS, MSR_LBR_SELECT, + }, + vmx::vmcs::{ + control::{ + EntryControls, ExitControls, PrimaryControls, SecondaryControls, PINBASED_EXEC_CONTROLS, + }, + host, + }, +}; +use x86_64::instructions::tables::sidt; + +use crate::{ + arch::{ + vm::{vmx::vmcs::feat::VmxFeat, x86_kvm_manager_mut, McgCap}, + KvmArch, + }, + kerror, kwarn, + libs::{lazy_init::Lazy, rwlock::RwLock}, + virt::vm::kvm_host::Vm, +}; + +use self::{ + capabilities::{NestedVmxMsrs, ProcessorTraceMode, VmcsConfig, VmxCapability}, + vmcs::{ + 
current_loaded_vmcs_list_mut, current_vmcs, current_vmcs_mut, LockedLoadedVmcs, + VMControlStructure, VmxMsrBitmapAccess, VmxMsrBitmapAction, PERCPU_LOADED_VMCS_LIST, + PERCPU_VMCS, VMXAREA, + }, +}; + +use super::{ + asm::VmxAsm, + init_kvm_arch, + kvm_host::{KvmFunc, KvmInitFunc, MsrFilterType}, + x86_kvm_manager, KvmArchManager, CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR, + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR, VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR, + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR, +}; + +pub mod capabilities; +pub mod vmcs; +pub struct VmxKvmInitFunc; + +impl VmxKvmInitFunc { + pub fn setup_per_cpu(&self) { + let mut vmcs_areas = Vec::new(); + vmcs_areas.resize(PerCpu::MAX_CPU_NUM as usize, VMControlStructure::new()); + unsafe { VMXAREA = PerCpuVar::new(vmcs_areas) }; + + let mut percpu_current_vmcs = Vec::new(); + percpu_current_vmcs.resize(PerCpu::MAX_CPU_NUM as usize, None); + unsafe { PERCPU_VMCS = PerCpuVar::new(percpu_current_vmcs) } + + let mut percpu_loaded_vmcs_lists = Vec::new(); + percpu_loaded_vmcs_lists.resize(PerCpu::MAX_CPU_NUM as usize, LinkedList::new()); + unsafe { PERCPU_LOADED_VMCS_LIST = PerCpuVar::new(percpu_loaded_vmcs_lists) } + } +} + +impl KvmInitFunc for VmxKvmInitFunc { + #[inline(never)] + fn hardware_setup(&self) -> Result<(), SystemError> { + let idt = sidt(); + let cpuid = CpuId::new(); + let cpu_extend_feature = cpuid + .get_extended_processor_and_feature_identifiers() + .ok_or(SystemError::ENOSYS)?; + + let mut vmx_init: Box = unsafe { + Box::try_new_zeroed_in(Global) + .map_err(|_| SystemError::ENOMEM)? + .assume_init() + }; + vmx_init.host_idt_base = idt.base.as_u64(); + Vmx::set_up_user_return_msrs(); + + Vmx::setup_vmcs_config(&mut vmx_init.vmcs_config, &mut vmx_init.vmx_cap)?; + + let manager = x86_kvm_manager_mut(); + let kvm_cap = &mut manager.kvm_caps; + + if vmx_init.has_mpx() { + kvm_cap.supported_xcr0 &= !(Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDCSR_STATE); + } + + // 判断是否启用vpid + if !vmx_init.has_vpid() + || !vmx_init.has_invvpid() + || !vmx_init.has_invvpid_single() + || !vmx_init.has_invvpid_global() + { + vmx_init.enable_vpid = false; + } + + if !vmx_init.has_ept() + || !vmx_init.has_ept_4levels() + || !vmx_init.has_ept_mt_wb() + || !vmx_init.has_invept_global() + { + vmx_init.enable_ept = false; + } + + // 是否启用了 EPT 并且检查 CPU 是否支持 Execute Disable(NX)功能 + // Execute Disable 是一种 CPU 功能,可以防止代码在数据内存区域上执行 + if !vmx_init.enable_ept && !cpu_extend_feature.has_execute_disable() { + kerror!("[KVM] NX (Execute Disable) not supported"); + return Err(SystemError::ENOSYS); + } + + if !vmx_init.has_ept_ad_bits() || !vmx_init.enable_ept { + vmx_init.enable_ept_ad = false; + } + + if !vmx_init.has_unrestricted_guest() || !vmx_init.enable_ept { + vmx_init.enable_unrestricted_guest = false; + } + + if !vmx_init.has_flexproirity() { + vmx_init.enable_flexpriority = false; + } + + if !vmx_init.has_virtual_nmis() { + vmx_init.enable_vnmi = false; + } + + if !vmx_init.has_encls_vmexit() { + vmx_init.enable_sgx = false; + } + + if !vmx_init.enable_flexpriority { + VmxKvmFunc::CONFIG.write().have_set_apic_access_page_addr = false; + } + + if !vmx_init.has_tpr_shadow() { + VmxKvmFunc::CONFIG.write().have_update_cr8_intercept = false; + } + + // TODO:https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#8501 - 8513 + + if !vmx_init.has_ple() { + vmx_init.ple_gap = 0; + vmx_init.ple_window = 0; + vmx_init.ple_window_grow = 0; + vmx_init.ple_window_max = 0; + vmx_init.ple_window_shrink = 0; + } + + if !vmx_init.has_apicv() { + vmx_init.enable_apicv = 
false; + } + + if !vmx_init.enable_apicv { + // TODO: 设置sync_pir_to_irr + } + + if !vmx_init.enable_apicv || !vmx_init.has_ipiv() { + vmx_init.enable_ipiv = false; + } + + if vmx_init.has_tsc_scaling() { + kvm_cap.has_tsc_control = true; + } + + kvm_cap.max_tsc_scaling_ratio = 0xffffffffffffffff; + kvm_cap.tsc_scaling_ratio_frac_bits = 48; + kvm_cap.has_bus_lock_exit = vmx_init.has_bus_lock_detection(); + kvm_cap.has_notify_vmexit = vmx_init.has_notify_vmexit(); + + vmx_init.vpid_bitmap.lock().set_all(false); + + if vmx_init.enable_ept { + // TODO: mmu_set_ept_masks + kwarn!("mmu_set_ept_masks TODO!"); + } + + kwarn!("vmx_setup_me_spte_mask TODO!"); + + kwarn!("kvm_configure_mmu TODO!"); + + if !vmx_init.enable_ept || !vmx_init.enable_ept_ad || !vmx_init.has_pml() { + vmx_init.enable_pml = false; + } + + if !vmx_init.enable_pml { + // TODO: Set cpu dirty log size + } + + if !vmx_init.has_preemption_timer() { + vmx_init.enable_preemption_timer = false; + } + + if vmx_init.enable_preemption_timer { + // TODO + } + + if !vmx_init.enable_preemption_timer { + // TODO + } + + kvm_cap + .supported_mce_cap + .insert(McgCap::MCG_LMCE_P | McgCap::MCG_CMCI_P); + + // TODO: pt_mode + + // TODO: setup_default_sgx_lepubkeyhash + + // TODO: nested + + // TODO: vmx_set_cpu_caps + init_vmx(vmx_init); + self.setup_per_cpu(); + + Ok(()) + } + + fn handle_intel_pt_intr(&self) -> u32 { + todo!() + } + + fn runtime_funcs(&self) -> &'static dyn super::kvm_host::KvmFunc { + &VmxKvmFunc + } +} + +#[derive(Debug)] +pub struct VmxKvmFunc; + +pub struct VmxKvmFuncConfig { + pub have_set_apic_access_page_addr: bool, + pub have_update_cr8_intercept: bool, +} + +impl VmxKvmFunc { + pub const CONFIG: RwLock = RwLock::new(VmxKvmFuncConfig { + have_set_apic_access_page_addr: true, + have_update_cr8_intercept: true, + }); + + pub fn vcpu_load_vmcs( + vcpu: &mut VirtCpu, + cpu: ProcessorId, + _buddy: Option>, + ) { + let vmx = vcpu.vmx(); + let already_loaded = vmx.loaded_vmcs.lock().cpu == cpu; + + if !already_loaded { + Self::loaded_vmcs_clear(&vmx.loaded_vmcs); + let _irq_guard = unsafe { CurrentIrqArch::save_and_disable_irq() }; + + current_loaded_vmcs_list_mut().push_back(vmx.loaded_vmcs.clone()); + } + + if let Some(prev) = current_vmcs() { + let vmcs = vmx.loaded_vmcs.lock().vmcs.clone(); + if !Arc::ptr_eq(&vmcs, prev) { + VmxAsm::vmcs_load(vmcs.phys_addr()); + *current_vmcs_mut() = Some(vmcs); + + // TODO:buddy barrier? + } + } else { + let vmcs = vmx.loaded_vmcs.lock().vmcs.clone(); + VmxAsm::vmcs_load(vmcs.phys_addr()); + *current_vmcs_mut() = Some(vmcs); + + // TODO:buddy barrier? 
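+            // (Background from the Linux implementation: the missing "buddy
+            // barrier" is an indirect-branch-prediction barrier issued whenever
+            // the active VMCS changes; it is skipped only when switching to the
+            // vCPU's nested "buddy" VMCS, because a single nested transition
+            // may swap the active VMCS several times.)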
+ } + + if !already_loaded { + let mut pseudo_descriptpr: x86::dtables::DescriptorTablePointer = + Default::default(); + unsafe { + x86::dtables::sgdt(&mut pseudo_descriptpr); + }; + + vmx.loaded_vmcs.lock().cpu = cpu; + let id = vmx.loaded_vmcs.lock().vmcs.lock().revision_id(); + kdebug!("revision_id {id}"); + vcpu.request(VirCpuRequest::KVM_REQ_TLB_FLUSH); + + VmxAsm::vmx_vmwrite( + host::TR_BASE, + KvmX86Asm::get_segment_base( + pseudo_descriptpr.base, + pseudo_descriptpr.limit, + unsafe { x86::task::tr().bits() }, + ), + ); + + VmxAsm::vmx_vmwrite(host::GDTR_BASE, pseudo_descriptpr.base as usize as u64); + + VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_ESP, unsafe { rdmsr(IA32_SYSENTER_ESP) }); + } + } + + pub fn loaded_vmcs_clear(loaded_vmcs: &Arc) { + let mut guard = loaded_vmcs.lock(); + if guard.cpu == ProcessorId::INVALID { + return; + } + + if guard.cpu == smp_get_processor_id() { + if let Some(vmcs) = current_vmcs() { + if Arc::ptr_eq(vmcs, &guard.vmcs) { + *current_vmcs_mut() = None; + } + } + + VmxAsm::vmclear(guard.vmcs.phys_addr()); + + if let Some(shadow) = &guard.shadow_vmcs { + if guard.launched { + VmxAsm::vmclear(shadow.phys_addr()); + } + } + + let _ = current_loaded_vmcs_list_mut().extract_if(|x| Arc::ptr_eq(&x, loaded_vmcs)); + + guard.cpu = ProcessorId::INVALID; + guard.launched = false; + } else { + // 交由对应cpu处理 + todo!() + } + } +} + +impl KvmFunc for VmxKvmFunc { + fn name(&self) -> &'static str { + "VMX" + } + + fn hardware_enable(&self) -> Result<(), SystemError> { + let vmcs = vmx_area().get().as_ref(); + + kdebug!("vmcs idx {}", vmcs.abort); + + let phys_addr = virt_2_phys(vmcs as *const _ as usize); + + // TODO: intel_pt_handle_vmx(1); + + VmxAsm::kvm_cpu_vmxon(PhysAddr::new(phys_addr))?; + + Ok(()) + } + + fn vm_init(&self) -> X86KvmArch { + let vmx_init = vmx_info(); + + let mut arch = X86KvmArch::default(); + if vmx_init.ple_gap == 0 { + arch.pause_in_guest = true; + } + + return arch; + } + + fn vcpu_create(&self, vcpu: &mut VirtCpu, vm: &Vm) { + VmxVCpuPriv::init(vcpu, vm); + } + + fn vcpu_load(&self, vcpu: &mut VirtCpu, cpu: crate::smp::cpu::ProcessorId) { + Self::vcpu_load_vmcs(vcpu, cpu, None); + // TODO: vmx_vcpu_pi_load + } + + fn cache_reg(&self, vcpu: &VirtCpuArch, reg: super::kvm_host::KvmReg) { + todo!() + } + + fn apicv_pre_state_restore(&self, vcpu: &mut VirtCpu) { + // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#6924 + // TODO: pi + // todo!() + } + + fn set_msr(&self, vcpu: &mut VirtCpuArch, msr: super::asm::MsrData) { + todo!() + } + + fn vcpu_reset(&self, vcpu: &mut VirtCpu, init_event: bool) { + todo!() + } + + fn set_rflags(&self, vcpu: &mut VirtCpu, rflags: x86::bits64::rflags::RFlags) { + todo!() + } + + fn set_cr0(&self, vcpu: &mut VirtCpu, cr0: x86::controlregs::Cr0) { + todo!() + } + + fn set_cr4(&self, vcpu: &mut VirtCpu, cr4: x86::controlregs::Cr4) { + todo!() + } + + fn set_efer(&self, vcpu: &mut VirtCpu, efer: x86_64::registers::control::EferFlags) { + todo!() + } + + fn update_exception_bitmap(&self, vcpu: &mut VirtCpu) { + todo!() + } + + fn has_emulated_msr(&self, msr: u32) -> bool { + match msr { + IA32_SMBASE => { + return vmx_info().enable_unrestricted_guest + || vmx_info().emulate_invalid_guest_state; + } + + IA32_VMX_BASIC..=IA32_VMX_VMFUNC => { + return vmx_info().nested; + } + + 0xc001011f | 0xc0000104 => { + // MSR_AMD64_VIRT_SPEC_CTRL | MSR_AMD64_TSC_RATIO + return false; + } + + _ => { + return true; + } + } + } + + fn get_msr_feature(&self, msr: &mut super::asm::KvmMsrEntry) -> bool { + match 
msr.index { + IA32_VMX_BASIC..=IA32_VMX_VMFUNC => { + if !vmx_info().nested { + return false; + } + + match vmx_info().vmcs_config.nested.get_vmx_msr(msr.index) { + Some(data) => { + msr.data = data; + return true; + } + None => { + return false; + } + } + } + _ => { + return false; + } + } + } + + fn get_rflags(&self, vcpu: &VirtCpu) -> x86::bits64::rflags::RFlags { + todo!() + } +} + +static mut VMX: Option = None; + +#[inline] +pub fn vmx_info() -> &'static Vmx { + unsafe { VMX.as_ref().unwrap() } +} -pub struct Vmx; +#[inline(never)] +pub fn init_vmx(vmx: Box) { + static INIT_ONCE: AtomicBool = AtomicBool::new(false); + if INIT_ONCE + .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst) + .is_ok() + { + unsafe { VMX = Some(*vmx) }; + } else { + panic!("init_vmx can only be called once"); + } +} + +#[derive(Debug)] +pub struct Vmx { + pub host_idt_base: u64, + pub vmcs_config: VmcsConfig, + pub vmx_cap: VmxCapability, + pub vpid_bitmap: SpinLock>, + pub enable_vpid: bool, + pub enable_ept: bool, + pub enable_ept_ad: bool, + pub enable_unrestricted_guest: bool, + pub emulate_invalid_guest_state: bool, + pub enable_flexpriority: bool, + pub enable_vnmi: bool, + pub enable_sgx: bool, + pub enable_apicv: bool, + pub enable_ipiv: bool, + pub enable_pml: bool, + pub enable_preemption_timer: bool, + + pub nested: bool, + + pub ple_gap: u32, + pub ple_window: u32, + pub ple_window_grow: u32, + pub ple_window_max: u32, + pub ple_window_shrink: u32, + + pub pt_mode: ProcessorTraceMode, +} + +impl Default for Vmx { + fn default() -> Self { + Self { + host_idt_base: Default::default(), + vmcs_config: Default::default(), + vmx_cap: Default::default(), + vpid_bitmap: SpinLock::new(StaticBitmap::new()), + enable_vpid: true, + enable_ept: true, + enable_ept_ad: true, + enable_unrestricted_guest: true, + enable_flexpriority: true, + enable_vnmi: true, + enable_sgx: true, + ple_gap: 128, + ple_window: 4096, + ple_window_grow: 2, + ple_window_max: u32::MAX, + ple_window_shrink: 0, + enable_apicv: true, + enable_ipiv: true, + enable_pml: true, + enable_preemption_timer: true, + pt_mode: ProcessorTraceMode::System, + emulate_invalid_guest_state: true, + + // 目前先不管嵌套虚拟化,后续再实现 + nested: true, + } + } +} impl Vmx { - /// @brief 查看CPU是否支持虚拟化 - pub fn kvm_arch_cpu_supports_vm() -> bool { + /* + * Internal error codes that are used to indicate that MSR emulation encountered + * an error that should result in #GP in the guest, unless userspace + * handles it. + */ + pub const KVM_MSR_RET_INVALID: u32 = 2; /* in-kernel MSR emulation #GP condition */ + pub const KVM_MSR_RET_FILTERED: u32 = 3; /* #GP due to userspace MSR filter */ + + pub const MAX_POSSIBLE_PASSTHROUGH_MSRS: usize = 16; + + pub const VMX_POSSIBLE_PASSTHROUGH_MSRS: [u32; Self::MAX_POSSIBLE_PASSTHROUGH_MSRS] = [ + 0x48, // MSR_IA32_SPEC_CTRL + 0x49, // MSR_IA32_PRED_CMD + 0x10b, // MSR_IA32_FLUSH_CMD + IA32_TIME_STAMP_COUNTER, + IA32_FS_BASE, + IA32_GS_BASE, + IA32_KERNEL_GSBASE, + 0x1c4, // MSR_IA32_XFD + 0x1c5, // MSR_IA32_XFD_ERR + IA32_SYSENTER_CS, + IA32_SYSENTER_ESP, + IA32_SYSENTER_EIP, + MSR_CORE_C1_RESIDENCY, + MSR_CORE_C3_RESIDENCY, + MSR_CORE_C6_RESIDENCY, + MSR_CORE_C7_RESIDENCY, + ]; + + /// ### 查看CPU是否支持虚拟化 + pub fn check_vmx_support() -> bool { let cpuid = CpuId::new(); // Check to see if CPU is Intel (“GenuineIntel”). 
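+        // (AMD CPUs advertise SVM rather than VMX, so anything that is not
+        // "GenuineIntel" is rejected here by design.)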
if let Some(vi) = cpuid.get_vendor_info() { @@ -21,6 +625,460 @@ impl Vmx { } return true; } + + #[inline(never)] + pub fn set_up_user_return_msrs() { + const VMX_URET_MSRS_LIST: &'static [u32] = &[ + IA32_FMASK, + IA32_LSTAR, + IA32_CSTAR, + IA32_EFER, + IA32_TSC_AUX, + IA32_STAR, + // 这个寄存器会出错<,先注释掉 + // MSR_IA32_TSX_CTRL, + ]; + + let manager = x86_kvm_manager_mut(); + for msr in VMX_URET_MSRS_LIST { + manager.add_user_return_msr(*msr); + } + } + + /// 初始化设置vmcs的config + #[inline(never)] + pub fn setup_vmcs_config( + vmcs_config: &mut VmcsConfig, + vmx_cap: &mut VmxCapability, + ) -> Result<(), SystemError> { + const VMCS_ENTRY_EXIT_PAIRS: &'static [VmcsEntryExitPair] = &[ + VmcsEntryExitPair::new( + EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL, + ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL, + ), + VmcsEntryExitPair::new(EntryControls::LOAD_IA32_PAT, ExitControls::LOAD_IA32_PAT), + VmcsEntryExitPair::new(EntryControls::LOAD_IA32_EFER, ExitControls::LOAD_IA32_EFER), + VmcsEntryExitPair::new( + EntryControls::LOAD_IA32_BNDCFGS, + ExitControls::CLEAR_IA32_BNDCFGS, + ), + VmcsEntryExitPair::new( + EntryControls::LOAD_IA32_RTIT_CTL, + ExitControls::CLEAR_IA32_RTIT_CTL, + ), + ]; + + let mut cpu_based_exec_control = VmxFeat::adjust_primary_controls()?; + + let mut cpu_based_2nd_exec_control = + if cpu_based_exec_control.contains(PrimaryControls::SECONDARY_CONTROLS) { + VmxFeat::adjust_secondary_controls()? + } else { + SecondaryControls::empty() + }; + + if cpu_based_2nd_exec_control.contains(SecondaryControls::VIRTUALIZE_APIC) { + cpu_based_exec_control.remove(PrimaryControls::USE_TPR_SHADOW) + } + + if !cpu_based_exec_control.contains(PrimaryControls::USE_TPR_SHADOW) { + cpu_based_2nd_exec_control.remove( + SecondaryControls::VIRTUALIZE_APIC_REGISTER + | SecondaryControls::VIRTUALIZE_X2APIC + | SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY, + ) + } + + let cap = unsafe { rdmsr(IA32_VMX_EPT_VPID_CAP) }; + vmx_cap.set_val_from_msr_val(cap); + + // 不支持ept但是读取到了值 + if !cpu_based_2nd_exec_control.contains(SecondaryControls::ENABLE_EPT) + && !vmx_cap.ept.is_empty() + { + kwarn!("EPT CAP should not exist if not support. 1-setting enable EPT VM-execution control"); + return Err(SystemError::EIO); + } + + if !cpu_based_2nd_exec_control.contains(SecondaryControls::ENABLE_VPID) + && !vmx_cap.vpid.is_empty() + { + kwarn!("VPID CAP should not exist if not support. 1-setting enable VPID VM-execution control"); + return Err(SystemError::EIO); + } + + let cpuid = CpuId::new(); + let cpu_extend_feat = cpuid + .get_extended_feature_info() + .ok_or(SystemError::ENOSYS)?; + if !cpu_extend_feat.has_sgx() { + cpu_based_2nd_exec_control.remove(SecondaryControls::ENCLS_EXITING); + } + + let cpu_based_3rd_exec_control = 0; + // if cpu_based_exec_control.contains(SecondaryControls::TERTIARY_CONTROLS) { + // // Self::adjust_vmx_controls64(VmxFeature::IPI_VIRT, IA32_CTLS3) + // todo!() + // } else { + // 0 + // }; + + let vmxexit_control = VmxFeat::adjust_exit_controls()?; + + let pin_based_exec_control = VmxFeat::adjust_pin_based_controls()?; + + // TODO: broken timer? 
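+        // (For reference, the Linux check this TODO points at clears
+        // PIN_BASED_VMX_PREEMPTION_TIMER when cpu_has_broken_vmx_preemption_timer()
+        // detects the Nehalem-era erratum where the timer counts down at the
+        // wrong rate.)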
+        // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#2676
+
+        let vmentry_control = VmxFeat::adjust_entry_controls()?;
+
+        for pair in VMCS_ENTRY_EXIT_PAIRS {
+            let n_ctrl = pair.entry;
+            let x_ctrl = pair.exit;
+
+            // if !(vmentry_control.bits() & n_ctrl.bits) == !(vmxexit_control.bits() & x_ctrl.bits) {
+            //     continue;
+            // }
+            if vmentry_control.contains(n_ctrl) == vmxexit_control.contains(x_ctrl) {
+                continue;
+            }
+
+            kwarn!(
+                "Inconsistent VM-Entry/VM-Exit pair, entry = {:?}, exit = {:?}",
+                vmentry_control & n_ctrl,
+                vmxexit_control & x_ctrl,
+            );
+
+            return Err(SystemError::EIO);
+        }
+
+        let basic = unsafe { rdmsr(IA32_VMX_BASIC) };
+        let vmx_msr_high = (basic >> 32) as u32;
+        let vmx_msr_low = basic as u32;
+
+        // 64位cpu,VMX_BASIC[48] == 0
+        if vmx_msr_high & (1 << 16) != 0 {
+            return Err(SystemError::EIO);
+        }
+
+        // 判断是否为写回(WB)
+        if (vmx_msr_high >> 18) & 15 != 6 {
+            return Err(SystemError::EIO);
+        }
+
+        let misc_msr = unsafe { rdmsr(IA32_VMX_MISC) };
+
+        vmcs_config.size = vmx_msr_high & 0x1fff;
+        vmcs_config.basic_cap = vmx_msr_high & !0x1fff;
+        vmcs_config.revision_id = vmx_msr_low;
+        vmcs_config.pin_based_exec_ctrl = pin_based_exec_control;
+        vmcs_config.cpu_based_exec_ctrl = cpu_based_exec_control;
+        vmcs_config.cpu_based_2nd_exec_ctrl = cpu_based_2nd_exec_control;
+        vmcs_config.cpu_based_3rd_exec_ctrl = cpu_based_3rd_exec_control;
+        vmcs_config.vmentry_ctrl = vmentry_control;
+        vmcs_config.vmexit_ctrl = vmxexit_control;
+        vmcs_config.misc = misc_msr;
+
+        Ok(())
+    }
+
+    fn adjust_vmx_controls(ctl_min: u32, ctl_opt: u32, msr: u32) -> Result<u32, SystemError> {
+        let mut ctl = ctl_min | ctl_opt;
+        let val = unsafe { rdmsr(msr) };
+        let low = val as u32;
+        let high = (val >> 32) as u32;
+
+        ctl &= high;
+        ctl |= low;
+
+        if ctl_min & !ctl != 0 {
+            return Err(SystemError::EIO);
+        }
+
+        return Ok(ctl);
+    }
+
+    fn adjust_vmx_controls64(ctl_opt: u32, msr: u32) -> u32 {
+        let allow = unsafe { rdmsr(msr) } as u32;
+        ctl_opt & allow
+    }
+
+    pub fn alloc_vpid(&self) -> Option<usize> {
+        if !self.enable_vpid {
+            return None;
+        }
+
+        let mut bitmap_guard = self.vpid_bitmap.lock();
+
+        // vpid 0 means "VPID disabled" and is reserved for the host,
+        // so it must never be handed to a guest.
+        bitmap_guard.set(0, true);
+
+        let idx = bitmap_guard.first_false_index();
+        if let Some(idx) = idx {
+            bitmap_guard.set(idx, true);
+        }
+
+        return idx;
+    }
+
+    pub fn free_vpid(&self, vpid: Option<usize>) {
+        if !self.enable_vpid || vpid.is_none() {
+            return;
+        }
+
+        self.vpid_bitmap.lock().set(vpid.unwrap(), false);
+    }
+
+    pub fn is_valid_passthrough_msr(msr: u32) -> bool {
+        match msr {
+            0x800..=0x8ff => {
+                // x2Apic msr寄存器
+                return true;
+            }
+            MSR_IA32_RTIT_STATUS
+            | MSR_IA32_RTIT_OUTPUT_BASE
+            | MSR_IA32_RTIT_OUTPUT_MASK_PTRS
+            | MSR_IA32_CR3_MATCH
+            | MSR_LBR_SELECT
+            | MSR_LASTBRANCH_TOS => {
+                return true;
+            }
+            MSR_IA32_ADDR0_START..=MSR_IA32_ADDR3_END => {
+                return true;
+            }
+            0xdc0..=0xddf => {
+                // MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31
+                return true;
+            }
+            0x680..=0x69f => {
+                // MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31
+                return true;
+            }
+            0x6c0..=0x6df => {
+                // MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31
+                return true;
+            }
+            0x40..=0x48 => {
+                // MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8
+                return true;
+            }
+            0x60..=0x68 => {
+                // MSR_LBR_CORE_TO ... 
MSR_LBR_CORE_TO + 8 + return true; + } + _ => { + return Self::possible_passthrough_msr_slot(msr).is_some(); + } + } + } + + pub fn possible_passthrough_msr_slot(msr: u32) -> Option { + for (idx, val) in Self::VMX_POSSIBLE_PASSTHROUGH_MSRS.iter().enumerate() { + if *val == msr { + return Some(idx); + } + } + + return None; + } + + pub fn tdp_enabled(&self) -> bool { + self.enable_ept + } + + fn setup_l1d_flush(&self) { + // TODO:先这样写 + *L1TF_VMX_MITIGATION.write() = VmxL1dFlushState::FlushNotRequired; + } +} + +struct VmcsEntryExitPair { + entry: EntryControls, + exit: ExitControls, +} + +impl VmcsEntryExitPair { + pub const fn new(entry: EntryControls, exit: ExitControls) -> Self { + Self { entry, exit } + } +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct VmxUretMsr { + load_into_hardware: bool, + data: u64, + mask: u64, +} + +#[derive(Debug)] +pub struct VmxVCpuPriv { + vpid: Option, + vmcs01: Arc, + loaded_vmcs: Arc, + guest_uret_msrs: [VmxUretMsr; KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS], + + shadow_msr_intercept_read: AllocBitmap, + shadow_msr_intercept_write: AllocBitmap, +} + +impl VmxVCpuPriv { + /// 参考:https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#7452 + pub fn init(vcpu: &mut VirtCpu, vm: &Vm) { + let vmcs = LockedLoadedVmcs::new(); + let mut vmx = Self { + vpid: None, + vmcs01: vmcs.clone(), + loaded_vmcs: vmcs, + guest_uret_msrs: [VmxUretMsr::default(); KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS], + shadow_msr_intercept_read: AllocBitmap::new(16), + shadow_msr_intercept_write: AllocBitmap::new(16), + }; + + vmx.vpid = vmx_info().alloc_vpid(); + + for i in 0..x86_kvm_manager().kvm_uret_msrs_list.len() { + vmx.guest_uret_msrs[i].mask = u64::MAX; + } + + if CpuId::new().get_extended_feature_info().unwrap().has_rtm() { + let tsx_ctrl = vmx.find_uret_msr_mut(MSR_IA32_TSX_CTRL); + if let Some(tsx_ctrl) = tsx_ctrl { + // Disable TSX enumeration + tsx_ctrl.mask = !(1 << 1); + } + } + + vmx.shadow_msr_intercept_read.set_all(true); + vmx.shadow_msr_intercept_write.set_all(true); + + let arch = &vm.arch; + + vmx.disable_intercept_for_msr(arch, IA32_TIME_STAMP_COUNTER, MsrType::READ); + vmx.disable_intercept_for_msr(arch, IA32_FS_BASE, MsrType::RW); + vmx.disable_intercept_for_msr(arch, IA32_GS_BASE, MsrType::RW); + vmx.disable_intercept_for_msr(arch, IA32_KERNEL_GSBASE, MsrType::RW); + + vmx.disable_intercept_for_msr(arch, IA32_SYSENTER_CS, MsrType::RW); + vmx.disable_intercept_for_msr(arch, IA32_SYSENTER_ESP, MsrType::RW); + vmx.disable_intercept_for_msr(arch, IA32_SYSENTER_EIP, MsrType::RW); + + if arch.pause_in_guest { + vmx.disable_intercept_for_msr(arch, MSR_CORE_C1_RESIDENCY, MsrType::READ); + vmx.disable_intercept_for_msr(arch, MSR_CORE_C3_RESIDENCY, MsrType::READ); + vmx.disable_intercept_for_msr(arch, MSR_CORE_C6_RESIDENCY, MsrType::READ); + vmx.disable_intercept_for_msr(arch, MSR_CORE_C7_RESIDENCY, MsrType::READ); + } + + if vmx_info().enable_flexpriority && vcpu.arch.lapic_in_kernel() { + todo!() + } + + if vmx_info().enable_ept && !vmx_info().enable_unrestricted_guest { + todo!() + } + + if vcpu.arch.lapic_in_kernel() && vmx_info().enable_ipiv { + todo!() + } + + // 初始化vmx私有信息 + vcpu.private = Some(vmx); + } + + pub fn find_uret_msr(&self, msr: u32) -> Option<&VmxUretMsr> { + let idx = x86_kvm_manager().find_user_return_msr_idx(msr); + if let Some(index) = idx { + return Some(&self.guest_uret_msrs[index]); + } else { + return None; + } + } + + pub fn find_uret_msr_mut(&mut self, msr: u32) -> Option<&mut VmxUretMsr> { + let idx = 
x86_kvm_manager().find_user_return_msr_idx(msr); + if let Some(index) = idx { + return Some(&mut self.guest_uret_msrs[index]); + } else { + return None; + } + } + + /// ## 禁用对特定的 MSR 的拦截 + fn disable_intercept_for_msr(&mut self, arch: &KvmArch, msr: u32, mut msr_type: MsrType) { + if !vmx_info().has_msr_bitmap() { + return; + } + + let msr_bitmap = &mut self.vmcs01.lock().msr_bitmap; + + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#3974 + // 嵌套vmx处理 + + if Vmx::is_valid_passthrough_msr(msr) { + if let Some(idx) = Vmx::possible_passthrough_msr_slot(msr) { + if msr_type.contains(MsrType::READ) { + self.shadow_msr_intercept_read.set(idx, false); + } + if msr_type.contains(MsrType::WRITE) { + self.shadow_msr_intercept_write.set(idx, false); + } + } + } + + if msr_type.contains(MsrType::READ) + && !arch.msr_allowed(msr, MsrFilterType::KVM_MSR_FILTER_READ) + { + msr_bitmap.ctl(msr, VmxMsrBitmapAction::Set, VmxMsrBitmapAccess::Read); + msr_type.remove(MsrType::READ); + } + + if msr_type.contains(MsrType::WRITE) + && !arch.msr_allowed(msr, MsrFilterType::KVM_MSR_FILTER_WRITE) + { + msr_bitmap.ctl(msr, VmxMsrBitmapAction::Set, VmxMsrBitmapAccess::Write); + msr_type.remove(MsrType::WRITE); + } + + if msr_type.contains(MsrType::READ) { + msr_bitmap.ctl(msr, VmxMsrBitmapAction::Clear, VmxMsrBitmapAccess::Read); + } + + if msr_type.contains(MsrType::WRITE) { + msr_bitmap.ctl(msr, VmxMsrBitmapAction::Clear, VmxMsrBitmapAccess::Write); + } + } +} + +bitflags! { + pub struct MsrType: u8 { + const READ = 1; + const WRITE = 2; + const RW = 3; + } +} + +#[derive(Debug, PartialEq)] +pub enum VmxL1dFlushState { + FlushAuto, + FlushNever, + FlushCond, + FlushAlways, + FlushEptDisabled, + FlushNotRequired, } -pub fn vmx_init() {} +pub static L1TF_VMX_MITIGATION: RwLock = RwLock::new(VmxL1dFlushState::FlushAuto); + +pub fn vmx_init() -> Result<(), SystemError> { + let cpuid = CpuId::new(); + let cpu_feat = cpuid.get_feature_info().ok_or(SystemError::ENOSYS)?; + if !cpu_feat.has_vmx() { + return Err(SystemError::ENOSYS); + } + + init_kvm_arch(); + + x86_kvm_manager_mut().vendor_init(&VmxKvmInitFunc)?; + + vmx_info().setup_l1d_flush(); + + kvm_init()?; + Ok(()) +} diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs new file mode 100644 index 000000000..cd16f6c6a --- /dev/null +++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs @@ -0,0 +1,158 @@ +use system_error::SystemError; +use x86::{ + msr::{ + IA32_VMX_ENTRY_CTLS, IA32_VMX_EXIT_CTLS, IA32_VMX_PINBASED_CTLS, IA32_VMX_PROCBASED_CTLS, + IA32_VMX_PROCBASED_CTLS2, + }, + vmx::vmcs::control::{ + EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls, + }, +}; + +use crate::arch::vm::vmx::Vmx; + +pub struct VmxFeat; + +impl VmxFeat { + pub const KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL: u32 = PrimaryControls::HLT_EXITING.bits() + | PrimaryControls::CR3_LOAD_EXITING.bits() + | PrimaryControls::CR3_STORE_EXITING.bits() + | PrimaryControls::UNCOND_IO_EXITING.bits() + | PrimaryControls::MOV_DR_EXITING.bits() + | PrimaryControls::USE_TSC_OFFSETTING.bits() + | PrimaryControls::MWAIT_EXITING.bits() + | PrimaryControls::MONITOR_EXITING.bits() + | PrimaryControls::INVLPG_EXITING.bits() + | PrimaryControls::RDPMC_EXITING.bits() + | PrimaryControls::INTERRUPT_WINDOW_EXITING.bits() + | PrimaryControls::CR8_LOAD_EXITING.bits() + | PrimaryControls::CR8_STORE_EXITING.bits(); + + pub const KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL: u32 = PrimaryControls::RDTSC_EXITING + 
.bits() + | PrimaryControls::USE_TPR_SHADOW.bits() + | PrimaryControls::USE_IO_BITMAPS.bits() + | PrimaryControls::MONITOR_TRAP_FLAG.bits() + | PrimaryControls::USE_MSR_BITMAPS.bits() + | PrimaryControls::NMI_WINDOW_EXITING.bits() + | PrimaryControls::PAUSE_EXITING.bits() + | PrimaryControls::SECONDARY_CONTROLS.bits(); + + pub const KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL: u32 = 0; + + pub const KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL: u32 = SecondaryControls::VIRTUALIZE_APIC + .bits() + | SecondaryControls::VIRTUALIZE_X2APIC.bits() + | SecondaryControls::WBINVD_EXITING.bits() + | SecondaryControls::ENABLE_VPID.bits() + | SecondaryControls::ENABLE_EPT.bits() + | SecondaryControls::UNRESTRICTED_GUEST.bits() + | SecondaryControls::PAUSE_LOOP_EXITING.bits() + | SecondaryControls::DTABLE_EXITING.bits() + | SecondaryControls::ENABLE_RDTSCP.bits() + | SecondaryControls::ENABLE_INVPCID.bits() + | SecondaryControls::VIRTUALIZE_APIC_REGISTER.bits() + | SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY.bits() + | SecondaryControls::VMCS_SHADOWING.bits() + | SecondaryControls::ENABLE_XSAVES_XRSTORS.bits() + | SecondaryControls::RDSEED_EXITING.bits() + | SecondaryControls::RDRAND_EXITING.bits() + | SecondaryControls::ENABLE_PML.bits() + | SecondaryControls::USE_TSC_SCALING.bits() + | SecondaryControls::ENABLE_USER_WAIT_PAUSE.bits() + | SecondaryControls::INTEL_PT_GUEST_PHYSICAL.bits() + | SecondaryControls::CONCEAL_VMX_FROM_PT.bits() + | SecondaryControls::ENABLE_VM_FUNCTIONS.bits() + | SecondaryControls::ENCLS_EXITING.bits(); + // | SecondaryControls::BUS_LOCK_DETECTION.bits() + // | SecondaryControls::NOTIFY_VM_EXITING.bits() + + pub const KVM_REQUIRED_VMX_VM_EXIT_CONTROLS: u32 = ExitControls::SAVE_DEBUG_CONTROLS.bits() + | ExitControls::ACK_INTERRUPT_ON_EXIT.bits() + | ExitControls::HOST_ADDRESS_SPACE_SIZE.bits(); + + pub const KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS: u32 = ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL + .bits() + | ExitControls::SAVE_IA32_PAT.bits() + | ExitControls::LOAD_IA32_PAT.bits() + | ExitControls::SAVE_IA32_EFER.bits() + | ExitControls::SAVE_VMX_PREEMPTION_TIMER.bits() + | ExitControls::LOAD_IA32_EFER.bits() + | ExitControls::CLEAR_IA32_BNDCFGS.bits() + | ExitControls::CONCEAL_VMX_FROM_PT.bits() + | ExitControls::CLEAR_IA32_RTIT_CTL.bits(); + + pub const KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 = + PinbasedControls::EXTERNAL_INTERRUPT_EXITING.bits() | PinbasedControls::NMI_EXITING.bits(); + + pub const KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 = PinbasedControls::VIRTUAL_NMIS + .bits() + | PinbasedControls::POSTED_INTERRUPTS.bits() + | PinbasedControls::VMX_PREEMPTION_TIMER.bits(); + + pub const KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS: u32 = + EntryControls::LOAD_DEBUG_CONTROLS.bits() | EntryControls::IA32E_MODE_GUEST.bits(); + + pub const KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS: u32 = EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL + .bits() + | EntryControls::LOAD_IA32_PAT.bits() + | EntryControls::LOAD_IA32_EFER.bits() + | EntryControls::LOAD_IA32_BNDCFGS.bits() + | EntryControls::CONCEAL_VMX_FROM_PT.bits() + | EntryControls::LOAD_IA32_RTIT_CTL.bits(); + + /* VMX_BASIC bits and bitmasks */ + pub const VMX_BASIC_VMCS_SIZE_SHIFT: u64 = 32; + pub const VMX_BASIC_TRUE_CTLS: u64 = 1 << 55; + pub const VMX_BASIC_64: u64 = 0x0001000000000000; + pub const VMX_BASIC_MEM_TYPE_SHIFT: u64 = 50; + pub const VMX_BASIC_MEM_TYPE_MASK: u64 = 0x003c000000000000; + pub const VMX_BASIC_MEM_TYPE_WB: u64 = 6; + pub const VMX_BASIC_INOUT: u64 = 0x0040000000000000; + + pub fn adjust_primary_controls() -> 
Result<PrimaryControls, SystemError> {
+        Ok(PrimaryControls::from_bits_truncate(
+            Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
+                Self::KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
+                IA32_VMX_PROCBASED_CTLS,
+            )?,
+        ))
+    }
+
+    pub fn adjust_secondary_controls() -> Result<SecondaryControls, SystemError> {
+        Ok(SecondaryControls::from_bits_truncate(
+            Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
+                Self::KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
+                IA32_VMX_PROCBASED_CTLS2,
+            )?,
+        ))
+    }
+
+    pub fn adjust_exit_controls() -> Result<ExitControls, SystemError> {
+        Ok(ExitControls::from_bits_truncate(Vmx::adjust_vmx_controls(
+            Self::KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
+            Self::KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
+            IA32_VMX_EXIT_CTLS,
+        )?))
+    }
+
+    pub fn adjust_entry_controls() -> Result<EntryControls, SystemError> {
+        Ok(EntryControls::from_bits_truncate(Vmx::adjust_vmx_controls(
+            Self::KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
+            Self::KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
+            IA32_VMX_ENTRY_CTLS,
+        )?))
+    }
+
+    pub fn adjust_pin_based_controls() -> Result<PinbasedControls, SystemError> {
+        Ok(PinbasedControls::from_bits_truncate(
+            Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
+                Self::KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
+                IA32_VMX_PINBASED_CTLS,
+            )?,
+        ))
+    }
+}
diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs
new file mode 100644
index 000000000..c3318f209
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs
@@ -0,0 +1,265 @@
+use alloc::{boxed::Box, collections::LinkedList, sync::Arc, vec::Vec};
+use bitmap::{traits::BitMapOps, AllocBitmap};
+use system_error::SystemError;
+
+use crate::{
+    arch::{vm::asm::VmxAsm, MMArch},
+    kdebug,
+    libs::spinlock::{SpinLock, SpinLockGuard},
+    mm::{percpu::PerCpuVar, virt_2_phys, MemoryManagementArch, PhysAddr},
+    smp::cpu::ProcessorId,
+};
+
+use super::vmx_info;
+
+pub mod feat;
+
+pub static mut PERCPU_VMCS: Option<PerCpuVar<Option<Arc<LockedVMControlStructure>>>> = None;
+pub static mut PERCPU_LOADED_VMCS_LIST: Option<PerCpuVar<LinkedList<Arc<LockedLoadedVmcs>>>> = None;
+pub static mut VMXAREA: Option<PerCpuVar<Box<VMControlStructure>>> = None;
+
+pub fn current_vmcs() -> &'static Option<Arc<LockedVMControlStructure>> {
+    unsafe { PERCPU_VMCS.as_ref().unwrap().get() }
+}
+
+pub fn current_vmcs_mut() -> &'static mut Option<Arc<LockedVMControlStructure>> {
+    unsafe { PERCPU_VMCS.as_ref().unwrap().get_mut() }
+}
+
+pub fn current_loaded_vmcs_list_mut() -> &'static mut LinkedList<Arc<LockedLoadedVmcs>> {
+    unsafe { PERCPU_LOADED_VMCS_LIST.as_ref().unwrap().get_mut() }
+}
+
+pub fn current_loaded_vmcs_list() -> &'static LinkedList<Arc<LockedLoadedVmcs>> {
+    unsafe { PERCPU_LOADED_VMCS_LIST.as_ref().unwrap().get() }
+}
+
+pub fn vmx_area() -> &'static PerCpuVar<Box<VMControlStructure>> {
+    unsafe { VMXAREA.as_ref().unwrap() }
+}
+
+#[repr(C, align(4096))]
+#[derive(Debug, Clone)]
+pub struct VMControlStructure {
+    pub header: u32,
+    pub abort: u32,
+    pub data: [u8; MMArch::PAGE_SIZE - core::mem::size_of::<u32>() - core::mem::size_of::<u32>()],
+}
+
+impl VMControlStructure {
+    pub fn new() -> Box<Self> {
+        let mut vmcs: Box<Self> = unsafe {
+            Box::try_new_zeroed()
+                .expect("alloc vmcs failed")
+                .assume_init()
+        };
+
+        vmcs.set_revision_id(vmx_info().vmcs_config.revision_id);
+        vmcs
+    }
+
+    pub fn revision_id(&self) -> u32 {
+        self.header & 0x7FFF_FFFF
+    }
+
+    pub fn is_shadow_vmcs(&self) -> bool {
+        self.header & 0x8000_0000 != 0
+    }
+
+    pub fn set_shadow_vmcs(&mut self, shadow: bool) {
+        self.header = (self.header & 0x7FFF_FFFF) | ((shadow as u32) << 31);
+    }
+
+    pub fn set_revision_id(&mut self, id: u32) {
+        self.header = self.header & 0x8000_0000 | (id & 0x7FFF_FFFF);
+    }
+}
+
+#[derive(Debug)]
+pub struct LockedVMControlStructure {
+    /// Physical address of the inner VMCS page
+    phys_addr: PhysAddr,
+    inner: SpinLock<Box<VMControlStructure>>,
+}
+
+impl LockedVMControlStructure {
+    #[inline(never)]
+    pub fn new(shadow: bool) -> Arc<Self> {
+        let mut vmcs = VMControlStructure::new();
+
+        let phys_addr = PhysAddr::new(virt_2_phys(vmcs.as_ref() as *const _ as usize));
+
+        vmcs.set_shadow_vmcs(shadow);
+
+        Arc::new(Self {
+            phys_addr,
+            inner: SpinLock::new(vmcs),
+        })
+    }
+
+    pub fn lock(&self) -> SpinLockGuard<Box<VMControlStructure>> {
+        self.inner.lock()
+    }
+
+    pub fn phys_addr(&self) -> PhysAddr {
+        self.phys_addr
+    }
+}
+
+#[derive(Debug, Default)]
+pub struct VmcsHostState {
+    pub cr3: usize,
+    pub cr4: usize,
+    pub gs_base: usize,
+    pub fs_base: usize,
+    pub rsp: usize,
+    pub fs_sel: u16,
+    pub gs_sel: u16,
+    pub ldt_sel: u16,
+    pub ds_sel: u16,
+    pub rs_sel: u16,
+}
+
+#[derive(Debug, Default)]
+pub struct VmcsControlsShadow {
+    vm_entry: u32,
+    vm_exit: u32,
+    pin: u32,
+    exec: u32,
+    secondary_exec: u32,
+    tertiary_exec: u32,
+}
+
+#[derive(Debug)]
+pub struct LoadedVmcs {
+    pub vmcs: Arc<LockedVMControlStructure>,
+    pub shadow_vmcs: Option<Arc<LockedVMControlStructure>>,
+    pub cpu: ProcessorId,
+    /// Whether VMLAUNCH has already been executed on this VMCS
+    pub launched: bool,
+    /// Whether NMIs are known to be unmasked
+    nmi_known_unmasked: bool,
+    /// Whether the hypervisor timer is soft-disabled
+    hv_timer_soft_disabled: bool,
+    /// For CPUs without virtual-NMI support: whether VNMIs are soft-blocked
+    soft_vnmi_blocked: bool,
+    /// VM entry time
+    entry_time: u64,
+    /// How long VNMIs have been blocked
+    vnmi_blocked_time: u64,
+    /// MSR bitmap
+    pub msr_bitmap: VmxMsrBitmap,
+    /// Saved VMCS host state
+    host_state: VmcsHostState,
+    /// Shadow copies of the VMCS control fields
+    controls_shadow: VmcsControlsShadow,
+}
+
+#[derive(Debug)]
+pub struct LockedLoadedVmcs {
+    inner: SpinLock<LoadedVmcs>,
+}
+
+impl LockedLoadedVmcs {
+    pub fn new() -> Arc<Self> {
+        let bitmap = if vmx_info().has_msr_bitmap() {
+            VmxMsrBitmap::new(true, MMArch::PAGE_SIZE * u8::BITS as usize)
+        } else {
+            VmxMsrBitmap::new(true, 0)
+        };
+        let vmcs = LockedVMControlStructure::new(false);
+
+        VmxAsm::vmclear(vmcs.phys_addr);
+
+        Arc::new(Self {
+            inner: SpinLock::new(LoadedVmcs {
+                vmcs,
+                shadow_vmcs: None,
+                cpu: ProcessorId::INVALID,
+                launched: false,
+                hv_timer_soft_disabled: false,
+                msr_bitmap: bitmap,
+                host_state: VmcsHostState::default(),
+                controls_shadow: VmcsControlsShadow::default(),
+                nmi_known_unmasked: false,
+                soft_vnmi_blocked: false,
+                entry_time: 0,
+                vnmi_blocked_time: 0,
+            }),
+        })
+    }
+
+    pub fn lock(&self) -> SpinLockGuard<LoadedVmcs> {
+        self.inner.lock()
+    }
+}
+
+#[derive(Debug)]
+pub struct VmxMsrBitmap {
+    data: AllocBitmap,
+}
+
+pub enum VmxMsrBitmapAction {
+    Test,
+    Set,
+    Clear,
+}
+
+pub enum VmxMsrBitmapAccess {
+    Write,
+    Read,
+}
+
+impl VmxMsrBitmapAccess {
+    pub const fn base(&self) -> usize {
+        // Bit offset of the read/write half inside the 4 KiB bitmap page:
+        // reads start at byte 0x000, writes at byte 0x800.
+        match self {
+            VmxMsrBitmapAccess::Write => 0x800 * u8::BITS as usize,
+            VmxMsrBitmapAccess::Read => 0,
+        }
+    }
+}
+
+impl VmxMsrBitmap {
+    pub fn new(init_val: bool, size: usize) -> Self {
+        let mut data = AllocBitmap::new(size);
+        data.set_all(init_val);
+        Self { data }
+    }
+
+    pub fn ctl(
+        &mut self,
+        msr: u32,
+        action: VmxMsrBitmapAction,
+        access: VmxMsrBitmapAccess,
+    ) -> bool {
+        if msr <= 0x1fff {
+            return self.bit_op(msr as usize, access.base(), action);
+        } else if msr >= 0xc0000000 && msr <= 0xc0001fff {
+            // The high MSR range lives in the second 1 KiB (byte offset 0x400)
+            // of each half and is indexed by the low 13 bits of the MSR.
+            let offset = 0x400 * u8::BITS as usize;
+            return self.bit_op((msr & 0x1fff) as usize, access.base() + offset, action);
+        } else {
+            // MSRs outside both ranges have no bitmap entry and are always
+            // intercepted.
+            return true;
+        }
+    }
+
+    fn bit_op(&mut self, msr: usize, base: usize, action: VmxMsrBitmapAction) -> bool {
+        match action {
+            VmxMsrBitmapAction::Test => self.data.get(msr + base).unwrap_or(false),
+            VmxMsrBitmapAction::Set => {
+                self.data.set(msr + base, true);
+                true
+            }
+            VmxMsrBitmapAction::Clear => {
+                self.data.set(msr + base, false);
+                true
+            }
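+            // `base` selects the read (byte 0x000) or write (byte 0x800) half
+            // of the 4 KiB bitmap page, expressed as a bit offset; `ctl` adds
+            // a further 0x400-byte offset for the high MSR range, matching the
+            // bitmap layout in the Intel manual (MSR-bitmap address).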
+ } + } +} diff --git a/kernel/src/init/init.rs b/kernel/src/init/init.rs index c1f2f602f..6dfb1d5fe 100644 --- a/kernel/src/init/init.rs +++ b/kernel/src/init/init.rs @@ -2,6 +2,7 @@ use crate::{ arch::{ init::{early_setup_arch, setup_arch, setup_arch_post}, time::time_init, + vm::vmx::vmx_init, CurrentIrqArch, CurrentSMPArch, CurrentSchedArch, }, driver::{base::init::driver_init, serial::serial_early_init, video::VideoRefreshManager}, @@ -80,8 +81,10 @@ fn do_start_kernel() { setup_arch_post().expect("setup_arch_post failed"); + // #[cfg(all(target_arch = "x86_64", feature = "kvm"))] + // crate::virt::kvm::kvm_init(); #[cfg(all(target_arch = "x86_64", feature = "kvm"))] - crate::virt::kvm::kvm_init(); + vmx_init().unwrap(); } /// 在内存管理初始化之前,执行的初始化 diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 71a7a4ff5..81cf7b78d 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -22,6 +22,8 @@ #![feature(trait_upcasting)] #![feature(slice_ptr_get)] #![feature(vec_into_raw_parts)] +// match语句中能够使用范围 +#![feature(exclusive_range_pattern)] #![cfg_attr(target_os = "none", no_std)] // clippy的配置 #![deny(clippy::all)] diff --git a/kernel/src/libs/rbtree.rs b/kernel/src/libs/rbtree.rs index aed58744f..cddc8d7e4 100644 --- a/kernel/src/libs/rbtree.rs +++ b/kernel/src/libs/rbtree.rs @@ -831,6 +831,15 @@ impl IntoIterator for RBTree { } } +impl Default for RBTree { + fn default() -> Self { + RBTree { + root: NodePtr::null(), + len: 0, + } + } +} + impl RBTree { /// Creates an empty `RBTree`. pub fn new() -> RBTree { diff --git a/kernel/src/mm/mod.rs b/kernel/src/mm/mod.rs index c2dae1705..3b95f7c2b 100644 --- a/kernel/src/mm/mod.rs +++ b/kernel/src/mm/mod.rs @@ -246,7 +246,7 @@ impl core::ops::SubAssign for PhysAddr { } /// 虚拟内存地址 -#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)] +#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Default)] #[repr(transparent)] pub struct VirtAddr(usize); diff --git a/kernel/src/virt/mod.rs b/kernel/src/virt/mod.rs index 937d3d510..0f2205dd6 100644 --- a/kernel/src/virt/mod.rs +++ b/kernel/src/virt/mod.rs @@ -1 +1,2 @@ pub mod kvm; +pub mod vm; diff --git a/kernel/src/virt/vm/kvm_dev.rs b/kernel/src/virt/vm/kvm_dev.rs new file mode 100644 index 000000000..d42c319f6 --- /dev/null +++ b/kernel/src/virt/vm/kvm_dev.rs @@ -0,0 +1,428 @@ +use core::{ + intrinsics::unlikely, + sync::atomic::{AtomicUsize, Ordering}, +}; + +use alloc::sync::{Arc, Weak}; +use system_error::SystemError; + +use crate::{ + arch::{ + vm::{kvm_host::KvmCommonRegs, x86_kvm_manager}, + MMArch, + }, + driver::base::device::device_number::DeviceNumber, + filesystem::{ + devfs::{devfs_register, DevFS, DeviceINode}, + vfs::{ + core::generate_inode_id, + file::{File, FileMode}, + syscall::ModeType, + FileType, IndexNode, Metadata, + }, + }, + libs::spinlock::SpinLock, + mm::MemoryManagementArch, + process::ProcessManager, + syscall::user_access::{UserBufferReader, UserBufferWriter}, + time::PosixTimeSpec, + virt::vm::{ + kvm_host::check_stack_usage, + user_api::{KvmUserspaceMemoryRegion, PosixKvmUserspaceMemoryRegion}, + }, +}; + +use super::kvm_host::{ + vcpu::{LockedVirtCpu, VirtCpu}, + LockedVm, Vm, +}; + +#[derive(Debug)] +pub struct KvmInode { + /// 指向自身的弱引用 + self_ref: Weak, + /// 指向inode所在的文件系统对象的指针 + fs: Weak, + /// INode 元数据 + metadata: Metadata, +} + +#[derive(Debug)] +pub struct LockedKvmInode { + inner: SpinLock, +} + +impl LockedKvmInode { + const KVM_CREATE_VM: u32 = 0xAE01; + const KVM_GET_VCPU_MMAP_SIZE: u32 = 0xAE04; + + pub fn new() -> Arc { + let inode = KvmInode 
{ + self_ref: Weak::default(), + fs: Weak::default(), + metadata: Metadata { + dev_id: 1, + inode_id: generate_inode_id(), + size: 0, + blk_size: 0, + blocks: 0, + atime: PosixTimeSpec::default(), + mtime: PosixTimeSpec::default(), + ctime: PosixTimeSpec::default(), + file_type: FileType::KvmDevice, // 文件夹,block设备,char设备 + mode: ModeType::S_IALLUGO, + nlinks: 1, + uid: 0, + gid: 0, + raw_dev: DeviceNumber::default(), // 这里用来作为device number + }, + }; + + let result = Arc::new(LockedKvmInode { + inner: SpinLock::new(inode), + }); + result.inner.lock().self_ref = Arc::downgrade(&result); + + return result; + } + + fn create_vm(&self, vm_type: usize) -> Result { + let kvm = LockedVm::create(vm_type)?; + + let instance = KvmInstance::new(kvm); + + let current = ProcessManager::current_pcb(); + + let file = File::new(instance, FileMode::O_RDWR)?; + let fd = current.fd_table().write().alloc_fd(file, None)?; + return Ok(fd as usize); + } +} + +impl DeviceINode for LockedKvmInode { + fn set_fs(&self, fs: Weak) { + self.inner.lock().fs = fs; + } +} + +impl IndexNode for LockedKvmInode { + fn open( + &self, + _data: crate::libs::spinlock::SpinLockGuard, + _mode: &FileMode, + ) -> Result<(), SystemError> { + Ok(()) + } + fn read_at( + &self, + _offset: usize, + _len: usize, + _buf: &mut [u8], + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result { + Err(SystemError::ENOSYS) + } + + fn write_at( + &self, + _offset: usize, + _len: usize, + _buf: &[u8], + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result { + Err(SystemError::ENOSYS) + } + + fn fs(&self) -> Arc { + self.inner.lock().fs.upgrade().unwrap() + } + + fn as_any_ref(&self) -> &dyn core::any::Any { + self + } + + fn list(&self) -> Result, system_error::SystemError> { + Err(SystemError::ENOSYS) + } + + fn metadata(&self) -> Result { + Ok(self.inner.lock().metadata.clone()) + } + + fn ioctl( + &self, + cmd: u32, + arg: usize, + _private_data: &crate::filesystem::vfs::FilePrivateData, + ) -> Result { + match cmd { + Self::KVM_CREATE_VM => { + let ret = self.create_vm(arg); + kwarn!("[KVM]: KVM_CREATE_VM {ret:?}"); + + return ret; + } + + Self::KVM_GET_VCPU_MMAP_SIZE => { + if arg != 0 { + return Err(SystemError::EINVAL); + } + kdebug!("[KVM] KVM_GET_VCPU_MMAP_SIZE"); + return Ok(MMArch::PAGE_SIZE); + } + + _ => { + // TODO: arch_ioctl + kwarn!("[KVM]: unknown iooctl cmd {cmd:x}"); + } + } + + Ok(0) + } + + fn close( + &self, + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result<(), SystemError> { + Ok(()) + } +} + +#[derive(Debug)] +pub struct KvmInstance { + kvm: Arc, + metadata: Metadata, +} + +impl KvmInstance { + const KVM_CREATE_VCPU: u32 = 0xAE41; + const KVM_SET_USER_MEMORY_REGION: u32 = 0xAE46; + + pub fn new(vm: Arc) -> Arc { + Arc::new(Self { + kvm: vm, + metadata: Metadata { + dev_id: 1, + inode_id: generate_inode_id(), + size: 0, + blk_size: 0, + blocks: 0, + atime: PosixTimeSpec::default(), + mtime: PosixTimeSpec::default(), + ctime: PosixTimeSpec::default(), + file_type: FileType::KvmDevice, + mode: ModeType::S_IALLUGO, + nlinks: 1, + uid: 0, + gid: 0, + raw_dev: DeviceNumber::default(), // 这里用来作为device number + }, + }) + } +} + +impl IndexNode for KvmInstance { + fn open( + &self, + _data: crate::libs::spinlock::SpinLockGuard, + _mode: &crate::filesystem::vfs::file::FileMode, + ) -> Result<(), SystemError> { + Ok(()) + } + + #[inline(never)] + fn ioctl( + &self, + cmd: u32, + arg: usize, + _private_data: &crate::filesystem::vfs::FilePrivateData, + ) -> Result { + kdebug!("ioctl"); + check_stack_usage(); + match 
cmd { + Self::KVM_CREATE_VCPU => { + let ret = self.kvm.lock().create_vcpu(arg); + kwarn!("!!!###$$"); + return ret; + } + + Self::KVM_SET_USER_MEMORY_REGION => { + kdebug!("[KVM-INSTANCE] KVM_SET_USER_MEMORY_REGION"); + let user_reader = UserBufferReader::new( + arg as *const PosixKvmUserspaceMemoryRegion, + core::mem::size_of::(), + true, + )?; + + let region = user_reader.read_one_from_user::(0)?; + + self.kvm + .lock() + .set_memory_region(KvmUserspaceMemoryRegion::from_posix(region)?)?; + + return Ok(0); + } + + _ => { + // arch_ioctl + } + } + + todo!() + } + + fn read_at( + &self, + offset: usize, + len: usize, + buf: &mut [u8], + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result { + todo!() + } + + fn write_at( + &self, + offset: usize, + len: usize, + buf: &[u8], + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result { + todo!() + } + + fn fs(&self) -> Arc { + todo!() + } + + fn as_any_ref(&self) -> &dyn core::any::Any { + todo!() + } + + fn list(&self) -> Result, SystemError> { + todo!() + } + + fn metadata(&self) -> Result { + Ok(self.metadata.clone()) + } + + fn close( + &self, + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result<(), SystemError> { + Ok(()) + } +} + +#[derive(Debug)] +pub struct KvmVcpuDev { + vcpu: Arc, +} + +impl KvmVcpuDev { + const KVM_RUN: u32 = 0xAE80; + const KVM_GET_REGS: u32 = 0xAE81; + const KVM_SET_REGS: u32 = 0xAE82; + const KVM_GET_SREGS: u32 = 0xAE83; + const KVM_SET_SREGS: u32 = 0xAE84; + + pub fn new(vcpu: Arc) -> Arc { + Arc::new(Self { vcpu }) + } +} + +impl IndexNode for KvmVcpuDev { + fn open( + &self, + _data: crate::libs::spinlock::SpinLockGuard, + _mode: &FileMode, + ) -> Result<(), SystemError> { + Ok(()) + } + + fn close( + &self, + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result<(), SystemError> { + Ok(()) + } + + fn ioctl( + &self, + cmd: u32, + arg: usize, + _private_data: &crate::filesystem::vfs::FilePrivateData, + ) -> Result { + match cmd { + Self::KVM_RUN => { + let mut vcpu = self.vcpu.lock(); + let oldpid = vcpu.pid; + if unlikely(oldpid != Some(ProcessManager::current_pid())) { + vcpu.pid = Some(ProcessManager::current_pid()); + } + + return vcpu.run(); + } + Self::KVM_GET_REGS => { + kdebug!("KVM_GET_REGS"); + let kvm_regs = self.vcpu.lock().get_regs(); + kdebug!("get regs {kvm_regs:?}"); + let mut user_writer = UserBufferWriter::new( + arg as *const KvmCommonRegs as *mut KvmCommonRegs, + core::mem::size_of::(), + true, + )?; + + user_writer.copy_one_to_user(&kvm_regs, 0)?; + return Ok(0); + } + _ => { + // arch ioctl + kwarn!("[KVM-VCPU] unknown ioctl cmd {cmd:x}"); + } + } + + Ok(0) + } + + fn read_at( + &self, + offset: usize, + len: usize, + buf: &mut [u8], + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result { + todo!() + } + + fn write_at( + &self, + offset: usize, + len: usize, + buf: &[u8], + _data: crate::libs::spinlock::SpinLockGuard, + ) -> Result { + todo!() + } + + fn fs(&self) -> Arc { + todo!() + } + + fn as_any_ref(&self) -> &dyn core::any::Any { + todo!() + } + + fn list(&self) -> Result, SystemError> { + todo!() + } +} + +pub fn kvm_init() -> Result<(), SystemError> { + let kvm_inode = LockedKvmInode::new(); + + devfs_register("kvm", kvm_inode)?; + + Ok(()) +} diff --git a/kernel/src/virt/vm/kvm_host/mem.rs b/kernel/src/virt/vm/kvm_host/mem.rs new file mode 100644 index 000000000..bf5180073 --- /dev/null +++ b/kernel/src/virt/vm/kvm_host/mem.rs @@ -0,0 +1,502 @@ +use alloc::{ + boxed::Box, + sync::{Arc, Weak}, +}; +use bitmap::AllocBitmap; +use hashbrown::HashMap; 
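+// Slot bookkeeping below mirrors the Linux KVM memslot scheme: every address
+// space owns two `KvmMemSlotSet` copies, and updates are staged on the
+// inactive copy before `swap_active_memslots` publishes it, so readers always
+// observe a complete set. `node_idx` identifies which of the two copies a set
+// is; `generation` mirrors Linux's memslot update counter.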
+use system_error::SystemError; +use x86::bits64::registers::rbp; + +use crate::{ + arch::{kvm_arch_ops, MMArch}, + libs::{ + rbtree::RBTree, + rwlock::{RwLock, RwLockReadGuard, RwLockWriteGuard}, + spinlock::{SpinLock, SpinLockGuard}, + }, + mm::{MemoryManagementArch, PhysAddr, VirtAddr}, + virt::vm::{kvm_host::KVM_ADDRESS_SPACE_NUM, user_api::KvmUserspaceMemoryRegion}, +}; + +use super::{check_stack_usage, LockedVm, Vm}; + +pub const KVM_USER_MEM_SLOTS: u16 = u16::MAX; +pub const KVM_INTERNAL_MEM_SLOTS: u16 = 3; +pub const KVM_MEM_SLOTS_NUM: u16 = KVM_USER_MEM_SLOTS - KVM_INTERNAL_MEM_SLOTS; +pub const KVM_MEM_MAX_NR_PAGES: usize = (1 << 31) - 1; + +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Default)] +pub struct AddrRange { + pub start: VirtAddr, + pub last: VirtAddr, +} + +#[derive(Debug, Default)] +pub struct KvmMemSlotSet { + /// 最后一次使用到的内存插槽 + pub last_use: Option>, + /// 存储虚拟地址(hva)和内存插槽之间的映射关系 + hva_tree: RBTree>, + /// 用于存储全局页帧号(gfn)和内存插槽之间的映射关系 + gfn_tree: RBTree>, + /// 将内存插槽的ID映射到对应的内存插槽。 + slots: HashMap>, + + pub node_idx: usize, + pub generation: u64, +} + +impl KvmMemSlotSet { + pub fn get_slot(&self, id: u16) -> Option> { + self.slots.get(&id).cloned() + } +} + +#[derive(Debug)] +pub struct LockedKvmMemSlot { + inner: RwLock, +} + +impl LockedKvmMemSlot { + pub fn new() -> Arc { + Arc::new(Self { + inner: RwLock::new(KvmMemSlot::default()), + }) + } + + #[inline] + pub fn read(&self) -> RwLockReadGuard { + self.inner.read() + } + + #[inline] + pub fn write(&self) -> RwLockWriteGuard { + self.inner.write() + } + + #[inline] + pub fn copy_from(&self, other: &Arc) { + let mut guard = self.write(); + let other = other.read(); + + guard.base_gfn = other.base_gfn; + guard.npages = other.npages; + + guard.dirty_bitmap = other.dirty_bitmap.clone(); + guard.arch = other.arch; + guard.userspace_addr = other.userspace_addr; + guard.flags = other.flags; + guard.id = other.id; + guard.as_id = other.as_id; + } +} + +#[derive(Debug, Default)] +pub struct KvmMemSlot { + /// 首个gfn + base_gfn: u64, + /// 页数量 + npages: usize, + /// 脏页位图 + dirty_bitmap: Option, + /// 架构相关 + arch: (), + userspace_addr: VirtAddr, + flags: UserMemRegionFlag, + id: u16, + as_id: u16, + + hva_node_key: [AddrRange; 2], +} + +#[derive(Debug)] +pub struct LockedVmMemSlotSet { + inner: SpinLock, +} + +impl LockedVmMemSlotSet { + pub fn new(slots: KvmMemSlotSet) -> Arc { + Arc::new(Self { + inner: SpinLock::new(slots), + }) + } + + pub fn lock(&self) -> SpinLockGuard { + self.inner.lock() + } +} + +#[derive(Debug, Default)] +pub struct GfnToHvaCache { + generation: u64, + /// 客户机对应物理地址(Guest Physical Address) + gpa: u64, + /// 主机用户空间虚拟地址(User Host Virtual Address) + uhva: Option, + /// 主机内核空间虚拟地址(Kernel Host Virtual Address) + khva: u64, + /// 对应内存插槽 + memslot: Option>, + /// 对应物理页帧号(Page Frame Number) + pfn: Option, + /// 缓存项的使用情况 + usage: PfnCacheUsage, + /// 是否处于活动状态 + active: bool, + /// 是否有效 + valid: bool, + vm: Option>, +} + +impl GfnToHvaCache { + pub fn init(vm: Weak, usage: PfnCacheUsage) -> Self { + // check_stack_usage(); + // let mut ret: Box = unsafe { Box::new_zeroed().assume_init() }; + // ret.usage = usage; + // ret.vm = Some(vm); + // *ret + Self { + usage, + vm: Some(vm), + ..Default::default() + } + } +} + +bitflags! 
{ + #[derive(Default)] + pub struct PfnCacheUsage: u8 { + const GUEST_USES_PFN = 1 << 0; + const HOST_USES_PFN = 1 << 1; + const GUEST_AND_HOST_USES_PFN = Self::GUEST_USES_PFN.bits | Self::HOST_USES_PFN.bits; + } + + pub struct UserMemRegionFlag: u32 { + /// 用来开启内存脏页 + const LOG_DIRTY_PAGES = 1 << 0; + /// 开启内存只读 + const READONLY = 1 << 1; + /// 标记invalid + const KVM_MEMSLOT_INVALID = 1 << 16; + } +} + +impl Default for UserMemRegionFlag { + fn default() -> Self { + Self::empty() + } +} + +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +pub enum KvmMemoryChangeMode { + Create, + Delete, + Move, + FlagsOnly, +} + +impl Vm { + #[inline(never)] + pub fn set_memory_region(&mut self, mem: KvmUserspaceMemoryRegion) -> Result<(), SystemError> { + if mem.slot >= u16::MAX as u32 { + return Err(SystemError::EINVAL); + } + + let as_id = mem.slot >> 16; + let id = mem.slot as u16; + + // 检查内存对齐以及32位检测(虽然现在没什么用<) + if (mem.memory_size as usize & MMArch::PAGE_SIZE != 0) + || mem.memory_size != mem.memory_size as usize as u64 + { + return Err(SystemError::EINVAL); + } + + if !mem.guest_phys_addr.check_aligned(MMArch::PAGE_SIZE) { + return Err(SystemError::EINVAL); + } + + if !mem.userspace_addr.check_aligned(MMArch::PAGE_SIZE) { + // 这里应该还需要判断从userspace_addr->userspace_addr+memory_size这段区间都是合法的 + return Err(SystemError::EINVAL); + } + + if as_id >= KVM_ADDRESS_SPACE_NUM as u32 || id >= KVM_MEM_SLOTS_NUM { + return Err(SystemError::EINVAL); + } + + if (mem.memory_size >> MMArch::PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES as u64 { + return Err(SystemError::EINVAL); + } + + let slots = self.memslot_set(as_id as usize).clone(); + + let slots_guard = slots.lock(); + let old = slots_guard.get_slot(id); + if mem.memory_size == 0 { + if let Some(old) = &old { + let old_npages = old.read().npages; + if old_npages == 0 { + return Err(SystemError::EINVAL); + } + + if self.nr_memslot_pages < old_npages { + return Err(SystemError::EIO); + } + drop(slots_guard); + return self.set_memslot(Some(&old), None, KvmMemoryChangeMode::Delete); + } else { + return Err(SystemError::EINVAL); + } + } + + let base_gfn = (mem.guest_phys_addr.data() >> MMArch::PAGE_SHIFT) as u64; + let npages = mem.memory_size >> MMArch::PAGE_SHIFT; + + let change; + if let Some(old) = &old { + let old_guard = old.read(); + if old_guard.npages == 0 { + change = KvmMemoryChangeMode::Create; + // 避免溢出 + if self.nr_memslot_pages + (npages as usize) < self.nr_memslot_pages { + return Err(SystemError::EINVAL); + } + } else { + if mem.userspace_addr != old_guard.userspace_addr + || npages != old_guard.npages as u64 + || (mem.flags ^ old_guard.flags).contains(UserMemRegionFlag::READONLY) + { + return Err(SystemError::EINVAL); + } + + if base_gfn != old_guard.base_gfn { + change = KvmMemoryChangeMode::Move; + } else if mem.flags != old_guard.flags { + change = KvmMemoryChangeMode::FlagsOnly; + } else { + return Ok(()); + } + } + } else { + change = KvmMemoryChangeMode::Create; + // 避免溢出 + if self.nr_memslot_pages + (npages as usize) < self.nr_memslot_pages { + return Err(SystemError::EINVAL); + } + }; + + if change == KvmMemoryChangeMode::Create || change == KvmMemoryChangeMode::Move { + if slots_guard.gfn_tree.contains_key(&base_gfn) { + return Err(SystemError::EEXIST); + } + } + + let new = LockedKvmMemSlot::new(); + let mut new_guard = new.write(); + + new_guard.as_id = as_id as u16; + new_guard.id = id; + new_guard.base_gfn = base_gfn; + new_guard.npages = npages as usize; + new_guard.flags = mem.flags; + new_guard.userspace_addr = mem.userspace_addr; + + 
drop(new_guard); + drop(slots_guard); + return self.set_memslot(old.as_ref(), Some(&new), change); + } + + #[inline] + /// 获取活动内存插槽 + fn memslot_set(&self, id: usize) -> &Arc { + // 避免越界 + let id = id % KVM_ADDRESS_SPACE_NUM; + &self.memslots[id] + } + + #[inline(never)] + fn set_memslot( + &mut self, + old: Option<&Arc>, + new: Option<&Arc>, + change: KvmMemoryChangeMode, + ) -> Result<(), SystemError> { + let invalid_slot = LockedKvmMemSlot::new(); + if change == KvmMemoryChangeMode::Delete || change == KvmMemoryChangeMode::Move { + self.invalidate_memslot(old.unwrap(), &invalid_slot) + } + + match self.prepare_memory_region(old, new, change) { + Ok(_) => {} + Err(e) => { + if change == KvmMemoryChangeMode::Delete || change == KvmMemoryChangeMode::Move { + self.active_memslot(Some(&invalid_slot), old) + } + return Err(e); + } + } + + match change { + KvmMemoryChangeMode::Create => self.create_memslot(new), + KvmMemoryChangeMode::Delete => self.delete_memslot(old, &invalid_slot), + KvmMemoryChangeMode::Move => self.move_memslot(old, new, &invalid_slot), + KvmMemoryChangeMode::FlagsOnly => self.update_flags_memslot(old, new), + } + + // TODO:kvm_commit_memory_region(kvm, old, new, change); + Ok(()) + } + + fn create_memslot(&mut self, new: Option<&Arc>) { + self.replace_memslot(None, new); + self.active_memslot(None, new); + } + + fn delete_memslot( + &mut self, + old: Option<&Arc>, + invalid_slot: &Arc, + ) { + self.replace_memslot(old, None); + self.active_memslot(Some(invalid_slot), None); + } + + fn move_memslot( + &mut self, + old: Option<&Arc>, + new: Option<&Arc>, + invalid_slot: &Arc, + ) { + self.replace_memslot(old, new); + self.active_memslot(Some(invalid_slot), new); + } + + fn update_flags_memslot( + &mut self, + old: Option<&Arc>, + new: Option<&Arc>, + ) { + self.replace_memslot(old, new); + self.active_memslot(old, new); + } + + fn prepare_memory_region( + &self, + old: Option<&Arc>, + new: Option<&Arc>, + change: KvmMemoryChangeMode, + ) -> Result<(), SystemError> { + if change != KvmMemoryChangeMode::Delete { + let new = new.unwrap(); + let mut new_guard = new.write(); + if !new_guard.flags.contains(UserMemRegionFlag::LOG_DIRTY_PAGES) { + new_guard.dirty_bitmap = None; + } else if old.is_some() { + let old_guard = old.unwrap().read(); + if old_guard.dirty_bitmap.is_some() { + new_guard.dirty_bitmap = old_guard.dirty_bitmap.clone(); + } else { + new_guard.dirty_bitmap = Some(AllocBitmap::new(new_guard.npages * 2)); + } + } + } + + return self.arch_prepare_memory_region(old, new, change); + } + + fn invalidate_memslot( + &mut self, + old: &Arc, + invalid_slot: &Arc, + ) { + invalid_slot.copy_from(old); + + let mut old_guard = old.write(); + let mut invalid_slot_guard = invalid_slot.write(); + invalid_slot_guard + .flags + .insert(UserMemRegionFlag::KVM_MEMSLOT_INVALID); + + self.swap_active_memslots(old_guard.as_id as usize); + + old_guard.arch = invalid_slot_guard.arch; + } + + #[inline(never)] + fn active_memslot( + &mut self, + old: Option<&Arc>, + new: Option<&Arc>, + ) { + let as_id = if let Some(slot) = old.or(new) { + slot.read().as_id + } else { + 0 + }; + + self.swap_active_memslots(as_id as usize); + + self.replace_memslot(old, new); + } + + #[inline(never)] + fn replace_memslot( + &self, + old: Option<&Arc>, + new: Option<&Arc>, + ) { + let as_id = if let Some(slot) = old.or(new) { + slot.read().as_id + } else { + 0 + }; + + let slot_set = self.get_inactive_memslot_set(as_id as usize); + + let mut slots_guard = slot_set.lock(); + let idx = slots_guard.node_idx; + 
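+        // Unlink the old slot from this (inactive) copy first: its interval
+        // key leaves the hva tree and a stale `last_use` cache is cleared; a
+        // pure deletion (`new == None`) also removes the gfn entry and stops
+        // here. Otherwise the new slot is inserted below, keyed both by its
+        // [start, last] host-virtual range and by its base gfn.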
+ if let Some(old) = old { + slots_guard.hva_tree.remove(&old.read().hva_node_key[idx]); + + if let Some(last) = &slots_guard.last_use { + if Arc::ptr_eq(last, old) { + slots_guard.last_use = new.map(|x| x.clone()); + } + } + + if new.is_none() { + slots_guard.gfn_tree.remove(&old.read().base_gfn); + return; + } + } + + let new = new.unwrap(); + let mut new_guard = new.write(); + new_guard.hva_node_key[idx].start = new_guard.userspace_addr; + new_guard.hva_node_key[idx].last = + new_guard.userspace_addr + VirtAddr::new((new_guard.npages << MMArch::PAGE_SHIFT) - 1); + + slots_guard + .hva_tree + .insert(new_guard.hva_node_key[idx], new.clone()); + + if let Some(old) = old { + slots_guard.gfn_tree.remove(&old.read().base_gfn); + } + + slots_guard.gfn_tree.insert(new_guard.base_gfn, new.clone()); + } + + fn get_inactive_memslot_set(&self, as_id: usize) -> Arc { + let active = self.memslot_set(as_id); + + let inactive_idx = active.lock().node_idx ^ 1; + return self.memslots_set[as_id][inactive_idx].clone(); + } + + fn swap_active_memslots(&mut self, as_id: usize) { + self.memslots[as_id] = self.get_inactive_memslot_set(as_id); + } +} diff --git a/kernel/src/virt/vm/kvm_host/mod.rs b/kernel/src/virt/vm/kvm_host/mod.rs index 92e915644..2c636c42a 100644 --- a/kernel/src/virt/vm/kvm_host/mod.rs +++ b/kernel/src/virt/vm/kvm_host/mod.rs @@ -1,28 +1,232 @@ -use core::sync::atomic::AtomicUsize; +use core::{ + fmt::Debug, + sync::atomic::{AtomicUsize, Ordering}, +}; -use alloc::string::String; +use alloc::{ + alloc::Global, + boxed::Box, + sync::{Arc, Weak}, + vec::Vec, +}; +use hashbrown::HashMap; +use system_error::SystemError; +use x86::bits64::registers::rsp; -use crate::mm::ucontext::AddressSpace; +use crate::{ + arch::{ + vm::{kvm_host::vcpu::VirCpuRequest, x86_kvm_manager}, + CurrentKvmManager, KvmArch, VirtCpuArch, + }, + filesystem::vfs::file::{File, FileMode}, + libs::spinlock::{SpinLock, SpinLockGuard}, + mm::ucontext::AddressSpace, + process::{KernelStack, ProcessManager}, + smp::cpu::ProcessorId, + virt::vm::{ + kvm_dev::KvmVcpuDev, + kvm_host::vcpu::{LockedVirtCpu, VirtCpu}, + }, +}; +use self::{ + mem::{GfnToHvaCache, KvmMemSlotSet, LockedVmMemSlotSet, PfnCacheUsage}, + vcpu::GuestDebug, +}; + +pub mod mem; pub mod vcpu; const KVM_ADDRESS_SPACE_NUM: usize = 1; +pub const KVM_USERSAPCE_IRQ_SOURCE_ID: usize = 0; +pub const KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID: usize = 1; + +#[derive(Debug)] +pub struct LockedVm { + inner: SpinLock, +} + +static KVM_USAGE_COUNT: AtomicUsize = AtomicUsize::new(0); + +impl LockedVm { + pub fn lock(&self) -> SpinLockGuard { + self.inner.lock() + } + + pub fn create(vm_type: usize) -> Result, SystemError> { + let mut memslots_set = vec![]; + let mut memslots = vec![]; + for i in 0..KVM_ADDRESS_SPACE_NUM { + let mut tmp = vec![]; + for j in 0..2 { + let mut slots = KvmMemSlotSet::default(); + slots.last_use = None; + slots.node_idx = j; + slots.generation = i as u64; + tmp.push(LockedVmMemSlotSet::new(slots)); + } + memslots_set.push(tmp); + memslots.push(memslots_set[i][0].clone()); + } + + let kvm = Vm { + mm: ProcessManager::current_pcb() + .basic() + .user_vm() + .unwrap() + .write() + .try_clone()?, + max_vcpus: CurrentKvmManager::KVM_MAX_VCPUS, + memslots_set, + memslots, + arch: KvmArch::init(vm_type)?, + created_vcpus: 0, + lock_vm_ref: Weak::new(), + nr_memslot_pages: 0, + online_vcpus: 0, + dirty_ring_size: 0, + dirty_ring_with_bitmap: false, + vcpus: HashMap::new(), + }; + + let ret = Arc::new(Self { + inner: SpinLock::new(kvm), + }); + + 
Self::hardware_enable_all()?; -pub struct KvmMemSlots { - /// 最后一次使用到的内存插槽 - last_use: AtomicUsize, - /// 存储虚拟地址(hva)和内存插槽之间的映射关系 - // Rbt - /// 用于存储全局页帧号(gfn)和内存插槽之间的映射关系 - // Rbt - /// 将内存插槽的ID映射到对应的内存插槽。 - // HashMap - /// 节点索引 - node_idx: usize, + ret.lock().lock_vm_ref = Arc::downgrade(&ret); + return Ok(ret); + } + + fn hardware_enable_all() -> Result<(), SystemError> { + KVM_USAGE_COUNT.fetch_add(1, Ordering::SeqCst); + + // 如果是第一个启动的,则需要对所有cpu都初始化硬件 + if KVM_USAGE_COUNT.load(Ordering::SeqCst) == 1 { + // FIXME!!!! + // 这里是要对每个cpu都进行初始化,目前这里只对当前cpu调用了初始化流程 + x86_kvm_manager().arch_hardware_enable()?; + } + + Ok(()) + } } +#[derive(Debug)] pub struct Vm { - mm: AddressSpace, + lock_vm_ref: Weak, + mm: Arc, max_vcpus: usize, - name: String, + created_vcpus: usize, + online_vcpus: usize, + /// vcpu集合 + vcpus: HashMap>, + // name: String, + /// 对应活动和非活动内存槽,实际为:[[Arc; 2]; KVM_ADDRESS_SPACE_NUM],这里暂时写Vec + memslots_set: Vec>>, + /// 当前活动内存槽,实际为:[Arc; KVM_ADDRESS_SPACE_NUM],这里暂时写Vec + memslots: Vec>, + /// 内存槽对应的页数 + nr_memslot_pages: usize, + + pub arch: KvmArch, + + pub dirty_ring_size: u32, + dirty_ring_with_bitmap: bool, +} + +#[inline] +pub fn check_stack_usage() { + let rsp = rsp() as usize; + let free = rsp & (KernelStack::ALIGN - 1); + let usage = KernelStack::SIZE - free; + kdebug!("current rsp {rsp:x} stack use {usage} free {free}"); +} + +impl Vm { + #[inline(never)] + pub fn create_vcpu(&mut self, id: usize) -> Result { + check_stack_usage(); + if id >= self.max_vcpus { + return Err(SystemError::EINVAL); + } + + if self.created_vcpus >= self.max_vcpus { + return Err(SystemError::EINVAL); + } + + self.created_vcpus += 1; + + let vcpu = self._create_vcpu(id); + if self.dirty_ring_size != 0 { + todo!() + } + + vcpu.lock().vcpu_id = self.online_vcpus; + + self.vcpus.insert(self.online_vcpus, vcpu.clone()); + + self.online_vcpus += 1; + + let vcpu_inode = KvmVcpuDev::new(vcpu); + + let file = File::new(vcpu_inode, FileMode::from_bits_truncate(0x777))?; + + let fd = ProcessManager::current_pcb() + .fd_table() + .write() + .alloc_fd(file, None)?; + + Ok(fd as usize) + } + + /// ### 创建一个vcpu,并且初始化部分数据 + #[inline(never)] + pub fn _create_vcpu(&self, id: usize) -> Arc { + check_stack_usage(); + + let mut vcpu = self.new_vcpu(id); + + vcpu.init_arch(self); + + Arc::new(LockedVirtCpu::new(vcpu)) + } + + #[inline(never)] + pub fn new_vcpu(&self, id: usize) -> VirtCpu { + return VirtCpu { + cpu: ProcessorId::INVALID, + kvm: Some(self.lock_vm_ref.clone()), + vcpu_id: id, + pid: None, + preempted: false, + ready: false, + last_used_slot: None, + stats_id: format!("kvm-{}/vcpu-{}", ProcessManager::current_pid().data(), id), + pv_time: GfnToHvaCache::init(self.lock_vm_ref.clone(), PfnCacheUsage::HOST_USES_PFN), + arch: VirtCpuArch::new(), + private: None, + request: VirCpuRequest::empty(), + guest_debug: GuestDebug::empty(), + run: unsafe { Some(Box::new_zeroed().assume_init()) }, + vcpu_idx: 0, + }; + } +} + +/// ## 多处理器状态(有些状态在某些架构并不合法) +#[derive(Debug, Clone, Copy)] +pub enum MutilProcessorState { + Runnable, + Uninitialized, + InitReceived, + Halted, + SipiReceived, + Stopped, + CheckStop, + Operating, + Load, + ApResetHold, + Suspended, } diff --git a/kernel/src/virt/vm/kvm_host/vcpu.rs b/kernel/src/virt/vm/kvm_host/vcpu.rs index fc54f11a4..d51f4dbbd 100644 --- a/kernel/src/virt/vm/kvm_host/vcpu.rs +++ b/kernel/src/virt/vm/kvm_host/vcpu.rs @@ -1,51 +1,94 @@ -use alloc::{string::String, sync::Arc}; +use core::mem::MaybeUninit; + +use alloc::{ + alloc::Global, + boxed::Box, + string::String, 
+ sync::{Arc, Weak}, +}; use crate::{ + arch::{ + vm::{kvm_host::vcpu::VirCpuRequest, vmx::VmxVCpuPriv}, + VirtCpuArch, + }, + libs::{ + lazy_init::Lazy, + spinlock::{SpinLock, SpinLockGuard}, + }, process::{Pid, ProcessManager}, smp::cpu::ProcessorId, + virt::vm::{kvm_host::check_stack_usage, user_api::UapiKvmRun}, }; -use super::{KvmMemSlots, Vm}; +use super::{ + mem::{GfnToHvaCache, KvmMemSlot, PfnCacheUsage}, + LockedVm, Vm, +}; -pub struct VirtCpu { - cpu: ProcessorId, - kvm: Arc, - vcpu_id: usize, - pid: Option, - preempted: bool, - ready: bool, - last_used_slot: Option>, - stats_id: String, +#[derive(Debug)] +pub struct LockedVirtCpu { + inner: SpinLock, } -impl VirtCpu { - /// ### 创建一个vcpu,并且初始化部分数据 - pub fn create(vm: Arc, id: usize) -> Self { +impl LockedVirtCpu { + pub fn new(vcpu: VirtCpu) -> Self { Self { - cpu: ProcessorId::INVALID, - kvm: vm, - vcpu_id: id, - pid: None, - preempted: false, - ready: false, - last_used_slot: None, - stats_id: format!("kvm-{}/vcpu-{}", ProcessManager::current_pid().data(), id), + inner: SpinLock::new(vcpu), } } + + pub fn lock(&self) -> SpinLockGuard { + self.inner.lock() + } +} + +#[derive(Debug)] +pub struct VirtCpu { + pub cpu: ProcessorId, + pub kvm: Option>, + /// 从用户层获取 + pub vcpu_id: usize, + /// id alloctor获取 + pub vcpu_idx: usize, + pub pid: Option, + pub preempted: bool, + pub ready: bool, + pub last_used_slot: Option>, + pub stats_id: String, + pub pv_time: GfnToHvaCache, + pub arch: VirtCpuArch, + + pub guest_debug: GuestDebug, + + #[cfg(target_arch = "x86_64")] + pub private: Option, + + /// 记录请求 + pub request: VirCpuRequest, + pub run: Option>, +} + +impl VirtCpu { + #[inline] + pub fn kvm(&self) -> Arc { + self.kvm.as_ref().unwrap().upgrade().unwrap() + } + + #[cfg(target_arch = "x86_64")] + pub fn vmx(&self) -> &VmxVCpuPriv { + self.private.as_ref().unwrap() + } + + #[cfg(target_arch = "x86_64")] + pub fn vmx_mut(&mut self) -> &mut VmxVCpuPriv { + self.private.as_mut().unwrap() + } } -/// ## 多处理器状态(有些状态在某些架构并不合法) -#[derive(Debug, Clone, Copy)] -pub enum MutilProcessorState { - Runnable, - Uninitialized, - InitReceived, - Halted, - SipiReceived, - Stopped, - CheckStop, - Operating, - Load, - ApResetHold, - Suspended, +bitflags! 
{ + pub struct GuestDebug: usize { + const ENABLE = 0x00000001; + const SINGLESTEP = 0x00000002; + } } diff --git a/kernel/src/virt/vm/mod.rs b/kernel/src/virt/vm/mod.rs index cf82f0060..048b943e3 100644 --- a/kernel/src/virt/vm/mod.rs +++ b/kernel/src/virt/vm/mod.rs @@ -1 +1,3 @@ -pub mod kvm_host; \ No newline at end of file +pub mod kvm_dev; +pub mod kvm_host; +pub mod user_api; diff --git a/kernel/src/virt/vm/user_api.rs b/kernel/src/virt/vm/user_api.rs new file mode 100644 index 000000000..57a5424f2 --- /dev/null +++ b/kernel/src/virt/vm/user_api.rs @@ -0,0 +1,430 @@ +use core::fmt::Debug; + +/// +/// 该文件定义了暴露给用户空间的结构体 +/// +use system_error::SystemError; + +use crate::mm::{PhysAddr, VirtAddr}; + +use super::kvm_host::mem::UserMemRegionFlag; + +/// 通过这个结构可以将虚拟机的物理地址对应到用户进程的虚拟地址 +/// 用来表示虚拟机的一段物理内存 +#[repr(C)] +#[derive(Default)] +pub struct PosixKvmUserspaceMemoryRegion { + /// 在哪个slot上注册内存区间 + pub slot: u32, + /// flags有两个取值,KVM_MEM_LOG_DIRTY_PAGES和KVM_MEM_READONLY,用来指示kvm针对这段内存应该做的事情。 + /// KVM_MEM_LOG_DIRTY_PAGES用来开启内存脏页,KVM_MEM_READONLY用来开启内存只读。 + pub flags: u32, + /// 虚机内存区间起始物理地址 + pub guest_phys_addr: u64, + /// 虚机内存区间大小 + pub memory_size: u64, + /// 虚机内存区间对应的主机虚拟地址 + pub userspace_addr: u64, +} + +/// PosixKvmUserspaceMemoryRegion对应内核表示 +pub struct KvmUserspaceMemoryRegion { + /// 在哪个slot上注册内存区间 + pub slot: u32, + /// 用来指示kvm针对这段内存应该做的事情。 + /// KVM_MEM_LOG_DIRTY_PAGES用来开启内存脏页,KVM_MEM_READONLY用来开启内存只读。 + pub flags: UserMemRegionFlag, + /// 虚机内存区间起始物理地址 + pub guest_phys_addr: PhysAddr, + /// 虚机内存区间大小 + pub memory_size: u64, + /// 虚机内存区间对应的主机虚拟地址 + pub userspace_addr: VirtAddr, +} + +impl KvmUserspaceMemoryRegion { + pub fn from_posix(posix: &PosixKvmUserspaceMemoryRegion) -> Result { + let flags = UserMemRegionFlag::from_bits(posix.flags).ok_or(SystemError::EINVAL)?; + Ok(Self { + slot: posix.slot, + flags, + guest_phys_addr: PhysAddr::new(posix.guest_phys_addr as usize), + memory_size: posix.memory_size, + userspace_addr: VirtAddr::new(posix.userspace_addr as usize), + }) + } +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct UapiKvmRun { + pub request_interrupt_window: u8, + pub immediate_exit: u8, + pub padding1: [u8; 6usize], + pub exit_reason: u32, + pub ready_for_interrupt_injection: u8, + pub if_flag: u8, + pub flags: u16, + pub cr8: u64, + pub apic_base: u64, + pub __bindgen_anon_1: uapi_kvm_run__bindgen_ty_1, + pub kvm_valid_regs: u64, + pub kvm_dirty_regs: u64, + pub s: uapi_kvm_run__bindgen_ty_2, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub union uapi_kvm_run__bindgen_ty_2 { + pub regs: UapiKvmSyncRegs, + pub padding: [u8; 2048usize], +} + +impl Debug for uapi_kvm_run__bindgen_ty_2 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("uapi_kvm_run__bindgen_ty_2").finish() + } +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmSyncRegs { + pub device_irq_level: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy1 { + pub hardware_exit_reason: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy2 { + pub hardware_entry_failure_reason: u64, + pub cpu: u32, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy3 { + pub exception: u32, + pub error_code: u32, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy4 { + pub direction: u8, + pub size: u8, + pub 
port: u16, + pub count: u32, + pub data_offset: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmDebugExitArch { + pub hsr: u32, + pub hsr_high: u32, + pub far: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy5 { + pub arch: UapiKvmDebugExitArch, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy6 { + pub phys_addr: u64, + pub data: [u8; 8usize], + pub len: u32, + pub is_write: u8, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy7 { + pub nr: u64, + pub args: [u64; 6usize], + pub ret: u64, + pub longmode: u32, + pub pad: u32, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy8 { + pub rip: u64, + pub is_write: u32, + pub pad: u32, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy9 { + pub icptcode: u8, + pub ipa: u16, + pub ipb: u32, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy10 { + pub trans_exc_code: u64, + pub pgm_code: u32, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy11 { + pub dcrn: u32, + pub data: u32, + pub is_write: u8, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy12 { + pub suberror: u32, + pub ndata: u32, + pub data: [u64; 16usize], +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub struct UapiKvmRunBindgenTy1BindgenTy13 { + pub suberror: u32, + pub ndata: u32, + pub flags: u64, + pub __bindgen_anon_1: uapi_kvm_run__bindgen_ty_1__bindgen_ty_13__bindgen_ty_1, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub union uapi_kvm_run__bindgen_ty_1__bindgen_ty_13__bindgen_ty_1 { + pub __bindgen_anon_1: UapiKvmRunBindgenTy1BindgenTy13BindgenTy1BindgenTy1, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy13BindgenTy1BindgenTy1 { + pub insn_size: u8, + pub insn_bytes: [u8; 15usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy14 { + pub gprs: [u64; 32usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy15 { + pub nr: u64, + pub ret: u64, + pub args: [u64; 9usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy16 { + pub subchannel_id: u16, + pub subchannel_nr: u16, + pub io_int_parm: u32, + pub io_int_word: u32, + pub ipb: u32, + pub dequeued: u8, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy17 { + pub epr: u32, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub struct UapiKvmRunBindgenTy1BindgenTy18 { + pub type_: u32, + pub ndata: u32, + pub __bindgen_anon_1: uapi_kvm_run__bindgen_ty_1__bindgen_ty_18__bindgen_ty_1, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub union uapi_kvm_run__bindgen_ty_1__bindgen_ty_18__bindgen_ty_1 { + pub flags: u64, + pub data: [u64; 16usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy19 { + pub addr: u64, + pub ar: u8, + pub reserved: u8, + pub fc: u8, + pub sel1: u8, + pub sel2: u16, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct 
UapiKvmRunBindgenTy1BindgenTy20 { + pub vector: u8, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy21 { + pub esr_iss: u64, + pub fault_ipa: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy22 { + pub error: u8, + pub pad: [u8; 7usize], + pub reason: u32, + pub index: u32, + pub data: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy23 { + pub extension_id: usize, + pub function_id: usize, + pub args: [usize; 6usize], + pub ret: [usize; 2usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy24 { + pub csr_num: usize, + pub new_value: usize, + pub write_mask: usize, + pub ret_value: usize, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmRunBindgenTy1BindgenTy25 { + pub flags: u32, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub union uapi_kvm_run__bindgen_ty_1 { + pub hw: UapiKvmRunBindgenTy1BindgenTy1, + pub fail_entry: UapiKvmRunBindgenTy1BindgenTy2, + pub ex: UapiKvmRunBindgenTy1BindgenTy3, + pub io: UapiKvmRunBindgenTy1BindgenTy4, + pub debug: UapiKvmRunBindgenTy1BindgenTy5, + pub mmio: UapiKvmRunBindgenTy1BindgenTy6, + pub hypercall: UapiKvmRunBindgenTy1BindgenTy7, + pub tpr_access: UapiKvmRunBindgenTy1BindgenTy8, + pub s390_sieic: UapiKvmRunBindgenTy1BindgenTy9, + pub s390_reset_flags: u64, + pub s390_ucontrol: UapiKvmRunBindgenTy1BindgenTy10, + pub dcr: UapiKvmRunBindgenTy1BindgenTy11, + pub internal: UapiKvmRunBindgenTy1BindgenTy12, + pub emulation_failure: UapiKvmRunBindgenTy1BindgenTy13, + pub osi: UapiKvmRunBindgenTy1BindgenTy14, + pub papr_hcall: UapiKvmRunBindgenTy1BindgenTy15, + pub s390_tsch: UapiKvmRunBindgenTy1BindgenTy16, + pub epr: UapiKvmRunBindgenTy1BindgenTy17, + pub system_event: UapiKvmRunBindgenTy1BindgenTy18, + pub s390_stsi: UapiKvmRunBindgenTy1BindgenTy19, + pub eoi: UapiKvmRunBindgenTy1BindgenTy20, + pub hyperv: UapiKvmHypervExit, + pub arm_nisv: UapiKvmRunBindgenTy1BindgenTy21, + pub msr: UapiKvmRunBindgenTy1BindgenTy22, + pub xen: UapiKvmXenExit, + pub riscv_sbi: UapiKvmRunBindgenTy1BindgenTy23, + pub riscv_csr: UapiKvmRunBindgenTy1BindgenTy24, + pub notify: UapiKvmRunBindgenTy1BindgenTy25, + pub padding: [u8; 256usize], +} + +impl Debug for uapi_kvm_run__bindgen_ty_1 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("uapi_kvm_run__bindgen_ty_1").finish() + } +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub struct UapiKvmHypervExit { + pub type_: u32, + pub pad1: u32, + pub u: uapi_kvm_hyperv_exit__bindgen_ty_1, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub union uapi_kvm_hyperv_exit__bindgen_ty_1 { + pub synic: UapiKvmHypervExitBindgenTy1BindgenTy1, + pub hcall: UapiKvmHypervExitBindgenTy1BindgenTy2, + pub syndbg: UapiKvmHypervExitBindgenTy1BindgenTy3, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmHypervExitBindgenTy1BindgenTy1 { + pub msr: u32, + pub pad2: u32, + pub control: u64, + pub evt_page: u64, + pub msg_page: u64, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmHypervExitBindgenTy1BindgenTy2 { + pub input: u64, + pub result: u64, + pub params: [u64; 2usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmHypervExitBindgenTy1BindgenTy3 { + pub msr: u32, + pub pad2: u32, + pub control: u64, + pub status: u64, 
+ pub send_page: u64, + pub recv_page: u64, + pub pending_page: u64, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub struct UapiKvmXenExit { + pub type_: u32, + pub u: uapi_kvm_xen_exit__bindgen_ty_1, +} + +#[repr(C)] +#[derive(Copy, Clone)] +pub union uapi_kvm_xen_exit__bindgen_ty_1 { + pub hcall: UapiKvmXenExitBindgenTy1BindgenTy1, +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmXenExitBindgenTy1BindgenTy1 { + pub longmode: u32, + pub cpl: u32, + pub input: u64, + pub result: u64, + pub params: [u64; 6usize], +} diff --git a/user/apps/test_kvm/main.c b/user/apps/test_kvm/main.c index fd60ccb6f..739953568 100644 --- a/user/apps/test_kvm/main.c +++ b/user/apps/test_kvm/main.c @@ -19,11 +19,14 @@ #include #include #include +#include +#include -#define KVM_CREATE_VCPU 0x00 -#define KVM_SET_USER_MEMORY_REGION 0x01 +#define KVM_CREATE_VM 0xAE01 +#define KVM_CREATE_VCPU 0xAE41 +#define KVM_SET_USER_MEMORY_REGION 0xAE46 -#define KVM_RUN 0x00 +#define KVM_RUN 0xAE80 #define KVM_GET_REGS 0x01 #define KVM_SET_REGS 0x02 @@ -64,7 +67,7 @@ int main() printf("Test kvm running...\n"); printf("Open /dev/kvm\n"); int kvm_fd = open("/dev/kvm", O_RDWR|O_CLOEXEC); - int vmfd = ioctl(kvm_fd, 0x01, 0); + int vmfd = ioctl(kvm_fd, KVM_CREATE_VM, 0); printf("vmfd=%d\n", vmfd); /* @@ -84,16 +87,17 @@ int main() 0xf4, /* hlt */ }; - size_t mem_size = 0x4000; // size of user memory you want to assign + size_t mem_size = 0x1000; // size of user memory you want to assign printf("code=%p\n", code); - // void *mem = mmap(0, mem_size, 0x7, -1, 0); - // memcpy(mem, code, sizeof(code)); + void *mem = mmap((void*)65536, mem_size, 0x7, 0x20, 0,0); + memcpy(mem, code, sizeof(code)); + printf("map mem=%p\n", mem); struct kvm_userspace_memory_region region = { .slot = 0, .flags = 0, .guest_phys_addr = 0, .memory_size = mem_size, - .userspace_addr = (size_t)code + .userspace_addr = (size_t)mem }; ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, ®ion); From acd7f16deb583b9673eeccf417486a38089ba66c Mon Sep 17 00:00:00 2001 From: GnoCiYeH Date: Tue, 21 May 2024 01:11:17 +0800 Subject: [PATCH 03/10] =?UTF-8?q?kvm=5Frun=E5=AE=8C=E6=88=90=E4=B8=80?= =?UTF-8?q?=E5=8D=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kernel_build/src/cfiles/arch/x86_64.rs | 1 + kernel/crates/bitmap/src/alloc_bitmap.rs | 4 + kernel/crates/bitmap/src/bitmap_core.rs | 30 +- kernel/crates/bitmap/src/lib.rs | 1 + kernel/src/arch/x86_64/vm/asm.rs | 90 +- kernel/src/arch/x86_64/vm/kvm_host/lapic.rs | 37 +- kernel/src/arch/x86_64/vm/kvm_host/mod.rs | 94 +- kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs | 794 +++++- kernel/src/arch/x86_64/vm/mmu.rs | 43 + kernel/src/arch/x86_64/vm/mod.rs | 79 +- kernel/src/arch/x86_64/vm/uapi.rs | 58 + kernel/src/arch/x86_64/vm/vmx/capabilities.rs | 86 +- kernel/src/arch/x86_64/vm/vmx/mod.rs | 2220 ++++++++++++++++- kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs | 135 +- kernel/src/arch/x86_64/vm/vmx/vmenter.S | 179 ++ kernel/src/virt/vm/kvm_dev.rs | 88 +- kernel/src/virt/vm/kvm_host/mem.rs | 2 +- kernel/src/virt/vm/kvm_host/mod.rs | 38 +- kernel/src/virt/vm/kvm_host/vcpu.rs | 3 +- kernel/src/virt/vm/user_api.rs | 36 + 20 files changed, 3776 insertions(+), 242 deletions(-) create mode 100644 kernel/src/arch/x86_64/vm/uapi.rs create mode 100644 kernel/src/arch/x86_64/vm/vmx/vmenter.S diff --git a/build-scripts/kernel_build/src/cfiles/arch/x86_64.rs b/build-scripts/kernel_build/src/cfiles/arch/x86_64.rs index e473a8a9c..5b7bbec58 100644 --- 
a/build-scripts/kernel_build/src/cfiles/arch/x86_64.rs +++ b/build-scripts/kernel_build/src/cfiles/arch/x86_64.rs @@ -45,6 +45,7 @@ impl CFilesArch for X86_64CFilesArch { files.push(PathBuf::from("src/arch/x86_64/asm/head.S")); files.push(PathBuf::from("src/arch/x86_64/asm/entry.S")); files.push(PathBuf::from("src/arch/x86_64/asm/apu_boot.S")); + files.push(PathBuf::from("src/arch/x86_64/vm/vmx/vmenter.S")); } fn setup_global_flags(&self, c: &mut Build) { diff --git a/kernel/crates/bitmap/src/alloc_bitmap.rs b/kernel/crates/bitmap/src/alloc_bitmap.rs index 36ee33e3d..1d2d4d311 100644 --- a/kernel/crates/bitmap/src/alloc_bitmap.rs +++ b/kernel/crates/bitmap/src/alloc_bitmap.rs @@ -18,6 +18,10 @@ impl AllocBitmap { core: BitMapCore::new(), } } + + pub fn data(&self) -> &[usize] { + &self.data + } } impl BitMapOps for AllocBitmap { diff --git a/kernel/crates/bitmap/src/bitmap_core.rs b/kernel/crates/bitmap/src/bitmap_core.rs index 20babb274..384e32776 100644 --- a/kernel/crates/bitmap/src/bitmap_core.rs +++ b/kernel/crates/bitmap/src/bitmap_core.rs @@ -3,7 +3,7 @@ use core::{intrinsics::unlikely, marker::PhantomData}; use crate::traits::BitOps; #[derive(Debug, Clone)] -pub(crate) struct BitMapCore { +pub struct BitMapCore { phantom: PhantomData, } @@ -15,7 +15,7 @@ impl BitMapCore { } /// 获取位图中的某一位 - pub(crate) fn get(&self, n: usize, data: &[T], index: usize) -> Option { + pub fn get(&self, n: usize, data: &[T], index: usize) -> Option { if unlikely(index >= n) { return None; } @@ -30,7 +30,7 @@ impl BitMapCore { } /// 设置位图中的某一位 - pub(crate) fn set(&self, n: usize, data: &mut [T], index: usize, value: bool) -> Option { + pub fn set(&self, n: usize, data: &mut [T], index: usize, value: bool) -> Option { if unlikely(index >= n) { return None; } @@ -43,7 +43,7 @@ impl BitMapCore { Some(bit) } - pub(crate) fn set_all(&self, n: usize, data: &mut [T], value: bool) { + pub fn set_all(&self, n: usize, data: &mut [T], value: bool) { let val = if value { T::max() } else { T::zero() }; for element in data.iter_mut() { *element = val; @@ -58,7 +58,7 @@ impl BitMapCore { } /// 获取位图中第一个为1的位 - pub(crate) fn first_index(&self, data: &[T]) -> Option { + pub fn first_index(&self, data: &[T]) -> Option { for (i, element) in data.iter().enumerate() { let bit = ::first_index(element); if let Some(b) = bit { @@ -70,7 +70,7 @@ impl BitMapCore { } /// 获取位图中第一个为0的位 - pub(crate) fn first_false_index(&self, n: usize, data: &[T]) -> Option { + pub fn first_false_index(&self, n: usize, data: &[T]) -> Option { for (i, element) in data.iter().enumerate() { if let Some(bit) = ::first_false_index(element) { return self.make_index(n, i * T::bit_size() + bit); @@ -81,7 +81,7 @@ impl BitMapCore { } /// 获取位图中最后一个为1的位 - pub(crate) fn last_index(&self, n: usize, data: &[T]) -> Option { + pub fn last_index(&self, n: usize, data: &[T]) -> Option { for (i, element) in data.iter().enumerate().rev() { if let Some(bit) = ::last_index(element) { return self.make_index(n, i * T::bit_size() + bit); @@ -97,7 +97,7 @@ impl BitMapCore { /// /// - `data`:位图数据 /// - `n`:位图有效位数 - pub(crate) fn last_false_index(&self, n: usize, data: &[T]) -> Option { + pub fn last_false_index(&self, n: usize, data: &[T]) -> Option { let mut iter = data.iter().rev(); let mut last_element = *iter.next()?; @@ -123,7 +123,7 @@ impl BitMapCore { } /// 获取位图中下一个为1的位 - pub(crate) fn next_index(&self, n: usize, data: &[T], index: usize) -> Option { + pub fn next_index(&self, n: usize, data: &[T], index: usize) -> Option { if unlikely(index >= n) { return None; } @@ 
-146,7 +146,7 @@ impl BitMapCore { } /// 获取位图中下一个为0的位 - pub(crate) fn next_false_index(&self, n: usize, data: &[T], index: usize) -> Option { + pub fn next_false_index(&self, n: usize, data: &[T], index: usize) -> Option { if unlikely(index >= n) { return None; } @@ -169,7 +169,7 @@ impl BitMapCore { } /// 获取位图中上一个为1的位 - pub(crate) fn prev_index(&self, n: usize, data: &[T], index: usize) -> Option { + pub fn prev_index(&self, n: usize, data: &[T], index: usize) -> Option { if unlikely(index >= n) { return None; } @@ -190,7 +190,7 @@ impl BitMapCore { None } - pub(crate) fn prev_false_index(&self, n: usize, data: &[T], index: usize) -> Option { + pub fn prev_false_index(&self, n: usize, data: &[T], index: usize) -> Option { let element_index = index / T::bit_size(); let bit_index = index % T::bit_size(); @@ -208,7 +208,7 @@ impl BitMapCore { None } - pub(crate) fn invert(&self, n: usize, data: &mut [T]) { + pub fn invert(&self, n: usize, data: &mut [T]) { for element in data.iter_mut() { ::invert(element); } @@ -222,7 +222,7 @@ impl BitMapCore { } } - pub(crate) fn is_full(&self, n: usize, data: &[T]) -> bool { + pub fn is_full(&self, n: usize, data: &[T]) -> bool { let mut iter = data.iter().peekable(); while let Some(element) = iter.next() { if iter.peek().is_none() { @@ -245,7 +245,7 @@ impl BitMapCore { return false; } - pub(crate) fn is_empty(&self, data: &[T]) -> bool { + pub fn is_empty(&self, data: &[T]) -> bool { for element in data.iter() { if element != &T::zero() { return false; diff --git a/kernel/crates/bitmap/src/lib.rs b/kernel/crates/bitmap/src/lib.rs index 7af67331e..90efa492e 100644 --- a/kernel/crates/bitmap/src/lib.rs +++ b/kernel/crates/bitmap/src/lib.rs @@ -12,4 +12,5 @@ mod bitmap_core; mod static_bitmap; pub mod traits; pub use alloc_bitmap::AllocBitmap; +pub use bitmap_core::BitMapCore; pub use static_bitmap::StaticBitmap; diff --git a/kernel/src/arch/x86_64/vm/asm.rs b/kernel/src/arch/x86_64/vm/asm.rs index 92867c616..47271f740 100644 --- a/kernel/src/arch/x86_64/vm/asm.rs +++ b/kernel/src/arch/x86_64/vm/asm.rs @@ -1,6 +1,7 @@ use core::arch::asm; use alloc::slice; +use bitfield_struct::bitfield; use raw_cpuid::CpuId; use system_error::SystemError; use x86::{ @@ -11,10 +12,11 @@ use x86::{ IA32_VMX_CR4_FIXED0, IA32_VMX_CR4_FIXED1, }, }; +use x86_64::registers::xcontrol::XCr0; use crate::{ arch::mm::barrier, - kdebug, + kdebug, kwarn, mm::{phys_2_virt, PhysAddr}, }; @@ -31,6 +33,15 @@ impl KvmX86Asm { return 0; } + pub fn write_pkru(val: u32) { + let cpuid = CpuId::new(); + if let Some(feat) = cpuid.get_extended_feature_info() { + if feat.has_ospke() { + todo!(); + } + } + } + fn rdpkru() -> u32 { let ecx: u32 = 0; let pkru: u32; @@ -102,6 +113,11 @@ impl VmxAsm { } } + /// vmread the current VMCS. 
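For reference, the `invvpid` helper added later in this file's diff is still stubbed out (`kwarn!("TODO: asm invvpid")`, with the `asm!` block commented out). Below is a minimal sketch of the missing instruction wrapper, assuming Intel-syntax inline asm and the 16-byte descriptor layout from the SDM (VPID in bits 0..16, bits 16..64 reserved as zero, guest linear address in the high 64 bits). `invvpid_sketch` and `InvvpidDescriptor` are illustrative names, not code from this patch, and the sketch is untested:

    // Hypothetical sketch, not the committed implementation.
    #[repr(C, align(16))]
    struct InvvpidDescriptor {
        vpid_and_reserved: u64, // VPID in the low 16 bits; bits 16..64 must be zero
        gva: u64,               // guest linear address, used by per-address flushes
    }

    unsafe fn invvpid_sketch(ext: u64, vpid: u16, gva: u64) {
        let desc = InvvpidDescriptor {
            vpid_and_reserved: vpid as u64,
            gva,
        };
        // INVVPID r64, m128: invalidation type in a register, descriptor in
        // memory. INVVPID reports success via RFLAGS; this sketch ignores it.
        core::arch::asm!(
            "invvpid {ext}, [{desc}]",
            ext = in(reg) ext,
            desc = in(reg) &desc,
            options(nostack),
        );
    }

A call like `invvpid_sketch(Self::VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0)` would then flush all TLB entries tagged with `vpid`, which is what `sync_vcpu_single` expects of the real implementation.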
+ pub fn vmx_vmread(vmcs_field: u32) -> u64 { + unsafe { x86::bits64::vmx::vmread(vmcs_field).expect("vmx_read fail: ") } + } + pub fn kvm_cpu_vmxon(phys_addr: PhysAddr) -> Result<(), SystemError> { unsafe { let mut cr4 = cr4(); @@ -121,6 +137,52 @@ impl VmxAsm { } } + const VMX_VPID_EXTENT_INDIVIDUAL_ADDR: u64 = 0; + const VMX_VPID_EXTENT_SINGLE_CONTEXT: u64 = 1; + const VMX_VPID_EXTENT_ALL_CONTEXT: u64 = 2; + const VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: u64 = 3; + + pub fn sync_vcpu_single(vpid: u16) { + if vpid == 0 { + return; + } + + Self::invvpid(Self::VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0) + } + + pub fn sync_vcpu_global() { + Self::invvpid(Self::VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); + } + + #[inline(always)] + fn invvpid(ext: u64, vpid: u16, gva: u64) { + // 定义包含指令操作数的结构体 + #[bitfield(u128)] + struct Operand { + #[bits(16)] + vpid: u64, + #[bits(48)] + rsvd: u64, + gva: u64, + } + + // 构造操作数 + let mut operand = Operand::new(); + operand.set_vpid(vpid as u64); + operand.set_gva(gva); + + // 定义嵌入汇编块 + + kwarn!("TODO: asm invvpid"); + // unsafe { + // asm!( + // "invvpid {0} {1}", + // inlateout(reg) ext => _, + // inlateout(reg) &operand => _, + // ); + // } + } + /// Set the mandatory bits in CR4 and clear bits that are mandatory zero /// (Intel Manual: 24.8 Restrictions on VMX Operation) fn vmx_set_cr4_bits() { @@ -274,15 +336,16 @@ bitflags! { } } -#[derive(Debug, Default, Copy, Clone)] +#[derive(Debug, Default, Clone)] pub struct MsrData { pub host_initiated: bool, pub index: u32, pub data: u64, } +#[repr(C, align(16))] #[derive(Debug, Default, Copy, Clone)] -pub struct KvmMsrEntry { +pub struct VmxMsrEntry { pub index: u32, pub reserved: u32, pub data: u64, @@ -434,3 +497,24 @@ pub mod kvm_msr { pub const VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x00036dff; pub const VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x000011ff; } + +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum VcpuSegment { + ES, + CS, + SS, + DS, + FS, + GS, + TR, + LDTR, +} + +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum SegmentCacheField { + SEL = 0, + BASE = 1, + LIMIT = 2, + AR = 3, + NR = 4, +} diff --git a/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs b/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs index dc41c2371..8a995e5c4 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs @@ -1,28 +1,47 @@ -use crate::{arch::kvm_arch_ops, virt::vm::kvm_host::vcpu::VirtCpu}; +use alloc::boxed::Box; + +use crate::{ + arch::{kvm_arch_ops, MMArch}, + kdebug, + mm::MemoryManagementArch, + virt::vm::kvm_host::{vcpu::VirtCpu, Vm}, +}; const APIC_DEFAULT_PHYS_BASE: u64 = 0xfee00000; const MSR_IA32_APICBASE: u64 = 0x0000001b; -const MSR_IA32_APICBASE_BSP: u64 = (1 << 8); -const MSR_IA32_APICBASE_ENABLE: u64 = (1 << 11); -const MSR_IA32_APICBASE_BASE: u64 = (0xfffff << 12); +const MSR_IA32_APICBASE_BSP: u64 = 1 << 8; +const MSR_IA32_APICBASE_ENABLE: u64 = 1 << 11; +const MSR_IA32_APICBASE_BASE: u64 = 0xfffff << 12; + +#[derive(Debug)] +pub struct KvmLapic { + pub apicv_active: bool, + pub regs: Box<[u8]>, +} impl VirtCpu { - pub fn lapic_reset(&mut self, init_event: bool) { - let apic = self.arch.apic; - + pub fn lapic_reset(&mut self, vm: &Vm, init_event: bool) { kvm_arch_ops().apicv_pre_state_restore(self); if !init_event { let mut msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; - if self.kvm().lock().arch.bsp_vcpu_id == self.vcpu_id { + if vm.arch.bsp_vcpu_id == self.vcpu_id { msr_val |= MSR_IA32_APICBASE_BSP; } + + self.lapic_set_base(msr_val); } + + if 
self.arch.apic.is_none() { + return; + } + + todo!() } fn lapic_set_base(&mut self, value: u64) { let old_val = self.arch.apic_base; - let apic = self.arch.apic; + let apic = self.arch.apic.as_ref(); self.arch.apic_base = value; diff --git a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs index 5b0150e90..0f0ccd7e2 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs @@ -7,40 +7,59 @@ use system_error::SystemError; use x86::{ bits64::rflags::RFlags, controlregs::{Cr0, Cr4}, + dtables::DescriptorTablePointer, }; use x86_64::registers::control::EferFlags; use crate::{ smp::cpu::ProcessorId, - virt::vm::kvm_host::{ - vcpu::VirtCpu, Vm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, KVM_USERSAPCE_IRQ_SOURCE_ID, + virt::vm::{ + kvm_host::{ + vcpu::VirtCpu, Vm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, KVM_USERSAPCE_IRQ_SOURCE_ID, + }, + user_api::UapiKvmSegment, }, }; use crate::arch::VirtCpuArch; use super::{ - asm::{KvmMsrEntry, MsrData}, - vmx::vmx_info, + asm::{MsrData, VcpuSegment, VmxMsrEntry}, + uapi::UapiKvmDtable, + vmx::{vmx_info, VmxVCpuPriv}, x86_kvm_manager, x86_kvm_ops, }; pub mod lapic; pub mod vcpu; +pub const TSS_IOPB_BASE_OFFSET: usize = 0x66; +pub const TSS_BASE_SIZE: usize = 0x68; +pub const TSS_IOPB_SIZE: usize = (65536 / 8); +pub const TSS_REDIRECTION_SIZE: usize = (256 / 8); +pub const RMODE_TSS_SIZE: usize = (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1); + #[derive(Debug, Default)] pub struct X86KvmArch { /// 中断芯片模式 - irqchip_mode: KvmIrqChipMode, + pub irqchip_mode: KvmIrqChipMode, /// 负责引导(bootstrap)kvm的vcpu_id bsp_vcpu_id: usize, pub pause_in_guest: bool, pub cstate_in_guest: bool, + pub mwait_in_guest: bool, + pub hlt_in_guest: bool, + pub bus_lock_detection_enabled: bool, irq_sources_bitmap: u64, default_tsc_khz: u64, guest_can_read_msr_platform_info: bool, apicv_inhibit_reasons: usize, + pub max_vcpu_ids: usize, + + pub notify_vmexit_flags: NotifyVmExitFlags, + pub notify_window: u32, + msr_fliter: Option>, } @@ -129,33 +148,61 @@ pub trait KvmFunc: Send + Sync + Debug { fn vm_init(&self) -> X86KvmArch; + fn vcpu_precreate(&self, vm: &mut Vm) -> Result<(), SystemError>; + fn vcpu_create(&self, vcpu: &mut VirtCpu, vm: &Vm); fn vcpu_load(&self, vcpu: &mut VirtCpu, cpu: ProcessorId); - fn cache_reg(&self, vcpu: &VirtCpuArch, reg: KvmReg); + fn cache_reg(&self, vcpu: &mut VirtCpuArch, reg: KvmReg); fn apicv_pre_state_restore(&self, vcpu: &mut VirtCpu); - fn set_msr(&self, vcpu: &mut VirtCpuArch, msr: MsrData); + fn set_msr(&self, vcpu: &mut VirtCpu, msr: MsrData) -> Result<(), SystemError>; fn set_rflags(&self, vcpu: &mut VirtCpu, rflags: RFlags); - fn get_rflags(&self, vcpu: &VirtCpu) -> RFlags; + fn get_rflags(&self, vcpu: &mut VirtCpu) -> RFlags; - fn set_cr0(&self, vcpu: &mut VirtCpu, cr0: Cr0); + fn set_cr0(&self, vm: &Vm, vcpu: &mut VirtCpu, cr0: Cr0); + + fn is_vaild_cr0(&self, vcpu: &VirtCpu, cr0: Cr0) -> bool; fn set_cr4(&self, vcpu: &mut VirtCpu, cr4: Cr4); + fn post_set_cr3(&self, vcpu: &VirtCpu, cr3: u64); + + fn is_vaild_cr4(&self, vcpu: &VirtCpu, cr4: Cr4) -> bool; + fn set_efer(&self, vcpu: &mut VirtCpu, efer: EferFlags); + fn set_segment(&self, vcpu: &mut VirtCpu, var: &mut UapiKvmSegment, seg: VcpuSegment); + + fn get_segment( + &self, + vcpu: &mut VirtCpu, + var: UapiKvmSegment, + seg: VcpuSegment, + ) -> UapiKvmSegment; + + /// 这个函数不会用到VCPU,这里拿到只是为了确保上一层拿到锁 + fn get_idt(&self, _vcpu: &mut VirtCpu, dt: &mut DescriptorTablePointer); + + fn set_idt(&self, _vcpu: &mut 
VirtCpu, dt: &DescriptorTablePointer); + + fn get_gdt(&self, _vcpu: &mut VirtCpu, dt: &mut DescriptorTablePointer); + + fn set_gdt(&self, _vcpu: &mut VirtCpu, dt: &DescriptorTablePointer); + fn update_exception_bitmap(&self, vcpu: &mut VirtCpu); - fn vcpu_reset(&self, vcpu: &mut VirtCpu, init_event: bool); + fn vcpu_reset(&self, vcpu: &mut VirtCpu, vm: &Vm, init_event: bool); fn has_emulated_msr(&self, msr: u32) -> bool; - fn get_msr_feature(&self, msr: &mut KvmMsrEntry) -> bool; + fn get_msr_feature(&self, msr: &mut VmxMsrEntry) -> bool; + + fn vcpu_run(&self, vcpu: &mut VirtCpu); } /// ## 中断抑制的原因位 @@ -231,6 +278,17 @@ bitflags! { const KVM_MSR_FILTER_READ = 1 << 0; const KVM_MSR_FILTER_WRITE = 1 << 1; } + + pub struct NotifyVmExitFlags: u8 { + const KVM_X86_NOTIFY_VMEXIT_ENABLED = 1 << 0; + const KVM_X86_NOTIFY_VMEXIT_USER = 1 << 1; + } +} + +impl Default for NotifyVmExitFlags { + fn default() -> Self { + NotifyVmExitFlags::empty() + } } #[derive(Debug, Clone, Copy)] @@ -296,3 +354,17 @@ pub struct KvmCommonRegs { rip: u64, rflags: u64, } + +impl Vm { + pub fn vcpu_precreate(&mut self, id: usize) -> Result<(), SystemError> { + if self.arch.max_vcpu_ids == 0 { + self.arch.max_vcpu_ids = 1024 * 4; + } + + if id >= self.arch.max_vcpu_ids { + return Err(SystemError::EINVAL); + } + + return x86_kvm_ops().vcpu_precreate(self); + } +} diff --git a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs index 9d7be8976..bb96f6ec8 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs @@ -1,61 +1,81 @@ -use core::intrinsics::unlikely; +use core::{arch::x86_64::_xsetbv, intrinsics::unlikely}; use alloc::{boxed::Box, sync::Arc, vec::Vec}; -use bitmap::{traits::BitMapOps, AllocBitmap, StaticBitmap}; +use bitmap::{traits::BitMapOps, AllocBitmap, BitMapCore, StaticBitmap}; use raw_cpuid::CpuId; use system_error::SystemError; use x86::{ bits64::rflags::RFlags, - controlregs::{Cr0, Cr4}, + controlregs::{Cr0, Cr4, Xcr0}, + dtables::DescriptorTablePointer, msr::{ - IA32_APIC_BASE, IA32_CSTAR, IA32_FS_BASE, IA32_GS_BASE, IA32_KERNEL_GSBASE, IA32_LSTAR, - IA32_SYSENTER_EIP, IA32_SYSENTER_ESP, IA32_TSC_AUX, + self, wrmsr, IA32_APIC_BASE, IA32_CSTAR, IA32_FS_BASE, IA32_GS_BASE, IA32_KERNEL_GSBASE, + IA32_LSTAR, IA32_SYSENTER_EIP, IA32_SYSENTER_ESP, IA32_TSC_AUX, }, + vmx::vmcs::control::SecondaryControls, }; -use x86_64::registers::control::EferFlags; +use x86_64::registers::{control::EferFlags, xcontrol::XCr0Flags}; use crate::{ arch::{ kvm_arch_ops, vm::{ - asm::{KvmX86Asm, MiscEnable, MsrData}, + asm::{ + hyperv, kvm_msr, KvmX86Asm, MiscEnable, MsrData, SegmentCacheField, VcpuSegment, + }, cpuid::KvmCpuidEntry2, kvm_host::KvmReg, mmu::{KvmMmu, LockedKvmMmu}, - vmx::vmcs::LoadedVmcs, + uapi::{UapiKvmSegmentRegs, KVM_SYNC_X86_VALID_FIELDS}, + vmx::{ + vmcs::{ControlsType, LoadedVmcs}, + vmx_info, VmxVCpuPriv, + }, x86_kvm_manager, x86_kvm_manager_mut, x86_kvm_ops, }, }, kdebug, kerror, mm::{PhysAddr, VirtAddr}, smp::{core::smp_get_processor_id, cpu::ProcessorId}, - virt::vm::kvm_host::{ - mem::GfnToHvaCache, - vcpu::{GuestDebug, VirtCpu}, - LockedVm, MutilProcessorState, Vm, + virt::vm::{ + kvm_host::{ + mem::GfnToHvaCache, + vcpu::{GuestDebug, VirtCpu}, + LockedVm, MutilProcessorState, Vm, + }, + user_api::{UapiKvmRun, UapiKvmSegment}, }, }; -use super::{HFlags, KvmCommonRegs, KvmIrqChipMode}; +use super::{lapic::KvmLapic, HFlags, KvmCommonRegs, KvmIrqChipMode}; #[derive(Debug)] pub struct X86VcpuArch { /// 最近一次尝试进入虚拟机的主机cpu 
last_vmentry_cpu: ProcessorId, - /// 可用寄存器数量 + /// 可用寄存器位图 regs_avail: AllocBitmap, - /// 脏寄存器数量 + /// 脏寄存器位图 regs_dirty: AllocBitmap, /// 多处理器状态 mp_state: MutilProcessorState, pub apic_base: u64, /// apic - pub apic: Option<()>, + pub apic: Option, /// 主机pkru寄存器 host_pkru: u32, + pkru: u32, /// hflag hflags: HFlags, + pub microcode_version: u64, + + arch_capabilities: u64, + + perf_capabilities: u64, + + ia32_xss: u64, + pub guest_state_protected: bool, pub cpuid_entries: Vec, @@ -64,6 +84,9 @@ pub struct X86VcpuArch { pub exception_vmexit: KvmQueuedException, pub apf: KvmAsyncPageFault, + pub emulate_regs_need_sync_from_vcpu: bool, + pub emulate_regs_need_sync_to_vcpu: bool, + pub smbase: u64, pub interrupt: KvmQueuedInterrupt, @@ -78,18 +101,22 @@ pub struct X86VcpuArch { pub max_phyaddr: usize, + pub pat: u64, + pub regs: [u64; KvmReg::NrVcpuRegs as usize], pub cr0: Cr0, pub cr0_guest_owned_bits: Cr0, - pub cr2: usize, - pub cr3: usize, + pub cr2: u64, + pub cr3: u64, pub cr4: Cr4, pub cr4_guest_owned_bits: Cr4, - pub cr4_guest_rsvd_bits: usize, - pub cr8: usize, + pub cr4_guest_rsvd_bits: Cr4, + pub cr8: u64, pub efer: EferFlags, + pub xcr0: Xcr0, + pub dr6: usize, pub dr7: usize, @@ -105,6 +132,8 @@ pub struct X86VcpuArch { pub nmi_pending: u32, pub nmi_injected: bool, + pub xfd_no_write_intercept: bool, + pub db: [usize; Self::KVM_NR_DB_REGS], } @@ -118,9 +147,19 @@ impl X86VcpuArch { ret.regs_avail = AllocBitmap::new(32); ret.regs_dirty = AllocBitmap::new(32); ret.mp_state = MutilProcessorState::Runnable; + + ret.apic = None; *ret } + pub fn clear_dirty(&mut self) { + self.regs_dirty.set_all(false); + } + + pub fn vcpu_apicv_active(&self) -> bool { + self.lapic_in_kernel() && self.lapic().apicv_active + } + pub fn lapic_in_kernel(&self) -> bool { if x86_kvm_manager().has_noapic_vcpu { return self.apic.is_some(); @@ -128,6 +167,21 @@ impl X86VcpuArch { true } + pub fn is_bsp(&self) -> bool { + return self.apic_base & IA32_APIC_BASE as u64 != 0; + } + + #[inline] + pub fn lapic(&self) -> &KvmLapic { + self.apic.as_ref().unwrap() + } + + pub fn queue_interrupt(&mut self, vec: u8, soft: bool) { + self.interrupt.injected = true; + self.interrupt.soft = soft; + self.interrupt.nr = vec; + } + pub fn read_cr0_bits(&mut self, mask: Cr0) -> Cr0 { let tmask = mask & (Cr0::CR0_TASK_SWITCHED | Cr0::CR0_WRITE_PROTECT); if tmask.contains(self.cr0_guest_owned_bits) @@ -165,6 +219,14 @@ impl X86VcpuArch { return self.cr4 & mask; } + pub fn get_cr8(&self) -> u64 { + if self.lapic_in_kernel() { + todo!() + } else { + return self.cr8; + } + } + #[inline] pub fn is_smm(&self) -> bool { self.hflags.contains(HFlags::HF_SMM_MASK) @@ -175,6 +237,16 @@ impl X86VcpuArch { self.hflags.contains(HFlags::HF_GUEST_MASK) } + #[inline] + pub fn is_long_mode(&self) -> bool { + self.efer.contains(EferFlags::LONG_MODE_ACTIVE) + } + + #[inline] + pub fn is_portected_mode(&mut self) -> bool { + !self.read_cr0_bits(Cr0::CR0_PROTECTED_MODE).is_empty() + } + #[inline] fn clear_interrupt_queue(&mut self) { self.interrupt.injected = false; @@ -187,40 +259,6 @@ impl X86VcpuArch { self.exception_vmexit.pending = false; } - pub fn set_msr(&mut self, index: u32, data: u64, host_initiated: bool) { - match index { - IA32_FS_BASE | IA32_GS_BASE | IA32_KERNEL_GSBASE | IA32_CSTAR | IA32_LSTAR => { - if VirtAddr::new(data as usize).is_canonical() { - return; - } - } - - IA32_SYSENTER_EIP | IA32_SYSENTER_ESP => { - // 需要将Data转为合法地址,但是现在先这样写 - assert!(VirtAddr::new(data as usize).is_canonical()); - } - IA32_TSC_AUX => { - if 
x86_kvm_manager() - .find_user_return_msr_idx(IA32_TSC_AUX) - .is_none() - { - return; - } - - todo!() - } - _ => {} - } - - let msr_data = MsrData { - host_initiated, - index, - data, - }; - - return kvm_arch_ops().set_msr(self, msr_data); - } - pub fn update_cpuid_runtime(&mut self, entries: &Vec) { let cpuid = CpuId::new(); let feat = cpuid.get_feature_info().unwrap(); @@ -233,29 +271,44 @@ impl X86VcpuArch { } #[inline] - fn mark_register_dirty(&mut self, reg: KvmReg) { + pub fn mark_register_dirty(&mut self, reg: KvmReg) { self.regs_avail.set(reg as usize, true); self.regs_dirty.set(reg as usize, true); } #[inline] - fn write_reg(&mut self, reg: KvmReg, data: u64) { + pub fn mark_register_available(&mut self, reg: KvmReg) { + self.regs_avail.set(reg as usize, true); + } + + #[inline] + pub fn is_register_dirty(&self, reg: KvmReg) -> bool { + self.regs_dirty.get(reg as usize).unwrap() + } + + #[inline] + pub fn is_register_available(&self, reg: KvmReg) -> bool { + self.regs_avail.get(reg as usize).unwrap() + } + + #[inline] + pub fn write_reg(&mut self, reg: KvmReg, data: u64) { self.regs[reg as usize] = data; } #[inline] - fn write_reg_raw(&mut self, reg: KvmReg, data: u64) { + pub fn write_reg_raw(&mut self, reg: KvmReg, data: u64) { self.regs[reg as usize] = data; self.mark_register_dirty(reg); } #[inline] - fn read_reg(&self, reg: KvmReg) -> u64 { + pub fn read_reg(&self, reg: KvmReg) -> u64 { return self.regs[reg as usize]; } #[inline] - fn read_reg_raw(&self, reg: KvmReg) -> u64 { + pub fn read_reg_raw(&mut self, reg: KvmReg) -> u64 { if self.regs_avail.get(reg as usize) == Some(true) { kvm_arch_ops().cache_reg(self, reg); } @@ -270,10 +323,236 @@ impl X86VcpuArch { } return self.read_reg_raw(KvmReg::VcpuRegsRip); } + + pub fn set_msr_common(&mut self, msr_info: &MsrData) { + let msr = msr_info.index; + let data = msr_info.data; + + match msr { + // MSR_AMD64_NB_CFG + 0xc001001f => { + return; + } + // MSR_VM_HSAVE_PA + 0xc0010117 => { + return; + } + // MSR_AMD64_PATCH_LOADER + 0xc0010020 => { + return; + } + // MSR_AMD64_BU_CFG2 + 0xc001102a => { + return; + } + // MSR_AMD64_DC_CFG + 0xc0011022 => { + return; + } + // MSR_AMD64_TW_CFG + 0xc0011023 => { + return; + } + // MSR_F15H_EX_CFG + 0xc001102c => { + return; + } + msr::IA32_BIOS_UPDT_TRIG => { + return; + } + msr::IA32_BIOS_SIGN_ID => { + // MSR_IA32_UCODE_REV + if msr_info.host_initiated { + self.microcode_version = data; + } + return; + } + // MSR_IA32_ARCH_CAPABILITIES + 0x0000010a => { + if !msr_info.host_initiated { + return; + } + + self.arch_capabilities = data; + } + msr::MSR_PERF_CAPABILITIES => { + if !msr_info.host_initiated { + return; + } + + if data & (!x86_kvm_manager().kvm_caps.supported_perf_cap) != 0 { + return; + } + + if self.perf_capabilities == data { + return; + } + + self.perf_capabilities = data; + // todo: kvm_pmu_refresh + return; + } + // MSR_IA32_FLUSH_CMD + 0x0000010b => { + todo!() + } + msr::IA32_EFER => { + todo!() + } + // MSR_K7_HWCR + 0xc0010015 => { + todo!() + } + // MSR_FAM10H_MMIO_CONF_BASE + 0xc0010058 => { + todo!() + } + msr::IA32_PAT => { + todo!() + } + // MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000 | MSR_MTRRdefType + 0x200..=0x26f | 0x2ff => { + todo!() + } + msr::APIC_BASE => { + todo!() + } + // APIC_BASE_MSR ... 
APIC_BASE_MSR + 0xff + 0x800..=0x8ff => { + todo!() + } + msr::IA32_TSC_DEADLINE => { + todo!() + } + msr::IA32_TSC_ADJUST => { + todo!() + } + msr::IA32_MISC_ENABLE => { + todo!() + } + msr::IA32_SMBASE => { + todo!() + } + msr::TSC => { + todo!() + } + // MSR_IA32_XSS + msr::MSR_C5_PMON_BOX_CTRL => { + if !msr_info.host_initiated { + return; + } + if data & (!x86_kvm_manager().kvm_caps.supported_xss) != 0 { + return; + } + + self.ia32_xss = data; + // TODO:kvm_update_cpuid_runtime + return; + } + msr::MSR_SMI_COUNT => { + todo!() + } + kvm_msr::MSR_KVM_WALL_CLOCK_NEW => { + todo!() + } + kvm_msr::MSR_KVM_WALL_CLOCK => { + todo!() + } + kvm_msr::MSR_KVM_SYSTEM_TIME => { + todo!() + } + kvm_msr::MSR_KVM_ASYNC_PF_EN => { + todo!() + } + kvm_msr::MSR_KVM_ASYNC_PF_INT => { + todo!() + } + kvm_msr::MSR_KVM_ASYNC_PF_ACK => { + todo!() + } + kvm_msr::MSR_KVM_STEAL_TIME => { + todo!() + } + kvm_msr::MSR_KVM_PV_EOI_EN => { + todo!() + } + kvm_msr::MSR_KVM_POLL_CONTROL => { + todo!() + } + msr::MCG_CTL + | msr::MCG_STATUS + | msr::MC0_CTL..=msr::MSR_MC26_MISC + | msr::IA32_MC0_CTL2..=msr::IA32_MC21_CTL2 => { + todo!() + } + // MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3 + // MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3 + // MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3 + // MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1 + 0xc0010004..=0xc0010007 + | 0xc1..=0xc2 + | 0xc0010000..=0xc0010003 + | 0x00000186..=0x00000187 => { + todo!() + } + + // MSR_K7_CLK_CTL + 0xc001001b => { + return; + } + + hyperv::HV_X64_MSR_GUEST_OS_ID..=hyperv::HV_REGISTER_SINT15 + | hyperv::HV_X64_MSR_SYNDBG_CONTROL..=hyperv::HV_X64_MSR_SYNDBG_PENDING_BUFFER + | hyperv::HV_X64_MSR_SYNDBG_OPTIONS + | hyperv::HV_REGISTER_CRASH_P0..=hyperv::HV_REGISTER_CRASH_P4 + | hyperv::HV_REGISTER_CRASH_CTL + | hyperv::HV_REGISTER_STIMER0_CONFIG..=hyperv::HV_REGISTER_STIMER3_COUNT + | hyperv::HV_X64_MSR_REENLIGHTENMENT_CONTROL + | hyperv::HV_X64_MSR_TSC_EMULATION_CONTROL + | hyperv::HV_X64_MSR_TSC_EMULATION_STATUS + | hyperv::HV_X64_MSR_TSC_INVARIANT_CONTROL => { + todo!() + } + + msr::MSR_BBL_CR_CTL3 => { + todo!() + } + + // MSR_AMD64_OSVW_ID_LENGTH + 0xc0010140 => { + todo!() + } + // MSR_AMD64_OSVW_STATUS + 0xc0010141 => { + todo!() + } + + msr::MSR_PLATFORM_INFO => { + todo!() + } + // MSR_MISC_FEATURES_ENABLES + 0x00000140 => { + todo!() + } + // MSR_IA32_XFD + 0x000001c4 => { + todo!() + } + // MSR_IA32_XFD_ERR + 0x000001c5 => { + todo!() + } + _ => { + todo!() + } + } + } } impl VirtCpu { - pub fn init_arch(&mut self, vm: &Vm) { + pub fn init_arch(&mut self, vm: &mut Vm, id: usize) -> Result<(), SystemError> { + vm.vcpu_precreate(id)?; + self.arch.last_vmentry_cpu = ProcessorId::INVALID; self.arch.regs_avail.set_all(true); self.arch.regs_dirty.set_all(true); @@ -295,12 +574,31 @@ impl VirtCpu { x86_kvm_ops().vcpu_create(self, vm); self.load(); - self.vcpu_reset(false); + self.vcpu_reset(vm, false)?; self.arch.kvm_init_mmu(); + + Ok(()) + } + + pub fn kvm_run(&self) -> &Box { + self.run.as_ref().unwrap() } pub fn run(&mut self) -> Result { self.load(); + + if unlikely(self.arch.mp_state == MutilProcessorState::Uninitialized) { + todo!() + } + + let kvm_run = self.kvm_run(); + + if kvm_run.kvm_valid_regs & !KVM_SYNC_X86_VALID_FIELDS != 0 + || kvm_run.kvm_dirty_regs & !KVM_SYNC_X86_VALID_FIELDS != 0 + { + return Err(SystemError::EINVAL); + } + todo!() } @@ -331,15 +629,53 @@ impl VirtCpu { self.request.insert(req); } - pub fn vcpu_reset(&mut self, init_event: bool) { + pub fn set_msr( + &mut self, + index: u32, + data: u64, + host_initiated: bool, + ) -> Result<(), 
SystemError> { + match index { + IA32_FS_BASE | IA32_GS_BASE | IA32_KERNEL_GSBASE | IA32_CSTAR | IA32_LSTAR => { + if VirtAddr::new(data as usize).is_canonical() { + return Ok(()); + } + } + + IA32_SYSENTER_EIP | IA32_SYSENTER_ESP => { + // 需要将Data转为合法地址,但是现在先这样写 + assert!(VirtAddr::new(data as usize).is_canonical()); + } + IA32_TSC_AUX => { + if x86_kvm_manager() + .find_user_return_msr_idx(IA32_TSC_AUX) + .is_none() + { + return Ok(()); + } + + todo!() + } + _ => {} + } + + let msr_data = MsrData { + host_initiated, + index, + data, + }; + + return kvm_arch_ops().set_msr(self, msr_data); + } + + pub fn vcpu_reset(&mut self, vm: &Vm, init_event: bool) -> Result<(), SystemError> { let old_cr0 = self.arch.read_cr0_bits(Cr0::all()); if self.arch.is_guest_mode() { todo!() } - // :TODO - // self.lapic_reset(init_event); + self.lapic_reset(vm, init_event); self.arch.hflags = HFlags::empty(); @@ -395,7 +731,7 @@ impl VirtCpu { // TODO: __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); // 0xda0: MSR_IA32_XSS - self.arch.set_msr(0xda0, 0, true); + self.set_msr(0xda0, 0, true)?; } for reg in &mut self.arch.regs { @@ -412,7 +748,7 @@ impl VirtCpu { }; self.arch.write_reg(KvmReg::VcpuRegsRdx, val as u64); - kvm_arch_ops().vcpu_reset(self, init_event); + kvm_arch_ops().vcpu_reset(self, vm, init_event); self.set_rflags(RFlags::FLAGS_A1); self.arch.write_reg_raw(KvmReg::VcpuRegsRip, 0xfff0); @@ -427,7 +763,7 @@ impl VirtCpu { new_cr0.insert(Cr0::CR0_NOT_WRITE_THROUGH | Cr0::CR0_CACHE_DISABLE); } - kvm_arch_ops().set_cr0(self, new_cr0); + kvm_arch_ops().set_cr0(vm, self, new_cr0); kvm_arch_ops().set_cr4(self, Cr4::empty()); kvm_arch_ops().set_efer(self, EferFlags::empty()); kvm_arch_ops().update_exception_bitmap(self); @@ -440,6 +776,8 @@ impl VirtCpu { if init_event { self.request(VirCpuRequest::KVM_REQ_TLB_FLUSH_GUEST); } + + Ok(()) } fn set_rflags(&mut self, rflags: RFlags) { @@ -457,7 +795,7 @@ impl VirtCpu { kvm_arch_ops().set_rflags(self, rflags); } - fn get_rflags(&self) -> RFlags { + fn get_rflags(&mut self) -> RFlags { let mut rflags = kvm_arch_ops().get_rflags(self); if self.guest_debug.contains(GuestDebug::SINGLESTEP) { rflags.insert(RFlags::FLAGS_TF); @@ -474,7 +812,7 @@ impl VirtCpu { return self._get_regs(); } - fn _get_regs(&self) -> KvmCommonRegs { + fn _get_regs(&mut self) -> KvmCommonRegs { KvmCommonRegs { rax: self.arch.read_reg(KvmReg::VcpuRegsRax), rbx: self.arch.read_reg(KvmReg::VcpuRegsRbx), @@ -496,6 +834,323 @@ impl VirtCpu { rflags: self.get_rflags().bits(), } } + + pub fn get_segment_regs(&mut self) -> UapiKvmSegmentRegs { + self.load(); + return self._get_segment_regs(); + } + + fn _get_segment_regs(&mut self) -> UapiKvmSegmentRegs { + let mut sregs = self._get_segment_regs_common(); + + if self.arch.guest_state_protected { + return sregs; + } + + if self.arch.interrupt.injected && !self.arch.interrupt.soft { + BitMapCore::new().set( + sregs.interrupt_bitmap.len() * core::mem::size_of::(), + &mut sregs.interrupt_bitmap, + self.arch.interrupt.nr as usize, + true, + ); + } + + return sregs; + } + + fn read_cr3(&mut self) -> u64 { + if !self.arch.is_register_available(KvmReg::VcpuExregCr3) { + x86_kvm_ops().cache_reg(&mut self.arch, KvmReg::VcpuExregCr3); + } + + return self.arch.cr3; + } + + fn kvm_get_segment(&mut self, segment: &mut UapiKvmSegment, seg: VcpuSegment) { + *segment = x86_kvm_ops().get_segment(self, *segment, seg); + } + + fn _get_segment_regs_common(&mut self) -> UapiKvmSegmentRegs { + let mut sregs = UapiKvmSegmentRegs::default(); + + if !self.arch.guest_state_protected 
{ + let mut dt = DescriptorTablePointer::default(); + + self.kvm_get_segment(&mut sregs.cs, VcpuSegment::CS); + self.kvm_get_segment(&mut sregs.ds, VcpuSegment::DS); + self.kvm_get_segment(&mut sregs.es, VcpuSegment::ES); + self.kvm_get_segment(&mut sregs.fs, VcpuSegment::FS); + self.kvm_get_segment(&mut sregs.gs, VcpuSegment::GS); + self.kvm_get_segment(&mut sregs.ss, VcpuSegment::SS); + + self.kvm_get_segment(&mut sregs.tr, VcpuSegment::TR); + self.kvm_get_segment(&mut sregs.ldt, VcpuSegment::LDTR); + + x86_kvm_ops().get_idt(self, &mut dt); + sregs.idt.limit = dt.limit; + sregs.idt.base = dt.base as usize as u64; + + x86_kvm_ops().get_gdt(self, &mut dt); + sregs.gdt.limit = dt.limit; + sregs.gdt.base = dt.base as usize as u64; + + sregs.cr2 = self.arch.cr2; + sregs.cr3 = self.read_cr3(); + } + + sregs.cr0 = self.arch.read_cr0_bits(Cr0::all()).bits() as u64; + sregs.cr4 = self.arch.read_cr4_bits(Cr4::all()).bits() as u64; + sregs.cr8 = self.arch.get_cr8(); + sregs.efer = self.arch.efer.bits(); + sregs.apic_base = self.arch.apic_base; + + return sregs; + } + + pub fn set_segment_regs(&mut self, sregs: &mut UapiKvmSegmentRegs) -> Result<(), SystemError> { + self.load(); + kdebug!("set_segment_regs sregs{sregs:?}"); + self._set_segmenet_regs(&self.kvm().lock(), sregs)?; + Ok(()) + } + + fn _set_segmenet_regs( + &mut self, + vm: &Vm, + sregs: &mut UapiKvmSegmentRegs, + ) -> Result<(), SystemError> { + let mut mmu_reset_needed = false; + self._set_segmenet_regs_common(vm, sregs, &mut mmu_reset_needed, true)?; + + if mmu_reset_needed { + todo!() + } + + // KVM_NR_INTERRUPTS + let max_bits = 256; + + let pending_vec = BitMapCore::new().first_index(&sregs.interrupt_bitmap); + if let Some(pending) = pending_vec { + if pending < max_bits { + self.arch.queue_interrupt(pending as u8, false); + + self.request(VirCpuRequest::KVM_REQ_EVENT); + } + } + + Ok(()) + } + + /// 设置段寄存器 + fn _set_segmenet_regs_common( + &mut self, + vm: &Vm, + sregs: &mut UapiKvmSegmentRegs, + mmu_reset_needed: &mut bool, + update_pdptrs: bool, + ) -> Result<(), SystemError> { + let mut apic_base_msr = MsrData::default(); + + if !self.is_valid_segment_regs(sregs) { + return Err(SystemError::EINVAL); + } + + apic_base_msr.data = sregs.apic_base; + apic_base_msr.host_initiated = true; + + // TODO: kvm_set_apic_base + + if self.arch.guest_state_protected { + return Ok(()); + } + + let mut dt: DescriptorTablePointer = DescriptorTablePointer::default(); + + dt.limit = sregs.idt.limit; + dt.base = sregs.idt.base as usize as *const u8; + x86_kvm_ops().set_idt(self, &dt); + + dt.limit = sregs.gdt.limit; + dt.base = sregs.gdt.base as usize as *const u8; + x86_kvm_ops().set_gdt(self, &dt); + + self.arch.cr2 = sregs.cr2; + *mmu_reset_needed |= self.read_cr3() != sregs.cr3; + + self.arch.cr3 = sregs.cr3; + + self.arch.mark_register_dirty(KvmReg::VcpuExregCr3); + + x86_kvm_ops().post_set_cr3(&self, sregs.cr3); + + self.kvm_set_cr8(sregs.cr8); + + let efer = EferFlags::from_bits_truncate(sregs.efer); + *mmu_reset_needed |= self.arch.efer != efer; + x86_kvm_ops().set_efer(self, efer); + + let cr0 = Cr0::from_bits_truncate(sregs.cr0 as usize); + *mmu_reset_needed |= self.arch.cr0 != cr0; + x86_kvm_ops().set_cr0(vm, self, cr0); + self.arch.cr0 = cr0; + + let cr4 = Cr4::from_bits_truncate(sregs.cr4 as usize); + *mmu_reset_needed |= self.arch.read_cr4_bits(Cr4::all()) != cr4; + x86_kvm_ops().set_cr4(self, cr4); + + if update_pdptrs { + todo!() + } + + x86_kvm_ops().set_segment(self, &mut sregs.cs, VcpuSegment::CS); + 
x86_kvm_ops().set_segment(self, &mut sregs.ds, VcpuSegment::DS); + x86_kvm_ops().set_segment(self, &mut sregs.es, VcpuSegment::ES); + x86_kvm_ops().set_segment(self, &mut sregs.fs, VcpuSegment::FS); + x86_kvm_ops().set_segment(self, &mut sregs.gs, VcpuSegment::GS); + x86_kvm_ops().set_segment(self, &mut sregs.ss, VcpuSegment::SS); + + x86_kvm_ops().set_segment(self, &mut sregs.tr, VcpuSegment::TR); + x86_kvm_ops().set_segment(self, &mut sregs.ldt, VcpuSegment::LDTR); + + // TODO: update_cr8_intercept + + if self.arch.is_bsp() + && self.arch.read_reg_raw(KvmReg::VcpuRegsRip) == 0xfff0 + && sregs.cs.selector == 0xf000 + && sregs.cs.base == 0xffff0000 + && !self.arch.is_portected_mode() + { + self.arch.mp_state = MutilProcessorState::Runnable; + } + + Ok(()) + } + + pub fn kvm_set_cr8(&mut self, cr8: u64) { + // 先这样写 + self.arch.cr8 = cr8; + } + + fn is_valid_segment_regs(&self, sregs: &UapiKvmSegmentRegs) -> bool { + let efer = EferFlags::from_bits_truncate(sregs.efer); + let cr4 = Cr4::from_bits_truncate(sregs.cr4 as usize); + let cr0 = Cr0::from_bits_truncate(sregs.cr0 as usize); + + if efer.contains(EferFlags::LONG_MODE_ENABLE) && cr0.contains(Cr0::CR0_ENABLE_PAGING) { + if !cr4.contains(Cr4::CR4_ENABLE_PAE) || !efer.contains(EferFlags::LONG_MODE_ACTIVE) { + return false; + } + + // TODO: legal gpa? + } else { + if efer.contains(EferFlags::LONG_MODE_ACTIVE) || sregs.cs.l != 0 { + return false; + } + } + + return self.kvm_is_vaild_cr0(cr0) && self.kvm_is_vaild_cr4(cr4); + } + + fn kvm_is_vaild_cr0(&self, cr0: Cr0) -> bool { + if cr0.contains(Cr0::CR0_NOT_WRITE_THROUGH) && !cr0.contains(Cr0::CR0_CACHE_DISABLE) { + return false; + } + + if cr0.contains(Cr0::CR0_ENABLE_PAGING) && !cr0.contains(Cr0::CR0_PROTECTED_MODE) { + return false; + } + + return x86_kvm_ops().is_vaild_cr0(self, cr0); + } + + fn __kvm_is_valid_cr4(&self, cr4: Cr4) -> bool { + if cr4.contains(self.arch.cr4_guest_rsvd_bits) { + return false; + } + + return true; + } + + fn kvm_is_vaild_cr4(&self, cr4: Cr4) -> bool { + return self.__kvm_is_valid_cr4(cr4) && x86_kvm_ops().is_vaild_cr4(self, cr4); + } + + pub fn is_unrestricted_guest(&self) -> bool { + let guard = self.vmx().loaded_vmcs(); + return vmx_info().enable_unrestricted_guest + && (!self.arch.is_guest_mode() + || SecondaryControls::from_bits_truncate( + guard.controls_get(ControlsType::SecondaryExec) as u32, + ) + .contains(SecondaryControls::UNRESTRICTED_GUEST)); + } + + pub fn set_regs(&mut self, regs: &KvmCommonRegs) -> Result<(), SystemError> { + self.load(); + self._set_regs(regs); + Ok(()) + } + + fn _set_regs(&mut self, regs: &KvmCommonRegs) { + self.arch.emulate_regs_need_sync_from_vcpu = true; + self.arch.emulate_regs_need_sync_to_vcpu = false; + + self.arch.write_reg(KvmReg::VcpuRegsRax, regs.rax); + self.arch.write_reg(KvmReg::VcpuRegsRbx, regs.rbx); + self.arch.write_reg(KvmReg::VcpuRegsRcx, regs.rcx); + self.arch.write_reg(KvmReg::VcpuRegsRdx, regs.rdx); + self.arch.write_reg(KvmReg::VcpuRegsRsi, regs.rsi); + self.arch.write_reg(KvmReg::VcpuRegsRdi, regs.rdi); + self.arch.write_reg(KvmReg::VcpuRegsRsp, regs.rsp); + self.arch.write_reg(KvmReg::VcpuRegsRbp, regs.rbp); + + self.arch.write_reg(KvmReg::VcpuRegsR8, regs.r8); + self.arch.write_reg(KvmReg::VcpuRegsR9, regs.r9); + self.arch.write_reg(KvmReg::VcpuRegsR10, regs.r10); + self.arch.write_reg(KvmReg::VcpuRegsR11, regs.r11); + self.arch.write_reg(KvmReg::VcpuRegsR12, regs.r12); + self.arch.write_reg(KvmReg::VcpuRegsR13, regs.r13); + self.arch.write_reg(KvmReg::VcpuRegsR14, regs.r14); + 
self.arch.write_reg(KvmReg::VcpuRegsR15, regs.r15); + + self.arch.write_reg_raw(KvmReg::VcpuRegsRip, regs.rip); + + self.set_rflags(RFlags::from_bits_truncate(regs.rflags) | RFlags::FLAGS_A1); + + self.arch.exception.pending = false; + self.arch.exception_vmexit.pending = false; + + self.request(VirCpuRequest::KVM_REQ_EVENT); + } + + pub fn load_guest_xsave_state(&mut self) { + if self.arch.guest_state_protected { + return; + } + + if !self.arch.read_cr4_bits(Cr4::CR4_ENABLE_OS_XSAVE).is_empty() { + if self.arch.xcr0 != x86_kvm_manager().host_xcr0 { + unsafe { _xsetbv(0, self.arch.xcr0.bits()) }; + } + + if self.arch.ia32_xss != x86_kvm_manager().host_xss { + // XSS + unsafe { wrmsr(0xda0, self.arch.ia32_xss) }; + } + } + + if CpuId::new().get_extended_feature_info().unwrap().has_pku() + && self.arch.pkru != self.arch.host_pkru + && (self.arch.xcr0.contains(Xcr0::XCR0_PKRU_STATE) + || !self + .arch + .read_cr4_bits(Cr4::CR4_ENABLE_PROTECTION_KEY) + .is_empty()) + { + KvmX86Asm::write_pkru(self.arch.pkru); + } + } } bitflags! { @@ -505,6 +1160,7 @@ bitflags! { const KVM_REQUEST_NO_ACTION = 1 << 2; const KVM_REQ_EVENT = 1 << 6; const KVM_REQ_STEAL_UPDATE = 1 << 8; + const KVM_REQ_APIC_PAGE_RELOAD = 1 << 17 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits; const KVM_REQ_TLB_FLUSH_GUEST = 1 << 27 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits; const KVM_REQ_TLB_FLUSH = 1 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits; } diff --git a/kernel/src/arch/x86_64/vm/mmu.rs b/kernel/src/arch/x86_64/vm/mmu.rs index 852f7cfe8..a0edddf4b 100644 --- a/kernel/src/arch/x86_64/vm/mmu.rs +++ b/kernel/src/arch/x86_64/vm/mmu.rs @@ -1,5 +1,6 @@ use alloc::{sync::Arc, vec::Vec}; use bitfield_struct::bitfield; +use raw_cpuid::CpuId; use x86::controlregs::{Cr0, Cr4}; use x86_64::registers::control::EferFlags; @@ -17,10 +18,24 @@ const PT32_ROOT_LEVEL: usize = 2; const PT32E_ROOT_LEVEL: usize = 3; static mut TDP_ENABLED: bool = false; +static mut TDP_MMU_ENABLED: bool = true; +static mut TDP_MMU_ALLOWED: bool = unsafe { TDP_MMU_ENABLED }; + static mut TDP_ROOT_LEVEL: usize = 0; static mut MAX_TDP_LEVEL: usize = 0; static mut SHADOW_ACCESSED_MASK: usize = 0; +static mut MAX_HUGE_PAGE_LEVEL: PageLevel = PageLevel::None; + +pub enum PageLevel { + None, + Level4k, + Level2M, + Level1G, + Level512G, + LevelNum, +} + #[derive(Debug)] pub struct LockedKvmMmu { inner: SpinLock, @@ -74,6 +89,34 @@ impl KvmMmu { pub fn ad_enabled() -> bool { unsafe { SHADOW_ACCESSED_MASK != 0 } } + + /// 初始化mmu的配置,因为其是无锁的,所以该函数只能在初始化vmx时调用 + pub fn kvm_configure_mmu( + enable_tdp: bool, + tdp_forced_root_level: usize, + tdp_max_root_level: usize, + tdp_huge_page_level: PageLevel, + ) { + unsafe { + TDP_ENABLED = enable_tdp; + TDP_ROOT_LEVEL = tdp_forced_root_level; + MAX_TDP_LEVEL = tdp_max_root_level; + + TDP_MMU_ENABLED = TDP_MMU_ALLOWED && TDP_ENABLED; + + if TDP_ENABLED { + MAX_HUGE_PAGE_LEVEL = tdp_huge_page_level; + } else if CpuId::new() + .get_extended_processor_and_feature_identifiers() + .unwrap() + .has_1gib_pages() + { + MAX_HUGE_PAGE_LEVEL = PageLevel::Level1G; + } else { + MAX_HUGE_PAGE_LEVEL = PageLevel::Level2M; + } + } + } } #[derive(Debug, Default)] diff --git a/kernel/src/arch/x86_64/vm/mod.rs b/kernel/src/arch/x86_64/vm/mod.rs index 87ad42fcd..c7996265e 100644 --- a/kernel/src/arch/x86_64/vm/mod.rs +++ b/kernel/src/arch/x86_64/vm/mod.rs @@ -1,8 +1,10 @@ +use core::arch::x86_64::{_xgetbv, _XCR_XFEATURE_ENABLED_MASK}; + use alloc::vec::Vec; use raw_cpuid::CpuId; use 
system_error::SystemError; use x86::{ - controlregs::Xcr0, + controlregs::{xcr0, Cr0, Cr4, Xcr0}, msr::{ rdmsr, IA32_BIOS_SIGN_ID, IA32_CSTAR, IA32_EFER, IA32_FEATURE_CONTROL, IA32_FMASK, IA32_KERNEL_GSBASE, IA32_LSTAR, IA32_MCG_CTL, IA32_MCG_STATUS, IA32_MISC_ENABLE, IA32_PAT, @@ -20,6 +22,10 @@ use x86::{ MSR_PLATFORM_INFO, MSR_POWER_CTL, MSR_SMI_COUNT, }, }; +use x86_64::registers::{ + control::{Efer, EferFlags}, + xcontrol::{XCr0, XCr0Flags}, +}; use crate::{ arch::vm::vmx::{VmxL1dFlushState, L1TF_VMX_MITIGATION}, @@ -29,7 +35,7 @@ use crate::{ }; use self::{ - asm::{hyperv::*, kvm_msr::*, ArchCapabilities, KvmMsrEntry}, + asm::{hyperv::*, kvm_msr::*, ArchCapabilities, VmxMsrEntry}, kvm_host::{KvmFunc, KvmInitFunc}, }; @@ -40,6 +46,7 @@ mod cpuid; pub mod kvm_host; pub mod mem; mod mmu; +pub mod uapi; pub mod vmx; static mut KVM_X86_MANAGER: Option = None; @@ -59,7 +66,7 @@ pub fn x86_kvm_manager_mut() -> &'static mut KvmArchManager { pub fn init_kvm_arch() { static ONCE: Once = Once::new(); ONCE.call_once(|| unsafe { - KVM_X86_MANAGER = Some(KvmArchManager::default()); + KVM_X86_MANAGER = Some(KvmArchManager::init()); let mut user_return_msrs = Vec::new(); user_return_msrs.resize(PerCpu::MAX_CPU_NUM as usize, KvmUserReturnMsrs::default()); @@ -68,11 +75,11 @@ pub fn init_kvm_arch() { } /// fixme:这些成员是否需要加锁呢?? -#[derive(Debug, Default)] +#[derive(Debug)] pub struct KvmArchManager { funcs: Option<&'static dyn KvmFunc>, - host_xcr0: u64, - host_efer: u64, + host_xcr0: Xcr0, + host_efer: EferFlags, host_xss: u64, host_arch_capabilities: u64, kvm_uret_msrs_list: Vec, @@ -85,9 +92,54 @@ pub struct KvmArchManager { has_noapic_vcpu: bool, enable_pmu: bool, + + // 只读 + possible_cr0_guest: Cr0, + possible_cr4_guest: Cr4, + cr4_tlbflush_bits: Cr4, + cr4_pdptr_bits: Cr4, } impl KvmArchManager { + pub fn init() -> Self { + Self { + possible_cr0_guest: Cr0::CR0_TASK_SWITCHED | Cr0::CR0_WRITE_PROTECT, + possible_cr4_guest: Cr4::CR4_VIRTUAL_INTERRUPTS + | Cr4::CR4_DEBUGGING_EXTENSIONS + | Cr4::CR4_ENABLE_PPMC + | Cr4::CR4_ENABLE_SSE + | Cr4::CR4_UNMASKED_SSE + | Cr4::CR4_ENABLE_GLOBAL_PAGES + | Cr4::CR4_TIME_STAMP_DISABLE + | Cr4::CR4_ENABLE_FSGSBASE, + + cr4_tlbflush_bits: Cr4::CR4_ENABLE_GLOBAL_PAGES + | Cr4::CR4_ENABLE_PCID + | Cr4::CR4_ENABLE_PAE + | Cr4::CR4_ENABLE_SMEP, + + cr4_pdptr_bits: Cr4::CR4_ENABLE_GLOBAL_PAGES + | Cr4::CR4_ENABLE_PSE + | Cr4::CR4_ENABLE_PAE + | Cr4::CR4_ENABLE_SMEP, + + host_xcr0: Xcr0::empty(), + + funcs: Default::default(), + host_efer: EferFlags::empty(), + host_xss: Default::default(), + host_arch_capabilities: Default::default(), + kvm_uret_msrs_list: Default::default(), + kvm_caps: Default::default(), + max_tsc_khz: Default::default(), + msrs_to_save: Default::default(), + emulated_msrs: Default::default(), + msr_based_features: Default::default(), + has_noapic_vcpu: Default::default(), + enable_pmu: Default::default(), + } + } + #[inline] pub fn set_runtime_func(&mut self, funcs: &'static dyn KvmFunc) { self.funcs = Some(funcs); @@ -108,6 +160,11 @@ impl KvmArchManager { None } + pub fn mpx_supported(&self) -> bool { + self.kvm_caps.supported_xcr0 & (Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDCSR_STATE) + == (Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDREG_STATE) + } + pub const KVM_MAX_VCPUS: usize = 1024; pub const KVM_MAX_NR_USER_RETURN_MSRS: usize = 7; @@ -290,11 +347,11 @@ impl KvmArchManager { // TODO:mmu vendor init if cpu_feature.has_xsave() { - // fixme:这里会UD,后续再修 - // self.host_xcr0 = unsafe { _xgetbv(_XCR_XFEATURE_ENABLED_MASK) }; + self.host_xcr0 = unsafe { 
xcr0() }; + self.kvm_caps.supported_xcr0 = self.host_xcr0; } // 保存efer - self.host_efer = unsafe { rdmsr(IA32_EFER) }; + self.host_efer = Efer::read(); // 保存xss if cpu_extend.has_xsaves_xrstors() { @@ -431,7 +488,7 @@ impl KvmArchManager { } fn kvm_prove_feature_msr(&mut self, index: u32) { - let mut msr = KvmMsrEntry { + let mut msr = VmxMsrEntry { index, reserved: Default::default(), data: Default::default(), @@ -444,7 +501,7 @@ impl KvmArchManager { self.msr_based_features.push(index); } - fn get_msr_feature(&self, msr: &mut KvmMsrEntry) -> bool { + fn get_msr_feature(&self, msr: &mut VmxMsrEntry) -> bool { match msr.index { 0x10a => { // MSR_IA32_ARCH_CAPABILITIES, diff --git a/kernel/src/arch/x86_64/vm/uapi.rs b/kernel/src/arch/x86_64/vm/uapi.rs new file mode 100644 index 000000000..e22b02b76 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/uapi.rs @@ -0,0 +1,58 @@ +use crate::virt::vm::user_api::UapiKvmSegment; + +pub const DE_VECTOR: usize = 0; +pub const DB_VECTOR: usize = 1; +pub const BP_VECTOR: usize = 3; +pub const OF_VECTOR: usize = 4; +pub const BR_VECTOR: usize = 5; +pub const UD_VECTOR: usize = 6; +pub const NM_VECTOR: usize = 7; +pub const DF_VECTOR: usize = 8; +pub const TS_VECTOR: usize = 10; +pub const NP_VECTOR: usize = 11; +pub const SS_VECTOR: usize = 12; +pub const GP_VECTOR: usize = 13; +pub const PF_VECTOR: usize = 14; +pub const MF_VECTOR: usize = 16; +pub const AC_VECTOR: usize = 17; +pub const MC_VECTOR: usize = 18; +pub const XM_VECTOR: usize = 19; +pub const VE_VECTOR: usize = 20; + +pub const KVM_SYNC_X86_REGS: u64 = 1 << 0; +pub const KVM_SYNC_X86_SREGS: u64 = 1 << 1; +pub const KVM_SYNC_X86_EVENTS: u64 = 1 << 2; + +pub const KVM_SYNC_X86_VALID_FIELDS: u64 = + KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS | KVM_SYNC_X86_EVENTS; + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmSegmentRegs { + pub cs: UapiKvmSegment, + pub ds: UapiKvmSegment, + pub es: UapiKvmSegment, + pub fs: UapiKvmSegment, + pub gs: UapiKvmSegment, + pub ss: UapiKvmSegment, + pub tr: UapiKvmSegment, + pub ldt: UapiKvmSegment, + pub gdt: UapiKvmDtable, + pub idt: UapiKvmDtable, + pub cr0: u64, + pub cr2: u64, + pub cr3: u64, + pub cr4: u64, + pub cr8: u64, + pub efer: u64, + pub apic_base: u64, + pub interrupt_bitmap: [u64; 4usize], +} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmDtable { + pub base: u64, + pub limit: u16, + pub padding: [u16; 3usize], +} diff --git a/kernel/src/arch/x86_64/vm/vmx/capabilities.rs b/kernel/src/arch/x86_64/vm/vmx/capabilities.rs index 2cc81520d..71c947f24 100644 --- a/kernel/src/arch/x86_64/vm/vmx/capabilities.rs +++ b/kernel/src/arch/x86_64/vm/vmx/capabilities.rs @@ -1,3 +1,4 @@ +use raw_cpuid::CpuId; use x86::{ msr::{ IA32_VMX_BASIC, IA32_VMX_CR0_FIXED0, IA32_VMX_CR0_FIXED1, IA32_VMX_CR4_FIXED0, @@ -11,9 +12,12 @@ use x86::{ }, }; -use crate::arch::vm::{ - CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR, PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR, - VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR, VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR, +use crate::{ + arch::vm::{ + mmu::PageLevel, CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR, PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR, + VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR, VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR, + }, + virt::vm::kvm_host::vcpu::VirtCpu, }; use super::{vmcs::feat::VmxFeat, Vmx}; @@ -181,7 +185,7 @@ pub struct VmxCapability { pub vpid: VpidFlag, } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum ProcessorTraceMode { System, HostGuest, @@ -368,6 +372,30 @@ impl Vmx { return 
self.vmx_cap.ept.contains(EptFlag::EPT_PAGE_WALK_4); } + /// 是否支持5级页表 + #[inline] + pub fn has_ept_5levels(&self) -> bool { + return self.vmx_cap.ept.contains(EptFlag::EPT_PAGE_WALK_5); + } + + pub fn get_max_ept_level(&self) -> usize { + if self.has_ept_5levels() { + return 5; + } + return 4; + } + + pub fn ept_cap_to_lpage_level(&self) -> PageLevel { + if self.vmx_cap.ept.contains(EptFlag::EPT_1GB_PAGE) { + return PageLevel::Level1G; + } + if self.vmx_cap.ept.contains(EptFlag::EPT_2MB_PAGE) { + return PageLevel::Level2M; + } + + return PageLevel::Level4k; + } + /// 判断mt(Memory type)是否为write back #[inline] pub fn has_ept_mt_wb(&self) -> bool { @@ -497,6 +525,50 @@ impl Vmx { .contains(PrimaryControls::USE_MSR_BITMAPS); } + #[inline] + pub fn has_sceondary_exec_ctrls(&self) -> bool { + self.vmcs_config + .cpu_based_exec_ctrl + .contains(PrimaryControls::SECONDARY_CONTROLS) + } + + #[inline] + pub fn has_rdtscp(&self) -> bool { + self.vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENABLE_RDTSCP) + } + + #[inline] + pub fn has_vmfunc(&self) -> bool { + self.vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENABLE_VM_FUNCTIONS) + } + + #[inline] + pub fn has_xsaves(&self) -> bool { + self.vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::ENABLE_XSAVES_XRSTORS) + } + + #[inline] + pub fn vmx_umip_emulated(&self) -> bool { + let feat = CpuId::new().get_extended_feature_info().unwrap().has_umip(); + + return !feat + && (self + .vmcs_config + .cpu_based_2nd_exec_ctrl + .contains(SecondaryControls::DTABLE_EXITING)); + } + + #[inline] + pub fn has_tertiary_exec_ctrls(&self) -> bool { + false + } + #[inline] pub fn has_bus_lock_detection(&self) -> bool { false @@ -506,4 +578,10 @@ impl Vmx { pub fn has_notify_vmexit(&self) -> bool { false } + + /// 是否需要拦截页面故障 + #[inline] + pub fn vmx_need_pf_intercept(&self, _vcpu: &VirtCpu) -> bool { + true + } } diff --git a/kernel/src/arch/x86_64/vm/vmx/mod.rs b/kernel/src/arch/x86_64/vm/vmx/mod.rs index b6c4b4349..fe6b1917e 100644 --- a/kernel/src/arch/x86_64/vm/vmx/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/mod.rs @@ -1,8 +1,12 @@ -use core::{ - mem::MaybeUninit, - sync::atomic::{AtomicBool, Ordering}, -}; +use core::intrinsics::unlikely; +use core::sync::atomic::{AtomicBool, Ordering}; +use crate::arch::vm::mmu::KvmMmu; +use crate::arch::vm::uapi::{ + AC_VECTOR, BP_VECTOR, DB_VECTOR, GP_VECTOR, MC_VECTOR, NM_VECTOR, PF_VECTOR, UD_VECTOR, +}; +use crate::libs::spinlock::SpinLockGuard; +use crate::virt::vm::kvm_host::vcpu::GuestDebug; use crate::{ arch::{ vm::{ @@ -10,45 +14,50 @@ use crate::{ kvm_host::{vcpu::VirCpuRequest, X86KvmArch}, vmx::vmcs::vmx_area, }, - CurrentIrqArch, VirtCpuArch, + CurrentIrqArch, MMArch, VirtCpuArch, }, exception::InterruptArch, kdebug, - libs::{once::Once, spinlock::SpinLock}, + libs::spinlock::SpinLock, mm::{ percpu::{PerCpu, PerCpuVar}, - virt_2_phys, PhysAddr, + virt_2_phys, MemoryManagementArch, PhysAddr, }, smp::{core::smp_get_processor_id, cpu::ProcessorId}, - virt::vm::{kvm_dev::kvm_init, kvm_host::vcpu::VirtCpu}, + virt::vm::{kvm_dev::kvm_init, kvm_host::vcpu::VirtCpu, user_api::UapiKvmSegment}, }; use alloc::{alloc::Global, boxed::Box, collections::LinkedList, sync::Arc, vec::Vec}; +use bitfield_struct::bitfield; use bitmap::{traits::BitMapOps, AllocBitmap, StaticBitmap}; use raw_cpuid::CpuId; use system_error::SystemError; +use x86::controlregs::{cr2, cr2_write}; +use x86::irq::PageFaultError; +use x86::msr::wrmsr; use x86::{ - controlregs::Xcr0, + 
bits64::rflags::RFlags, + controlregs::{cr0, cr3, cr4, Cr0, Cr4, Xcr0}, msr::{ - rdmsr, IA32_CSTAR, IA32_EFER, IA32_FMASK, IA32_FS_BASE, IA32_GS_BASE, IA32_KERNEL_GSBASE, - IA32_LSTAR, IA32_SMBASE, IA32_STAR, IA32_SYSENTER_CS, IA32_SYSENTER_EIP, IA32_SYSENTER_ESP, - IA32_TIME_STAMP_COUNTER, IA32_TSC_AUX, IA32_VMX_BASIC, IA32_VMX_CR0_FIXED0, - IA32_VMX_CR0_FIXED1, IA32_VMX_CR4_FIXED0, IA32_VMX_CR4_FIXED1, IA32_VMX_ENTRY_CTLS, - IA32_VMX_EPT_VPID_CAP, IA32_VMX_EXIT_CTLS, IA32_VMX_MISC, IA32_VMX_PINBASED_CTLS, - IA32_VMX_PROCBASED_CTLS, IA32_VMX_PROCBASED_CTLS2, IA32_VMX_TRUE_ENTRY_CTLS, - IA32_VMX_TRUE_EXIT_CTLS, IA32_VMX_TRUE_PINBASED_CTLS, IA32_VMX_TRUE_PROCBASED_CTLS, - IA32_VMX_VMCS_ENUM, IA32_VMX_VMFUNC, MSR_CORE_C1_RESIDENCY, MSR_CORE_C3_RESIDENCY, - MSR_CORE_C6_RESIDENCY, MSR_CORE_C7_RESIDENCY, MSR_IA32_ADDR0_START, MSR_IA32_ADDR3_END, - MSR_IA32_CR3_MATCH, MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK_PTRS, - MSR_IA32_RTIT_STATUS, MSR_IA32_TSX_CTRL, MSR_LASTBRANCH_TOS, MSR_LBR_SELECT, + self, rdmsr, IA32_CSTAR, IA32_EFER, IA32_FMASK, IA32_FS_BASE, IA32_GS_BASE, + IA32_KERNEL_GSBASE, IA32_LSTAR, IA32_SMBASE, IA32_STAR, IA32_SYSENTER_CS, + IA32_SYSENTER_EIP, IA32_SYSENTER_ESP, IA32_TIME_STAMP_COUNTER, IA32_TSC_AUX, + IA32_VMX_BASIC, IA32_VMX_EPT_VPID_CAP, IA32_VMX_MISC, IA32_VMX_VMFUNC, + MSR_CORE_C1_RESIDENCY, MSR_CORE_C3_RESIDENCY, MSR_CORE_C6_RESIDENCY, MSR_CORE_C7_RESIDENCY, + MSR_IA32_ADDR0_START, MSR_IA32_ADDR3_END, MSR_IA32_CR3_MATCH, MSR_IA32_RTIT_OUTPUT_BASE, + MSR_IA32_RTIT_OUTPUT_MASK_PTRS, MSR_IA32_RTIT_STATUS, MSR_IA32_TSX_CTRL, + MSR_LASTBRANCH_TOS, MSR_LBR_SELECT, }, + segmentation::{self, cs}, vmx::vmcs::{ control::{ - EntryControls, ExitControls, PrimaryControls, SecondaryControls, PINBASED_EXEC_CONTROLS, + self, EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls, }, - host, + guest, host, }, }; -use x86_64::instructions::tables::sidt; +use x86_64::registers::control::Cr3; +use x86_64::structures::idt::PageFaultErrorCode; +use x86_64::{instructions::tables::sidt, registers::control::EferFlags}; use crate::{ arch::{ @@ -56,30 +65,39 @@ use crate::{ KvmArch, }, kerror, kwarn, - libs::{lazy_init::Lazy, rwlock::RwLock}, + libs::rwlock::RwLock, virt::vm::kvm_host::Vm, }; +use self::vmcs::LoadedVmcs; use self::{ - capabilities::{NestedVmxMsrs, ProcessorTraceMode, VmcsConfig, VmxCapability}, + capabilities::{ProcessorTraceMode, VmcsConfig, VmxCapability}, vmcs::{ - current_loaded_vmcs_list_mut, current_vmcs, current_vmcs_mut, LockedLoadedVmcs, - VMControlStructure, VmxMsrBitmapAccess, VmxMsrBitmapAction, PERCPU_LOADED_VMCS_LIST, - PERCPU_VMCS, VMXAREA, + current_loaded_vmcs_list_mut, current_vmcs, current_vmcs_mut, ControlsType, + LockedLoadedVmcs, VMControlStructure, VmxMsrBitmapAccess, VmxMsrBitmapAction, + PERCPU_LOADED_VMCS_LIST, PERCPU_VMCS, VMXAREA, }, }; +use super::asm::SegmentCacheField; +use super::kvm_host::RMODE_TSS_SIZE; +use super::x86_kvm_ops; use super::{ - asm::VmxAsm, + asm::{VcpuSegment, VmxAsm, VmxMsrEntry}, init_kvm_arch, - kvm_host::{KvmFunc, KvmInitFunc, MsrFilterType}, - x86_kvm_manager, KvmArchManager, CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR, - PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR, VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR, - VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR, + kvm_host::{ + vcpu, KvmFunc, KvmInitFunc, KvmIrqChipMode, KvmReg, MsrFilterType, NotifyVmExitFlags, + }, + x86_kvm_manager, KvmArchManager, }; pub mod capabilities; pub mod vmcs; + +extern "C" { + fn vmx_vmexit(); +} + pub struct VmxKvmInitFunc; impl VmxKvmInitFunc { 
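In the hardware-setup hunk below, the earlier `kvm_configure_mmu TODO!` placeholder is replaced with a real call into `KvmMmu`. Condensed, the flow looks like this; `configure_tdp_mmu` is an illustrative wrapper name and `vmx` is assumed to be the freshly probed `Vmx` instance, so this restates code from this patch rather than adding new behavior:

    // How the EPT capability probes from capabilities.rs feed the TDP MMU
    // configuration. kvm_configure_mmu writes unsafe statics in mmu.rs, so
    // it must only run once, during vmx_init.
    fn configure_tdp_mmu(vmx: &Vmx) {
        // 5-level EPT page walks if IA32_VMX_EPT_VPID_CAP advertises
        // EPT_PAGE_WALK_5, otherwise 4-level.
        let max_root_level = vmx.get_max_ept_level();
        // Largest guest page the hardware can map directly: 1GiB, 2MiB or
        // 4KiB, depending on EPT_1GB_PAGE / EPT_2MB_PAGE support.
        let huge_page_level = vmx.ept_cap_to_lpage_level();
        // A tdp_forced_root_level of 0 means "no forced root level".
        KvmMmu::kvm_configure_mmu(vmx.enable_ept, 0, max_root_level, huge_page_level);
    }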
@@ -112,6 +130,9 @@ impl KvmInitFunc for VmxKvmInitFunc { .map_err(|_| SystemError::ENOMEM)? .assume_init() }; + + vmx_init.init(); + vmx_init.host_idt_base = idt.base.as_u64(); Vmx::set_up_user_return_msrs(); @@ -207,7 +228,7 @@ impl KvmInitFunc for VmxKvmInitFunc { kvm_cap.has_bus_lock_exit = vmx_init.has_bus_lock_detection(); kvm_cap.has_notify_vmexit = vmx_init.has_notify_vmexit(); - vmx_init.vpid_bitmap.lock().set_all(false); + // vmx_init.vpid_bitmap.lock().set_all(false); if vmx_init.enable_ept { // TODO: mmu_set_ept_masks @@ -216,7 +237,12 @@ impl KvmInitFunc for VmxKvmInitFunc { kwarn!("vmx_setup_me_spte_mask TODO!"); - kwarn!("kvm_configure_mmu TODO!"); + KvmMmu::kvm_configure_mmu( + vmx_init.enable_ept, + 0, + vmx_init.get_max_ept_level(), + vmx_init.ept_cap_to_lpage_level(), + ); if !vmx_init.enable_ept || !vmx_init.enable_ept_ad || !vmx_init.has_pml() { vmx_init.enable_pml = false; @@ -366,6 +392,21 @@ impl VmxKvmFunc { todo!() } } + + pub fn seg_setup(&self, seg: VcpuSegment) { + let seg_field = &KVM_VMX_SEGMENT_FIELDS[seg as usize]; + + VmxAsm::vmx_vmwrite(seg_field.selector, 0); + VmxAsm::vmx_vmwrite(seg_field.base, 0); + VmxAsm::vmx_vmwrite(seg_field.limit, 0xffff); + + let mut ar = 0x93; + if seg == VcpuSegment::CS { + ar |= 0x08; + } + + VmxAsm::vmx_vmwrite(seg_field.ar_bytes, ar); + } } impl KvmFunc for VmxKvmFunc { @@ -407,8 +448,45 @@ impl KvmFunc for VmxKvmFunc { // TODO: vmx_vcpu_pi_load } - fn cache_reg(&self, vcpu: &VirtCpuArch, reg: super::kvm_host::KvmReg) { - todo!() + fn cache_reg(&self, vcpu: &mut VirtCpuArch, reg: KvmReg) { + vcpu.mark_register_available(reg); + + match reg { + KvmReg::VcpuRegsRsp => { + vcpu.regs[reg as usize] = VmxAsm::vmx_vmread(guest::RSP); + } + KvmReg::VcpuRegsRip => { + vcpu.regs[reg as usize] = VmxAsm::vmx_vmread(guest::RIP); + } + // VCPU_EXREG_PDPTR + KvmReg::NrVcpuRegs => { + if vmx_info().enable_ept { + todo!() + } + } + KvmReg::VcpuExregCr0 => { + let guest_owned = vcpu.cr0_guest_owned_bits; + + vcpu.cr0.remove(guest_owned); + vcpu.cr0.insert( + Cr0::from_bits_truncate(VmxAsm::vmx_vmread(guest::CR0) as usize) & guest_owned, + ); + } + KvmReg::VcpuExregCr3 => { + todo!() + } + KvmReg::VcpuExregCr4 => { + let guest_owned = vcpu.cr4_guest_owned_bits; + + vcpu.cr4.remove(guest_owned); + vcpu.cr4.insert( + Cr4::from_bits_truncate(VmxAsm::vmx_vmread(guest::CR4) as usize) & guest_owned, + ); + } + _ => { + todo!() + } + } } fn apicv_pre_state_restore(&self, vcpu: &mut VirtCpu) { @@ -417,32 +495,387 @@ impl KvmFunc for VmxKvmFunc { // todo!() } - fn set_msr(&self, vcpu: &mut VirtCpuArch, msr: super::asm::MsrData) { - todo!() + fn set_msr(&self, vcpu: &mut VirtCpu, msr: super::asm::MsrData) -> Result<(), SystemError> { + let vmx = vcpu.vmx_mut(); + let msr_index = msr.index; + let data = msr.data; + + match msr_index { + msr::IA32_EFER => { + todo!("IA32_EFER") + } + + msr::IA32_FS_BASE => { + todo!("IA32_FS_BASE") + } + + msr::IA32_GS_BASE => { + todo!("IA32_GS_BASE") + } + + msr::IA32_KERNEL_GSBASE => { + todo!("IA32_KERNEL_GSBASE") + } + + 0x000001c4 => { + todo!("MSR_IA32_XFD") + } + + msr::IA32_SYSENTER_CS => { + todo!("IA32_SYSENTER_CS") + } + + msr::IA32_SYSENTER_EIP => { + todo!("IA32_SYSENTER_EIP") + } + + msr::IA32_SYSENTER_ESP => { + todo!("IA32_SYSENTER_ESP") + } + + msr::IA32_DEBUGCTL => { + todo!("IA32_DEBUGCTL") + } + + msr::MSR_C1_PMON_EVNT_SEL0 => { + todo!("MSR_IA32_BNDCFGS") + } + + 0xe1 => { + todo!("MSR_IA32_UMWAIT_CONTROL ") + } + + 0x48 => { + todo!("MSR_IA32_SPEC_CTRL") + } + + msr::MSR_IA32_TSX_CTRL => { + 
todo!("MSR_IA32_TSX_CTRL") + } + + msr::IA32_PAT => { + todo!("IA32_PAT") + } + + 0x4d0 => { + todo!("MSR_IA32_MCG_EXT_CTL") + } + + msr::IA32_FEATURE_CONTROL => { + todo!("IA32_FEATURE_CONTROL") + } + + 0x8c..=0x8f => { + todo!("MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3 {msr_index}") + } + + msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC => { + todo!("msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC") + } + + msr::MSR_IA32_RTIT_CTL => { + todo!("MSR_IA32_RTIT_CTL") + } + + msr::MSR_IA32_RTIT_STATUS => { + todo!("MSR_IA32_RTIT_STATUS") + } + + msr::MSR_IA32_RTIT_OUTPUT_BASE => { + todo!("MSR_IA32_RTIT_OUTPUT_BASE") + } + + 0x572 => { + todo!("MSR_IA32_RTIT_CR3_MATCH") + } + + msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS => { + todo!("MSR_IA32_RTIT_OUTPUT_MASK_PTRS") + } + + msr::MSR_IA32_ADDR0_START..=msr::MSR_IA32_ADDR3_END => { + todo!("msr::MSR_IA32_ADDR0_START..=msr::MSR_IA32_ADDR3_END") + } + + msr::MSR_PERF_CAPABILITIES => { + todo!("MSR_PERF_CAPABILITIES") + } + + _ => { + let uret_msr = vmx.find_uret_msr(msr_index); + + if let Some(msr) = uret_msr { + let mut tmp_msr = VmxUretMsr::from(*msr); + vmx.set_guest_uret_msr(&mut tmp_msr, data)?; + vmx.set_uret_msr(msr_index, data); + } else { + vcpu.arch.set_msr_common(&msr); + }; + } + } + + if msr_index == 0x10a { + // MSR_IA32_ARCH_CAPABILITIES + todo!() + } + + Ok(()) } - fn vcpu_reset(&self, vcpu: &mut VirtCpu, init_event: bool) { - todo!() + fn vcpu_reset(&self, vcpu: &mut VirtCpu, vm: &Vm, init_event: bool) { + if !init_event { + vmx_info_mut().vmx_reset_vcpu(vcpu, vm) + } + vcpu.kvm_set_cr8(0); + + let vmx = vcpu.vmx_mut(); + vmx.rmode.vm86_active = false; + vmx.spec_ctrl = 0; + vmx.msr_ia32_umwait_control = 0; + vmx.hv_deadline_tsc = u64::MAX; + + vmx.segment_cache_clear(); + + vcpu.arch.mark_register_available(KvmReg::VcpuExregSegments); + + self.seg_setup(VcpuSegment::CS); + VmxAsm::vmx_vmwrite(guest::CS_SELECTOR, 0xf000); + VmxAsm::vmx_vmwrite(guest::CS_BASE, 0xffff0000); + + self.seg_setup(VcpuSegment::DS); + self.seg_setup(VcpuSegment::ES); + self.seg_setup(VcpuSegment::FS); + self.seg_setup(VcpuSegment::GS); + self.seg_setup(VcpuSegment::SS); + + VmxAsm::vmx_vmwrite(guest::TR_SELECTOR, 0); + VmxAsm::vmx_vmwrite(guest::TR_BASE, 0); + VmxAsm::vmx_vmwrite(guest::TR_LIMIT, 0xffff); + VmxAsm::vmx_vmwrite(guest::TR_ACCESS_RIGHTS, 0x008b); + + VmxAsm::vmx_vmwrite(guest::LDTR_SELECTOR, 0); + VmxAsm::vmx_vmwrite(guest::LDTR_BASE, 0); + VmxAsm::vmx_vmwrite(guest::LDTR_LIMIT, 0xffff); + VmxAsm::vmx_vmwrite(guest::LDTR_ACCESS_RIGHTS, 0x00082); + + VmxAsm::vmx_vmwrite(guest::GDTR_BASE, 0); + VmxAsm::vmx_vmwrite(guest::GDTR_LIMIT, 0xffff); + + VmxAsm::vmx_vmwrite(guest::IDTR_BASE, 0); + VmxAsm::vmx_vmwrite(guest::IDTR_LIMIT, 0xffff); + + VmxAsm::vmx_vmwrite(guest::ACTIVITY_STATE, 0); + VmxAsm::vmx_vmwrite(guest::INTERRUPTIBILITY_STATE, 0); + VmxAsm::vmx_vmwrite(guest::PENDING_DBG_EXCEPTIONS, 0); + + if x86_kvm_manager().mpx_supported() { + VmxAsm::vmx_vmwrite(guest::IA32_BNDCFGS_FULL, 0); + } + + VmxAsm::vmx_vmwrite(control::VMENTRY_INTERRUPTION_INFO_FIELD, 0); + + vcpu.request(VirCpuRequest::KVM_REQ_APIC_PAGE_RELOAD); + + vmx_info().vpid_sync_context(vcpu.vmx().vpid); + + kwarn!("TODO: vmx_update_fb_clear_dis"); } - fn set_rflags(&self, vcpu: &mut VirtCpu, rflags: x86::bits64::rflags::RFlags) { - todo!() + fn set_rflags(&self, vcpu: &mut VirtCpu, mut rflags: x86::bits64::rflags::RFlags) { + if vcpu.is_unrestricted_guest() { + vcpu.arch.mark_register_available(KvmReg::VcpuExregRflags); + vcpu.vmx_mut().rflags = rflags; + 
VmxAsm::vmx_vmwrite(guest::RFLAGS, rflags.bits()); + return; + } + + let old_rflags = self.get_rflags(vcpu); + + let vmx = vcpu.vmx_mut(); + + vmx.rflags = rflags; + if vmx.rmode.vm86_active { + vmx.rmode.save_rflags = rflags; + rflags.insert(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM); + } + + VmxAsm::vmx_vmwrite(guest::RFLAGS, rflags.bits()); + + if (old_rflags ^ vmx.rflags).contains(RFlags::FLAGS_VM) { + drop(vmx); + let emulation_required = vmx_info().emulation_required(vcpu); + vcpu.vmx_mut().emulation_required = emulation_required; + } } - fn set_cr0(&self, vcpu: &mut VirtCpu, cr0: x86::controlregs::Cr0) { - todo!() + fn set_cr0(&self, vm: &Vm, vcpu: &mut VirtCpu, cr0: x86::controlregs::Cr0) { + let old_cr0_pg = vcpu.arch.read_cr0_bits(Cr0::CR0_ENABLE_PAGING); + let mut hw_cr0 = cr0 & (!(Cr0::CR0_NOT_WRITE_THROUGH | Cr0::CR0_CACHE_DISABLE)); + + if vmx_info().enable_unrestricted_guest { + hw_cr0.insert(Cr0::CR0_NUMERIC_ERROR); + } else { + hw_cr0 + .insert(Cr0::CR0_NUMERIC_ERROR | Cr0::CR0_ENABLE_PAGING | Cr0::CR0_PROTECTED_MODE); + + if !vmx_info().enable_ept { + hw_cr0.insert(Cr0::CR0_WRITE_PROTECT); + } + + if vcpu.vmx().rmode.vm86_active && cr0.contains(Cr0::CR0_PROTECTED_MODE) { + vmx_info().enter_pmode(vcpu); + } + + if !vcpu.vmx().rmode.vm86_active && !cr0.contains(Cr0::CR0_PROTECTED_MODE) { + vmx_info().enter_rmode(vcpu, vm); + } + } + + VmxAsm::vmx_vmwrite(control::CR0_READ_SHADOW, cr0.bits() as u64); + VmxAsm::vmx_vmwrite(guest::CR0, hw_cr0.bits() as u64); + + vcpu.arch.cr0 = cr0; + + vcpu.arch.mark_register_available(KvmReg::VcpuExregCr0); + + if vcpu.arch.efer.contains(EferFlags::LONG_MODE_ENABLE) { + if old_cr0_pg.is_empty() && cr0.contains(Cr0::CR0_ENABLE_PAGING) { + todo!("enter lmode todo"); + } else if !old_cr0_pg.is_empty() && !cr0.contains(Cr0::CR0_ENABLE_PAGING) { + todo!("exit lmode todo"); + } + } + + if vmx_info().enable_ept && !vmx_info().enable_unrestricted_guest { + todo!() + } + + vcpu.vmx_mut().emulation_required = vmx_info().emulation_required(vcpu); } - fn set_cr4(&self, vcpu: &mut VirtCpu, cr4: x86::controlregs::Cr4) { - todo!() + fn set_cr4(&self, vcpu: &mut VirtCpu, cr4_flags: x86::controlregs::Cr4) { + let old_cr4 = vcpu.arch.read_cr4_bits(Cr4::all()); + + let mut hw_cr4 = (unsafe { cr4() } & Cr4::CR4_ENABLE_MACHINE_CHECK) + | (cr4_flags & (!Cr4::CR4_ENABLE_MACHINE_CHECK)); + + if vmx_info().enable_unrestricted_guest { + hw_cr4.insert(Cr4::CR4_ENABLE_VMX); + } else if vcpu.vmx().rmode.vm86_active { + hw_cr4.insert(Cr4::CR4_ENABLE_PAE | Cr4::CR4_ENABLE_VMX | Cr4::CR4_ENABLE_VME); + } else { + hw_cr4.insert(Cr4::CR4_ENABLE_PAE | Cr4::CR4_ENABLE_VMX); + } + + if vmx_info().vmx_umip_emulated() { + if cr4_flags.contains(Cr4::CR4_ENABLE_UMIP) { + vcpu.vmx().loaded_vmcs().controls_set( + ControlsType::SecondaryExec, + SecondaryControls::DTABLE_EXITING.bits() as u64, + ); + hw_cr4.remove(Cr4::CR4_ENABLE_UMIP); + } else if !vcpu.arch.is_guest_mode() { + vcpu.vmx().loaded_vmcs().controls_clearbit( + ControlsType::SecondaryExec, + SecondaryControls::DTABLE_EXITING.bits() as u64, + ); + } + } + + vcpu.arch.cr4 = cr4_flags; + vcpu.arch.mark_register_available(KvmReg::VcpuExregCr4); + + if !vmx_info().enable_unrestricted_guest { + if vmx_info().enable_ept { + if vcpu.arch.read_cr0_bits(Cr0::CR0_ENABLE_PAGING).is_empty() { + hw_cr4.remove(Cr4::CR4_ENABLE_PAE); + hw_cr4.insert(Cr4::CR4_ENABLE_PSE); + } else if !cr4_flags.contains(Cr4::CR4_ENABLE_PAE) { + hw_cr4.remove(Cr4::CR4_ENABLE_PAE); + } + } + + if vcpu.arch.read_cr0_bits(Cr0::CR0_ENABLE_PAGING).is_empty() { + 
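+                // Without guest paging, SMEP/SMAP/PKU have no architectural
+                // effect; drop them from the hardware CR4 while CR4_READ_SHADOW
+                // (written below) preserves the guest-visible value.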
hw_cr4.remove( + Cr4::CR4_ENABLE_SMEP | Cr4::CR4_ENABLE_SMAP | Cr4::CR4_ENABLE_PROTECTION_KEY, + ); + } + } + + VmxAsm::vmx_vmwrite(control::CR4_READ_SHADOW, cr4_flags.bits() as u64); + VmxAsm::vmx_vmwrite(guest::CR4, hw_cr4.bits() as u64); + + if (cr4_flags ^ old_cr4).contains(Cr4::CR4_ENABLE_OS_XSAVE | Cr4::CR4_ENABLE_PROTECTION_KEY) + { + // TODO: update_cpuid_runtime + } } fn set_efer(&self, vcpu: &mut VirtCpu, efer: x86_64::registers::control::EferFlags) { - todo!() + if vcpu.vmx().find_uret_msr(msr::IA32_EFER).is_none() { + return; + } + + vcpu.arch.efer = efer; + if efer.contains(EferFlags::LONG_MODE_ACTIVE) { + vcpu.vmx().loaded_vmcs().controls_setbit( + ControlsType::VmEntry, + EntryControls::IA32E_MODE_GUEST.bits().into(), + ); + } else { + vcpu.vmx().loaded_vmcs().controls_clearbit( + ControlsType::VmEntry, + EntryControls::IA32E_MODE_GUEST.bits().into(), + ); + } + + vmx_info().setup_uret_msrs(vcpu); } fn update_exception_bitmap(&self, vcpu: &mut VirtCpu) { - todo!() + let mut eb = (1u32 << PF_VECTOR) + | (1 << UD_VECTOR) + | (1 << MC_VECTOR) + | (1 << DB_VECTOR) + | (1 << AC_VECTOR); + + if vmx_info().enable_vmware_backdoor { + eb |= 1 << GP_VECTOR; + } + + if vcpu.guest_debug & (GuestDebug::ENABLE | GuestDebug::USE_SW_BP) + == (GuestDebug::ENABLE | GuestDebug::USE_SW_BP) + { + eb |= 1 << BP_VECTOR; + } + + if vcpu.vmx().rmode.vm86_active { + eb = !0; + } + + if !vmx_info().vmx_need_pf_intercept(&vcpu) { + eb &= !(1 << PF_VECTOR); + } + + if vcpu.arch.is_guest_mode() { + todo!() + } else { + let mut mask = PageFaultErr::empty(); + let mut match_code = PageFaultErr::empty(); + if vmx_info().enable_ept && (eb & (1 << PF_VECTOR) != 0) { + mask = PageFaultErr::PFERR_PRESENT | PageFaultErr::PFERR_RSVD; + match_code = PageFaultErr::PFERR_PRESENT; + } + + VmxAsm::vmx_vmwrite(control::PAGE_FAULT_ERR_CODE_MASK, mask.bits); + VmxAsm::vmx_vmwrite(control::PAGE_FAULT_ERR_CODE_MATCH, match_code.bits); + } + + if vcpu.arch.xfd_no_write_intercept { + eb |= 1 << NM_VECTOR; + } + + VmxAsm::vmx_vmwrite(control::EXCEPTION_BITMAP, eb as u64); } fn has_emulated_msr(&self, msr: u32) -> bool { @@ -467,7 +900,7 @@ impl KvmFunc for VmxKvmFunc { } } - fn get_msr_feature(&self, msr: &mut super::asm::KvmMsrEntry) -> bool { + fn get_msr_feature(&self, msr: &mut super::asm::VmxMsrEntry) -> bool { match msr.index { IA32_VMX_BASIC..=IA32_VMX_VMFUNC => { if !vmx_info().nested { @@ -490,7 +923,148 @@ impl KvmFunc for VmxKvmFunc { } } - fn get_rflags(&self, vcpu: &VirtCpu) -> x86::bits64::rflags::RFlags { + fn get_rflags(&self, vcpu: &mut VirtCpu) -> x86::bits64::rflags::RFlags { + if !vcpu.arch.is_register_available(KvmReg::VcpuExregRflags) { + vcpu.arch.mark_register_available(KvmReg::VcpuExregRflags); + let mut rflags = RFlags::from_bits_truncate(VmxAsm::vmx_vmread(guest::RFLAGS)); + if vcpu.vmx_mut().rmode.vm86_active { + rflags.remove(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM); + let save_rflags = vcpu.vmx_mut().rmode.save_rflags; + rflags.insert(save_rflags & !(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM)); + } + + vcpu.vmx_mut().rflags = rflags; + } + + return vcpu.vmx_mut().rflags; + } + + fn vcpu_precreate(&self, vm: &mut Vm) -> Result<(), SystemError> { + if vm.arch.irqchip_mode != KvmIrqChipMode::None || !vmx_info().enable_ipiv { + return Ok(()); + } + + let kvm_vmx = vm.kvm_vmx_mut(); + + if kvm_vmx.pid_table.is_some() { + return Ok(()); + } + + kvm_vmx.pid_table = Some(unsafe { Box::new_zeroed().assume_init() }); + Ok(()) + } + + fn set_segment(&self, vcpu: &mut VirtCpu, var: &mut UapiKvmSegment, seg: 
VcpuSegment) { + vcpu.vmx_mut().emulation_required = vmx_info().emulation_required(vcpu); + *var = vmx_info()._vmx_set_segment(vcpu, *var, seg); + } + + fn get_segment( + &self, + vcpu: &mut VirtCpu, + var: UapiKvmSegment, + seg: VcpuSegment, + ) -> UapiKvmSegment { + return vmx_info().vmx_get_segment(vcpu, var, seg); + } + + fn get_idt(&self, _vcpu: &mut VirtCpu, dt: &mut x86::dtables::DescriptorTablePointer) { + dt.limit = VmxAsm::vmx_vmread(guest::IDTR_LIMIT) as u16; + dt.base = VmxAsm::vmx_vmread(guest::IDTR_BASE) as usize as *const _; + } + + fn set_idt(&self, _vcpu: &mut VirtCpu, dt: &x86::dtables::DescriptorTablePointer) { + VmxAsm::vmx_vmwrite(guest::IDTR_LIMIT, dt.limit as u64); + VmxAsm::vmx_vmwrite(guest::IDTR_BASE, dt.base as usize as u64); + } + + fn get_gdt(&self, _vcpu: &mut VirtCpu, dt: &mut x86::dtables::DescriptorTablePointer) { + dt.limit = VmxAsm::vmx_vmread(guest::GDTR_LIMIT) as u16; + dt.base = VmxAsm::vmx_vmread(guest::GDTR_BASE) as usize as *const _; + } + + fn set_gdt(&self, _vcpu: &mut VirtCpu, dt: &x86::dtables::DescriptorTablePointer) { + VmxAsm::vmx_vmwrite(guest::GDTR_LIMIT, dt.limit as u64); + VmxAsm::vmx_vmwrite(guest::GDTR_BASE, dt.base as usize as u64); + } + + fn is_vaild_cr0(&self, vcpu: &VirtCpu, cr0: Cr0) -> bool { + if vcpu.arch.is_guest_mode() { + todo!() + } + + // TODO: 判断vmx->nested->vmxon + + true + } + + fn is_vaild_cr4(&self, vcpu: &VirtCpu, cr4: Cr4) -> bool { + if cr4.contains(Cr4::CR4_ENABLE_VMX) && vcpu.arch.is_smm() { + return false; + } + + // TODO: 判断vmx->nested->vmxon + + return true; + } + + fn post_set_cr3(&self, vcpu: &VirtCpu, cr3: u64) { + // Nothing + } + + fn vcpu_run(&self, vcpu: &mut VirtCpu) { + if unlikely(vmx_info().enable_vnmi && vcpu.vmx().loaded_vmcs().soft_vnmi_blocked) { + todo!() + } + + if unlikely(vcpu.vmx().emulation_required) { + todo!() + } + + if vcpu.vmx().ple_window_dirty { + vcpu.vmx_mut().ple_window_dirty = false; + VmxAsm::vmx_vmwrite(control::PLE_WINDOW, vcpu.vmx().ple_window as u64); + } + + if vcpu.arch.is_register_dirty(KvmReg::VcpuRegsRsp) { + VmxAsm::vmx_vmwrite(guest::RSP, vcpu.arch.regs[KvmReg::VcpuRegsRsp as usize]); + } + if vcpu.arch.is_register_dirty(KvmReg::VcpuRegsRip) { + VmxAsm::vmx_vmwrite(guest::RIP, vcpu.arch.regs[KvmReg::VcpuRegsRip as usize]); + } + + vcpu.arch.clear_dirty(); + + let cr3 = Cr3::read().1; + if unlikely(cr3 != vcpu.vmx().loaded_vmcs().host_state.cr3) { + VmxAsm::vmx_vmwrite(host::CR3, cr3.bits()); + vcpu.vmx().loaded_vmcs().host_state.cr3 = cr3; + } + + let cr4 = unsafe { cr4() }; + if unlikely(cr4 != vcpu.vmx().loaded_vmcs().host_state.cr4) { + VmxAsm::vmx_vmwrite(host::CR4, cr4.bits() as u64); + vcpu.vmx().loaded_vmcs().host_state.cr4 = cr4; + } + + // TODO: set_debugreg + + if vcpu.guest_debug.contains(GuestDebug::SINGLESTEP) { + todo!() + } + + vcpu.load_guest_xsave_state(); + + // TODO: pt_guest_enter + + // TODO: atomic_switch_perf_msrs + + if vmx_info().enable_preemption_timer { + todo!() + } + + Vmx::vmx_vcpu_enter_exit(vcpu, vcpu.vmx().vmx_vcpu_run_flags()); + todo!() } } @@ -502,6 +1076,11 @@ pub fn vmx_info() -> &'static Vmx { unsafe { VMX.as_ref().unwrap() } } +#[inline] +pub fn vmx_info_mut() -> &'static mut Vmx { + unsafe { VMX.as_mut().unwrap() } +} + #[inline(never)] pub fn init_vmx(vmx: Box) { static INIT_ONCE: AtomicBool = AtomicBool::new(false); @@ -520,7 +1099,7 @@ pub struct Vmx { pub host_idt_base: u64, pub vmcs_config: VmcsConfig, pub vmx_cap: VmxCapability, - pub vpid_bitmap: SpinLock>, + pub vpid_bitmap: SpinLock, pub enable_vpid: bool, pub 
enable_ept: bool, pub enable_ept_ad: bool, @@ -534,6 +1113,8 @@ pub struct Vmx { pub enable_pml: bool, pub enable_preemption_timer: bool, + pub enable_vmware_backdoor: bool, + pub nested: bool, pub ple_gap: u32, @@ -545,35 +1126,39 @@ pub struct Vmx { pub pt_mode: ProcessorTraceMode, } -impl Default for Vmx { - fn default() -> Self { - Self { - host_idt_base: Default::default(), - vmcs_config: Default::default(), - vmx_cap: Default::default(), - vpid_bitmap: SpinLock::new(StaticBitmap::new()), - enable_vpid: true, - enable_ept: true, - enable_ept_ad: true, - enable_unrestricted_guest: true, - enable_flexpriority: true, - enable_vnmi: true, - enable_sgx: true, - ple_gap: 128, - ple_window: 4096, - ple_window_grow: 2, - ple_window_max: u32::MAX, - ple_window_shrink: 0, - enable_apicv: true, - enable_ipiv: true, - enable_pml: true, - enable_preemption_timer: true, - pt_mode: ProcessorTraceMode::System, - emulate_invalid_guest_state: true, - - // 目前先不管嵌套虚拟化,后续再实现 - nested: true, - } +impl Vmx { + fn init(&mut self) { + let mut bitmap = AllocBitmap::new(1 << 16); + + // 0为vpid的非法值 + bitmap.set(0, true); + + self.host_idt_base = Default::default(); + self.vmcs_config = Default::default(); + self.vmx_cap = Default::default(); + self.vpid_bitmap = SpinLock::new(bitmap); + self.enable_vpid = true; + self.enable_ept = true; + self.enable_ept_ad = true; + self.enable_unrestricted_guest = true; + self.enable_flexpriority = true; + self.enable_vnmi = true; + self.enable_sgx = true; + self.ple_gap = 128; + self.ple_window = 4096; + self.ple_window_grow = 2; + self.ple_window_max = u32::MAX; + self.ple_window_shrink = 0; + self.enable_apicv = true; + self.enable_ipiv = true; + self.enable_pml = true; + self.enable_preemption_timer = true; + self.pt_mode = ProcessorTraceMode::System; + self.emulate_invalid_guest_state = true; + + // 目前先不管嵌套虚拟化,后续再实现 + self.nested = false; + self.enable_vmware_backdoor = false; } } @@ -869,6 +1454,14 @@ impl Vmx { } } + pub fn vpid_sync_context(&self, vpid: u16) { + if self.has_invvpid_single() { + VmxAsm::sync_vcpu_single(vpid); + } else if vpid != 0 { + VmxAsm::sync_vcpu_global(); + } + } + pub fn possible_passthrough_msr_slot(msr: u32) -> Option { for (idx, val) in Self::VMX_POSSIBLE_PASSTHROUGH_MSRS.iter().enumerate() { if *val == msr { @@ -887,6 +1480,899 @@ impl Vmx { // TODO:先这样写 *L1TF_VMX_MITIGATION.write() = VmxL1dFlushState::FlushNotRequired; } + + fn vmx_reset_vcpu(&mut self, vcpu: &mut VirtCpu, vm: &Vm) { + self.init_vmcs(vcpu, vm); + + if self.nested { + todo!() + } + + // TODO: vcpu_setup_sgx_lepubkeyhash + + // TODO: nested + + vcpu.arch.microcode_version = 0x100000000; + + let vmx = vcpu.vmx_mut(); + vmx.msr_ia32_feature_control_valid_bits = 1 << 0; + + vmx.post_intr_desc.control.set_nv(0xf2); + vmx.post_intr_desc.control.set_sn(true); + } + + fn init_vmcs(&mut self, vcpu: &mut VirtCpu, vm: &Vm) { + let kvm_vmx = vm.kvm_vmx(); + if vmx_info().nested { + todo!() + } + + if vmx_info().has_msr_bitmap() { + VmxAsm::vmx_vmwrite( + control::MSR_BITMAPS_ADDR_FULL, + vcpu.vmx().vmcs01.lock().msr_bitmap.phys_addr() as u64, + ) + } + + VmxAsm::vmx_vmwrite(guest::LINK_PTR_FULL, u64::MAX); + + let mut loaded_vmcs = vcpu.vmx().loaded_vmcs.lock(); + + loaded_vmcs.controls_set( + ControlsType::Pin, + self.get_pin_based_exec_controls(vcpu).bits() as u64, + ); + + loaded_vmcs.controls_set( + ControlsType::Exec, + self.get_exec_controls(vcpu, &vm.arch).bits() as u64, + ); + + if self.has_sceondary_exec_ctrls() { + loaded_vmcs.controls_set( + ControlsType::SecondaryExec, + 
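+                // Secondary controls start from vmcs_config and are trimmed
+                // per-vCPU/per-VM (EPT, VPID, APICv, PML, ...) by
+                // get_secondary_exec_controls().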
self.get_secondary_exec_controls(vcpu, vm).bits() as u64, + ) + } + + if self.has_tertiary_exec_ctrls() { + todo!() + } + + drop(loaded_vmcs); + + if self.enable_apicv && vcpu.arch.lapic_in_kernel() { + VmxAsm::vmx_vmwrite(control::EOI_EXIT0_FULL, 0); + VmxAsm::vmx_vmwrite(control::EOI_EXIT1_FULL, 0); + VmxAsm::vmx_vmwrite(control::EOI_EXIT2_FULL, 0); + VmxAsm::vmx_vmwrite(control::EOI_EXIT3_FULL, 0); + + VmxAsm::vmx_vmwrite(guest::INTERRUPT_STATUS, 0); + + VmxAsm::vmx_vmwrite(control::POSTED_INTERRUPT_NOTIFICATION_VECTOR, 0xf2); + VmxAsm::vmx_vmwrite( + control::POSTED_INTERRUPT_DESC_ADDR_FULL, + virt_2_phys(&vcpu.vmx().post_intr_desc as *const _ as usize) as u64, + ) + } + + if self.enable_apicv && vcpu.arch.lapic_in_kernel() { + // PID_POINTER_TABLE + VmxAsm::vmx_vmwrite( + 0x2042, + virt_2_phys(kvm_vmx.pid_table().as_ptr() as usize) as u64, + ); + // LAST_PID_POINTER_INDEX + VmxAsm::vmx_vmwrite(0x08, vm.arch.max_vcpu_ids as u64 - 1); + } + + if !vm.arch.pause_in_guest { + VmxAsm::vmx_vmwrite(control::PLE_GAP, self.ple_gap as u64); + vcpu.vmx_mut().ple_window = self.ple_window; + vcpu.vmx_mut().ple_window_dirty = true; + } + + if vm + .arch + .notify_vmexit_flags + .contains(NotifyVmExitFlags::KVM_X86_NOTIFY_VMEXIT_ENABLED) + { + // NOTIFY_WINDOW + VmxAsm::vmx_vmwrite(0x4024, vm.arch.notify_window as u64); + } + + VmxAsm::vmx_vmwrite(control::PAGE_FAULT_ERR_CODE_MASK, 0); + VmxAsm::vmx_vmwrite(control::PAGE_FAULT_ERR_CODE_MATCH, 0); + VmxAsm::vmx_vmwrite(control::CR3_TARGET_COUNT, 0); + + VmxAsm::vmx_vmwrite(host::FS_SELECTOR, 0); + VmxAsm::vmx_vmwrite(host::GS_SELECTOR, 0); + self.set_constant_host_state(vcpu); + + VmxAsm::vmx_vmwrite(host::FS_BASE, 0); + VmxAsm::vmx_vmwrite(host::GS_BASE, 0); + + if self.has_vmfunc() { + VmxAsm::vmx_vmwrite(control::VM_FUNCTION_CONTROLS_FULL, 0); + } + + VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_STORE_COUNT, 0); + VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_LOAD_COUNT, 0); + VmxAsm::vmx_vmwrite( + control::VMEXIT_MSR_LOAD_ADDR_FULL, + virt_2_phys(vcpu.vmx().msr_autoload.host.val.as_ptr() as *const _ as usize) as u64, + ); + VmxAsm::vmx_vmwrite(control::VMENTRY_MSR_LOAD_COUNT, 0); + VmxAsm::vmx_vmwrite( + control::VMENTRY_MSR_LOAD_ADDR_FULL, + virt_2_phys(vcpu.vmx().msr_autoload.guest.val.as_ptr() as usize) as u64, + ); + + if self + .vmcs_config + .vmentry_ctrl + .contains(EntryControls::LOAD_IA32_PAT) + { + VmxAsm::vmx_vmwrite(guest::IA32_PAT_FULL, vcpu.arch.pat) + } + + let mut loaded_vmcs = vcpu.vmx().loaded_vmcs.lock(); + loaded_vmcs.controls_set( + ControlsType::VmExit, + self.get_vmexit_controls().bits() as u64, + ); + + loaded_vmcs.controls_set( + ControlsType::VmEntry, + self.get_vmentry_controls().bits() as u64, + ); + + drop(loaded_vmcs); + + vcpu.arch.cr0_guest_owned_bits = self.l1_guest_owned_cr0_bits(); + VmxAsm::vmx_vmwrite( + control::CR0_GUEST_HOST_MASK, + (!vcpu.arch.cr0_guest_owned_bits).bits() as u64, + ); + + self.set_cr4_guest_host_mask(&mut vcpu.arch); + + if vcpu.vmx().vpid != 0 { + VmxAsm::vmx_vmwrite(control::VPID, vcpu.vmx().vpid as u64); + } + + if self.has_xsaves() { + VmxAsm::vmx_vmwrite(control::XSS_EXITING_BITMAP_FULL, 0); + } + + if self.enable_pml { + VmxAsm::vmx_vmwrite( + control::PML_ADDR_FULL, + virt_2_phys(vcpu.vmx().pml_pg.as_ref().as_ptr() as usize) as u64, + ); + + VmxAsm::vmx_vmwrite(guest::PML_INDEX, VmxVCpuPriv::PML_ENTITY_NUM as u64 - 1); + } + + // TODO: vmx_write_encls_bitmap + + if self.pt_mode == ProcessorTraceMode::HostGuest { + todo!() + } + + VmxAsm::vmx_vmwrite(guest::IA32_SYSENTER_CS, 0); + 
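+        // Guest SYSENTER state starts out zeroed; the host's own SYSENTER MSRs
+        // are reloaded on VM-exit from the host-state fields written in
+        // set_constant_host_state().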
VmxAsm::vmx_vmwrite(guest::IA32_SYSENTER_ESP, 0); + VmxAsm::vmx_vmwrite(guest::IA32_SYSENTER_EIP, 0); + VmxAsm::vmx_vmwrite(guest::IA32_DEBUGCTL_FULL, 0); + + if self.has_tpr_shadow() { + VmxAsm::vmx_vmwrite(control::VIRT_APIC_ADDR_FULL, 0); + if vcpu.arch.lapic_in_kernel() { + VmxAsm::vmx_vmwrite( + control::VIRT_APIC_ADDR_FULL, + virt_2_phys(vcpu.arch.lapic().regs.as_ptr() as usize) as u64, + ); + } + + VmxAsm::vmx_vmwrite(control::TPR_THRESHOLD, 0); + } + + self.setup_uret_msrs(vcpu); + } + + fn setup_uret_msrs(&self, vcpu: &mut VirtCpu) { + // 是否加载syscall相关msr + let load_syscall_msrs = + vcpu.arch.is_long_mode() && vcpu.arch.efer.contains(EferFlags::SYSTEM_CALL_EXTENSIONS); + + self.setup_uret_msr(vcpu, msr::IA32_STAR, load_syscall_msrs); + self.setup_uret_msr(vcpu, msr::IA32_LSTAR, load_syscall_msrs); + self.setup_uret_msr(vcpu, msr::IA32_FMASK, load_syscall_msrs); + + let load_efer = self.update_transition_efer(vcpu); + self.setup_uret_msr(vcpu, msr::IA32_EFER, load_efer); + + // TODO: MSR_TSC_AUX + + self.setup_uret_msr( + vcpu, + msr::MSR_IA32_TSX_CTRL, + CpuId::default() + .get_extended_feature_info() + .unwrap() + .has_rtm(), + ); + + vcpu.vmx_mut().guest_uret_msrs_loaded = false; + } + + fn setup_uret_msr(&self, vcpu: &mut VirtCpu, msr: u32, load_into_hardware: bool) { + let uret_msr = vcpu.vmx_mut().find_uret_msr_mut(msr); + + if let Some(msr) = uret_msr { + msr.load_into_hardware = load_into_hardware; + } + } + + fn update_transition_efer(&self, vcpu: &mut VirtCpu) -> bool { + let mut guest_efer = vcpu.arch.efer; + let mut ignore_efer = EferFlags::empty(); + if !self.enable_ept { + guest_efer.insert(EferFlags::NO_EXECUTE_ENABLE); + } + + ignore_efer.insert(EferFlags::SYSTEM_CALL_EXTENSIONS); + + ignore_efer.insert(EferFlags::LONG_MODE_ACTIVE | EferFlags::LONG_MODE_ENABLE); + + if guest_efer.contains(EferFlags::LONG_MODE_ACTIVE) { + ignore_efer.remove(EferFlags::SYSTEM_CALL_EXTENSIONS); + } + + if self.has_load_ia32_efer() + || (self.enable_ept + && (vcpu.arch.efer ^ x86_kvm_manager().host_efer) + .contains(EferFlags::NO_EXECUTE_ENABLE)) + { + if !guest_efer.contains(EferFlags::LONG_MODE_ACTIVE) { + guest_efer.remove(EferFlags::LONG_MODE_ENABLE); + } + + if guest_efer != x86_kvm_manager().host_efer { + vcpu.vmx_mut().add_atomic_switch_msr( + msr::IA32_EFER, + guest_efer.bits().into(), + x86_kvm_manager().host_efer.bits().into(), + false, + ); + } else { + vcpu.vmx_mut().clear_atomic_switch_msr(msr::IA32_EFER); + } + + return false; + } + + let idx = x86_kvm_manager().find_user_return_msr_idx(msr::IA32_EFER); + if let Some(i) = idx { + vcpu.vmx_mut().clear_atomic_switch_msr(msr::IA32_EFER); + + guest_efer.remove(ignore_efer); + guest_efer.insert(x86_kvm_manager().host_efer & ignore_efer); + + vcpu.vmx_mut().guest_uret_msrs[i].data = guest_efer.bits().into(); + vcpu.vmx_mut().guest_uret_msrs[i].mask = (!ignore_efer).bits().into(); + return true; + } else { + return false; + } + } + + fn set_cr4_guest_host_mask(&self, arch: &mut VirtCpuArch) { + arch.cr4_guest_owned_bits = + x86_kvm_manager().possible_cr4_guest & (!arch.cr4_guest_rsvd_bits); + + if !self.enable_ept { + arch.cr4_guest_owned_bits + .remove(x86_kvm_manager().cr4_tlbflush_bits); + arch.cr4_guest_owned_bits + .remove(x86_kvm_manager().cr4_pdptr_bits); + } + + if arch.is_guest_mode() { + // 嵌套todo + todo!() + } + + VmxAsm::vmx_vmwrite( + control::CR4_GUEST_HOST_MASK, + (!arch.cr4_guest_owned_bits).bits() as u64, + ); + } + + fn l1_guest_owned_cr0_bits(&self) -> Cr0 { + let mut cr0 = x86_kvm_manager().possible_cr0_guest; 
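+        // Without EPT the guest runs on shadow page tables, so CR0.WP has to
+        // stay host-owned for write-protection faults to reach the hypervisor.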
+ + if !self.enable_ept { + cr0.remove(Cr0::CR0_WRITE_PROTECT) + } + + return cr0; + } + + /// 设置在guest生命周期中host不变的部分 + fn set_constant_host_state(&self, vcpu: &mut VirtCpu) { + let loaded_vmcs_host_state = &mut vcpu.vmx().loaded_vmcs.lock().host_state; + + VmxAsm::vmx_vmwrite(host::CR0, unsafe { cr0() }.bits() as u64); + + let cr3 = Cr3::read().1; + VmxAsm::vmx_vmwrite(host::CR3, cr3.bits()); + loaded_vmcs_host_state.cr3 = cr3; + + let cr4 = unsafe { cr4() }; + VmxAsm::vmx_vmwrite(host::CR4, cr4.bits() as u64); + loaded_vmcs_host_state.cr4 = cr4; + + VmxAsm::vmx_vmwrite( + host::CS_SELECTOR, + (segmentation::cs().bits() & (!0x07)).into(), + ); + + VmxAsm::vmx_vmwrite(host::DS_SELECTOR, 0); + VmxAsm::vmx_vmwrite(host::ES_SELECTOR, 0); + + VmxAsm::vmx_vmwrite( + host::SS_SELECTOR, + (segmentation::ds().bits() & (!0x07)).into(), + ); + VmxAsm::vmx_vmwrite( + host::TR_SELECTOR, + (unsafe { x86::task::tr().bits() } & (!0x07)).into(), + ); + + VmxAsm::vmx_vmwrite(host::IDTR_BASE, self.host_idt_base); + VmxAsm::vmx_vmwrite(host::RIP, vmx_vmexit as u64); + + let val = unsafe { rdmsr(msr::IA32_SYSENTER_CS) }; + + // low32 + VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_CS, (val << 32) >> 32); + + // VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_ESP, 0); + + let tmp = unsafe { rdmsr(msr::IA32_SYSENTER_EIP) }; + VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_EIP, (tmp << 32) >> 32); + + if self + .vmcs_config + .vmexit_ctrl + .contains(ExitControls::LOAD_IA32_PAT) + { + VmxAsm::vmx_vmwrite(host::IA32_PAT_FULL, unsafe { rdmsr(msr::IA32_PAT) }); + } + + if self.has_load_ia32_efer() { + VmxAsm::vmx_vmwrite( + host::IA32_EFER_FULL, + x86_kvm_manager().host_efer.bits() as u64, + ); + } + } + + fn get_pin_based_exec_controls(&self, vcpu: &VirtCpu) -> PinbasedControls { + let mut ctrls = self.vmcs_config.pin_based_exec_ctrl; + + if vcpu.arch.vcpu_apicv_active() { + ctrls.remove(PinbasedControls::POSTED_INTERRUPTS); + } + + if !self.enable_vnmi { + ctrls.remove(PinbasedControls::VIRTUAL_NMIS); + } + + if !self.enable_preemption_timer { + ctrls.remove(PinbasedControls::VMX_PREEMPTION_TIMER); + } + + return ctrls; + } + + fn get_exec_controls(&self, vcpu: &VirtCpu, vmarch: &KvmArch) -> PrimaryControls { + let mut ctrls = self.vmcs_config.cpu_based_exec_ctrl; + + ctrls.remove( + PrimaryControls::RDTSC_EXITING + | PrimaryControls::USE_IO_BITMAPS + | PrimaryControls::MONITOR_TRAP_FLAG + | PrimaryControls::PAUSE_EXITING, + ); + + ctrls.remove( + PrimaryControls::NMI_WINDOW_EXITING | PrimaryControls::INTERRUPT_WINDOW_EXITING, + ); + + ctrls.remove(PrimaryControls::MOV_DR_EXITING); + + if vcpu.arch.lapic_in_kernel() && self.has_tpr_shadow() { + ctrls.remove(PrimaryControls::USE_TPR_SHADOW); + } + + if ctrls.contains(PrimaryControls::USE_TPR_SHADOW) { + ctrls.remove(PrimaryControls::CR8_LOAD_EXITING | PrimaryControls::CR8_STORE_EXITING); + } else { + ctrls.insert(PrimaryControls::CR8_LOAD_EXITING | PrimaryControls::CR8_STORE_EXITING); + } + + if self.enable_ept { + ctrls.remove( + PrimaryControls::CR3_LOAD_EXITING + | PrimaryControls::CR3_STORE_EXITING + | PrimaryControls::INVLPG_EXITING, + ); + } + + if vmarch.mwait_in_guest { + ctrls.remove(PrimaryControls::MWAIT_EXITING | PrimaryControls::MONITOR_EXITING); + } + + if vmarch.hlt_in_guest { + ctrls.remove(PrimaryControls::HLT_EXITING); + } + + return ctrls; + } + + fn get_secondary_exec_controls(&mut self, vcpu: &VirtCpu, vm: &Vm) -> SecondaryControls { + let mut ctrls = self.vmcs_config.cpu_based_2nd_exec_ctrl; + + if self.pt_mode == ProcessorTraceMode::System { + ctrls.remove( + 
SecondaryControls::INTEL_PT_GUEST_PHYSICAL | SecondaryControls::CONCEAL_VMX_FROM_PT, + ); + } + + if !(self.enable_flexpriority && vcpu.arch.lapic_in_kernel()) { + ctrls.remove(SecondaryControls::VIRTUALIZE_APIC) + } + + if vcpu.vmx().vpid == 0 { + ctrls.remove(SecondaryControls::ENABLE_VPID); + } + + if !self.enable_ept { + ctrls.remove(SecondaryControls::ENABLE_EPT); + self.enable_unrestricted_guest = false; + } + + if !self.enable_unrestricted_guest { + ctrls.remove(SecondaryControls::UNRESTRICTED_GUEST); + } + + if vm.arch.pause_in_guest { + ctrls.remove(SecondaryControls::PAUSE_LOOP_EXITING); + } + if !vcpu.arch.vcpu_apicv_active() { + ctrls.remove( + SecondaryControls::VIRTUALIZE_APIC_REGISTER + | SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY, + ); + } + + ctrls.remove(SecondaryControls::VIRTUALIZE_X2APIC); + + ctrls.remove(SecondaryControls::ENABLE_VM_FUNCTIONS); + + ctrls.remove(SecondaryControls::DTABLE_EXITING); + + ctrls.remove(SecondaryControls::VMCS_SHADOWING); + + if !self.enable_pml || vm.nr_memslots_dirty_logging == 0 { + ctrls.remove(SecondaryControls::ENABLE_PML); + } + + // TODO: vmx_adjust_sec_exec_feature + + if self.has_rdtscp() { + kwarn!("adjust RDTSCP todo!"); + // todo!() + } + + return ctrls; + } + + fn get_vmexit_controls(&self) -> ExitControls { + let mut ctrls = self.vmcs_config.vmexit_ctrl; + + ctrls.remove( + ExitControls::SAVE_IA32_PAT + | ExitControls::SAVE_IA32_EFER + | ExitControls::SAVE_VMX_PREEMPTION_TIMER, + ); + + if self.pt_mode == ProcessorTraceMode::System { + ctrls.remove(ExitControls::CONCEAL_VMX_FROM_PT | ExitControls::CLEAR_IA32_RTIT_CTL); + } + + // todo: cpu_has_perf_global_ctrl_bug + + ctrls.remove(ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL | ExitControls::LOAD_IA32_EFER); + + ctrls + } + + fn get_vmentry_controls(&self) -> EntryControls { + let mut ctrls = self.vmcs_config.vmentry_ctrl; + + if self.pt_mode == ProcessorTraceMode::System { + ctrls.remove(EntryControls::CONCEAL_VMX_FROM_PT | EntryControls::LOAD_IA32_RTIT_CTL); + } + + ctrls.remove( + EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL + | EntryControls::LOAD_IA32_EFER + | EntryControls::IA32E_MODE_GUEST, + ); + + // todo: cpu_has_perf_global_ctrl_bug + + ctrls + } + + pub fn emulation_required(&self, vcpu: &mut VirtCpu) -> bool { + return self.emulate_invalid_guest_state && !self.guest_state_valid(vcpu); + } + + pub fn guest_state_valid(&self, vcpu: &mut VirtCpu) -> bool { + return vcpu.is_unrestricted_guest() || self.__guest_state_valid(vcpu); + } + + pub fn __guest_state_valid(&self, vcpu: &mut VirtCpu) -> bool { + if vcpu.arch.is_portected_mode() + || x86_kvm_ops().get_rflags(vcpu).contains(RFlags::FLAGS_VM) + { + if !self.rmode_segment_valid(vcpu, VcpuSegment::CS) { + return false; + } + if !self.rmode_segment_valid(vcpu, VcpuSegment::SS) { + return false; + } + if !self.rmode_segment_valid(vcpu, VcpuSegment::DS) { + return false; + } + if !self.rmode_segment_valid(vcpu, VcpuSegment::ES) { + return false; + } + if !self.rmode_segment_valid(vcpu, VcpuSegment::FS) { + return false; + } + if !self.rmode_segment_valid(vcpu, VcpuSegment::GS) { + return false; + } + } else { + todo!("protected mode guest state checks todo"); + } + + return true; + } + + pub fn vmx_get_segment( + &self, + vcpu: &mut VirtCpu, + mut var: UapiKvmSegment, + seg: VcpuSegment, + ) -> UapiKvmSegment { + if vcpu.vmx().rmode.vm86_active && seg != VcpuSegment::LDTR { + var = vcpu.vmx().rmode.segs[seg as usize]; + if seg == VcpuSegment::TR || var.selector == Vmx::vmx_read_guest_seg_selector(vcpu, seg) + { + return 
var; + } + + var.base = Vmx::vmx_read_guest_seg_base(vcpu, seg); + var.selector = Vmx::vmx_read_guest_seg_selector(vcpu, seg); + return var; + } + + var.base = Vmx::vmx_read_guest_seg_base(vcpu, seg); + var.limit = Vmx::vmx_read_guest_seg_limit(vcpu, seg); + var.selector = Vmx::vmx_read_guest_seg_selector(vcpu, seg); + + let ar = Vmx::vmx_read_guest_seg_ar(vcpu, seg); + + var.unusable = ((ar >> 16) & 1) as u8; + var.type_ = (ar & 15) as u8; + var.s = ((ar >> 4) & 1) as u8; + var.dpl = ((ar >> 5) & 3) as u8; + + var.present = !var.unusable; + var.avl = ((ar >> 12) & 1) as u8; + var.l = ((ar >> 13) & 1) as u8; + var.db = ((ar >> 14) & 1) as u8; + var.g = ((ar >> 15) & 1) as u8; + + return var; + } + + pub fn _vmx_set_segment( + &self, + vcpu: &mut VirtCpu, + mut var: UapiKvmSegment, + seg: VcpuSegment, + ) -> UapiKvmSegment { + let sf = &KVM_VMX_SEGMENT_FIELDS[seg as usize]; + + vcpu.vmx_mut().segment_cache_clear(); + + if vcpu.vmx().rmode.vm86_active && seg != VcpuSegment::LDTR { + vcpu.vmx_mut().rmode.segs[seg as usize] = var; + if seg == VcpuSegment::TR { + VmxAsm::vmx_vmwrite(sf.selector, var.selector as u64); + } else if var.s != 0 { + Vmx::fix_rmode_seg(seg, &vcpu.vmx().rmode.segs[seg as usize]); + } + return var; + } + + VmxAsm::vmx_vmwrite(sf.base, var.base); + VmxAsm::vmx_vmwrite(sf.limit, var.limit as u64); + VmxAsm::vmx_vmwrite(sf.selector, var.selector as u64); + + if vcpu.is_unrestricted_guest() && seg != VcpuSegment::LDTR { + var.type_ |= 0x1; + } + + VmxAsm::vmx_vmwrite(sf.ar_bytes, var.vmx_segment_access_rights() as u64); + return var; + } + + pub fn rmode_segment_valid(&self, vcpu: &mut VirtCpu, seg: VcpuSegment) -> bool { + let mut var = UapiKvmSegment::default(); + var = self.vmx_get_segment(vcpu, var, seg); + + var.dpl = 0x3; + + if seg == VcpuSegment::CS { + var.type_ = 0x3; + } + + let ar = var.vmx_segment_access_rights(); + + if var.base != ((var.selector as u64) << 4) { + return false; + } + + if var.limit != 0xffff { + return false; + } + + if ar != 0xf3 { + return false; + } + + true + } + + pub fn fix_rmode_seg(seg: VcpuSegment, save: &UapiKvmSegment) { + let sf = &KVM_VMX_SEGMENT_FIELDS[seg as usize]; + + let mut var = *save; + var.dpl = 0x3; + if seg == VcpuSegment::CS { + var.type_ = 0x3; + } + + if !vmx_info().emulate_invalid_guest_state { + var.selector = (var.base >> 4) as u16; + var.base = var.base & 0xffff0; + var.limit = 0xffff; + var.g = 0; + var.db = 0; + var.present = 1; + var.s = 1; + var.l = 0; + var.unusable = 0; + var.type_ = 0x3; + var.avl = 0; + if save.base & 0xf != 0 { + kwarn!("segment base is not paragraph aligned when entering protected mode (seg={seg:?})"); + } + } + + VmxAsm::vmx_vmwrite(sf.selector, var.selector as u64); + VmxAsm::vmx_vmwrite(sf.base, var.base); + VmxAsm::vmx_vmwrite(sf.limit, var.limit as u64); + VmxAsm::vmx_vmwrite(sf.ar_bytes, var.vmx_segment_access_rights() as u64); + } + + pub fn fix_pmode_seg( + &self, + vcpu: &mut VirtCpu, + seg: VcpuSegment, + mut save: UapiKvmSegment, + ) -> UapiKvmSegment { + if self.emulate_invalid_guest_state { + if seg == VcpuSegment::CS || seg == VcpuSegment::SS { + save.selector &= !0x3; + } + + save.dpl = (save.selector & 0x3) as u8; + save.s = 1; + } + + self._vmx_set_segment(vcpu, save, seg); + + return save; + } + + pub fn enter_pmode(&self, vcpu: &mut VirtCpu) { + self.get_segment_with_rmode(vcpu, VcpuSegment::ES); + self.get_segment_with_rmode(vcpu, VcpuSegment::DS); + self.get_segment_with_rmode(vcpu, VcpuSegment::FS); + self.get_segment_with_rmode(vcpu, VcpuSegment::GS); + 
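+        // Snapshot the remaining segments while vm86_active is still set, so the
+        // cached real-mode state is what gets fixed up for protected mode below.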
self.get_segment_with_rmode(vcpu, VcpuSegment::SS); + self.get_segment_with_rmode(vcpu, VcpuSegment::CS); + + vcpu.vmx_mut().rmode.vm86_active = false; + + self.set_segment_with_rmode(vcpu, VcpuSegment::TR); + + let mut flags = RFlags::from_bits_truncate(VmxAsm::vmx_vmread(guest::RFLAGS)); + + flags.remove(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM); + + flags.insert(vcpu.vmx().rmode.save_rflags & (RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM)); + + VmxAsm::vmx_vmwrite(guest::RFLAGS, flags.bits()); + + let cr4 = (Cr4::from_bits_truncate(VmxAsm::vmx_vmread(guest::CR4) as usize) + & (!Cr4::CR4_ENABLE_VME)) + | (Cr4::from_bits_truncate(VmxAsm::vmx_vmread(control::CR4_READ_SHADOW) as usize) + & Cr4::CR4_ENABLE_VME); + VmxAsm::vmx_vmwrite(guest::CR4, cr4.bits() as u64); + + VmxKvmFunc.update_exception_bitmap(vcpu); + + self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::CS); + self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::SS); + self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::ES); + self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::DS); + self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::FS); + self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::GS); + } + + fn fix_pmode_seg_with_rmode(&self, vcpu: &mut VirtCpu, seg: VcpuSegment) { + let segment = vcpu.vmx().rmode.segs[seg as usize]; + vcpu.vmx_mut().rmode.segs[seg as usize] = self.fix_pmode_seg(vcpu, seg, segment); + } + + fn get_segment_with_rmode(&self, vcpu: &mut VirtCpu, seg: VcpuSegment) { + let segment = vcpu.vmx().rmode.segs[seg as usize]; + vcpu.vmx_mut().rmode.segs[seg as usize] = self.vmx_get_segment(vcpu, segment, seg); + } + + fn set_segment_with_rmode(&self, vcpu: &mut VirtCpu, seg: VcpuSegment) { + let segment = vcpu.vmx().rmode.segs[seg as usize]; + vcpu.vmx_mut().rmode.segs[seg as usize] = self._vmx_set_segment(vcpu, segment, seg); + } + + pub fn enter_rmode(&self, vcpu: &mut VirtCpu, vm: &Vm) { + let kvm_vmx = vm.kvm_vmx(); + + self.get_segment_with_rmode(vcpu, VcpuSegment::TR); + self.get_segment_with_rmode(vcpu, VcpuSegment::ES); + self.get_segment_with_rmode(vcpu, VcpuSegment::DS); + self.get_segment_with_rmode(vcpu, VcpuSegment::FS); + self.get_segment_with_rmode(vcpu, VcpuSegment::GS); + self.get_segment_with_rmode(vcpu, VcpuSegment::SS); + self.get_segment_with_rmode(vcpu, VcpuSegment::CS); + + vcpu.vmx_mut().rmode.vm86_active = true; + + vcpu.vmx_mut().segment_cache_clear(); + + VmxAsm::vmx_vmwrite(guest::TR_BASE, kvm_vmx.tss_addr as u64); + VmxAsm::vmx_vmwrite(guest::TR_LIMIT, RMODE_TSS_SIZE as u64 - 1); + VmxAsm::vmx_vmwrite(guest::TR_ACCESS_RIGHTS, 0x008b); + + let mut flags = RFlags::from_bits_truncate(VmxAsm::vmx_vmread(guest::RFLAGS)); + vcpu.vmx_mut().rmode.save_rflags = flags; + + flags.insert(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM); + + VmxAsm::vmx_vmwrite(guest::RFLAGS, flags.bits()); + VmxAsm::vmx_vmwrite( + guest::CR4, + VmxAsm::vmx_vmread(guest::CR4) | Cr4::CR4_ENABLE_VME.bits() as u64, + ); + + VmxKvmFunc.update_exception_bitmap(vcpu); + + self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::SS); + self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::CS); + self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::ES); + self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::DS); + self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::GS); + self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::FS); + } + + fn fix_rmode_seg_with_rmode(&self, vcpu: &VirtCpu, seg: VcpuSegment) { + Vmx::fix_rmode_seg(seg, &vcpu.vmx().rmode.segs[seg as usize]); + } + + pub fn vmx_read_guest_seg_ar(vcpu: &mut VirtCpu, seg: VcpuSegment) -> u32 { + if 
!Vmx::vmx_segment_cache_test_set(vcpu, seg, SegmentCacheField::AR) { + vcpu.vmx_mut().segment_cache.seg[seg as usize].ar = + VmxAsm::vmx_vmread(KVM_VMX_SEGMENT_FIELDS[seg as usize].ar_bytes) as u32; + } + + return vcpu.vmx().segment_cache.seg[seg as usize].ar; + } + + pub fn vmx_read_guest_seg_selector(vcpu: &mut VirtCpu, seg: VcpuSegment) -> u16 { + if !Vmx::vmx_segment_cache_test_set(vcpu, seg, SegmentCacheField::SEL) { + vcpu.vmx_mut().segment_cache.seg[seg as usize].selector = + VmxAsm::vmx_vmread(KVM_VMX_SEGMENT_FIELDS[seg as usize].selector) as u16; + } + + return vcpu.vmx().segment_cache.seg[seg as usize].selector; + } + + pub fn vmx_read_guest_seg_base(vcpu: &mut VirtCpu, seg: VcpuSegment) -> u64 { + if !Vmx::vmx_segment_cache_test_set(vcpu, seg, SegmentCacheField::BASE) { + vcpu.vmx_mut().segment_cache.seg[seg as usize].base = + VmxAsm::vmx_vmread(KVM_VMX_SEGMENT_FIELDS[seg as usize].base); + } + + return vcpu.vmx().segment_cache.seg[seg as usize].base; + } + + pub fn vmx_read_guest_seg_limit(vcpu: &mut VirtCpu, seg: VcpuSegment) -> u32 { + if !Vmx::vmx_segment_cache_test_set(vcpu, seg, SegmentCacheField::LIMIT) { + vcpu.vmx_mut().segment_cache.seg[seg as usize].limit = + VmxAsm::vmx_vmread(KVM_VMX_SEGMENT_FIELDS[seg as usize].limit) as u32; + } + + return vcpu.vmx().segment_cache.seg[seg as usize].limit; + } + + fn vmx_segment_cache_test_set( + vcpu: &mut VirtCpu, + seg: VcpuSegment, + field: SegmentCacheField, + ) -> bool { + let mask = 1u32 << (seg as usize * SegmentCacheField::NR as usize + field as usize); + + if !vcpu.arch.is_register_available(KvmReg::VcpuExregSegments) { + vcpu.arch.mark_register_available(KvmReg::VcpuExregSegments); + vcpu.vmx_mut().segment_cache_clear(); + } + + let ret = vcpu.vmx().segment_cache.bitmask & mask; + + vcpu.vmx_mut().segment_cache.bitmask |= mask; + + return ret != 0; + } + + pub fn vmx_vcpu_enter_exit(vcpu: &mut VirtCpu, flags: VmxRunFlag) { + // TODO: vmx_l1d_should_flush and mmio_stale_data_clear + + // TODO: vmx_disable_fb_clear + + if vcpu.arch.cr2 != unsafe { cr2() } as u64 { + unsafe { cr2_write(vcpu.arch.cr2) }; + } + + let fail = + unsafe { __vmx_vcpu_run(vcpu.vmx(), vcpu.arch.regs.as_ptr(), flags.bits as u32) }; + vcpu.vmx_mut().fail = fail as u8; + + todo!() + } +} + +extern "C" { + fn __vmx_vcpu_run(vmx: &VmxVCpuPriv, regs: *const u64, flags: u32) -> i32; } struct VmcsEntryExitPair { @@ -900,6 +2386,28 @@ impl VmcsEntryExitPair { } } +#[derive(Debug, Default)] +#[repr(C, align(64))] +pub struct PostedIntrDesc { + pir: [u32; 8], + control: PostedIntrDescControl, + // 保留位 + rsvd: [u32; 6], +} + +#[bitfield(u64)] +pub struct PostedIntrDescControl { + #[bits(1)] + on: bool, + #[bits(1)] + sn: bool, + #[bits(14)] + rsvd_1: u16, + nv: u8, + rsvd_2: u8, + ndst: u32, +} + #[derive(Debug, Default, Clone, Copy)] pub struct VmxUretMsr { load_into_hardware: bool, @@ -907,31 +2415,154 @@ pub struct VmxUretMsr { mask: u64, } +#[derive(Debug, Default)] +pub struct VmxMsrs { + nr: usize, + val: [VmxMsrEntry; Self::MAX_NR_LOADSTORE_MSRS], +} + +impl VmxMsrs { + pub const MAX_NR_LOADSTORE_MSRS: usize = 8; + + pub fn find_loadstore_msr_slot(&self, msr: u32) -> Option { + for i in 0..self.nr { + if self.val[i].index == msr { + return Some(i); + } + } + + None + } +} + +#[derive(Debug, Default)] +pub struct VmxMsrAutoLoad { + guest: VmxMsrs, + host: VmxMsrs, +} + +#[derive(Debug)] +pub struct VmxRMode { + pub vm86_active: bool, + pub save_rflags: RFlags, + pub segs: [UapiKvmSegment; 8], +} + +impl Default for VmxRMode { + fn default() -> Self { + 
Self { + vm86_active: false, + save_rflags: RFlags::empty(), + segs: [UapiKvmSegment::default(); 8], + } + } +} + +#[derive(Debug, Clone, Copy, Default)] +pub struct VmxSaveSegment { + selector: u16, + base: u64, + limit: u32, + ar: u32, +} + +#[derive(Debug, Default)] +pub struct VmxSegmentCache { + pub bitmask: u32, + pub seg: [VmxSaveSegment; 8], +} + #[derive(Debug)] pub struct VmxVCpuPriv { - vpid: Option, + vpid: u16, + + fail: u8, + vmcs01: Arc, loaded_vmcs: Arc, guest_uret_msrs: [VmxUretMsr; KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS], + guest_uret_msrs_loaded: bool, + + post_intr_desc: PostedIntrDesc, shadow_msr_intercept_read: AllocBitmap, shadow_msr_intercept_write: AllocBitmap, + + msr_ia32_feature_control: u64, + msr_ia32_feature_control_valid_bits: u64, + + emulation_required: bool, + + rflags: RFlags, + + ple_window: u32, + ple_window_dirty: bool, + + msr_autoload: VmxMsrAutoLoad, + + pml_pg: Box<[u8; MMArch::PAGE_SIZE]>, + + rmode: VmxRMode, + + spec_ctrl: u64, + msr_ia32_umwait_control: u32, + hv_deadline_tsc: u64, + + segment_cache: VmxSegmentCache, +} + +#[derive(Debug, Default)] +pub struct KvmVmx { + tss_addr: usize, + ept_identity_pagetable_done: bool, + ept_identity_map_addr: u64, + pid_table: Option>, +} + +impl KvmVmx { + pub fn pid_table(&self) -> &[u64; MMArch::PAGE_SIZE] { + self.pid_table.as_ref().unwrap().as_ref() + } } impl VmxVCpuPriv { + pub const PML_ENTITY_NUM: usize = 512; + + pub fn loaded_vmcs(&self) -> SpinLockGuard { + self.loaded_vmcs.lock() + } + /// 参考:https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#7452 pub fn init(vcpu: &mut VirtCpu, vm: &Vm) { let vmcs = LockedLoadedVmcs::new(); + + // TODO: 改堆分配 let mut vmx = Self { - vpid: None, + vpid: 0, + fail: 0, vmcs01: vmcs.clone(), loaded_vmcs: vmcs, guest_uret_msrs: [VmxUretMsr::default(); KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS], shadow_msr_intercept_read: AllocBitmap::new(16), shadow_msr_intercept_write: AllocBitmap::new(16), + post_intr_desc: PostedIntrDesc::default(), + ple_window: 0, + ple_window_dirty: false, + msr_autoload: VmxMsrAutoLoad::default(), + pml_pg: unsafe { Box::new_zeroed().assume_init() }, + guest_uret_msrs_loaded: false, + msr_ia32_feature_control: 0, + msr_ia32_feature_control_valid_bits: 0, + rmode: VmxRMode::default(), + spec_ctrl: 0, + msr_ia32_umwait_control: 0, + hv_deadline_tsc: u64::MAX, + segment_cache: VmxSegmentCache::default(), + emulation_required: false, + rflags: RFlags::empty(), }; - vmx.vpid = vmx_info().alloc_vpid(); + vmx.vpid = vmx_info().alloc_vpid().unwrap_or_default() as u16; for i in 0..x86_kvm_manager().kvm_uret_msrs_list.len() { vmx.guest_uret_msrs[i].mask = u64::MAX; @@ -991,6 +2622,12 @@ impl VmxVCpuPriv { } } + fn set_uret_msr(&mut self, msr: u32, data: u64) { + if let Some(msr) = self.find_uret_msr_mut(msr) { + msr.data = data; + } + } + pub fn find_uret_msr_mut(&mut self, msr: u32) -> Option<&mut VmxUretMsr> { let idx = x86_kvm_manager().find_user_return_msr_idx(msr); if let Some(index) = idx { @@ -1000,6 +2637,14 @@ impl VmxVCpuPriv { } } + fn set_guest_uret_msr(&mut self, msr: &VmxUretMsr, data: u64) -> Result<(), SystemError> { + if msr.load_into_hardware { + todo!() + } + + Ok(()) + } + /// ## 禁用对特定的 MSR 的拦截 fn disable_intercept_for_msr(&mut self, arch: &KvmArch, msr: u32, mut msr_type: MsrType) { if !vmx_info().has_msr_bitmap() { @@ -1044,6 +2689,175 @@ impl VmxVCpuPriv { msr_bitmap.ctl(msr, VmxMsrBitmapAction::Clear, VmxMsrBitmapAccess::Write); } } + + #[inline] + pub fn segment_cache_clear(&mut self) { + 
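+        // One bit per (segment, field) pair: bit = seg * SegmentCacheField::NR + field
+        // (see vmx_segment_cache_test_set). Zeroing the mask forces every field to
+        // be re-read from the VMCS on its next access.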
self.segment_cache.bitmask = 0; + } + + pub fn clear_atomic_switch_msr(&mut self, msr: u32) { + match msr { + msr::IA32_EFER => { + if vmx_info().has_load_ia32_efer() { + self.clear_stomic_switch_msr_special( + EntryControls::LOAD_IA32_EFER.bits().into(), + ExitControls::LOAD_IA32_EFER.bits().into(), + ); + return; + } + } + + msr::MSR_PERF_GLOBAL_CTRL => { + if vmx_info().has_load_perf_global_ctrl() { + self.clear_stomic_switch_msr_special( + EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL.bits().into(), + ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL.bits().into(), + ); + return; + } + } + _ => {} + } + + let m = &mut self.msr_autoload; + let i = m.guest.find_loadstore_msr_slot(msr); + + if let Some(i) = i { + m.guest.nr -= 1; + m.guest.val[i] = m.guest.val[m.guest.nr]; + VmxAsm::vmx_vmwrite(control::VMENTRY_MSR_LOAD_COUNT, m.guest.nr as u64); + } + + let i = m.host.find_loadstore_msr_slot(msr); + if let Some(i) = i { + m.host.nr -= 1; + m.host.val[i] = m.host.val[m.host.nr]; + VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_LOAD_COUNT, m.host.nr as u64); + } + } + + fn clear_stomic_switch_msr_special(&self, entry: u64, exit: u64) { + let mut guard = self.loaded_vmcs.lock(); + guard.controls_clearbit(ControlsType::VmEntry, entry); + guard.controls_clearbit(ControlsType::VmExit, exit); + } + + pub fn add_atomic_switch_msr( + &mut self, + msr: u32, + guest_val: u64, + host_val: u64, + entry_only: bool, + ) { + match msr { + msr::IA32_EFER => { + if vmx_info().has_load_ia32_efer() { + self.add_atomic_switch_msr_special( + EntryControls::LOAD_IA32_EFER.bits() as u64, + ExitControls::LOAD_IA32_EFER.bits() as u64, + guest::IA32_EFER_FULL, + host::IA32_EFER_FULL, + guest_val, + host_val, + ); + return; + } + } + msr::MSR_PERF_GLOBAL_CTRL => { + if vmx_info().has_load_perf_global_ctrl() { + self.add_atomic_switch_msr_special( + EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL.bits().into(), + ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL.bits().into(), + guest::IA32_PERF_GLOBAL_CTRL_FULL, + host::IA32_PERF_GLOBAL_CTRL_FULL, + guest_val, + host_val, + ); + return; + } + } + msr::MSR_PEBS_ENABLE => { + unsafe { wrmsr(msr::MSR_PEBS_ENABLE, 0) }; + } + + _ => {} + } + + let m = &mut self.msr_autoload; + let mut i = m.guest.find_loadstore_msr_slot(msr); + let j = if !entry_only { + m.host.find_loadstore_msr_slot(msr) + } else { + Some(0) + }; + + if (i.is_none() && m.guest.nr == VmxMsrs::MAX_NR_LOADSTORE_MSRS) + || (j.is_none() && m.host.nr == VmxMsrs::MAX_NR_LOADSTORE_MSRS) + { + kwarn!("Not enough msr switch entries. 
Can't add msr {:x}", msr); + return; + } + + let i = if i.is_none() { + m.guest.nr += 1; + VmxAsm::vmx_vmwrite(control::VMENTRY_MSR_LOAD_COUNT, m.guest.nr as u64); + m.guest.nr + } else { + i.unwrap() + }; + + m.guest.val[i].index = msr; + m.guest.val[i].data = guest_val; + + if entry_only { + return; + } + + let j = if j.is_none() { + m.host.nr += 1; + VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_LOAD_COUNT, m.host.nr as u64); + m.host.nr + } else { + j.unwrap() + }; + + m.host.val[i].index = msr; + m.host.val[i].data = host_val; + } + + fn add_atomic_switch_msr_special( + &self, + entry: u64, + exit: u64, + guest_val_vmcs: u32, + host_val_vmcs: u32, + guest_val: u64, + host_val: u64, + ) { + VmxAsm::vmx_vmwrite(guest_val_vmcs, guest_val); + if host_val_vmcs != host::IA32_EFER_FULL { + VmxAsm::vmx_vmwrite(host_val_vmcs, host_val); + } + + let mut guard = self.loaded_vmcs.lock(); + guard.controls_setbit(ControlsType::VmEntry, entry); + guard.controls_setbit(ControlsType::VmExit, exit); + } + + pub fn vmx_vcpu_run_flags(&self) -> VmxRunFlag { + let mut flags = VmxRunFlag::empty(); + + if self.loaded_vmcs().launched { + flags.insert(VmxRunFlag::VMRESUME); + } + + // MSR_IA32_SPEC_CTRL + if !self.loaded_vmcs().msr_write_intercepted(0x48) { + flags.insert(VmxRunFlag::SAVE_SPEC_CTRL); + } + + flags + } } bitflags! { @@ -1052,6 +2866,24 @@ bitflags! { const WRITE = 2; const RW = 3; } + + pub struct PageFaultErr: u64 { + const PFERR_PRESENT = 1 << 0; + const PFERR_WRITE = 1 << 1; + const PFERR_USER = 1 << 2; + const PFERR_RSVD = 1 << 3; + const PFERR_FETCH = 1 << 4; + const PFERR_PK = 1 << 5; + const PFERR_SGX = 1 << 15; + const PFERR_GUEST_FINAL = 1 << 32; + const PFERR_GUEST_PAGE = 1 << 33; + const PFERR_IMPLICIT_ACCESS = 1 << 48; + } + + pub struct VmxRunFlag: u8 { + const VMRESUME = 1 << 0; + const SAVE_SPEC_CTRL = 1 << 1; + } } #[derive(Debug, PartialEq)] @@ -1064,6 +2896,209 @@ pub enum VmxL1dFlushState { FlushNotRequired, } +pub struct VmxSegmentField { + selector: u32, + base: u32, + limit: u32, + ar_bytes: u32, +} + +pub const KVM_VMX_SEGMENT_FIELDS: &'static [VmxSegmentField] = &[ + // CS + VmxSegmentField { + selector: guest::CS_SELECTOR, + base: guest::CS_BASE, + limit: guest::CS_LIMIT, + ar_bytes: guest::CS_ACCESS_RIGHTS, + }, + // DS + VmxSegmentField { + selector: guest::DS_SELECTOR, + base: guest::DS_BASE, + limit: guest::DS_LIMIT, + ar_bytes: guest::DS_ACCESS_RIGHTS, + }, + // ES + VmxSegmentField { + selector: guest::ES_SELECTOR, + base: guest::ES_BASE, + limit: guest::ES_LIMIT, + ar_bytes: guest::ES_ACCESS_RIGHTS, + }, + // FS + VmxSegmentField { + selector: guest::FS_SELECTOR, + base: guest::FS_BASE, + limit: guest::FS_LIMIT, + ar_bytes: guest::FS_ACCESS_RIGHTS, + }, + // GS + VmxSegmentField { + selector: guest::GS_SELECTOR, + base: guest::GS_BASE, + limit: guest::GS_LIMIT, + ar_bytes: guest::GS_ACCESS_RIGHTS, + }, + // SS + VmxSegmentField { + selector: guest::SS_SELECTOR, + base: guest::SS_BASE, + limit: guest::SS_LIMIT, + ar_bytes: guest::SS_ACCESS_RIGHTS, + }, + // TR + VmxSegmentField { + selector: guest::TR_SELECTOR, + base: guest::TR_BASE, + limit: guest::TR_LIMIT, + ar_bytes: guest::TR_ACCESS_RIGHTS, + }, + // LDTR + VmxSegmentField { + selector: guest::LDTR_SELECTOR, + base: guest::LDTR_BASE, + limit: guest::LDTR_LIMIT, + ar_bytes: guest::LDTR_ACCESS_RIGHTS, + }, +]; + +#[derive(FromPrimitive)] +#[allow(non_camel_case_types)] +pub enum VmxExitReason { + EXCEPTION_OR_NMI = 0, + EXTERNAL_INTERRUPT = 1, + TRIPLE_FAULT = 2, + INIT_SIGNAL = 3, + SIPI = 4, + IO_SMI = 5, + 
OTHER_SMI = 6, + INTERRUPT_WINDOW = 7, + NMI_WINDOW = 8, + TASK_SWITCH = 9, + CPUID = 10, + GETSEC = 11, + HLT = 12, + INVD = 13, + INVLPG = 14, + RDPMC = 15, + RDTSC = 16, + RSM = 17, + VMCALL = 18, + VMCLEAR = 19, + VMLAUNCH = 20, + VMPTRLD = 21, + VMPTRST = 22, + VMREAD = 23, + VMRESUME = 24, + VMWRITE = 25, + VMXOFF = 26, + VMXON = 27, + CR_ACCESS = 28, + DR_ACCESS = 29, + IO_INSTRUCTION = 30, + RDMSR = 31, + WRMSR = 32, + VM_ENTRY_FAILURE_INVALID_GUEST_STATE = 33, + VM_ENTRY_FAILURE_MSR_LOADING = 34, + MWAIT = 36, + MONITOR_TRAP_FLAG = 37, + MONITOR = 39, + PAUSE = 40, + VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT = 41, + TPR_BELOW_THRESHOLD = 43, + APIC_ACCESS = 44, + VIRTUALIZED_EOI = 45, + ACCESS_GDTR_OR_IDTR = 46, + ACCESS_LDTR_OR_TR = 47, + EPT_VIOLATION = 48, + EPT_MISCONFIG = 49, + INVEPT = 50, + RDTSCP = 51, + VMX_PREEMPTION_TIMER_EXPIRED = 52, + INVVPID = 53, + WBINVD = 54, + XSETBV = 55, + APIC_WRITE = 56, + RDRAND = 57, + INVPCID = 58, + VMFUNC = 59, + ENCLS = 60, + RDSEED = 61, + PML_FULL = 62, + XSAVES = 63, + XRSTORS = 64, +} + +impl From for VmxExitReason { + fn from(num: i32) -> Self { + match num { + 0 => VmxExitReason::EXCEPTION_OR_NMI, + 1 => VmxExitReason::EXTERNAL_INTERRUPT, + 2 => VmxExitReason::TRIPLE_FAULT, + 3 => VmxExitReason::INIT_SIGNAL, + 4 => VmxExitReason::SIPI, + 5 => VmxExitReason::IO_SMI, + 6 => VmxExitReason::OTHER_SMI, + 7 => VmxExitReason::INTERRUPT_WINDOW, + 8 => VmxExitReason::NMI_WINDOW, + 9 => VmxExitReason::TASK_SWITCH, + 10 => VmxExitReason::CPUID, + 11 => VmxExitReason::GETSEC, + 12 => VmxExitReason::HLT, + 13 => VmxExitReason::INVD, + 14 => VmxExitReason::INVLPG, + 15 => VmxExitReason::RDPMC, + 16 => VmxExitReason::RDTSC, + 17 => VmxExitReason::RSM, + 18 => VmxExitReason::VMCALL, + 19 => VmxExitReason::VMCLEAR, + 20 => VmxExitReason::VMLAUNCH, + 21 => VmxExitReason::VMPTRLD, + 22 => VmxExitReason::VMPTRST, + 23 => VmxExitReason::VMREAD, + 24 => VmxExitReason::VMRESUME, + 25 => VmxExitReason::VMWRITE, + 26 => VmxExitReason::VMXOFF, + 27 => VmxExitReason::VMXON, + 28 => VmxExitReason::CR_ACCESS, + 29 => VmxExitReason::DR_ACCESS, + 30 => VmxExitReason::IO_INSTRUCTION, + 31 => VmxExitReason::RDMSR, + 32 => VmxExitReason::WRMSR, + 33 => VmxExitReason::VM_ENTRY_FAILURE_INVALID_GUEST_STATE, + 34 => VmxExitReason::VM_ENTRY_FAILURE_MSR_LOADING, + 36 => VmxExitReason::MWAIT, + 37 => VmxExitReason::MONITOR_TRAP_FLAG, + 39 => VmxExitReason::MONITOR, + 40 => VmxExitReason::PAUSE, + 41 => VmxExitReason::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT, + 43 => VmxExitReason::TPR_BELOW_THRESHOLD, + 44 => VmxExitReason::APIC_ACCESS, + 45 => VmxExitReason::VIRTUALIZED_EOI, + 46 => VmxExitReason::ACCESS_GDTR_OR_IDTR, + 47 => VmxExitReason::ACCESS_LDTR_OR_TR, + 48 => VmxExitReason::EPT_VIOLATION, + 49 => VmxExitReason::EPT_MISCONFIG, + 50 => VmxExitReason::INVEPT, + 51 => VmxExitReason::RDTSCP, + 52 => VmxExitReason::VMX_PREEMPTION_TIMER_EXPIRED, + 53 => VmxExitReason::INVVPID, + 54 => VmxExitReason::WBINVD, + 55 => VmxExitReason::XSETBV, + 56 => VmxExitReason::APIC_WRITE, + 57 => VmxExitReason::RDRAND, + 58 => VmxExitReason::INVPCID, + 59 => VmxExitReason::VMFUNC, + 60 => VmxExitReason::ENCLS, + 61 => VmxExitReason::RDSEED, + 62 => VmxExitReason::PML_FULL, + 63 => VmxExitReason::XSAVES, + 64 => VmxExitReason::XRSTORS, + _ => panic!("Invalid VmxExitReason number: {}", num), + } + } +} + pub static L1TF_VMX_MITIGATION: RwLock = RwLock::new(VmxL1dFlushState::FlushAuto); pub fn vmx_init() -> Result<(), SystemError> { @@ -1082,3 +3117,20 @@ pub fn vmx_init() -> Result<(), 
SystemError> { kvm_init()?; Ok(()) } + +#[no_mangle] +unsafe extern "C" fn vmx_update_host_rsp(vcpu_vmx: &VmxVCpuPriv, host_rsp: usize) { + let mut guard = vcpu_vmx.loaded_vmcs.lock(); + if unlikely(host_rsp != guard.host_state.rsp) { + guard.host_state.rsp = host_rsp; + VmxAsm::vmx_vmwrite(host::RSP, host_rsp as u64); + } + + return; +} + +#[no_mangle] +unsafe extern "C" fn vmx_spec_ctrl_restore_host(vcpu_vmx: &VmxVCpuPriv, flags: u32) { + // TODO + return; +} diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs index c3318f209..fac1a0504 100644 --- a/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs @@ -1,6 +1,14 @@ use alloc::{boxed::Box, collections::LinkedList, sync::Arc, vec::Vec}; use bitmap::{traits::BitMapOps, AllocBitmap}; use system_error::SystemError; +use x86::{ + controlregs::Cr4, + vmx::vmcs::{ + control::{self, PrimaryControls}, + guest, + }, +}; +use x86_64::registers::control::{Cr3, Cr3Flags}; use crate::{ arch::{vm::asm::VmxAsm, MMArch}, @@ -106,10 +114,10 @@ impl LockedVMControlStructure { } } -#[derive(Debug, Default)] +#[derive(Debug)] pub struct VmcsHostState { - pub cr3: usize, - pub cr4: usize, + pub cr3: Cr3Flags, + pub cr4: Cr4, pub gs_base: usize, pub fs_base: usize, pub rsp: usize, @@ -120,6 +128,23 @@ pub struct VmcsHostState { pub rs_sel: u16, } +impl Default for VmcsHostState { + fn default() -> Self { + Self { + cr3: Cr3Flags::empty(), + cr4: Cr4::empty(), + gs_base: 0, + fs_base: 0, + rsp: 0, + fs_sel: 0, + gs_sel: 0, + ldt_sel: 0, + ds_sel: 0, + rs_sel: 0, + } + } +} + #[derive(Debug, Default)] pub struct VmcsControlsShadow { vm_entry: u32, @@ -127,7 +152,7 @@ pub struct VmcsControlsShadow { pin: u32, exec: u32, secondary_exec: u32, - tertiary_exec: u32, + tertiary_exec: u64, } #[derive(Debug)] @@ -142,7 +167,7 @@ pub struct LoadedVmcs { /// Hypervisor 定时器是否被软禁用 hv_timer_soft_disabled: bool, /// 支持 vnmi-less CPU 的字段,指示 VNMI 是否被软阻止 - soft_vnmi_blocked: bool, + pub soft_vnmi_blocked: bool, /// 记录 VM 进入时间 entry_time: u64, /// 记录 VNMI 被阻止的时间 @@ -150,16 +175,102 @@ pub struct LoadedVmcs { /// msr位图 pub msr_bitmap: VmxMsrBitmap, /// 保存 VMCS 主机状态的结构体 - host_state: VmcsHostState, + pub host_state: VmcsHostState, /// 保存 VMCS 控制字段的shadow状态的结构体。 controls_shadow: VmcsControlsShadow, } +impl LoadedVmcs { + pub fn controls_set(&mut self, ctl_type: ControlsType, value: u64) { + match ctl_type { + ControlsType::VmEntry => { + if self.controls_shadow.vm_entry != value as u32 { + VmxAsm::vmx_vmwrite(control::VMENTRY_CONTROLS, value); + self.controls_shadow.vm_entry = value as u32; + } + } + ControlsType::VmExit => { + if self.controls_shadow.vm_exit != value as u32 { + VmxAsm::vmx_vmwrite(control::VMEXIT_CONTROLS, value); + self.controls_shadow.vm_exit = value as u32; + } + } + ControlsType::Pin => { + if self.controls_shadow.pin != value as u32 { + VmxAsm::vmx_vmwrite(control::PINBASED_EXEC_CONTROLS, value); + self.controls_shadow.pin = value as u32; + } + } + ControlsType::Exec => { + if self.controls_shadow.exec != value as u32 { + VmxAsm::vmx_vmwrite(control::PRIMARY_PROCBASED_EXEC_CONTROLS, value); + self.controls_shadow.exec = value as u32; + } + } + ControlsType::SecondaryExec => { + if self.controls_shadow.secondary_exec != value as u32 { + VmxAsm::vmx_vmwrite(control::SECONDARY_PROCBASED_EXEC_CONTROLS, value); + self.controls_shadow.secondary_exec = value as u32; + } + } + ControlsType::TertiaryExec => { + if self.controls_shadow.tertiary_exec != value { + 
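+                    // 0x2034 is the VMCS encoding of the 64-bit tertiary
+                    // processor-based VM-execution controls field (Intel SDM,
+                    // vol. 3 appendix B), which the x86 crate does not yet name.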
VmxAsm::vmx_vmwrite(0x2034, value);
+                    self.controls_shadow.tertiary_exec = value;
+                }
+            }
+        }
+    }
+
+    pub fn controls_get(&self, ctl_type: ControlsType) -> u64 {
+        match ctl_type {
+            ControlsType::VmEntry => self.controls_shadow.vm_entry as u64,
+            ControlsType::VmExit => self.controls_shadow.vm_exit as u64,
+            ControlsType::Pin => self.controls_shadow.pin as u64,
+            ControlsType::Exec => self.controls_shadow.exec as u64,
+            ControlsType::SecondaryExec => self.controls_shadow.secondary_exec as u64,
+            ControlsType::TertiaryExec => self.controls_shadow.tertiary_exec,
+        }
+    }
+
+    pub fn controls_setbit(&mut self, ctl_type: ControlsType, value: u64) {
+        let val = self.controls_get(ctl_type) | value;
+        self.controls_set(ctl_type, val)
+    }
+
+    pub fn controls_clearbit(&mut self, ctl_type: ControlsType, value: u64) {
+        let val = self.controls_get(ctl_type) & (!value);
+        self.controls_set(ctl_type, val)
+    }
+
+    pub fn msr_write_intercepted(&mut self, msr: u32) -> bool {
+        if !PrimaryControls::from_bits_truncate(self.controls_get(ControlsType::Exec) as u32)
+            .contains(PrimaryControls::USE_MSR_BITMAPS)
+        {
+            return true; // without an MSR bitmap, every MSR write is intercepted
+        }
+
+        return self
+            .msr_bitmap
+            .ctl(msr, VmxMsrBitmapAction::Test, VmxMsrBitmapAccess::Write);
+    }
+}
+
 #[derive(Debug)]
 pub struct LockedLoadedVmcs {
     inner: SpinLock<LoadedVmcs>,
 }
 
+#[derive(Debug, Clone, Copy)]
+pub enum ControlsType {
+    VmEntry,
+    VmExit,
+    Pin,
+    Exec,
+    SecondaryExec,
+    TertiaryExec,
+}
+
 impl LockedLoadedVmcs {
     pub fn new() -> Arc<Self> {
         let bitmap = if vmx_info().has_msr_bitmap() {
@@ -198,6 +309,7 @@ impl LockedLoadedVmcs {
 #[derive(Debug)]
 pub struct VmxMsrBitmap {
     data: AllocBitmap,
+    phys_addr: usize,
 }
 
 pub enum VmxMsrBitmapAction {
@@ -224,7 +336,16 @@ impl VmxMsrBitmap {
     pub fn new(init_val: bool, size: usize) -> Self {
         let mut data = AllocBitmap::new(size);
         data.set_all(init_val);
-        Self { data }
+
+        let addr = data.data() as *const [usize] as *const usize as usize;
+        Self {
+            data,
+            phys_addr: virt_2_phys(addr),
+        }
+    }
+
+    pub fn phys_addr(&self) -> usize {
+        self.phys_addr
     }
 
     pub fn ctl(
diff --git a/kernel/src/arch/x86_64/vm/vmx/vmenter.S b/kernel/src/arch/x86_64/vm/vmx/vmenter.S
new file mode 100644
index 000000000..b1119e76b
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/vmx/vmenter.S
@@ -0,0 +1,179 @@
+#include "common/asm.h"
+
+#define __VCPU_REGS_RAX 0
+#define __VCPU_REGS_RCX 1
+#define __VCPU_REGS_RDX 2
+#define __VCPU_REGS_RBX 3
+#define __VCPU_REGS_RSP 4
+#define __VCPU_REGS_RBP 5
+#define __VCPU_REGS_RSI 6
+#define __VCPU_REGS_RDI 7
+
+#define __VCPU_REGS_R8 8
+#define __VCPU_REGS_R9 9
+#define __VCPU_REGS_R10 10
+#define __VCPU_REGS_R11 11
+#define __VCPU_REGS_R12 12
+#define __VCPU_REGS_R13 13
+#define __VCPU_REGS_R14 14
+#define __VCPU_REGS_R15 15
+
+#define VCPU_RAX __VCPU_REGS_RAX * 8
+#define VCPU_RCX __VCPU_REGS_RCX * 8
+#define VCPU_RDX __VCPU_REGS_RDX * 8
+#define VCPU_RBX __VCPU_REGS_RBX * 8
+#define VCPU_RBP __VCPU_REGS_RBP * 8
+#define VCPU_RSI __VCPU_REGS_RSI * 8
+#define VCPU_RDI __VCPU_REGS_RDI * 8
+
+#define VCPU_R8 __VCPU_REGS_R8 * 8
+#define VCPU_R9 __VCPU_REGS_R9 * 8
+#define VCPU_R10 __VCPU_REGS_R10 * 8
+#define VCPU_R11 __VCPU_REGS_R11 * 8
+#define VCPU_R12 __VCPU_REGS_R12 * 8
+#define VCPU_R13 __VCPU_REGS_R13 * 8
+#define VCPU_R14 __VCPU_REGS_R14 * 8
+#define VCPU_R15 __VCPU_REGS_R15 * 8
+
+#define VMX_RUN_VMRESUME_SHIFT 0
+#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
+
+#define VMX_RUN_VMRESUME (1 << VMX_RUN_VMRESUME_SHIFT)
+#define VMX_RUN_SAVE_SPEC_CTRL (1 << VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
+
+// Run the vCPU in guest mode
+ENTRY(__vmx_vcpu_run)
+    pushq 
+    pushq %rbp
+    movq %rsp, %rbp
+
+    pushq %r15
+    pushq %r14
+    pushq %r13
+    pushq %r12
+
+    push %rbx
+
+    // parameter 1
+    push %rdi
+    // parameter 3
+    push %rdx
+    // parameter 2
+    push %rsi
+
+    mov %edx, %ebx
+
+    lea (%rsp), %rsi
+
+    call vmx_update_host_rsp
+
+    // TODO: spec_ctrl
+
+.Lspec_ctrl_done:
+    mov %rsp, %rax
+
+    bt $VMX_RUN_VMRESUME_SHIFT, %ebx
+
+    mov VCPU_RCX(%rax), %rcx
+    mov VCPU_RDX(%rax), %rdx
+    mov VCPU_RBX(%rax), %rbx
+    mov VCPU_RBP(%rax), %rbp
+    mov VCPU_RSI(%rax), %rsi
+    mov VCPU_RDI(%rax), %rdi
+
+    mov VCPU_R8(%rax), %r8
+    mov VCPU_R9(%rax), %r9
+    mov VCPU_R10(%rax), %r10
+    mov VCPU_R11(%rax), %r11
+    mov VCPU_R12(%rax), %r12
+    mov VCPU_R13(%rax), %r13
+    mov VCPU_R14(%rax), %r14
+    mov VCPU_R15(%rax), %r15
+
+    mov VCPU_RAX(%rax), %rax
+
+    // TODO: clear cpu buffer
+
+    jnc .Lvmlaunch
+
+.Lvmresume:
+    vmresume
+    jmp .Lvmfail
+
+.Lvmlaunch:
+    vmlaunch
+    jmp .Lvmfail
+
+// Exit from guest mode
+ENTRY(vmx_vmexit)
+    // TODO: unwind hint restore
+
+    // Temporarily save guest RAX
+    push %rax
+
+    // Fetch the pointer to the guest register array into RAX
+    mov 8(%rsp), %rax
+
+    // Save all guest registers
+    pop VCPU_RAX(%rax)
+    mov %rcx, VCPU_RCX(%rax)
+    mov %rdx, VCPU_RDX(%rax)
+    mov %rbx, VCPU_RBX(%rax)
+    mov %rbp, VCPU_RBP(%rax)
+    mov %rsi, VCPU_RSI(%rax)
+    mov %rdi, VCPU_RDI(%rax)
+
+    mov %r8, VCPU_R8(%rax)
+    mov %r9, VCPU_R9(%rax)
+    mov %r10, VCPU_R10(%rax)
+    mov %r11, VCPU_R11(%rax)
+    mov %r12, VCPU_R12(%rax)
+    mov %r13, VCPU_R13(%rax)
+    mov %r14, VCPU_R14(%rax)
+    mov %r15, VCPU_R15(%rax)
+
+    xor %ebx, %ebx
+
+.Lclear_regs:
+    pop %rax
+
+    xor %eax, %eax
+    xor %ecx, %ecx
+    xor %edx, %edx
+    xor %ebp, %ebp
+    xor %esi, %esi
+    xor %edi, %edi
+
+    xor %r8d, %r8d
+    xor %r9d, %r9d
+    xor %r10d, %r10d
+    xor %r11d, %r11d
+    xor %r12d, %r12d
+    xor %r13d, %r13d
+    xor %r14d, %r14d
+    xor %r15d, %r15d
+
+    // todo: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmenter.S#270
+
+    pop %rsi
+    pop %rdi
+
+    call vmx_spec_ctrl_restore_host
+
+    mov %rbx, %rax
+
+    pop %rbx
+
+    pop %r12
+    pop %r13
+    pop %r14
+    pop %r15
+
+    pop %rbp
+    ret
+
+.Lvmfail:
+    mov $1, %rbx
+    jmp .Lclear_regs
+
+
+
diff --git a/kernel/src/virt/vm/kvm_dev.rs b/kernel/src/virt/vm/kvm_dev.rs
index d42c319f6..9435fd8f1 100644
--- a/kernel/src/virt/vm/kvm_dev.rs
+++ b/kernel/src/virt/vm/kvm_dev.rs
@@ -8,7 +8,7 @@ use system_error::SystemError;
 use crate::{
     arch::{
-        vm::{kvm_host::KvmCommonRegs, x86_kvm_manager},
+        vm::{kvm_host::KvmCommonRegs, uapi::UapiKvmSegmentRegs, x86_kvm_manager},
         MMArch,
     },
     driver::base::device::device_number::DeviceNumber,
@@ -26,10 +26,7 @@ use crate::{
     process::ProcessManager,
     syscall::user_access::{UserBufferReader, UserBufferWriter},
     time::PosixTimeSpec,
-    virt::vm::{
-        kvm_host::check_stack_usage,
-        user_api::{KvmUserspaceMemoryRegion, PosixKvmUserspaceMemoryRegion},
-    },
+    virt::vm::user_api::{KvmUserspaceMemoryRegion, PosixKvmUserspaceMemoryRegion},
 };
 
 use super::kvm_host::{
@@ -237,12 +234,11 @@ impl IndexNode for KvmInstance {
         arg: usize,
         _private_data: &crate::filesystem::vfs::FilePrivateData,
     ) -> Result<usize, SystemError> {
-        kdebug!("ioctl");
-        check_stack_usage();
+        kdebug!("kvm instance ioctl cmd {cmd:x}");
         match cmd {
             Self::KVM_CREATE_VCPU => {
                 let ret = self.kvm.lock().create_vcpu(arg);
-                kwarn!("!!!###$$");
+                kdebug!("[KVM] create vcpu fd {ret:?}");
                 return ret;
             }
@@ -318,6 +314,8 @@ impl IndexNode for KvmInstance {
 #[derive(Debug)]
 pub struct KvmVcpuDev {
     vcpu: Arc<LockedVirtCpu>,
+    /// Inode metadata
+    metadata: Metadata,
 }
 
 impl KvmVcpuDev {
@@ -328,7 +326,25 @@ impl KvmVcpuDev {
     const KVM_SET_SREGS: u32 = 0xAE84;
 
     pub fn new(vcpu: Arc<LockedVirtCpu>) -> Arc<Self> {
-        Arc::new(Self { vcpu })
+        Arc::new(Self {
+            vcpu,
+            metadata: Metadata {
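+                // Inode metadata for the vCPU fd; the size/time/id fields
+                // below are placeholder values.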
+                dev_id: 1,
+                inode_id: generate_inode_id(),
+                size: 0,
+                blk_size: 0,
+                blocks: 0,
+                atime: PosixTimeSpec::default(),
+                mtime: PosixTimeSpec::default(),
+                ctime: PosixTimeSpec::default(),
+                file_type: FileType::KvmDevice, // dedicated KVM device inode type
+                mode: ModeType::S_IALLUGO,
+                nlinks: 1,
+                uid: 0,
+                gid: 0,
+                raw_dev: DeviceNumber::default(), // reused here as the device number
+            },
+        })
     }
 }
 
@@ -354,8 +370,12 @@ impl IndexNode for KvmVcpuDev {
         arg: usize,
         _private_data: &crate::filesystem::vfs::FilePrivateData,
     ) -> Result<usize, SystemError> {
+        kdebug!("vcpu ioctl cmd {cmd:x}");
         match cmd {
             Self::KVM_RUN => {
+                if arg != 0 {
+                    return Err(SystemError::EINVAL);
+                }
                 let mut vcpu = self.vcpu.lock();
                 let oldpid = vcpu.pid;
                 if unlikely(oldpid != Some(ProcessManager::current_pid())) {
@@ -365,9 +385,7 @@ impl IndexNode for KvmVcpuDev {
                 return vcpu.run();
             }
             Self::KVM_GET_REGS => {
-                kdebug!("KVM_GET_REGS");
                 let kvm_regs = self.vcpu.lock().get_regs();
-                kdebug!("get regs {kvm_regs:?}");
                 let mut user_writer = UserBufferWriter::new(
                     arg as *const KvmCommonRegs as *mut KvmCommonRegs,
                     core::mem::size_of::<KvmCommonRegs>(),
@@ -377,6 +395,50 @@ impl IndexNode for KvmVcpuDev {
                 user_writer.copy_one_to_user(&kvm_regs, 0)?;
                 return Ok(0);
             }
+
+            Self::KVM_SET_REGS => {
+                let user_reader = UserBufferReader::new(
+                    arg as *const KvmCommonRegs,
+                    core::mem::size_of::<KvmCommonRegs>(),
+                    true,
+                )?;
+
+                let regs = user_reader.read_one_from_user::<KvmCommonRegs>(0)?;
+
+                self.vcpu.lock().set_regs(regs)?;
+
+                return Ok(0);
+            }
+
+            Self::KVM_GET_SREGS => {
+                let sregs = self.vcpu.lock().get_segment_regs();
+
+                let mut writer = UserBufferWriter::new(
+                    arg as *const UapiKvmSegmentRegs as *mut UapiKvmSegmentRegs,
+                    core::mem::size_of::<UapiKvmSegmentRegs>(),
+                    true,
+                )?;
+
+                writer.copy_one_to_user(&sregs, 0)?;
+
+                return Ok(0);
+            }
+
+            Self::KVM_SET_SREGS => {
+                let user_reader = UserBufferReader::new(
+                    arg as *const UapiKvmSegmentRegs,
+                    core::mem::size_of::<UapiKvmSegmentRegs>(),
+                    true,
+                )?;
+
+                let mut sreg = UapiKvmSegmentRegs::default();
+                user_reader.copy_one_from_user(&mut sreg, 0)?;
+
+                self.vcpu.lock().set_segment_regs(&mut sreg)?;
+
+                return Ok(0);
+            }
+
             _ => {
                 // arch ioctl
                 kwarn!("[KVM-VCPU] unknown ioctl cmd {cmd:x}");
@@ -386,6 +448,10 @@ impl IndexNode for KvmVcpuDev {
 
         Ok(0)
     }
 
+    fn metadata(&self) -> Result<Metadata, SystemError> {
+        Ok(self.metadata.clone())
+    }
+
     fn read_at(
         &self,
         offset: usize,
diff --git a/kernel/src/virt/vm/kvm_host/mem.rs b/kernel/src/virt/vm/kvm_host/mem.rs
index bf5180073..fddca79be 100644
--- a/kernel/src/virt/vm/kvm_host/mem.rs
+++ b/kernel/src/virt/vm/kvm_host/mem.rs
@@ -18,7 +18,7 @@ use crate::{
     virt::vm::{kvm_host::KVM_ADDRESS_SPACE_NUM, user_api::KvmUserspaceMemoryRegion},
 };
 
-use super::{check_stack_usage, LockedVm, Vm};
+use super::{LockedVm, Vm};
 
 pub const KVM_USER_MEM_SLOTS: u16 = u16::MAX;
 pub const KVM_INTERNAL_MEM_SLOTS: u16 = 3;
diff --git a/kernel/src/virt/vm/kvm_host/mod.rs b/kernel/src/virt/vm/kvm_host/mod.rs
index 2c636c42a..bdba71ebb 100644
--- a/kernel/src/virt/vm/kvm_host/mod.rs
+++ b/kernel/src/virt/vm/kvm_host/mod.rs
@@ -15,7 +15,7 @@ use x86::bits64::registers::rsp;
 
 use crate::{
     arch::{
-        vm::{kvm_host::vcpu::VirCpuRequest, x86_kvm_manager},
+        vm::{kvm_host::vcpu::VirCpuRequest, vmx::KvmVmx, x86_kvm_manager},
         CurrentKvmManager, KvmArch, VirtCpuArch,
     },
     filesystem::vfs::file::{File, FileMode},
@@ -87,6 +87,9 @@ impl LockedVm {
             dirty_ring_size: 0,
             dirty_ring_with_bitmap: false,
             vcpus: HashMap::new(),
+            #[cfg(target_arch = "x86_64")]
+            kvm_vmx: KvmVmx::default(),
+            nr_memslots_dirty_logging: 0,
         };
 
         let ret = Arc::new(Self {
@@ -133,21 +136,16 @@ pub struct Vm {
     pub arch: KvmArch,
     pub 
dirty_ring_size: u32, + pub nr_memslots_dirty_logging: u32, dirty_ring_with_bitmap: bool, -} -#[inline] -pub fn check_stack_usage() { - let rsp = rsp() as usize; - let free = rsp & (KernelStack::ALIGN - 1); - let usage = KernelStack::SIZE - free; - kdebug!("current rsp {rsp:x} stack use {usage} free {free}"); + #[cfg(target_arch = "x86_64")] + pub kvm_vmx: KvmVmx, } impl Vm { #[inline(never)] pub fn create_vcpu(&mut self, id: usize) -> Result { - check_stack_usage(); if id >= self.max_vcpus { return Err(SystemError::EINVAL); } @@ -158,7 +156,7 @@ impl Vm { self.created_vcpus += 1; - let vcpu = self._create_vcpu(id); + let vcpu = self._create_vcpu(id)?; if self.dirty_ring_size != 0 { todo!() } @@ -183,14 +181,12 @@ impl Vm { /// ### 创建一个vcpu,并且初始化部分数据 #[inline(never)] - pub fn _create_vcpu(&self, id: usize) -> Arc { - check_stack_usage(); - + pub fn _create_vcpu(&mut self, id: usize) -> Result, SystemError> { let mut vcpu = self.new_vcpu(id); - vcpu.init_arch(self); + vcpu.init_arch(self, id)?; - Arc::new(LockedVirtCpu::new(vcpu)) + Ok(Arc::new(LockedVirtCpu::new(vcpu))) } #[inline(never)] @@ -213,10 +209,20 @@ impl Vm { vcpu_idx: 0, }; } + + #[cfg(target_arch = "x86_64")] + pub fn kvm_vmx_mut(&mut self) -> &mut KvmVmx { + &mut self.kvm_vmx + } + + #[cfg(target_arch = "x86_64")] + pub fn kvm_vmx(&self) -> &KvmVmx { + &self.kvm_vmx + } } /// ## 多处理器状态(有些状态在某些架构并不合法) -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum MutilProcessorState { Runnable, Uninitialized, diff --git a/kernel/src/virt/vm/kvm_host/vcpu.rs b/kernel/src/virt/vm/kvm_host/vcpu.rs index d51f4dbbd..16a1282c1 100644 --- a/kernel/src/virt/vm/kvm_host/vcpu.rs +++ b/kernel/src/virt/vm/kvm_host/vcpu.rs @@ -18,7 +18,7 @@ use crate::{ }, process::{Pid, ProcessManager}, smp::cpu::ProcessorId, - virt::vm::{kvm_host::check_stack_usage, user_api::UapiKvmRun}, + virt::vm::user_api::UapiKvmRun, }; use super::{ @@ -90,5 +90,6 @@ bitflags! 
{ pub struct GuestDebug: usize { const ENABLE = 0x00000001; const SINGLESTEP = 0x00000002; + const USE_SW_BP = 0x00010000; } } diff --git a/kernel/src/virt/vm/user_api.rs b/kernel/src/virt/vm/user_api.rs index 57a5424f2..90a17e3e9 100644 --- a/kernel/src/virt/vm/user_api.rs +++ b/kernel/src/virt/vm/user_api.rs @@ -9,6 +9,42 @@ use crate::mm::{PhysAddr, VirtAddr}; use super::kvm_host::mem::UserMemRegionFlag; +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct UapiKvmSegment { + pub base: u64, + pub limit: u32, + pub selector: u16, + pub type_: u8, + pub present: u8, + pub dpl: u8, + pub db: u8, + pub s: u8, + pub l: u8, + pub g: u8, + pub avl: u8, + pub unusable: u8, + pub padding: u8, +} + +impl UapiKvmSegment { + pub fn vmx_segment_access_rights(&self) -> u32 { + let mut ar = self.type_ as u32 & 15; + ar |= (self.s as u32 & 1) << 4; + ar |= (self.dpl as u32 & 3) << 5; + ar |= (self.present as u32 & 1) << 7; + ar |= (self.avl as u32 & 1) << 12; + ar |= (self.l as u32 & 1) << 13; + ar |= (self.db as u32 & 1) << 14; + ar |= (self.g as u32 & 1) << 15; + + let b = self.unusable != 0 || self.present == 0; + ar |= (b as u32) << 16; + + return ar; + } +} + /// 通过这个结构可以将虚拟机的物理地址对应到用户进程的虚拟地址 /// 用来表示虚拟机的一段物理内存 #[repr(C)] From 11cea4783c67a2ca4ba16a10f6d813913c71580b Mon Sep 17 00:00:00 2001 From: GnoCiYeH Date: Sun, 16 Jun 2024 00:48:23 +0800 Subject: [PATCH 04/10] =?UTF-8?q?=E8=83=BD=E5=A4=9F=E6=88=90=E5=8A=9Fvmlau?= =?UTF-8?q?nch=EF=BC=8C=E4=BD=86=E6=98=AF=E5=9C=A8vmexit=E6=97=B6=E5=80=99?= =?UTF-8?q?=E8=BF=98=E6=9C=89=E4=BA=9B=E9=97=AE=E9=A2=98=E6=9C=AA=E6=8E=92?= =?UTF-8?q?=E6=9F=A5=E5=87=BA=E6=9D=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/src/arch/x86_64/vm/asm.rs | 126 +- kernel/src/arch/x86_64/vm/cpuid.rs | 1 + kernel/src/arch/x86_64/vm/exit.rs | 1 + kernel/src/arch/x86_64/vm/kvm_host/lapic.rs | 6 +- kernel/src/arch/x86_64/vm/kvm_host/mod.rs | 30 +- kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs | 512 +++++++- kernel/src/arch/x86_64/vm/mem.rs | 17 +- kernel/src/arch/x86_64/vm/mmu.rs | 114 +- kernel/src/arch/x86_64/vm/mod.rs | 192 ++- kernel/src/arch/x86_64/vm/uapi.rs | 44 + kernel/src/arch/x86_64/vm/vmx/asm.rs | 19 + kernel/src/arch/x86_64/vm/vmx/capabilities.rs | 50 +- kernel/src/arch/x86_64/vm/vmx/exit.rs | 204 ++++ kernel/src/arch/x86_64/vm/vmx/mod.rs | 1033 +++++++++++++---- kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs | 50 +- kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs | 72 +- kernel/src/arch/x86_64/vm/vmx/vmenter.S | 4 +- kernel/src/virt/vm/kvm_dev.rs | 39 +- kernel/src/virt/vm/kvm_host/mem.rs | 45 +- kernel/src/virt/vm/kvm_host/mod.rs | 13 +- kernel/src/virt/vm/kvm_host/vcpu.rs | 29 +- kernel/src/virt/vm/user_api.rs | 4 +- 22 files changed, 2069 insertions(+), 536 deletions(-) create mode 100644 kernel/src/arch/x86_64/vm/exit.rs create mode 100644 kernel/src/arch/x86_64/vm/vmx/asm.rs create mode 100644 kernel/src/arch/x86_64/vm/vmx/exit.rs diff --git a/kernel/src/arch/x86_64/vm/asm.rs b/kernel/src/arch/x86_64/vm/asm.rs index 47271f740..c3cb826ee 100644 --- a/kernel/src/arch/x86_64/vm/asm.rs +++ b/kernel/src/arch/x86_64/vm/asm.rs @@ -1,7 +1,6 @@ use core::arch::asm; use alloc::slice; -use bitfield_struct::bitfield; use raw_cpuid::CpuId; use system_error::SystemError; use x86::{ @@ -11,15 +10,17 @@ use x86::{ rdmsr, wrmsr, IA32_FEATURE_CONTROL, IA32_VMX_CR0_FIXED0, IA32_VMX_CR0_FIXED1, IA32_VMX_CR4_FIXED0, IA32_VMX_CR4_FIXED1, }, + vmx::vmcs::ro, }; -use x86_64::registers::xcontrol::XCr0; use crate::{ 
    arch::mm::barrier,
-    kdebug, kwarn,
+    kdebug, kerror,
     mm::{phys_2_virt, PhysAddr},
 };
 
+use super::vmx::vmx_info;
+
 pub struct KvmX86Asm;
 
 impl KvmX86Asm {
@@ -33,7 +34,7 @@ impl KvmX86Asm {
         return 0;
     }
 
-    pub fn write_pkru(val: u32) {
+    pub fn write_pkru(_val: u32) {
         let cpuid = CpuId::new();
         if let Some(feat) = cpuid.get_extended_feature_info() {
             if feat.has_ospke() {
@@ -45,13 +46,13 @@ impl KvmX86Asm {
     fn rdpkru() -> u32 {
         let ecx: u32 = 0;
         let pkru: u32;
-        let edx: u32;
+        let _edx: u32;
 
         unsafe {
             asm!(
                 "rdpkru",
                 out("eax") pkru,
-                out("edx") edx,
+                out("edx") _edx,
                 in("ecx") ecx,
             );
         }
@@ -137,11 +138,29 @@ impl VmxAsm {
         }
     }
 
+    #[allow(dead_code)]
     const VMX_VPID_EXTENT_INDIVIDUAL_ADDR: u64 = 0;
     const VMX_VPID_EXTENT_SINGLE_CONTEXT: u64 = 1;
+    #[allow(dead_code)]
     const VMX_VPID_EXTENT_ALL_CONTEXT: u64 = 2;
     const VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: u64 = 3;
 
+    const VMX_EPT_EXTENT_CONTEXT: u64 = 1;
+    const VMX_EPT_EXTENT_GLOBAL: u64 = 2;
+    const VMX_EPT_EXTENT_SHIFT: u64 = 24;
+
+    pub fn ept_sync_global() {
+        Self::invept(Self::VMX_EPT_EXTENT_GLOBAL, 0, 0);
+    }
+
+    pub fn ept_sync_context(eptp: u64) {
+        if vmx_info().has_vmx_invept_context() {
+            Self::invept(Self::VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+        } else {
+            Self::ept_sync_global();
+        }
+    }
+
     pub fn sync_vcpu_single(vpid: u16) {
         if vpid == 0 {
             return;
@@ -154,33 +173,45 @@ impl VmxAsm {
         Self::invvpid(Self::VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
     }
 
+    #[inline(always)]
+    fn invept(ext: u64, eptp: u64, gpa: u64) {
+        #[repr(C)]
+        struct InveptDescriptor {
+            eptp: u64,
+            gpa: u64,
+        }
+
+        let descriptor = InveptDescriptor { eptp, gpa };
+
+        unsafe {
+            asm!(
+                "invept {0}, [{1}]",
+                in(reg) ext,
+                in(reg) &descriptor,
+                options(nostack)
+            );
+        }
+    }
+
     #[inline(always)]
     fn invvpid(ext: u64, vpid: u16, gva: u64) {
-        // 定义包含指令操作数的结构体
-        #[bitfield(u128)]
-        struct Operand {
-            #[bits(16)]
-            vpid: u64,
-            #[bits(48)]
-            rsvd: u64,
-            gva: u64,
-        }
+        // The 128-bit INVVPID descriptor: VPID in bits 15:0, bits 63:16
+        // reserved (must be zero), guest linear address in bits 127:64.
+        // The reserved bits are split into u16 fields so that #[repr(C)]
+        // inserts no padding: `vpid: u16` directly followed by a u64 would
+        // be padded out to 24 bytes, putting `gva` at the wrong offset.
+        #[repr(C)]
+        struct InvvpidDescriptor {
+            vpid: u16,
+            rsvd: [u16; 3],
+            gva: u64,
+        }
 
-        // 构造操作数
-        let mut operand = Operand::new();
-        operand.set_vpid(vpid as u64);
-        operand.set_gva(gva);
-
-        // 定义嵌入汇编块
-
-        kwarn!("TODO: asm invvpid");
-        // unsafe {
-        //     asm!(
-        //         "invvpid {0} {1}",
-        //         inlateout(reg) ext => _,
-        //         inlateout(reg) &operand => _,
-        //     );
-        // }
+        let descriptor = InvvpidDescriptor {
+            vpid,
+            rsvd: [0; 3],
+            gva,
+        };
+
+        unsafe {
+            asm!(
+                "invvpid {0}, [{1}]",
+                in(reg) ext,
+                in(reg) &descriptor,
+                options(nostack)
+            );
+        }
     }
 
     /// Set the mandatory bits in CR4 and clear bits that are mandatory zero
@@ -234,7 +265,46 @@ impl VmxAsm {
     }
 }
 
+#[no_mangle]
+unsafe extern "C" fn vmx_vmlaunch() {
+    if let Err(e) = x86::bits64::vmx::vmlaunch() {
+        kerror!(
+            "vmx_launch fail: {:?}, err code {}",
+            e,
+            VmxAsm::vmx_vmread(ro::VM_INSTRUCTION_ERROR)
+        );
+    }
+}
+
 bitflags! {
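+    // VM-entry/VM-exit interruption-information format (Intel SDM Vol. 3C,
+    // event injection and VM-exit information fields): bits 7:0 vector,
+    // bits 10:8 type, bit 11 deliver-error-code, bit 31 valid.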
+    pub struct IntrInfo: u32 {
+        const INTR_INFO_VECTOR_MASK = 0xff;
+        const INTR_INFO_INTR_TYPE_MASK = 0x700;
+        const INTR_INFO_DELIVER_CODE_MASK = 0x800;
+        const INTR_INFO_UNBLOCK_NMI = 0x1000;
+        const INTR_INFO_VALID_MASK = 0x80000000;
+        const INTR_INFO_RESVD_BITS_MASK = 0x7ffff000;
+    }
+
+    pub struct IntrType: u32 {
+        /// external interrupt
+        const INTR_TYPE_EXT_INTR = (0 << 8);
+        /// reserved
+        const INTR_TYPE_RESERVED = (1 << 8);
+        /// NMI
+        const INTR_TYPE_NMI_INTR = (2 << 8);
+        /// processor exception
+        const INTR_TYPE_HARD_EXCEPTION = (3 << 8);
+        /// software interrupt
+        const INTR_TYPE_SOFT_INTR = (4 << 8);
+        /// ICE breakpoint - undocumented
+        const INTR_TYPE_PRIV_SW_EXCEPTION = (5 << 8);
+        /// software exception
+        const INTR_TYPE_SOFT_EXCEPTION = (6 << 8);
+        /// other event
+        const INTR_TYPE_OTHER_EVENT = (7 << 8);
+    }
+
     pub struct MiscEnable: u64 {
         const MSR_IA32_MISC_ENABLE_FAST_STRING = 1 << 0;
         const MSR_IA32_MISC_ENABLE_TCC = 1 << 1;
@@ -351,6 +421,7 @@ pub struct VmxMsrEntry {
     pub data: u64,
 }
 
+#[allow(dead_code)]
 pub mod hyperv {
     /* Hyper-V specific model specific registers (MSRs) */
 
@@ -477,6 +548,7 @@ pub mod hyperv {
     pub const HV_X64_MSR_SYNDBG_OPTIONS: u32 = 0x400000FF;
 }
 
+#[allow(dead_code)]
 pub mod kvm_msr {
     pub const MSR_KVM_WALL_CLOCK: u32 = 0x11;
     pub const MSR_KVM_SYSTEM_TIME: u32 = 0x12;
diff --git a/kernel/src/arch/x86_64/vm/cpuid.rs b/kernel/src/arch/x86_64/vm/cpuid.rs
index 28e91d6a9..4fd447a65 100644
--- a/kernel/src/arch/x86_64/vm/cpuid.rs
+++ b/kernel/src/arch/x86_64/vm/cpuid.rs
@@ -1,6 +1,7 @@
 use alloc::vec::Vec;
 
 #[derive(Debug, Default, Clone, Copy)]
+#[allow(dead_code)]
 pub struct KvmCpuidEntry2 {
     pub function: u32,
     pub index: u32,
diff --git a/kernel/src/arch/x86_64/vm/exit.rs b/kernel/src/arch/x86_64/vm/exit.rs
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/kernel/src/arch/x86_64/vm/exit.rs
@@ -0,0 +1 @@
+
diff --git a/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs b/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs
index 8a995e5c4..c71d1cd9f 100644
--- a/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs
+++ b/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs
@@ -1,16 +1,16 @@
 use alloc::boxed::Box;
 
 use crate::{
-    arch::{kvm_arch_ops, MMArch},
-    kdebug,
-    mm::MemoryManagementArch,
+    arch::kvm_arch_ops,
     virt::vm::kvm_host::{vcpu::VirtCpu, Vm},
 };
 
 const APIC_DEFAULT_PHYS_BASE: u64 = 0xfee00000;
+#[allow(dead_code)]
 const MSR_IA32_APICBASE: u64 = 0x0000001b;
 const MSR_IA32_APICBASE_BSP: u64 = 1 << 8;
 const MSR_IA32_APICBASE_ENABLE: u64 = 1 << 11;
+#[allow(dead_code)]
 const MSR_IA32_APICBASE_BASE: u64 = 0xfffff << 12;
 
 #[derive(Debug)]
diff --git a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs
index 0f0ccd7e2..e61cb5047 100644
--- a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs
+++ b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs
@@ -25,19 +25,18 @@ use crate::arch::VirtCpuArch;
 
 use super::{
     asm::{MsrData, VcpuSegment, VmxMsrEntry},
-    uapi::UapiKvmDtable,
-    vmx::{vmx_info, VmxVCpuPriv},
+    vmx::{exit::ExitFastpathCompletion, vmx_info},
     x86_kvm_manager, x86_kvm_ops,
 };
 
 pub mod lapic;
 pub mod vcpu;
-
+#[allow(dead_code)]
 pub const TSS_IOPB_BASE_OFFSET: usize = 0x66;
 pub const TSS_BASE_SIZE: usize = 0x68;
-pub const TSS_IOPB_SIZE: usize = (65536 / 8);
-pub const TSS_REDIRECTION_SIZE: usize = (256 / 8);
-pub const RMODE_TSS_SIZE: usize = (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1);
+pub const TSS_IOPB_SIZE: usize = 65536 / 8;
+pub const TSS_REDIRECTION_SIZE: usize = 256 / 8;
+pub const RMODE_TSS_SIZE: usize = 
TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1; #[derive(Debug, Default)] pub struct X86KvmArch { @@ -121,6 +120,7 @@ impl X86KvmArch { } #[derive(Debug, Clone, Copy, PartialEq)] +#[allow(dead_code)] pub enum KvmIrqChipMode { None, Kernel, @@ -154,6 +154,8 @@ pub trait KvmFunc: Send + Sync + Debug { fn vcpu_load(&self, vcpu: &mut VirtCpu, cpu: ProcessorId); + fn load_mmu_pgd(&self, vcpu: &mut VirtCpu, vm: &Vm, root_hpa: u64, root_level: u32); + fn cache_reg(&self, vcpu: &mut VirtCpuArch, reg: KvmReg); fn apicv_pre_state_restore(&self, vcpu: &mut VirtCpu); @@ -202,13 +204,26 @@ pub trait KvmFunc: Send + Sync + Debug { fn get_msr_feature(&self, msr: &mut VmxMsrEntry) -> bool; - fn vcpu_run(&self, vcpu: &mut VirtCpu); + fn prepare_switch_to_guest(&self, vcpu: &mut VirtCpu); + + fn flush_tlb_all(&self, vcpu: &mut VirtCpu); + + fn vcpu_run(&self, vcpu: &mut VirtCpu) -> ExitFastpathCompletion; + + fn handle_exit_irqoff(&self, vcpu: &mut VirtCpu); + + fn handle_exit( + &self, + vcpu: &mut VirtCpu, + fastpath: ExitFastpathCompletion, + ) -> Result<(), SystemError>; } /// ## 中断抑制的原因位 #[derive(Debug)] pub struct KvmApicvInhibit; +#[allow(dead_code)] impl KvmApicvInhibit { // Intel与AMD共用 @@ -266,6 +281,7 @@ pub struct KernelMsrRange { } #[repr(C)] +#[allow(dead_code)] pub struct PosixMsrFilterRange { pub flags: u32, pub nmsrs: u32, diff --git a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs index bb96f6ec8..e640fc3b4 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs @@ -1,47 +1,44 @@ +use core::intrinsics::likely; use core::{arch::x86_64::_xsetbv, intrinsics::unlikely}; use alloc::{boxed::Box, sync::Arc, vec::Vec}; -use bitmap::{traits::BitMapOps, AllocBitmap, BitMapCore, StaticBitmap}; +use bitmap::{traits::BitMapOps, AllocBitmap, BitMapCore}; use raw_cpuid::CpuId; use system_error::SystemError; use x86::{ bits64::rflags::RFlags, controlregs::{Cr0, Cr4, Xcr0}, dtables::DescriptorTablePointer, - msr::{ - self, wrmsr, IA32_APIC_BASE, IA32_CSTAR, IA32_FS_BASE, IA32_GS_BASE, IA32_KERNEL_GSBASE, - IA32_LSTAR, IA32_SYSENTER_EIP, IA32_SYSENTER_ESP, IA32_TSC_AUX, - }, + msr::{self, wrmsr}, vmx::vmcs::control::SecondaryControls, }; -use x86_64::registers::{control::EferFlags, xcontrol::XCr0Flags}; +use x86_64::registers::control::EferFlags; +use crate::arch::vm::vmx::exit::ExitFastpathCompletion; +use crate::kwarn; +use crate::virt::vm::kvm_host::mem::KvmMmuMemoryCache; +use crate::virt::vm::kvm_host::vcpu::VcpuMode; use crate::{ arch::{ kvm_arch_ops, + mm::barrier, vm::{ - asm::{ - hyperv, kvm_msr, KvmX86Asm, MiscEnable, MsrData, SegmentCacheField, VcpuSegment, - }, + asm::{hyperv, kvm_msr, KvmX86Asm, MiscEnable, MsrData, VcpuSegment}, cpuid::KvmCpuidEntry2, kvm_host::KvmReg, - mmu::{KvmMmu, LockedKvmMmu}, + mmu::LockedKvmMmu, uapi::{UapiKvmSegmentRegs, KVM_SYNC_X86_VALID_FIELDS}, - vmx::{ - vmcs::{ControlsType, LoadedVmcs}, - vmx_info, VmxVCpuPriv, - }, + vmx::{vmcs::ControlsType, vmx_info}, x86_kvm_manager, x86_kvm_manager_mut, x86_kvm_ops, }, }, - kdebug, kerror, - mm::{PhysAddr, VirtAddr}, + mm::VirtAddr, smp::{core::smp_get_processor_id, cpu::ProcessorId}, virt::vm::{ kvm_host::{ mem::GfnToHvaCache, vcpu::{GuestDebug, VirtCpu}, - LockedVm, MutilProcessorState, Vm, + MutilProcessorState, Vm, }, user_api::{UapiKvmRun, UapiKvmSegment}, }, @@ -52,11 +49,11 @@ use super::{lapic::KvmLapic, HFlags, KvmCommonRegs, KvmIrqChipMode}; #[derive(Debug)] pub struct X86VcpuArch { /// 最近一次尝试进入虚拟机的主机cpu - 
last_vmentry_cpu: ProcessorId, + pub last_vmentry_cpu: ProcessorId, /// 可用寄存器位图 - regs_avail: AllocBitmap, + pub regs_avail: AllocBitmap, /// 脏寄存器位图 - regs_dirty: AllocBitmap, + pub regs_dirty: AllocBitmap, /// 多处理器状态 mp_state: MutilProcessorState, pub apic_base: u64, @@ -99,6 +96,11 @@ pub struct X86VcpuArch { pub walk_mmu: Option>, pub nested_mmu: Option>, + pub mmu_pte_list_desc_cache: KvmMmuMemoryCache, + pub mmu_shadow_page_cache: KvmMmuMemoryCache, + pub mmu_shadowed_info_cache: KvmMmuMemoryCache, + pub mmu_page_header_cache: KvmMmuMemoryCache, + pub max_phyaddr: usize, pub pat: u64, @@ -134,6 +136,10 @@ pub struct X86VcpuArch { pub xfd_no_write_intercept: bool, + pub l1tf_flush_l1d: bool, + + pub at_instruction_boundary: bool, + pub db: [usize; Self::KVM_NR_DB_REGS], } @@ -168,7 +174,7 @@ impl X86VcpuArch { } pub fn is_bsp(&self) -> bool { - return self.apic_base & IA32_APIC_BASE as u64 != 0; + return self.apic_base & msr::IA32_APIC_BASE as u64 != 0; } #[inline] @@ -259,17 +265,25 @@ impl X86VcpuArch { self.exception_vmexit.pending = false; } + #[allow(dead_code)] pub fn update_cpuid_runtime(&mut self, entries: &Vec) { let cpuid = CpuId::new(); let feat = cpuid.get_feature_info().unwrap(); let base = KvmCpuidEntry2::find(entries, 1, None); - if let Some(base) = base { + if let Some(_base) = base { if feat.has_xsave() {} } todo!() } + #[inline] + pub fn test_and_mark_available(&mut self, reg: KvmReg) -> bool { + let old = self.regs_avail.get(reg as usize).unwrap_or_default(); + self.regs_avail.set(reg as usize, true); + return old; + } + #[inline] pub fn mark_register_dirty(&mut self, reg: KvmReg) { self.regs_avail.set(reg as usize, true); @@ -580,10 +594,16 @@ impl VirtCpu { Ok(()) } + #[inline] pub fn kvm_run(&self) -> &Box { self.run.as_ref().unwrap() } + #[inline] + pub fn kvm_run_mut(&mut self) -> &mut Box { + self.run.as_mut().unwrap() + } + pub fn run(&mut self) -> Result { self.load(); @@ -591,15 +611,288 @@ impl VirtCpu { todo!() } - let kvm_run = self.kvm_run(); - - if kvm_run.kvm_valid_regs & !KVM_SYNC_X86_VALID_FIELDS != 0 - || kvm_run.kvm_dirty_regs & !KVM_SYNC_X86_VALID_FIELDS != 0 + if self.kvm_run().kvm_valid_regs & !KVM_SYNC_X86_VALID_FIELDS != 0 + || self.kvm_run().kvm_dirty_regs & !KVM_SYNC_X86_VALID_FIELDS != 0 { return Err(SystemError::EINVAL); } - todo!() + if self.kvm_run().kvm_dirty_regs != 0 { + todo!() + } + + if !self.arch.lapic_in_kernel() { + self.kvm_set_cr8(self.kvm_run().cr8); + } + + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#11174 - 11196 + + if self.kvm_run().immediate_exit != 0 { + return Err(SystemError::EINTR); + } + + // vmx_vcpu_pre_run + + self.vcpu_run(&self.kvm().lock())?; + + Ok(0) + } + + fn vcpu_run(&mut self, vm: &Vm) -> Result<(), SystemError> { + self.arch.l1tf_flush_l1d = true; + + loop { + self.arch.at_instruction_boundary = false; + if self.can_running() { + self.enter_guest(vm)?; + } else { + todo!() + }; + } + } + + fn enter_guest(&mut self, vm: &Vm) -> Result<(), SystemError> { + let req_immediate_exit = false; + + kwarn!("request {:?}", self.request); + if !self.request.is_empty() { + if self.check_request(VirtCpuRequest::KVM_REQ_VM_DEAD) { + return Err(SystemError::EIO); + } + + // TODO: kvm_dirty_ring_check_request + + if self.check_request(VirtCpuRequest::KVM_REQ_MMU_FREE_OBSOLETE_ROOTS) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_MIGRATE_TIMER) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_MASTERCLOCK_UPDATE) { + todo!() + } + + if 
self.check_request(VirtCpuRequest::KVM_REQ_GLOBAL_CLOCK_UPDATE) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_CLOCK_UPDATE) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_MMU_SYNC) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_LOAD_MMU_PGD) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH) { + self.flush_tlb_all(); + } + + self.service_local_tlb_flush_requests(); + + // TODO: KVM_REQ_HV_TLB_FLUSH) && kvm_hv_vcpu_flush_tlb(vcpu) + + if self.check_request(VirtCpuRequest::KVM_REQ_REPORT_TPR_ACCESS) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_TRIPLE_FAULT) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_STEAL_UPDATE) { + // todo!() + kwarn!("VirtCpuRequest::KVM_REQ_STEAL_UPDATE TODO!"); + } + + if self.check_request(VirtCpuRequest::KVM_REQ_SMI) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_NMI) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_PMU) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_PMI) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_IOAPIC_EOI_EXIT) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_SCAN_IOAPIC) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_LOAD_EOI_EXITMAP) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_APIC_PAGE_RELOAD) { + // todo!() + kwarn!("VirtCpuRequest::KVM_REQ_APIC_PAGE_RELOAD TODO!"); + } + + if self.check_request(VirtCpuRequest::KVM_REQ_HV_CRASH) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_HV_RESET) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_HV_EXIT) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_HV_STIMER) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_APICV_UPDATE) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_APF_READY) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_MSR_FILTER_CHANGED) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_UPDATE_CPU_DIRTY_LOGGING) { + todo!() + } + } + + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#10661 + if self.check_request(VirtCpuRequest::KVM_REQ_EVENT) { + // TODO + } + + self.kvm_mmu_reload(vm)?; + + x86_kvm_ops().prepare_switch_to_guest(self); + // kwarn!( + // "mode {:?} req {:?} mode_cond {} !is_empty {} cond {}", + // self.mode, + // self.request, + // self.mode == VcpuMode::ExitingGuestMode, + // !self.request.is_empty(), + // (self.mode == VcpuMode::ExitingGuestMode) || (!self.request.is_empty()) + // ); + kwarn!( + "req bit {} empty bit {}", + self.request.bits, + VirtCpuRequest::empty().bits + ); + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#10730 + if self.mode == VcpuMode::ExitingGuestMode || !self.request.is_empty() { + self.mode = VcpuMode::OutsideGuestMode; + return Err(SystemError::EINVAL); + } + + if req_immediate_exit { + self.request(VirtCpuRequest::KVM_REQ_EVENT); + todo!(); + } + + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#10749 - 10766 + + let exit_fastpath; + loop { + exit_fastpath = x86_kvm_ops().vcpu_run(self); + if likely(exit_fastpath != ExitFastpathCompletion::ExitHandled) { + break; + } + + todo!(); + } + + // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#10799 - 10814 + + self.arch.last_vmentry_cpu = self.cpu; + + // TODO: last_guest_tsc + + self.mode = VcpuMode::OutsideGuestMode; + 
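+        // Publish the mode change before any further exit handling: request
+        // senders check `mode` when deciding whether to kick the vCPU, so
+        // the store above must become globally visible first (cf.
+        // vcpu_enter_guest in Linux KVM).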
+ barrier::mfence(); + + // TODO: xfd + + x86_kvm_ops().handle_exit_irqoff(self); + + // todo: xfd + + // TODO: 一些中断或者tsc操作 + + return x86_kvm_ops().handle_exit(self, exit_fastpath); + } + + fn flush_tlb_all(&mut self) { + x86_kvm_ops().flush_tlb_all(self); + self.clear_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH_CURRENT); + } + + fn service_local_tlb_flush_requests(&mut self) { + if self.check_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH_CURRENT) { + todo!() + } + + if self.check_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH_GUEST) { + todo!() + } + } + + pub fn request(&mut self, req: VirtCpuRequest) { + // self.request.set( + // (req.bits() & VirtCpuRequest::KVM_REQUEST_MASK.bits()) as usize, + // true, + // ); + self.request.insert(req); + } + + fn check_request(&mut self, req: VirtCpuRequest) -> bool { + if self.test_request(req) { + self.clear_request(req); + + barrier::mfence(); + return true; + } + + return false; + } + + fn test_request(&self, req: VirtCpuRequest) -> bool { + // self.request + // .get((req.bits & VirtCpuRequest::KVM_REQUEST_MASK.bits) as usize) + // .unwrap_or_default() + self.request.contains(req) + } + + fn clear_request(&mut self, req: VirtCpuRequest) { + // self.request.set( + // (req.bits & VirtCpuRequest::KVM_REQUEST_MASK.bits) as usize, + // false, + // ); + self.request.remove(req); + } + + pub fn can_running(&self) -> bool { + return self.arch.mp_state == MutilProcessorState::Runnable && !self.arch.apf.halted; } #[inline] @@ -622,11 +915,7 @@ impl VirtCpu { self.cpu = cpu; } - self.request(VirCpuRequest::KVM_REQ_STEAL_UPDATE) - } - - pub fn request(&mut self, req: VirCpuRequest) { - self.request.insert(req); + self.request(VirtCpuRequest::KVM_REQ_STEAL_UPDATE) } pub fn set_msr( @@ -636,19 +925,23 @@ impl VirtCpu { host_initiated: bool, ) -> Result<(), SystemError> { match index { - IA32_FS_BASE | IA32_GS_BASE | IA32_KERNEL_GSBASE | IA32_CSTAR | IA32_LSTAR => { + msr::IA32_FS_BASE + | msr::IA32_GS_BASE + | msr::IA32_KERNEL_GSBASE + | msr::IA32_CSTAR + | msr::IA32_LSTAR => { if VirtAddr::new(data as usize).is_canonical() { return Ok(()); } } - IA32_SYSENTER_EIP | IA32_SYSENTER_ESP => { + msr::IA32_SYSENTER_EIP | msr::IA32_SYSENTER_ESP => { // 需要将Data转为合法地址,但是现在先这样写 assert!(VirtAddr::new(data as usize).is_canonical()); } - IA32_TSC_AUX => { + msr::IA32_TSC_AUX => { if x86_kvm_manager() - .find_user_return_msr_idx(IA32_TSC_AUX) + .find_user_return_msr_idx(msr::IA32_TSC_AUX) .is_none() { return Ok(()); @@ -703,7 +996,7 @@ impl VirtCpu { self.arch.cr2 = 0; - self.request(VirCpuRequest::KVM_REQ_EVENT); + self.request(VirtCpuRequest::KVM_REQ_EVENT); self.arch.apf.msr_en_val = 0; self.arch.apf.msr_int_val = 0; @@ -769,12 +1062,12 @@ impl VirtCpu { kvm_arch_ops().update_exception_bitmap(self); if old_cr0.contains(Cr0::CR0_ENABLE_PAGING) { - self.request(VirCpuRequest::KVM_REQ_TLB_FLUSH_GUEST); + self.request(VirtCpuRequest::MAKE_KVM_REQ_TLB_FLUSH_GUEST); self.arch.reset_mmu_context(); } if init_event { - self.request(VirCpuRequest::KVM_REQ_TLB_FLUSH_GUEST); + self.request(VirtCpuRequest::MAKE_KVM_REQ_TLB_FLUSH_GUEST); } Ok(()) @@ -782,7 +1075,7 @@ impl VirtCpu { fn set_rflags(&mut self, rflags: RFlags) { self._set_rflags(rflags); - self.request(VirCpuRequest::KVM_REQ_EVENT); + self.request(VirtCpuRequest::KVM_REQ_EVENT); } fn _set_rflags(&mut self, mut rflags: RFlags) { @@ -910,7 +1203,6 @@ impl VirtCpu { pub fn set_segment_regs(&mut self, sregs: &mut UapiKvmSegmentRegs) -> Result<(), SystemError> { self.load(); - kdebug!("set_segment_regs sregs{sregs:?}"); 
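+        // NOTE: this takes the VM lock while the vCPU lock is already held
+        // by the ioctl path; keep the vCPU -> VM lock order consistent to
+        // avoid lock inversion.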
self._set_segmenet_regs(&self.kvm().lock(), sregs)?; Ok(()) } @@ -935,7 +1227,7 @@ impl VirtCpu { if pending < max_bits { self.arch.queue_interrupt(pending as u8, false); - self.request(VirCpuRequest::KVM_REQ_EVENT); + self.request(VirtCpuRequest::KVM_REQ_EVENT); } } @@ -1121,7 +1413,7 @@ impl VirtCpu { self.arch.exception.pending = false; self.arch.exception_vmexit.pending = false; - self.request(VirCpuRequest::KVM_REQ_EVENT); + self.request(VirtCpuRequest::KVM_REQ_EVENT); } pub fn load_guest_xsave_state(&mut self) { @@ -1154,18 +1446,132 @@ impl VirtCpu { } bitflags! { - pub struct VirCpuRequest: u32 { - const KVM_REQUEST_NO_WAKEUP = 1 << 0; - const KVM_REQUEST_WAIT = 1 << 1; - const KVM_REQUEST_NO_ACTION = 1 << 2; - const KVM_REQ_EVENT = 1 << 6; - const KVM_REQ_STEAL_UPDATE = 1 << 8; - const KVM_REQ_APIC_PAGE_RELOAD = 1 << 17 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits; - const KVM_REQ_TLB_FLUSH_GUEST = 1 << 27 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits; - const KVM_REQ_TLB_FLUSH = 1 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits; + // pub struct VirtCpuRequest: u64 { + // const KVM_REQUEST_MASK = 0xFF; + + // const KVM_REQ_TLB_FLUSH = 0 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits; + // const KVM_REQ_VM_DEAD = 1 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits; + + // const KVM_REQUEST_NO_WAKEUP = 1 << 8; + // const KVM_REQUEST_WAIT = 1 << 9; + // const KVM_REQUEST_NO_ACTION = 1 << 10; + + // const KVM_REQ_MIGRATE_TIMER = kvm_arch_req(0); + // const KVM_REQ_REPORT_TPR_ACCESS = kvm_arch_req(1); + // const KVM_REQ_TRIPLE_FAULT = kvm_arch_req(2); + // const KVM_REQ_MMU_SYNC = kvm_arch_req(3); + // const KVM_REQ_CLOCK_UPDATE = kvm_arch_req(4); + // const KVM_REQ_LOAD_MMU_PGD = kvm_arch_req(5); + // const KVM_REQ_EVENT = kvm_arch_req(6); + // const KVM_REQ_APF_HALT = kvm_arch_req(7); + // const KVM_REQ_STEAL_UPDATE = kvm_arch_req(8); + // const KVM_REQ_NMI = kvm_arch_req(9); + // const KVM_REQ_PMU = kvm_arch_req(10); + // const KVM_REQ_PMI = kvm_arch_req(11); + // const KVM_REQ_SMI = kvm_arch_req(12); + + // const KVM_REQ_MASTERCLOCK_UPDATE = kvm_arch_req(13); + // const KVM_REQ_MCLOCK_INPROGRESS = kvm_arch_req_flags(14, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_SCAN_IOAPIC = kvm_arch_req_flags(15, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_GLOBAL_CLOCK_UPDATE = kvm_arch_req(16); + // const KVM_REQ_APIC_PAGE_RELOAD = kvm_arch_req_flags(17, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_HV_CRASH = kvm_arch_req(18); + // const KVM_REQ_IOAPIC_EOI_EXIT = kvm_arch_req(19); + // const KVM_REQ_HV_RESET = kvm_arch_req(20); + // const KVM_REQ_HV_EXIT = kvm_arch_req(21); + // const KVM_REQ_HV_STIMER = kvm_arch_req(22); + // const KVM_REQ_LOAD_EOI_EXITMAP = kvm_arch_req(23); + // const KVM_REQ_GET_NESTED_STATE_PAGES = kvm_arch_req(24); + // const KVM_REQ_APICV_UPDATE = kvm_arch_req_flags(25, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_TLB_FLUSH_CURRENT = kvm_arch_req(26); + + // const KVM_REQ_TLB_FLUSH_GUEST = kvm_arch_req_flags(27, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_APF_READY = kvm_arch_req(28); + // const KVM_REQ_MSR_FILTER_CHANGED = kvm_arch_req(29); + // const KVM_REQ_UPDATE_CPU_DIRTY_LOGGING = kvm_arch_req_flags(30, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const 
KVM_REQ_MMU_FREE_OBSOLETE_ROOTS = kvm_arch_req_flags(31, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // const KVM_REQ_HV_TLB_FLUSH = kvm_arch_req_flags(32, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + // } + + pub struct VirtCpuRequest: u64 { + // const KVM_REQUEST_MASK = 0xFF; + + const KVM_REQ_TLB_FLUSH = Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits; + const KVM_REQ_VM_DEAD = 1; + + const KVM_REQUEST_NO_WAKEUP = 1 << 8; + const KVM_REQUEST_WAIT = 1 << 9; + const KVM_REQUEST_NO_ACTION = 1 << 10; + + const KVM_REQ_MIGRATE_TIMER = kvm_arch_req(0); + const KVM_REQ_REPORT_TPR_ACCESS = kvm_arch_req(1); + const KVM_REQ_TRIPLE_FAULT = kvm_arch_req(2); + const KVM_REQ_MMU_SYNC = kvm_arch_req(3); + const KVM_REQ_CLOCK_UPDATE = kvm_arch_req(4); + const KVM_REQ_LOAD_MMU_PGD = kvm_arch_req(5); + const KVM_REQ_EVENT = kvm_arch_req(6); + const KVM_REQ_APF_HALT = kvm_arch_req(7); + const KVM_REQ_STEAL_UPDATE = kvm_arch_req(8); + const KVM_REQ_NMI = kvm_arch_req(9); + const KVM_REQ_PMU = kvm_arch_req(10); + const KVM_REQ_PMI = kvm_arch_req(11); + const KVM_REQ_SMI = kvm_arch_req(12); + + const KVM_REQ_MASTERCLOCK_UPDATE = kvm_arch_req(13); + + const KVM_REQ_MCLOCK_INPROGRESS = kvm_arch_req(14); + const MAKE_KVM_REQ_MCLOCK_INPROGRESS = kvm_arch_req_flags(14, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + const KVM_REQ_SCAN_IOAPIC = kvm_arch_req(15); + const MAKE_KVM_REQ_SCAN_IOAPIC = kvm_arch_req_flags(15, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + + const KVM_REQ_GLOBAL_CLOCK_UPDATE = kvm_arch_req(16); + + const KVM_REQ_APIC_PAGE_RELOAD = kvm_arch_req(17); + const MAKE_KVM_REQ_APIC_PAGE_RELOAD = kvm_arch_req_flags(17, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + const KVM_REQ_HV_CRASH = kvm_arch_req(18); + const KVM_REQ_IOAPIC_EOI_EXIT = kvm_arch_req(19); + const KVM_REQ_HV_RESET = kvm_arch_req(20); + const KVM_REQ_HV_EXIT = kvm_arch_req(21); + const KVM_REQ_HV_STIMER = kvm_arch_req(22); + const KVM_REQ_LOAD_EOI_EXITMAP = kvm_arch_req(23); + const KVM_REQ_GET_NESTED_STATE_PAGES = kvm_arch_req(24); + + const KVM_REQ_APICV_UPDATE = kvm_arch_req(25); + const MAKE_KVM_REQ_APICV_UPDATE = kvm_arch_req_flags(25, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + const KVM_REQ_TLB_FLUSH_CURRENT = kvm_arch_req(26); + + const KVM_REQ_TLB_FLUSH_GUEST = kvm_arch_req(27); + const MAKE_KVM_REQ_TLB_FLUSH_GUEST = kvm_arch_req_flags(27, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + const KVM_REQ_APF_READY = kvm_arch_req(28); + const KVM_REQ_MSR_FILTER_CHANGED = kvm_arch_req(29); + + const KVM_REQ_UPDATE_CPU_DIRTY_LOGGING = kvm_arch_req(30); + const MAKE_KVM_REQ_UPDATE_CPU_DIRTY_LOGGING = kvm_arch_req_flags(30, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + const KVM_REQ_MMU_FREE_OBSOLETE_ROOTS = kvm_arch_req(31); + const MAKE_KVM_REQ_MMU_FREE_OBSOLETE_ROOTS = kvm_arch_req_flags(31, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); + + const KVM_REQ_HV_TLB_FLUSH = kvm_arch_req(32); + const MAKE_KVM_REQ_HV_TLB_FLUSH = kvm_arch_req_flags(32, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits); } } +// const KVM_REQUEST_ARCH_BASE: u64 = 8; +const KVM_REQUEST_ARCH_BASE: u64 = 11; + +const fn kvm_arch_req(nr: u64) -> u64 { + return kvm_arch_req_flags(nr, 0); +} + +const fn kvm_arch_req_flags(nr: u64, flags: u64) -> u64 { + 1 << (nr + KVM_REQUEST_ARCH_BASE) | flags +} + #[derive(Debug, Default)] pub struct 
KvmQueuedInterrupt { pub injected: bool, @@ -1174,6 +1580,7 @@ pub struct KvmQueuedInterrupt { } #[derive(Debug, Default)] +#[allow(dead_code)] pub struct KvmQueuedException { pending: bool, injected: bool, @@ -1185,6 +1592,7 @@ pub struct KvmQueuedException { } #[derive(Debug)] +#[allow(dead_code)] pub struct KvmAsyncPageFault { /// 是否处于停止状态 halted: bool, diff --git a/kernel/src/arch/x86_64/vm/mem.rs b/kernel/src/arch/x86_64/vm/mem.rs index b3ac5f0f4..fa9488ac0 100644 --- a/kernel/src/arch/x86_64/vm/mem.rs +++ b/kernel/src/arch/x86_64/vm/mem.rs @@ -1,21 +1,26 @@ use alloc::sync::Arc; use system_error::SystemError; -use crate::virt::vm::kvm_host::{ - mem::{KvmMemoryChangeMode, LockedKvmMemSlot}, - Vm, +use crate::{ + kwarn, + virt::vm::kvm_host::{ + mem::{KvmMemoryChangeMode, LockedKvmMemSlot}, + Vm, + }, }; +#[allow(dead_code)] pub struct KvmArchMemorySlot {} impl Vm { pub fn arch_prepare_memory_region( &self, - old: Option<&Arc>, - new: Option<&Arc>, - change: KvmMemoryChangeMode, + _old: Option<&Arc>, + _new: Option<&Arc>, + _change: KvmMemoryChangeMode, ) -> Result<(), SystemError> { // todo + kwarn!("arch_prepare_memory_region TODO"); Ok(()) } } diff --git a/kernel/src/arch/x86_64/vm/mmu.rs b/kernel/src/arch/x86_64/vm/mmu.rs index a0edddf4b..504e9a107 100644 --- a/kernel/src/arch/x86_64/vm/mmu.rs +++ b/kernel/src/arch/x86_64/vm/mmu.rs @@ -1,16 +1,20 @@ +use crate::{arch::mm::X86_64MMArch, kdebug}; use alloc::{sync::Arc, vec::Vec}; use bitfield_struct::bitfield; +use core::intrinsics::likely; use raw_cpuid::CpuId; +use system_error::SystemError; use x86::controlregs::{Cr0, Cr4}; use x86_64::registers::control::EferFlags; use crate::{ - arch::{MMArch, VirtCpuArch}, + arch::{mm::LockedFrameAllocator, MMArch, VirtCpuArch}, libs::spinlock::{SpinLock, SpinLockGuard}, - mm::MemoryManagementArch, + mm::{page::PageMapper, MemoryManagementArch, PageTableKind}, + virt::vm::kvm_host::{vcpu::VirtCpu, Vm}, }; -use super::vmx::vmx_info; +use super::{vmx::vmx_info, x86_kvm_ops}; const PT64_ROOT_5LEVEL: usize = 5; const PT64_ROOT_4LEVEL: usize = 4; @@ -27,6 +31,7 @@ static mut SHADOW_ACCESSED_MASK: usize = 0; static mut MAX_HUGE_PAGE_LEVEL: PageLevel = PageLevel::None; +#[allow(dead_code)] pub enum PageLevel { None, Level4k, @@ -54,10 +59,11 @@ impl LockedKvmMmu { } #[derive(Debug, Default)] +#[allow(dead_code)] pub struct KvmMmu { - root: KvmMmuRootInfo, - cpu_role: KvmCpuRole, - root_role: KvmMmuPageRole, + pub root: KvmMmuRootInfo, + pub cpu_role: KvmCpuRole, + pub root_role: KvmMmuPageRole, pkru_mask: u32, @@ -68,7 +74,7 @@ pub struct KvmMmu { impl KvmMmu { const KVM_MMU_NUM_PREV_ROOTS: usize = 3; - const INVALID_PAGE: u64 = u64::MAX; + pub const INVALID_PAGE: u64 = u64::MAX; #[inline] pub fn tdp_enabled() -> bool { @@ -121,8 +127,8 @@ impl KvmMmu { #[derive(Debug, Default)] pub struct KvmMmuRootInfo { - pgd: u64, - hpa: u64, + pub pgd: u64, + pub hpa: u64, } #[derive(Debug, Default, Clone, Copy)] @@ -381,6 +387,11 @@ impl VirtCpuArch { self.root_mmu.as_ref().unwrap() } + #[inline] + pub fn mmu(&self) -> SpinLockGuard { + self.mmu.as_ref().unwrap().lock() + } + fn calc_tdp_mmu_root_page_role(&self, cpu_role: KvmCpuRole) -> KvmMmuPageRole { let mut role = KvmMmuPageRole::default(); @@ -397,3 +408,88 @@ impl VirtCpuArch { role } } + +impl VirtCpu { + pub fn kvm_mmu_reload(&mut self, vm: &Vm) -> Result<(), SystemError> { + if likely(self.arch.mmu().root.hpa != KvmMmu::INVALID_PAGE) { + return Ok(()); + } + + return self.kvm_mmu_load(vm); + } + + pub fn kvm_mmu_load(&mut self, vm: &Vm) -> Result<(), 
SystemError> { + let direct = self.arch.mmu().root_role.direct(); + self.mmu_topup_memory_caches(!direct)?; + self.mmu_alloc_special_roots()?; + + if direct { + self.mmu_alloc_direct_roots()?; + } else { + self.mmu_alloc_shadow_roots()?; + } + + // TODO: kvm_mmu_sync_roots + + self.kvm_mmu_load_pgd(vm); + + Ok(()) + } + + pub fn kvm_mmu_load_pgd(&mut self, vm: &Vm) { + let root_hpa = self.arch.mmu().root.hpa; + + if root_hpa == KvmMmu::INVALID_PAGE { + return; + } + + let level = self.arch.mmu().root_role.level(); + x86_kvm_ops().load_mmu_pgd(self, vm, root_hpa, level); + } + + fn mmu_topup_memory_caches(&mut self, _maybe_indirect: bool) -> Result<(), SystemError> { + // TODO + Ok(()) + } + + fn mmu_alloc_special_roots(&mut self) -> Result<(), SystemError> { + // TODO + Ok(()) + } + + fn mmu_alloc_direct_roots(&mut self) -> Result<(), SystemError> { + // let shadow_root_level = self.arch.mmu().root_role.level(); + + // if KvmMmu::tdp_enabled() { + // todo!() + // } else if shadow_root_level >= PT64_ROOT_4LEVEL as u32 { + // todo!() + // } else if shadow_root_level == PT32E_ROOT_LEVEL as u32 { + // todo!() + // } else { + // kerror!("Bad TDP root level = {}", shadow_root_level); + // return Err(SystemError::EIO); + // } + + // self.arch.mmu().root.pgd = 0; + // Ok(()) + + // 申请并创建新的页表 + let mapper: crate::mm::page::PageMapper = unsafe { + PageMapper::create(PageTableKind::EPT, LockedFrameAllocator) + .ok_or(SystemError::ENOMEM)? + }; + + let ept_root_hpa = mapper.table().phys(); + + self.arch.mmu().root.hpa = ept_root_hpa.data() as u64; + + kdebug!("ept_root_hpa:{:x}!", ept_root_hpa.data() as u64); + + Ok(()) + } + + fn mmu_alloc_shadow_roots(&mut self) -> Result<(), SystemError> { + todo!(); + } +} diff --git a/kernel/src/arch/x86_64/vm/mod.rs b/kernel/src/arch/x86_64/vm/mod.rs index c7996265e..4f9af1cc6 100644 --- a/kernel/src/arch/x86_64/vm/mod.rs +++ b/kernel/src/arch/x86_64/vm/mod.rs @@ -1,35 +1,15 @@ -use core::arch::x86_64::{_xgetbv, _XCR_XFEATURE_ENABLED_MASK}; - use alloc::vec::Vec; use raw_cpuid::CpuId; use system_error::SystemError; use x86::{ - controlregs::{xcr0, Cr0, Cr4, Xcr0}, - msr::{ - rdmsr, IA32_BIOS_SIGN_ID, IA32_CSTAR, IA32_EFER, IA32_FEATURE_CONTROL, IA32_FMASK, - IA32_KERNEL_GSBASE, IA32_LSTAR, IA32_MCG_CTL, IA32_MCG_STATUS, IA32_MISC_ENABLE, IA32_PAT, - IA32_PERFEVTSEL0, IA32_PERFEVTSEL7, IA32_PERF_CAPABILITIES, IA32_PMC0, IA32_PMC7, - IA32_SMBASE, IA32_STAR, IA32_SYSENTER_CS, IA32_SYSENTER_EIP, IA32_SYSENTER_ESP, - IA32_TIME_STAMP_COUNTER, IA32_TSC_ADJUST, IA32_TSC_AUX, IA32_TSC_DEADLINE, IA32_VMX_BASIC, - IA32_VMX_CR0_FIXED0, IA32_VMX_CR4_FIXED0, IA32_VMX_EPT_VPID_CAP, IA32_VMX_MISC, - IA32_VMX_PROCBASED_CTLS2, IA32_VMX_TRUE_ENTRY_CTLS, IA32_VMX_TRUE_EXIT_CTLS, - IA32_VMX_TRUE_PINBASED_CTLS, IA32_VMX_TRUE_PROCBASED_CTLS, IA32_VMX_VMCS_ENUM, - IA32_VMX_VMFUNC, MSR_C1_PMON_EVNT_SEL0, MSR_C5_PMON_BOX_CTRL, MSR_IA32_ADDR0_END, - MSR_IA32_ADDR0_START, MSR_IA32_ADDR1_END, MSR_IA32_ADDR1_START, MSR_IA32_ADDR2_END, - MSR_IA32_ADDR2_START, MSR_IA32_ADDR3_END, MSR_IA32_ADDR3_START, MSR_IA32_CR3_MATCH, - MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK_PTRS, - MSR_IA32_RTIT_STATUS, MSR_IA32_TSX_CTRL, MSR_PERF_FIXED_CTR0, MSR_PERF_FIXED_CTR2, - MSR_PLATFORM_INFO, MSR_POWER_CTL, MSR_SMI_COUNT, - }, -}; -use x86_64::registers::{ - control::{Efer, EferFlags}, - xcontrol::{XCr0, XCr0Flags}, + controlregs::{cr4, xcr0, Cr0, Cr4, Xcr0}, + msr::{self, rdmsr, wrmsr}, }; +use x86_64::registers::control::{Efer, EferFlags}; use crate::{ 
arch::vm::vmx::{VmxL1dFlushState, L1TF_VMX_MITIGATION}, - kdebug, kerror, + kerror, kwarn, libs::once::Once, mm::percpu::{PerCpu, PerCpuVar}, }; @@ -43,6 +23,7 @@ use super::driver::tsc::TSCManager; mod asm; mod cpuid; +pub(super) mod exit; pub mod kvm_host; pub mod mem; mod mmu; @@ -169,35 +150,35 @@ impl KvmArchManager { pub const KVM_MAX_NR_USER_RETURN_MSRS: usize = 7; const MSRS_TO_SAVE_BASE: &[u32] = &[ - IA32_SYSENTER_CS, - IA32_SYSENTER_ESP, - IA32_SYSENTER_EIP, - IA32_STAR, - IA32_CSTAR, - IA32_KERNEL_GSBASE, - IA32_FMASK, - IA32_LSTAR, - IA32_TIME_STAMP_COUNTER, - IA32_PAT, + msr::IA32_SYSENTER_CS, + msr::IA32_SYSENTER_ESP, + msr::IA32_SYSENTER_EIP, + msr::IA32_STAR, + msr::IA32_CSTAR, + msr::IA32_KERNEL_GSBASE, + msr::IA32_FMASK, + msr::IA32_LSTAR, + msr::IA32_TIME_STAMP_COUNTER, + msr::IA32_PAT, 0xc0010117, // MSR_VM_HSAVE_PA? - IA32_FEATURE_CONTROL, - MSR_C1_PMON_EVNT_SEL0, - IA32_TSC_AUX, + msr::IA32_FEATURE_CONTROL, + msr::MSR_C1_PMON_EVNT_SEL0, + msr::IA32_TSC_AUX, 0x48, // MSR_IA32_SPEC_CTRL - MSR_IA32_TSX_CTRL, - MSR_IA32_RTIT_CTL, - MSR_IA32_RTIT_STATUS, - MSR_IA32_CR3_MATCH, - MSR_IA32_RTIT_OUTPUT_BASE, - MSR_IA32_RTIT_OUTPUT_MASK_PTRS, - MSR_IA32_ADDR0_START, - MSR_IA32_ADDR0_END, - MSR_IA32_ADDR1_START, - MSR_IA32_ADDR1_END, - MSR_IA32_ADDR2_START, - MSR_IA32_ADDR2_END, - MSR_IA32_ADDR3_START, - MSR_IA32_ADDR3_END, + msr::MSR_IA32_TSX_CTRL, + msr::MSR_IA32_RTIT_CTL, + msr::MSR_IA32_RTIT_STATUS, + msr::MSR_IA32_CR3_MATCH, + msr::MSR_IA32_RTIT_OUTPUT_BASE, + msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS, + msr::MSR_IA32_ADDR0_START, + msr::MSR_IA32_ADDR0_END, + msr::MSR_IA32_ADDR1_START, + msr::MSR_IA32_ADDR1_END, + msr::MSR_IA32_ADDR2_START, + msr::MSR_IA32_ADDR2_END, + msr::MSR_IA32_ADDR3_START, + msr::MSR_IA32_ADDR3_END, 0xe1, // MSR_IA32_UMWAIT_CONTROL 0x1c4, // MSR_IA32_XFD 0x1c5, // MSR_IA32_XFD_ERR @@ -241,22 +222,22 @@ impl KvmArchManager { MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, - IA32_TSC_ADJUST, - IA32_TSC_DEADLINE, - IA32_PERF_CAPABILITIES, + msr::IA32_TSC_ADJUST, + msr::IA32_TSC_DEADLINE, + msr::IA32_PERF_CAPABILITIES, 0x10a, // MSR_IA32_ARCH_CAPABILITIES, - IA32_MISC_ENABLE, - IA32_MCG_STATUS, - IA32_MCG_CTL, + msr::IA32_MISC_ENABLE, + msr::IA32_MCG_STATUS, + msr::IA32_MCG_CTL, 0x4d0, // MSR_IA32_MCG_EXT_CTL, - IA32_SMBASE, - MSR_SMI_COUNT, - MSR_PLATFORM_INFO, + msr::IA32_SMBASE, + msr::MSR_SMI_COUNT, + msr::MSR_PLATFORM_INFO, 0x140, // MSR_MISC_FEATURES_ENABLES, 0xc001011f, // MSR_AMD64_VIRT_SPEC_CTRL, 0xc0000104, // MSR_AMD64_TSC_RATIO, - MSR_POWER_CTL, - IA32_BIOS_SIGN_ID, // MSR_IA32_UCODE_REV, + msr::MSR_POWER_CTL, + msr::IA32_BIOS_SIGN_ID, // MSR_IA32_UCODE_REV, /* * KVM always supports the "true" VMX control MSRs, even if the host * does not. The VMX MSRs as a whole are considered "emulated" as KVM @@ -264,27 +245,27 @@ impl KvmArchManager { * KVM would refuse to load in the first place if the core set of MSRs * aren't supported). 
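+     * (In this port, the reported values for these feature MSRs are
+     * populated via kvm_prove_feature_msr() below.)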
*/ - IA32_VMX_BASIC, - IA32_VMX_TRUE_PINBASED_CTLS, - IA32_VMX_TRUE_PROCBASED_CTLS, - IA32_VMX_TRUE_EXIT_CTLS, - IA32_VMX_TRUE_ENTRY_CTLS, - IA32_VMX_MISC, - IA32_VMX_CR0_FIXED0, - IA32_VMX_CR4_FIXED0, - IA32_VMX_VMCS_ENUM, - IA32_VMX_PROCBASED_CTLS2, - IA32_VMX_EPT_VPID_CAP, - IA32_VMX_VMFUNC, + msr::IA32_VMX_BASIC, + msr::IA32_VMX_TRUE_PINBASED_CTLS, + msr::IA32_VMX_TRUE_PROCBASED_CTLS, + msr::IA32_VMX_TRUE_EXIT_CTLS, + msr::IA32_VMX_TRUE_ENTRY_CTLS, + msr::IA32_VMX_MISC, + msr::IA32_VMX_CR0_FIXED0, + msr::IA32_VMX_CR4_FIXED0, + msr::IA32_VMX_VMCS_ENUM, + msr::IA32_VMX_PROCBASED_CTLS2, + msr::IA32_VMX_EPT_VPID_CAP, + msr::IA32_VMX_VMFUNC, 0xc0010015, // MSR_K7_HWCR, MSR_KVM_POLL_CONTROL, ]; const MSR_BASED_FEATURES_ALL_EXCEPT_VMX: &[u32] = &[ - 0xc0011029, // MSR_AMD64_DE_CFG - IA32_BIOS_SIGN_ID, // MSR_IA32_UCODE_REV - 0x10a, // MSR_IA32_ARCH_CAPABILITIES, - IA32_PERF_CAPABILITIES, + 0xc0011029, // MSR_AMD64_DE_CFG + msr::IA32_BIOS_SIGN_ID, // MSR_IA32_UCODE_REV + 0x10a, // MSR_IA32_ARCH_CAPABILITIES, + msr::IA32_PERF_CAPABILITIES, ]; pub fn arch_hardware_enable(&self) -> Result<(), SystemError> { @@ -338,7 +319,7 @@ impl KvmArchManager { // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9472 // 读取主机page attribute table(页属性表) - let host_pat = unsafe { rdmsr(IA32_PAT) }; + let host_pat = unsafe { rdmsr(msr::IA32_PAT) }; // PAT[0]是否为write back类型,即判断低三位是否为0b110(0x06) if host_pat & 0b111 != 0b110 { kerror!("[KVM] host PAT[0] is not WB"); @@ -346,16 +327,17 @@ impl KvmArchManager { } // TODO:mmu vendor init - if cpu_feature.has_xsave() { + if cpu_feature.has_xsave() && unsafe { cr4() }.contains(Cr4::CR4_ENABLE_OS_XSAVE) { self.host_xcr0 = unsafe { xcr0() }; self.kvm_caps.supported_xcr0 = self.host_xcr0; } + // 保存efer self.host_efer = Efer::read(); // 保存xss if cpu_extend.has_xsaves_xrstors() { - self.host_xss = unsafe { rdmsr(MSR_C5_PMON_BOX_CTRL) }; + self.host_xss = unsafe { rdmsr(msr::MSR_C5_PMON_BOX_CTRL) }; } // TODO: 初始化性能监视单元(PMU) @@ -389,6 +371,8 @@ impl KvmArchManager { kvm_caps.default_tsc_scaling_ratio = 1 << kvm_caps.tsc_scaling_ratio_frac_bits; self.kvm_init_msr_lists(); + + kwarn!("vendor init over"); Ok(()) } @@ -412,7 +396,7 @@ impl KvmArchManager { self.emulated_msrs.push(*msr); } - for msr in IA32_VMX_BASIC..=IA32_VMX_VMFUNC { + for msr in msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC { self.kvm_prove_feature_msr(msr) } @@ -427,13 +411,13 @@ impl KvmArchManager { let cpu_extend = cpuid.get_extended_feature_info().unwrap(); match msr { - MSR_C1_PMON_EVNT_SEL0 => { + msr::MSR_C1_PMON_EVNT_SEL0 => { if !cpu_extend.has_mpx() { return; } } - IA32_TSC_AUX => { + msr::IA32_TSC_AUX => { if !cpu_feat.has_tsc() { return; } @@ -444,39 +428,39 @@ impl KvmArchManager { return; } } - MSR_IA32_RTIT_CTL | MSR_IA32_RTIT_STATUS => { + msr::MSR_IA32_RTIT_CTL | msr::MSR_IA32_RTIT_STATUS => { if !cpu_extend.has_processor_trace() { return; } } - MSR_IA32_CR3_MATCH => { + msr::MSR_IA32_CR3_MATCH => { // TODO: 判断intel_pt_validate_hw_cap(PT_CAP_cr3_filtering) if !cpu_extend.has_processor_trace() { return; } } - MSR_IA32_RTIT_OUTPUT_BASE | MSR_IA32_RTIT_OUTPUT_MASK_PTRS => { + msr::MSR_IA32_RTIT_OUTPUT_BASE | msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS => { // TODO: 判断!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&!intel_pt_validate_hw_cap(PT_CAP_single_range_output) if !cpu_extend.has_processor_trace() { return; } } - MSR_IA32_ADDR0_START..MSR_IA32_ADDR3_END => { + msr::MSR_IA32_ADDR0_START..msr::MSR_IA32_ADDR3_END => { // TODO: 判断msr_index - MSR_IA32_RTIT_ADDR0_A >= 
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) if !cpu_extend.has_processor_trace() { return; } } - IA32_PMC0..IA32_PMC7 => { + msr::IA32_PMC0..msr::IA32_PMC7 => { // TODO: 判断msr是否符合配置 } - IA32_PERFEVTSEL0..IA32_PERFEVTSEL7 => { + msr::IA32_PERFEVTSEL0..msr::IA32_PERFEVTSEL7 => { // TODO: 判断msr是否符合配置 } - MSR_PERF_FIXED_CTR0..MSR_PERF_FIXED_CTR2 => { + msr::MSR_PERF_FIXED_CTR0..msr::MSR_PERF_FIXED_CTR2 => { // TODO: 判断msr是否符合配置 } - MSR_IA32_TSX_CTRL => { + msr::MSR_IA32_TSX_CTRL => { // TODO: !(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR) // 这个寄存器目前不支持,现在先return // return; @@ -507,10 +491,10 @@ impl KvmArchManager { // MSR_IA32_ARCH_CAPABILITIES, msr.data = self.get_arch_capabilities(); } - IA32_PERF_CAPABILITIES => { + msr::IA32_PERF_CAPABILITIES => { msr.data = self.kvm_caps.supported_perf_cap; } - IA32_BIOS_SIGN_ID => { + msr::IA32_BIOS_SIGN_ID => { // MSR_IA32_UCODE_REV msr.data = unsafe { rdmsr(msr.index) }; } @@ -523,9 +507,6 @@ impl KvmArchManager { } fn get_arch_capabilities(&self) -> u64 { - let cpuid = CpuId::new(); - let extend_feat = cpuid.get_extended_feature_info().unwrap(); - let mut data = ArchCapabilities::from_bits_truncate(self.host_arch_capabilities) & ArchCapabilities::KVM_SUPPORTED_ARCH_CAP; data.insert(ArchCapabilities::ARCH_CAP_PSCHANGE_MC_NO); @@ -561,6 +542,23 @@ impl KvmArchManager { // TODO:此处未完成 Ok(()) } + + pub fn kvm_set_user_return_msr(&self, slot: usize, mut value: u64, mask: u64) { + let msrs = user_return_msrs().get_mut(); + + value = (value & mask) | (msrs.values[slot].host & !mask); + if value == msrs.values[slot].curr { + return; + } + + unsafe { wrmsr(self.kvm_uret_msrs_list[slot], value) }; + + msrs.values[slot].curr = value; + + if !msrs.registered { + msrs.registered = true; + } + } } /// ### Kvm的功能特性 diff --git a/kernel/src/arch/x86_64/vm/uapi.rs b/kernel/src/arch/x86_64/vm/uapi.rs index e22b02b76..c7a8ccc24 100644 --- a/kernel/src/arch/x86_64/vm/uapi.rs +++ b/kernel/src/arch/x86_64/vm/uapi.rs @@ -1,3 +1,5 @@ +#![allow(dead_code)] + use crate::virt::vm::user_api::UapiKvmSegment; pub const DE_VECTOR: usize = 0; @@ -56,3 +58,45 @@ pub struct UapiKvmDtable { pub limit: u16, pub padding: [u16; 3usize], } + +#[allow(dead_code)] +pub mod kvm_exit { + pub const KVM_EXIT_UNKNOWN: u32 = 0; + pub const KVM_EXIT_EXCEPTION: u32 = 1; + pub const KVM_EXIT_IO: u32 = 2; + pub const KVM_EXIT_HYPERCALL: u32 = 3; + pub const KVM_EXIT_DEBUG: u32 = 4; + pub const KVM_EXIT_HLT: u32 = 5; + pub const KVM_EXIT_MMIO: u32 = 6; + pub const KVM_EXIT_IRQ_WINDOW_OPEN: u32 = 7; + pub const KVM_EXIT_SHUTDOWN: u32 = 8; + pub const KVM_EXIT_FAIL_ENTRY: u32 = 9; + pub const KVM_EXIT_INTR: u32 = 10; + pub const KVM_EXIT_SET_TPR: u32 = 11; + pub const KVM_EXIT_TPR_ACCESS: u32 = 12; + pub const KVM_EXIT_S390_SIEIC: u32 = 13; + pub const KVM_EXIT_S390_RESET: u32 = 14; + pub const KVM_EXIT_DCR: u32 = 15; + pub const KVM_EXIT_NMI: u32 = 16; + pub const KVM_EXIT_INTERNAL_ERROR: u32 = 17; + pub const KVM_EXIT_OSI: u32 = 18; + pub const KVM_EXIT_PAPR_HCALL: u32 = 19; + pub const KVM_EXIT_S390_UCONTROL: u32 = 20; + pub const KVM_EXIT_WATCHDOG: u32 = 21; + pub const KVM_EXIT_S390_TSCH: u32 = 22; + pub const KVM_EXIT_EPR: u32 = 23; + pub const KVM_EXIT_SYSTEM_EVENT: u32 = 24; + pub const KVM_EXIT_S390_STSI: u32 = 25; + pub const KVM_EXIT_IOAPIC_EOI: u32 = 26; + pub const KVM_EXIT_HYPERV: u32 = 27; + pub const KVM_EXIT_ARM_NISV: u32 = 28; + pub const KVM_EXIT_X86_RDMSR: u32 = 29; + pub const KVM_EXIT_X86_WRMSR: u32 = 30; + pub const KVM_EXIT_DIRTY_RING_FULL: u32 = 31; + pub 
const KVM_EXIT_AP_RESET_HOLD: u32 = 32; + pub const KVM_EXIT_X86_BUS_LOCK: u32 = 33; + pub const KVM_EXIT_XEN: u32 = 34; + pub const KVM_EXIT_RISCV_SBI: u32 = 35; + pub const KVM_EXIT_RISCV_CSR: u32 = 36; + pub const KVM_EXIT_NOTIFY: u32 = 37; +} diff --git a/kernel/src/arch/x86_64/vm/vmx/asm.rs b/kernel/src/arch/x86_64/vm/vmx/asm.rs new file mode 100644 index 000000000..92d857507 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/vmx/asm.rs @@ -0,0 +1,19 @@ +#![allow(dead_code)] + +pub const VMX_EPT_MT_EPTE_SHIFT: u64 = 3; +pub const VMX_EPTP_PWL_MASK: u64 = 0x38; +pub const VMX_EPTP_PWL_4: u64 = 0x18; +pub const VMX_EPTP_PWL_5: u64 = 0x20; +pub const VMX_EPTP_AD_ENABLE_BIT: u64 = 1 << 6; +pub const VMX_EPTP_MT_MASK: u64 = 0x7; +pub const VMX_EPTP_MT_WB: u64 = 0x6; +pub const VMX_EPTP_MT_UC: u64 = 0x0; +pub const VMX_EPT_READABLE_MASK: u64 = 0x1; +pub const VMX_EPT_WRITABLE_MASK: u64 = 0x2; +pub const VMX_EPT_EXECUTABLE_MASK: u64 = 0x4; +pub const VMX_EPT_IPAT_BIT: u64 = 1 << 6; +pub const VMX_EPT_ACCESS_BIT: u64 = 1 << 8; +pub const VMX_EPT_DIRTY_BIT: u64 = 1 << 9; +pub const VMX_EPT_RWX_MASK: u64 = + VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | VMX_EPT_EXECUTABLE_MASK; +pub const VMX_EPT_MT_MASK: u64 = 7 << VMX_EPT_MT_EPTE_SHIFT; diff --git a/kernel/src/arch/x86_64/vm/vmx/capabilities.rs b/kernel/src/arch/x86_64/vm/vmx/capabilities.rs index 71c947f24..3e33ba7bb 100644 --- a/kernel/src/arch/x86_64/vm/vmx/capabilities.rs +++ b/kernel/src/arch/x86_64/vm/vmx/capabilities.rs @@ -1,12 +1,6 @@ use raw_cpuid::CpuId; use x86::{ - msr::{ - IA32_VMX_BASIC, IA32_VMX_CR0_FIXED0, IA32_VMX_CR0_FIXED1, IA32_VMX_CR4_FIXED0, - IA32_VMX_CR4_FIXED1, IA32_VMX_ENTRY_CTLS, IA32_VMX_EPT_VPID_CAP, IA32_VMX_EXIT_CTLS, - IA32_VMX_MISC, IA32_VMX_PINBASED_CTLS, IA32_VMX_PROCBASED_CTLS, IA32_VMX_PROCBASED_CTLS2, - IA32_VMX_TRUE_ENTRY_CTLS, IA32_VMX_TRUE_EXIT_CTLS, IA32_VMX_TRUE_PINBASED_CTLS, - IA32_VMX_TRUE_PROCBASED_CTLS, IA32_VMX_VMCS_ENUM, IA32_VMX_VMFUNC, - }, + msr, vmx::vmcs::control::{ EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls, }, @@ -108,68 +102,68 @@ impl NestedVmxMsrs { pub fn get_vmx_msr(&self, msr_index: u32) -> Option { match msr_index { - IA32_VMX_BASIC => { + msr::IA32_VMX_BASIC => { return Some(self.basic); } - IA32_VMX_TRUE_PINBASED_CTLS | IA32_VMX_PINBASED_CTLS => { + msr::IA32_VMX_TRUE_PINBASED_CTLS | msr::IA32_VMX_PINBASED_CTLS => { let mut data = NestedVmxMsrs::control_msr(self.pinbased_ctls_low, self.pinbased_ctls_high); - if msr_index == IA32_VMX_PINBASED_CTLS { + if msr_index == msr::IA32_VMX_PINBASED_CTLS { data |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; } return Some(data); } - IA32_VMX_TRUE_PROCBASED_CTLS | IA32_VMX_PROCBASED_CTLS => { + msr::IA32_VMX_TRUE_PROCBASED_CTLS | msr::IA32_VMX_PROCBASED_CTLS => { let mut data = NestedVmxMsrs::control_msr(self.procbased_ctls_low, self.procbased_ctls_high); - if msr_index == IA32_VMX_PROCBASED_CTLS { + if msr_index == msr::IA32_VMX_PROCBASED_CTLS { data |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; } return Some(data); } - IA32_VMX_TRUE_EXIT_CTLS | IA32_VMX_EXIT_CTLS => { + msr::IA32_VMX_TRUE_EXIT_CTLS | msr::IA32_VMX_EXIT_CTLS => { let mut data = NestedVmxMsrs::control_msr(self.exit_ctls_low, self.exit_ctls_high); - if msr_index == IA32_VMX_EXIT_CTLS { + if msr_index == msr::IA32_VMX_EXIT_CTLS { data |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; } return Some(data); } - IA32_VMX_TRUE_ENTRY_CTLS | IA32_VMX_ENTRY_CTLS => { + msr::IA32_VMX_TRUE_ENTRY_CTLS | msr::IA32_VMX_ENTRY_CTLS => { let mut data = 
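            // The IA32_VMX_TRUE_* capability MSRs omit the default-1 ("always on")
            // control bits; they are OR-ed back in below whenever the legacy
            // (non-TRUE) MSR is the one being queried.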
NestedVmxMsrs::control_msr(self.entry_ctls_low, self.entry_ctls_high); - if msr_index == IA32_VMX_ENTRY_CTLS { + if msr_index == msr::IA32_VMX_ENTRY_CTLS { data |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; } return Some(data); } - IA32_VMX_MISC => { + msr::IA32_VMX_MISC => { return Some(NestedVmxMsrs::control_msr(self.misc_low, self.misc_high)); } - IA32_VMX_CR0_FIXED0 => { + msr::IA32_VMX_CR0_FIXED0 => { return Some(self.cr0_fixed0); } - IA32_VMX_CR0_FIXED1 => { + msr::IA32_VMX_CR0_FIXED1 => { return Some(self.cr0_fixed1); } - IA32_VMX_CR4_FIXED0 => { + msr::IA32_VMX_CR4_FIXED0 => { return Some(self.cr4_fixed0); } - IA32_VMX_CR4_FIXED1 => { + msr::IA32_VMX_CR4_FIXED1 => { return Some(self.cr4_fixed1); } - IA32_VMX_VMCS_ENUM => { + msr::IA32_VMX_VMCS_ENUM => { return Some(self.vmcs_enum); } - IA32_VMX_PROCBASED_CTLS2 => { + msr::IA32_VMX_PROCBASED_CTLS2 => { return Some(NestedVmxMsrs::control_msr( self.secondary_ctls_low, self.secondary_ctls_high, )); } - IA32_VMX_EPT_VPID_CAP => { + msr::IA32_VMX_EPT_VPID_CAP => { return Some(self.ept_caps as u64 | ((self.vpid_caps as u64) << 32)); } - IA32_VMX_VMFUNC => { + msr::IA32_VMX_VMFUNC => { return Some(self.vmfunc_controls); } _ => { @@ -243,6 +237,7 @@ impl VmxCapability { impl Vmx { /// 检查处理器是否支持VMX基本控制结构的输入输出功能 #[inline] + #[allow(dead_code)] pub fn has_basic_inout(&self) -> bool { return ((self.vmcs_config.basic_cap as u64) << 32) & VmxFeat::VMX_BASIC_INOUT != 0; } @@ -402,6 +397,11 @@ impl Vmx { return self.vmx_cap.ept.contains(EptFlag::EPTP_WB); } + #[inline] + pub fn has_vmx_invept_context(&self) -> bool { + self.vmx_cap.ept.contains(EptFlag::EPT_EXTENT_CONTEXT) + } + /// EPT是否支持全局拓展 #[inline] pub fn has_invept_global(&self) -> bool { diff --git a/kernel/src/arch/x86_64/vm/vmx/exit.rs b/kernel/src/arch/x86_64/vm/vmx/exit.rs new file mode 100644 index 000000000..d32168ff7 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/vmx/exit.rs @@ -0,0 +1,204 @@ +use bitfield_struct::bitfield; +use system_error::SystemError; + +use crate::virt::vm::kvm_host::vcpu::VirtCpu; + +#[bitfield(u32)] +pub struct VmxExitReason { + pub basic: u16, + pub reserved16: bool, + pub reserved17: bool, + pub reserved18: bool, + pub reserved19: bool, + pub reserved20: bool, + pub reserved21: bool, + pub reserved22: bool, + pub reserved23: bool, + pub reserved24: bool, + pub reserved25: bool, + pub bus_lock_detected: bool, + pub enclave_mode: bool, + pub smi_pending_mtf: bool, + pub smi_from_vmx_root: bool, + pub reserved30: bool, + pub failed_vmentry: bool, +} + +#[derive(FromPrimitive, PartialEq)] +#[allow(non_camel_case_types)] +pub enum VmxExitReasonBasic { + EXCEPTION_OR_NMI = 0, + EXTERNAL_INTERRUPT = 1, + TRIPLE_FAULT = 2, + INIT_SIGNAL = 3, + SIPI = 4, + IO_SMI = 5, + OTHER_SMI = 6, + INTERRUPT_WINDOW = 7, + NMI_WINDOW = 8, + TASK_SWITCH = 9, + CPUID = 10, + GETSEC = 11, + HLT = 12, + INVD = 13, + INVLPG = 14, + RDPMC = 15, + RDTSC = 16, + RSM = 17, + VMCALL = 18, + VMCLEAR = 19, + VMLAUNCH = 20, + VMPTRLD = 21, + VMPTRST = 22, + VMREAD = 23, + VMRESUME = 24, + VMWRITE = 25, + VMXOFF = 26, + VMXON = 27, + CR_ACCESS = 28, + DR_ACCESS = 29, + IO_INSTRUCTION = 30, + RDMSR = 31, + WRMSR = 32, + VM_ENTRY_FAILURE_INVALID_GUEST_STATE = 33, + VM_ENTRY_FAILURE_MSR_LOADING = 34, + MWAIT = 36, + MONITOR_TRAP_FLAG = 37, + MONITOR = 39, + PAUSE = 40, + VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT = 41, + TPR_BELOW_THRESHOLD = 43, + APIC_ACCESS = 44, + VIRTUALIZED_EOI = 45, + ACCESS_GDTR_OR_IDTR = 46, + ACCESS_LDTR_OR_TR = 47, + EPT_VIOLATION = 48, + EPT_MISCONFIG = 49, + INVEPT = 50, + RDTSCP 
= 51, + VMX_PREEMPTION_TIMER_EXPIRED = 52, + INVVPID = 53, + WBINVD = 54, + XSETBV = 55, + APIC_WRITE = 56, + RDRAND = 57, + INVPCID = 58, + VMFUNC = 59, + ENCLS = 60, + RDSEED = 61, + PML_FULL = 62, + XSAVES = 63, + XRSTORS = 64, + + UMWAIT = 67, + TPAUSE = 68, + BUS_LOCK = 74, + NOTIFY = 75, + + UNKNOWN, +} + +impl From for VmxExitReasonBasic { + fn from(num: u16) -> Self { + match num { + 0 => VmxExitReasonBasic::EXCEPTION_OR_NMI, + 1 => VmxExitReasonBasic::EXTERNAL_INTERRUPT, + 2 => VmxExitReasonBasic::TRIPLE_FAULT, + 3 => VmxExitReasonBasic::INIT_SIGNAL, + 4 => VmxExitReasonBasic::SIPI, + 5 => VmxExitReasonBasic::IO_SMI, + 6 => VmxExitReasonBasic::OTHER_SMI, + 7 => VmxExitReasonBasic::INTERRUPT_WINDOW, + 8 => VmxExitReasonBasic::NMI_WINDOW, + 9 => VmxExitReasonBasic::TASK_SWITCH, + 10 => VmxExitReasonBasic::CPUID, + 11 => VmxExitReasonBasic::GETSEC, + 12 => VmxExitReasonBasic::HLT, + 13 => VmxExitReasonBasic::INVD, + 14 => VmxExitReasonBasic::INVLPG, + 15 => VmxExitReasonBasic::RDPMC, + 16 => VmxExitReasonBasic::RDTSC, + 17 => VmxExitReasonBasic::RSM, + 18 => VmxExitReasonBasic::VMCALL, + 19 => VmxExitReasonBasic::VMCLEAR, + 20 => VmxExitReasonBasic::VMLAUNCH, + 21 => VmxExitReasonBasic::VMPTRLD, + 22 => VmxExitReasonBasic::VMPTRST, + 23 => VmxExitReasonBasic::VMREAD, + 24 => VmxExitReasonBasic::VMRESUME, + 25 => VmxExitReasonBasic::VMWRITE, + 26 => VmxExitReasonBasic::VMXOFF, + 27 => VmxExitReasonBasic::VMXON, + 28 => VmxExitReasonBasic::CR_ACCESS, + 29 => VmxExitReasonBasic::DR_ACCESS, + 30 => VmxExitReasonBasic::IO_INSTRUCTION, + 31 => VmxExitReasonBasic::RDMSR, + 32 => VmxExitReasonBasic::WRMSR, + 33 => VmxExitReasonBasic::VM_ENTRY_FAILURE_INVALID_GUEST_STATE, + 34 => VmxExitReasonBasic::VM_ENTRY_FAILURE_MSR_LOADING, + 36 => VmxExitReasonBasic::MWAIT, + 37 => VmxExitReasonBasic::MONITOR_TRAP_FLAG, + 39 => VmxExitReasonBasic::MONITOR, + 40 => VmxExitReasonBasic::PAUSE, + 41 => VmxExitReasonBasic::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT, + 43 => VmxExitReasonBasic::TPR_BELOW_THRESHOLD, + 44 => VmxExitReasonBasic::APIC_ACCESS, + 45 => VmxExitReasonBasic::VIRTUALIZED_EOI, + 46 => VmxExitReasonBasic::ACCESS_GDTR_OR_IDTR, + 47 => VmxExitReasonBasic::ACCESS_LDTR_OR_TR, + 48 => VmxExitReasonBasic::EPT_VIOLATION, + 49 => VmxExitReasonBasic::EPT_MISCONFIG, + 50 => VmxExitReasonBasic::INVEPT, + 51 => VmxExitReasonBasic::RDTSCP, + 52 => VmxExitReasonBasic::VMX_PREEMPTION_TIMER_EXPIRED, + 53 => VmxExitReasonBasic::INVVPID, + 54 => VmxExitReasonBasic::WBINVD, + 55 => VmxExitReasonBasic::XSETBV, + 56 => VmxExitReasonBasic::APIC_WRITE, + 57 => VmxExitReasonBasic::RDRAND, + 58 => VmxExitReasonBasic::INVPCID, + 59 => VmxExitReasonBasic::VMFUNC, + 60 => VmxExitReasonBasic::ENCLS, + 61 => VmxExitReasonBasic::RDSEED, + 62 => VmxExitReasonBasic::PML_FULL, + 63 => VmxExitReasonBasic::XSAVES, + 64 => VmxExitReasonBasic::XRSTORS, + + 67 => VmxExitReasonBasic::UMWAIT, + 68 => VmxExitReasonBasic::TPAUSE, + 74 => VmxExitReasonBasic::BUS_LOCK, + 75 => VmxExitReasonBasic::NOTIFY, + _ => VmxExitReasonBasic::UNKNOWN, + } + } +} + +#[derive(Debug, PartialEq)] +#[allow(dead_code)] +pub enum ExitFastpathCompletion { + None, + ReenterGuest, + ExitHandled, +} + +pub struct VmxExitHandler; + +impl VmxExitHandler { + pub fn handle( + vcpu: &mut VirtCpu, + basic: VmxExitReasonBasic, + ) -> Option> { + match basic { + VmxExitReasonBasic::IO_INSTRUCTION => { + return Some(Self::handle_io(vcpu)); + } + _ => { + return None; + } + } + } + + fn handle_io(vcpu: &mut VirtCpu) -> Result<(), SystemError> { + todo!(); + } +} diff 
--git a/kernel/src/arch/x86_64/vm/vmx/mod.rs b/kernel/src/arch/x86_64/vm/vmx/mod.rs index fe6b1917e..f210ae233 100644 --- a/kernel/src/arch/x86_64/vm/vmx/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/mod.rs @@ -1,17 +1,22 @@ +use core::intrinsics::likely; use core::intrinsics::unlikely; use core::sync::atomic::{AtomicBool, Ordering}; +use crate::arch::process::table::USER_DS; use crate::arch::vm::mmu::KvmMmu; +use crate::arch::vm::uapi::kvm_exit; use crate::arch::vm::uapi::{ AC_VECTOR, BP_VECTOR, DB_VECTOR, GP_VECTOR, MC_VECTOR, NM_VECTOR, PF_VECTOR, UD_VECTOR, }; +use crate::arch::vm::vmx::vmcs::VmcsIntrHelper; use crate::libs::spinlock::SpinLockGuard; +use crate::process::ProcessManager; use crate::virt::vm::kvm_host::vcpu::GuestDebug; use crate::{ arch::{ vm::{ asm::KvmX86Asm, - kvm_host::{vcpu::VirCpuRequest, X86KvmArch}, + kvm_host::{vcpu::VirtCpuRequest, X86KvmArch}, vmx::vmcs::vmx_area, }, CurrentIrqArch, MMArch, VirtCpuArch, @@ -27,27 +32,26 @@ use crate::{ virt::vm::{kvm_dev::kvm_init, kvm_host::vcpu::VirtCpu, user_api::UapiKvmSegment}, }; use alloc::{alloc::Global, boxed::Box, collections::LinkedList, sync::Arc, vec::Vec}; +use asm::VMX_EPTP_AD_ENABLE_BIT; +use asm::VMX_EPTP_MT_WB; +use asm::VMX_EPTP_PWL_4; +use asm::VMX_EPTP_PWL_5; use bitfield_struct::bitfield; -use bitmap::{traits::BitMapOps, AllocBitmap, StaticBitmap}; +use bitmap::{traits::BitMapOps, AllocBitmap}; use raw_cpuid::CpuId; use system_error::SystemError; use x86::controlregs::{cr2, cr2_write}; -use x86::irq::PageFaultError; +use x86::dtables::ldtr; use x86::msr::wrmsr; +use x86::segmentation::load_ds; +use x86::segmentation::load_es; +use x86::segmentation::{ds, es, fs, gs}; +use x86::vmx::vmcs::ro; use x86::{ bits64::rflags::RFlags, - controlregs::{cr0, cr3, cr4, Cr0, Cr4, Xcr0}, - msr::{ - self, rdmsr, IA32_CSTAR, IA32_EFER, IA32_FMASK, IA32_FS_BASE, IA32_GS_BASE, - IA32_KERNEL_GSBASE, IA32_LSTAR, IA32_SMBASE, IA32_STAR, IA32_SYSENTER_CS, - IA32_SYSENTER_EIP, IA32_SYSENTER_ESP, IA32_TIME_STAMP_COUNTER, IA32_TSC_AUX, - IA32_VMX_BASIC, IA32_VMX_EPT_VPID_CAP, IA32_VMX_MISC, IA32_VMX_VMFUNC, - MSR_CORE_C1_RESIDENCY, MSR_CORE_C3_RESIDENCY, MSR_CORE_C6_RESIDENCY, MSR_CORE_C7_RESIDENCY, - MSR_IA32_ADDR0_START, MSR_IA32_ADDR3_END, MSR_IA32_CR3_MATCH, MSR_IA32_RTIT_OUTPUT_BASE, - MSR_IA32_RTIT_OUTPUT_MASK_PTRS, MSR_IA32_RTIT_STATUS, MSR_IA32_TSX_CTRL, - MSR_LASTBRANCH_TOS, MSR_LBR_SELECT, - }, - segmentation::{self, cs}, + controlregs::{cr0, cr4, Cr0, Cr4, Xcr0}, + msr::{self, rdmsr}, + segmentation::{self}, vmx::vmcs::{ control::{ self, EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls, @@ -56,7 +60,6 @@ use x86::{ }, }; use x86_64::registers::control::Cr3; -use x86_64::structures::idt::PageFaultErrorCode; use x86_64::{instructions::tables::sidt, registers::control::EferFlags}; use crate::{ @@ -69,6 +72,9 @@ use crate::{ virt::vm::kvm_host::Vm, }; +use self::exit::ExitFastpathCompletion; +use self::exit::VmxExitReason; +use self::exit::VmxExitReasonBasic; use self::vmcs::LoadedVmcs; use self::{ capabilities::{ProcessorTraceMode, VmcsConfig, VmxCapability}, @@ -79,19 +85,20 @@ use self::{ }, }; +use super::asm::IntrInfo; use super::asm::SegmentCacheField; use super::kvm_host::RMODE_TSS_SIZE; use super::x86_kvm_ops; use super::{ asm::{VcpuSegment, VmxAsm, VmxMsrEntry}, init_kvm_arch, - kvm_host::{ - vcpu, KvmFunc, KvmInitFunc, KvmIrqChipMode, KvmReg, MsrFilterType, NotifyVmExitFlags, - }, + kvm_host::{KvmFunc, KvmInitFunc, KvmIrqChipMode, KvmReg, MsrFilterType, NotifyVmExitFlags}, 
x86_kvm_manager, KvmArchManager, }; +pub mod asm; pub mod capabilities; +pub mod exit; pub mod vmcs; extern "C" { @@ -278,6 +285,7 @@ impl KvmInitFunc for VmxKvmInitFunc { init_vmx(vmx_init); self.setup_per_cpu(); + kwarn!("hardware setup finish"); Ok(()) } @@ -344,8 +352,11 @@ impl VmxKvmFunc { vmx.loaded_vmcs.lock().cpu = cpu; let id = vmx.loaded_vmcs.lock().vmcs.lock().revision_id(); - kdebug!("revision_id {id}"); - vcpu.request(VirCpuRequest::KVM_REQ_TLB_FLUSH); + kdebug!( + "revision_id {id} req {:?}", + VirtCpuRequest::KVM_REQ_TLB_FLUSH + ); + vcpu.request(VirtCpuRequest::KVM_REQ_TLB_FLUSH); VmxAsm::vmx_vmwrite( host::TR_BASE, @@ -358,7 +369,9 @@ impl VmxKvmFunc { VmxAsm::vmx_vmwrite(host::GDTR_BASE, pseudo_descriptpr.base as usize as u64); - VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_ESP, unsafe { rdmsr(IA32_SYSENTER_ESP) }); + VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_ESP, unsafe { + rdmsr(msr::IA32_SYSENTER_ESP) + }); } } @@ -489,7 +502,7 @@ impl KvmFunc for VmxKvmFunc { } } - fn apicv_pre_state_restore(&self, vcpu: &mut VirtCpu) { + fn apicv_pre_state_restore(&self, _vcpu: &mut VirtCpu) { // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#6924 // TODO: pi // todo!() @@ -604,9 +617,8 @@ impl KvmFunc for VmxKvmFunc { _ => { let uret_msr = vmx.find_uret_msr(msr_index); - if let Some(msr) = uret_msr { - let mut tmp_msr = VmxUretMsr::from(*msr); - vmx.set_guest_uret_msr(&mut tmp_msr, data)?; + if let Some((idx, _msr)) = uret_msr { + vmx.set_guest_uret_msr(idx, data)?; vmx.set_uret_msr(msr_index, data); } else { vcpu.arch.set_msr_common(&msr); @@ -674,7 +686,7 @@ impl KvmFunc for VmxKvmFunc { VmxAsm::vmx_vmwrite(control::VMENTRY_INTERRUPTION_INFO_FIELD, 0); - vcpu.request(VirCpuRequest::KVM_REQ_APIC_PAGE_RELOAD); + vcpu.request(VirtCpuRequest::MAKE_KVM_REQ_APIC_PAGE_RELOAD); vmx_info().vpid_sync_context(vcpu.vmx().vpid); @@ -702,7 +714,6 @@ impl KvmFunc for VmxKvmFunc { VmxAsm::vmx_vmwrite(guest::RFLAGS, rflags.bits()); if (old_rflags ^ vmx.rflags).contains(RFlags::FLAGS_VM) { - drop(vmx); let emulation_required = vmx_info().emulation_required(vcpu); vcpu.vmx_mut().emulation_required = emulation_required; } @@ -880,12 +891,12 @@ impl KvmFunc for VmxKvmFunc { fn has_emulated_msr(&self, msr: u32) -> bool { match msr { - IA32_SMBASE => { + msr::IA32_SMBASE => { return vmx_info().enable_unrestricted_guest || vmx_info().emulate_invalid_guest_state; } - IA32_VMX_BASIC..=IA32_VMX_VMFUNC => { + msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC => { return vmx_info().nested; } @@ -902,7 +913,7 @@ impl KvmFunc for VmxKvmFunc { fn get_msr_feature(&self, msr: &mut super::asm::VmxMsrEntry) -> bool { match msr.index { - IA32_VMX_BASIC..=IA32_VMX_VMFUNC => { + msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC => { if !vmx_info().nested { return false; } @@ -988,7 +999,7 @@ impl KvmFunc for VmxKvmFunc { VmxAsm::vmx_vmwrite(guest::GDTR_BASE, dt.base as usize as u64); } - fn is_vaild_cr0(&self, vcpu: &VirtCpu, cr0: Cr0) -> bool { + fn is_vaild_cr0(&self, vcpu: &VirtCpu, _cr0: Cr0) -> bool { if vcpu.arch.is_guest_mode() { todo!() } @@ -1008,11 +1019,11 @@ impl KvmFunc for VmxKvmFunc { return true; } - fn post_set_cr3(&self, vcpu: &VirtCpu, cr3: u64) { - // Nothing + fn post_set_cr3(&self, _vcpu: &VirtCpu, _cr3: u64) { + // Do Nothing } - fn vcpu_run(&self, vcpu: &mut VirtCpu) { + fn vcpu_run(&self, vcpu: &mut VirtCpu) -> ExitFastpathCompletion { if unlikely(vmx_info().enable_vnmi && vcpu.vmx().loaded_vmcs().soft_vnmi_blocked) { todo!() } @@ -1060,12 +1071,168 @@ impl KvmFunc for VmxKvmFunc { // TODO: 
atomic_switch_perf_msrs if vmx_info().enable_preemption_timer { - todo!() + // todo!() + kwarn!("vmx_update_hv_timer TODO"); } Vmx::vmx_vcpu_enter_exit(vcpu, vcpu.vmx().vmx_vcpu_run_flags()); - todo!() + unsafe { + load_ds(USER_DS); + load_es(USER_DS); + }; + + // TODO: pt_guest_exit + + // TODO: kvm_load_host_xsave_state + + if vcpu.arch.is_guest_mode() { + todo!() + } + + if unlikely(vcpu.vmx().fail != 0) { + return ExitFastpathCompletion::None; + } + + if unlikely( + vcpu.vmx().exit_reason.basic() + == VmxExitReasonBasic::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT as u16, + ) { + todo!() + } + + if unlikely(vcpu.vmx().exit_reason.failed_vmentry()) { + return ExitFastpathCompletion::None; + } + + vcpu.vmx().loaded_vmcs().launched = true; + + // TODO: 处理中断 + + if vcpu.arch.is_guest_mode() { + return ExitFastpathCompletion::None; + } + + return Vmx::vmx_exit_handlers_fastpath(vcpu); + } + + fn prepare_switch_to_guest(&self, vcpu: &mut VirtCpu) { + // let cpu = smp_get_processor_id(); + let vmx = vcpu.vmx_mut(); + vmx.req_immediate_exit = false; + + if !vmx.guest_uret_msrs_loaded { + vmx.guest_uret_msrs_loaded = true; + + for (idx, msr) in vmx.guest_uret_msrs.iter().enumerate() { + if msr.load_into_hardware { + x86_kvm_manager().kvm_set_user_return_msr(idx, msr.data, msr.mask); + } + } + } + + // TODO: nested + + if vmx.guest_state_loaded { + return; + } + + // fixme: 这里读的是当前cpu的gsbase,正确安全做法应该为将gsbase设置为percpu变量 + let gs_base = unsafe { rdmsr(msr::IA32_KERNEL_GSBASE) }; + + let current = ProcessManager::current_pcb(); + let mut pcb_arch = current.arch_info_irqsave(); + + let fs_sel = fs().bits(); + let gs_sel = gs().bits(); + + unsafe { + pcb_arch.save_fsbase(); + pcb_arch.save_gsbase(); + } + + let fs_base = pcb_arch.fsbase(); + vmx.msr_host_kernel_gs_base = pcb_arch.gsbase() as u64; + + unsafe { wrmsr(msr::IA32_KERNEL_GSBASE, vmx.msr_guest_kernel_gs_base) }; + + let mut loaded_vmcs = vmx.loaded_vmcs(); + let host_state = &mut loaded_vmcs.host_state; + host_state.ldt_sel = unsafe { ldtr() }.bits(); + + host_state.ds_sel = ds().bits(); + host_state.es_sel = es().bits(); + + host_state.set_host_fsgs(fs_sel, gs_sel, fs_base, gs_base as usize); + drop(loaded_vmcs); + + vmx.guest_state_loaded = true; + } + + fn flush_tlb_all(&self, vcpu: &mut VirtCpu) { + if vmx_info().enable_ept { + VmxAsm::ept_sync_global(); + } else { + if vmx_info().has_invvpid_global() { + VmxAsm::sync_vcpu_global(); + } else { + VmxAsm::sync_vcpu_single(vcpu.vmx().vpid); + // TODO: 嵌套:VmxAsm::sync_vcpu_single(vcpu.vmx().nested.vpid02); + } + } + } + + fn handle_exit_irqoff(&self, vcpu: &mut VirtCpu) { + if vcpu.vmx().emulation_required { + return; + } + + let basic = VmxExitReasonBasic::from(vcpu.vmx().exit_reason.basic()); + + if basic == VmxExitReasonBasic::EXTERNAL_INTERRUPT { + todo!() + } else if basic == VmxExitReasonBasic::EXCEPTION_OR_NMI { + todo!() + } + } + + fn handle_exit( + &self, + vcpu: &mut VirtCpu, + fastpath: ExitFastpathCompletion, + ) -> Result<(), SystemError> { + let r = vmx_info().vmx_handle_exit(vcpu, fastpath); + + if vcpu.vmx().exit_reason.bus_lock_detected() { + todo!() + } + + r + } + + fn load_mmu_pgd(&self, vcpu: &mut VirtCpu, _vm: &Vm, root_hpa: u64, root_level: u32) { + let guest_cr3; + let eptp; + + if vmx_info().enable_ept { + eptp = vmx_info().construct_eptp(vcpu, root_hpa, root_level); + + VmxAsm::vmx_vmwrite(control::EPTP_FULL, eptp); + + if !vmx_info().enable_unrestricted_guest + && !vcpu.arch.cr0.contains(Cr0::CR0_ENABLE_PAGING) + { + todo!() + } else if 
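+            // With EPT on, the guest CR3 vmwrite below is only needed when the
+            // cached CR3 was actually dirtied; otherwise the early return skips it.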
vcpu.arch.is_register_dirty(KvmReg::VcpuExregCr3) { + guest_cr3 = vcpu.arch.cr3; + } else { + return; + } + } else { + todo!(); + } + + VmxAsm::vmx_vmwrite(guest::CR3, guest_cr3); } } @@ -1168,7 +1335,9 @@ impl Vmx { * an error that should result in #GP in the guest, unless userspace * handles it. */ + #[allow(dead_code)] pub const KVM_MSR_RET_INVALID: u32 = 2; /* in-kernel MSR emulation #GP condition */ + #[allow(dead_code)] pub const KVM_MSR_RET_FILTERED: u32 = 3; /* #GP due to userspace MSR filter */ pub const MAX_POSSIBLE_PASSTHROUGH_MSRS: usize = 16; @@ -1177,19 +1346,19 @@ impl Vmx { 0x48, // MSR_IA32_SPEC_CTRL 0x49, // MSR_IA32_PRED_CMD 0x10b, // MSR_IA32_FLUSH_CMD - IA32_TIME_STAMP_COUNTER, - IA32_FS_BASE, - IA32_GS_BASE, - IA32_KERNEL_GSBASE, + msr::IA32_TIME_STAMP_COUNTER, + msr::IA32_FS_BASE, + msr::IA32_GS_BASE, + msr::IA32_KERNEL_GSBASE, 0x1c4, // MSR_IA32_XFD 0x1c5, // MSR_IA32_XFD_ERR - IA32_SYSENTER_CS, - IA32_SYSENTER_ESP, - IA32_SYSENTER_EIP, - MSR_CORE_C1_RESIDENCY, - MSR_CORE_C3_RESIDENCY, - MSR_CORE_C6_RESIDENCY, - MSR_CORE_C7_RESIDENCY, + msr::IA32_SYSENTER_CS, + msr::IA32_SYSENTER_ESP, + msr::IA32_SYSENTER_EIP, + msr::MSR_CORE_C1_RESIDENCY, + msr::MSR_CORE_C3_RESIDENCY, + msr::MSR_CORE_C6_RESIDENCY, + msr::MSR_CORE_C7_RESIDENCY, ]; /// ### 查看CPU是否支持虚拟化 @@ -1214,12 +1383,12 @@ impl Vmx { #[inline(never)] pub fn set_up_user_return_msrs() { const VMX_URET_MSRS_LIST: &'static [u32] = &[ - IA32_FMASK, - IA32_LSTAR, - IA32_CSTAR, - IA32_EFER, - IA32_TSC_AUX, - IA32_STAR, + msr::IA32_FMASK, + msr::IA32_LSTAR, + msr::IA32_CSTAR, + msr::IA32_EFER, + msr::IA32_TSC_AUX, + msr::IA32_STAR, // 这个寄存器会出错<,先注释掉 // MSR_IA32_TSX_CTRL, ]; @@ -1274,7 +1443,7 @@ impl Vmx { ) } - let cap = unsafe { rdmsr(IA32_VMX_EPT_VPID_CAP) }; + let cap = unsafe { rdmsr(msr::IA32_VMX_EPT_VPID_CAP) }; vmx_cap.set_val_from_msr_val(cap); // 不支持ept但是读取到了值 @@ -1337,7 +1506,7 @@ impl Vmx { return Err(SystemError::EIO); } - let basic = unsafe { rdmsr(IA32_VMX_BASIC) }; + let basic = unsafe { rdmsr(msr::IA32_VMX_BASIC) }; let vmx_msr_high = (basic >> 32) as u32; let vmx_msr_low = basic as u32; @@ -1351,7 +1520,7 @@ impl Vmx { return Err(SystemError::EIO); } - let misc_msr = unsafe { rdmsr(IA32_VMX_MISC) }; + let misc_msr = unsafe { rdmsr(msr::IA32_VMX_MISC) }; vmcs_config.size = vmx_msr_high & 0x1fff; vmcs_config.basic_cap = vmx_msr_high & !0x1fff; @@ -1417,15 +1586,15 @@ impl Vmx { // x2Apic msr寄存器 return true; } - MSR_IA32_RTIT_STATUS - | MSR_IA32_RTIT_OUTPUT_BASE - | MSR_IA32_RTIT_OUTPUT_MASK_PTRS - | MSR_IA32_CR3_MATCH - | MSR_LBR_SELECT - | MSR_LASTBRANCH_TOS => { + msr::MSR_IA32_RTIT_STATUS + | msr::MSR_IA32_RTIT_OUTPUT_BASE + | msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS + | msr::MSR_IA32_CR3_MATCH + | msr::MSR_LBR_SELECT + | msr::MSR_LASTBRANCH_TOS => { return true; } - MSR_IA32_ADDR0_START..MSR_IA32_ADDR3_END => { + msr::MSR_IA32_ADDR0_START..msr::MSR_IA32_ADDR3_END => { return true; } 0xdc0..0xddf => { @@ -1481,6 +1650,24 @@ impl Vmx { *L1TF_VMX_MITIGATION.write() = VmxL1dFlushState::FlushNotRequired; } + pub fn construct_eptp(&self, vcpu: &mut VirtCpu, root_hpa: u64, root_level: u32) -> u64 { + let mut eptp = VMX_EPTP_MT_WB; + + eptp |= if root_level == 5 { + VMX_EPTP_PWL_5 + } else { + VMX_EPTP_PWL_4 + }; + + if self.enable_ept_ad && !vcpu.arch.is_guest_mode() { + eptp |= VMX_EPTP_AD_ENABLE_BIT; + } + + eptp |= root_hpa; + + return eptp; + } + fn vmx_reset_vcpu(&mut self, vcpu: &mut VirtCpu, vm: &Vm) { self.init_vmcs(vcpu, vm); @@ -1508,6 +1695,10 @@ impl Vmx { } if vmx_info().has_msr_bitmap() { + kdebug!( 
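+            // Note: MSR_BITMAPS_ADDR_FULL expects the *physical* address of the
+            // 4-KiB MSR bitmap page, hence phys_addr() in the vmwrite below.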
+ "msr_bitmap addr 0x{:x}", + vcpu.vmx().vmcs01.lock().msr_bitmap.phys_addr() as u64 + ); VmxAsm::vmx_vmwrite( control::MSR_BITMAPS_ADDR_FULL, vcpu.vmx().vmcs01.lock().msr_bitmap.phys_addr() as u64, @@ -1680,6 +1871,349 @@ impl Vmx { self.setup_uret_msrs(vcpu); } + /// 打印VMCS信息用于debug + pub fn dump_vmcs(&self, vcpu: &VirtCpu) { + let vmentry_ctl = + EntryControls::from_bits_truncate(self.vmread(control::VMENTRY_CONTROLS) as u32); + + let vmexit_ctl = + ExitControls::from_bits_truncate(self.vmread(control::VMEXIT_CONTROLS) as u32); + + let cpu_based_exec_ctl = PrimaryControls::from_bits_truncate( + self.vmread(control::PRIMARY_PROCBASED_EXEC_CONTROLS) as u32, + ); + + let pin_based_exec_ctl = PinbasedControls::from_bits_truncate( + self.vmread(control::PINBASED_EXEC_CONTROLS) as u32, + ); + + // let cr4 = Cr4::from_bits_truncate(self.vmread(guest::CR4) as usize); + + let secondary_exec_control = if self.has_sceondary_exec_ctrls() { + SecondaryControls::from_bits_truncate( + self.vmread(control::SECONDARY_PROCBASED_EXEC_CONTROLS) as u32, + ) + } else { + SecondaryControls::empty() + }; + + if self.has_tertiary_exec_ctrls() { + todo!() + } + + kerror!( + "VMCS addr: 0x{:x}, last attempted VM-entry on CPU {:?}", + vcpu.vmx().loaded_vmcs().vmcs.lock().as_ref() as *const _ as usize, + vcpu.arch.last_vmentry_cpu + ); + + kerror!("--- GUEST STATE ---"); + kerror!( + "CR0: actual = 0x{:x}, shadow = 0x{:x}, gh_mask = 0x{:x}", + self.vmread(guest::CR0), + self.vmread(control::CR0_READ_SHADOW), + self.vmread(control::CR0_GUEST_HOST_MASK) + ); + kerror!( + "CR4: actual = 0x{:x}, shadow = 0x{:x}, gh_mask = 0x{:x}", + self.vmread(guest::CR4), + self.vmread(control::CR4_READ_SHADOW), + self.vmread(control::CR4_GUEST_HOST_MASK) + ); + kerror!("CR3: actual = 0x{:x}", self.vmread(guest::CR3)); + + if self.has_ept() { + kerror!( + "PDPTR0 = 0x{:x}, PDPTR1 = 0x{:x}", + self.vmread(guest::PDPTE0_FULL), + self.vmread(guest::PDPTE1_FULL) + ); + kerror!( + "PDPTR2 = 0x{:x}, PDPTR3 = 0x{:x}", + self.vmread(guest::PDPTE2_FULL), + self.vmread(guest::PDPTE3_FULL) + ); + } + kerror!( + "RSP = 0x{:x}, RIP = 0x{:x}", + self.vmread(guest::RSP), + self.vmread(guest::RIP) + ); + kerror!( + "RFLAGS = 0x{:x}, DR7 = 0x{:x}", + self.vmread(guest::RFLAGS), + self.vmread(guest::DR7) + ); + kerror!( + "Sysenter RSP = 0x{:x}, CS:RIP = 0x{:x}:0x{:x}", + self.vmread(guest::IA32_SYSENTER_ESP), + self.vmread(guest::IA32_SYSENTER_CS), + self.vmread(guest::IA32_SYSENTER_EIP), + ); + + self.dump_sel("CS: ", guest::CS_SELECTOR); + self.dump_sel("DS: ", guest::DS_SELECTOR); + self.dump_sel("SS: ", guest::SS_SELECTOR); + self.dump_sel("FS: ", guest::FS_SELECTOR); + self.dump_sel("GS: ", guest::GS_SELECTOR); + + self.dump_dtsel("GDTR: ", guest::GDTR_LIMIT); + self.dump_sel("LDTR: ", guest::LDTR_SELECTOR); + self.dump_dtsel("IDTR: ", guest::IDTR_LIMIT); + self.dump_sel("TR: ", guest::TR_SELECTOR); + + let efer_slot = vcpu + .vmx() + .msr_autoload + .guest + .find_loadstore_msr_slot(msr::IA32_EFER); + + if vmentry_ctl.contains(EntryControls::LOAD_IA32_EFER) { + kerror!("EFER = 0x{:x}", self.vmread(guest::IA32_EFER_FULL)); + } else if let Some(slot) = efer_slot { + kerror!( + "EFER = 0x{:x} (autoload)", + vcpu.vmx().msr_autoload.guest.val[slot].data + ); + } else if vmentry_ctl.contains(EntryControls::IA32E_MODE_GUEST) { + kerror!( + "EFER = 0x{:x} (effective)", + vcpu.arch.efer | (EferFlags::LONG_MODE_ACTIVE | EferFlags::LONG_MODE_ENABLE) + ); + } else { + kerror!( + "EFER = 0x{:x} (effective)", + vcpu.arch.efer & !(EferFlags::LONG_MODE_ACTIVE 
| EferFlags::LONG_MODE_ENABLE) + ); + } + + if vmentry_ctl.contains(EntryControls::LOAD_IA32_PAT) { + kerror!("PAT = 0x{:x}", self.vmread(guest::IA32_PAT_FULL)); + } + + kerror!( + "DebugCtl = 0x{:x}, DebugExceptions = 0x{:x}", + self.vmread(guest::IA32_DEBUGCTL_FULL), + self.vmread(guest::PENDING_DBG_EXCEPTIONS) + ); + + if self.has_load_perf_global_ctrl() + && vmentry_ctl.contains(EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL) + { + kerror!( + "PerfGlobCtl = 0x{:x}", + self.vmread(guest::IA32_PERF_GLOBAL_CTRL_FULL) + ); + } + + if vmentry_ctl.contains(EntryControls::LOAD_IA32_BNDCFGS) { + kerror!("BndCfgS = 0x{:x}", self.vmread(guest::IA32_BNDCFGS_FULL)); + } + + kerror!( + "Interruptibility = 0x{:x}, ActivityState = 0x{:x}", + self.vmread(guest::INTERRUPT_STATUS), + self.vmread(guest::ACTIVITY_STATE) + ); + + if secondary_exec_control.contains(SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY) { + kerror!( + "InterruptStatus = 0x{:x}", + self.vmread(guest::INTERRUPT_STATUS) + ); + } + + if self.vmread(control::VMENTRY_MSR_LOAD_COUNT) > 0 { + self.dump_msrs("guest autoload", &vcpu.vmx().msr_autoload.guest); + } + if self.vmread(control::VMEXIT_MSR_LOAD_COUNT) > 0 { + self.dump_msrs("guest autostore", &vcpu.vmx().msr_autostore); + } + + kerror!("\n--- HOST STATE ---"); + kerror!( + "RIP = 0x{:x}, RSP = 0x{:x}", + self.vmread(host::RIP), + self.vmread(host::RSP) + ); + kerror!( + "CS = 0x{:x}, SS = 0x{:x}, DS = 0x{:x}, ES = 0x{:x}, FS = 0x{:x}, GS = 0x{:x}, TR = 0x{:x}", + self.vmread(host::CS_SELECTOR), + self.vmread(host::SS_SELECTOR), + self.vmread(host::DS_SELECTOR), + self.vmread(host::ES_SELECTOR), + self.vmread(host::FS_SELECTOR), + self.vmread(host::GS_SELECTOR), + self.vmread(host::TR_SELECTOR) + ); + kerror!( + "FSBase = 0x{:x}, GSBase = 0x{:x}, TRBase = 0x{:x}", + self.vmread(host::FS_BASE), + self.vmread(host::GS_BASE), + self.vmread(host::TR_BASE), + ); + kerror!( + "GDTBase = 0x{:x}, IDTBase = 0x{:x}", + self.vmread(host::GDTR_BASE), + self.vmread(host::IDTR_BASE), + ); + kerror!( + "CR0 = 0x{:x}, CR3 = 0x{:x}, CR4 = 0x{:x}", + self.vmread(host::CR0), + self.vmread(host::CR3), + self.vmread(host::CR4), + ); + kerror!( + "Sysenter RSP = 0x{:x}, CS:RIP=0x{:x}:0x{:x}", + self.vmread(host::IA32_SYSENTER_ESP), + self.vmread(host::IA32_SYSENTER_CS), + self.vmread(host::IA32_SYSENTER_EIP), + ); + + if vmexit_ctl.contains(ExitControls::LOAD_IA32_EFER) { + kerror!("EFER = 0x{:x}", self.vmread(host::IA32_EFER_FULL)); + } + + if vmexit_ctl.contains(ExitControls::LOAD_IA32_PAT) { + kerror!("PAT = 0x{:x}", self.vmread(host::IA32_PAT_FULL)); + } + + if self.has_load_perf_global_ctrl() + && vmexit_ctl.contains(ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL) + { + kerror!( + "PerfGlobCtl = 0x{:x}", + self.vmread(host::IA32_PERF_GLOBAL_CTRL_FULL) + ); + } + + if self.vmread(control::VMEXIT_MSR_LOAD_COUNT) > 0 { + self.dump_msrs("host autoload", &vcpu.vmx().msr_autoload.host); + } + + kerror!("\n--- CONTROL STATE ---"); + kerror!( + "\nCPUBased = {:?},\nSecondaryExec = 0x{:x},\nTertiaryExec = 0(Unused)", + cpu_based_exec_ctl, + secondary_exec_control, + ); + kerror!( + "\nPinBased = {:?},\nEntryControls = {:?},\nExitControls = {:?}", + pin_based_exec_ctl, + vmentry_ctl, + vmexit_ctl, + ); + kerror!( + "ExceptionBitmap = 0x{:x}, PFECmask = 0x{:x}, PFECmatch = 0x{:x}", + self.vmread(control::EXCEPTION_BITMAP), + self.vmread(control::PAGE_FAULT_ERR_CODE_MASK), + self.vmread(control::PAGE_FAULT_ERR_CODE_MATCH), + ); + kerror!( + "VMEntry: intr_info = 0x{:x}, errcode = 0x{:x}, ilen = 0x{:x}", + 
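+            // VM-entry event injection state: interruption info, exception error
+            // code and instruction length queued for the next entry.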
self.vmread(control::VMENTRY_INTERRUPTION_INFO_FIELD), + self.vmread(control::VMENTRY_EXCEPTION_ERR_CODE), + self.vmread(control::VMENTRY_INSTRUCTION_LEN), + ); + kerror!( + "VMExit: intr_info = 0x{:x}, errcode = 0x{:x}, ilen = 0x{:x}", + self.vmread(ro::VMEXIT_INSTRUCTION_INFO), + self.vmread(ro::VMEXIT_INTERRUPTION_ERR_CODE), + self.vmread(ro::VMEXIT_INSTRUCTION_LEN), + ); + kerror!( + " reason = 0x{:x}, qualification = 0x{:x}", + self.vmread(ro::EXIT_REASON), + self.vmread(ro::EXIT_QUALIFICATION), + ); + kerror!( + "IDTVectoring: info = 0x{:x}, errcode = 0x{:x}", + self.vmread(ro::IDT_VECTORING_INFO), + self.vmread(ro::IDT_VECTORING_ERR_CODE), + ); + kerror!("TSC Offset = 0x{:x}", self.vmread(control::TSC_OFFSET_FULL)); + + if secondary_exec_control.contains(SecondaryControls::USE_TSC_SCALING) { + kerror!( + "TSC Multiplier = 0x{:x}", + self.vmread(control::TSC_MULTIPLIER_FULL) + ); + } + + if cpu_based_exec_ctl.contains(PrimaryControls::USE_TPR_SHADOW) { + if secondary_exec_control.contains(SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY) { + let status = self.vmread(guest::INTERRUPT_STATUS); + kerror!("SVI|RVI = 0x{:x}|0x{:x}", status >> 8, status & 0xff); + } + + kerror!( + "TPR Threshold = 0x{:x}", + self.vmread(control::TPR_THRESHOLD) + ); + if secondary_exec_control.contains(SecondaryControls::VIRTUALIZE_APIC) { + kerror!( + "APIC-access addr = 0x{:x}", + self.vmread(control::APIC_ACCESS_ADDR_FULL) + ); + } + kerror!( + "virt-APIC addr = 0x{:x}", + self.vmread(control::VIRT_APIC_ADDR_FULL) + ); + } + + if pin_based_exec_ctl.contains(PinbasedControls::POSTED_INTERRUPTS) { + kerror!( + "PostedIntrVec = 0x{:x}", + self.vmread(control::POSTED_INTERRUPT_NOTIFICATION_VECTOR) + ); + } + + if secondary_exec_control.contains(SecondaryControls::ENABLE_EPT) { + kerror!("EPT pointer = 0x{:x}", self.vmread(control::EPTP_FULL)); + } + if secondary_exec_control.contains(SecondaryControls::PAUSE_LOOP_EXITING) { + kerror!( + "PLE Gap = 0x{:x}, Window = 0x{:x}", + self.vmread(control::PLE_GAP), + self.vmread(control::PLE_WINDOW) + ); + } + if secondary_exec_control.contains(SecondaryControls::ENABLE_VPID) { + kerror!("Virtual processor ID = 0x{:x}", self.vmread(control::VPID)); + } + } + + pub fn dump_sel(&self, name: &'static str, sel: u32) { + kerror!( + "{name} sel = 0x{:x}, attr = 0x{:x}, limit = 0x{:x}, base = 0x{:x}", + self.vmread(sel), + self.vmread(sel + guest::ES_ACCESS_RIGHTS - guest::ES_SELECTOR), + self.vmread(sel + guest::ES_LIMIT - guest::ES_SELECTOR), + self.vmread(sel + guest::ES_BASE - guest::ES_SELECTOR), + ); + } + + pub fn dump_dtsel(&self, name: &'static str, limit: u32) { + kerror!( + "{name} limit = 0x{:x}, base = 0x{:x}", + self.vmread(limit), + self.vmread(limit + guest::GDTR_BASE - guest::GDTR_LIMIT) + ); + } + + pub fn dump_msrs(&self, name: &'static str, msr: &VmxMsrs) { + kerror!("MSR {name}:"); + for (idx, msr) in msr.val.iter().enumerate() { + kerror!("{idx}: msr = 0x{:x}, value = 0x{:x}", msr.index, msr.data); + } + } + + #[inline] + pub fn vmread(&self, field: u32) -> u64 { + VmxAsm::vmx_vmread(field) + } + fn setup_uret_msrs(&self, vcpu: &mut VirtCpu) { // 是否加载syscall相关msr let load_syscall_msrs = @@ -1709,7 +2243,7 @@ impl Vmx { fn setup_uret_msr(&self, vcpu: &mut VirtCpu, msr: u32, load_into_hardware: bool) { let uret_msr = vcpu.vmx_mut().find_uret_msr_mut(msr); - if let Some(msr) = uret_msr { + if let Some((_idx, msr)) = uret_msr { msr.load_into_hardware = load_into_hardware; } } @@ -1862,7 +2396,7 @@ impl Vmx { fn get_pin_based_exec_controls(&self, vcpu: 
&VirtCpu) -> PinbasedControls { let mut ctrls = self.vmcs_config.pin_based_exec_ctrl; - if vcpu.arch.vcpu_apicv_active() { + if !vcpu.arch.vcpu_apicv_active() { ctrls.remove(PinbasedControls::POSTED_INTERRUPTS); } @@ -2365,13 +2899,161 @@ impl Vmx { let fail = unsafe { __vmx_vcpu_run(vcpu.vmx(), vcpu.arch.regs.as_ptr(), flags.bits as u32) }; + vcpu.vmx_mut().fail = fail as u8; + vcpu.arch.cr2 = unsafe { cr2() } as u64; + vcpu.arch.regs_avail.set_all(true); + + // 这些寄存器需要更新缓存 + for reg_idx in Vmx::VMX_REGS_LAZY_LOAD_SET { + vcpu.arch.regs_avail.set(*reg_idx, false); + } + + vcpu.vmx_mut().idt_vectoring_info = IntrInfo::empty(); + + // TODO: enable_fb_clear + + if unlikely(vcpu.vmx().fail != 0) { + vcpu.vmx_mut().exit_reason = VmxExitReason::from(0xdead); + return; + } + + vcpu.vmx_mut().exit_reason = + VmxExitReason::from(VmxAsm::vmx_vmread(ro::EXIT_REASON) as u32); + + if likely(!vcpu.vmx().exit_reason.failed_vmentry()) { + vcpu.vmx_mut().idt_vectoring_info = + IntrInfo::from_bits_truncate(VmxAsm::vmx_vmread(ro::IDT_VECTORING_INFO) as u32); + } + + if VmxExitReasonBasic::from(vcpu.vmx().exit_reason.basic()) + == VmxExitReasonBasic::EXCEPTION_OR_NMI + && VmcsIntrHelper::is_nmi(Vmx::vmx_get_intr_info(vcpu)) + { + todo!() + } + } + + fn vmx_get_intr_info(vcpu: &mut VirtCpu) -> IntrInfo { + if !vcpu + .arch + .test_and_mark_available(KvmReg::VcpuExregExitInfo2) + { + vcpu.vmx_mut().exit_intr_info = IntrInfo::from_bits_truncate(VmxAsm::vmx_vmread( + ro::VMEXIT_INTERRUPTION_INFO, + ) as u32); + } + + return vcpu.vmx_mut().exit_intr_info; + } + + pub fn vmx_exit_handlers_fastpath(vcpu: &mut VirtCpu) -> ExitFastpathCompletion { + match VmxExitReasonBasic::from(vcpu.vmx().exit_reason.basic()) { + VmxExitReasonBasic::WRMSR => { + todo!() + } + VmxExitReasonBasic::VMX_PREEMPTION_TIMER_EXPIRED => { + todo!() + } + _ => ExitFastpathCompletion::None, + } + } + + pub fn vmx_handle_exit( + &self, + vcpu: &mut VirtCpu, + exit_fastpath: ExitFastpathCompletion, + ) -> Result<(), SystemError> { + let exit_reason = vcpu.vmx().exit_reason; + + let unexpected_vmexit = |vcpu: &mut VirtCpu| -> Result<(), SystemError> { + kerror!("vmx: unexpected exit reason {:?}\n", exit_reason); + + self.dump_vmcs(vcpu); + + let cpu = vcpu.arch.last_vmentry_cpu.into() as u64; + let run = vcpu.kvm_run_mut(); + run.exit_reason = kvm_exit::KVM_EXIT_INTERNAL_ERROR; + + unsafe { + run.__bindgen_anon_1.internal.ndata = 2; + run.__bindgen_anon_1.internal.data[0] = Into::::into(exit_reason) as u64; + run.__bindgen_anon_1.internal.data[1] = cpu; + } + + return Ok(()); + }; + + let vectoring_info = vcpu.vmx().idt_vectoring_info; + + if self.enable_pml && !vcpu.arch.is_guest_mode() { + todo!() + } + + if vcpu.arch.is_guest_mode() { + if exit_reason.basic() == VmxExitReasonBasic::PML_FULL as u16 { + return unexpected_vmexit(vcpu); + } + + todo!() + } + + if vcpu.vmx().emulation_required { + todo!() + } + + if exit_reason.failed_vmentry() { + self.dump_vmcs(vcpu); + todo!() + } + + if unlikely(vcpu.vmx().fail != 0) { + self.dump_vmcs(vcpu); + todo!() + } + + let basic = VmxExitReasonBasic::from(exit_reason.basic()); + if vectoring_info.contains(IntrInfo::INTR_INFO_VALID_MASK) + && basic != VmxExitReasonBasic::EXCEPTION_OR_NMI + && basic != VmxExitReasonBasic::EPT_VIOLATION + && basic != VmxExitReasonBasic::PML_FULL + && basic != VmxExitReasonBasic::APIC_ACCESS + && basic != VmxExitReasonBasic::TASK_SWITCH + && basic != VmxExitReasonBasic::NOTIFY + { + todo!() + } + + if unlikely(!self.enable_pml && vcpu.vmx().loaded_vmcs().soft_vnmi_blocked) 
{ + todo!() + } + + if exit_fastpath != ExitFastpathCompletion::None { + return Err(SystemError::EINVAL); + } + todo!() } + + /// 需要在缓存中更新的寄存器集。此处未列出的其他寄存器在 VM 退出后立即同步到缓存。 + pub const VMX_REGS_LAZY_LOAD_SET: &'static [usize] = &[ + KvmReg::VcpuRegsRip as usize, + KvmReg::VcpuRegsRsp as usize, + KvmReg::VcpuExregRflags as usize, + KvmReg::NrVcpuRegs as usize, + KvmReg::VcpuExregSegments as usize, + KvmReg::VcpuExregCr0 as usize, + KvmReg::VcpuExregCr3 as usize, + KvmReg::VcpuExregCr4 as usize, + KvmReg::VcpuExregExitInfo1 as usize, + KvmReg::VcpuExregExitInfo2 as usize, + ]; } extern "C" { + /// #[allow(improper_ctypes)]因为只需要在内部调用而无需与C交互 + #[allow(improper_ctypes)] fn __vmx_vcpu_run(vmx: &VmxVCpuPriv, regs: *const u64, flags: u32) -> i32; } @@ -2473,11 +3155,18 @@ pub struct VmxSegmentCache { } #[derive(Debug)] +#[allow(dead_code)] pub struct VmxVCpuPriv { vpid: u16, fail: u8, + exit_reason: VmxExitReason, + + exit_intr_info: IntrInfo, + + idt_vectoring_info: IntrInfo, + vmcs01: Arc, loaded_vmcs: Arc, guest_uret_msrs: [VmxUretMsr; KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS], @@ -2491,6 +3180,9 @@ pub struct VmxVCpuPriv { msr_ia32_feature_control: u64, msr_ia32_feature_control_valid_bits: u64, + msr_host_kernel_gs_base: u64, + msr_guest_kernel_gs_base: u64, + emulation_required: bool, rflags: RFlags, @@ -2499,6 +3191,7 @@ pub struct VmxVCpuPriv { ple_window_dirty: bool, msr_autoload: VmxMsrAutoLoad, + msr_autostore: VmxMsrs, pml_pg: Box<[u8; MMArch::PAGE_SIZE]>, @@ -2509,9 +3202,13 @@ pub struct VmxVCpuPriv { hv_deadline_tsc: u64, segment_cache: VmxSegmentCache, + + req_immediate_exit: bool, + guest_state_loaded: bool, } #[derive(Debug, Default)] +#[allow(dead_code)] pub struct KvmVmx { tss_addr: usize, ept_identity_pagetable_done: bool, @@ -2560,6 +3257,14 @@ impl VmxVCpuPriv { segment_cache: VmxSegmentCache::default(), emulation_required: false, rflags: RFlags::empty(), + req_immediate_exit: false, + guest_state_loaded: false, + msr_host_kernel_gs_base: 0, + msr_guest_kernel_gs_base: 0, + idt_vectoring_info: IntrInfo::empty(), + exit_reason: VmxExitReason::new(), + exit_intr_info: IntrInfo::empty(), + msr_autostore: VmxMsrs::default(), }; vmx.vpid = vmx_info().alloc_vpid().unwrap_or_default() as u16; @@ -2569,8 +3274,8 @@ impl VmxVCpuPriv { } if CpuId::new().get_extended_feature_info().unwrap().has_rtm() { - let tsx_ctrl = vmx.find_uret_msr_mut(MSR_IA32_TSX_CTRL); - if let Some(tsx_ctrl) = tsx_ctrl { + let tsx_ctrl = vmx.find_uret_msr_mut(msr::MSR_IA32_TSX_CTRL); + if let Some((_idx, tsx_ctrl)) = tsx_ctrl { // Disable TSX enumeration tsx_ctrl.mask = !(1 << 1); } @@ -2581,20 +3286,20 @@ impl VmxVCpuPriv { let arch = &vm.arch; - vmx.disable_intercept_for_msr(arch, IA32_TIME_STAMP_COUNTER, MsrType::READ); - vmx.disable_intercept_for_msr(arch, IA32_FS_BASE, MsrType::RW); - vmx.disable_intercept_for_msr(arch, IA32_GS_BASE, MsrType::RW); - vmx.disable_intercept_for_msr(arch, IA32_KERNEL_GSBASE, MsrType::RW); + vmx.disable_intercept_for_msr(arch, msr::IA32_TIME_STAMP_COUNTER, MsrType::READ); + vmx.disable_intercept_for_msr(arch, msr::IA32_FS_BASE, MsrType::RW); + vmx.disable_intercept_for_msr(arch, msr::IA32_GS_BASE, MsrType::RW); + vmx.disable_intercept_for_msr(arch, msr::IA32_KERNEL_GSBASE, MsrType::RW); - vmx.disable_intercept_for_msr(arch, IA32_SYSENTER_CS, MsrType::RW); - vmx.disable_intercept_for_msr(arch, IA32_SYSENTER_ESP, MsrType::RW); - vmx.disable_intercept_for_msr(arch, IA32_SYSENTER_EIP, MsrType::RW); + vmx.disable_intercept_for_msr(arch, msr::IA32_SYSENTER_CS, MsrType::RW); + 
vmx.disable_intercept_for_msr(arch, msr::IA32_SYSENTER_ESP, MsrType::RW); + vmx.disable_intercept_for_msr(arch, msr::IA32_SYSENTER_EIP, MsrType::RW); if arch.pause_in_guest { - vmx.disable_intercept_for_msr(arch, MSR_CORE_C1_RESIDENCY, MsrType::READ); - vmx.disable_intercept_for_msr(arch, MSR_CORE_C3_RESIDENCY, MsrType::READ); - vmx.disable_intercept_for_msr(arch, MSR_CORE_C6_RESIDENCY, MsrType::READ); - vmx.disable_intercept_for_msr(arch, MSR_CORE_C7_RESIDENCY, MsrType::READ); + vmx.disable_intercept_for_msr(arch, msr::MSR_CORE_C1_RESIDENCY, MsrType::READ); + vmx.disable_intercept_for_msr(arch, msr::MSR_CORE_C3_RESIDENCY, MsrType::READ); + vmx.disable_intercept_for_msr(arch, msr::MSR_CORE_C6_RESIDENCY, MsrType::READ); + vmx.disable_intercept_for_msr(arch, msr::MSR_CORE_C7_RESIDENCY, MsrType::READ); } if vmx_info().enable_flexpriority && vcpu.arch.lapic_in_kernel() { @@ -2613,35 +3318,38 @@ impl VmxVCpuPriv { vcpu.private = Some(vmx); } - pub fn find_uret_msr(&self, msr: u32) -> Option<&VmxUretMsr> { + pub fn find_uret_msr(&self, msr: u32) -> Option<(usize, &VmxUretMsr)> { let idx = x86_kvm_manager().find_user_return_msr_idx(msr); if let Some(index) = idx { - return Some(&self.guest_uret_msrs[index]); + return Some((index, &self.guest_uret_msrs[index])); } else { return None; } } fn set_uret_msr(&mut self, msr: u32, data: u64) { - if let Some(msr) = self.find_uret_msr_mut(msr) { + if let Some((_idx, msr)) = self.find_uret_msr_mut(msr) { msr.data = data; } } - pub fn find_uret_msr_mut(&mut self, msr: u32) -> Option<&mut VmxUretMsr> { + pub fn find_uret_msr_mut(&mut self, msr: u32) -> Option<(usize, &mut VmxUretMsr)> { let idx = x86_kvm_manager().find_user_return_msr_idx(msr); if let Some(index) = idx { - return Some(&mut self.guest_uret_msrs[index]); + return Some((index, &mut self.guest_uret_msrs[index])); } else { return None; } } - fn set_guest_uret_msr(&mut self, msr: &VmxUretMsr, data: u64) -> Result<(), SystemError> { + fn set_guest_uret_msr(&mut self, slot: usize, data: u64) -> Result<(), SystemError> { + let msr = &mut self.guest_uret_msrs[slot]; if msr.load_into_hardware { - todo!() + x86_kvm_manager().kvm_set_user_return_msr(slot, data, msr.mask); } + msr.data = data; + Ok(()) } @@ -2784,7 +3492,7 @@ impl VmxVCpuPriv { } let m = &mut self.msr_autoload; - let mut i = m.guest.find_loadstore_msr_slot(msr); + let i = m.guest.find_loadstore_msr_slot(msr); let j = if !entry_only { m.host.find_loadstore_msr_slot(msr) } else { @@ -2794,7 +3502,7 @@ impl VmxVCpuPriv { if (i.is_none() && m.guest.nr == VmxMsrs::MAX_NR_LOADSTORE_MSRS) || (j.is_none() && m.host.nr == VmxMsrs::MAX_NR_LOADSTORE_MSRS) { - kwarn!("Not enough msr switch entries. Can't add msr {:x}", msr); + kwarn!("Not enough msr switch entries. Can't add msr 0x{:x}", msr); return; } @@ -2821,8 +3529,8 @@ impl VmxVCpuPriv { j.unwrap() }; - m.host.val[i].index = msr; - m.host.val[i].data = host_val; + m.host.val[j].index = msr; + m.host.val[j].data = host_val; } fn add_atomic_switch_msr_special( @@ -2887,6 +3595,7 @@ bitflags! 
{ } #[derive(Debug, PartialEq)] +#[allow(dead_code)] pub enum VmxL1dFlushState { FlushAuto, FlushNever, @@ -2962,143 +3671,6 @@ pub const KVM_VMX_SEGMENT_FIELDS: &'static [VmxSegmentField] = &[ }, ]; -#[derive(FromPrimitive)] -#[allow(non_camel_case_types)] -pub enum VmxExitReason { - EXCEPTION_OR_NMI = 0, - EXTERNAL_INTERRUPT = 1, - TRIPLE_FAULT = 2, - INIT_SIGNAL = 3, - SIPI = 4, - IO_SMI = 5, - OTHER_SMI = 6, - INTERRUPT_WINDOW = 7, - NMI_WINDOW = 8, - TASK_SWITCH = 9, - CPUID = 10, - GETSEC = 11, - HLT = 12, - INVD = 13, - INVLPG = 14, - RDPMC = 15, - RDTSC = 16, - RSM = 17, - VMCALL = 18, - VMCLEAR = 19, - VMLAUNCH = 20, - VMPTRLD = 21, - VMPTRST = 22, - VMREAD = 23, - VMRESUME = 24, - VMWRITE = 25, - VMXOFF = 26, - VMXON = 27, - CR_ACCESS = 28, - DR_ACCESS = 29, - IO_INSTRUCTION = 30, - RDMSR = 31, - WRMSR = 32, - VM_ENTRY_FAILURE_INVALID_GUEST_STATE = 33, - VM_ENTRY_FAILURE_MSR_LOADING = 34, - MWAIT = 36, - MONITOR_TRAP_FLAG = 37, - MONITOR = 39, - PAUSE = 40, - VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT = 41, - TPR_BELOW_THRESHOLD = 43, - APIC_ACCESS = 44, - VIRTUALIZED_EOI = 45, - ACCESS_GDTR_OR_IDTR = 46, - ACCESS_LDTR_OR_TR = 47, - EPT_VIOLATION = 48, - EPT_MISCONFIG = 49, - INVEPT = 50, - RDTSCP = 51, - VMX_PREEMPTION_TIMER_EXPIRED = 52, - INVVPID = 53, - WBINVD = 54, - XSETBV = 55, - APIC_WRITE = 56, - RDRAND = 57, - INVPCID = 58, - VMFUNC = 59, - ENCLS = 60, - RDSEED = 61, - PML_FULL = 62, - XSAVES = 63, - XRSTORS = 64, -} - -impl From for VmxExitReason { - fn from(num: i32) -> Self { - match num { - 0 => VmxExitReason::EXCEPTION_OR_NMI, - 1 => VmxExitReason::EXTERNAL_INTERRUPT, - 2 => VmxExitReason::TRIPLE_FAULT, - 3 => VmxExitReason::INIT_SIGNAL, - 4 => VmxExitReason::SIPI, - 5 => VmxExitReason::IO_SMI, - 6 => VmxExitReason::OTHER_SMI, - 7 => VmxExitReason::INTERRUPT_WINDOW, - 8 => VmxExitReason::NMI_WINDOW, - 9 => VmxExitReason::TASK_SWITCH, - 10 => VmxExitReason::CPUID, - 11 => VmxExitReason::GETSEC, - 12 => VmxExitReason::HLT, - 13 => VmxExitReason::INVD, - 14 => VmxExitReason::INVLPG, - 15 => VmxExitReason::RDPMC, - 16 => VmxExitReason::RDTSC, - 17 => VmxExitReason::RSM, - 18 => VmxExitReason::VMCALL, - 19 => VmxExitReason::VMCLEAR, - 20 => VmxExitReason::VMLAUNCH, - 21 => VmxExitReason::VMPTRLD, - 22 => VmxExitReason::VMPTRST, - 23 => VmxExitReason::VMREAD, - 24 => VmxExitReason::VMRESUME, - 25 => VmxExitReason::VMWRITE, - 26 => VmxExitReason::VMXOFF, - 27 => VmxExitReason::VMXON, - 28 => VmxExitReason::CR_ACCESS, - 29 => VmxExitReason::DR_ACCESS, - 30 => VmxExitReason::IO_INSTRUCTION, - 31 => VmxExitReason::RDMSR, - 32 => VmxExitReason::WRMSR, - 33 => VmxExitReason::VM_ENTRY_FAILURE_INVALID_GUEST_STATE, - 34 => VmxExitReason::VM_ENTRY_FAILURE_MSR_LOADING, - 36 => VmxExitReason::MWAIT, - 37 => VmxExitReason::MONITOR_TRAP_FLAG, - 39 => VmxExitReason::MONITOR, - 40 => VmxExitReason::PAUSE, - 41 => VmxExitReason::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT, - 43 => VmxExitReason::TPR_BELOW_THRESHOLD, - 44 => VmxExitReason::APIC_ACCESS, - 45 => VmxExitReason::VIRTUALIZED_EOI, - 46 => VmxExitReason::ACCESS_GDTR_OR_IDTR, - 47 => VmxExitReason::ACCESS_LDTR_OR_TR, - 48 => VmxExitReason::EPT_VIOLATION, - 49 => VmxExitReason::EPT_MISCONFIG, - 50 => VmxExitReason::INVEPT, - 51 => VmxExitReason::RDTSCP, - 52 => VmxExitReason::VMX_PREEMPTION_TIMER_EXPIRED, - 53 => VmxExitReason::INVVPID, - 54 => VmxExitReason::WBINVD, - 55 => VmxExitReason::XSETBV, - 56 => VmxExitReason::APIC_WRITE, - 57 => VmxExitReason::RDRAND, - 58 => VmxExitReason::INVPCID, - 59 => VmxExitReason::VMFUNC, - 60 => 
VmxExitReason::ENCLS, - 61 => VmxExitReason::RDSEED, - 62 => VmxExitReason::PML_FULL, - 63 => VmxExitReason::XSAVES, - 64 => VmxExitReason::XRSTORS, - _ => panic!("Invalid VmxExitReason number: {}", num), - } - } -} - pub static L1TF_VMX_MITIGATION: RwLock = RwLock::new(VmxL1dFlushState::FlushAuto); pub fn vmx_init() -> Result<(), SystemError> { @@ -3120,17 +3692,16 @@ pub fn vmx_init() -> Result<(), SystemError> { #[no_mangle] unsafe extern "C" fn vmx_update_host_rsp(vcpu_vmx: &VmxVCpuPriv, host_rsp: usize) { + kwarn!("vmx_update_host_rsp"); let mut guard = vcpu_vmx.loaded_vmcs.lock(); if unlikely(host_rsp != guard.host_state.rsp) { guard.host_state.rsp = host_rsp; VmxAsm::vmx_vmwrite(host::RSP, host_rsp as u64); } - - return; } #[no_mangle] -unsafe extern "C" fn vmx_spec_ctrl_restore_host(vcpu_vmx: &VmxVCpuPriv, flags: u32) { +unsafe extern "C" fn vmx_spec_ctrl_restore_host(_vcpu_vmx: &VmxVCpuPriv, _flags: u32) { // TODO - return; + kwarn!("vmx_spec_ctrl_restore_host todo!"); } diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs index cd16f6c6a..3023deaf3 100644 --- a/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs +++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs @@ -12,7 +12,7 @@ use x86::{ use crate::arch::vm::vmx::Vmx; pub struct VmxFeat; - +#[allow(dead_code)] impl VmxFeat { pub const KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL: u32 = PrimaryControls::HLT_EXITING.bits() | PrimaryControls::CR3_LOAD_EXITING.bits() @@ -111,48 +111,52 @@ impl VmxFeat { pub const VMX_BASIC_INOUT: u64 = 0x0040000000000000; pub fn adjust_primary_controls() -> Result { - Ok(PrimaryControls::from_bits_truncate( - Vmx::adjust_vmx_controls( + Ok(unsafe { + PrimaryControls::from_bits_unchecked(Vmx::adjust_vmx_controls( Self::KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, Self::KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, IA32_VMX_PROCBASED_CTLS, - )?, - )) + )?) + }) } pub fn adjust_secondary_controls() -> Result { - Ok(SecondaryControls::from_bits_truncate( - Vmx::adjust_vmx_controls( + Ok(unsafe { + SecondaryControls::from_bits_unchecked(Vmx::adjust_vmx_controls( Self::KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, Self::KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, IA32_VMX_PROCBASED_CTLS2, - )?, - )) + )?) + }) } pub fn adjust_exit_controls() -> Result { - Ok(ExitControls::from_bits_truncate(Vmx::adjust_vmx_controls( - Self::KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, - Self::KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, - IA32_VMX_EXIT_CTLS, - )?)) + Ok(unsafe { + ExitControls::from_bits_unchecked(Vmx::adjust_vmx_controls( + Self::KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, + Self::KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, + IA32_VMX_EXIT_CTLS, + )?) + }) } pub fn adjust_entry_controls() -> Result { - Ok(EntryControls::from_bits_truncate(Vmx::adjust_vmx_controls( - Self::KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, - Self::KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, - IA32_VMX_ENTRY_CTLS, - )?)) + Ok(unsafe { + EntryControls::from_bits_unchecked(Vmx::adjust_vmx_controls( + Self::KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, + Self::KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, + IA32_VMX_ENTRY_CTLS, + )?) + }) } pub fn adjust_pin_based_controls() -> Result { - Ok(PinbasedControls::from_bits_truncate( - Vmx::adjust_vmx_controls( + Ok(unsafe { + PinbasedControls::from_bits_unchecked(Vmx::adjust_vmx_controls( Self::KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, Self::KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, IA32_VMX_PINBASED_CTLS, - )?, - )) + )?) 
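+            // from_bits_unchecked keeps capability bits that the hardware reports
+            // but that the PinbasedControls type does not model; from_bits_truncate
+            // would silently drop them.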
+ }) } } diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs index fac1a0504..f4af66611 100644 --- a/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs @@ -1,18 +1,21 @@ -use alloc::{boxed::Box, collections::LinkedList, sync::Arc, vec::Vec}; +use core::intrinsics::unlikely; + +use alloc::{boxed::Box, collections::LinkedList, sync::Arc}; use bitmap::{traits::BitMapOps, AllocBitmap}; -use system_error::SystemError; use x86::{ controlregs::Cr4, vmx::vmcs::{ control::{self, PrimaryControls}, - guest, + host, }, }; -use x86_64::registers::control::{Cr3, Cr3Flags}; +use x86_64::registers::control::Cr3Flags; use crate::{ - arch::{vm::asm::VmxAsm, MMArch}, - kdebug, + arch::{ + vm::asm::{IntrInfo, IntrType, VmxAsm}, + MMArch, + }, libs::spinlock::{SpinLock, SpinLockGuard}, mm::{percpu::PerCpuVar, virt_2_phys, MemoryManagementArch, PhysAddr}, smp::cpu::ProcessorId, @@ -38,6 +41,7 @@ pub fn current_loaded_vmcs_list_mut() -> &'static mut LinkedList &'static LinkedList> { unsafe { PERCPU_LOADED_VMCS_LIST.as_ref().unwrap().get() } } @@ -70,6 +74,7 @@ impl VMControlStructure { self.header & 0x7FFF_FFFF } + #[allow(dead_code)] pub fn is_shadow_vmcs(&self) -> bool { self.header & 0x8000_0000 == 1 } @@ -125,7 +130,41 @@ pub struct VmcsHostState { pub gs_sel: u16, pub ldt_sel: u16, pub ds_sel: u16, - pub rs_sel: u16, + pub es_sel: u16, +} + +impl VmcsHostState { + pub fn set_host_fsgs(&mut self, fs_sel: u16, gs_sel: u16, fs_base: usize, gs_base: usize) { + if unlikely(self.fs_sel != fs_sel) { + if (fs_sel & 7) == 0 { + VmxAsm::vmx_vmwrite(host::FS_SELECTOR, fs_sel as u64); + } else { + VmxAsm::vmx_vmwrite(host::FS_SELECTOR, 0); + } + + self.fs_sel = fs_sel; + } + + if unlikely(self.gs_sel != gs_sel) { + if (gs_sel & 7) == 0 { + VmxAsm::vmx_vmwrite(host::GS_SELECTOR, gs_sel as u64); + } else { + VmxAsm::vmx_vmwrite(host::GS_SELECTOR, 0); + } + + self.gs_sel = gs_sel; + } + + if unlikely(fs_base != self.fs_base) { + VmxAsm::vmx_vmwrite(host::FS_BASE, fs_base as u64); + self.fs_base = fs_base; + } + + if unlikely(self.gs_base != gs_base) { + VmxAsm::vmx_vmwrite(host::GS_BASE, gs_base as u64); + self.gs_base = gs_base; + } + } } impl Default for VmcsHostState { @@ -140,7 +179,7 @@ impl Default for VmcsHostState { gs_sel: 0, ldt_sel: 0, ds_sel: 0, - rs_sel: 0, + es_sel: 0, } } } @@ -156,6 +195,7 @@ pub struct VmcsControlsShadow { } #[derive(Debug)] +#[allow(dead_code)] pub struct LoadedVmcs { pub vmcs: Arc, pub shadow_vmcs: Option>, @@ -262,6 +302,7 @@ pub struct LockedLoadedVmcs { } #[derive(Debug, Clone, Copy)] +#[allow(dead_code)] pub enum ControlsType { VmEntry, VmExit, @@ -384,3 +425,18 @@ impl VmxMsrBitmap { } } } + +/// 中断相关辅助函数载体 +pub struct VmcsIntrHelper; + +impl VmcsIntrHelper { + pub fn is_nmi(intr_info: IntrInfo) -> bool { + return Self::is_intr_type(intr_info, IntrType::INTR_TYPE_NMI_INTR); + } + + pub fn is_intr_type(intr_info: IntrInfo, intr_type: IntrType) -> bool { + return (intr_info & (IntrInfo::INTR_INFO_VALID_MASK | IntrInfo::INTR_INFO_INTR_TYPE_MASK)) + .bits() + == IntrInfo::INTR_INFO_VALID_MASK.bits() | intr_type.bits() as u32; + } +} diff --git a/kernel/src/arch/x86_64/vm/vmx/vmenter.S b/kernel/src/arch/x86_64/vm/vmx/vmenter.S index b1119e76b..10f3fca2a 100644 --- a/kernel/src/arch/x86_64/vm/vmx/vmenter.S +++ b/kernel/src/arch/x86_64/vm/vmx/vmenter.S @@ -100,13 +100,12 @@ ENTRY(__vmx_vcpu_run) jmp .Lvmfail .Lvmlaunch: - vmlaunch + call vmx_vmlaunch jmp .Lvmfail // 从guest模式退出 ENTRY(vmx_vmexit) // TODO: 
unwind hint restore - // 临时保存guest RAX push %rax @@ -172,6 +171,7 @@ ENTRY(vmx_vmexit) ret .Lvmfail: + // 失败,设置返回值为1 mov $1, %rbx jmp .Lclear_regs diff --git a/kernel/src/virt/vm/kvm_dev.rs b/kernel/src/virt/vm/kvm_dev.rs index 9435fd8f1..9214faf5d 100644 --- a/kernel/src/virt/vm/kvm_dev.rs +++ b/kernel/src/virt/vm/kvm_dev.rs @@ -1,14 +1,11 @@ -use core::{ - intrinsics::unlikely, - sync::atomic::{AtomicUsize, Ordering}, -}; +use core::intrinsics::unlikely; use alloc::sync::{Arc, Weak}; use system_error::SystemError; use crate::{ arch::{ - vm::{kvm_host::KvmCommonRegs, uapi::UapiKvmSegmentRegs, x86_kvm_manager}, + vm::{kvm_host::KvmCommonRegs, uapi::UapiKvmSegmentRegs}, MMArch, }, driver::base::device::device_number::DeviceNumber, @@ -29,10 +26,7 @@ use crate::{ virt::vm::user_api::{KvmUserspaceMemoryRegion, PosixKvmUserspaceMemoryRegion}, }; -use super::kvm_host::{ - vcpu::{LockedVirtCpu, VirtCpu}, - LockedVm, Vm, -}; +use super::kvm_host::{vcpu::LockedVirtCpu, LockedVm}; #[derive(Debug)] pub struct KvmInode { @@ -269,9 +263,9 @@ impl IndexNode for KvmInstance { fn read_at( &self, - offset: usize, - len: usize, - buf: &mut [u8], + _offset: usize, + _len: usize, + _buf: &mut [u8], _data: crate::libs::spinlock::SpinLockGuard, ) -> Result { todo!() @@ -279,9 +273,9 @@ impl IndexNode for KvmInstance { fn write_at( &self, - offset: usize, - len: usize, - buf: &[u8], + _offset: usize, + _len: usize, + _buf: &[u8], _data: crate::libs::spinlock::SpinLockGuard, ) -> Result { todo!() @@ -370,10 +364,9 @@ impl IndexNode for KvmVcpuDev { arg: usize, _private_data: &crate::filesystem::vfs::FilePrivateData, ) -> Result { - kdebug!("vcpu ioctl cmd {cmd:x}"); match cmd { Self::KVM_RUN => { - if arg != 0 { + if arg != 0 { return Err(SystemError::EINVAL); } let mut vcpu = self.vcpu.lock(); @@ -454,9 +447,9 @@ impl IndexNode for KvmVcpuDev { fn read_at( &self, - offset: usize, - len: usize, - buf: &mut [u8], + _offset: usize, + _len: usize, + _buf: &mut [u8], _data: crate::libs::spinlock::SpinLockGuard, ) -> Result { todo!() @@ -464,9 +457,9 @@ impl IndexNode for KvmVcpuDev { fn write_at( &self, - offset: usize, - len: usize, - buf: &[u8], + _offset: usize, + _len: usize, + _buf: &[u8], _data: crate::libs::spinlock::SpinLockGuard, ) -> Result { todo!() diff --git a/kernel/src/virt/vm/kvm_host/mem.rs b/kernel/src/virt/vm/kvm_host/mem.rs index fddca79be..f284244e6 100644 --- a/kernel/src/virt/vm/kvm_host/mem.rs +++ b/kernel/src/virt/vm/kvm_host/mem.rs @@ -1,20 +1,20 @@ use alloc::{ boxed::Box, sync::{Arc, Weak}, + vec::Vec, }; use bitmap::AllocBitmap; use hashbrown::HashMap; use system_error::SystemError; -use x86::bits64::registers::rbp; use crate::{ - arch::{kvm_arch_ops, MMArch}, + arch::MMArch, libs::{ rbtree::RBTree, rwlock::{RwLock, RwLockReadGuard, RwLockWriteGuard}, spinlock::{SpinLock, SpinLockGuard}, }, - mm::{MemoryManagementArch, PhysAddr, VirtAddr}, + mm::{MemoryManagementArch, VirtAddr}, virt::vm::{kvm_host::KVM_ADDRESS_SPACE_NUM, user_api::KvmUserspaceMemoryRegion}, }; @@ -25,6 +25,44 @@ pub const KVM_INTERNAL_MEM_SLOTS: u16 = 3; pub const KVM_MEM_SLOTS_NUM: u16 = KVM_USER_MEM_SLOTS - KVM_INTERNAL_MEM_SLOTS; pub const KVM_MEM_MAX_NR_PAGES: usize = (1 << 31) - 1; +#[derive(Debug, Default)] +#[allow(dead_code)] +pub struct KvmMmuMemoryCache { + gfp_zero: u32, + gfp_custom: u32, + capacity: usize, + nobjs: usize, + objects: Option>>, +} +impl KvmMmuMemoryCache { + #[allow(dead_code)] + pub fn kvm_mmu_totup_memory_cache( + &mut self, + _capacity: usize, + _min: usize, + ) -> Result<(), SystemError> 
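+    // Stub: the top-up logic below is still commented out, so this always
+    // returns Ok(()) without actually reserving any cache objects.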
{ + // let gfp = if self.gfp_custom != 0 { + // self.gfp_custom + // } else { + // todo!(); + // }; + + // if self.nobjs >= min { + // return Ok(()); + // } + + // if unlikely(self.objects.is_none()) { + // if self.capacity == 0 { + // return Err(SystemError::EIO); + // } + + // // self.objects = Some(Box::new) + // } + + Ok(()) + } +} + #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Default)] pub struct AddrRange { pub start: VirtAddr, @@ -127,6 +165,7 @@ impl LockedVmMemSlotSet { } #[derive(Debug, Default)] +#[allow(dead_code)] pub struct GfnToHvaCache { generation: u64, /// 客户机对应物理地址(Guest Physical Address) diff --git a/kernel/src/virt/vm/kvm_host/mod.rs b/kernel/src/virt/vm/kvm_host/mod.rs index bdba71ebb..29aa746ef 100644 --- a/kernel/src/virt/vm/kvm_host/mod.rs +++ b/kernel/src/virt/vm/kvm_host/mod.rs @@ -4,24 +4,22 @@ use core::{ }; use alloc::{ - alloc::Global, boxed::Box, sync::{Arc, Weak}, vec::Vec, }; use hashbrown::HashMap; use system_error::SystemError; -use x86::bits64::registers::rsp; use crate::{ arch::{ - vm::{kvm_host::vcpu::VirCpuRequest, vmx::KvmVmx, x86_kvm_manager}, + vm::{kvm_host::vcpu::VirtCpuRequest, vmx::KvmVmx, x86_kvm_manager}, CurrentKvmManager, KvmArch, VirtCpuArch, }, filesystem::vfs::file::{File, FileMode}, libs::spinlock::{SpinLock, SpinLockGuard}, mm::ucontext::AddressSpace, - process::{KernelStack, ProcessManager}, + process::ProcessManager, smp::cpu::ProcessorId, virt::vm::{ kvm_dev::KvmVcpuDev, @@ -31,7 +29,7 @@ use crate::{ use self::{ mem::{GfnToHvaCache, KvmMemSlotSet, LockedVmMemSlotSet, PfnCacheUsage}, - vcpu::GuestDebug, + vcpu::{GuestDebug, VcpuMode}, }; pub mod mem; @@ -117,6 +115,7 @@ impl LockedVm { } #[derive(Debug)] +#[allow(dead_code)] pub struct Vm { lock_vm_ref: Weak, mm: Arc, @@ -203,10 +202,11 @@ impl Vm { pv_time: GfnToHvaCache::init(self.lock_vm_ref.clone(), PfnCacheUsage::HOST_USES_PFN), arch: VirtCpuArch::new(), private: None, - request: VirCpuRequest::empty(), + request: VirtCpuRequest::empty(), guest_debug: GuestDebug::empty(), run: unsafe { Some(Box::new_zeroed().assume_init()) }, vcpu_idx: 0, + mode: VcpuMode::OutsideGuestMode, }; } @@ -223,6 +223,7 @@ impl Vm { /// ## 多处理器状态(有些状态在某些架构并不合法) #[derive(Debug, Clone, Copy, PartialEq)] +#[allow(dead_code)] pub enum MutilProcessorState { Runnable, Uninitialized, diff --git a/kernel/src/virt/vm/kvm_host/vcpu.rs b/kernel/src/virt/vm/kvm_host/vcpu.rs index 16a1282c1..56d71c2c5 100644 --- a/kernel/src/virt/vm/kvm_host/vcpu.rs +++ b/kernel/src/virt/vm/kvm_host/vcpu.rs @@ -1,7 +1,4 @@ -use core::mem::MaybeUninit; - use alloc::{ - alloc::Global, boxed::Box, string::String, sync::{Arc, Weak}, @@ -9,21 +6,18 @@ use alloc::{ use crate::{ arch::{ - vm::{kvm_host::vcpu::VirCpuRequest, vmx::VmxVCpuPriv}, + vm::{kvm_host::vcpu::VirtCpuRequest, vmx::VmxVCpuPriv}, VirtCpuArch, }, - libs::{ - lazy_init::Lazy, - spinlock::{SpinLock, SpinLockGuard}, - }, - process::{Pid, ProcessManager}, + libs::spinlock::{SpinLock, SpinLockGuard}, + process::Pid, smp::cpu::ProcessorId, virt::vm::user_api::UapiKvmRun, }; use super::{ - mem::{GfnToHvaCache, KvmMemSlot, PfnCacheUsage}, - LockedVm, Vm, + mem::{GfnToHvaCache, KvmMemSlot}, + LockedVm, }; #[derive(Debug)] @@ -43,6 +37,15 @@ impl LockedVirtCpu { } } +#[derive(Debug, PartialEq)] +#[allow(dead_code)] +pub enum VcpuMode { + OutsideGuestMode, + InGuestMode, + ExitingGuestMode, + ReadingShadowPageTables, +} + #[derive(Debug)] pub struct VirtCpu { pub cpu: ProcessorId, @@ -59,13 +62,15 @@ pub struct VirtCpu { pub pv_time: GfnToHvaCache, pub arch: 
VirtCpuArch, + pub mode: VcpuMode, + pub guest_debug: GuestDebug, #[cfg(target_arch = "x86_64")] pub private: Option, /// 记录请求 - pub request: VirCpuRequest, + pub request: VirtCpuRequest, pub run: Option>, } diff --git a/kernel/src/virt/vm/user_api.rs b/kernel/src/virt/vm/user_api.rs index 90a17e3e9..e7d078c2b 100644 --- a/kernel/src/virt/vm/user_api.rs +++ b/kernel/src/virt/vm/user_api.rs @@ -1,8 +1,8 @@ -use core::fmt::Debug; - /// /// 该文件定义了暴露给用户空间的结构体 /// +use core::fmt::Debug; + use system_error::SystemError; use crate::mm::{PhysAddr, VirtAddr}; From 9e0e8636feb889cd74bdcadb87c74ff1b2b83911 Mon Sep 17 00:00:00 2001 From: BrahmaMantra <2033552517@qq.com> Date: Mon, 26 Aug 2024 14:45:03 +0800 Subject: [PATCH 05/10] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E4=BA=86vmlaunch?= =?UTF-8?q?=E5=AF=BC=E8=87=B4=E7=9A=84cpu=5Freset=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/src/arch/x86_64/vm/vmx/mod.rs | 12 +- kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs | 3 +- kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs | 6 +- user/apps/test_kvm/main.c | 569 ++++++++++++++++++--- 4 files changed, 497 insertions(+), 93 deletions(-) diff --git a/kernel/src/arch/x86_64/vm/vmx/mod.rs b/kernel/src/arch/x86_64/vm/vmx/mod.rs index f210ae233..cd618ca51 100644 --- a/kernel/src/arch/x86_64/vm/vmx/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/mod.rs @@ -1,6 +1,8 @@ use core::intrinsics::likely; use core::intrinsics::unlikely; use core::sync::atomic::{AtomicBool, Ordering}; +use x86_64::registers::control::Cr3Flags; +use x86_64::structures::paging::PhysFrame; use crate::arch::process::table::USER_DS; use crate::arch::vm::mmu::KvmMmu; @@ -1046,9 +1048,10 @@ impl KvmFunc for VmxKvmFunc { vcpu.arch.clear_dirty(); - let cr3 = Cr3::read().1; + let cr3: (PhysFrame,Cr3Flags) = Cr3::read(); if unlikely(cr3 != vcpu.vmx().loaded_vmcs().host_state.cr3) { - VmxAsm::vmx_vmwrite(host::CR3, cr3.bits()); + let cr3_combined: u64 = (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); + VmxAsm::vmx_vmwrite(host::CR3, cr3_combined); vcpu.vmx().loaded_vmcs().host_state.cr3 = cr3; } @@ -2339,8 +2342,9 @@ impl Vmx { VmxAsm::vmx_vmwrite(host::CR0, unsafe { cr0() }.bits() as u64); - let cr3 = Cr3::read().1; - VmxAsm::vmx_vmwrite(host::CR3, cr3.bits()); + let cr3: (PhysFrame, Cr3Flags) = Cr3::read(); + let cr3_combined: u64 = (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); + VmxAsm::vmx_vmwrite(host::CR3, cr3_combined); loaded_vmcs_host_state.cr3 = cr3; let cr4 = unsafe { cr4() }; diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs index 3023deaf3..7533bf650 100644 --- a/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs +++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs @@ -87,8 +87,7 @@ impl VmxFeat { pub const KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 = PinbasedControls::VIRTUAL_NMIS .bits() - | PinbasedControls::POSTED_INTERRUPTS.bits() - | PinbasedControls::VMX_PREEMPTION_TIMER.bits(); + | PinbasedControls::POSTED_INTERRUPTS.bits(); pub const KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS: u32 = EntryControls::LOAD_DEBUG_CONTROLS.bits() | EntryControls::IA32E_MODE_GUEST.bits(); diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs index f4af66611..bff5b0d09 100644 --- a/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs @@ -9,7 +9,7 @@ use x86::{ host, }, }; -use 
x86_64::registers::control::Cr3Flags; +use x86_64::{registers::control::Cr3Flags, structures::paging::PhysFrame}; use crate::{ arch::{ @@ -121,7 +121,7 @@ impl LockedVMControlStructure { #[derive(Debug)] pub struct VmcsHostState { - pub cr3: Cr3Flags, + pub cr3:(PhysFrame,Cr3Flags), pub cr4: Cr4, pub gs_base: usize, pub fs_base: usize, @@ -170,7 +170,7 @@ impl VmcsHostState { impl Default for VmcsHostState { fn default() -> Self { Self { - cr3: Cr3Flags::empty(), + cr3: (PhysFrame::containing_address(x86_64::PhysAddr::new(0)), Cr3Flags::empty()), cr4: Cr4::empty(), gs_base: 0, fs_base: 0, diff --git a/user/apps/test_kvm/main.c b/user/apps/test_kvm/main.c index 739953568..e1c8c6f67 100644 --- a/user/apps/test_kvm/main.c +++ b/user/apps/test_kvm/main.c @@ -1,19 +1,4 @@ -/** - * @file main.c - * @author xiaoyez (xiaoyez@zju.edu.cn) - * @brief 测试kvm的程序 - * @version 0.1 - * @date 2023-07-13 - * - * @copyright Copyright (c) 2023 - * - */ - -/** - * 测试kvm命令的方法: - * 1.在DragonOS的控制台输入 exec bin/test_kvm.elf - * - */ + #include #include #include @@ -21,14 +6,64 @@ #include #include #include +// #include -#define KVM_CREATE_VM 0xAE01 -#define KVM_CREATE_VCPU 0xAE41 -#define KVM_SET_USER_MEMORY_REGION 0xAE46 +#define KVM_S390_GET_SKEYS_NONE 1 +#define KVM_S390_SKEYS_MAX 1048576 -#define KVM_RUN 0xAE80 -#define KVM_GET_REGS 0x01 -#define KVM_SET_REGS 0x02 +#define KVM_EXIT_UNKNOWN 0 +#define KVM_EXIT_EXCEPTION 1 +#define KVM_EXIT_IO 2 +#define KVM_EXIT_HYPERCALL 3 +#define KVM_EXIT_DEBUG 4 +#define KVM_EXIT_HLT 5 +#define KVM_EXIT_MMIO 6 +#define KVM_EXIT_IRQ_WINDOW_OPEN 7 +#define KVM_EXIT_SHUTDOWN 8 +#define KVM_EXIT_FAIL_ENTRY 9 +#define KVM_EXIT_INTR 10 +#define KVM_EXIT_SET_TPR 11 +#define KVM_EXIT_TPR_ACCESS 12 +#define KVM_EXIT_S390_SIEIC 13 +#define KVM_EXIT_S390_RESET 14 +#define KVM_EXIT_DCR 15 /* deprecated */ +#define KVM_EXIT_NMI 16 +#define KVM_EXIT_INTERNAL_ERROR 17 +#define KVM_EXIT_OSI 18 +#define KVM_EXIT_PAPR_HCALL 19 +#define KVM_EXIT_S390_UCONTROL 20 +#define KVM_EXIT_WATCHDOG 21 +#define KVM_EXIT_S390_TSCH 22 +#define KVM_EXIT_EPR 23 +#define KVM_EXIT_SYSTEM_EVENT 24 +#define KVM_EXIT_S390_STSI 25 +#define KVM_EXIT_IOAPIC_EOI 26 +#define KVM_EXIT_HYPERV 27 +#define KVM_EXIT_ARM_NISV 28 +#define KVM_EXIT_X86_RDMSR 29 +#define KVM_EXIT_X86_WRMSR 30 +#define KVM_EXIT_DIRTY_RING_FULL 31 +#define KVM_EXIT_AP_RESET_HOLD 32 +#define KVM_EXIT_X86_BUS_LOCK 33 +#define KVM_EXIT_XEN 34 + +/* For KVM_EXIT_INTERNAL_ERROR */ +/* Emulate instruction failed. */ +#define KVM_INTERNAL_ERROR_EMULATION 1 +/* Encounter unexpected simultaneous exceptions. */ +#define KVM_INTERNAL_ERROR_SIMUL_EX 2 +/* Encounter unexpected vm-exit due to delivery event. */ +#define KVM_INTERNAL_ERROR_DELIVERY_EV 3 +/* Encounter unexpected vm-exit reason */ +#define KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON 4 + +/* Flags that describe what fields in emulation_failure hold valid data. 
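 * (For example, when bit 0, KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES,
 * is set, insn_size/insn_bytes in the emulation_failure struct further
 * below carry the bytes of the instruction that failed to emulate.)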
*/ +#define KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES (1ULL << 0) + +typedef uint32_t __u32; +typedef uint16_t __u16; +typedef uint8_t __u8; +typedef uint64_t __u64; struct kvm_userspace_memory_region { uint32_t slot; // 要在哪个slot上注册内存区间 @@ -49,71 +84,437 @@ struct kvm_regs { uint64_t rip, rflags; }; -int guest_code(){ - while (1) - { - // printf("guest code\n"); - __asm__ __volatile__ ( - "mov %rax, 0\n\t" - "mov %rcx, 0\n\t" - "cpuid\n\t" - ); - } - return 0; -} +struct kvm_segment { + uint64_t base; + uint32_t limit; + uint16_t selector; + uint8_t type; + uint8_t present, dpl, db, s, l, g, avl; + uint8_t unusable; + uint8_t padding; +}; + +struct kvm_dtable { + uint64_t base; + uint16_t limit; + uint16_t padding[3]; +}; + +struct kvm_sregs { + /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + uint64_t cr0, cr2, cr3, cr4, cr8; + uint64_t efer; + uint64_t apic_base; + uint64_t interrupt_bitmap[(256 + 63) / 64]; +}; + +struct kvm_hyperv_exit { +#define KVM_EXIT_HYPERV_SYNIC 1 +#define KVM_EXIT_HYPERV_HCALL 2 +#define KVM_EXIT_HYPERV_SYNDBG 3 + __u32 type; + __u32 pad1; + union { + struct { + __u32 msr; + __u32 pad2; + __u64 control; + __u64 evt_page; + __u64 msg_page; + } synic; + struct { + __u64 input; + __u64 result; + __u64 params[2]; + } hcall; + struct { + __u32 msr; + __u32 pad2; + __u64 control; + __u64 status; + __u64 send_page; + __u64 recv_page; + __u64 pending_page; + } syndbg; + } u; +}; + +struct kvm_xen_exit { +#define KVM_EXIT_XEN_HCALL 1 + __u32 type; + union { + struct { + __u32 longmode; + __u32 cpl; + __u64 input; + __u64 result; + __u64 params[6]; + } hcall; + } u; +}; + +/* for KVM_GET/SET_VCPU_EVENTS */ +struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pending; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 shadow; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; + struct { + __u8 smm; + __u8 pending; + __u8 smm_inside_nmi; + __u8 latched_init; + } smi; + __u8 reserved[27]; + __u8 exception_has_payload; + __u64 exception_payload; +}; + +/* kvm_sync_regs struct included by kvm_run struct */ +struct kvm_sync_regs { + /* Members of this structure are potentially malicious. + * Care must be taken by code reading, esp. interpreting, + * data fields from them inside KVM to prevent TOCTOU and + * double-fetch types of vulnerabilities. 
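 *
 * A minimal illustration of the safe pattern (a sketch: `limit` and
 * `handle()` are hypothetical): copy the shared data once into a local,
 * then validate and use only the local copy, never re-reading run->s:
 *
 *   struct kvm_sync_regs snap = run->s.regs;   // single bulk fetch
 *   if (snap.regs.rip < limit)
 *       handle(snap.regs.rip);                 // no second fetch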
+ */ + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_vcpu_events events; +}; -int main() -{ - printf("Test kvm running...\n"); - printf("Open /dev/kvm\n"); - int kvm_fd = open("/dev/kvm", O_RDWR|O_CLOEXEC); - int vmfd = ioctl(kvm_fd, KVM_CREATE_VM, 0); - printf("vmfd=%d\n", vmfd); - - /* - __asm__ __volatile__ ( - "mov %rax, 0\n\t" - "mov %rcx, 0\n\t" - "cpuid\n\t" - ); - */ - const uint8_t code[] = { - 0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */ - 0x00, 0xd8, /* add %bl, %al */ - 0x04, '0', /* add $'0', %al */ - 0xee, /* out %al, (%dx) */ - 0xb0, '\n', /* mov $'\n', %al */ - 0xee, /* out %al, (%dx) */ - 0xf4, /* hlt */ - }; - - size_t mem_size = 0x1000; // size of user memory you want to assign - printf("code=%p\n", code); - void *mem = mmap((void*)65536, mem_size, 0x7, 0x20, 0,0); - memcpy(mem, code, sizeof(code)); - printf("map mem=%p\n", mem); - struct kvm_userspace_memory_region region = { - .slot = 0, - .flags = 0, - .guest_phys_addr = 0, - .memory_size = mem_size, - .userspace_addr = (size_t)mem - }; - ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, ®ion); - - int vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0); - printf("vcpufd=%d\n", vcpufd); - int user_entry = 0x0; - - struct kvm_regs regs = {0}; - regs.rip = user_entry; - regs.rsp = 0x3000; // stack address - regs.rflags = 0x2; // in x86 the 0x2 bit should always be set - ioctl(vcpufd, KVM_SET_REGS, ®s); // set registers - - ioctl(vcpufd, KVM_RUN, 0); +struct kvm_debug_exit_arch { + __u32 exception; + __u32 pad; + __u64 pc; + __u64 dr6; + __u64 dr7; +}; + +/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ +struct kvm_run { + /* in */ + uint8_t request_interrupt_window; + uint8_t immediate_exit; + uint8_t padding1[6]; + + /* out */ + uint32_t exit_reason; + uint8_t ready_for_interrupt_injection; + uint8_t if_flag; + uint16_t flags; + + /* in (pre_kvm_run), out (post_kvm_run) */ + uint64_t cr8; + uint64_t apic_base; + +#ifdef __KVM_S390 + /* the processor status word for s390 */ + uint64_t psw_mask; /* psw upper half */ + uint64_t psw_addr; /* psw lower half */ +#endif + union { + /* KVM_EXIT_UNKNOWN */ + struct { + uint64_t hardware_exit_reason; + } hw; + /* KVM_EXIT_FAIL_ENTRY */ + struct { + uint64_t hardware_entry_failure_reason; + uint32_t cpu; + } fail_entry; + /* KVM_EXIT_EXCEPTION */ + struct { + uint32_t exception; + uint32_t error_code; + } ex; + /* KVM_EXIT_IO */ + struct { +#define KVM_EXIT_IO_IN 0 +#define KVM_EXIT_IO_OUT 1 + uint8_t direction; + uint8_t size; /* bytes */ + uint16_t port; + uint32_t count; + uint64_t data_offset; /* relative to kvm_run start */ + } io; + /* KVM_EXIT_DEBUG */ + struct { + struct kvm_debug_exit_arch arch; + } debug; + /* KVM_EXIT_MMIO */ + struct { + uint64_t phys_addr; + uint8_t data[8]; + uint32_t len; + uint8_t is_write; + } mmio; + /* KVM_EXIT_HYPERCALL */ + struct { + uint64_t nr; + uint64_t args[6]; + uint64_t ret; + uint32_t longmode; + uint32_t pad; + } hypercall; + /* KVM_EXIT_TPR_ACCESS */ + struct { + uint64_t rip; + uint32_t is_write; + uint32_t pad; + } tpr_access; + /* KVM_EXIT_S390_SIEIC */ + struct { + uint8_t icptcode; + uint16_t ipa; + uint32_t ipb; + } s390_sieic; + /* KVM_EXIT_S390_RESET */ +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + uint64_t s390_reset_flags; + /* KVM_EXIT_S390_UCONTROL */ + struct { + uint64_t trans_exc_code; + uint32_t pgm_code; + } s390_ucontrol; + /* KVM_EXIT_DCR (deprecated) */ + struct { + uint32_t dcrn; + uint32_t data; + 
uint8_t is_write; + } dcr; + /* KVM_EXIT_INTERNAL_ERROR */ + struct { + uint32_t suberror; + /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */ + uint32_t ndata; + uint64_t data[16]; + } internal; + /* + * KVM_INTERNAL_ERROR_EMULATION + * + * "struct emulation_failure" is an overlay of "struct internal" + * that is used for the KVM_INTERNAL_ERROR_EMULATION sub-type of + * KVM_EXIT_INTERNAL_ERROR. Note, unlike other internal error + * sub-types, this struct is ABI! It also needs to be backwards + * compatible with "struct internal". Take special care that + * "ndata" is correct, that new fields are enumerated in "flags", + * and that each flag enumerates fields that are 64-bit aligned + * and sized (so that ndata+internal.data[] is valid/accurate). + */ + struct { + uint32_t suberror; + uint32_t ndata; + uint64_t flags; + uint8_t insn_size; + uint8_t insn_bytes[15]; + } emulation_failure; + /* KVM_EXIT_OSI */ + struct { + uint64_t gprs[32]; + } osi; + /* KVM_EXIT_PAPR_HCALL */ + struct { + uint64_t nr; + uint64_t ret; + uint64_t args[9]; + } papr_hcall; + /* KVM_EXIT_S390_TSCH */ + struct { + uint16_t subchannel_id; + uint16_t subchannel_nr; + uint32_t io_int_parm; + uint32_t io_int_word; + uint32_t ipb; + uint8_t dequeued; + } s390_tsch; + /* KVM_EXIT_EPR */ + struct { + uint32_t epr; + } epr; + /* KVM_EXIT_SYSTEM_EVENT */ + struct { +#define KVM_SYSTEM_EVENT_SHUTDOWN 1 +#define KVM_SYSTEM_EVENT_RESET 2 +#define KVM_SYSTEM_EVENT_CRASH 3 + uint32_t type; + uint64_t flags; + } system_event; + /* KVM_EXIT_S390_STSI */ + struct { + uint64_t addr; + uint8_t ar; + uint8_t reserved; + uint8_t fc; + uint8_t sel1; + uint16_t sel2; + } s390_stsi; + /* KVM_EXIT_IOAPIC_EOI */ + struct { + uint8_t vector; + } eoi; + /* KVM_EXIT_HYPERV */ + struct kvm_hyperv_exit hyperv; + /* KVM_EXIT_ARM_NISV */ + struct { + uint64_t esr_iss; + uint64_t fault_ipa; + } arm_nisv; + /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ + struct { + uint8_t error; /* user -> kernel */ + uint8_t pad[7]; +#define KVM_MSR_EXIT_REASON_INVAL (1 << 0) +#define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1) +#define KVM_MSR_EXIT_REASON_FILTER (1 << 2) + uint32_t reason; /* kernel -> user */ + uint32_t index; /* kernel -> user */ + uint64_t data; /* kernel <-> user */ + } msr; + /* KVM_EXIT_XEN */ + struct kvm_xen_exit xen; + /* Fix the size of the union. */ + char padding[256]; + }; + + /* 2048 is the size of the char array used to bound/pad the size + * of the union that holds sync regs. + */ + #define SYNC_REGS_SIZE_BYTES 2048 + /* + * shared registers between kvm and userspace. 
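 * For example, userspace that patches the synced GPRs after an exit
 * would, per the field descriptions just below, mark them dirty before
 * the next KVM_RUN. A sketch: KVM_SYNC_X86_REGS is the x86 class bit
 * (1UL << 0) from the Linux uapi headers, not defined in this file, and
 * insn_len is hypothetical:
 *
 *   run->s.regs.regs.rip += insn_len;         // edit the synced copy
 *   run->kvm_dirty_regs = KVM_SYNC_X86_REGS;  // tell KVM to load it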
+ * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specified the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + uint64_t kvm_valid_regs; + uint64_t kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[SYNC_REGS_SIZE_BYTES]; + } s; +}; + +#define KVM_CREATE_VM 0xAE01 +#define KVM_CREATE_VCPU 0xAE41 +#define KVM_SET_USER_MEMORY_REGION 0xAE46 +#define KVM_GET_VCPU_MMAP_SIZE 0xAE04 +#define KVM_GET_REGS 0xAE81 +#define KVM_SET_REGS 0xAE82 +#define KVM_GET_SREGS 0xAE83 +#define KVM_SET_SREGS 0xAE84 + +#define KVM_RUN 0xAE80 + +int kvm(uint8_t code[], size_t code_len) { + // step 1, open /dev/kvm + int kvmfd = open("/dev/kvm", O_RDWR|O_CLOEXEC); + if(kvmfd == -1) { + printf("failed to open /dev/kvm\n"); return 0; + } + + // step 2, create VM + int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0); + printf("vmfd %d\n",vmfd); + // step 3, set up user memory region + size_t mem_size = 0x4000; // size of user memory you want to assign + void *mem = mmap(0, mem_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS, -1, 0); + + printf("map mem %p\n",mem); + int user_entry = 0x0; + memcpy((void*)((size_t)mem + user_entry), code, code_len); + struct kvm_userspace_memory_region region = { + .slot = 0, + .flags = 0, + .guest_phys_addr = 0, + .memory_size = mem_size, + .userspace_addr = (size_t)mem + }; + ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, ®ion); + /* end of step 3 */ + + // step 4, create vCPU + int vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0); + + // step 5, set up memory for vCPU + size_t vcpu_mmap_size = ioctl(kvmfd, KVM_GET_VCPU_MMAP_SIZE, NULL); + struct kvm_run* run = (struct kvm_run*) mmap(0, vcpu_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0); + + // step 6, set up vCPU's registers + /* standard registers include general-purpose registers and flags */ + struct kvm_regs regs; + ioctl(vcpufd, KVM_GET_REGS, ®s); + regs.rip = user_entry; + regs.rsp = 0x200000; // stack address + regs.rflags = 0x2; // in x86 the 0x2 bit should always be set + ioctl(vcpufd, KVM_SET_REGS, ®s); // set registers + + /* special registers include segment registers */ + struct kvm_sregs sregs; + ioctl(vcpufd, KVM_GET_SREGS, &sregs); + sregs.cs.base = sregs.cs.selector = 0; // let base of code segment equal to zero + ioctl(vcpufd, KVM_SET_SREGS, &sregs); + + // step 7, execute vm and handle exit reason + while (1) { + ioctl(vcpufd, KVM_RUN, NULL); + switch (run->exit_reason) { + case KVM_EXIT_HLT: + fputs("KVM_EXIT_HLT", stderr); + return 0; + case KVM_EXIT_IO: + /* TODO: check port and direction here */ + putchar(*(((char *)run) + run->io.data_offset)); + break; + case KVM_EXIT_FAIL_ENTRY: + printf("KVM_EXIT_FAIL_ENTRY: hardware_entry_failure_reason = 0x%lx", + run->fail_entry.hardware_entry_failure_reason); + return 0; + case KVM_EXIT_INTERNAL_ERROR: + printf("KVM_EXIT_INTERNAL_ERROR: suberror = 0x%x", + run->internal.suberror); + return 0; + case KVM_EXIT_SHUTDOWN: + printf("KVM_EXIT_SHUTDOWN"); + return 0; + default: + printf("Unhandled reason: %d", run->exit_reason); + return 0; + } + } } +int main() { + uint8_t code[] = "\xB0\x61\xBA\x17\x02\xEE\xB0\n\xEE\xF4"; + kvm(code, sizeof(code)); + return 0; +} + \ No newline at end of file From 53f72aa9fac96df904cd54b6db78fea163e54393 Mon Sep 17 00:00:00 2001 From: GnoCiYeH Date: Fri, 30 Aug 2024 17:42:55 +0800 Subject: [PATCH 06/10] =?UTF-8?q?=E6=95=B4=E7=90=86=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/src/arch/x86_64/vm/asm.rs | 2 +- kernel/src/arch/x86_64/vm/kvm_host/lapic.rs | 2 +- kernel/src/arch/x86_64/vm/kvm_host/mod.rs | 2 +- kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs | 39 ++++++-- kernel/src/arch/x86_64/vm/mmu.rs | 5 +- kernel/src/arch/x86_64/vm/mod.rs | 4 +- kernel/src/arch/x86_64/vm/vmx/mod.rs | 103 +++++++++++--------- kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs | 5 +- kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs | 26 +++-- kernel/src/virt/vm/kvm_host/mem.rs | 16 +-- 10 files changed, 125 insertions(+), 79 deletions(-) diff --git a/kernel/src/arch/x86_64/vm/asm.rs b/kernel/src/arch/x86_64/vm/asm.rs index c3cb826ee..2a299e17d 100644 --- a/kernel/src/arch/x86_64/vm/asm.rs +++ b/kernel/src/arch/x86_64/vm/asm.rs @@ -110,7 +110,7 @@ impl VmxAsm { pub fn vmx_vmwrite(vmcs_field: u32, value: u64) { unsafe { x86::bits64::vmx::vmwrite(vmcs_field, value) - .expect(&format!("vmcs_field: {:x} vmx_write fail", vmcs_field)) + .unwrap_or_else(|_| panic!("vmcs_field: {:x} vmx_write fail", vmcs_field)) } } diff --git a/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs b/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs index c71d1cd9f..90a4bbda9 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs @@ -54,7 +54,7 @@ impl VirtCpu { } if (old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 { - if value & MSR_IA32_APICBASE_ENABLE != 0 {} + // if value & MSR_IA32_APICBASE_ENABLE != 0 {} } todo!() diff --git a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs index e61cb5047..6f8445048 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs @@ -93,7 +93,7 @@ impl X86KvmArch { pub fn msr_allowed(&self, msr: u32, ftype: MsrFilterType) -> bool { // x2APIC MSRs - if msr >= 0x800 && msr <= 0x8ff { + if (0x800..=0x8ff).contains(&msr) { return true; } diff --git a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs index e640fc3b4..8269a8776 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs @@ -134,6 +134,8 @@ pub struct X86VcpuArch { pub nmi_pending: u32, pub nmi_injected: bool, + pub handling_intr_from_guest: KvmIntrType, + pub xfd_no_write_intercept: bool, pub l1tf_flush_l1d: bool, @@ -561,6 +563,18 @@ impl X86VcpuArch { } } } + + pub fn kvm_before_interrupt(&mut self, intr: KvmIntrType) { + barrier::mfence(); + self.handling_intr_from_guest = intr; + barrier::mfence(); + } + + pub fn kvm_after_interrupt(&mut self) { + barrier::mfence(); + self.handling_intr_from_guest = KvmIntrType::None; + barrier::mfence(); + } } impl VirtCpu { @@ -595,7 +609,7 @@ impl VirtCpu { } #[inline] - pub fn kvm_run(&self) -> &Box { + pub fn kvm_run(&self) -> &UapiKvmRun { self.run.as_ref().unwrap() } @@ -1257,10 +1271,11 @@ impl VirtCpu { return Ok(()); } - let mut dt: DescriptorTablePointer = DescriptorTablePointer::default(); + let mut dt: DescriptorTablePointer = DescriptorTablePointer { + limit: sregs.idt.limit, + base: sregs.idt.base as usize as *const u8, + }; - dt.limit = sregs.idt.limit; - dt.base = sregs.idt.base as usize as *const u8; x86_kvm_ops().set_idt(self, &dt); dt.limit = sregs.gdt.limit; @@ -1274,7 +1289,7 @@ impl VirtCpu { self.arch.mark_register_dirty(KvmReg::VcpuExregCr3); - x86_kvm_ops().post_set_cr3(&self, sregs.cr3); + x86_kvm_ops().post_set_cr3(self, sregs.cr3); self.kvm_set_cr8(sregs.cr8); @@ -1335,12 +1350,9 @@ impl VirtCpu { 
} // TODO: legal gpa? - } else { - if efer.contains(EferFlags::LONG_MODE_ACTIVE) || sregs.cs.l != 0 { - return false; - } + } else if efer.contains(EferFlags::LONG_MODE_ACTIVE) || sregs.cs.l != 0 { + return false; } - return self.kvm_is_vaild_cr0(cr0) && self.kvm_is_vaild_cr4(cr4); } @@ -1621,3 +1633,10 @@ pub struct KvmAsyncPageFault { impl KvmAsyncPageFault { pub const ASYNC_PF_PER_VCPU: usize = 64; } + +#[derive(Debug)] +pub enum KvmIntrType { + None, + Irq, + Nmi, +} diff --git a/kernel/src/arch/x86_64/vm/mmu.rs b/kernel/src/arch/x86_64/vm/mmu.rs index 504e9a107..c250dade1 100644 --- a/kernel/src/arch/x86_64/vm/mmu.rs +++ b/kernel/src/arch/x86_64/vm/mmu.rs @@ -1,4 +1,4 @@ -use crate::{arch::mm::X86_64MMArch, kdebug}; +use crate::{arch::mm::X86_64MMArch, kdebug, kwarn}; use alloc::{sync::Arc, vec::Vec}; use bitfield_struct::bitfield; use core::intrinsics::likely; @@ -371,10 +371,13 @@ impl VirtCpuArch { if !context.cpu_role.base.is_cr0_pg() { // todo: context->gva_to_gpa = nonpaging_gva_to_gpa; + kwarn!("context->gva_to_gpa = nonpaging_gva_to_gpa todo!"); } else if context.cpu_role.base.is_cr4_pae() { // todo: context->gva_to_gpa = paging64_gva_to_gpa; + kwarn!("context->gva_to_gpa = paging64_gva_to_gpa todo!"); } else { // todo: context->gva_to_gpa = paging32_gva_to_gpa; + kwarn!("context->gva_to_gpa = paging32_gva_to_gpa todo!"); } // todo: diff --git a/kernel/src/arch/x86_64/vm/mod.rs b/kernel/src/arch/x86_64/vm/mod.rs index 4f9af1cc6..634b1561b 100644 --- a/kernel/src/arch/x86_64/vm/mod.rs +++ b/kernel/src/arch/x86_64/vm/mod.rs @@ -143,7 +143,7 @@ impl KvmArchManager { pub fn mpx_supported(&self) -> bool { self.kvm_caps.supported_xcr0 & (Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDCSR_STATE) - == (Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDREG_STATE) + == (Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDCSR_STATE) } pub const KVM_MAX_VCPUS: usize = 1024; @@ -511,7 +511,7 @@ impl KvmArchManager { & ArchCapabilities::KVM_SUPPORTED_ARCH_CAP; data.insert(ArchCapabilities::ARCH_CAP_PSCHANGE_MC_NO); - if *L1TF_VMX_MITIGATION.read() != VmxL1dFlushState::FlushNever { + if *L1TF_VMX_MITIGATION.read() != VmxL1dFlushState::Never { data.insert(ArchCapabilities::ARCH_CAP_SKIP_VMENTRY_L1DFLUSH); } diff --git a/kernel/src/arch/x86_64/vm/vmx/mod.rs b/kernel/src/arch/x86_64/vm/vmx/mod.rs index cd618ca51..eea64ea7a 100644 --- a/kernel/src/arch/x86_64/vm/vmx/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/mod.rs @@ -1,6 +1,7 @@ use core::intrinsics::likely; use core::intrinsics::unlikely; use core::sync::atomic::{AtomicBool, Ordering}; +use x86::segmentation::GateDescriptorBuilder; use x86_64::registers::control::Cr3Flags; use x86_64::structures::paging::PhysFrame; @@ -89,6 +90,7 @@ use self::{ use super::asm::IntrInfo; use super::asm::SegmentCacheField; +use super::kvm_host::vcpu::KvmIntrType; use super::kvm_host::RMODE_TSS_SIZE; use super::x86_kvm_ops; use super::{ @@ -126,6 +128,7 @@ impl VmxKvmInitFunc { } impl KvmInitFunc for VmxKvmInitFunc { + #[allow(clippy::borrow_interior_mutable_const)] #[inline(never)] fn hardware_setup(&self) -> Result<(), SystemError> { let idt = sidt(); @@ -309,6 +312,7 @@ pub struct VmxKvmFuncConfig { } impl VmxKvmFunc { + #[allow(clippy::declare_interior_mutable_const)] pub const CONFIG: RwLock = RwLock::new(VmxKvmFuncConfig { have_set_apic_access_page_addr: true, have_update_cr8_intercept: true, @@ -398,7 +402,7 @@ impl VmxKvmFunc { } } - let _ = current_loaded_vmcs_list_mut().extract_if(|x| Arc::ptr_eq(&x, loaded_vmcs)); + let _ = current_loaded_vmcs_list_mut().extract_if(|x| 
Arc::ptr_eq(x, loaded_vmcs)); guard.cpu = ProcessorId::INVALID; guard.launched = false; @@ -866,7 +870,7 @@ impl KvmFunc for VmxKvmFunc { eb = !0; } - if !vmx_info().vmx_need_pf_intercept(&vcpu) { + if !vmx_info().vmx_need_pf_intercept(vcpu) { eb &= !(1 << PF_VECTOR); } @@ -1048,9 +1052,10 @@ impl KvmFunc for VmxKvmFunc { vcpu.arch.clear_dirty(); - let cr3: (PhysFrame,Cr3Flags) = Cr3::read(); + let cr3: (PhysFrame, Cr3Flags) = Cr3::read(); if unlikely(cr3 != vcpu.vmx().loaded_vmcs().host_state.cr3) { - let cr3_combined: u64 = (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); + let cr3_combined: u64 = + (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); VmxAsm::vmx_vmwrite(host::CR3, cr3_combined); vcpu.vmx().loaded_vmcs().host_state.cr3 = cr3; } @@ -1175,13 +1180,11 @@ impl KvmFunc for VmxKvmFunc { fn flush_tlb_all(&self, vcpu: &mut VirtCpu) { if vmx_info().enable_ept { VmxAsm::ept_sync_global(); + } else if vmx_info().has_invvpid_global() { + VmxAsm::sync_vcpu_global(); } else { - if vmx_info().has_invvpid_global() { - VmxAsm::sync_vcpu_global(); - } else { - VmxAsm::sync_vcpu_single(vcpu.vmx().vpid); - // TODO: 嵌套:VmxAsm::sync_vcpu_single(vcpu.vmx().nested.vpid02); - } + VmxAsm::sync_vcpu_single(vcpu.vmx().vpid); + // TODO: 嵌套:VmxAsm::sync_vcpu_single(vcpu.vmx().nested.vpid02); } } @@ -1193,7 +1196,7 @@ impl KvmFunc for VmxKvmFunc { let basic = VmxExitReasonBasic::from(vcpu.vmx().exit_reason.basic()); if basic == VmxExitReasonBasic::EXTERNAL_INTERRUPT { - todo!() + Vmx::handle_external_interrupt_irqoff(vcpu); } else if basic == VmxExitReasonBasic::EXCEPTION_OR_NMI { todo!() } @@ -1330,9 +1333,7 @@ impl Vmx { self.nested = false; self.enable_vmware_backdoor = false; } -} -impl Vmx { /* * Internal error codes that are used to indicate that MSR emulation encountered * an error that should result in #GP in the guest, unless userspace @@ -1385,7 +1386,7 @@ impl Vmx { #[inline(never)] pub fn set_up_user_return_msrs() { - const VMX_URET_MSRS_LIST: &'static [u32] = &[ + const VMX_URET_MSRS_LIST: &[u32] = &[ msr::IA32_FMASK, msr::IA32_LSTAR, msr::IA32_CSTAR, @@ -1408,7 +1409,7 @@ impl Vmx { vmcs_config: &mut VmcsConfig, vmx_cap: &mut VmxCapability, ) -> Result<(), SystemError> { - const VMCS_ENTRY_EXIT_PAIRS: &'static [VmcsEntryExitPair] = &[ + const VMCS_ENTRY_EXIT_PAIRS: &[VmcsEntryExitPair] = &[ VmcsEntryExitPair::new( EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL, ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL, @@ -1650,7 +1651,7 @@ impl Vmx { fn setup_l1d_flush(&self) { // TODO:先这样写 - *L1TF_VMX_MITIGATION.write() = VmxL1dFlushState::FlushNotRequired; + *L1TF_VMX_MITIGATION.write() = VmxL1dFlushState::NotRequired; } pub fn construct_eptp(&self, vcpu: &mut VirtCpu, root_hpa: u64, root_level: u32) -> u64 { @@ -2278,8 +2279,8 @@ impl Vmx { if guest_efer != x86_kvm_manager().host_efer { vcpu.vmx_mut().add_atomic_switch_msr( msr::IA32_EFER, - guest_efer.bits().into(), - x86_kvm_manager().host_efer.bits().into(), + guest_efer.bits(), + x86_kvm_manager().host_efer.bits(), false, ); } else { @@ -2296,8 +2297,8 @@ impl Vmx { guest_efer.remove(ignore_efer); guest_efer.insert(x86_kvm_manager().host_efer & ignore_efer); - vcpu.vmx_mut().guest_uret_msrs[i].data = guest_efer.bits().into(); - vcpu.vmx_mut().guest_uret_msrs[i].mask = (!ignore_efer).bits().into(); + vcpu.vmx_mut().guest_uret_msrs[i].data = guest_efer.bits(); + vcpu.vmx_mut().guest_uret_msrs[i].mask = (!ignore_efer).bits(); return true; } else { return false; @@ -2343,7 +2344,8 @@ impl Vmx 
{ VmxAsm::vmx_vmwrite(host::CR0, unsafe { cr0() }.bits() as u64); let cr3: (PhysFrame, Cr3Flags) = Cr3::read(); - let cr3_combined: u64 = (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); + let cr3_combined: u64 = + (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); VmxAsm::vmx_vmwrite(host::CR3, cr3_combined); loaded_vmcs_host_state.cr3 = cr3; @@ -2369,7 +2371,7 @@ impl Vmx { ); VmxAsm::vmx_vmwrite(host::IDTR_BASE, self.host_idt_base); - VmxAsm::vmx_vmwrite(host::RIP, vmx_vmexit as u64); + VmxAsm::vmx_vmwrite(host::RIP, vmx_vmexit as usize as u64); let val = unsafe { rdmsr(msr::IA32_SYSENTER_CS) }; @@ -2701,7 +2703,7 @@ impl Vmx { if !vmx_info().emulate_invalid_guest_state { var.selector = (var.base >> 4) as u16; - var.base = var.base & 0xffff0; + var.base &= 0xffff0; var.limit = 0xffff; var.g = 0; var.db = 0; @@ -2933,7 +2935,7 @@ impl Vmx { if VmxExitReasonBasic::from(vcpu.vmx().exit_reason.basic()) == VmxExitReasonBasic::EXCEPTION_OR_NMI - && VmcsIntrHelper::is_nmi(Vmx::vmx_get_intr_info(vcpu)) + && VmcsIntrHelper::is_nmi(&Vmx::vmx_get_intr_info(vcpu)) { todo!() } @@ -3040,6 +3042,23 @@ impl Vmx { todo!() } + #[allow(unreachable_code)] + pub fn handle_external_interrupt_irqoff(vcpu: &mut VirtCpu) { + let intr_info = Vmx::vmx_get_intr_info(vcpu); + let vector = intr_info & IntrInfo::INTR_INFO_VECTOR_MASK; + // let desc = vmx_info().host_idt_base + vector.bits() as u64; + if !VmcsIntrHelper::is_external_intr(&intr_info) { + kerror!("unexpected VM-Exit interrupt info: {:?}", intr_info); + return; + } + + vcpu.arch.kvm_before_interrupt(KvmIntrType::Irq); + todo!(); + vcpu.arch.kvm_after_interrupt(); + + vcpu.arch.at_instruction_boundary = true; + } + /// 需要在缓存中更新的寄存器集。此处未列出的其他寄存器在 VM 退出后立即同步到缓存。 pub const VMX_REGS_LAZY_LOAD_SET: &'static [usize] = &[ KvmReg::VcpuRegsRip as usize, @@ -3111,13 +3130,7 @@ impl VmxMsrs { pub const MAX_NR_LOADSTORE_MSRS: usize = 8; pub fn find_loadstore_msr_slot(&self, msr: u32) -> Option { - for i in 0..self.nr { - if self.val[i].index == msr { - return Some(i); - } - } - - None + return (0..self.nr).find(|&i| self.val[i].index == msr); } } @@ -3510,12 +3523,12 @@ impl VmxVCpuPriv { return; } - let i = if i.is_none() { + let i = if let Some(i) = i { + i + } else { m.guest.nr += 1; VmxAsm::vmx_vmwrite(control::VMENTRY_MSR_LOAD_COUNT, m.guest.nr as u64); m.guest.nr - } else { - i.unwrap() }; m.guest.val[i].index = msr; @@ -3525,12 +3538,12 @@ impl VmxVCpuPriv { return; } - let j = if j.is_none() { + let j = if let Some(j) = j { + j + } else { m.host.nr += 1; VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_LOAD_COUNT, m.host.nr as u64); m.host.nr - } else { - j.unwrap() }; m.host.val[j].index = msr; @@ -3601,12 +3614,12 @@ bitflags! 
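// The CR3 recombination now appears both in the vmentry fast path and in
// the constant host-state setup above; a small helper could keep the two
// masks in one place. A sketch (the helper name is ours):
//
//     fn cr3_raw(frame: PhysFrame, flags: Cr3Flags) -> u64 {
//         // bits 12..=51 hold the page-frame base, bits 0..=11 the flags/PCID
//         (frame.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000)
//             | (flags.bits() & 0xFFF)
//     }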
{ #[derive(Debug, PartialEq)] #[allow(dead_code)] pub enum VmxL1dFlushState { - FlushAuto, - FlushNever, - FlushCond, - FlushAlways, - FlushEptDisabled, - FlushNotRequired, + Auto, + Never, + Cond, + Always, + EptDisabled, + NotRequired, } pub struct VmxSegmentField { @@ -3616,7 +3629,7 @@ pub struct VmxSegmentField { ar_bytes: u32, } -pub const KVM_VMX_SEGMENT_FIELDS: &'static [VmxSegmentField] = &[ +pub const KVM_VMX_SEGMENT_FIELDS: &[VmxSegmentField] = &[ // CS VmxSegmentField { selector: guest::CS_SELECTOR, @@ -3675,7 +3688,7 @@ pub const KVM_VMX_SEGMENT_FIELDS: &'static [VmxSegmentField] = &[ }, ]; -pub static L1TF_VMX_MITIGATION: RwLock = RwLock::new(VmxL1dFlushState::FlushAuto); +pub static L1TF_VMX_MITIGATION: RwLock = RwLock::new(VmxL1dFlushState::Auto); pub fn vmx_init() -> Result<(), SystemError> { let cpuid = CpuId::new(); diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs index 7533bf650..77aa91a8d 100644 --- a/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs +++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs @@ -85,9 +85,8 @@ impl VmxFeat { pub const KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 = PinbasedControls::EXTERNAL_INTERRUPT_EXITING.bits() | PinbasedControls::NMI_EXITING.bits(); - pub const KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 = PinbasedControls::VIRTUAL_NMIS - .bits() - | PinbasedControls::POSTED_INTERRUPTS.bits(); + pub const KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 = + PinbasedControls::VIRTUAL_NMIS.bits() | PinbasedControls::POSTED_INTERRUPTS.bits(); pub const KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS: u32 = EntryControls::LOAD_DEBUG_CONTROLS.bits() | EntryControls::IA32E_MODE_GUEST.bits(); diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs index bff5b0d09..aec434aa5 100644 --- a/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs @@ -121,7 +121,7 @@ impl LockedVMControlStructure { #[derive(Debug)] pub struct VmcsHostState { - pub cr3:(PhysFrame,Cr3Flags), + pub cr3: (PhysFrame, Cr3Flags), pub cr4: Cr4, pub gs_base: usize, pub fs_base: usize, @@ -170,7 +170,10 @@ impl VmcsHostState { impl Default for VmcsHostState { fn default() -> Self { Self { - cr3: (PhysFrame::containing_address(x86_64::PhysAddr::new(0)), Cr3Flags::empty()), + cr3: ( + PhysFrame::containing_address(x86_64::PhysAddr::new(0)), + Cr3Flags::empty(), + ), cr4: Cr4::empty(), gs_base: 0, fs_base: 0, @@ -397,8 +400,10 @@ impl VmxMsrBitmap { ) -> bool { if msr <= 0x1fff { return self.bit_op(msr as usize, access.base(), action); - } else if msr >= 0xc0000000 && msr <= 0xc0001fff { - return self.bit_op(msr as usize, access.base(), action); + } else if (0xc0000000..=0xc0001fff).contains(&msr) { + // 这里是有问题的,需要后续检查 + // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.h#450 + return self.bit_op(msr as usize & 0x1fff, access.base() + 0x400, action); } else { return true; } @@ -430,13 +435,18 @@ impl VmxMsrBitmap { pub struct VmcsIntrHelper; impl VmcsIntrHelper { - pub fn is_nmi(intr_info: IntrInfo) -> bool { + pub fn is_nmi(intr_info: &IntrInfo) -> bool { return Self::is_intr_type(intr_info, IntrType::INTR_TYPE_NMI_INTR); } - pub fn is_intr_type(intr_info: IntrInfo, intr_type: IntrType) -> bool { - return (intr_info & (IntrInfo::INTR_INFO_VALID_MASK | IntrInfo::INTR_INFO_INTR_TYPE_MASK)) + pub fn is_intr_type(intr_info: &IntrInfo, intr_type: IntrType) -> bool { + return (*intr_info + & (IntrInfo::INTR_INFO_VALID_MASK | 
IntrInfo::INTR_INFO_INTR_TYPE_MASK))
             .bits()
-            == IntrInfo::INTR_INFO_VALID_MASK.bits() | intr_type.bits() as u32;
+            == IntrInfo::INTR_INFO_VALID_MASK.bits() | intr_type.bits();
+    }
+
+    pub fn is_external_intr(intr_info: &IntrInfo) -> bool {
+        return Self::is_intr_type(intr_info, IntrType::INTR_TYPE_EXT_INTR);
     }
 }
diff --git a/kernel/src/virt/vm/kvm_host/mem.rs b/kernel/src/virt/vm/kvm_host/mem.rs
index f284244e6..2a7746e7e 100644
--- a/kernel/src/virt/vm/kvm_host/mem.rs
+++ b/kernel/src/virt/vm/kvm_host/mem.rs
@@ -32,7 +32,7 @@ pub struct KvmMmuMemoryCache {
     gfp_custom: u32,
     capacity: usize,
     nobjs: usize,
-    objects: Option<Box<Vec<u8>>>,
+    objects: Option<Vec<u8>>,
 }
 impl KvmMmuMemoryCache {
     #[allow(dead_code)]
@@ -283,7 +283,7 @@ impl Vm {
                 return Err(SystemError::EIO);
             }
             drop(slots_guard);
-            return self.set_memslot(Some(&old), None, KvmMemoryChangeMode::Delete);
+            return self.set_memslot(Some(old), None, KvmMemoryChangeMode::Delete);
         } else {
             return Err(SystemError::EINVAL);
         }
@@ -325,10 +325,11 @@ impl Vm {
             }
         };
 
-        if change == KvmMemoryChangeMode::Create || change == KvmMemoryChangeMode::Move {
-            if slots_guard.gfn_tree.contains_key(&base_gfn) {
-                return Err(SystemError::EEXIST);
-            }
+        if (change == KvmMemoryChangeMode::Create || change == KvmMemoryChangeMode::Move)
+            && slots_guard.gfn_tree.contains_key(&base_gfn)
+        {
+            return Err(SystemError::EEXIST);
         }
 
         let new = LockedKvmMemSlot::new();
@@ -346,6 +347,7 @@ impl Vm {
         return self.set_memslot(old.as_ref(), Some(&new), change);
     }
 
+    #[allow(clippy::modulo_one)]
     #[inline]
     /// 获取活动内存插槽
     fn memslot_set(&self, id: usize) -> &Arc<LockedVmMemSlotSet> {
@@ -501,7 +503,7 @@ impl Vm {
 
         if let Some(last) = &slots_guard.last_use {
             if Arc::ptr_eq(last, old) {
-                slots_guard.last_use = new.map(|x| x.clone());
+                slots_guard.last_use = new.cloned();
             }
         }
 

From a211e5a068b08dff576ff3484f33b71d42182635 Mon Sep 17 00:00:00 2001
From: Brahmamantra <2033552517@qq.com>
Date: Tue, 17 Sep 2024 23:32:34 +0800
Subject: [PATCH 07/10] =?UTF-8?q?=E6=9A=82=E6=97=B6=E6=80=A7push=E5=88=B0h?=
 =?UTF-8?q?yc=E4=BB=93=E5=BA=93?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fixme.md                                      |  36 ++
 kernel/src/arch/x86_64/mod.rs                 |   1 +
 kernel/src/arch/x86_64/vm/kvm_host/mod.rs     |  74 +++-
 kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs    |  11 +-
 kernel/src/arch/x86_64/vm/{ => mmu}/mmu.rs    | 101 ++++-
 kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs | 373 ++++++++++++++++++
 kernel/src/arch/x86_64/vm/mmu/mod.rs          |   4 +
 kernel/src/arch/x86_64/vm/mmu/pte.rs          |  43 ++
 kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs     | 203 ++++++++++
 kernel/src/arch/x86_64/vm/mod.rs              |   3 +-
 kernel/src/arch/x86_64/vm/mtrr.rs             |  37 ++
 kernel/src/arch/x86_64/vm/vmx/capabilities.rs |   7 +-
 kernel/src/arch/x86_64/vm/vmx/ept/mod.rs      |  31 ++
 kernel/src/arch/x86_64/vm/vmx/exit.rs         | 142 ++++++-
 kernel/src/arch/x86_64/vm/vmx/mod.rs          |  38 +-
 kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs    |   5 +-
 kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs     |   7 +-
 kernel/src/mm/mod.rs                          |   2 +-
 kernel/src/virt/vm/kvm_host/mem.rs            | 173 +++++++-
 kernel/src/virt/vm/kvm_host/mod.rs            |  25 +-
 kernel/src/virt/vm/kvm_host/vcpu.rs           |  21 +-
 21 files changed, 1279 insertions(+), 58 deletions(-)
 create mode 100644 fixme.md
 rename kernel/src/arch/x86_64/vm/{ => mmu}/mmu.rs (83%)
 create mode 100644 kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs
 create mode 100644 kernel/src/arch/x86_64/vm/mmu/mod.rs
 create mode 100644 kernel/src/arch/x86_64/vm/mmu/pte.rs
 create mode 100644 kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs
 create mode 100644 kernel/src/arch/x86_64/vm/mtrr.rs
 create mode 100644
kernel/src/arch/x86_64/vm/vmx/ept/mod.rs diff --git a/fixme.md b/fixme.md new file mode 100644 index 000000000..0cd469b6a --- /dev/null +++ b/fixme.md @@ -0,0 +1,36 @@ +### fixme: +PageLevel的类型 +MTRR 是 x86 架构中的一组寄存器,用于控制不同内存区域的缓存属性。通过配置 MTRR,可以优化系统性能和兼容性。操作系统在启动时通常会配置 MTRR,以确保不同内存区域具有适当的缓存属性。 + +初次EPT_VIOLATION的时候,gpa=0,要建立从gpa到hpa的映射,也就是ept映射,处理完各个寄存器以及mmu等状态后 +- do_page_fault 初始化page_fault信息,能知道gfn + +- gfn_to_memslot 找到包含 gfn 的 memslot 的指针,放在page_fault.slot里面 + +- __gfn_to_hva_many 得到hva(照着之前的kvm写的)(要用到page_fault的slot) + +- hva_to_pfn 得到pfn,可以说相当于知道了hpa(照着之前的kvm写的),放在 page_fault.pfn里面 + +找到ept root物理地址 kernel/src/arch/x86_64/mm/mod.rs:184 + +### 疑问? +- 内核里面应该有相似的多级页表查询/映射的机制,是不是可以借鉴或者复用 kernel/src/mm/page.rs:712 kvm:kernel/src/arch/x86_64/kvm/vmx/ept.rs:91 + +- 我感觉得到ept root 物理地址(不知道存哪了,可能在真正要)后,按照索引在ept页表往下查,然后缺页就alloc块给它然后加入页表建立映射(gpa->hpa),直到找到目标层的level,[linux实现](https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/mmu/tdp_mmu.c?fi=kvm_tdp_mmu_map#952) + +- __va和virt_2_phys是一样的吗? + +- mm.h的作用 + + +### Debug +tdp_page_fault :at src/arch/x86_64/vm/mmu/mmu_internal.rs:233 +enter_guest : at src/arch/x86_64/vm/kvm_host/vcpu.rs:840 +handle_ept_violation :at src/arch/x86_64/vm/vmx/exit.rs:278 +try_handle_exit: at kernel/src/arch/x86_64/vm/vmx/exit.rs:250 +vmlaunch : at kernel/src/arch/x86_64/vm/vmx/vmenter.S:103 +page fault :kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs:105 + +kernel/src/mm/kernel_mapper.rs + + diff --git a/kernel/src/arch/x86_64/mod.rs b/kernel/src/arch/x86_64/mod.rs index 11c7ff27b..73cc1ea83 100644 --- a/kernel/src/arch/x86_64/mod.rs +++ b/kernel/src/arch/x86_64/mod.rs @@ -47,3 +47,4 @@ pub use crate::arch::vm::kvm_host::X86KvmArch as KvmArch; pub use crate::arch::vm::x86_kvm_ops as kvm_arch_ops; pub use crate::arch::vm::kvm_host::vcpu::X86VcpuArch as VirtCpuArch; +pub use crate::arch::vm::kvm_host::KvmVcpuStat as VirtCpuStat; diff --git a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs index e61cb5047..f16093eac 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs @@ -1,4 +1,4 @@ -use core::fmt::Debug; +use core::{fmt::Debug, sync::atomic::AtomicU32}; use alloc::{boxed::Box, vec::Vec}; use bit_field::BitField; @@ -38,6 +38,8 @@ pub const TSS_IOPB_SIZE: usize = 65536 / 8; pub const TSS_REDIRECTION_SIZE: usize = 256 / 8; pub const RMODE_TSS_SIZE: usize = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1; +pub const KVM_PFN_NOSLOT:u64 = 0x1 << 63; + #[derive(Debug, Default)] pub struct X86KvmArch { /// 中断芯片模式 @@ -60,6 +62,8 @@ pub struct X86KvmArch { pub notify_window: u32, msr_fliter: Option>, + + pub noncoherent_dma_count:AtomicU32, } impl X86KvmArch { @@ -216,7 +220,7 @@ pub trait KvmFunc: Send + Sync + Debug { &self, vcpu: &mut VirtCpu, fastpath: ExitFastpathCompletion, - ) -> Result<(), SystemError>; + ) -> Result; } /// ## 中断抑制的原因位 @@ -335,8 +339,8 @@ pub enum KvmReg { VcpuExregCr4, VcpuExregRflags, VcpuExregSegments, - VcpuExregExitInfo1, - VcpuExregExitInfo2, + VcpuExregExitInfo1, //EXITINFO1 provides the linear address of the memory operand. + VcpuExregExitInfo2, //EXITINFO2 provides the contents of the register operand. } bitflags! { @@ -384,3 +388,65 @@ impl Vm { return x86_kvm_ops().vcpu_precreate(self); } } +bitflags! 
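// Typical composition of the flags defined just below, mirroring how the
// new page_fault() path in mmu_internal.rs (later in this patch) starts
// from EmulType::PF and ORs in more bits as it learns about the fault
// (`fault_is_retryable` is a hypothetical predicate):
//
//     let mut emulation_type = EmulType::PF;
//     if fault_is_retryable {
//         emulation_type |= EmulType::ALLOW_RETRY_PF;
//     }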
{ + pub struct EmulType: u32 { + const NO_DECODE = 1 << 0; + const TRAP_UD = 1 << 1; + const SKIP = 1 << 2; + const ALLOW_RETRY_PF = 1 << 3; + const TRAP_UD_FORCED = 1 << 4; + const VMWARE_GP = 1 << 5; + const PF = 1 << 6; + const COMPLETE_USER_EXIT = 1 << 7; + const WRITE_PF_TO_SP = 1 << 8; + } +} +#[derive(Default,Debug)] +///用于跟踪和记录VCPU的各种统计信息。 +pub struct KvmVcpuStat { + //pub generic: KvmVcpuStatGeneric, + pub pf_taken: u64, + pub pf_fixed: u64, + pub pf_emulate: u64, + pub pf_spurious: u64, + pub pf_fast: u64, + pub pf_mmio_spte_created: u64, + pub pf_guest: u64, + pub tlb_flush: u64, + pub invlpg: u64, + pub exits: u64, + pub io_exits: u64, + pub mmio_exits: u64, + pub signal_exits: u64, + pub irq_window_exits: u64, + pub nmi_window_exits: u64, + pub l1d_flush: u64, + pub halt_exits: u64, + pub request_irq_exits: u64, + pub irq_exits: u64, + pub host_state_reload: u64, + pub fpu_reload: u64, + pub insn_emulation: u64, + pub insn_emulation_fail: u64, + pub hypercalls: u64, + pub irq_injections: u64, + pub nmi_injections: u64, + pub req_event: u64, + pub nested_run: u64, + pub directed_yield_attempted: u64, + pub directed_yield_successful: u64, + pub preemption_reported: u64, + pub preemption_other: u64, + pub guest_mode: u64, + pub notify_window_exits: u64, +} +#[inline] +/// 将 GFN 转换为 GPA +pub fn gfn_to_gpa(gfn: u64) -> u64 { + gfn << 12 +} +#[inline] +/// 将 GPA 转换为 GFN +pub fn gpa_to_gfn(gfn: u64) -> u64 { + gfn >> 12 +} \ No newline at end of file diff --git a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs index e640fc3b4..66e2e40e3 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs @@ -26,7 +26,7 @@ use crate::{ asm::{hyperv, kvm_msr, KvmX86Asm, MiscEnable, MsrData, VcpuSegment}, cpuid::KvmCpuidEntry2, kvm_host::KvmReg, - mmu::LockedKvmMmu, + mmu::mmu::LockedKvmMmu, uapi::{UapiKvmSegmentRegs, KVM_SYNC_X86_VALID_FIELDS}, vmx::{vmcs::ControlsType, vmx_info}, x86_kvm_manager, x86_kvm_manager_mut, x86_kvm_ops, @@ -141,6 +141,9 @@ pub struct X86VcpuArch { pub at_instruction_boundary: bool, pub db: [usize; Self::KVM_NR_DB_REGS], + + /* set at EPT violation at this point */ + pub exit_qual: u64, } impl X86VcpuArch { @@ -155,6 +158,7 @@ impl X86VcpuArch { ret.mp_state = MutilProcessorState::Runnable; ret.apic = None; + //max_phyaddr=?? 
fztodo *ret } @@ -839,7 +843,10 @@ impl VirtCpu { // TODO: 一些中断或者tsc操作 - return x86_kvm_ops().handle_exit(self, exit_fastpath); + match x86_kvm_ops().handle_exit(self, exit_fastpath){ + Err(err)=>return Err(err), + Ok(_)=>{Ok(())} + } } fn flush_tlb_all(&mut self) { diff --git a/kernel/src/arch/x86_64/vm/mmu.rs b/kernel/src/arch/x86_64/vm/mmu/mmu.rs similarity index 83% rename from kernel/src/arch/x86_64/vm/mmu.rs rename to kernel/src/arch/x86_64/vm/mmu/mmu.rs index 504e9a107..5aadd7834 100644 --- a/kernel/src/arch/x86_64/vm/mmu.rs +++ b/kernel/src/arch/x86_64/vm/mmu/mmu.rs @@ -1,4 +1,12 @@ -use crate::{arch::mm::X86_64MMArch, kdebug}; +use crate::kerror; +use crate::virt::kvm::host_mem::PAGE_SHIFT; +use crate::{arch::mm::X86_64MMArch, kdebug, kwarn}; +use crate::{ + arch::{mm::LockedFrameAllocator, MMArch, VirtCpuArch}, + libs::spinlock::{SpinLock, SpinLockGuard}, + mm::{page::PageMapper, MemoryManagementArch, PageTableKind}, + virt::vm::kvm_host::{vcpu::VirtCpu, Vm}, +}; use alloc::{sync::Arc, vec::Vec}; use bitfield_struct::bitfield; use core::intrinsics::likely; @@ -7,14 +15,11 @@ use system_error::SystemError; use x86::controlregs::{Cr0, Cr4}; use x86_64::registers::control::EferFlags; -use crate::{ - arch::{mm::LockedFrameAllocator, MMArch, VirtCpuArch}, - libs::spinlock::{SpinLock, SpinLockGuard}, - mm::{page::PageMapper, MemoryManagementArch, PageTableKind}, - virt::vm::kvm_host::{vcpu::VirtCpu, Vm}, +use super::super::{ + vmx::vmx_info, + x86_kvm_ops, }; - -use super::{vmx::vmx_info, x86_kvm_ops}; +use super::mmu_internal::KvmPageFault; const PT64_ROOT_5LEVEL: usize = 5; const PT64_ROOT_4LEVEL: usize = 4; @@ -31,15 +36,50 @@ static mut SHADOW_ACCESSED_MASK: usize = 0; static mut MAX_HUGE_PAGE_LEVEL: PageLevel = PageLevel::None; +pub const PAGE_SIZE: u64 = 1 << PAGE_SHIFT; + +pub fn is_tdp_mmu_enabled()->bool{ + unsafe { TDP_MMU_ENABLED } +} + #[allow(dead_code)] +#[repr(u8)] pub enum PageLevel { None, - Level4k, + Level4K, Level2M, Level1G, Level512G, LevelNum, } +impl PageLevel { + fn kvm_hpage_gfn_shift(level: u8) -> u32 { + ((level - 1) * 9) as u32 + } + + fn kvm_hpage_shift(level: u8) -> u32 { + PAGE_SHIFT + Self::kvm_hpage_gfn_shift(level) + } + + fn kvm_hpage_size(level: u8) -> u64 { + 1 << Self::kvm_hpage_shift(level) + } + /// 计算每个大页包含的页数 + /// + /// # 参数 + /// - `level`: 页级别 + /// + /// # 返回值 + /// 返回每个大页包含的页数 + pub fn kvm_pages_per_hpage(level: u8) -> u64 { + Self::kvm_hpage_size(level) / PAGE_SIZE + } + +} +///计算给定 GFN(Guest Frame Number)在指定级别上的对齐值 +pub fn gfn_round_for_level(gfn: u64, level: u8) -> u64 { + gfn & !(PageLevel::kvm_pages_per_hpage(level) - 1) +} #[derive(Debug)] pub struct LockedKvmMmu { @@ -58,12 +98,16 @@ impl LockedKvmMmu { } } +pub type KvmMmuPageFaultHandler = + fn(vcpu: &mut VirtCpu, page_fault:&KvmPageFault) -> Result; + #[derive(Debug, Default)] #[allow(dead_code)] pub struct KvmMmu { pub root: KvmMmuRootInfo, pub cpu_role: KvmCpuRole, pub root_role: KvmMmuPageRole, + pub page_fault: Option, pkru_mask: u32, @@ -148,7 +192,7 @@ impl PartialEq for KvmCpuRole { pub struct KvmMmuPageRole { /// 表示页表级别,占用 4 位。对于普通的页表,取值是 2(二级页表)、3(三级页表)、4(四级页表)和 5(五级页表) #[bits(4)] - level: u32, + pub level: u32, /// 页表项是否为 4 字节,占用 1 位。在非 PAE 分页模式下,该值为 1 has_4_byte_gpte: bool, /// 表示页表项所在的象限,占用 2 位。该字段仅在 has_4_byte_gpte 为 1 时有效。 @@ -180,7 +224,7 @@ pub struct KvmMmuPageRole { unused: u32, /// 表示 SMM(System Management Mode)模式 #[bits(8)] - smm: u32, + pub smm: u32, } impl KvmMmuPageRole { @@ -191,6 +235,9 @@ impl KvmMmuPageRole { pub fn is_cr4_pae(&self) -> bool { 
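        // PAE and long-mode paging use 8-byte guest PTEs; only legacy
        // 32-bit paging uses 4-byte entries, hence the negation below.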
!self.has_4_byte_gpte() } + pub fn get_direct(&self) -> bool { + self.direct() + } } #[bitfield(u32)] @@ -213,6 +260,38 @@ pub struct KvmMmuRoleRegs { pub efer: EferFlags, } +/// page falut的返回值, 用于表示页面错误的处理结果 +/// 应用在handle_mmio_page_fault()、mmu.page_fault()、fast_page_fault()和 +/// kvm_mmu_do_page_fault()等 +#[derive(Debug, Eq, PartialEq, FromPrimitive)] +#[repr(u32)] +pub enum PFRet { + Continue, // RET_PF_CONTINUE: 到目前为止一切正常,继续处理页面错误。 + Retry, // RET_PF_RETRY: 让 CPU 再次对该地址发生页面错误。 + Emulate, // RET_PF_EMULATE: MMIO 页面错误,直接模拟指令。 + Invalid, // RET_PF_INVALID: SPTE 无效,让实际的页面错误路径更新它。 + Fixed, // RET_PF_FIXED: 故障的条目已经被修复 + Spurious, // RET_PF_SPURIOUS: 故障的条目已经被修复,例如由另一个 vCPU 修复。 + Err = u32::MAX, // 错误 +} +impl From for u64 { + fn from(pf_ret: PFRet) -> Self { + pf_ret as u64 + } +} +impl From for PFRet { + fn from(value: u64) -> Self { + match value { + 0 => PFRet::Continue, + 1 => PFRet::Retry, + 2 => PFRet::Emulate, + 3 => PFRet::Invalid, + 4 => PFRet::Fixed, + 5 => PFRet::Spurious, + _ => PFRet::Err, // 默认返回 Invalid + } + } +} impl VirtCpuArch { pub fn kvm_init_mmu(&mut self) { let regs = self.role_regs(); diff --git a/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs b/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs new file mode 100644 index 000000000..4267394f2 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs @@ -0,0 +1,373 @@ +use core::{intrinsics::unlikely, ops::Index}; +use alloc::{ + boxed::Box, + sync::{Arc, Weak}, + vec::Vec, +}; + +use system_error::SystemError; + +use crate::{ + arch::vm::{ + kvm_host::{EmulType,KVM_PFN_NOSLOT} ,mmu::{mmu::{PFRet, PageLevel}, tdp_iter:: + { is_large_pte, is_shadow_present_pte, TdpIter}}, mtrr::kvm_mtrr_check_gfn_range_consistency, vmx::PageFaultErr + }, + kwarn, + mm::{virt_2_phys, PhysAddr}, + virt::{ + kvm::host_mem::PAGE_SHIFT, + vm::kvm_host::{mem::{LockedKvmMemSlot, LockedVmMemSlotSet, UserMemRegionFlag, __gfn_to_pfn_memslot}, search_memslots, vcpu::VirtCpu}, + }, +}; + +use super::mmu::{gfn_round_for_level, is_tdp_mmu_enabled, KvmMmuPageRole}; + +#[derive(Debug, Default)] +pub struct KvmMmuPage{ + pub tdp_mmu_page:bool,// 标记是否为 TDP(Two-Dimensional Paging)页表页 + pub gfn: u64,// 客户机帧号(Guest Frame Number) + + /* + * The following two entries are used to key the shadow page in the + * hash table.暫時沒看出來 + */ + pub role: KvmMmuPageRole, + pub spt: u64,// 指向页表条目(SPTE)的指针 + pub mmu_seq: u64, + pub map_writable: bool, + pub write_fault_to_shadow_pgtable: bool, +} + + +#[derive(Debug, Default)] +pub struct KvmPageFault { + // vcpu.do_page_fault 的参数 + addr: PhysAddr, // gpa_t 通常是一个 64 位地址 + error_code: u32, + prefetch: bool, + + // 从 error_code 派生 + exec: bool, + write: bool, + present: bool, + rsvd: bool, + user: bool, + + // 从 mmu 和全局状态派生 + is_tdp: bool, + nx_huge_page_workaround_enabled: bool, + + // 是否可以创建大于 4KB 的映射,或由于 NX 大页被禁止 + huge_page_disallowed: bool, + + // 此故障可以创建的最大页面大小 + max_level: u8, + + // 基于 max_level 和主机映射使用的页面大小可以创建的页面大小 + req_level: u8, + + // 基于 req_level 和 huge_page_disallowed 将创建的页面大小 + goal_level: u8, + + // 移位后的 addr,或如果 addr 是 gva 则是访客页表遍历的结果 + gfn: u64, // gfn_t 通常是一个 64 位地址 + + // 包含 gfn 的 memslot。可能为 None + slot: Option>, + + // kvm_faultin_pfn 的输出 + mmu_seq: u64, + pfn: u64, // kvm_pfn_t 通常是一个 64 位地址 + hva: u64, // hva_t 通常是一个 64 位地址 + map_writable: bool, + + // 表示访客正在尝试写入包含用于翻译写入本身的一个或多个 PTE 的 gfn + write_fault_to_shadow_pgtable: bool, +} + +impl VirtCpu { + #[inline(never)] + pub fn page_fault( + &mut self, + cr2_or_gpa: u64, + mut error_code: u64, + insn: Option, + insn_len: usize, + ) -> Result { + let 
mut emulation_type = EmulType::PF; + let direct = self.arch.mmu().root_role.get_direct(); + // IMPLICIT_ACCESS 是一个 KVM 定义的标志,用于在模拟触发隐式访问的指令时正确执行 SMAP 检查。 + // 防止内核态代码(超级用户模式)访问用户态内存。它是通过设置 CR4 寄存器中的 SMAP 位来启用的。 + // 如果硬件生成的错误代码与 KVM 定义的值冲突,则发出警告。 + // 清除该标志并继续,不终止虚拟机,因为 KVM 不可能依赖于 KVM 不知道的标志。 + if error_code & PageFaultErr::PFERR_IMPLICIT_ACCESS.bits() != 0 { + kwarn!("Implicit access error code detected"); + error_code &= !PageFaultErr::PFERR_IMPLICIT_ACCESS.bits(); + } + + //if self.arch.mmu().root.hpa != KvmMmu::INVALID_PAGE { + // return Ok(PFRet::Retry as u64); + //} + + let mut r = PFRet::Invalid; + if unlikely(error_code & PageFaultErr::PFERR_RSVD.bits() != 0) { + todo!(); + // r = self.handle_mmio_page_fault(cr2_or_gpa, direct)?; + // if r == PFRes::Emulate{ + // return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,insn_len) insn_len); + // } + } + + if r == PFRet::Invalid { + r = self + .do_page_fault(cr2_or_gpa, error_code as u32, false, emulation_type)? + .into(); + if r == PFRet::Invalid { + return Err(SystemError::EIO); + } + } + + if r == PFRet::Err { + //return SystemError::EFAULT; + todo!() + } + if r != PFRet::Emulate { + return Ok(1); + } + + // 在模拟指令之前,检查错误代码是否由于在翻译客户机页面时的只读(RO)违规。 + // 这可能发生在使用嵌套虚拟化和嵌套分页的情况下。如果是这样,我们只需取消页面保护并恢复客户机。 + let pferr_nested_guest_page = PageFaultErr::PFERR_GUEST_PAGE + | PageFaultErr::PFERR_WRITE + | PageFaultErr::PFERR_PRESENT; + if self.arch.mmu().root_role.get_direct() + && (error_code & pferr_nested_guest_page.bits()) == pferr_nested_guest_page.bits() + { + todo!() + } + + // self.arch.mmu.page_fault 返回 RET_PF_EMULATE,但我们仍然可以乐观地尝试取消页面保护, + // 并让处理器重新执行导致页面故障的指令。不允许重试 MMIO 模拟,因为这不仅毫无意义, + // 而且可能导致进入无限循环,因为处理器会不断在不存在的 MMIO 地址上发生故障。 + // 重试来自嵌套客户机的指令也是毫无意义且危险的,因为我们只显式地影子 L1 的页表, + // 即为 L1 取消保护并不会神奇地修复导致 L2 失败的问题。 + // if !self.mmio_info_in_cache(cr2_or_gpa, direct) && !self.arch.is_guest_mode() { + // emulation_type |= EmulType::ALLOW_RETRY_PF; + // } + + // self.emulate_instruction(cr2_or_gpa, emulation_type, insn, insn_len) + todo!("emulate_instruction") + } + fn do_page_fault( + &mut self, + cr2_or_gpa: u64, + error_code: u32, + prefetch: bool, + mut emultype: EmulType, + ) -> Result { + //初始化page fault + let mut page_fault = KvmPageFault { + addr: PhysAddr::new(cr2_or_gpa as usize), + error_code, + exec: error_code & PageFaultErr::PFERR_FETCH.bits() as u32 != 0, + write: error_code & PageFaultErr::PFERR_WRITE.bits() as u32 != 0, + present: error_code & PageFaultErr::PFERR_PRESENT.bits() as u32 != 0, + rsvd: error_code & PageFaultErr::PFERR_RSVD.bits() as u32 != 0, + user: error_code & PageFaultErr::PFERR_USER.bits() as u32 != 0, + prefetch, + is_tdp : true, + nx_huge_page_workaround_enabled: false, //todo + max_level: PageLevel::Level1G as u8, + req_level: PageLevel::Level4K as u8, + goal_level: PageLevel::Level4K as u8, + ..Default::default() + }; + //处理直接映射 + if self.arch.mmu().root_role.get_direct() { + page_fault.gfn = (page_fault.addr.data() >> PAGE_SHIFT) as u64; + page_fault.slot = self.gfn_to_memslot(page_fault.gfn);//kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);没完成 + } + //异步页面错误(Async #PF),也称为预取错误(prefetch faults), + //从客机(guest)的角度来看并不是错误,并且已经在原始错误发生时被计数。 + if !prefetch { + self.stat.pf_taken += 1; + } + + let r = if page_fault.is_tdp { + self.tdp_page_fault(&mut page_fault).unwrap() + } else { + let handle = self.arch.mmu().page_fault.unwrap(); + handle(self, &page_fault).unwrap() + }; + + if page_fault.write_fault_to_shadow_pgtable { + emultype |= EmulType::WRITE_PF_TO_SP; + } + 
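        // Worked example of the decode above, assuming the architectural
        // x86 #PF error-code layout (bit 0 P, bit 1 W/R, bit 2 U/S,
        // bit 3 RSVD, bit 4 I/D): error_code == 0x6 yields write == true,
        // user == true, present == false, i.e. a user-mode write to a
        // page that is not yet mapped.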
//类似于上面的情况,预取错误并不是真正的虚假错误,并且异步页面错误路径不会进行仿真。 + //然而,确实要统计由异步页面错误处理程序修复的错误,否则它们将永远不会被统计。 + match PFRet::from(r) { + PFRet::Fixed => self.stat.pf_fixed += 1, + PFRet::Emulate => self.stat.pf_emulate += 1, + PFRet::Spurious => self.stat.pf_spurious += 1, + _ => {} + } + + Ok(r) + } + + fn gfn_to_memslot(&self, gfn: u64) -> Option> { + let slot_set: Arc = self.kvm_vcpu_memslots(); + //...todo + + search_memslots(slot_set, gfn) + + } + pub fn kvm_vcpu_memslots(&self) ->Arc { + let binding = self.kvm(); + let kvm = binding.lock(); + kvm.memslots.index(0).clone() + } + fn tdp_page_fault(&mut self, page_fault: &mut KvmPageFault) -> Result { + // 如果 shadow_memtype_mask 为真,并且虚拟机有非一致性 DMA + //if shadow_memtype_mask != 0 && self.kvm().lock().arch.noncoherent_dma_count > 0 { + while page_fault.max_level > PageLevel::Level4K as u8{ + let page_num = PageLevel::kvm_pages_per_hpage(page_fault.max_level); + + //低地址对齐 + let base = gfn_round_for_level(page_fault.gfn, page_fault.max_level); + + //检查给定 GFN 范围内的内存类型是否一致,暂未实现 + if kvm_mtrr_check_gfn_range_consistency(self, base, page_num) { + break; + } + + page_fault.max_level -= 1; + } + //} + + if is_tdp_mmu_enabled() { + return self.kvm_tdp_mmu_page_fault(page_fault); + } + + self.direct_page_fault(page_fault) + } + fn kvm_tdp_mmu_page_fault(&self,page_fault: &mut KvmPageFault)->Result{ + + //page_fault_handle_page_track(page_fault) + //fast_page_fault(page_fault); + //mmu_topup_memory_caches(false);//补充内存缓存 + let mut r= self.kvm_faultin_pfn(page_fault, 1|1<<1|1<<2).unwrap(); + if r != PFRet::Continue { + return Ok(r.into()); + } + + r = PFRet::Retry; + //实际的映射 + self.tdp_map(page_fault); + Ok(r.into()) + } + fn tdp_map(&self,page_fault: &mut KvmPageFault)->Result{ + //没有实现SPTE,huge page相关 + let mmu=self.arch.mmu(); + let kvm = self.kvm(); + let ret = PFRet::Retry; + + + let mut tdp_iter : TdpIter=TdpIter::default(); + + + tdp_iter.start(virt_2_phys(mmu.root.hpa as usize)/*__va */,mmu.root_role.level() as u8, + PageLevel::Level4K as u8,page_fault.gfn ); + for iter in tdp_iter{ + if !(iter.valid && iter.gfnpa的转换可能有点问题 + + + // 如果启用了 NX 巨大页解决方法,则进行调整 + if page_fault.nx_huge_page_workaround_enabled { + page_fault.nx_huge_page_workaround_enabled = false; + } + + if iter.level == page_fault.goal_level{ + //self.map_handle_target_level(page_fault,&mut iter); + } + + //如果在比目标更高的级别有一个映射大页的 SPTE, + //那么该 SPTE 必须被清除并替换为非叶子 SPTE。 + if is_shadow_present_pte(iter.old_spte) && !is_large_pte(iter.old_spte) { + continue; + } + + //SPTE是non-present或者指向一个需要split的大页 + + + } + todo!() + } + ///todo()!!! 
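The walk that `tdp_map` performs is easier to follow on a toy structure: descend one level at a time, materialize missing intermediate tables, and stop at the goal level to install the leaf. A self-contained sketch under that reading (heap-allocated software tables stand in for real SPTE pages; none of these names are kernel APIs):

```rust
// Illustrative only: a software page table standing in for SPTE memory.
const PT_LEVEL_BITS: u64 = 9;

enum Entry {
    Empty,
    Table(Box<Node>), // non-leaf SPTE pointing at a child table
    Leaf(u64),        // leaf SPTE holding a pfn
}

struct Node {
    entries: Vec<Entry>, // 512 entries per table, like PT64_ENT_PER_PAGE
}

impl Node {
    fn new() -> Box<Node> {
        Box::new(Node {
            entries: (0..512).map(|_| Entry::Empty).collect(),
        })
    }
}

/// Index of `gfn` within the table at `level` (level 1 maps 4KiB leaves).
fn pt_index(gfn: u64, level: u8) -> usize {
    ((gfn >> ((level as u64 - 1) * PT_LEVEL_BITS)) & 0x1ff) as usize
}

/// Walk from a level-4 root toward level 1, creating tables on demand,
/// then install `pfn` as the leaf mapping for `gfn`.
fn map_gfn(root: &mut Node, gfn: u64, pfn: u64) {
    let mut node = root;
    for level in (2..=4u8).rev() {
        let idx = pt_index(gfn, level);
        if matches!(node.entries[idx], Entry::Empty) {
            node.entries[idx] = Entry::Table(Node::new());
        }
        node = match &mut node.entries[idx] {
            Entry::Table(child) => child.as_mut(),
            // A huge-page leaf above the goal level would first need
            // splitting; the real code replaces it with a non-leaf SPTE.
            _ => unreachable!(),
        };
    }
    node.entries[pt_index(gfn, 1)] = Entry::Leaf(pfn);
}

fn main() {
    let mut root = Node::new();
    map_gfn(&mut root, 0xabcde, 0x1234);
}
```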
+ fn map_handle_target_level(&self,page_fault:&mut KvmPageFault,iter: &mut TdpIter){ + todo!() + } + + fn direct_page_fault(&self,page_fault: &KvmPageFault)->Result{ + todo!() + } + fn kvm_faultin_pfn(&self,page_fault: &mut KvmPageFault,access: u32)->Result{ + page_fault.mmu_seq = self.kvm().lock().mmu_invalidate_seq; + self.__kvm_faultin_pfn(page_fault) + } + fn __kvm_faultin_pfn(&self,page_fault: &mut KvmPageFault)->Result{ + let slot = &page_fault.slot; + let mut is_async = false; + if slot.is_none() { + return Err(SystemError::KVM_HVA_ERR_BAD); + } + let slot = slot.as_ref().unwrap().read(); + + if slot.get_flags().bits()& UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits()!= 0 { + return Ok(PFRet::Retry); + } + if !slot.is_visible(){ + /* 不要将私有内存槽暴露给 L2。 */ + if self.arch.is_guest_mode() { + drop(slot); + page_fault.slot = None; + page_fault.pfn = KVM_PFN_NOSLOT; + page_fault.map_writable = false; + return Ok(PFRet::Continue); + } + /* + * 如果 APIC 访问页面存在但被禁用,则直接进行仿真, + * 而不缓存 MMIO 访问或创建 MMIO SPTE。 + * 这样,当 AVIC 重新启用时,不需要清除缓存。 + */ + // if slot.get_id() == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && !self.kvm_apicv_activated() + // { + // return PFRet::Emulate; + // } + } + + + // 尝试将 GFN 转换为 PFN + page_fault.pfn = __gfn_to_pfn_memslot(Some(&slot), page_fault.gfn, false, false, &mut is_async, + page_fault.write, &mut page_fault.map_writable, + &mut page_fault.hva)?; + if !is_async { + return Ok(PFRet::Continue); /* *pfn 已经有正确的页面 */ + } + + // if !page_fault.prefetch && self.kvm_can_do_async_pf() { + // self.trace_kvm_try_async_get_page(page_fault.addr, page_fault.gfn); + // if self.kvm_find_async_pf_gfn(page_fault.gfn) { + // self.trace_kvm_async_pf_repeated_fault(page_fault.addr, page_fault.gfn); + // self.kvm_make_request(KVM_REQ_APF_HALT); + // return Ok(PFRet::Retry); + // } else if self.kvm_arch_setup_async_pf(page_fault.addr, page_fault.gfn) { + // return Ok(PFRet::Retry); + // } + // } + Ok(PFRet::Continue) + } +} diff --git a/kernel/src/arch/x86_64/vm/mmu/mod.rs b/kernel/src/arch/x86_64/vm/mmu/mod.rs new file mode 100644 index 000000000..9c2f87955 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/mmu/mod.rs @@ -0,0 +1,4 @@ +pub mod mmu; +pub mod mmu_internal; +pub mod tdp_iter; +pub mod pte; \ No newline at end of file diff --git a/kernel/src/arch/x86_64/vm/mmu/pte.rs b/kernel/src/arch/x86_64/vm/mmu/pte.rs new file mode 100644 index 000000000..052def047 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/mmu/pte.rs @@ -0,0 +1,43 @@ +bitflags::bitflags! { + pub struct PteFlags: u64 { + const PRESENT = 1 << 0; + const READ_WRITE = 1 << 1; + const USER_SUPERVISOR = 1 << 2; + const PAGE_WRITE_THROUGH = 1 << 3; + const PAGE_CACHE_DISABLE = 1 << 4; + const ACCESSED = 1 << 5; + const DIRTY = 1 << 6; + const PAGE_SIZE = 1 << 7; + const GLOBAL = 1 << 8; + const EXECUTE_DISABLE = 1 << 63; + } +} + +pub struct Pte { + pub address: u64, // 物理地址 + pub flags: PteFlags, // 页表条目标志 +} + +impl Pte { + pub fn new(address: u64, flags: PteFlags) -> Self { + Self { address, flags } + } + + pub fn is_present(&self) -> bool { + self.flags.contains(PteFlags::PRESENT) + } + + pub fn is_read_write(&self) -> bool { + self.flags.contains(PteFlags::READ_WRITE) + } + + pub fn is_user_supervisor(&self) -> bool { + self.flags.contains(PteFlags::USER_SUPERVISOR) + } + + pub fn is_executable(&self) -> bool { + !self.flags.contains(PteFlags::EXECUTE_DISABLE) + } + + // 其他方法... 
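A few helpers would fit naturally under the `// 其他方法...` placeholder above; here is a hypothetical extension, assuming the `Pte`/`PteFlags` definitions from this file and the x86-64 convention that bits 51:12 of a 4KiB PTE hold the physical page base:

```rust
// Hypothetical additions to impl Pte, not part of the patch.
impl Pte {
    /// Physical frame number encoded in bits 51:12 of the address field.
    pub fn pfn(&self) -> u64 {
        const ADDR_MASK: u64 = ((1u64 << 52) - 1) & !0xfff;
        (self.address & ADDR_MASK) >> 12
    }

    /// True for a PDE/PDPTE that maps a huge page directly (PS bit set).
    pub fn maps_huge_page(&self) -> bool {
        self.flags.contains(PteFlags::PAGE_SIZE)
    }

    /// The CPU has touched this mapping since the bit was last cleared.
    pub fn is_accessed(&self) -> bool {
        self.flags.contains(PteFlags::ACCESSED)
    }
}
```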
+} \ No newline at end of file diff --git a/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs b/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs new file mode 100644 index 000000000..a0298a022 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs @@ -0,0 +1,203 @@ +use crate::{arch::vm::mmu::mmu::gfn_round_for_level, mm::{virt_2_phys, PhysAddr, VirtAddr}, time::sleep, virt::kvm::host_mem::PAGE_SHIFT}; + +use super::{mmu::{PageLevel, PAGE_SIZE}, mmu_internal::KvmMmuPage}; + + +pub const PT64_ROOT_MAX_LEVEL: usize = 5; //通常只用到4级,但是确实有5级的情况 +pub const PT_LEVEL_BITS: u8 = 9; // 每个页表级别的位数 +pub const PT64_ENT_PER_PAGE: u32 = 1<<9; +pub const PTE_LEN: usize = 64; + +//Bits 51:12 are from the EPT PDPTE +pub const PT64_BASE_ADDR_MASK: u64 = ((1u64 << 52) - 1) & !(PAGE_SIZE - 1); + +pub fn shadow_pt_index(addr: u64, level: u8) -> u64 { + (addr >> (PAGE_SHIFT as u8 + (level - 1) * PT_LEVEL_BITS)) & ((1 << PT_LEVEL_BITS) - 1) +} +pub fn is_last_spte(pte: u64, level: u8) -> bool { + level == PageLevel::Level4K as u8 || is_large_pte(pte) +} +pub fn is_shadow_present_pte(pte :u64) ->bool{ + pte & 1<<11 !=0//在intel手冊中:ept PTE:11 Ignored.不是很懂 +} +pub fn is_large_pte(pte :u64) ->bool{ + pte & 1<<7 !=0//在intel手冊中:ept PTE:7 Ignored. +} +///Bits 51:12 are from the EPT PDPTE +pub fn spte_to_pfn(pte: u64) -> u64 { + (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT +} + +#[derive(Default)] +pub struct TdpIter{ + inner: TdpIterInner, +} + +impl TdpIter{ + pub fn start(&self,root_pt:usize,root_level:u8,min_level:u8,next_last_level_gfn:u64)->Self{ + let mut inner = self.inner.clone(); + inner.start(root_pt,root_level,min_level,next_last_level_gfn); + TdpIter{ + inner + } + } +} +///迭代器将遍历分页结构,直到找到此 GFN 的映射。 +#[derive(Default,Clone)] +pub struct TdpIterInner { + next_last_level_gfn: u64, + /// 线程上次让出时的 next_last_level_gfn。 + /// 仅当 next_last_level_gfn != yielded_gfn 时让出,有助于确保前进。 + pub yielded_gfn: u64, + + ///指向遍历到当前 SPTE 的页表的指针 + pt_path: [u64; PT64_ROOT_MAX_LEVEL], + + ///指向当前 SPTE 的指针 是hva吗? + sptep: PhysAddr, + + /// 当前 SPTE 映射的最低 GFN hpa>>shift? 
+ pub gfn: u64, + + ///给迭代器的根页级别 + pub root_level: u8, + + ///迭代器应遍历到的最低级别 + pub min_level: u8, + + ///迭代器在分页结构中的当前级别 + pub level: u8, + + ///sptep 处值的快照 + pub old_spte: u64, + + ///迭代器是否具有有效状态。如果迭代器走出分页结构的末端,则为 false。 + /// + pub valid: bool, +} +impl TdpIterInner{ + ///初始化ept iter + pub fn start(&mut self,root_pt :usize,root_level:u8, + min_level: u8,next_last_level_gfn:u64){ + // if root_pt.role.level() == 0 || root_pt.role.level() > PT64_ROOT_MAX_LEVEL as u32 { + // self.valid = false; + // return; + // } + + if root_level < 1 || root_level > PT64_ROOT_MAX_LEVEL as u8 { + self.valid = false; + return; + } + self.next_last_level_gfn = next_last_level_gfn; + self.root_level = root_level as u8; + self.min_level = min_level as u8; + self.pt_path[(self.root_level - 1) as usize] =root_pt as u64; + self.yielded_gfn=self.next_last_level_gfn; + self.level = self.root_level; + + self.gfn = gfn_round_for_level(self.next_last_level_gfn, self.level); + self.tdp_iter_refresh_sptep(); + self.valid = true; + } + + /* + * 重新计算当前GFN和level和SPTE指针,并重新读取SPTE。 + */ + fn tdp_iter_refresh_sptep(&mut self){ + self.sptep = PhysAddr::new((self.pt_path[self.level as usize - 1] + + shadow_pt_index(self.gfn <bool{ + if self.level == self.min_level { + return false; + } + //在下降之前重新读取SPTE,以避免遍历到不再从此条目链接的页表中。 + self.old_spte =read_sptep(self.sptep); + + match spte_to_child_pt(self.old_spte,self.level){ + Some(child_pt) =>{ + self.level -= 1; + self.pt_path[self.level as usize - 1] = child_pt.data() as u64; + self.gfn = gfn_round_for_level(self.gfn, self.level); + self.tdp_iter_refresh_sptep(); + true + } + None => false, + + } + } + fn try_step_up(&mut self)->bool{ + if self.level == self.root_level { + return false; + } + self.level += 1; + self.gfn = gfn_round_for_level(self.gfn, self.level); + self.tdp_iter_refresh_sptep(); + true + } + ///在当前页表的当前级别中,移动到下一个条目。下一个条目可以指向一个page backing guest memory , + ///或者另一个页表,或者它可能是不存在的。如果迭代器能够移动到页表中的下一个条目,则返回true, + ///如果迭代器已经在当前页表的末尾,则返回false。 + fn try_step_side(&mut self)-> bool{ + //检查迭代器是否已经在当前页表的末尾。 + if shadow_pt_index(self.gfn< Option { + let inner=&mut self.inner; + if !inner.valid { + return None; + } + inner._next(); + if inner.valid { + Some(inner.clone()) + } else { + None + } + } + +} +///给定一个 SPTE 及其级别,返回一个指针,该指针包含 SPTE 所引用的子页表的hva。 +///如果没有这样的条目,则返回 null。 +/// +fn spte_to_child_pt(spte:u64,level:u8) ->Option{ + //没有子页表 + if !is_shadow_present_pte(spte) || is_last_spte(spte,level){ + return None; + } + Some(VirtAddr::new(virt_2_phys//__va + ((spte_to_pfn(spte)<u64{ + unsafe{*(sptep.data() as *const u64)} +} \ No newline at end of file diff --git a/kernel/src/arch/x86_64/vm/mod.rs b/kernel/src/arch/x86_64/vm/mod.rs index 4f9af1cc6..c35f63ac2 100644 --- a/kernel/src/arch/x86_64/vm/mod.rs +++ b/kernel/src/arch/x86_64/vm/mod.rs @@ -26,9 +26,10 @@ mod cpuid; pub(super) mod exit; pub mod kvm_host; pub mod mem; -mod mmu; +pub mod mmu; pub mod uapi; pub mod vmx; +pub mod mtrr; static mut KVM_X86_MANAGER: Option = None; diff --git a/kernel/src/arch/x86_64/vm/mtrr.rs b/kernel/src/arch/x86_64/vm/mtrr.rs new file mode 100644 index 000000000..873689a1d --- /dev/null +++ b/kernel/src/arch/x86_64/vm/mtrr.rs @@ -0,0 +1,37 @@ +use crate::virt::vm::kvm_host::vcpu::VirtCpu; + +use super::kvm_host::gfn_to_gpa; + +pub fn kvm_mtrr_check_gfn_range_consistency(vcpu: &mut VirtCpu, gfn: u64, page_num: u64) -> bool { + // let mtrr_state = &vcpu.arch.mtrr_state; + // let mut iter = MtrrIter { + // mem_type: -1, + // mtrr_disabled: false, + // partial_map: false, + // }; + let start = 
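The `gfn_to_gpa` call below is a plain shift by PAGE_SHIFT, and `kvm_pages_per_hpage` used earlier is a power-of-512 page count per level. A small self-contained check of that arithmetic (local copies for illustration only):

```rust
// Illustrative arithmetic only: gfn <-> gpa are shifts by PAGE_SHIFT, and
// each higher page-table level covers 512x more 4KiB frames.
const PAGE_SHIFT: u64 = 12;

fn gfn_to_gpa(gfn: u64) -> u64 {
    gfn << PAGE_SHIFT
}

fn gpa_to_gfn(gpa: u64) -> u64 {
    gpa >> PAGE_SHIFT
}

/// 4KiB frames covered by one mapping at `level` (1 = 4KiB, 2 = 2MiB, 3 = 1GiB).
fn pages_per_hpage(level: u8) -> u64 {
    1u64 << ((level as u64 - 1) * 9)
}

fn main() {
    assert_eq!(gfn_to_gpa(0x1234), 0x1234_000);
    assert_eq!(gpa_to_gfn(gfn_to_gpa(0x1234)), 0x1234);
    assert_eq!(pages_per_hpage(2), 512); // one 2MiB mapping
    assert_eq!(pages_per_hpage(3), 512 * 512); // one 1GiB mapping
}
```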
gfn_to_gpa(gfn); + let end = gfn_to_gpa(gfn + page_num); + + // mtrr_for_each_mem_type(&mut iter, mtrr_state, start, end, |iter| { + // if iter.mem_type == -1 { + // iter.mem_type = iter.mem_type; + // } else if iter.mem_type != iter.mem_type { + // return false; + // } + // }); + + // if iter.mtrr_disabled { + // return true; + // } + + // if !iter.partial_map { + // return true; + // } + + // if iter.mem_type == -1 { + // return true; + // } + + // iter.mem_type == mtrr_default_type(mtrr_state) + true +} \ No newline at end of file diff --git a/kernel/src/arch/x86_64/vm/vmx/capabilities.rs b/kernel/src/arch/x86_64/vm/vmx/capabilities.rs index 3e33ba7bb..573e6a02d 100644 --- a/kernel/src/arch/x86_64/vm/vmx/capabilities.rs +++ b/kernel/src/arch/x86_64/vm/vmx/capabilities.rs @@ -8,8 +8,9 @@ use x86::{ use crate::{ arch::vm::{ - mmu::PageLevel, CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR, PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR, - VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR, VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR, + mmu::mmu::PageLevel, CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR, + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR, VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR, + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR, }, virt::vm::kvm_host::vcpu::VirtCpu, }; @@ -388,7 +389,7 @@ impl Vmx { return PageLevel::Level2M; } - return PageLevel::Level4k; + return PageLevel::Level4K; } /// 判断mt(Memory type)是否为write back diff --git a/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs b/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs new file mode 100644 index 000000000..ab5263908 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs @@ -0,0 +1,31 @@ +use crate::libs::rwlock::RwLock; + +// pub const VMX_EPT_MT_EPTE_SHIFT:u64 = 3; +pub const VMX_EPT_RWX_MASK: u64 = 0x7 << 3; + +// Exit Qualifications for EPT Violations +pub const EPT_VIOLATION_ACC_READ_BIT: u64 = 0; +pub const EPT_VIOLATION_ACC_WRITE_BIT: u64 = 1; +pub const EPT_VIOLATION_ACC_INSTR_BIT: u64 = 2; +pub const EPT_VIOLATION_RWX_SHIFT: u64 = 3; +pub const EPT_VIOLATION_GVA_IS_VALID_BIT: u64 = 7; +pub const EPT_VIOLATION_GVA_TRANSLATED_BIT: u64 = 8; + +bitflags! 
{ + pub struct EptViolationExitQual :u64{ + const ACC_READ = 1 << EPT_VIOLATION_ACC_READ_BIT; + const ACC_WRITE = 1 << EPT_VIOLATION_ACC_WRITE_BIT; + const ACC_INSTR = 1 << EPT_VIOLATION_ACC_INSTR_BIT; + const RWX_MASK = VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT; + const GVA_IS_VALID = 1 << EPT_VIOLATION_GVA_IS_VALID_BIT; + const GVA_TRANSLATED = 1 << EPT_VIOLATION_GVA_TRANSLATED_BIT; + } +} +struct EptPageTable { + // EPT 页表数据结构 +} + +struct EptManager { + ept: RwLock, + // 其他字段 +} diff --git a/kernel/src/arch/x86_64/vm/vmx/exit.rs b/kernel/src/arch/x86_64/vm/vmx/exit.rs index d32168ff7..18dad7aaf 100644 --- a/kernel/src/arch/x86_64/vm/vmx/exit.rs +++ b/kernel/src/arch/x86_64/vm/vmx/exit.rs @@ -1,7 +1,14 @@ use bitfield_struct::bitfield; use system_error::SystemError; +use x86::vmx::vmcs::{guest, ro}; -use crate::virt::vm::kvm_host::vcpu::VirtCpu; +use crate::{ + arch::vm::asm::{IntrInfo, VmxAsm}, + virt::vm::kvm_host::vcpu::VirtCpu, +}; + +use super::{ept::EptViolationExitQual, vmx_info, PageFaultErr}; +extern crate num_traits; #[bitfield(u32)] pub struct VmxExitReason { @@ -24,6 +31,7 @@ pub struct VmxExitReason { pub failed_vmentry: bool, } +//#define VMX_EXIT_REASONS #[derive(FromPrimitive, PartialEq)] #[allow(non_camel_case_types)] pub enum VmxExitReasonBasic { @@ -69,7 +77,7 @@ pub enum VmxExitReasonBasic { VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT = 41, TPR_BELOW_THRESHOLD = 43, APIC_ACCESS = 44, - VIRTUALIZED_EOI = 45, + VIRTUALIZED_EOI = 45, // "EOI_INDUCED" ACCESS_GDTR_OR_IDTR = 46, ACCESS_LDTR_OR_TR = 47, EPT_VIOLATION = 48, @@ -180,25 +188,141 @@ pub enum ExitFastpathCompletion { ReenterGuest, ExitHandled, } +pub struct VmxExitHandlers {} +// //name 代表暂时不懂含义的(name linux=name DragonOS) +// ExceptionNmi = VmxExitReasonBasic::EXCEPTION_OR_NMI as isize, +// ExternalInterrupt = VmxExitReasonBasic::EXTERNAL_INTERRUPT as isize, +// TripleFault = VmxExitReasonBasic::TRIPLE_FAULT as isize, +// NmiWindow = VmxExitReasonBasic::NMI_WINDOW as isize, +// IoInstruction = VmxExitReasonBasic::IO_INSTRUCTION as isize, +// CrAccess = VmxExitReasonBasic::CR_ACCESS as isize, +// DrAccess = VmxExitReasonBasic::DR_ACCESS as isize, +// Cpuid = VmxExitReasonBasic::CPUID as isize, +// MsrRead = VmxExitReasonBasic::RDMSR as isize, +// MsrWrite = VmxExitReasonBasic::WRMSR as isize, +// InterruptWindow = VmxExitReasonBasic::INTERRUPT_WINDOW as isize, +// Hlt = VmxExitReasonBasic::HLT as isize, +// Invd = VmxExitReasonBasic::INVD as isize, +// Invlpg = VmxExitReasonBasic::INVLPG as isize, +// Rdpmc = VmxExitReasonBasic::RDPMC as isize, +// Vmcall = VmxExitReasonBasic::VMCALL as isize, +// Vmclear = VmxExitReasonBasic::VMCLEAR as isize, +// Vmlaunch = VmxExitReasonBasic::VMLAUNCH as isize, +// Vmptrld = VmxExitReasonBasic::VMPTRLD as isize, +// Vmptrst = VmxExitReasonBasic::VMPTRST as isize, +// Vmread = VmxExitReasonBasic::VMREAD as isize, +// Vmresume = VmxExitReasonBasic::VMRESUME as isize, +// Vmwrite = VmxExitReasonBasic::VMWRITE as isize, +// Vmoff = VmxExitReasonBasic::VMXOFF as isize, +// Vmon = VmxExitReasonBasic::VMXON as isize, +// TprBelowThreshold = VmxExitReasonBasic::TPR_BELOW_THRESHOLD as isize, +// ApicAccess = VmxExitReasonBasic::APIC_ACCESS as isize, +// ApicWrite = VmxExitReasonBasic::APIC_WRITE as isize, +// EoiInduced = VmxExitReasonBasic::VIRTUALIZED_EOI as isize, //name +// Wbinvd = VmxExitReasonBasic::WBINVD as isize, +// Xsetbv = VmxExitReasonBasic::XSETBV as isize, +// TaskSwitch = VmxExitReasonBasic::TASK_SWITCH as isize, +// MceDuringVmentry = 
VmxExitReasonBasic::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT as isize, //name +// GdtrIdtr = VmxExitReasonBasic::ACCESS_GDTR_OR_IDTR as isize, +// LdtrTr = VmxExitReasonBasic::ACCESS_LDTR_OR_TR as isize, +// EptViolation = VmxExitReasonBasic::EPT_VIOLATION as isize, +// EptMisconfig = VmxExitReasonBasic::EPT_MISCONFIG as isize, +// PauseInstruction = VmxExitReasonBasic::PAUSE as isize, +// MwaitInstruction = VmxExitReasonBasic::MWAIT as isize, +// MonitorTrapFlag = VmxExitReasonBasic::MONITOR_TRAP_FLAG as isize, +// MonitorInstruction = VmxExitReasonBasic::MONITOR as isize, +// Invept = VmxExitReasonBasic::INVEPT as isize, +// Invvpid = VmxExitReasonBasic::INVVPID as isize, +// Rdrand = VmxExitReasonBasic::RDRAND as isize, +// Rdseed = VmxExitReasonBasic::RDSEED as isize, +// PmlFull = VmxExitReasonBasic::PML_FULL as isize, +// Invpcid = VmxExitReasonBasic::INVPCID as isize, +// Vmfunc = VmxExitReasonBasic::VMFUNC as isize, +// PreemptionTimer = VmxExitReasonBasic::VMX_PREEMPTION_TIMER_EXPIRED as isize, +// Encls = VmxExitReasonBasic::ENCLS as isize, +// BusLock = VmxExitReasonBasic::BUS_LOCK as isize, +// Notify = VmxExitReasonBasic::NOTIFY as isize, +// Unknown, -pub struct VmxExitHandler; - -impl VmxExitHandler { - pub fn handle( +impl VmxExitHandlers { + #[inline(never)] + pub fn try_handle_exit( vcpu: &mut VirtCpu, basic: VmxExitReasonBasic, - ) -> Option> { + ) -> Option> { match basic { VmxExitReasonBasic::IO_INSTRUCTION => { return Some(Self::handle_io(vcpu)); } + VmxExitReasonBasic::EPT_VIOLATION => { + return Some(Self::handle_ept_violation(vcpu)); + } + _ => { - return None; + None } } } - fn handle_io(vcpu: &mut VirtCpu) -> Result<(), SystemError> { + fn handle_io(vcpu: &mut VirtCpu) -> Result { todo!(); } + + fn handle_ept_violation(vcpu: &mut VirtCpu) -> Result { + let exit_qualification = vcpu.get_exit_qual(); + + // EPT 违规发生在从 NMI 执行 iret 时, + // 在下一次 VM 进入之前必须设置 "blocked by NMI" 位。 + // 有一些错误可能会导致该位未被设置: + // AAK134, BY25。 + let vmx = vcpu.vmx(); + if vmx.idt_vectoring_info.bits() & IntrInfo::INTR_INFO_VALID_MASK.bits() != 0 + && vmx_info().enable_vnmi + && exit_qualification & IntrInfo::INTR_INFO_UNBLOCK_NMI.bits() as u64 != 0 + { + VmxAsm::vmx_vmwrite(guest::INTERRUPTIBILITY_STATE, 0x8); //GUEST_INTR_STATE_NMI + } + let gpa = VmxAsm::vmx_vmread(ro::GUEST_PHYSICAL_ADDR_FULL); + // trace_kvm_page_fault(vcpu, gpa, exit_qualification);//fztodo!() + + // 根据故障类型确定错误代码 + let mut error_code = if exit_qualification & (EptViolationExitQual::ACC_READ.bits()) != 0 {//active + PageFaultErr::PFERR_USER.bits() + } else { + 0 + }; + error_code |= if exit_qualification & (EptViolationExitQual::ACC_WRITE.bits()) != 0 {//active + PageFaultErr::PFERR_WRITE.bits() + } else { + 0 + }; + error_code |= if exit_qualification & (EptViolationExitQual::ACC_INSTR.bits()) != 0 { + PageFaultErr::PFERR_FETCH.bits() + } else { + 0 + }; + error_code |= if exit_qualification & (EptViolationExitQual::RWX_MASK.bits()) != 0 { + PageFaultErr::PFERR_PRESENT.bits() + } else { + 0 + }; + error_code |= if exit_qualification & (EptViolationExitQual::GVA_TRANSLATED.bits()) != 0 { + PageFaultErr::PFERR_GUEST_FINAL.bits()//active + } else { + PageFaultErr::PFERR_GUEST_PAGE.bits() + }; + //fixme:: 此时error_code为0x100000011,感觉有问题 + + vcpu.arch.exit_qual = exit_qualification; + + // 检查 GPA 是否超出物理内存限制,因为这是一个客户机页面错误。 + // 我们必须在这里模拟指令,因为如果非法地址是分页结构的地址, + // 则会设置 EPT_VIOLATION_ACC_WRITE 位。 + // 或者,如果支持,我们还可以使用 EPT 违规的高级 VM 退出信息来重建页面错误代码。 + // if allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa) { + // return 
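On the `fixme` above: 0x1_0000_0011 is exactly what an instruction fetch on a readable, already-translated GVA should decode to (PFERR_FETCH | PFERR_PRESENT | PFERR_GUEST_FINAL), so the value itself is plausible for a fetch fault. One thing worth double-checking, though: `VMX_EPT_RWX_MASK` is defined as `0x7 << 3` and is then shifted again by `EPT_VIOLATION_RWX_SHIFT`, which lands the presence test on bits 6 to 8 (overlapping the GVA bits) rather than the SDM's bits 3 to 5. The sketch below replicates the decoding with the single-shift layout; the PFERR_GUEST_* values follow Linux's kvm_host.h and are assumptions here:

```rust
// Stand-alone replica of the exit-qualification -> PFERR translation.
// Bit positions follow the Intel SDM's EPT-violation exit qualification.
const ACC_READ: u64 = 1 << 0;
const ACC_WRITE: u64 = 1 << 1;
const ACC_INSTR: u64 = 1 << 2;
const RWX_MASK: u64 = 0x7 << 3; // page was readable/writable/executable
const GVA_TRANSLATED: u64 = 1 << 8;

const PFERR_PRESENT: u64 = 1 << 0;
const PFERR_WRITE: u64 = 1 << 1;
const PFERR_USER: u64 = 1 << 2;
const PFERR_FETCH: u64 = 1 << 4;
// Assumed to match Linux's layout; not defined in this patch hunk.
const PFERR_GUEST_FINAL: u64 = 1 << 32;
const PFERR_GUEST_PAGE: u64 = 1 << 33;

fn decode(exit_qual: u64) -> u64 {
    let mut err = 0;
    if exit_qual & ACC_READ != 0 { err |= PFERR_USER; }
    if exit_qual & ACC_WRITE != 0 { err |= PFERR_WRITE; }
    if exit_qual & ACC_INSTR != 0 { err |= PFERR_FETCH; }
    if exit_qual & RWX_MASK != 0 { err |= PFERR_PRESENT; }
    err |= if exit_qual & GVA_TRANSLATED != 0 {
        PFERR_GUEST_FINAL
    } else {
        PFERR_GUEST_PAGE
    };
    err
}

fn main() {
    // Instruction fetch, page readable, GVA valid and translated:
    let qual = ACC_INSTR | (1 << 3) | GVA_TRANSLATED;
    assert_eq!(decode(qual), 0x1_0000_0011); // the value seen in the fixme
}
```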
kvm_emulate_instruction(vcpu, 0); + // } + + vcpu.page_fault(gpa, error_code, None, 0) + } } diff --git a/kernel/src/arch/x86_64/vm/vmx/mod.rs b/kernel/src/arch/x86_64/vm/vmx/mod.rs index cd618ca51..e82d8704c 100644 --- a/kernel/src/arch/x86_64/vm/vmx/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/mod.rs @@ -1,11 +1,12 @@ use core::intrinsics::likely; use core::intrinsics::unlikely; use core::sync::atomic::{AtomicBool, Ordering}; +use exit::VmxExitHandlers; use x86_64::registers::control::Cr3Flags; use x86_64::structures::paging::PhysFrame; use crate::arch::process::table::USER_DS; -use crate::arch::vm::mmu::KvmMmu; +use crate::arch::vm::mmu::mmu::KvmMmu; use crate::arch::vm::uapi::kvm_exit; use crate::arch::vm::uapi::{ AC_VECTOR, BP_VECTOR, DB_VECTOR, GP_VECTOR, MC_VECTOR, NM_VECTOR, PF_VECTOR, UD_VECTOR, @@ -100,6 +101,7 @@ use super::{ pub mod asm; pub mod capabilities; +pub mod ept; pub mod exit; pub mod vmcs; @@ -1048,9 +1050,10 @@ impl KvmFunc for VmxKvmFunc { vcpu.arch.clear_dirty(); - let cr3: (PhysFrame,Cr3Flags) = Cr3::read(); + let cr3: (PhysFrame, Cr3Flags) = Cr3::read(); if unlikely(cr3 != vcpu.vmx().loaded_vmcs().host_state.cr3) { - let cr3_combined: u64 = (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); + let cr3_combined: u64 = + (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); VmxAsm::vmx_vmwrite(host::CR3, cr3_combined); vcpu.vmx().loaded_vmcs().host_state.cr3 = cr3; } @@ -1200,10 +1203,11 @@ impl KvmFunc for VmxKvmFunc { } fn handle_exit( + //vmx_handle_exit &self, vcpu: &mut VirtCpu, fastpath: ExitFastpathCompletion, - ) -> Result<(), SystemError> { + ) -> Result { let r = vmx_info().vmx_handle_exit(vcpu, fastpath); if vcpu.vmx().exit_reason.bus_lock_detected() { @@ -2343,7 +2347,8 @@ impl Vmx { VmxAsm::vmx_vmwrite(host::CR0, unsafe { cr0() }.bits() as u64); let cr3: (PhysFrame, Cr3Flags) = Cr3::read(); - let cr3_combined: u64 = (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); + let cr3_combined: u64 = + (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); VmxAsm::vmx_vmwrite(host::CR3, cr3_combined); loaded_vmcs_host_state.cr3 = cr3; @@ -2968,10 +2973,10 @@ impl Vmx { &self, vcpu: &mut VirtCpu, exit_fastpath: ExitFastpathCompletion, - ) -> Result<(), SystemError> { + ) -> Result { let exit_reason = vcpu.vmx().exit_reason; - let unexpected_vmexit = |vcpu: &mut VirtCpu| -> Result<(), SystemError> { + let unexpected_vmexit = |vcpu: &mut VirtCpu| -> Result { kerror!("vmx: unexpected exit reason {:?}\n", exit_reason); self.dump_vmcs(vcpu); @@ -2986,7 +2991,7 @@ impl Vmx { run.__bindgen_anon_1.internal.data[1] = cpu; } - return Ok(()); + return Ok(0); }; let vectoring_info = vcpu.vmx().idt_vectoring_info; @@ -3036,8 +3041,11 @@ impl Vmx { if exit_fastpath != ExitFastpathCompletion::None { return Err(SystemError::EINVAL); } - - todo!() + match VmxExitHandlers::try_handle_exit(vcpu, VmxExitReasonBasic::from(exit_reason.basic())) + { + Some(Ok(r)) => return Ok(r), + Some(Err(_)) | None => unexpected_vmexit(vcpu), + } } /// 需要在缓存中更新的寄存器集。此处未列出的其他寄存器在 VM 退出后立即同步到缓存。 @@ -3209,6 +3217,8 @@ pub struct VmxVCpuPriv { req_immediate_exit: bool, guest_state_loaded: bool, + + exit_qualification: u64, //暂时不知道用处fztodo } #[derive(Debug, Default)] @@ -3269,6 +3279,7 @@ impl VmxVCpuPriv { exit_reason: VmxExitReason::new(), exit_intr_info: IntrInfo::empty(), msr_autostore: VmxMsrs::default(), + exit_qualification: 0, //fztodo }; vmx.vpid = 
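The `match` added to `vmx_handle_exit` above gives the dispatch its final shape: a handled exit returns `Some(Ok(_))`, while a missing handler falls through to `unexpected_vmexit`. A toy sketch of that shape (stand-in types only; the real handlers take `&mut VirtCpu` and `&Vm`):

```rust
// Stand-in types only; nothing here is the kernel's signature.
#[derive(Clone, Copy)]
enum ExitReason {
    Io,
    EptViolation,
    ExternalInterrupt,
    Hlt,
}

type HandlerResult = Result<u64, &'static str>;

fn try_handle_exit(reason: ExitReason) -> Option<HandlerResult> {
    match reason {
        ExitReason::Io => Some(handle_io()),
        ExitReason::EptViolation => Some(handle_ept_violation()),
        ExitReason::ExternalInterrupt => Some(handle_external_interrupt()),
        // No handler yet: fall through so the caller can report an
        // unexpected VM exit.
        _ => None,
    }
}

fn handle_io() -> HandlerResult {
    Err("unimplemented")
}
fn handle_ept_violation() -> HandlerResult {
    Ok(1) // 1 = "resume the guest", mirroring the kernel's convention
}
fn handle_external_interrupt() -> HandlerResult {
    Ok(1)
}

fn main() {
    assert!(try_handle_exit(ExitReason::Hlt).is_none());
    assert_eq!(try_handle_exit(ExitReason::EptViolation), Some(Ok(1)));
}
```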
vmx_info().alloc_vpid().unwrap_or_default() as u16; @@ -3570,6 +3581,12 @@ impl VmxVCpuPriv { flags } + pub fn get_exit_qual(&self) -> u64 { + self.exit_qualification + } + pub fn vmread_exit_qual(&mut self) { + self.exit_qualification = VmxAsm::vmx_vmread(ro::EXIT_QUALIFICATION); + } } bitflags! { @@ -3579,6 +3596,7 @@ bitflags! { const RW = 3; } + //https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/include/asm/kvm_host.h#249 pub struct PageFaultErr: u64 { const PFERR_PRESENT = 1 << 0; const PFERR_WRITE = 1 << 1; diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs index 7533bf650..77aa91a8d 100644 --- a/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs +++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs @@ -85,9 +85,8 @@ impl VmxFeat { pub const KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 = PinbasedControls::EXTERNAL_INTERRUPT_EXITING.bits() | PinbasedControls::NMI_EXITING.bits(); - pub const KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 = PinbasedControls::VIRTUAL_NMIS - .bits() - | PinbasedControls::POSTED_INTERRUPTS.bits(); + pub const KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 = + PinbasedControls::VIRTUAL_NMIS.bits() | PinbasedControls::POSTED_INTERRUPTS.bits(); pub const KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS: u32 = EntryControls::LOAD_DEBUG_CONTROLS.bits() | EntryControls::IA32E_MODE_GUEST.bits(); diff --git a/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs index bff5b0d09..7ada718ea 100644 --- a/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs @@ -121,7 +121,7 @@ impl LockedVMControlStructure { #[derive(Debug)] pub struct VmcsHostState { - pub cr3:(PhysFrame,Cr3Flags), + pub cr3: (PhysFrame, Cr3Flags), pub cr4: Cr4, pub gs_base: usize, pub fs_base: usize, @@ -170,7 +170,10 @@ impl VmcsHostState { impl Default for VmcsHostState { fn default() -> Self { Self { - cr3: (PhysFrame::containing_address(x86_64::PhysAddr::new(0)), Cr3Flags::empty()), + cr3: ( + PhysFrame::containing_address(x86_64::PhysAddr::new(0)), + Cr3Flags::empty(), + ), cr4: Cr4::empty(), gs_base: 0, fs_base: 0, diff --git a/kernel/src/mm/mod.rs b/kernel/src/mm/mod.rs index 3b95f7c2b..9bed5c63d 100644 --- a/kernel/src/mm/mod.rs +++ b/kernel/src/mm/mod.rs @@ -124,7 +124,7 @@ pub enum PageTableKind { } /// 物理内存地址 -#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)] +#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash,Default)] #[repr(transparent)] pub struct PhysAddr(usize); diff --git a/kernel/src/virt/vm/kvm_host/mem.rs b/kernel/src/virt/vm/kvm_host/mem.rs index f284244e6..54b76614f 100644 --- a/kernel/src/virt/vm/kvm_host/mem.rs +++ b/kernel/src/virt/vm/kvm_host/mem.rs @@ -8,14 +8,11 @@ use hashbrown::HashMap; use system_error::SystemError; use crate::{ - arch::MMArch, - libs::{ + arch::{vm::mmu::mmu::PAGE_SIZE, MMArch}, libs::{ rbtree::RBTree, rwlock::{RwLock, RwLockReadGuard, RwLockWriteGuard}, spinlock::{SpinLock, SpinLockGuard}, - }, - mm::{MemoryManagementArch, VirtAddr}, - virt::vm::{kvm_host::KVM_ADDRESS_SPACE_NUM, user_api::KvmUserspaceMemoryRegion}, + }, mm::{kernel_mapper::KernelMapper, page::PageFlags, MemoryManagementArch, VirtAddr}, virt::{kvm::host_mem::PAGE_SHIFT, vm::{kvm_host::KVM_ADDRESS_SPACE_NUM, user_api::KvmUserspaceMemoryRegion}} }; use super::{LockedVm, Vm}; @@ -24,6 +21,19 @@ pub const KVM_USER_MEM_SLOTS: u16 = u16::MAX; pub const KVM_INTERNAL_MEM_SLOTS: u16 = 3; pub const KVM_MEM_SLOTS_NUM: u16 = KVM_USER_MEM_SLOTS - KVM_INTERNAL_MEM_SLOTS; pub 
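The `get_exit_qual`/`vmread_exit_qual` pair above implements a read-once cache: the VMREAD of EXIT_QUALIFICATION happens at most once per VM exit, gated by the register-availability bit. The pattern in isolation (toy types; the real gate is `test_and_mark_available` on `KvmReg::VcpuExregExitInfo1`):

```rust
// Toy read-once cache for EXIT_QUALIFICATION; `available` stands in for the
// per-register availability bitmap that test_and_mark_available consults.
struct ExitInfoCache {
    available: bool,
    exit_qual: u64,
}

impl ExitInfoCache {
    fn get_exit_qual(&mut self, vmread: impl FnOnce() -> u64) -> u64 {
        if !self.available {
            // First access after a VM exit: do the real VMREAD and cache it.
            self.exit_qual = vmread();
            self.available = true;
        }
        self.exit_qual
    }
}

fn main() {
    let mut cache = ExitInfoCache { available: false, exit_qual: 0 };
    let mut reads = 0;
    let mut vmread = || {
        reads += 1;
        0x184u64
    };
    assert_eq!(cache.get_exit_qual(&mut vmread), 0x184);
    assert_eq!(cache.get_exit_qual(&mut vmread), 0x184);
    assert_eq!(reads, 1); // the VMREAD ran exactly once
}
```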
const KVM_MEM_MAX_NR_PAGES: usize = (1 << 31) - 1; +pub const APIC_ACCESS_PAGE_PRIVATE_MEMSLOT: u16 = KVM_MEM_SLOTS_NUM+1; + +/// 对于普通的页帧号(PFN),最高的12位应该为零, +/// 因此我们可以mask位62到位52来表示错误的PFN, +/// mask位63来表示无槽的PFN。 +const KVM_PFN_ERR_MASK: u64 = 0x7ff << 52;//0x7FF0000000000000 +const KVM_PFN_ERR_NOSLOT_MASK: u64 = 0xfff << 52;//0xFFF0000000000000 +const KVM_PFN_NOSLOT: u64 = 1 << 63;//0x8000000000000000 + +const KVM_PFN_ERR_FAULT: u64 = KVM_PFN_ERR_MASK; +const KVM_PFN_ERR_HWPOISON: u64 = KVM_PFN_ERR_MASK + 1; +const KVM_PFN_ERR_RO_FAULT: u64 = KVM_PFN_ERR_MASK + 2; +const KVM_PFN_ERR_SIGPENDING: u64 = KVM_PFN_ERR_MASK + 3; #[derive(Debug, Default)] #[allow(dead_code)] @@ -76,7 +86,7 @@ pub struct KvmMemSlotSet { /// 存储虚拟地址(hva)和内存插槽之间的映射关系 hva_tree: RBTree>, /// 用于存储全局页帧号(gfn)和内存插槽之间的映射关系 - gfn_tree: RBTree>, + pub gfn_tree: RBTree>, /// 将内存插槽的ID映射到对应的内存插槽。 slots: HashMap>, @@ -132,9 +142,9 @@ impl LockedKvmMemSlot { #[derive(Debug, Default)] pub struct KvmMemSlot { /// 首个gfn - base_gfn: u64, + pub base_gfn: u64, /// 页数量 - npages: usize, + pub npages: usize, /// 脏页位图 dirty_bitmap: Option, /// 架构相关 @@ -146,6 +156,21 @@ pub struct KvmMemSlot { hva_node_key: [AddrRange; 2], } +impl KvmMemSlot { + pub fn check_aligned_addr(&self, align: usize) -> bool { + self.userspace_addr.data() % align == 0 + } + pub fn get_flags(&self) -> UserMemRegionFlag { + self.flags + } + pub fn get_id(&self) -> u16 { + self.id + } + // 检查内存槽是否可见 + pub fn is_visible(&self) -> bool { + self.id < KVM_USER_MEM_SLOTS && (self.flags.bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits()) == 0 + } +} #[derive(Debug)] pub struct LockedVmMemSlotSet { @@ -539,3 +564,135 @@ impl Vm { self.memslots[as_id] = self.get_inactive_memslot_set(as_id); } } +/// 将给定的客户机帧号(GFN)转换为用户空间虚拟地址(HVA),并根据内存槽的状态和标志进行相应的检查。 +/// +/// # 参数 +/// - `slot`: 可选的 `KvmMemSlot`,表示内存槽。 +/// - `gfn`: 客户机帧号(GFN),表示要转换的帧号。 +/// - `nr_pages`: 可选的可变引用,用于存储计算出的页数。 +/// - `write`: 布尔值,表示是否为写操作。 +/// +/// # 返回 +/// 如果成功,返回转换后的用户空间虚拟地址(HVA);如果失败,返回相应的错误。 +/// +/// # 错误 +/// 如果内存槽为空或无效,或者尝试对只读内存槽进行写操作,则返回 `SystemError::KVM_HVA_ERR_BAD`。 +fn __gfn_to_hva_many( + slot: &Option<&KvmMemSlot>, + gfn: u64, + nr_pages: Option<&mut u64>, + write: bool, +) -> Result { + kdebug!("__gfn_to_hva_many"); + + // 检查内存槽是否为空 + if slot.is_none() { + return Err(SystemError::KVM_HVA_ERR_BAD); + } + let slot = slot.as_ref().unwrap(); + + // 检查内存槽是否无效或尝试对只读内存槽进行写操作 + if slot.flags.bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits() != 0 || + (slot.flags.bits() & UserMemRegionFlag::READONLY.bits() != 0) && write { + return Err(SystemError::KVM_HVA_ERR_BAD); + } + + // 如果 `nr_pages` 不为空,计算并更新页数 + if let Some(nr_pages) = nr_pages { + *nr_pages = slot.npages as u64 - (gfn - slot.base_gfn); + } + + // 调用辅助函数将 GFN 转换为 HVA + return Ok(__gfn_to_hva_memslot(slot, gfn)); +} + +/// 将给定的全局帧号(GFN)转换为用户空间虚拟地址(HVA)。 +/// +/// # 参数 +/// - `slot`: `KvmMemSlot`,表示内存槽。 +/// - `gfn`: 全局帧号(GFN),表示要转换的帧号。 +/// +/// # 返回 +/// 转换后的用户空间虚拟地址(HVA)。 +fn __gfn_to_hva_memslot(slot: &KvmMemSlot, gfn: u64) -> u64 { + return slot.userspace_addr.data() as u64 + (gfn - slot.base_gfn) * PAGE_SIZE; +} +/// 将给定的全局帧号(GFN)转换为页帧号(PFN),并根据内存槽的状态和标志进行相应的检查。 +/// +/// # 参数 +/// - `slot`: 内存槽的引用。 +/// - `gfn`: 全局帧号(GFN),表示要转换的帧号。 +/// - `atomic`: 布尔值,表示是否为原子操作。 +/// - `interruptible`: 布尔值,表示操作是否可中断。 +/// - `async`: 可变引用,表示操作是否为异步。 +/// - `write_fault`: 布尔值,表示是否为写操作。 +/// - `writable`: 可变引用,表示是否可写。 +/// - `hva`: 可变引用,表示用户空间虚拟地址(HVA)。 +/// +/// # 返回 +/// 如果成功,返回转换后的页帧号(PFN);如果失败,返回相应的错误。 +pub fn __gfn_to_pfn_memslot( + slot: 
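`__gfn_to_hva_memslot` above is pure arithmetic: the gfn's offset within the slot, scaled by the page size, added to the slot's userspace base. A standalone check with illustrative numbers (the range guard mirrors the invalid-slot checks; the kernel path returns KVM_HVA_ERR_BAD instead of None):

```rust
// Standalone check of the gfn -> hva arithmetic for 4KiB pages.
const PAGE_SIZE: u64 = 4096;

struct Slot {
    base_gfn: u64,
    npages: u64,
    userspace_addr: u64,
}

fn gfn_to_hva(slot: &Slot, gfn: u64) -> Option<u64> {
    if gfn < slot.base_gfn || gfn >= slot.base_gfn + slot.npages {
        return None; // gfn not backed by this slot
    }
    Some(slot.userspace_addr + (gfn - slot.base_gfn) * PAGE_SIZE)
}

fn main() {
    let slot = Slot { base_gfn: 0x100, npages: 16, userspace_addr: 0x7f00_0000_0000 };
    assert_eq!(gfn_to_hva(&slot, 0x104), Some(0x7f00_0000_4000)); // 4 pages in
    assert_eq!(gfn_to_hva(&slot, 0x200), None);                   // outside the slot
}
```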
Option<&KvmMemSlot>, + gfn: u64, + atomic: bool, + interruptible: bool, + is_async: &mut bool, + write: bool, + writable: &mut bool, + hva: &mut u64, +) -> Result { + let addr = __gfn_to_hva_many(&slot, gfn, None, write)?; + *hva = addr; + + //todo:检查地址是否为错误 + + // 如果内存槽为只读,且 writable 不为空,则更新 writable 的值 + if slot.unwrap().flags.bits() & UserMemRegionFlag::READONLY.bits() != 0 { + *writable = false; + } + + let pfn = hva_to_pfn(addr, atomic,interruptible, is_async,write,writable)?; + return Ok(pfn); +} +/// 将用户空间虚拟地址(HVA)转换为页帧号(PFN)。 +/// +/// # 参数 +/// - `addr`: 用户空间虚拟地址(HVA)。 +/// - `atomic`: 布尔值,表示是否为原子操作。 +/// - `interruptible`: 布尔值,表示操作是否可中断。 +/// - `is_async`: 可变引用,表示操作是否为异步。 +/// - `write_fault`: 布尔值,表示是否为写操作。 +/// - `writable`: 可变引用,表示是否可写。 +/// +/// # 返回 +/// 如果成功,返回转换后的页帧号(PFN);如果失败,返回相应的错误。 +// 正确性待验证 +pub fn hva_to_pfn( + addr: u64, + atomic: bool, + _interruptible: bool, + is_async: &mut bool, + _write_fault: bool, + _writable: &mut bool, +) -> Result { + // 我们可以原子地或异步地执行,但不能同时执行 + assert!(!(atomic && *is_async), "Cannot be both atomic and async"); + + kdebug!("hva_to_pfn"); + unsafe { + let raw = addr as *const i32; + kdebug!("raw={:x}", *raw); + } + // let hpa = MMArch::virt_2_phys(VirtAddr::new(addr)).unwrap().data() as u64; + let hva = VirtAddr::new(addr as usize); + let mut mapper = KernelMapper::lock(); + let mapper = mapper.as_mut().unwrap(); + if let Some((hpa, _)) = mapper.translate(hva) { + return Ok(hpa.data() as u64 >> PAGE_SHIFT); + } + unsafe { + mapper.map(hva, PageFlags::mmio_flags()); + } + let (hpa, _) = mapper.translate(hva).unwrap(); + return Ok(hpa.data() as u64 >> PAGE_SHIFT); +} diff --git a/kernel/src/virt/vm/kvm_host/mod.rs b/kernel/src/virt/vm/kvm_host/mod.rs index 29aa746ef..e90dc1d7b 100644 --- a/kernel/src/virt/vm/kvm_host/mod.rs +++ b/kernel/src/virt/vm/kvm_host/mod.rs @@ -9,6 +9,7 @@ use alloc::{ vec::Vec, }; use hashbrown::HashMap; +use mem::LockedKvmMemSlot; use system_error::SystemError; use crate::{ @@ -17,7 +18,7 @@ use crate::{ CurrentKvmManager, KvmArch, VirtCpuArch, }, filesystem::vfs::file::{File, FileMode}, - libs::spinlock::{SpinLock, SpinLockGuard}, + libs::{rbtree::RBTree, spinlock::{SpinLock, SpinLockGuard}}, mm::ucontext::AddressSpace, process::ProcessManager, smp::cpu::ProcessorId, @@ -88,6 +89,7 @@ impl LockedVm { #[cfg(target_arch = "x86_64")] kvm_vmx: KvmVmx::default(), nr_memslots_dirty_logging: 0, + mmu_invalidate_seq: 0, }; let ret = Arc::new(Self { @@ -128,7 +130,7 @@ pub struct Vm { /// 对应活动和非活动内存槽,实际为:[[Arc; 2]; KVM_ADDRESS_SPACE_NUM],这里暂时写Vec memslots_set: Vec>>, /// 当前活动内存槽,实际为:[Arc; KVM_ADDRESS_SPACE_NUM],这里暂时写Vec - memslots: Vec>, + pub memslots: Vec>, /// 内存槽对应的页数 nr_memslot_pages: usize, @@ -140,6 +142,8 @@ pub struct Vm { #[cfg(target_arch = "x86_64")] pub kvm_vmx: KvmVmx, + + pub mmu_invalidate_seq:u64//用于表示内存管理单元(MMU)无效化序列号 } impl Vm { @@ -207,6 +211,7 @@ impl Vm { run: unsafe { Some(Box::new_zeroed().assume_init()) }, vcpu_idx: 0, mode: VcpuMode::OutsideGuestMode, + stat: Default::default(), }; } @@ -237,3 +242,19 @@ pub enum MutilProcessorState { ApResetHold, Suspended, } +///返回包含 gfn 的 memslot 的指针。如果没有找到,则返回 NULL。 +///当 "approx" 设置为 true 时,即使地址落在空洞中,也会返回 memslot。 +///在这种情况下,将返回空洞边界的其中一个 memslot。 +/// 先简陋完成,原本是二分,现在先遍历 +pub fn search_memslots(slot_set:Arc,gfn:u64,/*_approx:bool*/)->Option>{ + let slots=slot_set.lock(); + let node = &slots.gfn_tree; + //let(start,end)=(0,node.len()-1); + for (_gfn_num,slot) in node.iter(){ + let slot_guard = slot.read(); + if gfn >= slot_guard.base_gfn && gfn < 
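`hva_to_pfn` above follows a translate-or-map flow: if the kernel mapper already knows the hva, reuse the translation; otherwise map the page and translate again. The control flow reduced to a sketch (a toy `Mapper` stands in for `KernelMapper`, storing hva -> hpa directly):

```rust
// Reduced sketch of the translate-or-map flow in hva_to_pfn.
use std::collections::HashMap;

const PAGE_SHIFT: u64 = 12;

struct Mapper {
    table: HashMap<u64, u64>,
    next_free_hpa: u64,
}

impl Mapper {
    fn translate(&self, hva: u64) -> Option<u64> {
        self.table.get(&hva).copied()
    }
    fn map(&mut self, hva: u64) {
        let hpa = self.next_free_hpa;
        self.next_free_hpa += 1 << PAGE_SHIFT;
        self.table.insert(hva, hpa);
    }
}

fn hva_to_pfn(mapper: &mut Mapper, hva: u64) -> u64 {
    if let Some(hpa) = mapper.translate(hva) {
        return hpa >> PAGE_SHIFT; // already mapped: reuse the translation
    }
    mapper.map(hva); // fault the page in, then translate again
    mapper.translate(hva).unwrap() >> PAGE_SHIFT
}

fn main() {
    let mut m = Mapper { table: HashMap::new(), next_free_hpa: 0x10_0000 };
    let pfn = hva_to_pfn(&mut m, 0x7f00_dead_b000);
    assert_eq!(hva_to_pfn(&mut m, 0x7f00_dead_b000), pfn); // stable on re-query
}
```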
slot_guard.base_gfn + slot_guard.npages as u64{ + return Some(slot.clone()); + } + } + return None; +} \ No newline at end of file diff --git a/kernel/src/virt/vm/kvm_host/vcpu.rs b/kernel/src/virt/vm/kvm_host/vcpu.rs index 56d71c2c5..ab0c075a0 100644 --- a/kernel/src/virt/vm/kvm_host/vcpu.rs +++ b/kernel/src/virt/vm/kvm_host/vcpu.rs @@ -6,8 +6,11 @@ use alloc::{ use crate::{ arch::{ - vm::{kvm_host::vcpu::VirtCpuRequest, vmx::VmxVCpuPriv}, - VirtCpuArch, + vm::{ + kvm_host::{vcpu::VirtCpuRequest, KvmReg}, + vmx::VmxVCpuPriv, + }, + VirtCpuArch, VirtCpuStat, }, libs::spinlock::{SpinLock, SpinLockGuard}, process::Pid, @@ -61,6 +64,7 @@ pub struct VirtCpu { pub stats_id: String, pub pv_time: GfnToHvaCache, pub arch: VirtCpuArch, + pub stat: VirtCpuStat, pub mode: VcpuMode, @@ -89,6 +93,19 @@ impl VirtCpu { pub fn vmx_mut(&mut self) -> &mut VmxVCpuPriv { self.private.as_mut().unwrap() } + //https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.h?fi=vmx_get_exit_qual#677 + #[inline] + pub fn get_exit_qual(&mut self) -> u64 { + if !self + .arch + .test_and_mark_available(KvmReg::VcpuExregExitInfo1) + { + self.vmx_mut().vmread_exit_qual(); + } + let vmx = self.vmx(); + vmx.get_exit_qual() + //vmx. + } } bitflags! { From 92c661a6e7b62fc123b2a8b020c8a9d10cb62781 Mon Sep 17 00:00:00 2001 From: GnoCiYeH Date: Wed, 18 Sep 2024 01:15:45 +0800 Subject: [PATCH 08/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=86=85=E5=AD=98?= =?UTF-8?q?=E8=99=9A=E6=8B=9F=E5=8C=96=E9=83=A8=E5=88=86=E5=8F=82=E6=95=B0?= =?UTF-8?q?=E4=BC=A0=E5=85=A5=EF=BC=8C=E8=A7=A3=E5=86=B3=E6=AD=BB=E9=94=81?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/src/arch/x86_64/vm/kvm_host/mod.rs | 13 +- kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs | 6 +- kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs | 210 ++++++++++-------- kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs | 118 +++++----- kernel/src/arch/x86_64/vm/vmx/exit.rs | 32 ++- kernel/src/arch/x86_64/vm/vmx/mod.rs | 11 +- kernel/src/virt/vm/kvm_host/mem.rs | 41 ++-- kernel/src/virt/vm/kvm_host/mod.rs | 25 ++- 8 files changed, 269 insertions(+), 187 deletions(-) diff --git a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs index 92aff5857..67026f189 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs @@ -38,7 +38,7 @@ pub const TSS_IOPB_SIZE: usize = 65536 / 8; pub const TSS_REDIRECTION_SIZE: usize = 256 / 8; pub const RMODE_TSS_SIZE: usize = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1; -pub const KVM_PFN_NOSLOT:u64 = 0x1 << 63; +pub const KVM_PFN_NOSLOT: u64 = 0x1 << 63; #[derive(Debug, Default)] pub struct X86KvmArch { @@ -63,7 +63,7 @@ pub struct X86KvmArch { msr_fliter: Option>, - pub noncoherent_dma_count:AtomicU32, + pub noncoherent_dma_count: AtomicU32, } impl X86KvmArch { @@ -219,6 +219,7 @@ pub trait KvmFunc: Send + Sync + Debug { fn handle_exit( &self, vcpu: &mut VirtCpu, + vm: &Vm, fastpath: ExitFastpathCompletion, ) -> Result; } @@ -401,7 +402,7 @@ bitflags! 
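`search_memslots` above notes that it should bisect but currently walks the tree linearly. Since `gfn_tree` is keyed by `base_gfn`, the lookup can be a floor query: find the greatest `base_gfn <= gfn`, then range-check against that slot's page count. Sketched here with a std `BTreeMap` standing in for the kernel's `RBTree` (illustrative only, not the kernel's locking or types):

```rust
// Floor-lookup sketch for "which slot contains this gfn".
use std::collections::BTreeMap;

struct Slot {
    base_gfn: u64,
    npages: u64,
}

fn search_memslots(gfn_tree: &BTreeMap<u64, Slot>, gfn: u64) -> Option<&Slot> {
    // Greatest base_gfn <= gfn, then check gfn falls inside that slot.
    let (_, slot) = gfn_tree.range(..=gfn).next_back()?;
    (gfn < slot.base_gfn + slot.npages).then_some(slot)
}

fn main() {
    let mut tree = BTreeMap::new();
    tree.insert(0x000, Slot { base_gfn: 0x000, npages: 0x80 });
    tree.insert(0x100, Slot { base_gfn: 0x100, npages: 0x40 });
    assert!(search_memslots(&tree, 0x120).is_some()); // inside the second slot
    assert!(search_memslots(&tree, 0x0c0).is_none()); // falls in the hole
}
```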
{ const WRITE_PF_TO_SP = 1 << 8; } } -#[derive(Default,Debug)] +#[derive(Default, Debug)] ///用于跟踪和记录VCPU的各种统计信息。 pub struct KvmVcpuStat { //pub generic: KvmVcpuStatGeneric, @@ -441,12 +442,12 @@ pub struct KvmVcpuStat { pub notify_window_exits: u64, } #[inline] -/// 将 GFN 转换为 GPA +/// 将 GFN 转换为 GPA pub fn gfn_to_gpa(gfn: u64) -> u64 { gfn << 12 } #[inline] -/// 将 GPA 转换为 GFN +/// 将 GPA 转换为 GFN pub fn gpa_to_gfn(gfn: u64) -> u64 { gfn >> 12 -} \ No newline at end of file +} diff --git a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs index 133568c6f..e774ac57b 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs @@ -857,9 +857,9 @@ impl VirtCpu { // TODO: 一些中断或者tsc操作 - match x86_kvm_ops().handle_exit(self, exit_fastpath){ - Err(err)=>return Err(err), - Ok(_)=>{Ok(())} + match x86_kvm_ops().handle_exit(self, vm, exit_fastpath) { + Err(err) => return Err(err), + Ok(_) => Ok(()), } } diff --git a/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs b/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs index 4267394f2..edae9c9c7 100644 --- a/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs +++ b/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs @@ -1,44 +1,54 @@ -use core::{intrinsics::unlikely, ops::Index}; use alloc::{ boxed::Box, sync::{Arc, Weak}, vec::Vec, }; +use core::{intrinsics::unlikely, ops::Index}; use system_error::SystemError; use crate::{ arch::vm::{ - kvm_host::{EmulType,KVM_PFN_NOSLOT} ,mmu::{mmu::{PFRet, PageLevel}, tdp_iter:: - { is_large_pte, is_shadow_present_pte, TdpIter}}, mtrr::kvm_mtrr_check_gfn_range_consistency, vmx::PageFaultErr + kvm_host::{EmulType, KVM_PFN_NOSLOT}, + mmu::{ + mmu::{PFRet, PageLevel}, + tdp_iter::{is_large_pte, is_shadow_present_pte, TdpIter}, + }, + mtrr::kvm_mtrr_check_gfn_range_consistency, + vmx::PageFaultErr, }, kwarn, + libs::spinlock::SpinLockGuard, mm::{virt_2_phys, PhysAddr}, virt::{ kvm::host_mem::PAGE_SHIFT, - vm::kvm_host::{mem::{LockedKvmMemSlot, LockedVmMemSlotSet, UserMemRegionFlag, __gfn_to_pfn_memslot}, search_memslots, vcpu::VirtCpu}, + vm::kvm_host::{ + mem::{LockedKvmMemSlot, LockedVmMemSlotSet, UserMemRegionFlag, __gfn_to_pfn_memslot}, + search_memslots, + vcpu::VirtCpu, + Vm, + }, }, }; use super::mmu::{gfn_round_for_level, is_tdp_mmu_enabled, KvmMmuPageRole}; #[derive(Debug, Default)] -pub struct KvmMmuPage{ - pub tdp_mmu_page:bool,// 标记是否为 TDP(Two-Dimensional Paging)页表页 - pub gfn: u64,// 客户机帧号(Guest Frame Number) - +pub struct KvmMmuPage { + pub tdp_mmu_page: bool, // 标记是否为 TDP(Two-Dimensional Paging)页表页 + pub gfn: u64, // 客户机帧号(Guest Frame Number) + /* - * The following two entries are used to key the shadow page in the - * hash table.暫時沒看出來 - */ + * The following two entries are used to key the shadow page in the + * hash table.暫時沒看出來 + */ pub role: KvmMmuPageRole, - pub spt: u64,// 指向页表条目(SPTE)的指针 + pub spt: u64, // 指向页表条目(SPTE)的指针 pub mmu_seq: u64, pub map_writable: bool, pub write_fault_to_shadow_pgtable: bool, } - #[derive(Debug, Default)] pub struct KvmPageFault { // vcpu.do_page_fault 的参数 @@ -89,6 +99,7 @@ impl VirtCpu { #[inline(never)] pub fn page_fault( &mut self, + vm: &Vm, cr2_or_gpa: u64, mut error_code: u64, insn: Option, @@ -120,7 +131,7 @@ impl VirtCpu { if r == PFRet::Invalid { r = self - .do_page_fault(cr2_or_gpa, error_code as u32, false, emulation_type)? + .do_page_fault(vm, cr2_or_gpa, error_code as u32, false, emulation_type)? 
.into(); if r == PFRet::Invalid { return Err(SystemError::EIO); @@ -158,8 +169,10 @@ impl VirtCpu { // self.emulate_instruction(cr2_or_gpa, emulation_type, insn, insn_len) todo!("emulate_instruction") } + fn do_page_fault( &mut self, + vm: &Vm, cr2_or_gpa: u64, error_code: u32, prefetch: bool, @@ -175,7 +188,7 @@ impl VirtCpu { rsvd: error_code & PageFaultErr::PFERR_RSVD.bits() as u32 != 0, user: error_code & PageFaultErr::PFERR_USER.bits() as u32 != 0, prefetch, - is_tdp : true, + is_tdp: true, nx_huge_page_workaround_enabled: false, //todo max_level: PageLevel::Level1G as u8, req_level: PageLevel::Level4K as u8, @@ -185,7 +198,7 @@ impl VirtCpu { //处理直接映射 if self.arch.mmu().root_role.get_direct() { page_fault.gfn = (page_fault.addr.data() >> PAGE_SHIFT) as u64; - page_fault.slot = self.gfn_to_memslot(page_fault.gfn);//kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);没完成 + page_fault.slot = self.gfn_to_memslot(page_fault.gfn, vm); //kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);没完成 } //异步页面错误(Async #PF),也称为预取错误(prefetch faults), //从客机(guest)的角度来看并不是错误,并且已经在原始错误发生时被计数。 @@ -194,7 +207,7 @@ impl VirtCpu { } let r = if page_fault.is_tdp { - self.tdp_page_fault(&mut page_fault).unwrap() + self.tdp_page_fault(vm, &mut page_fault).unwrap() } else { let handle = self.arch.mmu().page_fault.unwrap(); handle(self, &page_fault).unwrap() @@ -211,52 +224,58 @@ impl VirtCpu { PFRet::Spurious => self.stat.pf_spurious += 1, _ => {} } - + Ok(r) } - fn gfn_to_memslot(&self, gfn: u64) -> Option> { - let slot_set: Arc = self.kvm_vcpu_memslots(); + fn gfn_to_memslot(&self, gfn: u64, vm: &Vm) -> Option> { + let slot_set: Arc = self.kvm_vcpu_memslots(vm); //...todo - - search_memslots(slot_set, gfn) + search_memslots(slot_set, gfn) } - pub fn kvm_vcpu_memslots(&self) ->Arc { - let binding = self.kvm(); - let kvm = binding.lock(); - kvm.memslots.index(0).clone() + pub fn kvm_vcpu_memslots(&self, vm: &Vm) -> Arc { + vm.memslots.index(0).clone() } - fn tdp_page_fault(&mut self, page_fault: &mut KvmPageFault) -> Result { + fn tdp_page_fault( + &mut self, + vm: &Vm, + page_fault: &mut KvmPageFault, + ) -> Result { // 如果 shadow_memtype_mask 为真,并且虚拟机有非一致性 DMA //if shadow_memtype_mask != 0 && self.kvm().lock().arch.noncoherent_dma_count > 0 { - while page_fault.max_level > PageLevel::Level4K as u8{ - let page_num = PageLevel::kvm_pages_per_hpage(page_fault.max_level); + while page_fault.max_level > PageLevel::Level4K as u8 { + let page_num = PageLevel::kvm_pages_per_hpage(page_fault.max_level); - //低地址对齐 - let base = gfn_round_for_level(page_fault.gfn, page_fault.max_level); + //低地址对齐 + let base = gfn_round_for_level(page_fault.gfn, page_fault.max_level); - //检查给定 GFN 范围内的内存类型是否一致,暂未实现 - if kvm_mtrr_check_gfn_range_consistency(self, base, page_num) { - break; - } - - page_fault.max_level -= 1; + //检查给定 GFN 范围内的内存类型是否一致,暂未实现 + if kvm_mtrr_check_gfn_range_consistency(self, base, page_num) { + break; } + + page_fault.max_level -= 1; + } //} - if is_tdp_mmu_enabled() { - return self.kvm_tdp_mmu_page_fault(page_fault); - } + if is_tdp_mmu_enabled() { + return self.kvm_tdp_mmu_page_fault(vm, page_fault); + } self.direct_page_fault(page_fault) } - fn kvm_tdp_mmu_page_fault(&self,page_fault: &mut KvmPageFault)->Result{ - - //page_fault_handle_page_track(page_fault) + fn kvm_tdp_mmu_page_fault( + &self, + vm: &Vm, + page_fault: &mut KvmPageFault, + ) -> Result { + //page_fault_handle_page_track(page_fault) //fast_page_fault(page_fault); //mmu_topup_memory_caches(false);//补充内存缓存 - let mut r= self.kvm_faultin_pfn(page_fault, 
1|1<<1|1<<2).unwrap(); + let mut r = self + .kvm_faultin_pfn(vm, page_fault, 1 | 1 << 1 | 1 << 2) + .unwrap(); if r != PFRet::Continue { return Ok(r.into()); } @@ -266,34 +285,37 @@ impl VirtCpu { self.tdp_map(page_fault); Ok(r.into()) } - fn tdp_map(&self,page_fault: &mut KvmPageFault)->Result{ + fn tdp_map(&self, page_fault: &mut KvmPageFault) -> Result { //没有实现SPTE,huge page相关 - let mmu=self.arch.mmu(); + let mmu = self.arch.mmu(); let kvm = self.kvm(); let ret = PFRet::Retry; - - - let mut tdp_iter : TdpIter=TdpIter::default(); + let mut tdp_iter: TdpIter = TdpIter::default(); + + tdp_iter.start( + virt_2_phys(mmu.root.hpa as usize), /*__va */ + mmu.root_role.level() as u8, + PageLevel::Level4K as u8, + page_fault.gfn, + ); - tdp_iter.start(virt_2_phys(mmu.root.hpa as usize)/*__va */,mmu.root_role.level() as u8, - PageLevel::Level4K as u8,page_fault.gfn ); - for iter in tdp_iter{ - if !(iter.valid && iter.gfnpa的转换可能有点问题 - // 如果启用了 NX 巨大页解决方法,则进行调整 if page_fault.nx_huge_page_workaround_enabled { page_fault.nx_huge_page_workaround_enabled = false; } - if iter.level == page_fault.goal_level{ + if iter.level == page_fault.goal_level { //self.map_handle_target_level(page_fault,&mut iter); } - + //如果在比目标更高的级别有一个映射大页的 SPTE, //那么该 SPTE 必须被清除并替换为非叶子 SPTE。 if is_shadow_present_pte(iter.old_spte) && !is_large_pte(iter.old_spte) { @@ -301,59 +323,69 @@ impl VirtCpu { } //SPTE是non-present或者指向一个需要split的大页 - - } todo!() } ///todo()!!! - fn map_handle_target_level(&self,page_fault:&mut KvmPageFault,iter: &mut TdpIter){ + fn map_handle_target_level(&self, page_fault: &mut KvmPageFault, iter: &mut TdpIter) { todo!() } - fn direct_page_fault(&self,page_fault: &KvmPageFault)->Result{ + fn direct_page_fault(&self, page_fault: &KvmPageFault) -> Result { todo!() } - fn kvm_faultin_pfn(&self,page_fault: &mut KvmPageFault,access: u32)->Result{ - page_fault.mmu_seq = self.kvm().lock().mmu_invalidate_seq; + fn kvm_faultin_pfn( + &self, + vm: &Vm, + page_fault: &mut KvmPageFault, + access: u32, + ) -> Result { + page_fault.mmu_seq = vm.mmu_invalidate_seq; self.__kvm_faultin_pfn(page_fault) } - fn __kvm_faultin_pfn(&self,page_fault: &mut KvmPageFault)->Result{ + fn __kvm_faultin_pfn(&self, page_fault: &mut KvmPageFault) -> Result { let slot = &page_fault.slot; let mut is_async = false; if slot.is_none() { return Err(SystemError::KVM_HVA_ERR_BAD); } let slot = slot.as_ref().unwrap().read(); - - if slot.get_flags().bits()& UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits()!= 0 { - return Ok(PFRet::Retry); - } - if !slot.is_visible(){ - /* 不要将私有内存槽暴露给 L2。 */ - if self.arch.is_guest_mode() { - drop(slot); - page_fault.slot = None; - page_fault.pfn = KVM_PFN_NOSLOT; - page_fault.map_writable = false; - return Ok(PFRet::Continue); - } - /* - * 如果 APIC 访问页面存在但被禁用,则直接进行仿真, - * 而不缓存 MMIO 访问或创建 MMIO SPTE。 - * 这样,当 AVIC 重新启用时,不需要清除缓存。 - */ - // if slot.get_id() == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && !self.kvm_apicv_activated() - // { - // return PFRet::Emulate; - // } - } + if slot.get_flags().bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits() != 0 { + return Ok(PFRet::Retry); + } + if !slot.is_visible() { + /* 不要将私有内存槽暴露给 L2。 */ + if self.arch.is_guest_mode() { + drop(slot); + page_fault.slot = None; + page_fault.pfn = KVM_PFN_NOSLOT; + page_fault.map_writable = false; + return Ok(PFRet::Continue); + } + /* + * 如果 APIC 访问页面存在但被禁用,则直接进行仿真, + * 而不缓存 MMIO 访问或创建 MMIO SPTE。 + * 这样,当 AVIC 重新启用时,不需要清除缓存。 + */ + // if slot.get_id() == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && !self.kvm_apicv_activated() + // { + // return PFRet::Emulate; 
+ // } + } // 尝试将 GFN 转换为 PFN - page_fault.pfn = __gfn_to_pfn_memslot(Some(&slot), page_fault.gfn, false, false, &mut is_async, - page_fault.write, &mut page_fault.map_writable, - &mut page_fault.hva)?; + page_fault.pfn = __gfn_to_pfn_memslot( + Some(&slot), + page_fault.gfn, + false, + false, + &mut is_async, + page_fault.write, + &mut page_fault.map_writable, + &mut page_fault.hva, + )?; + if !is_async { return Ok(PFRet::Continue); /* *pfn 已经有正确的页面 */ } diff --git a/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs b/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs index a0298a022..3b0577c63 100644 --- a/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs +++ b/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs @@ -1,27 +1,34 @@ -use crate::{arch::vm::mmu::mmu::gfn_round_for_level, mm::{virt_2_phys, PhysAddr, VirtAddr}, time::sleep, virt::kvm::host_mem::PAGE_SHIFT}; - -use super::{mmu::{PageLevel, PAGE_SIZE}, mmu_internal::KvmMmuPage}; - +use crate::{ + arch::vm::mmu::mmu::gfn_round_for_level, + mm::{virt_2_phys, PhysAddr, VirtAddr}, + time::sleep, + virt::kvm::host_mem::PAGE_SHIFT, +}; + +use super::{ + mmu::{PageLevel, PAGE_SIZE}, + mmu_internal::KvmMmuPage, +}; pub const PT64_ROOT_MAX_LEVEL: usize = 5; //通常只用到4级,但是确实有5级的情况 pub const PT_LEVEL_BITS: u8 = 9; // 每个页表级别的位数 -pub const PT64_ENT_PER_PAGE: u32 = 1<<9; +pub const PT64_ENT_PER_PAGE: u32 = 1 << 9; pub const PTE_LEN: usize = 64; //Bits 51:12 are from the EPT PDPTE pub const PT64_BASE_ADDR_MASK: u64 = ((1u64 << 52) - 1) & !(PAGE_SIZE - 1); pub fn shadow_pt_index(addr: u64, level: u8) -> u64 { - (addr >> (PAGE_SHIFT as u8 + (level - 1) * PT_LEVEL_BITS)) & ((1 << PT_LEVEL_BITS) - 1) + (addr >> (PAGE_SHIFT as u8 + (level - 1) * PT_LEVEL_BITS)) & ((1 << PT_LEVEL_BITS) - 1) } pub fn is_last_spte(pte: u64, level: u8) -> bool { level == PageLevel::Level4K as u8 || is_large_pte(pte) } -pub fn is_shadow_present_pte(pte :u64) ->bool{ - pte & 1<<11 !=0//在intel手冊中:ept PTE:11 Ignored.不是很懂 +pub fn is_shadow_present_pte(pte: u64) -> bool { + pte & 1 << 11 != 0 //在intel手冊中:ept PTE:11 Ignored.不是很懂 } -pub fn is_large_pte(pte :u64) ->bool{ - pte & 1<<7 !=0//在intel手冊中:ept PTE:7 Ignored. +pub fn is_large_pte(pte: u64) -> bool { + pte & 1 << 7 != 0 //在intel手冊中:ept PTE:7 Ignored. } ///Bits 51:12 are from the EPT PDPTE pub fn spte_to_pfn(pte: u64) -> u64 { @@ -29,21 +36,25 @@ pub fn spte_to_pfn(pte: u64) -> u64 { } #[derive(Default)] -pub struct TdpIter{ +pub struct TdpIter { inner: TdpIterInner, } -impl TdpIter{ - pub fn start(&self,root_pt:usize,root_level:u8,min_level:u8,next_last_level_gfn:u64)->Self{ +impl TdpIter { + pub fn start( + &self, + root_pt: usize, + root_level: u8, + min_level: u8, + next_last_level_gfn: u64, + ) -> Self { let mut inner = self.inner.clone(); - inner.start(root_pt,root_level,min_level,next_last_level_gfn); - TdpIter{ - inner - } + inner.start(root_pt, root_level, min_level, next_last_level_gfn); + TdpIter { inner } } } ///迭代器将遍历分页结构,直到找到此 GFN 的映射。 -#[derive(Default,Clone)] +#[derive(Default, Clone)] pub struct TdpIterInner { next_last_level_gfn: u64, /// 线程上次让出时的 next_last_level_gfn。 @@ -59,7 +70,7 @@ pub struct TdpIterInner { /// 当前 SPTE 映射的最低 GFN hpa>>shift? 
pub gfn: u64, - ///给迭代器的根页级别 + ///给迭代器的根页级别 pub root_level: u8, ///迭代器应遍历到的最低级别 @@ -75,10 +86,16 @@ pub struct TdpIterInner { /// pub valid: bool, } -impl TdpIterInner{ +impl TdpIterInner { ///初始化ept iter - pub fn start(&mut self,root_pt :usize,root_level:u8, - min_level: u8,next_last_level_gfn:u64){ + #[inline(never)] + pub fn start( + &mut self, + root_pt: usize, + root_level: u8, + min_level: u8, + next_last_level_gfn: u64, + ) { // if root_pt.role.level() == 0 || root_pt.role.level() > PT64_ROOT_MAX_LEVEL as u32 { // self.valid = false; // return; @@ -91,8 +108,8 @@ impl TdpIterInner{ self.next_last_level_gfn = next_last_level_gfn; self.root_level = root_level as u8; self.min_level = min_level as u8; - self.pt_path[(self.root_level - 1) as usize] =root_pt as u64; - self.yielded_gfn=self.next_last_level_gfn; + self.pt_path[(self.root_level - 1) as usize] = root_pt as u64; + self.yielded_gfn = self.next_last_level_gfn; self.level = self.root_level; self.gfn = gfn_round_for_level(self.next_last_level_gfn, self.level); @@ -101,20 +118,22 @@ impl TdpIterInner{ } /* - * 重新计算当前GFN和level和SPTE指针,并重新读取SPTE。 - */ - fn tdp_iter_refresh_sptep(&mut self){ - self.sptep = PhysAddr::new((self.pt_path[self.level as usize - 1] + - shadow_pt_index(self.gfn <bool{ + fn try_step_down(&mut self) -> bool { if self.level == self.min_level { return false; } //在下降之前重新读取SPTE,以避免遍历到不再从此条目链接的页表中。 - self.old_spte =read_sptep(self.sptep); + self.old_spte = read_sptep(self.sptep); - match spte_to_child_pt(self.old_spte,self.level){ - Some(child_pt) =>{ + match spte_to_child_pt(self.old_spte, self.level) { + Some(child_pt) => { self.level -= 1; self.pt_path[self.level as usize - 1] = child_pt.data() as u64; self.gfn = gfn_round_for_level(self.gfn, self.level); @@ -140,10 +159,9 @@ impl TdpIterInner{ true } None => false, - } } - fn try_step_up(&mut self)->bool{ + fn try_step_up(&mut self) -> bool { if self.level == self.root_level { return false; } @@ -155,25 +173,24 @@ impl TdpIterInner{ ///在当前页表的当前级别中,移动到下一个条目。下一个条目可以指向一个page backing guest memory , ///或者另一个页表,或者它可能是不存在的。如果迭代器能够移动到页表中的下一个条目,则返回true, ///如果迭代器已经在当前页表的末尾,则返回false。 - fn try_step_side(&mut self)-> bool{ + fn try_step_side(&mut self) -> bool { //检查迭代器是否已经在当前页表的末尾。 - if shadow_pt_index(self.gfn< Option { - let inner=&mut self.inner; + let inner = &mut self.inner; if !inner.valid { return None; } @@ -184,20 +201,19 @@ impl Iterator for TdpIter{ None } } - } ///给定一个 SPTE 及其级别,返回一个指针,该指针包含 SPTE 所引用的子页表的hva。 ///如果没有这样的条目,则返回 null。 /// -fn spte_to_child_pt(spte:u64,level:u8) ->Option{ +fn spte_to_child_pt(spte: u64, level: u8) -> Option { //没有子页表 - if !is_shadow_present_pte(spte) || is_last_spte(spte,level){ + if !is_shadow_present_pte(spte) || is_last_spte(spte, level) { return None; } Some(VirtAddr::new(virt_2_phys//__va ((spte_to_pfn(spte)<u64{ - unsafe{*(sptep.data() as *const u64)} -} \ No newline at end of file +pub fn read_sptep(sptep: PhysAddr) -> u64 { + unsafe { *(sptep.data() as *const u64) } +} diff --git a/kernel/src/arch/x86_64/vm/vmx/exit.rs b/kernel/src/arch/x86_64/vm/vmx/exit.rs index 18dad7aaf..1558757d4 100644 --- a/kernel/src/arch/x86_64/vm/vmx/exit.rs +++ b/kernel/src/arch/x86_64/vm/vmx/exit.rs @@ -4,7 +4,9 @@ use x86::vmx::vmcs::{guest, ro}; use crate::{ arch::vm::asm::{IntrInfo, VmxAsm}, - virt::vm::kvm_host::vcpu::VirtCpu, + kerror, + libs::spinlock::SpinLockGuard, + virt::vm::kvm_host::{vcpu::VirtCpu, Vm}, }; use super::{ept::EptViolationExitQual, vmx_info, PageFaultErr}; @@ -32,7 +34,7 @@ pub struct VmxExitReason { } //#define 
VMX_EXIT_REASONS -#[derive(FromPrimitive, PartialEq)] +#[derive(FromPrimitive, PartialEq, Clone, Copy)] #[allow(non_camel_case_types)] pub enum VmxExitReasonBasic { EXCEPTION_OR_NMI = 0, @@ -248,6 +250,7 @@ impl VmxExitHandlers { #[inline(never)] pub fn try_handle_exit( vcpu: &mut VirtCpu, + vm: &Vm, basic: VmxExitReasonBasic, ) -> Option> { match basic { @@ -255,12 +258,12 @@ impl VmxExitHandlers { return Some(Self::handle_io(vcpu)); } VmxExitReasonBasic::EPT_VIOLATION => { - return Some(Self::handle_ept_violation(vcpu)); + return Some(Self::handle_ept_violation(vcpu, vm)); } - - _ => { - None + VmxExitReasonBasic::EXTERNAL_INTERRUPT => { + return Some(Self::handle_external_interrupt(vcpu)); } + _ => None, } } @@ -268,7 +271,12 @@ impl VmxExitHandlers { todo!(); } - fn handle_ept_violation(vcpu: &mut VirtCpu) -> Result { + fn handle_external_interrupt(vcpu: &mut VirtCpu) -> Result { + vcpu.stat.irq_exits += 1; + Ok(1) + } + + fn handle_ept_violation(vcpu: &mut VirtCpu, vm: &Vm) -> Result { let exit_qualification = vcpu.get_exit_qual(); // EPT 违规发生在从 NMI 执行 iret 时, @@ -286,12 +294,14 @@ impl VmxExitHandlers { // trace_kvm_page_fault(vcpu, gpa, exit_qualification);//fztodo!() // 根据故障类型确定错误代码 - let mut error_code = if exit_qualification & (EptViolationExitQual::ACC_READ.bits()) != 0 {//active + let mut error_code = if exit_qualification & (EptViolationExitQual::ACC_READ.bits()) != 0 { + //active PageFaultErr::PFERR_USER.bits() } else { 0 }; - error_code |= if exit_qualification & (EptViolationExitQual::ACC_WRITE.bits()) != 0 {//active + error_code |= if exit_qualification & (EptViolationExitQual::ACC_WRITE.bits()) != 0 { + //active PageFaultErr::PFERR_WRITE.bits() } else { 0 @@ -307,7 +317,7 @@ impl VmxExitHandlers { 0 }; error_code |= if exit_qualification & (EptViolationExitQual::GVA_TRANSLATED.bits()) != 0 { - PageFaultErr::PFERR_GUEST_FINAL.bits()//active + PageFaultErr::PFERR_GUEST_FINAL.bits() //active } else { PageFaultErr::PFERR_GUEST_PAGE.bits() }; @@ -323,6 +333,6 @@ impl VmxExitHandlers { // return kvm_emulate_instruction(vcpu, 0); // } - vcpu.page_fault(gpa, error_code, None, 0) + vcpu.page_fault(vm, gpa, error_code, None, 0) } } diff --git a/kernel/src/arch/x86_64/vm/vmx/mod.rs b/kernel/src/arch/x86_64/vm/vmx/mod.rs index 7591b0e78..7500f507b 100644 --- a/kernel/src/arch/x86_64/vm/vmx/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/mod.rs @@ -1211,9 +1211,10 @@ impl KvmFunc for VmxKvmFunc { //vmx_handle_exit &self, vcpu: &mut VirtCpu, + vm: &Vm, fastpath: ExitFastpathCompletion, ) -> Result { - let r = vmx_info().vmx_handle_exit(vcpu, fastpath); + let r = vmx_info().vmx_handle_exit(vcpu, vm, fastpath); if vcpu.vmx().exit_reason.bus_lock_detected() { todo!() @@ -2977,6 +2978,7 @@ impl Vmx { pub fn vmx_handle_exit( &self, vcpu: &mut VirtCpu, + vm: &Vm, exit_fastpath: ExitFastpathCompletion, ) -> Result { let exit_reason = vcpu.vmx().exit_reason; @@ -3046,8 +3048,11 @@ impl Vmx { if exit_fastpath != ExitFastpathCompletion::None { return Err(SystemError::EINVAL); } - match VmxExitHandlers::try_handle_exit(vcpu, VmxExitReasonBasic::from(exit_reason.basic())) - { + match VmxExitHandlers::try_handle_exit( + vcpu, + vm, + VmxExitReasonBasic::from(exit_reason.basic()), + ) { Some(Ok(r)) => return Ok(r), Some(Err(_)) | None => unexpected_vmexit(vcpu), } diff --git a/kernel/src/virt/vm/kvm_host/mem.rs b/kernel/src/virt/vm/kvm_host/mem.rs index 8fa8613dd..d0eb826b8 100644 --- a/kernel/src/virt/vm/kvm_host/mem.rs +++ b/kernel/src/virt/vm/kvm_host/mem.rs @@ -8,11 +8,17 @@ use hashbrown::HashMap; 
use system_error::SystemError; use crate::{ - arch::{vm::mmu::mmu::PAGE_SIZE, MMArch}, libs::{ + arch::{vm::mmu::mmu::PAGE_SIZE, MMArch}, + libs::{ rbtree::RBTree, rwlock::{RwLock, RwLockReadGuard, RwLockWriteGuard}, spinlock::{SpinLock, SpinLockGuard}, - }, mm::{kernel_mapper::KernelMapper, page::PageFlags, MemoryManagementArch, VirtAddr}, virt::{kvm::host_mem::PAGE_SHIFT, vm::{kvm_host::KVM_ADDRESS_SPACE_NUM, user_api::KvmUserspaceMemoryRegion}} + }, + mm::{kernel_mapper::KernelMapper, page::PageFlags, MemoryManagementArch, VirtAddr}, + virt::{ + kvm::host_mem::PAGE_SHIFT, + vm::{kvm_host::KVM_ADDRESS_SPACE_NUM, user_api::KvmUserspaceMemoryRegion}, + }, }; use super::{LockedVm, Vm}; @@ -21,14 +27,14 @@ pub const KVM_USER_MEM_SLOTS: u16 = u16::MAX; pub const KVM_INTERNAL_MEM_SLOTS: u16 = 3; pub const KVM_MEM_SLOTS_NUM: u16 = KVM_USER_MEM_SLOTS - KVM_INTERNAL_MEM_SLOTS; pub const KVM_MEM_MAX_NR_PAGES: usize = (1 << 31) - 1; -pub const APIC_ACCESS_PAGE_PRIVATE_MEMSLOT: u16 = KVM_MEM_SLOTS_NUM+1; +pub const APIC_ACCESS_PAGE_PRIVATE_MEMSLOT: u16 = KVM_MEM_SLOTS_NUM + 1; /// 对于普通的页帧号(PFN),最高的12位应该为零, /// 因此我们可以mask位62到位52来表示错误的PFN, /// mask位63来表示无槽的PFN。 -const KVM_PFN_ERR_MASK: u64 = 0x7ff << 52;//0x7FF0000000000000 -const KVM_PFN_ERR_NOSLOT_MASK: u64 = 0xfff << 52;//0xFFF0000000000000 -const KVM_PFN_NOSLOT: u64 = 1 << 63;//0x8000000000000000 +const KVM_PFN_ERR_MASK: u64 = 0x7ff << 52; //0x7FF0000000000000 +const KVM_PFN_ERR_NOSLOT_MASK: u64 = 0xfff << 52; //0xFFF0000000000000 +const KVM_PFN_NOSLOT: u64 = 1 << 63; //0x8000000000000000 const KVM_PFN_ERR_FAULT: u64 = KVM_PFN_ERR_MASK; const KVM_PFN_ERR_HWPOISON: u64 = KVM_PFN_ERR_MASK + 1; @@ -158,7 +164,7 @@ pub struct KvmMemSlot { } impl KvmMemSlot { pub fn check_aligned_addr(&self, align: usize) -> bool { - self.userspace_addr.data() % align == 0 + self.userspace_addr.data() % align == 0 } pub fn get_flags(&self) -> UserMemRegionFlag { self.flags @@ -166,9 +172,10 @@ impl KvmMemSlot { pub fn get_id(&self) -> u16 { self.id } - // 检查内存槽是否可见 + // 检查内存槽是否可见 pub fn is_visible(&self) -> bool { - self.id < KVM_USER_MEM_SLOTS && (self.flags.bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits()) == 0 + self.id < KVM_USER_MEM_SLOTS + && (self.flags.bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits()) == 0 } } @@ -350,9 +357,8 @@ impl Vm { } }; - if change == KvmMemoryChangeMode::Create - || change == KvmMemoryChangeMode::Move - || slots_guard.gfn_tree.contains_key(&base_gfn) + if (change == KvmMemoryChangeMode::Create || change == KvmMemoryChangeMode::Move) + && slots_guard.gfn_tree.contains_key(&base_gfn) { return Err(SystemError::EEXIST); } @@ -594,8 +600,9 @@ fn __gfn_to_hva_many( let slot = slot.as_ref().unwrap(); // 检查内存槽是否无效或尝试对只读内存槽进行写操作 - if slot.flags.bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits() != 0 || - (slot.flags.bits() & UserMemRegionFlag::READONLY.bits() != 0) && write { + if slot.flags.bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits() != 0 + || (slot.flags.bits() & UserMemRegionFlag::READONLY.bits() != 0) && write + { return Err(SystemError::KVM_HVA_ERR_BAD); } @@ -645,15 +652,15 @@ pub fn __gfn_to_pfn_memslot( ) -> Result { let addr = __gfn_to_hva_many(&slot, gfn, None, write)?; *hva = addr; - + //todo:检查地址是否为错误 // 如果内存槽为只读,且 writable 不为空,则更新 writable 的值 if slot.unwrap().flags.bits() & UserMemRegionFlag::READONLY.bits() != 0 { *writable = false; } - - let pfn = hva_to_pfn(addr, atomic,interruptible, is_async,write,writable)?; + + let pfn = hva_to_pfn(addr, atomic, interruptible, is_async, write, writable)?; return 
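// Hedged companion sketch for the error-PFN encoding documented above:
// any of bits 62:52 marks a hard error and bit 63 marks "no slot". These
// predicates assume the same convention as the Linux helpers the three
// constants were taken from.
#[inline]
fn is_error_pfn(pfn: u64) -> bool {
    pfn & KVM_PFN_ERR_MASK != 0
}

#[inline]
fn is_noslot_pfn(pfn: u64) -> bool {
    pfn & KVM_PFN_NOSLOT != 0
}

#[inline]
fn is_error_noslot_pfn(pfn: u64) -> bool {
    // either a hard error (bits 62:52) or the no-slot marker (bit 63)
    pfn & KVM_PFN_ERR_NOSLOT_MASK != 0
}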
Ok(pfn); } /// 将用户空间虚拟地址(HVA)转换为页帧号(PFN)。 diff --git a/kernel/src/virt/vm/kvm_host/mod.rs b/kernel/src/virt/vm/kvm_host/mod.rs index e90dc1d7b..a3883b507 100644 --- a/kernel/src/virt/vm/kvm_host/mod.rs +++ b/kernel/src/virt/vm/kvm_host/mod.rs @@ -18,7 +18,10 @@ use crate::{ CurrentKvmManager, KvmArch, VirtCpuArch, }, filesystem::vfs::file::{File, FileMode}, - libs::{rbtree::RBTree, spinlock::{SpinLock, SpinLockGuard}}, + libs::{ + rbtree::RBTree, + spinlock::{SpinLock, SpinLockGuard}, + }, mm::ucontext::AddressSpace, process::ProcessManager, smp::cpu::ProcessorId, @@ -143,7 +146,7 @@ pub struct Vm { #[cfg(target_arch = "x86_64")] pub kvm_vmx: KvmVmx, - pub mmu_invalidate_seq:u64//用于表示内存管理单元(MMU)无效化序列号 + pub mmu_invalidate_seq: u64, //用于表示内存管理单元(MMU)无效化序列号 } impl Vm { @@ -246,15 +249,23 @@ pub enum MutilProcessorState { ///当 "approx" 设置为 true 时,即使地址落在空洞中,也会返回 memslot。 ///在这种情况下,将返回空洞边界的其中一个 memslot。 /// 先简陋完成,原本是二分,现在先遍历 -pub fn search_memslots(slot_set:Arc,gfn:u64,/*_approx:bool*/)->Option>{ - let slots=slot_set.lock(); +pub fn search_memslots( + slot_set: Arc, + gfn: u64, /*_approx:bool*/ +) -> Option> { + let slots = slot_set.lock(); let node = &slots.gfn_tree; //let(start,end)=(0,node.len()-1); - for (_gfn_num,slot) in node.iter(){ + for (_gfn_num, slot) in node.iter() { let slot_guard = slot.read(); - if gfn >= slot_guard.base_gfn && gfn < slot_guard.base_gfn + slot_guard.npages as u64{ + kdebug!( + "gfn:{gfn},slot base_gfn: {},slot npages: {}", + slot_guard.base_gfn, + slot_guard.npages + ); + if gfn >= slot_guard.base_gfn && gfn < slot_guard.base_gfn + slot_guard.npages as u64 { return Some(slot.clone()); } } return None; -} \ No newline at end of file +} From 0d1f0c7c4b4e56dc0d9e5dd4da38f5db04181c65 Mon Sep 17 00:00:00 2001 From: Brahmamantra <2033552517@qq.com> Date: Sat, 21 Sep 2024 15:29:48 +0800 Subject: [PATCH 09/10] =?UTF-8?q?=E5=88=9D=E6=AD=A5=E5=AE=8C=E6=88=90ept?= =?UTF-8?q?=E6=98=A0=E5=B0=84.=E4=BD=86=E4=B8=8D=E5=81=9CEPT=5FVIOLATION?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/src/arch/x86_64/vm/kvm_host/mod.rs | 1 + kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs | 47 +- kernel/src/arch/x86_64/vm/mmu/mmu.rs | 118 ++++- kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs | 91 ++-- kernel/src/arch/x86_64/vm/mmu/mod.rs | 2 +- kernel/src/arch/x86_64/vm/mmu/pte.rs | 4 +- kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs | 438 +++++++++--------- kernel/src/arch/x86_64/vm/mod.rs | 2 +- kernel/src/arch/x86_64/vm/mtrr.rs | 4 +- kernel/src/arch/x86_64/vm/vmx/ept/mod.rs | 366 ++++++++++++++- kernel/src/arch/x86_64/vm/vmx/mod.rs | 18 +- kernel/src/mm/mod.rs | 2 +- kernel/src/virt/vm/kvm_host/mem.rs | 1 + 13 files changed, 796 insertions(+), 298 deletions(-) diff --git a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs index 67026f189..d11f52e89 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs @@ -335,6 +335,7 @@ pub enum KvmReg { VcpuRegsRip = 16, NrVcpuRegs = 17, + //VcpuExregPdptr = NrVcpuRegs, VcpuExregCr0, VcpuExregCr3, VcpuExregCr4, diff --git a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs index e774ac57b..47b949a1c 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs @@ -5,6 +5,7 @@ use alloc::{boxed::Box, sync::Arc, vec::Vec}; use bitmap::{traits::BitMapOps, AllocBitmap, BitMapCore}; use raw_cpuid::CpuId; use system_error::SystemError; +use 
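// The comment on search_memslots() above notes the linear walk is a
// stopgap for a binary search. A sketch of the intended lookup over a
// flattened view sorted by base_gfn; MemSlotView is a hypothetical
// illustration type, not part of this patch.
struct MemSlotView {
    base_gfn: u64,
    npages: u64,
}

fn search_memslots_bsearch(slots: &[MemSlotView], gfn: u64) -> Option<&MemSlotView> {
    // first index whose base_gfn is strictly greater than gfn ...
    let idx = slots.partition_point(|s| s.base_gfn <= gfn);
    // ... so the only candidate is the slot immediately before it
    let candidate = slots.get(idx.checked_sub(1)?)?;
    if gfn < candidate.base_gfn + candidate.npages {
        Some(candidate)
    } else {
        None
    }
}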
x86::vmx::vmcs::guest; use x86::{ bits64::rflags::RFlags, controlregs::{Cr0, Cr4, Xcr0}, @@ -14,8 +15,9 @@ use x86::{ }; use x86_64::registers::control::EferFlags; +use crate::arch::vm::asm::VmxAsm; use crate::arch::vm::vmx::exit::ExitFastpathCompletion; -use crate::kwarn; +use crate::{kdebug, kwarn}; use crate::virt::vm::kvm_host::mem::KvmMmuMemoryCache; use crate::virt::vm::kvm_host::vcpu::VcpuMode; use crate::{ @@ -254,6 +256,27 @@ impl X86VcpuArch { self.efer.contains(EferFlags::LONG_MODE_ACTIVE) } + #[inline] + pub fn is_pae_paging(&mut self) -> bool { + + let flag1 = self.is_long_mode(); + let flag2 = self.is_pae(); + let flag3 = self.is_paging(); + + !flag1 && flag2 && flag3 + } + + #[inline] + pub fn is_pae(&mut self) -> bool { + !self.read_cr4_bits(Cr4::CR4_ENABLE_PAE).is_empty() + + } + #[inline] + pub fn is_paging(&mut self) -> bool { + //return likely(kvm_is_cr0_bit_set(vcpu, X86_CR0_PG)); + !self.read_cr0_bits(Cr0::CR0_ENABLE_PAGING).is_empty() + } + #[inline] pub fn is_portected_mode(&mut self) -> bool { !self.read_cr0_bits(Cr0::CR0_PROTECTED_MODE).is_empty() @@ -1177,7 +1200,7 @@ impl VirtCpu { if !self.arch.is_register_available(KvmReg::VcpuExregCr3) { x86_kvm_ops().cache_reg(&mut self.arch, KvmReg::VcpuExregCr3); } - + kdebug!("read_cr3:: cr3: {:#x}", self.arch.cr3); return self.arch.cr3; } @@ -1293,11 +1316,14 @@ impl VirtCpu { *mmu_reset_needed |= self.read_cr3() != sregs.cr3; self.arch.cr3 = sregs.cr3; + //kdebug!("_set_segmenet_regs_common 1:: cr3: {:#x}", self.arch.cr3); self.arch.mark_register_dirty(KvmReg::VcpuExregCr3); x86_kvm_ops().post_set_cr3(self, sregs.cr3); + //kdebug!("_set_segmenet_regs_common 2:: cr3: {:#x}", self.arch.cr3); + self.kvm_set_cr8(sregs.cr8); let efer = EferFlags::from_bits_truncate(sregs.efer); @@ -1462,6 +1488,23 @@ impl VirtCpu { KvmX86Asm::write_pkru(self.arch.pkru); } } + + pub fn load_pdptrs(&mut self){ + //let mmu = self.arch.mmu(); + if !self.arch.is_register_dirty(KvmReg::VcpuExregCr3){ + return; + } + if self.arch.is_pae_paging(){ + let mmu = self.arch.mmu(); + + VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[0]); + VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[1]); + VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[2]); + VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[3]); + }else{ + kdebug!("load_pdptrs: not pae paging"); + } + } } bitflags! 
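// Sketch of the PDPTE store that load_pdptrs() above is presumably
// aiming for: save_pdptrs() in mmu.rs reads four distinct VMCS fields
// (PDPTE0_FULL..PDPTE3_FULL), so the write side would target one field
// per array element rather than writing PDPTE0_FULL four times.
fn write_guest_pdptrs(pdptrs: &[u64; 4]) {
    VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, pdptrs[0]);
    VmxAsm::vmx_vmwrite(guest::PDPTE1_FULL, pdptrs[1]);
    VmxAsm::vmx_vmwrite(guest::PDPTE2_FULL, pdptrs[2]);
    VmxAsm::vmx_vmwrite(guest::PDPTE3_FULL, pdptrs[3]);
}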
{ diff --git a/kernel/src/arch/x86_64/vm/mmu/mmu.rs b/kernel/src/arch/x86_64/vm/mmu/mmu.rs index e96c237eb..ccc85648e 100644 --- a/kernel/src/arch/x86_64/vm/mmu/mmu.rs +++ b/kernel/src/arch/x86_64/vm/mmu/mmu.rs @@ -1,3 +1,5 @@ +use crate::arch::vm::asm::VmxAsm; +use crate::arch::vm::kvm_host::KvmReg; use crate::kerror; use crate::virt::kvm::host_mem::PAGE_SHIFT; use crate::{arch::mm::X86_64MMArch, kdebug, kwarn}; @@ -9,16 +11,16 @@ use crate::{ }; use alloc::{sync::Arc, vec::Vec}; use bitfield_struct::bitfield; +use uefi::table::cfg; +use x86::vmx::vmcs::guest; use core::intrinsics::likely; +use core::ops::{Add, Sub}; use raw_cpuid::CpuId; use system_error::SystemError; use x86::controlregs::{Cr0, Cr4}; use x86_64::registers::control::EferFlags; -use super::super::{ - vmx::vmx_info, - x86_kvm_ops, -}; +use super::super::{vmx::vmx_info, x86_kvm_ops}; use super::mmu_internal::KvmPageFault; const PT64_ROOT_5LEVEL: usize = 5; @@ -38,12 +40,22 @@ static mut MAX_HUGE_PAGE_LEVEL: PageLevel = PageLevel::None; pub const PAGE_SIZE: u64 = 1 << PAGE_SHIFT; -pub fn is_tdp_mmu_enabled()->bool{ +pub fn is_tdp_mmu_enabled() -> bool { unsafe { TDP_MMU_ENABLED } } +pub fn max_huge_page_level() -> PageLevel{ + //不让外面直接修改MAX_HUGE_PAGE_LEVEL的值 + let level: PageLevel; + unsafe { + level = MAX_HUGE_PAGE_LEVEL; + } + level +} + #[allow(dead_code)] #[repr(u8)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum PageLevel { None, Level4K, @@ -52,15 +64,49 @@ pub enum PageLevel { Level512G, LevelNum, } +// 实现 Add trait +impl Add for PageLevel { + type Output = Self; + + fn add(self, other: usize) -> Self { + let result = self as usize + other; + match result { + 0 => PageLevel::None, + 1 => PageLevel::Level4K, + 2 => PageLevel::Level2M, + 3 => PageLevel::Level1G, + 4 => PageLevel::Level512G, + 5 => PageLevel::LevelNum, + _ => PageLevel::LevelNum, // 超出范围时返回 LevelNum + } + } +} +// 实现 Sub trait +impl Sub for PageLevel { + type Output = Self; + + fn sub(self, other: usize) -> Self { + let result = self as isize - other as isize; + match result { + 0 => PageLevel::None, + 1 => PageLevel::Level4K, + 2 => PageLevel::Level2M, + 3 => PageLevel::Level1G, + 4 => PageLevel::Level512G, + 5 => PageLevel::LevelNum, + _ => PageLevel::None, // 超出范围时返回 None + } + } +} impl PageLevel { fn kvm_hpage_gfn_shift(level: u8) -> u32 { ((level - 1) * 9) as u32 } - + fn kvm_hpage_shift(level: u8) -> u32 { PAGE_SHIFT + Self::kvm_hpage_gfn_shift(level) } - + fn kvm_hpage_size(level: u8) -> u64 { 1 << Self::kvm_hpage_shift(level) } @@ -99,7 +145,7 @@ impl LockedKvmMmu { } pub type KvmMmuPageFaultHandler = - fn(vcpu: &mut VirtCpu, page_fault:&KvmPageFault) -> Result; + fn(vcpu: &mut VirtCpu, page_fault: &KvmPageFault) -> Result; #[derive(Debug, Default)] #[allow(dead_code)] @@ -114,9 +160,17 @@ pub struct KvmMmu { prev_roots: [KvmMmuRootInfo; Self::KVM_MMU_NUM_PREV_ROOTS], pae_root: Vec, + + pub pdptrs: [u64;4], } impl KvmMmu { + pub fn save_pdptrs(&mut self){ + self.pdptrs[0] = VmxAsm::vmx_vmread(guest::PDPTE0_FULL); + self.pdptrs[1] = VmxAsm::vmx_vmread(guest::PDPTE1_FULL); + self.pdptrs[2] = VmxAsm::vmx_vmread(guest::PDPTE2_FULL); + self.pdptrs[3] = VmxAsm::vmx_vmread(guest::PDPTE3_FULL); + } const KVM_MMU_NUM_PREV_ROOTS: usize = 3; pub const INVALID_PAGE: u64 = u64::MAX; @@ -508,7 +562,7 @@ impl VirtCpu { if direct { self.mmu_alloc_direct_roots()?; } else { - self.mmu_alloc_shadow_roots()?; + self.mmu_alloc_shadow_roots(vm)?; } // TODO: kvm_mmu_sync_roots @@ -571,7 +625,51 @@ impl VirtCpu { Ok(()) } - fn mmu_alloc_shadow_roots(&mut self) 
-> Result<(), SystemError> { + ///没做完 + fn mmu_alloc_shadow_roots(&mut self,vm: &Vm) -> Result<(), SystemError> { + // let mut pdptrs:[u64;4] = [0;4]; + + // let root_pgd = self.get_guest_pgd(); + // let root_gfn = root_pgd >> PAGE_SHIFT; + + // let mut mmu = self.arch.mmu(); + // //检查gfn是否合法 + // if let Some(slot) = self.gfn_to_memslot(root_gfn,vm){ + // if !slot.read().is_visible(){ + // mmu.root.hpa = KvmMmu::INVALID_PAGE; + // return Err(SystemError::EFAULT); + // } + // } + + + + // //在SVM架构下?,读取PDPTR可能会访问客户机内存并导致缺页错误,从而使进程进入睡眠状态。 + // //为了避免在持有mmu_lock时发生睡眠,应该在获取mmu_lock之前先读取PDPTR。 + // //感觉先不用管 + // if mmu.cpu_role.base.level() == 3{ //PT32E_ROOT_LEVEL=3 + // mmu.save_pdptrs(); + // //x86_kvm_ops().cache_reg(&mut self.arch, KvmReg::NrVcpuRegs); + // for i in 0..4{ + // pdptrs[i] = self.arch.walk_mmu.clone().unwrap().lock().pdptrs[i]; + // if !pdptrs[i] & 0x1 != 0{ + // continue; + // } + + // if let Some(slot) = self.gfn_to_memslot(pdptrs[i]>>PAGE_SHIFT,vm){ + // if !slot.read().is_visible(){ + // pdptrs[i] = 0; + // } + // } + // } + // } + + + todo!(); } + // fn get_guest_pgd(&mut self)-> u64{ + // x86_kvm_ops().cache_reg(&mut self.arch, KvmReg::VcpuExregCr3); + // self.arch.cr3 + // } + } diff --git a/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs b/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs index edae9c9c7..3dc5055c1 100644 --- a/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs +++ b/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs @@ -4,22 +4,23 @@ use alloc::{ vec::Vec, }; use core::{intrinsics::unlikely, ops::Index}; +use x86::vmx::vmcs::{guest, host}; use system_error::SystemError; use crate::{ - arch::vm::{ + arch::{vm::{ + asm::VmxAsm, kvm_host::{EmulType, KVM_PFN_NOSLOT}, mmu::{ mmu::{PFRet, PageLevel}, - tdp_iter::{is_large_pte, is_shadow_present_pte, TdpIter}, }, mtrr::kvm_mtrr_check_gfn_range_consistency, - vmx::PageFaultErr, - }, - kwarn, + vmx::{ept::EptPageMapper, PageFaultErr}, + }, MMArch}, + kdebug, kwarn, libs::spinlock::SpinLockGuard, - mm::{virt_2_phys, PhysAddr}, + mm::{page::PageFlags, syscall::ProtFlags, virt_2_phys, PhysAddr}, virt::{ kvm::host_mem::PAGE_SHIFT, vm::kvm_host::{ @@ -52,7 +53,9 @@ pub struct KvmMmuPage { #[derive(Debug, Default)] pub struct KvmPageFault { // vcpu.do_page_fault 的参数 - addr: PhysAddr, // gpa_t 通常是一个 64 位地址 + + // addr是guestOS传进来的gpa + addr: PhysAddr, error_code: u32, prefetch: bool, @@ -87,13 +90,26 @@ pub struct KvmPageFault { // kvm_faultin_pfn 的输出 mmu_seq: u64, - pfn: u64, // kvm_pfn_t 通常是一个 64 位地址 + + // kvm_pfn_t 通常是一个 64 位地址,相当于知道了hpa + pfn: u64, hva: u64, // hva_t 通常是一个 64 位地址 map_writable: bool, // 表示访客正在尝试写入包含用于翻译写入本身的一个或多个 PTE 的 gfn write_fault_to_shadow_pgtable: bool, } +impl KvmPageFault { + pub fn pfn(&self) -> u64 { + self.pfn + } + pub fn gfn(&self) -> u64 { + self.gfn + } + pub fn gpa(&self) -> u64 { + self.addr.data() as u64 + } +} impl VirtCpu { #[inline(never)] @@ -228,7 +244,7 @@ impl VirtCpu { Ok(r) } - fn gfn_to_memslot(&self, gfn: u64, vm: &Vm) -> Option> { + pub fn gfn_to_memslot(&self, gfn: u64, vm: &Vm) -> Option> { let slot_set: Arc = self.kvm_vcpu_memslots(vm); //...todo @@ -290,50 +306,26 @@ impl VirtCpu { let mmu = self.arch.mmu(); let kvm = self.kvm(); let ret = PFRet::Retry; - - let mut tdp_iter: TdpIter = TdpIter::default(); - - tdp_iter.start( - virt_2_phys(mmu.root.hpa as usize), /*__va */ - mmu.root_role.level() as u8, - PageLevel::Level4K as u8, - page_fault.gfn, - ); - - for iter in tdp_iter { - if !(iter.valid && iter.gfn < page_fault.gfn + 1) { - //fixme:不懂这里的判断条件 - break; - } - 
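// Quick reference for the gfn/gpa arithmetic behind the KvmPageFault
// accessors above (PAGE_SHIFT = 12, 9 index bits per paging level).
// A self-contained sketch with plain integers; the kernel code uses the
// shared PAGE_SHIFT constant instead of redefining it.
const PF_PAGE_SHIFT: u32 = 12;

fn gpa_to_gfn(gpa: u64) -> u64 {
    gpa >> PF_PAGE_SHIFT // e.g. gpa 0x1234_5678 -> gfn 0x12345, page offset 0x678
}

fn pages_per_hpage(level: u8) -> u64 {
    // PageLevel::Level4K = 1 -> 1 page, Level2M = 2 -> 512, Level1G = 3 -> 512 * 512
    1u64 << ((level - 1) as u32 * 9)
}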
//fixme:这一步va->pa的转换可能有点问题 - - // 如果启用了 NX 巨大页解决方法,则进行调整 - if page_fault.nx_huge_page_workaround_enabled { - page_fault.nx_huge_page_workaround_enabled = false; - } - - if iter.level == page_fault.goal_level { - //self.map_handle_target_level(page_fault,&mut iter); - } - - //如果在比目标更高的级别有一个映射大页的 SPTE, - //那么该 SPTE 必须被清除并替换为非叶子 SPTE。 - if is_shadow_present_pte(iter.old_spte) && !is_large_pte(iter.old_spte) { - continue; - } - - //SPTE是non-present或者指向一个需要split的大页 - } - todo!() - } - ///todo()!!! - fn map_handle_target_level(&self, page_fault: &mut KvmPageFault, iter: &mut TdpIter) { - todo!() + let mut mapper = EptPageMapper::lock(); + if mapper.is_mapped(page_fault) { + kdebug!("page fault is already mapped"); + return Ok(PFRet::Continue.into()); + }; + let page_flags = PageFlags::from_prot_flags(ProtFlags::from_bits_truncate(0x7_u64), false); + mapper.map(PhysAddr::new(page_fault.gpa() as usize), page_flags); + if mapper.is_mapped(page_fault) { + kdebug!("page fault is mapped now"); + }; + kdebug!("The ept_root_addr is {:?}", EptPageMapper::root_page_addr()); + //todo: 一些参数的更新 + Ok(PFRet::Fixed.into()) + //todo!() } fn direct_page_fault(&self, page_fault: &KvmPageFault) -> Result { todo!() } + fn kvm_faultin_pfn( &self, vm: &Vm, @@ -375,6 +367,9 @@ impl VirtCpu { } // 尝试将 GFN 转换为 PFN + let guest_cr3 = VmxAsm::vmx_vmread(guest::CR3); + let host_cr3 = VmxAsm::vmx_vmread(host::CR3); + kdebug!("guest_cr3={:x}, host_cr3={:x}", guest_cr3, host_cr3); page_fault.pfn = __gfn_to_pfn_memslot( Some(&slot), page_fault.gfn, diff --git a/kernel/src/arch/x86_64/vm/mmu/mod.rs b/kernel/src/arch/x86_64/vm/mmu/mod.rs index 9c2f87955..63581a830 100644 --- a/kernel/src/arch/x86_64/vm/mmu/mod.rs +++ b/kernel/src/arch/x86_64/vm/mmu/mod.rs @@ -1,4 +1,4 @@ pub mod mmu; pub mod mmu_internal; +pub mod pte; pub mod tdp_iter; -pub mod pte; \ No newline at end of file diff --git a/kernel/src/arch/x86_64/vm/mmu/pte.rs b/kernel/src/arch/x86_64/vm/mmu/pte.rs index 052def047..871f4aa04 100644 --- a/kernel/src/arch/x86_64/vm/mmu/pte.rs +++ b/kernel/src/arch/x86_64/vm/mmu/pte.rs @@ -14,7 +14,7 @@ bitflags::bitflags! { } pub struct Pte { - pub address: u64, // 物理地址 + pub address: u64, // 物理地址 pub flags: PteFlags, // 页表条目标志 } @@ -40,4 +40,4 @@ impl Pte { } // 其他方法... -} \ No newline at end of file +} diff --git a/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs b/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs index 3b0577c63..57c577821 100644 --- a/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs +++ b/kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs @@ -1,219 +1,219 @@ -use crate::{ - arch::vm::mmu::mmu::gfn_round_for_level, - mm::{virt_2_phys, PhysAddr, VirtAddr}, - time::sleep, - virt::kvm::host_mem::PAGE_SHIFT, -}; - -use super::{ - mmu::{PageLevel, PAGE_SIZE}, - mmu_internal::KvmMmuPage, -}; - -pub const PT64_ROOT_MAX_LEVEL: usize = 5; //通常只用到4级,但是确实有5级的情况 -pub const PT_LEVEL_BITS: u8 = 9; // 每个页表级别的位数 -pub const PT64_ENT_PER_PAGE: u32 = 1 << 9; -pub const PTE_LEN: usize = 64; - -//Bits 51:12 are from the EPT PDPTE -pub const PT64_BASE_ADDR_MASK: u64 = ((1u64 << 52) - 1) & !(PAGE_SIZE - 1); - -pub fn shadow_pt_index(addr: u64, level: u8) -> u64 { - (addr >> (PAGE_SHIFT as u8 + (level - 1) * PT_LEVEL_BITS)) & ((1 << PT_LEVEL_BITS) - 1) -} -pub fn is_last_spte(pte: u64, level: u8) -> bool { - level == PageLevel::Level4K as u8 || is_large_pte(pte) -} -pub fn is_shadow_present_pte(pte: u64) -> bool { - pte & 1 << 11 != 0 //在intel手冊中:ept PTE:11 Ignored.不是很懂 -} -pub fn is_large_pte(pte: u64) -> bool { - pte & 1 << 7 != 0 //在intel手冊中:ept PTE:7 Ignored. 
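// Worked sketch of the shadow_pt_index() arithmetic above for a 4-level
// walk: each level consumes PT_LEVEL_BITS = 9 bits above the 12-bit page
// offset. Index order here is root (level 4) first.
fn walk_indices_4level(gpa: u64) -> [usize; 4] {
    core::array::from_fn(|i| {
        let level = 4 - i as u64; // 4, 3, 2, 1
        ((gpa >> (12 + 9 * (level - 1))) & 0x1ff) as usize
    })
}
// e.g. walk_indices_4level(0x7fff_ffff_f000) == [0x0ff, 0x1ff, 0x1ff, 0x1ff]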
-} -///Bits 51:12 are from the EPT PDPTE -pub fn spte_to_pfn(pte: u64) -> u64 { - (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT -} - -#[derive(Default)] -pub struct TdpIter { - inner: TdpIterInner, -} - -impl TdpIter { - pub fn start( - &self, - root_pt: usize, - root_level: u8, - min_level: u8, - next_last_level_gfn: u64, - ) -> Self { - let mut inner = self.inner.clone(); - inner.start(root_pt, root_level, min_level, next_last_level_gfn); - TdpIter { inner } - } -} -///迭代器将遍历分页结构,直到找到此 GFN 的映射。 -#[derive(Default, Clone)] -pub struct TdpIterInner { - next_last_level_gfn: u64, - /// 线程上次让出时的 next_last_level_gfn。 - /// 仅当 next_last_level_gfn != yielded_gfn 时让出,有助于确保前进。 - pub yielded_gfn: u64, - - ///指向遍历到当前 SPTE 的页表的指针 - pt_path: [u64; PT64_ROOT_MAX_LEVEL], - - ///指向当前 SPTE 的指针 是hva吗? - sptep: PhysAddr, - - /// 当前 SPTE 映射的最低 GFN hpa>>shift? - pub gfn: u64, - - ///给迭代器的根页级别 - pub root_level: u8, - - ///迭代器应遍历到的最低级别 - pub min_level: u8, - - ///迭代器在分页结构中的当前级别 - pub level: u8, - - ///sptep 处值的快照 - pub old_spte: u64, - - ///迭代器是否具有有效状态。如果迭代器走出分页结构的末端,则为 false。 - /// - pub valid: bool, -} -impl TdpIterInner { - ///初始化ept iter - #[inline(never)] - pub fn start( - &mut self, - root_pt: usize, - root_level: u8, - min_level: u8, - next_last_level_gfn: u64, - ) { - // if root_pt.role.level() == 0 || root_pt.role.level() > PT64_ROOT_MAX_LEVEL as u32 { - // self.valid = false; - // return; - // } - - if root_level < 1 || root_level > PT64_ROOT_MAX_LEVEL as u8 { - self.valid = false; - return; - } - self.next_last_level_gfn = next_last_level_gfn; - self.root_level = root_level as u8; - self.min_level = min_level as u8; - self.pt_path[(self.root_level - 1) as usize] = root_pt as u64; - self.yielded_gfn = self.next_last_level_gfn; - self.level = self.root_level; - - self.gfn = gfn_round_for_level(self.next_last_level_gfn, self.level); - self.tdp_iter_refresh_sptep(); - self.valid = true; - } - - /* - * 重新计算当前GFN和level和SPTE指针,并重新读取SPTE。 - */ - fn tdp_iter_refresh_sptep(&mut self) { - self.sptep = PhysAddr::new( - (self.pt_path[self.level as usize - 1] - + shadow_pt_index(self.gfn << PAGE_SHIFT, self.level)) as usize, - ); - self.old_spte = read_sptep(self.sptep); - } - - pub fn _next(&mut self) { - if self.try_step_down() { - return; - } - loop { - if self.try_step_side() { - return; - } - if !self.try_step_up() { - break; - } - } - self.valid = false; - } - ///在分页结构中向目标GFN下降一级。如果迭代器能够下降一级,则返回true,否则返回false。 - fn try_step_down(&mut self) -> bool { - if self.level == self.min_level { - return false; - } - //在下降之前重新读取SPTE,以避免遍历到不再从此条目链接的页表中。 - self.old_spte = read_sptep(self.sptep); - - match spte_to_child_pt(self.old_spte, self.level) { - Some(child_pt) => { - self.level -= 1; - self.pt_path[self.level as usize - 1] = child_pt.data() as u64; - self.gfn = gfn_round_for_level(self.gfn, self.level); - self.tdp_iter_refresh_sptep(); - true - } - None => false, - } - } - fn try_step_up(&mut self) -> bool { - if self.level == self.root_level { - return false; - } - self.level += 1; - self.gfn = gfn_round_for_level(self.gfn, self.level); - self.tdp_iter_refresh_sptep(); - true - } - ///在当前页表的当前级别中,移动到下一个条目。下一个条目可以指向一个page backing guest memory , - ///或者另一个页表,或者它可能是不存在的。如果迭代器能够移动到页表中的下一个条目,则返回true, - ///如果迭代器已经在当前页表的末尾,则返回false。 - fn try_step_side(&mut self) -> bool { - //检查迭代器是否已经在当前页表的末尾。 - if shadow_pt_index(self.gfn << PAGE_SHIFT, self.level) == (PT64_ENT_PER_PAGE - 1) as u64 { - return false; - } - - self.gfn += PageLevel::kvm_pages_per_hpage(self.level); - self.next_last_level_gfn = self.gfn; - 
self.sptep.add(PTE_LEN); //指向下一个spte,一个spte占64位 - self.old_spte = read_sptep(self.sptep); - true - } -} -impl Iterator for TdpIter { - type Item = TdpIterInner; // 返回 (gfn, spte) 元组 - - fn next(&mut self) -> Option { - let inner = &mut self.inner; - if !inner.valid { - return None; - } - inner._next(); - if inner.valid { - Some(inner.clone()) - } else { - None - } - } -} -///给定一个 SPTE 及其级别,返回一个指针,该指针包含 SPTE 所引用的子页表的hva。 -///如果没有这样的条目,则返回 null。 -/// -fn spte_to_child_pt(spte: u64, level: u8) -> Option { - //没有子页表 - if !is_shadow_present_pte(spte) || is_last_spte(spte, level) { - return None; - } - Some(VirtAddr::new(virt_2_phys//__va - ((spte_to_pfn(spte)< u64 { - unsafe { *(sptep.data() as *const u64) } -} +// use crate::{ +// arch::vm::mmu::mmu::gfn_round_for_level, +// mm::{virt_2_phys, PhysAddr, VirtAddr}, +// time::sleep, +// virt::kvm::host_mem::PAGE_SHIFT, +// }; + +// use super::{ +// mmu::{PageLevel, PAGE_SIZE}, +// mmu_internal::KvmMmuPage, +// }; + +// pub const PT64_ROOT_MAX_LEVEL: usize = 5; //通常只用到4级,但是确实有5级的情况 +// pub const PT_LEVEL_BITS: u8 = 9; // 每个页表级别的位数 +// pub const PT64_ENT_PER_PAGE: u32 = 1 << 9; +// pub const PTE_LEN: usize = 64; + +// //Bits 51:12 are from the EPT PDPTE +// pub const PT64_BASE_ADDR_MASK: u64 = ((1u64 << 52) - 1) & !(PAGE_SIZE - 1); + +// pub fn shadow_pt_index(addr: u64, level: u8) -> u64 { +// (addr >> (PAGE_SHIFT as u8 + (level - 1) * PT_LEVEL_BITS)) & ((1 << PT_LEVEL_BITS) - 1) +// } +// pub fn is_last_spte(pte: u64, level: u8) -> bool { +// level == PageLevel::Level4K as u8 || is_large_pte(pte) +// } +// pub fn is_shadow_present_pte(pte: u64) -> bool { +// pte & 1 << 11 != 0 //在intel手冊中:ept PTE:11 Ignored.不是很懂 +// } +// pub fn is_large_pte(pte: u64) -> bool { +// pte & 1 << 7 != 0 //在intel手冊中:ept PTE:7 Ignored. +// } +// ///Bits 51:12 are from the EPT PDPTE +// pub fn spte_to_pfn(pte: u64) -> u64 { +// (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT +// } + +// #[derive(Default)] +// pub struct TdpIter { +// inner: TdpIterInner, +// } + +// impl TdpIter { +// pub fn start( +// &self, +// root_pt: usize, +// root_level: u8, +// min_level: u8, +// next_last_level_gfn: u64, +// ) -> Self { +// let mut inner = self.inner.clone(); +// inner.start(root_pt, root_level, min_level, next_last_level_gfn); +// TdpIter { inner } +// } +// } +// ///迭代器将遍历分页结构,直到找到此 GFN 的映射。 +// #[derive(Default, Clone)] +// pub struct TdpIterInner { +// next_last_level_gfn: u64, +// /// 线程上次让出时的 next_last_level_gfn。 +// /// 仅当 next_last_level_gfn != yielded_gfn 时让出,有助于确保前进。 +// pub yielded_gfn: u64, + +// ///指向遍历到当前 SPTE 的页表的指针 +// pt_path: [u64; PT64_ROOT_MAX_LEVEL], + +// ///指向当前 SPTE 的指针 是hva吗? +// sptep: PhysAddr, + +// /// 当前 SPTE 映射的最低 GFN hpa>>shift? 
+// pub gfn: u64, + +// ///给迭代器的根页级别 +// pub root_level: u8, + +// ///迭代器应遍历到的最低级别 +// pub min_level: u8, + +// ///迭代器在分页结构中的当前级别 +// pub level: u8, + +// ///sptep 处值的快照 +// pub old_spte: u64, + +// ///迭代器是否具有有效状态。如果迭代器走出分页结构的末端,则为 false。 +// /// +// pub valid: bool, +// } +// impl TdpIterInner { +// ///初始化ept iter +// #[inline(never)] +// pub fn start( +// &mut self, +// root_pt: usize, +// root_level: u8, +// min_level: u8, +// next_last_level_gfn: u64, +// ) { +// // if root_pt.role.level() == 0 || root_pt.role.level() > PT64_ROOT_MAX_LEVEL as u32 { +// // self.valid = false; +// // return; +// // } + +// if root_level < 1 || root_level > PT64_ROOT_MAX_LEVEL as u8 { +// self.valid = false; +// return; +// } +// self.next_last_level_gfn = next_last_level_gfn; +// self.root_level = root_level as u8; +// self.min_level = min_level as u8; +// self.pt_path[(self.root_level - 1) as usize] = root_pt as u64; +// self.yielded_gfn = self.next_last_level_gfn; +// self.level = self.root_level; + +// self.gfn = gfn_round_for_level(self.next_last_level_gfn, self.level); +// self.tdp_iter_refresh_sptep(); +// self.valid = true; +// } + +// /* +// * 重新计算当前GFN和level和SPTE指针,并重新读取SPTE。 +// */ +// fn tdp_iter_refresh_sptep(&mut self) { +// // self.sptep = PhysAddr::new( +// // (self.pt_path[self.level as usize - 1] +// // + shadow_pt_index(self.gfn << PAGE_SHIFT, self.level)) as usize, +// // ); +// // self.old_spte = read_sptep(self.sptep); +// } + +// pub fn _next(&mut self) { +// if self.try_step_down() { +// return; +// } +// loop { +// if self.try_step_side() { +// return; +// } +// if !self.try_step_up() { +// break; +// } +// } +// self.valid = false; +// } +// ///在分页结构中向目标GFN下降一级。如果迭代器能够下降一级,则返回true,否则返回false。 +// fn try_step_down(&mut self) -> bool { +// if self.level == self.min_level { +// return false; +// } +// //在下降之前重新读取SPTE,以避免遍历到不再从此条目链接的页表中。 +// self.old_spte = read_sptep(self.sptep); + +// match spte_to_child_pt(self.old_spte, self.level) { +// Some(child_pt) => { +// self.level -= 1; +// self.pt_path[self.level as usize - 1] = child_pt.data() as u64; +// self.gfn = gfn_round_for_level(self.gfn, self.level); +// self.tdp_iter_refresh_sptep(); +// true +// } +// None => false, +// } +// } +// fn try_step_up(&mut self) -> bool { +// if self.level == self.root_level { +// return false; +// } +// self.level += 1; +// self.gfn = gfn_round_for_level(self.gfn, self.level); +// self.tdp_iter_refresh_sptep(); +// true +// } +// ///在当前页表的当前级别中,移动到下一个条目。下一个条目可以指向一个page backing guest memory , +// ///或者另一个页表,或者它可能是不存在的。如果迭代器能够移动到页表中的下一个条目,则返回true, +// ///如果迭代器已经在当前页表的末尾,则返回false。 +// fn try_step_side(&mut self) -> bool { +// //检查迭代器是否已经在当前页表的末尾。 +// if shadow_pt_index(self.gfn << PAGE_SHIFT, self.level) == (PT64_ENT_PER_PAGE - 1) as u64 { +// return false; +// } + +// self.gfn += PageLevel::kvm_pages_per_hpage(self.level); +// self.next_last_level_gfn = self.gfn; +// self.sptep.add(PTE_LEN); //指向下一个spte,一个spte占64位 +// self.old_spte = read_sptep(self.sptep); +// true +// } +// } +// impl Iterator for TdpIter { +// type Item = TdpIterInner; // 返回 (gfn, spte) 元组 + +// fn next(&mut self) -> Option { +// let inner = &mut self.inner; +// if !inner.valid { +// return None; +// } +// inner._next(); +// if inner.valid { +// Some(inner.clone()) +// } else { +// None +// } +// } +// } +// ///给定一个 SPTE 及其级别,返回一个指针,该指针包含 SPTE 所引用的子页表的hva。 +// ///如果没有这样的条目,则返回 null。 +// /// +// fn spte_to_child_pt(spte: u64, level: u8) -> Option { +// //没有子页表 +// if !is_shadow_present_pte(spte) || is_last_spte(spte, level) { +// 
return None; +// } +// Some(VirtAddr::new(virt_2_phys//__va +// ((spte_to_pfn(spte)< u64 { +// unsafe { *(sptep.data() as *const u64) } +// } diff --git a/kernel/src/arch/x86_64/vm/mod.rs b/kernel/src/arch/x86_64/vm/mod.rs index 695af9cb3..87e657236 100644 --- a/kernel/src/arch/x86_64/vm/mod.rs +++ b/kernel/src/arch/x86_64/vm/mod.rs @@ -27,9 +27,9 @@ pub(super) mod exit; pub mod kvm_host; pub mod mem; pub mod mmu; +pub mod mtrr; pub mod uapi; pub mod vmx; -pub mod mtrr; static mut KVM_X86_MANAGER: Option = None; diff --git a/kernel/src/arch/x86_64/vm/mtrr.rs b/kernel/src/arch/x86_64/vm/mtrr.rs index 873689a1d..d8436421f 100644 --- a/kernel/src/arch/x86_64/vm/mtrr.rs +++ b/kernel/src/arch/x86_64/vm/mtrr.rs @@ -11,7 +11,7 @@ pub fn kvm_mtrr_check_gfn_range_consistency(vcpu: &mut VirtCpu, gfn: u64, page_n // }; let start = gfn_to_gpa(gfn); let end = gfn_to_gpa(gfn + page_num); - + // mtrr_for_each_mem_type(&mut iter, mtrr_state, start, end, |iter| { // if iter.mem_type == -1 { // iter.mem_type = iter.mem_type; @@ -34,4 +34,4 @@ pub fn kvm_mtrr_check_gfn_range_consistency(vcpu: &mut VirtCpu, gfn: u64, page_n // iter.mem_type == mtrr_default_type(mtrr_state) true -} \ No newline at end of file +} diff --git a/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs b/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs index ab5263908..cc5923a8c 100644 --- a/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs @@ -1,4 +1,26 @@ +use crate::arch::mm::{LockedFrameAllocator, PageMapper}; +use crate::arch::vm::asm::VmxAsm; +use crate::arch::vm::mmu::mmu::{max_huge_page_level, PageLevel}; +use crate::arch::vm::mmu::mmu_internal::KvmPageFault; +use crate::arch::MMArch; +use crate::libs::spinlock::{SpinLock, SpinLockGuard}; +use crate::mm::allocator::page_frame::FrameAllocator; +use crate::{kdebug, kerror, kinfo, kwarn}; +use crate::mm::page::{page_manager_lock_irqsave, Page, PageEntry, PageFlags, PageFlush, PageManager, PageTable}; +use crate::mm::{MemoryManagementArch, PageTableKind, PhysAddr, VirtAddr}; +use crate::smp::core::smp_get_processor_id; +use crate::smp::cpu::AtomicProcessorId; +use crate::smp::cpu::ProcessorId; +use core::marker::PhantomData; +use core::ops::Add; +use core::sync::atomic::{compiler_fence, AtomicUsize, Ordering}; +use hashbrown::HashMap; +use system_error::SystemError; +use x86::msr; +use x86::vmx::vmcs::control; +use crate::arch::x86_64::mm::X86_64MMArch; use crate::libs::rwlock::RwLock; +use super::vmx_info; // pub const VMX_EPT_MT_EPTE_SHIFT:u64 = 3; pub const VMX_EPT_RWX_MASK: u64 = 0x7 << 3; @@ -21,11 +43,345 @@ bitflags! 
{ const GVA_TRANSLATED = 1 << EPT_VIOLATION_GVA_TRANSLATED_BIT; } } -struct EptPageTable { - // EPT 页表数据结构 + +// /// 全局EPT物理页信息管理器 +// pub static mut EPT_PAGE_MANAGER: Option> = None; + +// /// 初始化EPT_PAGE_MANAGER +// pub fn ept_page_manager_init() { +// kinfo!("page_manager_init"); +// let page_manager = SpinLock::new(EptPageManager::new()); + +// compiler_fence(Ordering::SeqCst); +// unsafe { EPT_PAGE_MANAGER = Some(page_manager) }; +// compiler_fence(Ordering::SeqCst); + +// kinfo!("page_manager_init done"); +// } + +// pub fn ept_page_manager_lock_irqsave() -> SpinLockGuard<'static, EptPageManager> { +// unsafe { EPT_PAGE_MANAGER.as_ref().unwrap().lock_irqsave() } +// } +// EPT 页表数据结构 +#[derive(Debug)] +pub struct EptPageTable { + /// 当前页表表示的虚拟地址空间的起始地址 + base: VirtAddr, + /// 当前页表所在的物理地址 + phys: PhysAddr, + /// 当前页表的层级 + /// PageLevel::4K = 1 + level: PageLevel, +} +impl EptPageTable{ + pub fn phys(&self) -> PhysAddr { + self.phys + } + + /// 设置当前页表的第i个页表项 + pub unsafe fn set_entry(&self, i: usize, entry: PageEntry) -> Option<()> { + let entry_virt = self.entry_virt(i)?; + MMArch::write::(entry_virt, entry.data()); + return Some(()); + } + /// 判断当前页表的第i个页表项是否已经填写了值 + /// + /// ## 参数 + /// - Some(true) 如果已经填写了值 + /// - Some(false) 如果未填写值 + /// - None 如果i超出了页表项的范围 + pub fn entry_mapped(&self, i: usize) -> Option { + let etv = unsafe { self.entry_virt(i) }?; + if unsafe { MMArch::read::(etv) } != 0 { + return Some(true); + } else { + return Some(false); + } + } + + /// 获取当前页表的层级 + #[inline(always)] + pub fn level(&self) -> PageLevel { + self.level + } + + /// 获取第i个页表项所表示的虚拟内存空间的起始地址 + pub fn entry_base(&self, i: usize) -> Option { + if i < MMArch::PAGE_ENTRY_NUM { + let shift = (self.level as usize - 1) * MMArch::PAGE_ENTRY_SHIFT + MMArch::PAGE_SHIFT; + return Some(self.base.add(i << shift)); + } else { + return None; + } + } + /// 获取当前页表自身所在的虚拟地址 + #[inline(always)] + pub unsafe fn virt(&self) -> VirtAddr { + return MMArch::phys_2_virt(self.phys).unwrap(); + } + /// 获取当前页表的第i个页表项所在的虚拟地址(注意与entry_base进行区分) + pub unsafe fn entry_virt(&self, i: usize) -> Option { + if i < MMArch::PAGE_ENTRY_NUM { + return Some(self.virt().add(i * MMArch::PAGE_ENTRY_SIZE)); + } else { + return None; + } + } + /// 获取当前页表的第i个页表项 + pub unsafe fn entry(&self, i: usize) -> Option> { + let entry_virt = self.entry_virt(i)?; + return Some(PageEntry::from_usize(MMArch::read::(entry_virt))); + } + + pub fn new(base:VirtAddr,phys: PhysAddr,level:PageLevel) -> Self { + Self { + base: base, + phys, + level + } + } + /// 根据虚拟地址,获取对应的页表项在页表中的下标 + /// + /// ## 参数 + /// + /// - gpa: 虚拟地址 + /// + /// ## 返回值 + /// + /// 页表项在页表中的下标。如果addr不在当前页表所表示的虚拟地址空间中,则返回None + pub unsafe fn index_of(&self, gpa: PhysAddr) -> Option { + let addr = VirtAddr::new(gpa.data() & MMArch::PAGE_ADDRESS_MASK); + let shift = (self.level - 1) as usize * MMArch::PAGE_ENTRY_SHIFT + MMArch::PAGE_SHIFT; + + let mask = (MMArch::PAGE_ENTRY_NUM << shift) - 1; + if addr < self.base || addr >= self.base.add(mask) { + return None; + } else { + return Some((addr.data() >> shift) & MMArch::PAGE_ENTRY_MASK); + } + } + + pub fn next_level_table(&self, index: usize) -> Option { + if self.level == PageLevel::Level4K { + return None; + } + // 返回下一级页表 + return Some(EptPageTable::new( + self.entry_base(index)?, + unsafe { self.entry(index) }?.address().ok()?, + self.level - 1, + )); + } +} + +// // EPT物理页管理器 +// pub struct EptPageManager { +// phys2page: HashMap, +// } + +// impl EptPageManager { +// pub fn new() -> Self { +// Self { +// phys2page: HashMap::new(), +// } 
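// The EptPageMapper defined below re-derives its root from the EPTP it
// reads out of the VMCS. For reference, a sketch of the EPTP layout per
// the Intel SDM ("Extended-Page-Table Pointer"); the tuple names are
// illustrative only.
fn decode_eptp(eptp: u64) -> (u64, u64, bool, PhysAddr) {
    let memory_type = eptp & 0x7; // bits 2:0, 6 = write-back
    let walk_levels = ((eptp >> 3) & 0x7) + 1; // bits 5:3 hold levels - 1
    let ad_enabled = eptp & (1 << 6) != 0; // bit 6: accessed/dirty flags
    let root = PhysAddr::new((eptp & !0xfff) as usize); // bits 51:12: root table
    (memory_type, walk_levels, ad_enabled, root)
}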
+// } + +// } + +/// Check if MTRR is supported +pub fn check_ept_features() -> Result<(), SystemError> { + const MTRR_ENABLE_BIT: u64 = 1 << 11; + let ia32_mtrr_def_type = unsafe { msr::rdmsr(msr::IA32_MTRR_DEF_TYPE) }; + if (ia32_mtrr_def_type & MTRR_ENABLE_BIT) == 0 { + return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP); + } + Ok(()) +} + +/// 标志当前没有处理器持有内核映射器的锁 +/// 之所以需要这个标志,是因为AtomicUsize::new(0)会把0当作一个处理器的id +const EPT_MAPPER_NO_PROCESSOR: ProcessorId = ProcessorId::INVALID; +/// 当前持有内核映射器锁的处理器 +static EPT_MAPPER_LOCK_OWNER: AtomicProcessorId = AtomicProcessorId::new(EPT_MAPPER_NO_PROCESSOR); +/// 内核映射器的锁计数器 +static EPT_MAPPER_LOCK_COUNT: AtomicUsize = AtomicUsize::new(0); + +pub struct EptPageMapper { + /// EPT页表映射器 + //mapper: PageMapper,//PageTableKind::EPT, LockedFrameAllocator + /// 标记当前映射器是否为只读 + readonly: bool, + // EPT页表根地址 + root_page_addr: PhysAddr, + /// 页分配器 + frame_allocator: LockedFrameAllocator, } -struct EptManager { - ept: RwLock, - // 其他字段 +impl EptPageMapper{ + /// 返回最上层的ept页表 + pub fn table(&self) ->EptPageTable { + EptPageTable::new(VirtAddr::new(0), + self.root_page_addr,max_huge_page_level()) + } + pub fn root_page_addr() -> PhysAddr { + let eptp =VmxAsm::vmx_vmread(control::EPTP_FULL); + PhysAddr::new(eptp as usize) + } + + fn lock_cpu(cpuid: ProcessorId) -> Self { + loop { + match EPT_MAPPER_LOCK_OWNER.compare_exchange_weak( + EPT_MAPPER_NO_PROCESSOR, + cpuid, + Ordering::Acquire, + Ordering::Relaxed, + ) { + Ok(_) => break, + // 当前处理器已经持有了锁 + Err(id) if id == cpuid => break, + // either CAS failed, or some other hardware thread holds the lock + Err(_) => core::hint::spin_loop(), + } + } + + let prev_count = EPT_MAPPER_LOCK_COUNT.fetch_add(1, Ordering::Relaxed); + compiler_fence(Ordering::Acquire); + + // 本地核心已经持有过锁,因此标记当前加锁获得的映射器为只读 + let readonly = prev_count > 0; + let root_page_addr = Self::root_page_addr(); + kdebug!("EptPageMapper root_page_addr: {:?}", root_page_addr); + return Self { + readonly, + root_page_addr, + frame_allocator: LockedFrameAllocator, + }; + } + + /// @brief 锁定内核映射器, 并返回一个内核映射器对象 + /// 目前只有这一个办法可以获得EptPageMapper对象 + #[inline(always)] + pub fn lock() -> Self { + //fixme:得到的是cpuid还是vcpuid? 
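        // Note: this mirrors the recursive-lock scheme of the kernel mapper.
        // The first lock() on a core wins the CAS in lock_cpu() and gets a
        // writable mapper; nested calls on the same core only bump
        // EPT_MAPPER_LOCK_COUNT and receive a read-only view.
        // smp_get_processor_id() below yields the physical core id (not a
        // vcpu id), which is what the ownership CAS is keyed on.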
+ let cpuid = smp_get_processor_id(); + return Self::lock_cpu(cpuid); + } + + /// @brief: 检查有无gpa->hpa的映射 + #[no_mangle] + pub fn is_mapped(&self,page_fault:&mut KvmPageFault) -> bool { + let gpa = page_fault.gpa(); + let mut page_table = self.table(); + let mut next_page_table; + loop { + let index:usize = unsafe { + if let Some(i) = page_table.index_of(PhysAddr::new(gpa as usize)){ + i + }else{ + kerror!("ept page table index_of failed"); + return false; + } + }; + if let Some(table) = page_table.next_level_table(index) { + kdebug!("ept page table next level table: {:?}", table); + if table.level() == PageLevel::Level4K { + return true; + } + next_page_table = table; + }else{ + return false; + } + page_table = next_page_table; + + } + } + + /// 从当前EptPageMapper的页分配器中分配一个物理页(hpa),并将其映射到指定的gpa + pub fn map( + &mut self, + gpa: PhysAddr, + flags: PageFlags, + ) -> Option>{ + compiler_fence(Ordering::SeqCst); + let hpa: PhysAddr = unsafe { self.frame_allocator.allocate_one() }?; + compiler_fence(Ordering::SeqCst); + + let mut page_manager_guard: SpinLockGuard<'static, PageManager> = + page_manager_lock_irqsave(); + if !page_manager_guard.contains(&hpa) { + page_manager_guard.insert(hpa, Page::new(false)); + } + self.map_gpa(gpa, hpa, flags) + } + + + ///映射一个hpa到指定的gpa + pub fn map_gpa( + &mut self, + gpa: PhysAddr, + hpa: PhysAddr, + flags: PageFlags, + ) -> Option> { + // 验证虚拟地址和物理地址是否对齐 + if !(gpa.check_aligned(MMArch::PAGE_SIZE) && hpa.check_aligned(MMArch::PAGE_SIZE)) { + kerror!( + "Try to map unaligned page: gpa={:?}, hpa={:?}", + gpa, + hpa + ); + return None; + } + + let gpa = PhysAddr::new(gpa.data() & (!MMArch::PAGE_NEGATIVE_MASK)); + + // TODO: 验证flags是否合法 + + // 创建页表项 + let entry = PageEntry::new(hpa, flags); + let mut table = self.table(); + kdebug!("ept page table: {:?}", table); + kdebug!("Now eptp is : {:?}", VmxAsm::vmx_vmread(control::EPTP_FULL)); + loop{ + let i = unsafe { table.index_of(gpa).unwrap() }; + assert!(i < MMArch::PAGE_ENTRY_NUM); + if table.level() == PageLevel::Level4K { + //todo: 检查是否已经映射 + //fixme::按道理已经检查过了,不知道是否正确 + if table.entry_mapped(i).unwrap() { + kwarn!("Page gpa :: {:?} already mapped", gpa); + } + + compiler_fence(Ordering::SeqCst); + + unsafe { table.set_entry(i, entry) }; + compiler_fence(Ordering::SeqCst); + return Some(PageFlush::new(VirtAddr::new(gpa.data()))); + }else{ + let next_table = table.next_level_table(i); + if let Some(next_table) = next_table { + table = next_table; + } else { + // 分配下一级页表 + let frame = unsafe { self.frame_allocator.allocate_one() }?; + + // 清空这个页帧 + unsafe { MMArch::write_bytes(MMArch::phys_2_virt(frame).unwrap(), 0, MMArch::PAGE_SIZE) }; + + // fixme::设置页表项的flags,可能有点问题 + let flags: PageFlags = + unsafe { PageFlags::from_data(MMArch::ENTRY_FLAG_DEFAULT_TABLE | MMArch::ENTRY_FLAG_READWRITE) }; + + kdebug!("EptEntryFlags: {:?}", flags); + + + // 把新分配的页表映射到当前页表 + unsafe { table.set_entry(i, PageEntry::new(frame, flags)) }; + + // 获取新分配的页表 + table = table.next_level_table(i)?; + } + } + + } + } } diff --git a/kernel/src/arch/x86_64/vm/vmx/mod.rs b/kernel/src/arch/x86_64/vm/vmx/mod.rs index 7500f507b..0406e40a0 100644 --- a/kernel/src/arch/x86_64/vm/vmx/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/mod.rs @@ -482,6 +482,7 @@ impl KvmFunc for VmxKvmFunc { // VCPU_EXREG_PDPTR KvmReg::NrVcpuRegs => { if vmx_info().enable_ept { + todo!() } } @@ -494,7 +495,10 @@ impl KvmFunc for VmxKvmFunc { ); } KvmReg::VcpuExregCr3 => { - todo!() + //当拦截CR3加载时(例如用于影子分页),KVM(Kernel-based Virtual Machine)的CR3会被加载到硬件中,而不是客户机的CR3。 + 
//暂时先直接读寄存器 + vcpu.cr3 = VmxAsm::vmx_vmread(guest::CR3); + //todo!() } KvmReg::VcpuExregCr4 => { let guest_owned = vcpu.cr4_guest_owned_bits; @@ -1054,11 +1058,8 @@ impl KvmFunc for VmxKvmFunc { vcpu.arch.clear_dirty(); - let cr3: (PhysFrame, Cr3Flags) = Cr3::read(); let cr3: (PhysFrame, Cr3Flags) = Cr3::read(); if unlikely(cr3 != vcpu.vmx().loaded_vmcs().host_state.cr3) { - let cr3_combined: u64 = - (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); let cr3_combined: u64 = (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); VmxAsm::vmx_vmwrite(host::CR3, cr3_combined); @@ -1238,13 +1239,15 @@ impl KvmFunc for VmxKvmFunc { todo!() } else if vcpu.arch.is_register_dirty(KvmReg::VcpuExregCr3) { guest_cr3 = vcpu.arch.cr3; + kdebug!("load_mmu_pgd: guest_cr3 = {:#x}", guest_cr3); + } else { return; } } else { todo!(); } - + vcpu.load_pdptrs(); VmxAsm::vmx_vmwrite(guest::CR3, guest_cr3); } } @@ -2351,8 +2354,6 @@ impl Vmx { VmxAsm::vmx_vmwrite(host::CR0, unsafe { cr0() }.bits() as u64); let cr3: (PhysFrame, Cr3Flags) = Cr3::read(); - let cr3_combined: u64 = - (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); let cr3_combined: u64 = (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF); VmxAsm::vmx_vmwrite(host::CR3, cr3_combined); @@ -3749,3 +3750,6 @@ unsafe extern "C" fn vmx_spec_ctrl_restore_host(_vcpu_vmx: &VmxVCpuPriv, _flags: // TODO kwarn!("vmx_spec_ctrl_restore_host todo!"); } +const fn vmcs_control_bit(x: u32) -> u32 { + 1 << (x & 0x1f) +} \ No newline at end of file diff --git a/kernel/src/mm/mod.rs b/kernel/src/mm/mod.rs index 9bed5c63d..1afb79d64 100644 --- a/kernel/src/mm/mod.rs +++ b/kernel/src/mm/mod.rs @@ -124,7 +124,7 @@ pub enum PageTableKind { } /// 物理内存地址 -#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash,Default)] +#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Default)] #[repr(transparent)] pub struct PhysAddr(usize); diff --git a/kernel/src/virt/vm/kvm_host/mem.rs b/kernel/src/virt/vm/kvm_host/mem.rs index d0eb826b8..3f552301c 100644 --- a/kernel/src/virt/vm/kvm_host/mem.rs +++ b/kernel/src/virt/vm/kvm_host/mem.rs @@ -699,6 +699,7 @@ pub fn hva_to_pfn( if let Some((hpa, _)) = mapper.translate(hva) { return Ok(hpa.data() as u64 >> PAGE_SHIFT); } + kdebug!("hva_to_pfn NOT FOUND,try map a new pfn"); unsafe { mapper.map(hva, PageFlags::mmio_flags()); } From a2dd6bc9bed96396911280a7ad46496522b18b74 Mon Sep 17 00:00:00 2001 From: GnoCiYeH Date: Sun, 29 Sep 2024 14:07:15 +0800 Subject: [PATCH 10/10] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=B8=80=E4=BA=9Bio?= =?UTF-8?q?=E8=99=9A=E6=8B=9F=E5=8C=96=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernel/src/arch/x86_64/vm/kvm_host/io.rs | 129 +++++++++++++ kernel/src/arch/x86_64/vm/kvm_host/lapic.rs | 62 ------ .../arch/x86_64/vm/kvm_host/lapic/apicdef.rs | 18 ++ .../src/arch/x86_64/vm/kvm_host/lapic/mod.rs | 121 ++++++++++++ kernel/src/arch/x86_64/vm/kvm_host/mod.rs | 2 + kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs | 22 ++- kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs | 22 ++- kernel/src/arch/x86_64/vm/uapi.rs | 2 + kernel/src/arch/x86_64/vm/vmx/ept/mod.rs | 158 ++++++++------- kernel/src/arch/x86_64/vm/vmx/exit.rs | 23 ++- kernel/src/virt/vm/kvm_host/io.rs | 182 ++++++++++++++++++ kernel/src/virt/vm/kvm_host/mod.rs | 15 ++ kernel/src/virt/vm/user_api.rs | 3 + 13 files changed, 598 insertions(+), 161 deletions(-) create mode 100644 
kernel/src/arch/x86_64/vm/kvm_host/io.rs delete mode 100644 kernel/src/arch/x86_64/vm/kvm_host/lapic.rs create mode 100644 kernel/src/arch/x86_64/vm/kvm_host/lapic/apicdef.rs create mode 100644 kernel/src/arch/x86_64/vm/kvm_host/lapic/mod.rs create mode 100644 kernel/src/virt/vm/kvm_host/io.rs diff --git a/kernel/src/arch/x86_64/vm/kvm_host/io.rs b/kernel/src/arch/x86_64/vm/kvm_host/io.rs new file mode 100644 index 000000000..77adfa6a9 --- /dev/null +++ b/kernel/src/arch/x86_64/vm/kvm_host/io.rs @@ -0,0 +1,129 @@ +use system_error::SystemError; + +use crate::{ + arch::{ + vm::{ + kvm_host::{vcpu::X86VcpuArch, KvmReg}, + uapi::{kvm_exit::KVM_EXIT_IO, KVM_PIO_PAGE_OFFSET}, + }, + MMArch, + }, + kwarn, + mm::MemoryManagementArch, + virt::vm::user_api::{UapiKvmRun, KVM_EXIT_IO_IN, KVM_EXIT_IO_OUT}, +}; + +#[derive(Debug)] +pub struct KvmPioRequest { + pub linear_rip: usize, + pub count: usize, + pub is_in: bool, + pub port: u16, + pub size: u32, +} + +impl X86VcpuArch { + pub fn kvm_fast_pio( + &mut self, + run: &mut UapiKvmRun, + size: u32, + port: u16, + is_in: bool, + ) -> Result { + let ret = if is_in { + self.kvm_fast_pio_in(size, port); + } else { + self.kvm_fast_pio_out(run, size, port); + }; + + todo!(); + } + + fn kvm_fast_pio_in(&self, size: u32, port: u16) { + let val = if size < 4 { + self.read_reg(KvmReg::VcpuRegsRax) + } else { + 0 + }; + todo!() + } + + fn kvm_fast_pio_out(&mut self, run: &mut UapiKvmRun, size: u32, port: u16) { + let val = self.read_reg(KvmReg::VcpuRegsRax) as usize; + + let data = unsafe { + core::slice::from_raw_parts_mut( + &mut (val as u8) as *mut u8, + core::mem::size_of_val(&val), + ) + }; + if self.emulator_pio_in_out(run, size, port, data, 1, false) { + return; + } + + todo!() + } + + // 返回值 -》 true: 用户态io, false: apic等io + fn emulator_pio_in_out( + &mut self, + run: &mut UapiKvmRun, + size: u32, + port: u16, + data: &mut [u8], + count: u32, + is_in: bool, + ) -> bool { + if self.pio.count != 0 { + kwarn!("emulator_pio_in_out: self.pio.count != 0, check!"); + } + + for i in 0..count { + let r: bool = if is_in { + // 暂时 + false + } else { + // 暂时 + false + }; + + if !r { + if i == 0 { + // 第一个就失败,说明不是内部端口,采用用户空间io处理 + self.pio.port = port; + self.pio.is_in = is_in; + self.pio.count = count as usize; + self.pio.size = size; + + if is_in { + self.pio_data[0..(size * count) as usize].fill(0); + } else { + self.pio_data[0..(size * count) as usize] + .copy_from_slice(&data[0..(size * count) as usize]); + } + run.exit_reason = KVM_EXIT_IO; + unsafe { + let io = &mut run.__bindgen_anon_1.io; + io.direction = if is_in { + KVM_EXIT_IO_IN + } else { + KVM_EXIT_IO_OUT + }; + io.size = size as u8; + io.data_offset = KVM_PIO_PAGE_OFFSET * MMArch::PAGE_SIZE as u64; + io.count = count; + io.port = port; + } + return true; + } + + if is_in { + self.pio_data[0..(size * (count - i)) as usize].fill(0); + } + break; + } + } + + return false; + } +} diff --git a/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs b/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs deleted file mode 100644 index 90a4bbda9..000000000 --- a/kernel/src/arch/x86_64/vm/kvm_host/lapic.rs +++ /dev/null @@ -1,62 +0,0 @@ -use alloc::boxed::Box; - -use crate::{ - arch::kvm_arch_ops, - virt::vm::kvm_host::{vcpu::VirtCpu, Vm}, -}; - -const APIC_DEFAULT_PHYS_BASE: u64 = 0xfee00000; -#[allow(dead_code)] -const MSR_IA32_APICBASE: u64 = 0x0000001b; -const MSR_IA32_APICBASE_BSP: u64 = 1 << 8; -const MSR_IA32_APICBASE_ENABLE: u64 = 1 << 11; -#[allow(dead_code)] -const MSR_IA32_APICBASE_BASE: u64 = 0xfffff << 12; - 
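// Hedged aside on kvm_fast_pio_out() above: slicing from
// `&mut (val as u8) as *mut u8` borrows a one-byte temporary while
// claiming size_of_val(&val) bytes. A sketch of a safer construction
// that snapshots RAX into a little-endian byte image first; callers
// would pass &bytes[..size as usize] to emulator_pio_in_out().
fn pio_out_bytes(rax: u64) -> [u8; 8] {
    rax.to_le_bytes()
}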
-#[derive(Debug)] -pub struct KvmLapic { - pub apicv_active: bool, - pub regs: Box<[u8]>, -} - -impl VirtCpu { - pub fn lapic_reset(&mut self, vm: &Vm, init_event: bool) { - kvm_arch_ops().apicv_pre_state_restore(self); - - if !init_event { - let mut msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; - if vm.arch.bsp_vcpu_id == self.vcpu_id { - msr_val |= MSR_IA32_APICBASE_BSP; - } - - self.lapic_set_base(msr_val); - } - - if self.arch.apic.is_none() { - return; - } - - todo!() - } - - fn lapic_set_base(&mut self, value: u64) { - let old_val = self.arch.apic_base; - let apic = self.arch.apic.as_ref(); - - self.arch.apic_base = value; - - if (old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 { - // TODO: kvm_update_cpuid_runtime(vcpu); - } - - if apic.is_none() { - return; - } - - if (old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 { - // if value & MSR_IA32_APICBASE_ENABLE != 0 {} - } - - todo!() - } -} diff --git a/kernel/src/arch/x86_64/vm/kvm_host/lapic/apicdef.rs b/kernel/src/arch/x86_64/vm/kvm_host/lapic/apicdef.rs new file mode 100644 index 000000000..2444ced2a --- /dev/null +++ b/kernel/src/arch/x86_64/vm/kvm_host/lapic/apicdef.rs @@ -0,0 +1,18 @@ +pub const APIC_DEFAULT_PHYS_BASE: u64 = 0xfee00000; +#[allow(dead_code)] +pub const MSR_IA32_APICBASE: u64 = 0x0000001b; +pub const MSR_IA32_APICBASE_BSP: u64 = 1 << 8; +pub const MSR_IA32_APICBASE_ENABLE: u64 = 1 << 11; +#[allow(dead_code)] +pub const MSR_IA32_APICBASE_BASE: u64 = 0xfffff << 12; + +pub const APIC_BASE_MSR: u32 = 0x800; +pub const APIC_ID: u32 = 0x20; +pub const APIC_LVR: u32 = 0x30; +pub const APIC_TASKPRI: u32 = 0x80; +pub const APIC_PROCPRI: u32 = 0xA0; +pub const APIC_EOI: u32 = 0xB0; +pub const APIC_SPIV: u32 = 0xF0; +pub const APIC_IRR: u32 = 0x200; +pub const APIC_ICR: u32 = 0x300; +pub const APIC_LVTCMCI: u32 = 0x2f0; diff --git a/kernel/src/arch/x86_64/vm/kvm_host/lapic/mod.rs b/kernel/src/arch/x86_64/vm/kvm_host/lapic/mod.rs new file mode 100644 index 000000000..6c50c39fb --- /dev/null +++ b/kernel/src/arch/x86_64/vm/kvm_host/lapic/mod.rs @@ -0,0 +1,121 @@ +use alloc::boxed::Box; +use system_error::SystemError; + +use crate::{ + arch::kvm_arch_ops, + kwarn, + virt::vm::kvm_host::{io::KvmIoDeviceOps, vcpu::VirtCpu, Vm}, +}; + +use apicdef::*; + +mod apicdef; + +#[derive(Debug)] +pub struct KvmLapic { + pub base_address: usize, + pub dev: Box, + pub apicv_active: bool, + pub regs: Box<[u8]>, +} + +impl KvmLapic { + const LAPIC_MMIO_LENGTH: usize = 1 << 12; + + pub fn apic_mmio_in_range(&self, addr: usize) -> bool { + return addr >= self.base_address && addr < self.base_address + Self::LAPIC_MMIO_LENGTH; + } + + pub fn kvm_lapic_reg_write(&self, reg: u32, val: u32) -> bool { + let mut ret; + match reg { + _ => { + kwarn!("kvm_lapic_reg_write: reg: {reg} not found"); + ret = false; + } + } + return ret; + } +} + +impl VirtCpu { + pub fn lapic_reset(&mut self, vm: &Vm, init_event: bool) { + kvm_arch_ops().apicv_pre_state_restore(self); + + if !init_event { + let mut msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; + if vm.arch.bsp_vcpu_id == self.vcpu_id { + msr_val |= MSR_IA32_APICBASE_BSP; + } + + self.lapic_set_base(msr_val); + } + + if self.arch.apic.is_none() { + return; + } + + todo!() + } + + fn lapic_set_base(&mut self, value: u64) { + let old_val = self.arch.apic_base; + let apic = self.arch.apic.as_ref(); + + self.arch.apic_base = value; + + if (old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 { + // TODO: kvm_update_cpuid_runtime(vcpu); + } + + if apic.is_none() { + return; + } + + if 
(old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 { + // if value & MSR_IA32_APICBASE_ENABLE != 0 {} + } + + todo!() + } +} + +#[derive(Debug)] +pub struct KvmApicMMioDev {} + +impl KvmIoDeviceOps for KvmApicMMioDev { + fn read( + &self, + vcpu: &VirtCpu, + addr: usize, + len: u32, + val: &mut usize, + ) -> Result<(), SystemError> { + return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP); + } + + fn write( + &self, + vcpu: &VirtCpu, + addr: usize, + len: u32, + data: &usize, + ) -> Result<(), SystemError> { + let apic = vcpu.arch.apic.as_ref().unwrap(); + + if !apic.apic_mmio_in_range(addr) { + return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP); + } + + let offset = addr - apic.base_address; + + if len != 4 || (offset & 0xf != 0) { + return Ok(()); + } + + let val = unsafe { *((*data) as *const u32) }; + + apic.kvm_lapic_reg_write((offset & 0xff0) as u32, val); + return Ok(()); + } +} diff --git a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs index d11f52e89..4bfc122c0 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/mod.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/mod.rs @@ -29,6 +29,7 @@ use super::{ x86_kvm_manager, x86_kvm_ops, }; +pub mod io; pub mod lapic; pub mod vcpu; #[allow(dead_code)] @@ -442,6 +443,7 @@ pub struct KvmVcpuStat { pub guest_mode: u64, pub notify_window_exits: u64, } + #[inline] /// 将 GFN 转换为 GPA pub fn gfn_to_gpa(gfn: u64) -> u64 { diff --git a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs index 47b949a1c..cd215f890 100644 --- a/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs +++ b/kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs @@ -17,7 +17,8 @@ use x86_64::registers::control::EferFlags; use crate::arch::vm::asm::VmxAsm; use crate::arch::vm::vmx::exit::ExitFastpathCompletion; -use crate::{kdebug, kwarn}; +use crate::arch::MMArch; +use crate::mm::MemoryManagementArch; use crate::virt::vm::kvm_host::mem::KvmMmuMemoryCache; use crate::virt::vm::kvm_host::vcpu::VcpuMode; use crate::{ @@ -45,7 +46,9 @@ use crate::{ user_api::{UapiKvmRun, UapiKvmSegment}, }, }; +use crate::{kdebug, kwarn}; +use super::io::KvmPioRequest; use super::{lapic::KvmLapic, HFlags, KvmCommonRegs, KvmIrqChipMode}; #[derive(Debug)] @@ -60,7 +63,7 @@ pub struct X86VcpuArch { mp_state: MutilProcessorState, pub apic_base: u64, /// apic - pub apic: Option, + pub apic: Option>, /// 主机pkru寄存器 host_pkru: u32, pkru: u32, @@ -148,6 +151,10 @@ pub struct X86VcpuArch { /* set at EPT violation at this point */ pub exit_qual: u64, + + pub pio: KvmPioRequest, + + pub pio_data: Box<[u8; MMArch::PAGE_SIZE]>, } impl X86VcpuArch { @@ -160,7 +167,6 @@ impl X86VcpuArch { ret.regs_avail = AllocBitmap::new(32); ret.regs_dirty = AllocBitmap::new(32); ret.mp_state = MutilProcessorState::Runnable; - ret.apic = None; //max_phyaddr=?? 
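// Sketch of the acceptance rule KvmApicMMioDev::write() above enforces:
// the access must land in the 4 KiB APIC window, be 4 bytes wide and
// 16-byte aligned, and the register number is the offset rounded down to
// a 16-byte boundary (offset & 0xff0). Illustrative helper only.
fn apic_mmio_reg(base_address: usize, addr: usize, len: u32) -> Option<u32> {
    if addr < base_address || addr >= base_address + (1 << 12) {
        return None; // outside the LAPIC_MMIO_LENGTH window
    }
    let offset = addr - base_address;
    if len != 4 || offset & 0xf != 0 {
        return None; // the handler silently ignores these
    }
    Some((offset & 0xff0) as u32)
}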
fztodo *ret @@ -258,7 +264,6 @@ impl X86VcpuArch { #[inline] pub fn is_pae_paging(&mut self) -> bool { - let flag1 = self.is_long_mode(); let flag2 = self.is_pae(); let flag3 = self.is_paging(); @@ -269,7 +274,6 @@ impl X86VcpuArch { #[inline] pub fn is_pae(&mut self) -> bool { !self.read_cr4_bits(Cr4::CR4_ENABLE_PAE).is_empty() - } #[inline] pub fn is_paging(&mut self) -> bool { @@ -1489,19 +1493,19 @@ impl VirtCpu { } } - pub fn load_pdptrs(&mut self){ + pub fn load_pdptrs(&mut self) { //let mmu = self.arch.mmu(); - if !self.arch.is_register_dirty(KvmReg::VcpuExregCr3){ + if !self.arch.is_register_dirty(KvmReg::VcpuExregCr3) { return; } - if self.arch.is_pae_paging(){ + if self.arch.is_pae_paging() { let mmu = self.arch.mmu(); VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[0]); VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[1]); VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[2]); VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[3]); - }else{ + } else { kdebug!("load_pdptrs: not pae paging"); } } diff --git a/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs b/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs index 3dc5055c1..f26eb994a 100644 --- a/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs +++ b/kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs @@ -9,15 +9,16 @@ use x86::vmx::vmcs::{guest, host}; use system_error::SystemError; use crate::{ - arch::{vm::{ - asm::VmxAsm, - kvm_host::{EmulType, KVM_PFN_NOSLOT}, - mmu::{ - mmu::{PFRet, PageLevel}, + arch::{ + vm::{ + asm::VmxAsm, + kvm_host::{EmulType, KVM_PFN_NOSLOT}, + mmu::mmu::{PFRet, PageLevel}, + mtrr::kvm_mtrr_check_gfn_range_consistency, + vmx::{ept::EptPageMapper, PageFaultErr}, }, - mtrr::kvm_mtrr_check_gfn_range_consistency, - vmx::{ept::EptPageMapper, PageFaultErr}, - }, MMArch}, + MMArch, + }, kdebug, kwarn, libs::spinlock::SpinLockGuard, mm::{page::PageFlags, syscall::ProtFlags, virt_2_phys, PhysAddr}, @@ -312,7 +313,10 @@ impl VirtCpu { return Ok(PFRet::Continue.into()); }; let page_flags = PageFlags::from_prot_flags(ProtFlags::from_bits_truncate(0x7_u64), false); - mapper.map(PhysAddr::new(page_fault.gpa() as usize), page_flags); + mapper + .map(PhysAddr::new(page_fault.gpa() as usize), page_flags) + .unwrap() + .flush(); if mapper.is_mapped(page_fault) { kdebug!("page fault is mapped now"); }; diff --git a/kernel/src/arch/x86_64/vm/uapi.rs b/kernel/src/arch/x86_64/vm/uapi.rs index c7a8ccc24..5b0e1d9b5 100644 --- a/kernel/src/arch/x86_64/vm/uapi.rs +++ b/kernel/src/arch/x86_64/vm/uapi.rs @@ -2,6 +2,8 @@ use crate::virt::vm::user_api::UapiKvmSegment; +pub const KVM_PIO_PAGE_OFFSET: u64 = 1; + pub const DE_VECTOR: usize = 0; pub const DB_VECTOR: usize = 1; pub const BP_VECTOR: usize = 3; diff --git a/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs b/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs index cc5923a8c..b00fce18f 100644 --- a/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs +++ b/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs @@ -1,16 +1,21 @@ +use super::vmx_info; use crate::arch::mm::{LockedFrameAllocator, PageMapper}; use crate::arch::vm::asm::VmxAsm; use crate::arch::vm::mmu::mmu::{max_huge_page_level, PageLevel}; use crate::arch::vm::mmu::mmu_internal::KvmPageFault; +use crate::arch::x86_64::mm::X86_64MMArch; use crate::arch::MMArch; +use crate::libs::rwlock::RwLock; use crate::libs::spinlock::{SpinLock, SpinLockGuard}; use crate::mm::allocator::page_frame::FrameAllocator; -use crate::{kdebug, kerror, kinfo, kwarn}; -use crate::mm::page::{page_manager_lock_irqsave, Page, PageEntry, PageFlags, PageFlush, PageManager, PageTable}; +use 
diff --git a/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs b/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs
index cc5923a8c..b00fce18f 100644
--- a/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs
+++ b/kernel/src/arch/x86_64/vm/vmx/ept/mod.rs
@@ -1,16 +1,21 @@
+use super::vmx_info;
 use crate::arch::mm::{LockedFrameAllocator, PageMapper};
 use crate::arch::vm::asm::VmxAsm;
 use crate::arch::vm::mmu::mmu::{max_huge_page_level, PageLevel};
 use crate::arch::vm::mmu::mmu_internal::KvmPageFault;
+use crate::arch::x86_64::mm::X86_64MMArch;
 use crate::arch::MMArch;
+use crate::libs::rwlock::RwLock;
 use crate::libs::spinlock::{SpinLock, SpinLockGuard};
 use crate::mm::allocator::page_frame::FrameAllocator;
-use crate::{kdebug, kerror, kinfo, kwarn};
-use crate::mm::page::{page_manager_lock_irqsave, Page, PageEntry, PageFlags, PageFlush, PageManager, PageTable};
+use crate::mm::page::{
+    page_manager_lock_irqsave, Page, PageEntry, PageFlags, PageFlush, PageManager, PageTable,
+};
 use crate::mm::{MemoryManagementArch, PageTableKind, PhysAddr, VirtAddr};
 use crate::smp::core::smp_get_processor_id;
 use crate::smp::cpu::AtomicProcessorId;
 use crate::smp::cpu::ProcessorId;
+use crate::{kdebug, kerror, kinfo, kwarn};
 use core::marker::PhantomData;
 use core::ops::Add;
 use core::sync::atomic::{compiler_fence, AtomicUsize, Ordering};
@@ -18,9 +23,6 @@ use hashbrown::HashMap;
 use system_error::SystemError;
 use x86::msr;
 use x86::vmx::vmcs::control;
-use crate::arch::x86_64::mm::X86_64MMArch;
-use crate::libs::rwlock::RwLock;
-use super::vmx_info;
 
 // pub const VMX_EPT_MT_EPTE_SHIFT:u64 = 3;
 pub const VMX_EPT_RWX_MASK: u64 = 0x7 << 3;
@@ -73,11 +75,11 @@ pub struct EptPageTable {
     /// PageLevel::4K = 1
     level: PageLevel,
 }
-impl EptPageTable{
+impl EptPageTable {
     pub fn phys(&self) -> PhysAddr {
         self.phys
     }
-    
+
     /// Set the i-th entry of this page table
     pub unsafe fn set_entry(&self, i: usize, entry: PageEntry<MMArch>) -> Option<()> {
         let entry_virt = self.entry_virt(i)?;
@@ -99,14 +101,14 @@ impl EptPageTable {
         }
     }
 
-      /// Get the level of this page table
-      #[inline(always)]
-      pub fn level(&self) -> PageLevel {
-          self.level
-      }
+    /// Get the level of this page table
+    #[inline(always)]
+    pub fn level(&self) -> PageLevel {
+        self.level
+    }
 
-      /// Get the base address of the virtual memory region covered by the i-th entry
-      pub fn entry_base(&self, i: usize) -> Option<VirtAddr> {
+    /// Get the base address of the virtual memory region covered by the i-th entry
+    pub fn entry_base(&self, i: usize) -> Option<VirtAddr> {
         if i < MMArch::PAGE_ENTRY_NUM {
             let shift = (self.level as usize - 1) * MMArch::PAGE_ENTRY_SHIFT + MMArch::PAGE_SHIFT;
             return Some(self.base.add(i << shift));
@@ -114,33 +116,33 @@ impl EptPageTable {
             return None;
         }
     }
-      /// Get the virtual address of this page table itself
-      #[inline(always)]
-      pub unsafe fn virt(&self) -> VirtAddr {
-          return MMArch::phys_2_virt(self.phys).unwrap();
-      }
+    /// Get the virtual address of this page table itself
+    #[inline(always)]
+    pub unsafe fn virt(&self) -> VirtAddr {
+        return MMArch::phys_2_virt(self.phys).unwrap();
+    }
 
     /// Get the virtual address of the i-th entry of this page table
     /// (not to be confused with entry_base)
     pub unsafe fn entry_virt(&self, i: usize) -> Option<VirtAddr> {
-          if i < MMArch::PAGE_ENTRY_NUM {
+        if i < MMArch::PAGE_ENTRY_NUM {
             return Some(self.virt().add(i * MMArch::PAGE_ENTRY_SIZE));
         } else {
-              return None;
-          }
+            return None;
+        }
     }
 
-      /// Get the i-th entry of this page table
-      pub unsafe fn entry(&self, i: usize) -> Option<PageEntry<MMArch>> {
+    /// Get the i-th entry of this page table
+    pub unsafe fn entry(&self, i: usize) -> Option<PageEntry<MMArch>> {
         let entry_virt = self.entry_virt(i)?;
         return Some(PageEntry::from_usize(MMArch::read::<usize>(entry_virt)));
     }
-    
-    pub fn new(base:VirtAddr,phys: PhysAddr,level:PageLevel) -> Self {
+
+    pub fn new(base: VirtAddr, phys: PhysAddr, level: PageLevel) -> Self {
         Self {
             base: base,
             phys,
-            level
+            level,
         }
     }
-      /// Given a virtual address, get the index of the corresponding entry in this table
+    /// Given a virtual address, get the index of the corresponding entry in this table
     ///
     /// ## Parameters
     ///
@@ -151,7 +153,7 @@ impl EptPageTable {
     /// Index of the entry. Returns None if addr lies outside the range covered by this table
     pub unsafe fn index_of(&self, gpa: PhysAddr) -> Option<usize> {
         let addr = VirtAddr::new(gpa.data() & MMArch::PAGE_ADDRESS_MASK);
-          let shift = (self.level - 1) as usize * MMArch::PAGE_ENTRY_SHIFT + MMArch::PAGE_SHIFT;
+        let shift = (self.level - 1) as usize * MMArch::PAGE_ENTRY_SHIFT + MMArch::PAGE_SHIFT;
         let mask = (MMArch::PAGE_ENTRY_NUM << shift) - 1;
 
         if addr < self.base || addr >= self.base.add(mask) {
@@ -185,7 +187,7 @@ impl EptPageTable {
     //         phys2page: HashMap::new(),
     //     }
     // }
-    
+
 // }
 
 /// Check if MTRR is supported
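The shift arithmetic in entry_base() and index_of() determines which slice of the guest-physical address selects an entry at each level: with 4 KiB pages and 512-entry tables, a table at level L (Level4K = 1) indexes bits [(L-1)*9+12 .. (L-1)*9+20] of the address. A standalone illustration under those x86-64 assumptions:

const PAGE_SHIFT: usize = 12; // 4 KiB pages
const PAGE_ENTRY_SHIFT: usize = 9; // 512 entries per table
const PAGE_ENTRY_NUM: usize = 1 << PAGE_ENTRY_SHIFT;

/// Index of `gpa` within a page table at `level` (1 = 4K leaf, 4 = root).
fn index_at_level(gpa: u64, level: usize) -> usize {
    let shift = (level - 1) * PAGE_ENTRY_SHIFT + PAGE_SHIFT;
    ((gpa >> shift) as usize) & (PAGE_ENTRY_NUM - 1)
}

#[test]
fn four_level_walk_indices() {
    let gpa = 0x4030_2010u64;
    assert_eq!(index_at_level(gpa, 4), 0); // bits 47..=39
    assert_eq!(index_at_level(gpa, 3), 1); // bits 38..=30
    assert_eq!(index_at_level(gpa, 2), 1); // bits 29..=21
    assert_eq!(index_at_level(gpa, 1), 0x102); // bits 20..=12
}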
@@ -217,14 +219,13 @@ pub struct EptPageMapper {
     frame_allocator: LockedFrameAllocator,
 }
-impl EptPageMapper{
+impl EptPageMapper {
     /// Return the top-level EPT page table
-    pub fn table(&self) ->EptPageTable {
-        EptPageTable::new(VirtAddr::new(0),
-            self.root_page_addr,max_huge_page_level())
-    }
+    pub fn table(&self) -> EptPageTable {
+        EptPageTable::new(VirtAddr::new(0), self.root_page_addr, max_huge_page_level())
+    }
 
     pub fn root_page_addr() -> PhysAddr {
-        let eptp =VmxAsm::vmx_vmread(control::EPTP_FULL);
+        let eptp = VmxAsm::vmx_vmread(control::EPTP_FULL);
         PhysAddr::new(eptp as usize)
     }
 
@@ -251,7 +252,7 @@ impl EptPageMapper {
         let readonly = prev_count > 0;
         let root_page_addr = Self::root_page_addr();
         kdebug!("EptPageMapper root_page_addr: {:?}", root_page_addr);
-        return Self{
+        return Self {
             readonly,
             root_page_addr,
             frame_allocator: LockedFrameAllocator,
@@ -269,39 +270,34 @@ impl EptPageMapper {
 
     /// @brief: Check whether a gpa->hpa mapping exists
     #[no_mangle]
-    pub fn is_mapped(&self,page_fault:&mut KvmPageFault) -> bool {
+    pub fn is_mapped(&self, page_fault: &mut KvmPageFault) -> bool {
         let gpa = page_fault.gpa();
         let mut page_table = self.table();
         let mut next_page_table;
         loop {
-            let index:usize = unsafe {
-                if let Some(i) = page_table.index_of(PhysAddr::new(gpa as usize)){
-                    i
-                }else{
+            let index: usize = unsafe {
+                if let Some(i) = page_table.index_of(PhysAddr::new(gpa as usize)) {
+                    i
+                } else {
                     kerror!("ept page table index_of failed");
                     return false;
                 }
             };
-            if let Some(table) = page_table.next_level_table(index){
+            if let Some(table) = page_table.next_level_table(index) {
                 kdebug!("ept page table next level table: {:?}", table);
                 if table.level() == PageLevel::Level4K {
                     return true;
                 }
                 next_page_table = table;
-            }else{
+            } else {
                 return false;
             }
             page_table = next_page_table;
-
         }
     }
 
     /// Allocate a physical page (hpa) from this mapper's frame allocator and map it at the given gpa
-    pub fn map(
-        &mut self,
-        gpa: PhysAddr,
-        flags: PageFlags<MMArch>,
-    ) -> Option<PageFlush<MMArch>>{
+    pub fn map(&mut self, gpa: PhysAddr, flags: PageFlags<MMArch>) -> Option<PageFlush<MMArch>> {
         compiler_fence(Ordering::SeqCst);
         let hpa: PhysAddr = unsafe { self.frame_allocator.allocate_one() }?;
         compiler_fence(Ordering::SeqCst);
@@ -314,7 +310,6 @@ impl EptPageMapper {
         self.map_gpa(gpa, hpa, flags)
     }
 
-
     /// Map an hpa at the given gpa
     pub fn map_gpa(
         &mut self,
@@ -322,13 +317,9 @@ impl EptPageMapper {
         hpa: PhysAddr,
         flags: PageFlags<MMArch>,
     ) -> Option<PageFlush<MMArch>> {
-        // Verify that both addresses are page-aligned
-        if !(gpa.check_aligned(MMArch::PAGE_SIZE) && hpa.check_aligned(MMArch::PAGE_SIZE)) {
-            kerror!(
-                "Try to map unaligned page: gpa={:?}, hpa={:?}",
-                gpa,
-                hpa
-            );
+        // Verify that both addresses are page-aligned
+        if !(gpa.check_aligned(MMArch::PAGE_SIZE) && hpa.check_aligned(MMArch::PAGE_SIZE)) {
+            kerror!("Try to map unaligned page: gpa={:?}, hpa={:?}", gpa, hpa);
             return None;
         }
 
@@ -336,12 +327,12 @@ impl EptPageMapper {
 
         // TODO: validate the flags
 
-        // Create the page table entry
+        // Create the page table entry
         let entry = PageEntry::new(hpa, flags);
         let mut table = self.table();
         kdebug!("ept page table: {:?}", table);
         kdebug!("Now eptp is : {:?}", VmxAsm::vmx_vmread(control::EPTP_FULL));
-        loop{
+        loop {
             let i = unsafe { table.index_of(gpa).unwrap() };
             assert!(i < MMArch::PAGE_ENTRY_NUM);
             if table.level() == PageLevel::Level4K {
@@ -356,32 +347,39 @@ impl EptPageMapper {
                 unsafe { table.set_entry(i, entry) };
                 compiler_fence(Ordering::SeqCst);
                 return Some(PageFlush::new(VirtAddr::new(gpa.data())));
-            }else{
+            } else {
                 let next_table = table.next_level_table(i);
                 if let Some(next_table) = next_table {
                     table = next_table;
                 } else {
-                    // Allocate the next-level page table
-                    let frame = unsafe { self.frame_allocator.allocate_one() }?;
-
-                    // Zero the page frame
-                    unsafe { MMArch::write_bytes(MMArch::phys_2_virt(frame).unwrap(), 0, MMArch::PAGE_SIZE) };
-
-                    // FIXME: the flags chosen for the table entry may be wrong
-                    let flags: PageFlags<MMArch> =
-                        unsafe { PageFlags::from_data(MMArch::ENTRY_FLAG_DEFAULT_TABLE | MMArch::ENTRY_FLAG_READWRITE) };
-
-                    kdebug!("EptEntryFlags: {:?}", flags);
-
-
-                    // Map the newly allocated table into the current table
-                    unsafe { table.set_entry(i, PageEntry::new(frame, flags)) };
-
-                    // Descend into the newly allocated table
-                    table = table.next_level_table(i)?;
+                    // Allocate the next-level page table
+                    let frame = unsafe { self.frame_allocator.allocate_one() }?;
+
+                    // Zero the page frame
+                    unsafe {
+                        MMArch::write_bytes(
+                            MMArch::phys_2_virt(frame).unwrap(),
+                            0,
+                            MMArch::PAGE_SIZE,
+                        )
+                    };
+
+                    // FIXME: the flags chosen for the table entry may be wrong
+                    let flags: PageFlags<MMArch> = unsafe {
+                        PageFlags::from_data(
+                            MMArch::ENTRY_FLAG_DEFAULT_TABLE | MMArch::ENTRY_FLAG_READWRITE,
+                        )
+                    };
+
+                    kdebug!("EptEntryFlags: {:?}", flags);
+
+                    // Map the newly allocated table into the current table
+                    unsafe { table.set_entry(i, PageEntry::new(frame, flags)) };
+
+                    // Descend into the newly allocated table
+                    table = table.next_level_table(i)?;
                 }
             }
-
         }
     }
 }
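Caller's view of the walk above: map() picks a fresh host frame and hands it to map_gpa(), which descends from the EPT root, allocating and zeroing intermediate tables until it can install the leaf entry, and returns a PageFlush guard. A hedged usage sketch mirroring the EPT-violation path in mmu_internal.rs (the helper and its expect message are illustrative, not part of the patch):

fn map_one_gpa(mapper: &mut EptPageMapper, gpa: PhysAddr, flags: PageFlags<MMArch>) {
    mapper
        .map(gpa, flags)
        .expect("EPT map failed: unaligned gpa or out of host frames")
        .flush(); // invalidate any stale translation for this address
}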
diff --git a/kernel/src/arch/x86_64/vm/vmx/exit.rs b/kernel/src/arch/x86_64/vm/vmx/exit.rs
index 1558757d4..38a6a9721 100644
--- a/kernel/src/arch/x86_64/vm/vmx/exit.rs
+++ b/kernel/src/arch/x86_64/vm/vmx/exit.rs
@@ -268,7 +268,28 @@ impl VmxExitHandlers {
     }
 
     fn handle_io(vcpu: &mut VirtCpu) -> Result<i32, SystemError> {
-        todo!();
+        let exit_qualification = vcpu.get_exit_qual();
+        let string = (exit_qualification & 16) != 0;
+
+        vcpu.stat.io_exits += 1;
+
+        if string {
+            todo!("kvm_emulate_instruction todo");
+        }
+
+        let port = exit_qualification >> 16;
+        let size = (exit_qualification & 7) + 1;
+        let is_in = (exit_qualification & 8) != 0;
+
+        return vcpu
+            .arch
+            .kvm_fast_pio(
+                vcpu.run.as_mut().unwrap().as_mut(),
+                size as u32,
+                port as u16,
+                is_in,
+            )
+            .and(Ok(0));
     }
 
     fn handle_external_interrupt(vcpu: &mut VirtCpu) -> Result<i32, SystemError> {
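handle_io() above decodes the VM-exit qualification for I/O instructions as laid out in the Intel SDM (Vol. 3, "Exit Qualification for I/O Instructions"): bits 2:0 hold the access size minus one, bit 3 the direction (1 = IN), bit 4 the string-instruction flag, and bits 31:16 the port number. The same decode as a standalone sketch (struct and function names are illustrative):

struct IoExitInfo {
    size: u32,    // access width: 1, 2 or 4 bytes
    is_in: bool,  // true for IN/INS, false for OUT/OUTS
    string: bool, // true for INS/OUTS
    port: u16,    // I/O port number
}

fn decode_io_exit(exit_qualification: u64) -> IoExitInfo {
    IoExitInfo {
        size: ((exit_qualification & 0x7) + 1) as u32,
        is_in: (exit_qualification & 0x8) != 0,
        string: (exit_qualification & 0x10) != 0,
        port: (exit_qualification >> 16) as u16,
    }
}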
diff --git a/kernel/src/virt/vm/kvm_host/io.rs b/kernel/src/virt/vm/kvm_host/io.rs
new file mode 100644
index 000000000..9f22da4d0
--- /dev/null
+++ b/kernel/src/virt/vm/kvm_host/io.rs
@@ -0,0 +1,182 @@
+use core::fmt::Debug;
+
+use alloc::vec::Vec;
+
+use alloc::boxed::Box;
+use system_error::SystemError;
+
+use super::vcpu::VirtCpu;
+use super::{KvmBus, Vm};
+
+pub trait KvmIoDeviceOps: Send + Sync + Debug {
+    fn read(
+        &self,
+        vcpu: &VirtCpu,
+        addr: usize,
+        len: u32,
+        val: &mut usize,
+    ) -> Result<(), SystemError> {
+        return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
+    }
+
+    fn write(&self, vcpu: &VirtCpu, addr: usize, len: u32, val: &usize) -> Result<(), SystemError> {
+        return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
+    }
+}
+
+#[derive(Debug)]
+pub struct KvmIoRange {
+    pub addr: usize,
+    pub len: u32,
+    pub dev_ops: Option<Box<dyn KvmIoDeviceOps>>,
+}
+
+impl PartialEq for KvmIoRange {
+    fn eq(&self, other: &Self) -> bool {
+        self.addr == other.addr && self.len == other.len
+    }
+}
+
+impl Eq for KvmIoRange {}
+
+impl PartialOrd for KvmIoRange {
+    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
+        let mut addr1 = self.addr;
+        let mut addr2 = other.addr;
+
+        if addr1 < addr2 {
+            return Some(core::cmp::Ordering::Less);
+        }
+
+        if other.len != 0 {
+            addr1 += self.len as usize;
+            addr2 += other.len as usize;
+        }
+
+        if addr1 > addr2 {
+            return Some(core::cmp::Ordering::Greater);
+        }
+
+        return Some(core::cmp::Ordering::Equal);
+    }
+}
+
+impl Ord for KvmIoRange {
+    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
+        return self.partial_cmp(other).unwrap();
+    }
+}
+
+#[derive(Debug)]
+pub struct KvmIoBus {
+    pub dev_count: u32,
+    pub ioeventfd_count: u32,
+    pub range: Vec<KvmIoRange>,
+}
+
+impl VirtCpu {
+    pub fn kvm_io_bus_write(
+        &self,
+        vm: &mut Vm,
+        bus_idx: KvmBus,
+        addr: usize,
+        len: u32,
+        val: &usize,
+    ) -> Result<(), SystemError> {
+        let bus_idx = bus_idx as usize;
+        if bus_idx >= vm.buses.len() {
+            return Err(SystemError::ENOMEM);
+        }
+        let bus = &mut vm.buses[bus_idx];
+        let range = KvmIoRange {
+            addr,
+            len,
+            dev_ops: None,
+        };
+
+        return self.internal_kvm_bus_write(bus, range, val).and(Ok(()));
+    }
+
+    fn internal_kvm_bus_write(
+        &self,
+        bus: &KvmIoBus,
+        range: KvmIoRange,
+        val: &usize,
+    ) -> Result<usize, SystemError> {
+        let mut idx = Self::kvm_io_bus_get_first_dev(bus, range.addr, range.len)?;
+
+        while idx < bus.dev_count as usize && range == bus.range[idx] {
+            if let Some(dev_ops) = &bus.range[idx].dev_ops {
+                dev_ops.write(self, range.addr, range.len, val)?;
+                return Ok(idx);
+            }
+            idx += 1;
+        }
+
+        return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
+    }
+
+    pub fn kvm_io_bus_read(
+        &self,
+        vm: &mut Vm,
+        bus_idx: KvmBus,
+        addr: usize,
+        len: u32,
+        val: &mut usize,
+    ) -> Result<(), SystemError> {
+        let bus_idx = bus_idx as usize;
+        if bus_idx >= vm.buses.len() {
+            return Err(SystemError::ENOMEM);
+        }
+        let bus = &mut vm.buses[bus_idx];
+        let range = KvmIoRange {
+            addr,
+            len,
+            dev_ops: None,
+        };
+
+        return self.internal_kvm_bus_read(bus, range, val).and(Ok(()));
+    }
+
+    fn internal_kvm_bus_read(
+        &self,
+        bus: &KvmIoBus,
+        range: KvmIoRange,
+        val: &mut usize,
+    ) -> Result<usize, SystemError> {
+        let mut idx = Self::kvm_io_bus_get_first_dev(bus, range.addr, range.len)?;
+
+        while idx < bus.dev_count as usize && range == bus.range[idx] {
+            if let Some(dev_ops) = &bus.range[idx].dev_ops {
+                dev_ops.read(self, range.addr, range.len, val)?;
+                return Ok(idx);
+            }
+            idx += 1;
+        }
+
+        return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
+    }
+
+    fn kvm_io_bus_get_first_dev(
+        bus: &KvmIoBus,
+        addr: usize,
+        len: u32,
+    ) -> Result<usize, SystemError> {
+        let key = KvmIoRange {
+            addr,
+            len,
+            dev_ops: None,
+        };
+        let range = bus.range.binary_search(&key);
+
+        if let Ok(mut idx) = range {
+            while idx > 0 && key == bus.range[idx - 1] {
+                idx -= 1;
+            }
+
+            return Ok(idx);
+        } else {
+            return Err(SystemError::ENOENT);
+        }
+    }
+}
diff --git a/kernel/src/virt/vm/kvm_host/mod.rs b/kernel/src/virt/vm/kvm_host/mod.rs
index a3883b507..a0af2a3a3 100644
--- a/kernel/src/virt/vm/kvm_host/mod.rs
+++ b/kernel/src/virt/vm/kvm_host/mod.rs
@@ -9,6 +9,7 @@ use alloc::{
     vec::Vec,
 };
 use hashbrown::HashMap;
+use io::KvmIoBus;
 use mem::LockedKvmMemSlot;
 use system_error::SystemError;
 
@@ -36,6 +37,7 @@ use self::{
     vcpu::{GuestDebug, VcpuMode},
 };
 
+pub mod io;
 pub mod mem;
 pub mod vcpu;
 
@@ -93,6 +95,7 @@ impl LockedVm {
             kvm_vmx: KvmVmx::default(),
             nr_memslots_dirty_logging: 0,
             mmu_invalidate_seq: 0,
+            buses: Vec::with_capacity(KvmBus::NrBuses as usize),
         };
 
         let ret = Arc::new(Self {
@@ -147,6 +150,8 @@ pub struct Vm {
     pub kvm_vmx: KvmVmx,
 
     pub mmu_invalidate_seq: u64, // MMU invalidation sequence number
+
+    pub buses: Vec<KvmIoBus>,
 }
 
 impl Vm {
@@ -245,6 +250,16 @@ pub enum MutilProcessorState {
     ApResetHold,
     Suspended,
 }
+
+pub enum KvmBus {
+    MmioBus = 0,
+    PioBus = 1,
+    VirtioCcwNotifyBus = 2,
+    FastMmioBus = 3,
+
+    NrBuses,
+}
+
 /// Return the memslot containing gfn, or NULL if none is found.
 /// When "approx" is true, a memslot is returned even if the address falls in a hole;
 /// in that case one of the memslots bordering the hole is returned.
diff --git a/kernel/src/virt/vm/user_api.rs b/kernel/src/virt/vm/user_api.rs
index e7d078c2b..03c07abbb 100644
--- a/kernel/src/virt/vm/user_api.rs
+++ b/kernel/src/virt/vm/user_api.rs
@@ -158,6 +158,9 @@ pub struct UapiKvmRunBindgenTy1BindgenTy4 {
     pub data_offset: u64,
 }
 
+pub const KVM_EXIT_IO_IN: u8 = 0;
+pub const KVM_EXIT_IO_OUT: u8 = 1;
+
 #[repr(C)]
 #[derive(Debug, Default, Copy, Clone, PartialEq)]
 pub struct UapiKvmDebugExitArch {
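The KvmIoRange ordering above is a port of Linux's kvm_io_bus_cmp: an access compares Equal to a registered range that fully contains it, which lets Vec::binary_search land on some overlapping entry; kvm_io_bus_get_first_dev then walks backwards to the first such entry. A simplified standalone model of that comparison (the types and test are illustrative, not part of the patch):

use core::cmp::Ordering;

struct Range {
    addr: usize,
    len: u32,
}

fn io_bus_cmp(r1: &Range, r2: &Range) -> Ordering {
    let (mut a1, mut a2) = (r1.addr, r2.addr);
    if a1 < a2 {
        return Ordering::Less;
    }
    if r2.len != 0 {
        a1 += r1.len as usize;
        a2 += r2.len as usize;
    }
    if a1 > a2 {
        return Ordering::Greater;
    }
    Ordering::Equal // r1 lies entirely within r2
}

#[test]
fn contained_access_matches_device_range() {
    let dev = Range { addr: 0x3f8, len: 8 }; // e.g. a COM1-style port window
    let access = Range { addr: 0x3fb, len: 2 }; // a 2-byte access inside it
    assert_eq!(io_bus_cmp(&access, &dev), Ordering::Equal);
}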