diff options
| author | Natasha Moongrave <natasha@256phi.eu> | 2026-04-08 16:45:40 +0200 |
|---|---|---|
| committer | Natasha Moongrave <natasha@256phi.eu> | 2026-04-08 16:45:40 +0200 |
| commit | c683e9ba28583c4650992394067422bb6cff75f6 (patch) | |
| tree | 19d7205b61d4fdcaecbbd67b4474675860676360 | |
| parent | af1089a4262414b64714b87180f2223c8a40918f (diff) | |
[Phase 2.2/2.3] SYSCALL/SYSRET MSR setup + syscall dispatcher
syscall/mod.rs:
- Configure STAR (kernel CS=0x08, user base=0x10), LSTAR, SFMASK (clear IF),
and EFER.SCE to enable the SYSCALL/SYSRET instruction pair
- Naked assembly entry stub: swapgs, save/restore user RSP via a global static
scratch slot (single-CPU only for now; must become GS-relative per-CPU storage for SMP),
remap r10→rcx for Linux arg4 convention, call Rust syscall_handler, sysretq
- validate_user_ptr(): bounds check for all user pointers (0..USER_ADDR_MAX)
- set_syscall_kernel_stack(): for scheduler to update per-CPU kernel RSP
syscall/dispatch.rs:
- dispatch(): routes syscall numbers; unknown → -ENOSYS (never panics)
- sys_write(fd, buf, count): fd 1/2 → serial; validates user ptr; read_volatile
loop avoids Rust aliasing assumptions on untrusted memory
- sys_exit(): serial log + hlt_loop() placeholder; Phase 2.5 adds scheduling
lib.rs: register syscall::init() + syscall::init_kernel_stack() in strix_os::init()
All existing integration tests pass.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
| -rw-r--r-- | NOTES.md | 18 | ||||
| -rw-r--r-- | StrixKernel/src/lib.rs | 6 | ||||
| -rw-r--r-- | StrixKernel/src/syscall/dispatch.rs | 120 | ||||
| -rw-r--r-- | StrixKernel/src/syscall/mod.rs | 304 |
4 files changed, 445 insertions, 3 deletions
@@ -18,8 +18,8 @@ **Branch**: `CLAUDE_TEST` **Phase**: Phase 2 — User Space Foundation -**Last commit**: `[Phase 2.1] GDT user space segments + heap growth` -**Next task**: `[Phase 2.2]` — SYSCALL/SYSRET MSR setup +**Last commit**: `[Phase 2.2/2.3] SYSCALL/SYSRET MSR setup + syscall dispatcher` +**Next task**: `[Phase 2.4]` — Process structure (task/process.rs) --- @@ -75,6 +75,20 @@ User address limit: 0x0000_8000_0000_0000 (canonical boundary) **Next**: Phase 2.1 — Extend GDT. **Decisions**: None new. +### [Phase 2.2/2.3] 2026-04-08 — SYSCALL/SYSRET MSR setup + dispatcher +**Done**: +- `src/syscall/mod.rs`: configure STAR/LSTAR/SFMASK/EFER.SCE MSRs +- Naked assembly `syscall_entry_asm`: swapgs, save user RSP, load kernel RSP, remap r10→rcx for arg4, call `syscall_handler`, restore, sysretq +- `src/syscall/dispatch.rs`: dispatcher with `write(1)`, `exit(60)`, `exit_group(231)`; unknown → `-ENOSYS` +- `sys_write`: validates user pointer before dereference; uses `read_volatile` to avoid aliasing UB +- Registered `syscall::init()` + `syscall::init_kernel_stack()` in `strix_os::init()` +- All existing integration tests still pass in QEMU +**Next**: Phase 2.4 — Process structure +**Decisions**: +- Use `swapgs` pattern for user/kernel GS switching (per-CPU scratch for user RSP) +- `sys_write` uses raw pointer + `read_volatile` loop (not `&[u8]` slice) on user memory +- `sys_exit` currently halts; Phase 2.5 will add proper process termination + ### [Phase 2.1] 2026-04-08 — GDT user space segments + heap growth **Done**: - Restructured `StrixKernel/src/gdt.rs`: added `kernel_data`, `user_data`, `user_code` segments in the correct order for SYSCALL/SYSRET ABI diff --git a/StrixKernel/src/lib.rs b/StrixKernel/src/lib.rs index 0d1fd4e..725758b 100644 --- a/StrixKernel/src/lib.rs +++ b/StrixKernel/src/lib.rs @@ -41,12 +41,13 @@ use core::panic::PanicInfo; extern crate alloc; +pub mod allocator; pub mod gdt; pub mod interrupts; pub mod memory; pub mod serial; +pub mod syscall; 
pub mod vga_buffer; -pub mod allocator; /// Initializes the kernel's core subsystems. /// @@ -89,6 +90,9 @@ pub fn init() { // The PICS static is protected by a spinlock for safe concurrent access. unsafe { interrupts::PICS.lock().initialize() }; x86_64::instructions::interrupts::enable(); + // Initialize SYSCALL/SYSRET MSRs (requires GDT already loaded above). + syscall::init(); + syscall::init_kernel_stack(); } /// A trait for types that can be run as tests in the kernel test framework. diff --git a/StrixKernel/src/syscall/dispatch.rs b/StrixKernel/src/syscall/dispatch.rs new file mode 100644 index 0000000..73e35e6 --- /dev/null +++ b/StrixKernel/src/syscall/dispatch.rs @@ -0,0 +1,120 @@ +//! # Syscall Dispatcher +//! +//! Routes incoming syscall numbers to their handler functions. +//! Follows the Linux x86-64 syscall ABI (unistd_64.h numbering). +//! +//! ## Error Handling +//! +//! All unimplemented syscalls return [`errno::ENOSYS`] (`-38`), NOT a panic. +//! Panicking on unknown syscalls would make the kernel trivially DoS-able from +//! user space. Returning `-ENOSYS` is the correct POSIX behavior. + +use super::errno; +use super::validate_user_ptr; +use crate::serial_println; + +// ── Syscall number constants (Linux x86-64 ABI) ────────────────────────────── + +/// `write(fd, buf, count)` — write to a file descriptor +const SYS_WRITE: u64 = 1; +/// `exit(status)` — terminate the calling process +const SYS_EXIT: u64 = 60; +/// `exit_group(status)` — terminate all threads in the process +const SYS_EXIT_GROUP: u64 = 231; + +// ── Dispatcher ─────────────────────────────────────────────────────────────── + +/// Dispatches a syscall to its handler. +/// +/// # Arguments +/// +/// * `num` — syscall number (from user `rax`) +/// * `arg1`–`arg6` — syscall arguments (rdi, rsi, rdx, r10, r8, r9) +/// +/// # Returns +/// +/// Value to place in `rax` on return to user space. +/// Negative = negated errno. 
+pub fn dispatch(num: u64, arg1: u64, arg2: u64, arg3: u64, _arg4: u64, _arg5: u64, _arg6: u64) -> i64 { + match num { + SYS_WRITE => sys_write(arg1, arg2, arg3), + SYS_EXIT => sys_exit(arg1), + SYS_EXIT_GROUP => sys_exit(arg1), + _ => { + // Log unimplemented syscall for debugging (don't panic). + serial_println!("[syscall] ENOSYS: num={}", num); + errno::ENOSYS + } + } +} + +// ── Handlers ───────────────────────────────────────────────────────────────── + +/// `write(fd, buf, count)` — write bytes to a file descriptor. +/// +/// Phase 2 implementation: only supports fd=1 (stdout) and fd=2 (stderr), +/// both wired to the serial port. Full VFS-backed implementation comes in +/// Phase 4 when the file descriptor table is available. +/// +/// # Security +/// +/// `buf` is a user-space pointer. It is validated to be within the canonical +/// user address range before any dereference. +fn sys_write(fd: u64, buf_ptr: u64, count: u64) -> i64 { + // Only stdout/stderr are supported before VFS is implemented. + if fd != 1 && fd != 2 { + return errno::EBADF; + } + + // Reject zero-length writes immediately. + if count == 0 { + return 0; + } + + // Reject absurdly large writes to prevent integer overflow in validation. + // 16 MiB is a generous upper bound for a single write. + if count > 16 * 1024 * 1024 { + return errno::EINVAL; + } + + // Validate the user pointer range before dereferencing. + if !validate_user_ptr(buf_ptr, count) { + return errno::EFAULT; + } + + // SAFETY: We have validated: + // 1. buf_ptr + count is within user address space (not a kernel address) + // 2. count > 0 and count <= 16 MiB + // + // Remaining risk: the user-space buffer may be unmapped or partially + // unmapped. A full implementation would catch the resulting page fault + // and return EFAULT. For Phase 2, we rely on the fact that the test + // code maps its own buffers correctly. 
+ // + // DO NOT add any Rust references (&[u8]) here — a raw pointer loop avoids + // Rust's aliasing assumptions for untrusted user memory. + let mut written: u64 = 0; + let mut ptr = buf_ptr as *const u8; + while written < count { + // SAFETY: ptr is within the validated user range and advances by 1 each iter. + let byte = unsafe { ptr.read_volatile() }; + crate::serial_print!("{}", byte as char); + ptr = unsafe { ptr.add(1) }; + written += 1; + } + + written as i64 +} + +/// `exit(status)` — terminate the current process. +/// +/// Phase 2 implementation: for the initial single-process boot, we just +/// halt the kernel. The scheduler (Phase 2.5) will replace this with proper +/// process termination and scheduling of the next ready process. +fn sys_exit(status: u64) -> i64 { + let code = (status & 0xFF) as u8; + serial_println!("[syscall] exit({})", code); + // TODO(Phase 2.5): mark current process as Zombie, schedule next process. + // For now, halt. This is a deliberate placeholder. + crate::hlt_loop(); +} diff --git a/StrixKernel/src/syscall/mod.rs b/StrixKernel/src/syscall/mod.rs new file mode 100644 index 0000000..99d9dd1 --- /dev/null +++ b/StrixKernel/src/syscall/mod.rs @@ -0,0 +1,304 @@ +//! # System Call Interface +//! +//! This module implements the x86-64 `SYSCALL`/`SYSRET` fast system call mechanism, +//! providing the boundary between user space (Ring 3) and the kernel (Ring 0). +//! +//! ## SYSCALL/SYSRET Mechanics +//! +//! On x86-64, `SYSCALL` is the recommended system call instruction (faster than +//! `INT 0x80`). The CPU: +//! 1. Saves `RIP` → `RCX`, `RFLAGS` → `R11` +//! 2. Loads CS from `STAR[47:32]`, SS from `STAR[47:32]+8` +//! 3. Masks RFLAGS with `SFMASK` (we clear IF to disable interrupts) +//! 4. Jumps to the address in `LSTAR` +//! +//! On `SYSRET` (64-bit): +//! 1. Restores `RIP` from `RCX`, `RFLAGS` from `R11` +//! 2. Loads CS from `STAR[63:48]+16`, SS from `STAR[63:48]+8` +//! 3. Returns to Ring 3 +//! +//! 
## Linux x86-64 ABI +//! +//! Strix OS follows the standard Linux syscall ABI for compatibility: +//! +//! | Register | Role | +//! |----------|------| +//! | `rax` | Syscall number (in); return value (out) | +//! | `rdi` | Argument 1 | +//! | `rsi` | Argument 2 | +//! | `rdx` | Argument 3 | +//! | `r10` | Argument 4 (note: `r10` not `rcx` — `rcx` is clobbered by SYSCALL) | +//! | `r8` | Argument 5 | +//! | `r9` | Argument 6 | +//! | `rcx` | Clobbered by SYSCALL (saved original RIP) | +//! | `r11` | Clobbered by SYSCALL (saved original RFLAGS) | +//! +//! Return value convention: negative value = `-errno` (e.g. `-22` = `-EINVAL`). +//! +//! ## Security +//! +//! Every syscall that receives a pointer from user space must validate it with +//! [`validate_user_ptr`] before dereferencing. Pointers outside the canonical +//! user address range `0..USER_ADDR_MAX` are rejected with `-EFAULT`. + +pub mod dispatch; + +use x86_64::registers::model_specific::{Efer, EferFlags, LStar, SFMask, Star}; +use x86_64::registers::rflags::RFlags; +use x86_64::VirtAddr; + +use crate::gdt; + +/// Upper bound of the user-accessible virtual address space (exclusive). +/// +/// On x86-64, canonical addresses have bits 48–63 equal to bit 47. +/// User space occupies `0x0000_0000_0000_0000..0x0000_8000_0000_0000`. +/// Any pointer at or above this value is a kernel address and must be rejected. +pub const USER_ADDR_MAX: u64 = 0x0000_8000_0000_0000; + +/// Errno constants (negated, as returned in rax). 
+pub mod errno { + /// Operation not permitted + pub const EPERM: i64 = -1; + /// No such file or directory + pub const ENOENT: i64 = -2; + /// Input/output error + pub const EIO: i64 = -5; + /// Bad file descriptor + pub const EBADF: i64 = -9; + /// Cannot allocate memory + pub const ENOMEM: i64 = -12; + /// Invalid argument + pub const EINVAL: i64 = -22; + /// Function not implemented + pub const ENOSYS: i64 = -38; + /// Bad address (pointer validation failure) + pub const EFAULT: i64 = -14; +} + +/// Validates that a user-supplied pointer range lies entirely within user space. +/// +/// Returns `true` if the entire range `[ptr, ptr+len)` is within the canonical +/// user address range `[0, USER_ADDR_MAX)`. +/// +/// # Arguments +/// +/// * `ptr` - Base address from user space +/// * `len` - Length of the region in bytes +/// +/// # Security +/// +/// Must be called before any syscall handler dereferences a user pointer. +/// Failure to validate allows a malicious user process to read/write arbitrary +/// kernel memory (kernel data disclosure or control-flow hijack). +#[inline] +pub fn validate_user_ptr(ptr: u64, len: u64) -> bool { + // Check that neither ptr nor ptr+len overflows, and the entire range is below + // the user/kernel address boundary. + ptr < USER_ADDR_MAX && ptr.saturating_add(len) <= USER_ADDR_MAX +} + +/// The syscall entry point — called by the CPU on every `SYSCALL` instruction. +/// +/// This is the Rust side of the syscall entry. It receives the register state +/// saved by the assembly stub in `syscall_entry_asm`, dispatches to the +/// appropriate handler, and returns the result in `rax`. +/// +/// # Arguments (matching Linux x86-64 ABI) +/// +/// * `syscall_num` — syscall number (from rax) +/// * `arg1..arg6` — syscall arguments (rdi, rsi, rdx, r10, r8, r9) +/// +/// # Returns +/// +/// The value to place in `rax` on return to user space. +/// Negative values indicate errors (negated errno). 
+#[unsafe(no_mangle)] +pub extern "C" fn syscall_handler( + syscall_num: u64, + arg1: u64, + arg2: u64, + arg3: u64, + arg4: u64, + arg5: u64, + arg6: u64, +) -> i64 { + dispatch::dispatch(syscall_num, arg1, arg2, arg3, arg4, arg5, arg6) +} + +/// Initializes the SYSCALL/SYSRET mechanism by configuring the required MSRs. +/// +/// Must be called after [`crate::gdt::init`] (we read the GDT selectors). +/// +/// # MSR Configuration +/// +/// - **STAR**: encodes the kernel CS/SS selectors (for SYSCALL) and the base +/// for computing user CS/SS on SYSRET. +/// - **LSTAR**: the virtual address of `syscall_entry_asm` — where the CPU +/// jumps on SYSCALL. +/// - **SFMASK**: bits to clear from RFLAGS on SYSCALL entry. We clear `IF` +/// (interrupt flag) so kernel entry runs with interrupts disabled until we +/// explicitly re-enable them. +/// - **EFER.SCE**: the System Call Extensions bit must be set to enable SYSCALL. +pub fn init() { + let kernel_cs = gdt::GDT.1.kernel_code_selector; + let kernel_ss = gdt::GDT.1.kernel_data_selector; + + // SAFETY: Writing to MSRs is always unsafe (ring-0 only operation). + // We own these MSRs; no other code writes them. + unsafe { + // Enable SYSCALL/SYSRET via EFER.SCE + Efer::update(|f| *f |= EferFlags::SYSTEM_CALL_EXTENSIONS); + + // STAR: [47:32] = kernel CS (SS = kernel CS + 8 auto), + // [63:48] = kernel DS (SYSRET derives user SS=+8, user CS=+16) + Star::write( + gdt::GDT.1.user_code_selector, + gdt::GDT.1.user_data_selector, + kernel_cs, + kernel_ss, + ) + .expect("STAR MSR write failed: invalid segment selectors"); + + // LSTAR: syscall entry point + LStar::write(VirtAddr::new(syscall_entry_asm as *const () as u64)); + + // SFMASK: clear IF on SYSCALL entry (interrupts disabled in syscall handler) + SFMask::write(RFlags::INTERRUPT_FLAG); + } +} + +/// Raw assembly syscall entry stub. +/// +/// The CPU jumps here on every `SYSCALL` instruction. We must: +/// 1. 
Switch to the kernel stack (SYSCALL does NOT switch RSP automatically) +/// 2. Save the caller-save registers that the C ABI doesn't preserve +/// 3. Rearrange arguments to match Rust's `extern "C"` calling convention +/// 4. Call `syscall_handler` +/// 5. Restore registers and execute `SYSRET` +/// +/// # Stack Management +/// +/// SYSCALL does not switch stacks. We need a per-CPU kernel stack pointer. +/// For now we use a simple per-CPU static; the scheduler will replace this +/// with a per-process kernel stack in Phase 2.5. +/// +/// # Register State on Entry (from CPU) +/// +/// | Register | Content | +/// |----------|---------| +/// | rax | syscall number | +/// | rcx | saved user RIP (return address) | +/// | r11 | saved user RFLAGS | +/// | rdi | arg1 | +/// | rsi | arg2 | +/// | rdx | arg3 | +/// | r10 | arg4 (Linux uses r10 instead of rcx for arg4) | +/// | r8 | arg5 | +/// | r9 | arg6 | +/// +/// # SYSRET Requirements +/// +/// Before `SYSRET`: +/// - `rcx` = user RIP to return to +/// - `r11` = user RFLAGS to restore +/// - `rsp` = user stack pointer +/// - CS/SS loaded from STAR automatically +#[unsafe(naked)] +unsafe extern "C" fn syscall_entry_asm() { + // SAFETY: This is a naked function — no prologue/epilogue. We manually + // manage the stack. The `syscall` instruction has already saved RIP→RCX + // and RFLAGS→R11. + core::arch::naked_asm!( + // ── Switch to kernel stack ────────────────────────────────────────── + // Save user RSP into a per-CPU scratch location, then load kernel RSP. + // We use the GS-relative per-CPU area for the user RSP spill slot. + // For Phase 2.2 we use a simple static kernel stack; the scheduler + // will update TSS.RSP0 and we'll switch via that in Phase 2.5. 
+ "swapgs", // swap GS base: user GS <-> kernel GS + "mov [{user_rsp}], rsp", // save user RSP to per-CPU scratch + "mov rsp, [{kernel_rsp}]", // load kernel RSP from per-CPU area + + // ── Save caller-saved registers not preserved by the C ABI ────────── + // rcx = user RIP (must survive until SYSRET) + // r11 = user RFLAGS (must survive until SYSRET) + "push rcx", // save user RIP + "push r11", // save user RFLAGS + + // ── Remap arg4: Linux passes arg4 in r10, C ABI expects rcx ───────── + "mov rcx, r10", // arg4: r10 → rcx for C ABI + + // ── Call Rust syscall handler ──────────────────────────────────────── + // syscall_handler(syscall_num=rax, arg1=rdi, arg2=rsi, arg3=rdx, + // arg4=rcx [was r10], arg5=r8, arg6=r9) + // Return value lands in rax. + "call {handler}", + + // ── Restore saved registers ────────────────────────────────────────── + "pop r11", // restore user RFLAGS + "pop rcx", // restore user RIP + + // ── Restore user stack pointer ─────────────────────────────────────── + "mov rsp, [{user_rsp}]", // restore user RSP + "swapgs", // swap GS base back to user GS + + // ── Return to user space ───────────────────────────────────────────── + // SYSRET restores RIP from RCX, RFLAGS from R11, sets CS/SS from STAR. + "sysretq", + + handler = sym syscall_handler, + user_rsp = sym SYSCALL_USER_RSP, + kernel_rsp = sym SYSCALL_KERNEL_RSP, + ); +} + +/// Per-CPU scratch space: user RSP saved during SYSCALL entry. +/// +/// This is a temporary spill slot; the value is only valid between the +/// `swapgs`/`mov [{user_rsp}]` and the corresponding restore. In a +/// multi-CPU system this would need to be per-CPU (GS-relative). For now +/// Strix OS runs on a single CPU. +static mut SYSCALL_USER_RSP: u64 = 0; + +/// Per-CPU kernel RSP used as the initial kernel stack on SYSCALL entry. +/// +/// Initialized by [`init_kernel_stack`]. The scheduler updates this in +/// Phase 2.5 to point at the current process's kernel stack. 
+static mut SYSCALL_KERNEL_RSP: u64 = 0; + +/// Kernel stack for the initial syscall handler context (4 KiB). +const SYSCALL_STACK_SIZE: usize = 4096; +static mut SYSCALL_STACK: [u8; SYSCALL_STACK_SIZE] = [0; SYSCALL_STACK_SIZE]; + +/// Initializes the per-CPU kernel stack pointer used by the syscall entry stub. +/// +/// Must be called after [`init`]. +/// +/// # Safety +/// +/// Must be called from a single CPU before any user process runs. +pub fn init_kernel_stack() { + // SAFETY: Single-threaded initialization, no interrupts active yet. + // We use addr_of! to get the address without creating a reference to + // the mutable static (required by Rust 2024 edition). + unsafe { + let stack_base = core::ptr::addr_of!(SYSCALL_STACK) as u64; + SYSCALL_KERNEL_RSP = stack_base + SYSCALL_STACK_SIZE as u64; + } +} + +/// Updates the per-CPU kernel stack pointer for the syscall entry stub. +/// +/// Called by the scheduler in Phase 2.5 before entering each user process. +/// This ensures that SYSCALL switches to the correct per-process kernel stack. +/// +/// # Safety +/// +/// Must be called with interrupts disabled. `kernel_stack_top` must be a +/// valid mapped kernel stack pointer. +pub unsafe fn set_syscall_kernel_stack(kernel_stack_top: u64) { + // SAFETY: Caller ensures interrupts are disabled (no concurrent access). + unsafe { + SYSCALL_KERNEL_RSP = kernel_stack_top; + } +} |
