aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNatasha Moongrave <natasha@256phi.eu>2026-04-08 16:45:40 +0200
committerNatasha Moongrave <natasha@256phi.eu>2026-04-08 16:45:40 +0200
commitc683e9ba28583c4650992394067422bb6cff75f6 (patch)
tree19d7205b61d4fdcaecbbd67b4474675860676360
parentaf1089a4262414b64714b87180f2223c8a40918f (diff)
[Phase 2.2/2.3] SYSCALL/SYSRET MSR setup + syscall dispatcher
syscall/mod.rs: - Configure STAR (kernel CS=0x08, user base=0x10), LSTAR, SFMASK (clear IF), EFER.SCE to enable the SYSCALL/SYSRET instruction pair - Naked assembly entry stub: swapgs, save/restore user RSP via per-CPU static, remap r10→rcx for Linux arg4 convention, call Rust syscall_handler, sysretq - validate_user_ptr(): bounds check for all user pointers (0..USER_ADDR_MAX) - set_syscall_kernel_stack(): for scheduler to update per-CPU kernel RSP syscall/dispatch.rs: - dispatch(): routes syscall numbers; unknown → -ENOSYS (never panics) - sys_write(fd, buf, count): fd 1/2 → serial; validates user ptr; read_volatile loop avoids Rust aliasing assumptions on untrusted memory - sys_exit(): serial log + hlt_loop() placeholder; Phase 2.5 adds scheduling lib.rs: register syscall::init() + syscall::init_kernel_stack() in strix_os::init() All existing integration tests pass. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
-rw-r--r--NOTES.md18
-rw-r--r--StrixKernel/src/lib.rs6
-rw-r--r--StrixKernel/src/syscall/dispatch.rs120
-rw-r--r--StrixKernel/src/syscall/mod.rs304
4 files changed, 445 insertions, 3 deletions
diff --git a/NOTES.md b/NOTES.md
index 5358eb5..3463729 100644
--- a/NOTES.md
+++ b/NOTES.md
@@ -18,8 +18,8 @@
**Branch**: `CLAUDE_TEST`
**Phase**: Phase 2 — User Space Foundation
-**Last commit**: `[Phase 2.1] GDT user space segments + heap growth`
-**Next task**: `[Phase 2.2]` — SYSCALL/SYSRET MSR setup
+**Last commit**: `[Phase 2.2/2.3] SYSCALL/SYSRET MSR setup + syscall dispatcher`
+**Next task**: `[Phase 2.4]` — Process structure (task/process.rs)
---
@@ -75,6 +75,20 @@ User address limit: 0x0000_8000_0000_0000 (canonical boundary)
**Next**: Phase 2.1 — Extend GDT.
**Decisions**: None new.
+### [Phase 2.2/2.3] 2026-04-08 — SYSCALL/SYSRET MSR setup + dispatcher
+**Done**:
+- `src/syscall/mod.rs`: configure STAR/LSTAR/SFMASK/EFER.SCE MSRs
+- Naked assembly `syscall_entry_asm`: swapgs, save user RSP, load kernel RSP, remap r10→rcx for arg4, call `syscall_handler`, restore, sysretq
+- `src/syscall/dispatch.rs`: dispatcher with `write(1)`, `exit(60)`, `exit_group(231)`; unknown → `-ENOSYS`
+- `sys_write`: validates user pointer before dereference; uses `read_volatile` to avoid aliasing UB
+- Registered `syscall::init()` + `syscall::init_kernel_stack()` in `strix_os::init()`
+- All existing integration tests still pass in QEMU
+**Next**: Phase 2.4 — Process structure
+**Decisions**:
+- Use `swapgs` pattern for user/kernel GS switching (per-CPU scratch for user RSP)
+- `sys_write` uses raw pointer + `read_volatile` loop (not `&[u8]` slice) on user memory
+- `sys_exit` currently halts; Phase 2.5 will add proper process termination
+
### [Phase 2.1] 2026-04-08 — GDT user space segments + heap growth
**Done**:
- Restructured `StrixKernel/src/gdt.rs`: added `kernel_data`, `user_data`, `user_code` segments in the correct order for SYSCALL/SYSRET ABI
diff --git a/StrixKernel/src/lib.rs b/StrixKernel/src/lib.rs
index 0d1fd4e..725758b 100644
--- a/StrixKernel/src/lib.rs
+++ b/StrixKernel/src/lib.rs
@@ -41,12 +41,13 @@ use core::panic::PanicInfo;
extern crate alloc;
+pub mod allocator;
pub mod gdt;
pub mod interrupts;
pub mod memory;
pub mod serial;
+pub mod syscall;
pub mod vga_buffer;
-pub mod allocator;
/// Initializes the kernel's core subsystems.
///
@@ -89,6 +90,9 @@ pub fn init() {
// The PICS static is protected by a spinlock for safe concurrent access.
unsafe { interrupts::PICS.lock().initialize() };
x86_64::instructions::interrupts::enable();
+ // Initialize SYSCALL/SYSRET MSRs (requires GDT already loaded above).
+ syscall::init();
+ syscall::init_kernel_stack();
}
/// A trait for types that can be run as tests in the kernel test framework.
diff --git a/StrixKernel/src/syscall/dispatch.rs b/StrixKernel/src/syscall/dispatch.rs
new file mode 100644
index 0000000..73e35e6
--- /dev/null
+++ b/StrixKernel/src/syscall/dispatch.rs
@@ -0,0 +1,120 @@
+//! # Syscall Dispatcher
+//!
+//! Routes incoming syscall numbers to their handler functions.
+//! Follows the Linux x86-64 syscall ABI (unistd_64.h numbering).
+//!
+//! ## Error Handling
+//!
+//! All unimplemented syscalls return [`errno::ENOSYS`] (`-38`), NOT a panic.
+//! Panicking on unknown syscalls would make the kernel trivially DoS-able from
+//! user space. Returning `-ENOSYS` is the correct POSIX behavior.
+
+use super::errno;
+use super::validate_user_ptr;
+use crate::serial_println;
+
+// ── Syscall number constants (Linux x86-64 ABI) ──────────────────────────────
+
+/// Syscall number for `write(fd, buf, count)` — write `count` bytes from `buf`
+/// to file descriptor `fd`.
+const SYS_WRITE: u64 = 1;
+/// Syscall number for `exit(status)` — terminate the calling process.
+const SYS_EXIT: u64 = 60;
+/// Syscall number for `exit_group(status)` — terminate all threads in the
+/// process. The dispatcher in this file routes it to the same handler as
+/// [`SYS_EXIT`].
+const SYS_EXIT_GROUP: u64 = 231;
+
+// ── Dispatcher ───────────────────────────────────────────────────────────────
+
+/// Routes one syscall request to its implementation.
+///
+/// # Arguments
+///
+/// * `num` — the syscall number requested by user space
+/// * `arg1`..`arg3` — forwarded to the selected handler
+/// * `_arg4`..`_arg6` — accepted for ABI completeness; no handler implemented
+///   so far consumes them
+///
+/// # Returns
+///
+/// The value handed back to user space in `rax`. A negative value is a negated
+/// errno; an unrecognised `num` is logged to the serial port and answered with
+/// [`errno::ENOSYS`] rather than a panic.
+pub fn dispatch(num: u64, arg1: u64, arg2: u64, arg3: u64, _arg4: u64, _arg5: u64, _arg6: u64) -> i64 {
+    match num {
+        SYS_WRITE => sys_write(arg1, arg2, arg3),
+        // Both exit flavours share one handler until threads exist.
+        SYS_EXIT | SYS_EXIT_GROUP => sys_exit(arg1),
+        unknown => {
+            // Leave a trace for debugging, then fail softly.
+            serial_println!("[syscall] ENOSYS: num={}", unknown);
+            errno::ENOSYS
+        }
+    }
+}
+
+// ── Handlers ─────────────────────────────────────────────────────────────────
+
+/// `write(fd, buf, count)` — copy `count` bytes from user memory to a file
+/// descriptor.
+///
+/// Only fd 1 (stdout) and fd 2 (stderr) are accepted, and both are sent to the
+/// serial port. Any other descriptor yields [`errno::EBADF`].
+///
+/// # Returns
+///
+/// * `0` when `count` is zero (the pointer is not inspected in that case)
+/// * [`errno::EINVAL`] when `count` exceeds 16 MiB
+/// * [`errno::EFAULT`] when `[buf_ptr, buf_ptr + count)` is not entirely inside
+///   the user address range
+/// * otherwise the number of bytes written, which is always `count`
+///
+/// # Security
+///
+/// `buf_ptr` comes from user space and is range-checked with
+/// [`validate_user_ptr`] before the first read. The range check does not prove
+/// the pages are mapped: an unmapped buffer still page-faults inside the kernel.
+fn sys_write(fd: u64, buf_ptr: u64, count: u64) -> i64 {
+    /// Largest single write accepted (16 MiB).
+    const MAX_WRITE_LEN: u64 = 16 * 1024 * 1024;
+
+    // No VFS yet: stdout and stderr are the only descriptors that exist.
+    if !matches!(fd, 1 | 2) {
+        return errno::EBADF;
+    }
+
+    // A zero-length write succeeds trivially.
+    if count == 0 {
+        return 0;
+    }
+
+    // Bound the length so the cast to `i64` below can never wrap.
+    if count > MAX_WRITE_LEN {
+        return errno::EINVAL;
+    }
+
+    // The whole range must lie in user space before any byte is touched.
+    if !validate_user_ptr(buf_ptr, count) {
+        return errno::EFAULT;
+    }
+
+    // User memory is read through a raw pointer only; no `&[u8]` is ever
+    // formed, so Rust's reference aliasing rules are never asserted over
+    // memory the kernel does not control.
+    let base = buf_ptr as *const u8;
+    for offset in 0..count {
+        // SAFETY: `offset < count`, and `[buf_ptr, buf_ptr + count)` was
+        // validated above to lie below the user/kernel boundary.
+        let byte = unsafe { base.add(offset as usize).read_volatile() };
+        // NOTE(review): the byte is widened to a `char` (U+0000..=U+00FF) and
+        // formatted with `{}`. If the serial writer emits UTF-8, values
+        // >= 0x80 will not go out as the single raw byte — confirm.
+        crate::serial_print!("{}", byte as char);
+    }
+
+    count as i64
+}
+
+/// `exit(status)` — end the current process.
+///
+/// Placeholder for the single-process boot: the low 8 bits of `status` are
+/// logged to the serial port and the kernel then parks in `crate::hlt_loop()`.
+/// The declared `i64` return type exists only to fit the dispatcher's
+/// signature.
+fn sys_exit(status: u64) -> i64 {
+    // Truncating to `u8` keeps exactly the low 8 bits of the status.
+    let exit_code = status as u8;
+    serial_println!("[syscall] exit({})", exit_code);
+    // TODO(Phase 2.5): mark the process as Zombie and switch to the next
+    // runnable one instead of halting.
+    crate::hlt_loop();
+}
diff --git a/StrixKernel/src/syscall/mod.rs b/StrixKernel/src/syscall/mod.rs
new file mode 100644
index 0000000..99d9dd1
--- /dev/null
+++ b/StrixKernel/src/syscall/mod.rs
@@ -0,0 +1,304 @@
+//! # System Call Interface
+//!
+//! This module implements the x86-64 `SYSCALL`/`SYSRET` fast system call mechanism,
+//! providing the boundary between user space (Ring 3) and the kernel (Ring 0).
+//!
+//! ## SYSCALL/SYSRET Mechanics
+//!
+//! On x86-64, `SYSCALL` is the recommended system call instruction (faster than
+//! `INT 0x80`). The CPU:
+//! 1. Saves `RIP` → `RCX`, `RFLAGS` → `R11`
+//! 2. Loads CS from `STAR[47:32]`, SS from `STAR[47:32]+8`
+//! 3. Masks RFLAGS with `SFMASK` (we clear IF to disable interrupts)
+//! 4. Jumps to the address in `LSTAR`
+//!
+//! On `SYSRET` (64-bit):
+//! 1. Restores `RIP` from `RCX`, `RFLAGS` from `R11`
+//! 2. Loads CS from `STAR[63:48]+16`, SS from `STAR[63:48]+8`
+//! 3. Returns to Ring 3
+//!
+//! ## Linux x86-64 ABI
+//!
+//! Strix OS follows the standard Linux syscall ABI for compatibility:
+//!
+//! | Register | Role |
+//! |----------|------|
+//! | `rax` | Syscall number (in); return value (out) |
+//! | `rdi` | Argument 1 |
+//! | `rsi` | Argument 2 |
+//! | `rdx` | Argument 3 |
+//! | `r10` | Argument 4 (note: `r10` not `rcx` — `rcx` is clobbered by SYSCALL) |
+//! | `r8` | Argument 5 |
+//! | `r9` | Argument 6 |
+//! | `rcx` | Clobbered by SYSCALL (saved original RIP) |
+//! | `r11` | Clobbered by SYSCALL (saved original RFLAGS) |
+//!
+//! Return value convention: negative value = `-errno` (e.g. `-22` = `-EINVAL`).
+//!
+//! ## Security
+//!
+//! Every syscall that receives a pointer from user space must validate it with
+//! [`validate_user_ptr`] before dereferencing. Pointers outside the canonical
+//! user address range `0..USER_ADDR_MAX` are rejected with `-EFAULT`.
+
+pub mod dispatch;
+
+use x86_64::registers::model_specific::{Efer, EferFlags, LStar, SFMask, Star};
+use x86_64::registers::rflags::RFlags;
+use x86_64::VirtAddr;
+
+use crate::gdt;
+
+/// Upper bound of the user-accessible virtual address space (exclusive).
+///
+/// On x86-64, canonical addresses have bits 48–63 equal to bit 47.
+/// User space occupies `0x0000_0000_0000_0000..0x0000_8000_0000_0000`, i.e. the
+/// lower canonical half where bit 47 is clear.
+/// Any pointer at or above this value is not a user address and must be
+/// rejected; [`validate_user_ptr`] enforces this bound.
+pub const USER_ADDR_MAX: u64 = 0x0000_8000_0000_0000;
+
+/// Negated errno values, ready to be returned to user space in `rax`.
+///
+/// Entries are listed in ascending errno order.
+pub mod errno {
+    /// `EPERM` (1): operation not permitted.
+    pub const EPERM: i64 = -1;
+    /// `ENOENT` (2): no such file or directory.
+    pub const ENOENT: i64 = -2;
+    /// `EIO` (5): input/output error.
+    pub const EIO: i64 = -5;
+    /// `EBADF` (9): bad file descriptor.
+    pub const EBADF: i64 = -9;
+    /// `ENOMEM` (12): cannot allocate memory.
+    pub const ENOMEM: i64 = -12;
+    /// `EFAULT` (14): bad address (a user pointer failed validation).
+    pub const EFAULT: i64 = -14;
+    /// `EINVAL` (22): invalid argument.
+    pub const EINVAL: i64 = -22;
+    /// `ENOSYS` (38): function not implemented.
+    pub const ENOSYS: i64 = -38;
+}
+
+/// Reports whether the byte range `[ptr, ptr + len)` lies entirely inside the
+/// user half of the address space, `[0, USER_ADDR_MAX)`.
+///
+/// # Arguments
+///
+/// * `ptr` - start address supplied by user space
+/// * `len` - size of the region in bytes
+///
+/// # Returns
+///
+/// `true` only when `ptr` is below [`USER_ADDR_MAX`] and the end of the range
+/// does not pass it. A range whose end would overflow `u64` is rejected.
+///
+/// # Security
+///
+/// Every syscall handler must call this before touching a user pointer;
+/// skipping it lets a user process aim the kernel at kernel memory. Passing
+/// this check says nothing about whether the pages are actually mapped.
+#[inline]
+pub fn validate_user_ptr(ptr: u64, len: u64) -> bool {
+    match ptr.checked_add(len) {
+        // Both the first byte and the one-past-the-end address must stay on
+        // the user side of the boundary.
+        Some(end) => ptr < USER_ADDR_MAX && end <= USER_ADDR_MAX,
+        // `ptr + len` wrapped around: certainly not a valid user range.
+        None => false,
+    }
+}
+
+/// Rust-side syscall entry point, called from the `syscall_entry_asm` stub.
+///
+/// This function only forwards its seven parameters, unchanged and in order,
+/// to [`dispatch::dispatch`] and returns that function's result.
+///
+/// # Arguments
+///
+/// * `syscall_num` — syscall number
+/// * `arg1..arg6` — syscall arguments 1 through 6
+///
+/// Because the function is `extern "C"`, the parameters are received in C
+/// calling-convention positions, not in the registers user space loaded
+/// before `SYSCALL`. The assembly stub that calls this function is
+/// responsible for moving the user register values into those positions.
+///
+/// # Returns
+///
+/// The value to place in `rax` on return to user space.
+/// Negative values indicate errors (negated errno).
+#[unsafe(no_mangle)]
+pub extern "C" fn syscall_handler(
+ syscall_num: u64,
+ arg1: u64,
+ arg2: u64,
+ arg3: u64,
+ arg4: u64,
+ arg5: u64,
+ arg6: u64,
+) -> i64 {
+ dispatch::dispatch(syscall_num, arg1, arg2, arg3, arg4, arg5, arg6)
+}
+
+/// Initializes the SYSCALL/SYSRET mechanism by configuring the required MSRs.
+///
+/// Must be called after [`crate::gdt::init`] (the GDT selectors are read here).
+///
+/// # MSR Configuration
+///
+/// - **EFER.SCE**: the System Call Extensions bit; `SYSCALL` raises `#UD`
+///   while it is clear.
+/// - **STAR**: holds the selector bases the CPU uses to load CS/SS on
+///   `SYSCALL` (kernel) and on `SYSRET` (user).
+/// - **LSTAR**: the virtual address of `syscall_entry_asm` — where the CPU
+///   jumps on `SYSCALL`.
+/// - **SFMASK**: RFLAGS bits the CPU clears on `SYSCALL` entry. The user's
+///   original RFLAGS are kept in `r11` and restored by `SYSRET`, so masking
+///   affects kernel execution only. Three bits are masked:
+///   - `IF` — the entry stub runs on the user stack for its first few
+///     instructions and must not be interrupted there;
+///   - `DF` — compiled Rust/C code assumes the direction flag is clear, and
+///     user space fully controls it;
+///   - `TF` — otherwise a user process could single-step into the kernel
+///     entry path before the stack has been switched.
+///
+/// # Panics
+///
+/// Panics if the GDT selectors cannot be encoded in STAR (the `x86_64` crate
+/// rejects selector layouts that violate the SYSCALL/SYSRET ordering rules).
+pub fn init() {
+    let kernel_cs = gdt::GDT.1.kernel_code_selector;
+    let kernel_ss = gdt::GDT.1.kernel_data_selector;
+    let user_cs = gdt::GDT.1.user_code_selector;
+    let user_ss = gdt::GDT.1.user_data_selector;
+
+    // SAFETY: MSR writes are privileged and change global CPU behaviour. This
+    // runs in ring 0 during boot, and this module is the only writer of the
+    // SYSCALL-related MSRs.
+    unsafe {
+        // Enable SYSCALL/SYSRET via EFER.SCE.
+        Efer::update(|f| *f |= EferFlags::SYSTEM_CALL_EXTENSIONS);
+
+        // STAR[47:32] is the SYSCALL base: CS = base, SS = base + 8.
+        // STAR[63:48] is the SYSRET base:  SS = base + 8, CS = base + 16.
+        // `Star::write` takes the four final selectors (SYSRET CS, SYSRET SS,
+        // SYSCALL CS, SYSCALL SS), derives both bases and returns an error if
+        // the selectors do not have the required spacing/privilege.
+        Star::write(user_cs, user_ss, kernel_cs, kernel_ss)
+            .expect("STAR MSR write failed: invalid segment selectors");
+
+        // LSTAR: syscall entry point.
+        LStar::write(VirtAddr::new(syscall_entry_asm as *const () as u64));
+
+        // SFMASK: flags cleared on SYSCALL entry (see the doc comment above).
+        SFMask::write(
+            RFlags::INTERRUPT_FLAG | RFlags::DIRECTION_FLAG | RFlags::TRAP_FLAG,
+        );
+    }
+}
+
+/// Raw assembly syscall entry stub.
+///
+/// The CPU jumps here on every `SYSCALL` instruction. The stub:
+/// 1. Switches to the kernel stack (`SYSCALL` does NOT switch RSP)
+/// 2. Saves `rcx`/`r11`, which hold the user RIP/RFLAGS needed by `SYSRET`
+/// 3. Moves the syscall registers into the positions required by the
+///    `extern "C"` signature of `syscall_handler`
+/// 4. Calls `syscall_handler`
+/// 5. Restores `rcx`, `r11`, the user RSP and executes `SYSRET`
+///
+/// # Stack Management
+///
+/// The user RSP is spilled to `SYSCALL_USER_RSP` and the kernel RSP is loaded
+/// from `SYSCALL_KERNEL_RSP`. Both are plain statics, so this is only correct
+/// on a single CPU. The loaded RSP is rounded down to 16 bytes so the `call`
+/// below always meets the C ABI alignment rule, whatever value was stored.
+///
+/// # Register State on Entry (from CPU / user space)
+///
+/// | Register | Content |
+/// |----------|---------|
+/// | rax | syscall number |
+/// | rcx | saved user RIP (return address) |
+/// | r11 | saved user RFLAGS |
+/// | rdi | arg1 |
+/// | rsi | arg2 |
+/// | rdx | arg3 |
+/// | r10 | arg4 (Linux uses r10 instead of rcx for arg4) |
+/// | r8 | arg5 |
+/// | r9 | arg6 |
+///
+/// # Argument Marshalling
+///
+/// `syscall_handler` takes seven integer parameters, so under the System V
+/// x86-64 C ABI it expects them in `rdi, rsi, rdx, rcx, r8, r9` plus one stack
+/// slot. Every syscall register is therefore shifted by one position:
+///
+/// | Handler parameter | C ABI location | Source |
+/// |-------------------|----------------|--------|
+/// | `syscall_num` | rdi | rax |
+/// | `arg1` | rsi | rdi |
+/// | `arg2` | rdx | rsi |
+/// | `arg3` | rcx | rdx |
+/// | `arg4` | r8 | r10 |
+/// | `arg5` | r9 | r8 |
+/// | `arg6` | stack | r9 |
+///
+/// # SYSRET Requirements
+///
+/// Before `SYSRET`:
+/// - `rcx` = user RIP to return to
+/// - `r11` = user RFLAGS to restore
+/// - `rsp` = user stack pointer
+/// - CS/SS loaded from STAR automatically
+///
+/// NOTE(review): only `rcx`, `r11` and `rsp` are restored explicitly. The
+/// argument registers (`rdi`, `rsi`, `rdx`, `r8`, `r9`, `r10`) return to user
+/// space holding whatever the handler left in them, which differs from the
+/// Linux ABI and can expose kernel values — consider saving/restoring them.
+#[unsafe(naked)]
+unsafe extern "C" fn syscall_entry_asm() {
+    // SAFETY: This is a naked function — no prologue/epilogue; the stack is
+    // managed manually below. `SYSCALL` has already saved RIP→RCX and
+    // RFLAGS→R11, and SFMASK keeps interrupts disabled for the whole stub.
+    core::arch::naked_asm!(
+        // ── Switch to kernel stack ──────────────────────────────────────────
+        "swapgs",                           // user GS base <-> kernel GS base
+        // RIP-relative addressing works regardless of where the kernel image
+        // is linked (absolute 32-bit displacements do not).
+        "mov [rip + {user_rsp}], rsp",      // spill user RSP
+        "mov rsp, [rip + {kernel_rsp}]",    // load kernel RSP
+        "and rsp, -16",                     // force 16-byte alignment
+
+        // ── Save what SYSRET needs ──────────────────────────────────────────
+        "push rcx",                         // user RIP
+        "push r11",                         // user RFLAGS
+
+        // ── Marshal arguments into C ABI positions ──────────────────────────
+        // Four 8-byte slots are now pushed in total (rcx, r11, pad, arg6), so
+        // RSP is 16-byte aligned at the `call`.
+        "sub rsp, 8",                       // alignment padding
+        "push r9",                          // arg6 → 7th parameter (stack)
+        // Shift from the highest register down so no source is overwritten
+        // before it has been read.
+        "mov r9, r8",                       // arg5 → 6th parameter
+        "mov r8, r10",                      // arg4 → 5th parameter
+        "mov rcx, rdx",                     // arg3 → 4th parameter
+        "mov rdx, rsi",                     // arg2 → 3rd parameter
+        "mov rsi, rdi",                     // arg1 → 2nd parameter
+        "mov rdi, rax",                     // syscall number → 1st parameter
+
+        // ── Call Rust syscall handler; result comes back in rax ─────────────
+        "call {handler}",
+
+        // ── Unwind the kernel stack ─────────────────────────────────────────
+        "add rsp, 16",                      // drop arg6 slot + padding
+        "pop r11",                          // user RFLAGS
+        "pop rcx",                          // user RIP
+
+        // ── Restore user stack pointer and GS ───────────────────────────────
+        "mov rsp, [rip + {user_rsp}]",
+        "swapgs",
+
+        // ── Return to user space ────────────────────────────────────────────
+        // SYSRET restores RIP from RCX, RFLAGS from R11, sets CS/SS from STAR.
+        "sysretq",
+
+        handler = sym syscall_handler,
+        user_rsp = sym SYSCALL_USER_RSP,
+        kernel_rsp = sym SYSCALL_KERNEL_RSP,
+    );
+}
+
+/// Scratch slot holding the user RSP for the duration of one syscall.
+///
+/// Written by the entry stub right after `swapgs` and read back just before
+/// returning to user space; the value is meaningless outside that window.
+/// It is a single global, so it is only correct while Strix OS runs on one
+/// CPU. A multi-CPU kernel needs a per-CPU (GS-relative) slot instead.
+static mut SYSCALL_USER_RSP: u64 = 0;
+
+/// Kernel RSP that the entry stub loads on every `SYSCALL`.
+///
+/// Zero until [`init_kernel_stack`] runs. May later be replaced through
+/// [`set_syscall_kernel_stack`]. Like the user-RSP slot above it is a single
+/// global, not truly per-CPU.
+static mut SYSCALL_KERNEL_RSP: u64 = 0;
+
+/// Size in bytes of the built-in syscall kernel stack (4 KiB).
+const SYSCALL_STACK_SIZE: usize = 4096;
+/// Backing storage for the built-in syscall kernel stack.
+///
+/// NOTE(review): a `[u8; N]` has an alignment requirement of 1, so neither
+/// the start nor the end of this array is guaranteed to be 16-byte aligned;
+/// code deriving a stack pointer from it must align the result itself.
+static mut SYSCALL_STACK: [u8; SYSCALL_STACK_SIZE] = [0; SYSCALL_STACK_SIZE];
+
+/// Initializes the kernel stack pointer used by the syscall entry stub.
+///
+/// Takes the address one past the end of [`SYSCALL_STACK`] (the stack grows
+/// downward), rounds it down to a 16-byte boundary and stores it in
+/// [`SYSCALL_KERNEL_RSP`]. The rounding matters because `SYSCALL_STACK` is a
+/// byte array with no alignment guarantee, while the C ABI requires a 16-byte
+/// aligned stack at every `call`.
+///
+/// # Preconditions
+///
+/// Call once during boot, on a single CPU, before any user process can
+/// execute `SYSCALL`. The function itself is safe to call; the requirement is
+/// about ordering, not memory safety of the caller.
+pub fn init_kernel_stack() {
+    /// Stack alignment required by the System V x86-64 ABI at a `call`.
+    const STACK_ALIGN: u64 = 16;
+
+    // SAFETY: Runs once during single-CPU boot, before any user code exists
+    // that could enter the syscall stub, so nothing reads SYSCALL_KERNEL_RSP
+    // concurrently. Interrupts may already be enabled at this point; this
+    // relies on interrupt handlers not touching these statics — TODO confirm.
+    // `addr_of!` yields the address without creating a reference to the
+    // mutable static (required by Rust 2024 edition).
+    unsafe {
+        let stack_base = core::ptr::addr_of!(SYSCALL_STACK) as u64;
+        let stack_end = stack_base + SYSCALL_STACK_SIZE as u64;
+        // Round down so the top stays inside the array and is ABI-aligned.
+        SYSCALL_KERNEL_RSP = stack_end & !(STACK_ALIGN - 1);
+    }
+}
+
+/// Replaces the kernel stack pointer that the syscall entry stub loads.
+///
+/// Intended for the scheduler (Phase 2.5): before switching to a user process
+/// it stores that process's kernel stack top here, so the next `SYSCALL`
+/// lands on the right stack.
+///
+/// # Safety
+///
+/// The caller must have interrupts disabled, and `kernel_stack_top` must be
+/// the top of a valid, mapped kernel stack.
+pub unsafe fn set_syscall_kernel_stack(kernel_stack_top: u64) {
+    // SAFETY: With interrupts disabled (caller contract) nothing else can
+    // observe or modify the slot while it is being written. The write goes
+    // through a raw pointer, so no reference to the mutable static is formed.
+    unsafe {
+        core::ptr::addr_of_mut!(SYSCALL_KERNEL_RSP).write(kernel_stack_top);
+    }
+}