Skip to content

Commit

Permalink
arch/x86: Support simd
Browse files Browse the repository at this point in the history
  • Loading branch information
Qwinci committed Oct 31, 2023
1 parent da40ac8 commit 3d7dd1a
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 1 deletion.
32 changes: 31 additions & 1 deletion src/arch/x86/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,34 @@ typedef enum : u32 {
void x86_set_cpu_local(X86Task* task);
void x86_set_msr(Msr msr, u64 value);
u64 x86_get_msr(Msr msr);
X86Cpu* x86_get_cur_cpu();
X86Cpu* x86_get_cur_cpu();

// CPU capabilities discovered through CPUID at boot and published in the
// global CPU_FEATURES.
typedef struct {
// Size in bytes of the XSAVE area as reported by CPUID leaf 0xD (ECX);
// only filled in when `xsave` is true.
usize xsave_area_size;
// CPUID leaf 1, ECX bit 30.
bool rdrnd;
// CPUID leaf 1, ECX bit 26; the avx/avx512 flags are only probed when this is set.
bool xsave;
// CPUID leaf 1, ECX bit 28.
bool avx;
// CPUID leaf 7 (subleaf 0), EBX bit 16.
bool avx512;
} CpuFeatures;

extern CpuFeatures CPU_FEATURES;

// Stores the processor state components selected by `mask` into `area`
// using the `xsave` instruction.
//
// area: destination buffer; asserted to be 64-byte aligned.
// mask: component bitmap; its low half is passed in EAX and its high half in EDX.
static inline void xsave(u8* area, u64 mask) {
	assert((usize) area % 64 == 0);

	usize mask_hi = mask >> 32;
	usize mask_lo = mask & 0xFFFFFFFF;

	// The "memory" clobber tells the compiler the whole area may be written,
	// even though only the first byte is named as an operand.
	__asm__ volatile(
		"xsave %[dst]"
		:
		: [dst] "m"(*area), "a"(mask_lo), "d"(mask_hi)
		: "memory");
}

// Loads the processor state components selected by `mask` from `area`
// using the `xrstor` instruction.
//
// area: source buffer; asserted to be 64-byte aligned.
// mask: component bitmap; its low half is passed in EAX and its high half in EDX.
static inline void xrstor(u8* area, u64 mask) {
	assert((usize) area % 64 == 0);

	usize mask_hi = mask >> 32;
	usize mask_lo = mask & 0xFFFFFFFF;

	__asm__ volatile(
		"xrstor %[src]"
		:
		: [src] "m"(*area), "a"(mask_lo), "d"(mask_hi)
		: "memory");
}

// Writes `value` to the extended control register numbered `index`
// via `xsetbv` (ECX = index, EDX:EAX = value).
static inline void wrxcr(u32 index, u64 value) {
	u32 value_hi = value >> 32;
	u32 value_lo = value;

	__asm__ volatile(
		"xsetbv"
		:
		: "a"(value_lo), "d"(value_hi), "c"(index)
		: "memory");
}
49 changes: 49 additions & 0 deletions src/arch/x86/sched/executor.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include "types.h"
#include "assert.h"

typedef struct {
u64 fs;
Expand All @@ -8,4 +9,52 @@ typedef struct {

// Architecture-specific state kept per task.
typedef struct {
// General saved state (ExecutorGenericState).
ExecutorGenericState generic;
// Buffer handed to xsave/xrstor (or fxsaveq/fxrstorq) when switching tasks.
// NOTE(review): appears to be allocated only for user tasks — confirm for kernel tasks.
u8* simd;
} ExecutorState;

// In-memory image of the 512-byte legacy FXSAVE/FXRSTOR region.
// Field order and sizes must not change; the static_assert below pins the
// total size. Field names follow the FXSAVE layout in the Intel SDM.
typedef struct {
// x87 control word.
u16 fcw;
// x87 status word.
u16 fsw;
// x87 tag word (one byte in this layout).
u8 ftw;
u8 reserved0;
// x87 last opcode.
u16 fop;
// x87 last instruction pointer.
u64 fip;
// x87 last data pointer.
u64 fdp;
// SSE control/status register.
u32 mxcsr;
u32 mxcsr_mask;
// x87 registers st0-st7: 10 bytes of data each, padded to 16 bytes.
u8 st0[10];
u8 reserved1[6];
u8 st1[10];
u8 reserved2[6];
u8 st2[10];
u8 reserved3[6];
u8 st3[10];
u8 reserved4[6];
u8 st4[10];
u8 reserved5[6];
u8 st5[10];
u8 reserved6[6];
u8 st6[10];
u8 reserved7[6];
u8 st7[10];
u8 reserved8[6];
// SSE registers xmm0-xmm15, 16 bytes each.
u8 xmm0[16];
u8 xmm1[16];
u8 xmm2[16];
u8 xmm3[16];
u8 xmm4[16];
u8 xmm5[16];
u8 xmm6[16];
u8 xmm7[16];
u8 xmm8[16];
u8 xmm9[16];
u8 xmm10[16];
u8 xmm11[16];
u8 xmm12[16];
u8 xmm13[16];
u8 xmm14[16];
u8 xmm15[16];
// Tail of the region: 48 reserved bytes followed by 48 bytes named
// `available`.
u8 reserved9[48];
u8 available[48];
} FxState;
// Guards against accidental layout changes.
static_assert(sizeof(FxState) == 512);
18 changes: 18 additions & 0 deletions src/arch/x86/sched/sched.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,26 @@ void arch_switch_task(Task* self, Task* new_task) {
x86_set_msr(MSR_KERNELGSBASE, x86_new->state.generic.gs);
x86_set_msr(MSR_FSBASE, x86_new->state.generic.fs);

if (x86_self->user) {
if (CPU_FEATURES.xsave) {
xsave(x86_self->state.simd, ~0);
}
else {
__asm__ volatile("fxsaveq %0" : : "m"(*x86_self->state.simd) : "memory");
}
}

X86Task* prev = x86_switch_task(x86_self, x86_new);

if (x86_self->user) {
if (CPU_FEATURES.xsave) {
xrstor(x86_self->state.simd, ~0);
}
else {
__asm__ volatile("fxrstorq %0" : : "m"(*x86_self->state.simd) : "memory");
}
}

x86_set_msr(MSR_KERNELGSBASE, x86_self->state.generic.gs);
x86_set_msr(MSR_FSBASE, x86_self->state.generic.fs);

Expand Down
19 changes: 19 additions & 0 deletions src/arch/x86/sched/x86_task.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "sys/dev.h"
#include "fs/vfs.h"
#include "sys/fs.h"
#include "arch/x86/cpu.h"

typedef struct {
u64 r15, r14, r13, r12, rbp, rbx;
Expand Down Expand Up @@ -101,6 +102,19 @@ Task* arch_create_user_task(Process* process, const char* name, void (*fn)(void*
task->user = true;
task->common.priority = 0;
task->common.process = process;
usize simd_size = CPU_FEATURES.xsave ? CPU_FEATURES.xsave_area_size : sizeof(FxState);
task->state.simd = (u8*) vm_kernel_alloc_backed(ALIGNUP(simd_size, PAGE_SIZE) / PAGE_SIZE, PF_READ | PF_WRITE);
if (!task->state.simd) {
kprintf("[kernel][x86]: failed to allocate simd state (out of memory)\n");
kfree(kernel_stack, KERNEL_STACK_SIZE);
vm_user_dealloc_backed(process, (void*) task->stack_base, USER_STACK_SIZE / PAGE_SIZE, NULL);
kfree(task, sizeof(X86Task));
return NULL;
}
FxState* fx_state = (FxState*) task->state.simd;
// IM | DM | ZM | OM | UM | PM | PC
fx_state->fcw = 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3 | 1 << 4 | 1 << 5 | 0b11 << 8;
fx_state->mxcsr = 0b1111110000000;

return &task->common;
}
Expand Down Expand Up @@ -153,6 +167,11 @@ void arch_destroy_task(Task* task) {
X86Task* x86_task = container_of(task, X86Task, common);

if (x86_task->user) {
usize simd_size = CPU_FEATURES.xsave ? CPU_FEATURES.xsave_area_size : sizeof(FxState);
assert(x86_task->state.simd);
vm_kernel_dealloc_backed(x86_task->state.simd, ALIGNUP(simd_size, PAGE_SIZE) / PAGE_SIZE);
x86_task->state.simd = NULL;

vm_user_dealloc_backed(task->process, (void*) x86_task->stack_base, USER_STACK_SIZE / PAGE_SIZE, NULL);

kfree((void*) (x86_task->kernel_stack_base), KERNEL_STACK_SIZE);
Expand Down
68 changes: 68 additions & 0 deletions src/arch/x86/smp.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "string.h"
#include "utils/spinlock.h"
#include "utils/math.h"
#include "cpuid.h"

static volatile struct limine_smp_request SMP_REQUEST = {
.id = LIMINE_SMP_REQUEST
Expand Down Expand Up @@ -52,6 +53,43 @@ static X86Task* create_this_task(Cpu* cpu) {
return self;
}

static void init_simd() {
u64 cr0;
__asm__ volatile("mov %%cr0, %0" : "=r"(cr0));
// clear EM
cr0 &= ~(1 << 2);
// set MP
cr0 |= 1 << 1;
__asm__ volatile("mov %0, %%cr0" : : "r"(cr0));

u64 cr4;
__asm__ volatile("mov %%cr4, %0" : "=r"(cr4));
// set OSXMMEXCPT and OSFXSR
cr4 |= 1 << 10 | 1 << 9;
if (CPU_FEATURES.xsave) {
// set OSXSAVE
cr4 |= 1 << 18;
}
__asm__ volatile("mov %0, %%cr4" : : "r"(cr4));

if (CPU_FEATURES.xsave) {
// x87 and SSE
u64 xcr0 = 1 << 0 | 1 << 1;

if (CPU_FEATURES.avx) {
xcr0 |= 1 << 2;
}

if (CPU_FEATURES.avx512) {
xcr0 |= 1 << 5;
xcr0 |= 1 << 6;
xcr0 |= 1 << 7;
}

wrxcr(0, xcr0);
}
}

[[noreturn]] static void x86_ap_entry(struct limine_smp_info* info) {
X86Cpu* cpu = &CPUS[CPU_COUNT++];

Expand All @@ -69,6 +107,8 @@ static X86Task* create_this_task(Cpu* cpu) {

__asm__ volatile("mov $6 * 8, %%ax; ltr %%ax" : : : "ax");

init_simd();

lapic_init();
lapic_timer_init();

Expand All @@ -83,7 +123,33 @@ static X86Task* create_this_task(Cpu* cpu) {
panic("x86_ap_entry resumed\n");
}

// Global CPU feature record; starts zeroed and is filled in by detect_cpu_features().
CpuFeatures CPU_FEATURES = {};

// Queries CPUID and records the supported features in CPU_FEATURES.
// The AVX/AVX-512 flags and the XSAVE area size are only probed when
// XSAVE itself is reported.
static void detect_cpu_features() {
	Cpuid leaf1 = cpuid(1, 0);

	// leaf 1, ECX bit 30: RDRAND
	if (leaf1.ecx & (1U << 30)) {
		CPU_FEATURES.rdrnd = true;
	}

	// leaf 1, ECX bit 26: XSAVE; nothing else to record without it
	if (!(leaf1.ecx & (1U << 26))) {
		return;
	}
	CPU_FEATURES.xsave = true;

	// leaf 1, ECX bit 28: AVX
	if (leaf1.ecx & (1U << 28)) {
		CPU_FEATURES.avx = true;
	}

	// leaf 7 subleaf 0, EBX bit 16: AVX-512
	Cpuid leaf7 = cpuid(7, 0);
	if (leaf7.ebx & (1U << 16)) {
		CPU_FEATURES.avx512 = true;
	}

	// leaf 0xD subleaf 0, ECX: XSAVE area size in bytes
	Cpuid leaf_xsave = cpuid(0xD, 0);
	CPU_FEATURES.xsave_area_size = leaf_xsave.ecx;
}

void arch_init_smp() {
detect_cpu_features();

assert(SMP_REQUEST.response);
X86_BSP_ID = SMP_REQUEST.response->bsp_lapic_id;

Expand Down Expand Up @@ -113,6 +179,8 @@ void arch_init_smp() {
lapic_timer_init();
kprintf("[kernel][timer]: apic frequency %uhz\n", CPUS[0].lapic_timer.freq);

init_simd();

x86_init_usermode();
sched_init(true);
kprintf("[kernel][smp]: sched init\n");
Expand Down

0 comments on commit 3d7dd1a

Please sign in to comment.