hrev54291 adds 1 changeset to branch 'master'
old head: 734c1e049163f67ee02bf02caae5fb9a0108db84
new head: 9495126984d664dfa8afc1382b5614ee69ba2a1e
overview:
https://git.haiku-os.org/haiku/log/?qt=range&q=9495126984d6+%5E734c1e049163
----------------------------------------------------------------------------
9495126984d6: kernel/x86_64: AVX support
xsave or xsavec are supported.
breaks vregs compatibility.
change the thread structure object cache alignment to 64
the xsave fpu_state size isn't defined, it is for instance 832 here, thus I
picked 1024.
Change-Id: I4a0cab0bc42c1d37f24dcafb8259f8ff24a330d2
Reviewed-on: https://review.haiku-os.org/c/haiku/+/2849
Reviewed-by: Adrien Destugues <pulkomandy@xxxxxxxxx>
[ Jérôme Duval <jerome.duval@xxxxxxxxx> ]
----------------------------------------------------------------------------
Revision: hrev54291
Commit: 9495126984d664dfa8afc1382b5614ee69ba2a1e
URL: https://git.haiku-os.org/haiku/commit/?id=9495126984d6
Author: Jérôme Duval <jerome.duval@xxxxxxxxx>
Date: Tue May 5 21:03:39 2020 UTC
Committer: Adrien Destugues <pulkomandy@xxxxxxxxx>
Commit-Date: Wed Jun 3 06:16:48 2020 UTC
----------------------------------------------------------------------------
10 files changed, 191 insertions(+), 27 deletions(-)
headers/posix/arch/x86_64/signal.h | 14 ++++-
.../private/kernel/arch/x86/arch_altcodepatch.h | 2 +
headers/private/kernel/arch/x86/arch_cpu.h | 16 +++++
.../private/kernel/arch/x86/arch_thread_types.h | 5 ++
src/system/kernel/arch/x86/64/arch.S | 13 ++++
src/system/kernel/arch/x86/64/interrupts.S | 66 ++++++++++++++++----
src/system/kernel/arch/x86/64/thread.cpp | 49 +++++++++++----
src/system/kernel/arch/x86/arch_altcodepatch.cpp | 3 +-
src/system/kernel/arch/x86/arch_cpu.cpp | 46 ++++++++++++++
src/system/kernel/thread.cpp | 4 +-
----------------------------------------------------------------------------
diff --git a/headers/posix/arch/x86_64/signal.h b/headers/posix/arch/x86_64/signal.h
index 871d1f59b6..ad4fa4b128 100644
--- a/headers/posix/arch/x86_64/signal.h
+++ b/headers/posix/arch/x86_64/signal.h
@@ -89,6 +89,18 @@ struct fpu_state {
unsigned char _reserved_416_511[96];
};
+struct xstate_hdr {
+ unsigned long bv;
+ unsigned long xcomp_bv;
+ unsigned char _reserved[48];
+};
+
+struct savefpu {
+ struct fpu_state fp_fxsave;
+ struct xstate_hdr fp_xstate;
+ unsigned long fp_ymm[16][2];
+};
+
struct vregs {
unsigned long rax;
unsigned long rbx;
@@ -110,7 +122,7 @@ struct vregs {
unsigned long rip;
unsigned long rflags;
- struct fpu_state fpu;
+ struct savefpu fpu;
};
diff --git a/headers/private/kernel/arch/x86/arch_altcodepatch.h b/headers/private/kernel/arch/x86/arch_altcodepatch.h
index 99f01e2784..762a083de8 100644
--- a/headers/private/kernel/arch/x86/arch_altcodepatch.h
+++ b/headers/private/kernel/arch/x86/arch_altcodepatch.h
@@ -22,6 +22,8 @@
#define ALTCODEPATCH_TAG_STAC 1
#define ALTCODEPATCH_TAG_CLAC 2
+#define ALTCODEPATCH_TAG_XSAVE 3
+#define ALTCODEPATCH_TAG_XRSTOR 4
#ifdef _ASSEMBLER
diff --git a/headers/private/kernel/arch/x86/arch_cpu.h b/headers/private/kernel/arch/x86/arch_cpu.h
index fa5215eead..155c88d9d2 100644
--- a/headers/private/kernel/arch/x86/arch_cpu.h
+++ b/headers/private/kernel/arch/x86/arch_cpu.h
@@ -354,9 +354,15 @@
#define IA32_CR4_GLOBAL_PAGES (1UL << 7)
#define CR4_OS_FXSR (1UL << 9)
#define CR4_OS_XMM_EXCEPTION (1UL << 10)
+#define IA32_CR4_OSXSAVE (1UL << 18)
#define IA32_CR4_SMEP (1UL << 20)
#define IA32_CR4_SMAP (1UL << 21)
+// Extended Control Register XCR0 flags
+#define IA32_XCR0_X87 (1UL << 0)
+#define IA32_XCR0_SSE (1UL << 1)
+#define IA32_XCR0_AVX (1UL << 2)
+
// page fault error codes (http://wiki.osdev.org/Page_Fault)
#define PGFAULT_P 0x01 // Protection violation
#define PGFAULT_W 0x02 // Write
@@ -547,6 +553,16 @@ struct intel_microcode_extended_signature {
#define clear_ac() \
__asm__ volatile (ASM_CLAC : : : "memory")
+#define xgetbv(reg) ({ \
+ uint32 low, high; \
+ __asm__ volatile ("xgetbv" : "=a" (low), "=d" (high), "c" (reg)); \
+ (low | (uint64)high << 32); \
+})
+
+#define xsetbv(reg, value) { \
+ uint32 low = value; uint32 high = value >> 32; \
+ __asm__ volatile ("xsetbv" : : "a" (low), "d" (high), "c" (reg)); }
+
#define out8(value,port) \
__asm__ ("outb %%al,%%dx" : : "a" (value), "d" (port))
diff --git a/headers/private/kernel/arch/x86/arch_thread_types.h b/headers/private/kernel/arch/x86/arch_thread_types.h
index f02ae41ad6..5bd1b637d9 100644
--- a/headers/private/kernel/arch/x86/arch_thread_types.h
+++ b/headers/private/kernel/arch/x86/arch_thread_types.h
@@ -53,8 +53,13 @@ struct arch_thread {
struct farcall interrupt_stack;
#endif
+#ifndef __x86_64__
// 512 byte floating point save point - this must be 16 byte aligned
uint8 fpu_state[512] _ALIGNED(16);
+#else
+ // floating point save point - this must be 64 byte aligned for xsave
+ uint8 fpu_state[1024] _ALIGNED(64);
+#endif
addr_t GetFramePointer() const;
} _ALIGNED(16);
diff --git a/src/system/kernel/arch/x86/64/arch.S b/src/system/kernel/arch/x86/64/arch.S
index d88b05bfed..4c4aaa66cc 100644
--- a/src/system/kernel/arch/x86/64/arch.S
+++ b/src/system/kernel/arch/x86/64/arch.S
@@ -119,3 +119,16 @@ FUNCTION_END(_stac)
FUNCTION(_clac):
clac
FUNCTION_END(_clac)
+
+FUNCTION(_xsave):
+ xsave64 (%rdi)
+FUNCTION_END(_xsave)
+
+FUNCTION(_xsavec):
+ xsavec64 (%rdi)
+FUNCTION_END(_xsavec)
+
+FUNCTION(_xrstor):
+ xrstor64 (%rdi)
+FUNCTION_END(_xrstor)
+
diff --git a/src/system/kernel/arch/x86/64/interrupts.S b/src/system/kernel/arch/x86/64/interrupts.S
index f58135cace..124fe061a3 100644
--- a/src/system/kernel/arch/x86/64/interrupts.S
+++ b/src/system/kernel/arch/x86/64/interrupts.S
@@ -221,16 +221,32 @@ STATIC_FUNCTION(int_bottom):
// exception.
orq $X86_EFLAGS_RESUME, IFRAME_flags(%rbp)
- subq $512, %rsp
- andq $~15, %rsp
- fxsaveq (%rsp)
+ // xsave needs a 64-byte alignment
+ andq $~63, %rsp
+ movq (gFPUSaveLength), %rcx
+ subq %rcx, %rsp
+ leaq (%rsp), %rdi
+ shrq $3, %rcx
+ movq $0, %rax
+ rep stosq
+ movl (gXsaveMask), %eax
+ movl (gXsaveMask+4), %edx
+ movq %rsp, %rdi
+ CODEPATCH_START
+ fxsaveq (%rdi)
+ CODEPATCH_END(ALTCODEPATCH_TAG_XSAVE)
// Call the interrupt handler.
movq %rbp, %rdi
movq IFRAME_vector(%rbp), %rax
call *gInterruptHandlerTable(, %rax, 8)
- fxrstorq (%rsp)
+ movl (gXsaveMask), %eax
+ movl (gXsaveMask+4), %edx
+ movq %rsp, %rdi
+ CODEPATCH_START
+ fxrstorq (%rdi)
+ CODEPATCH_END(ALTCODEPATCH_TAG_XRSTOR)
movq %rbp, %rsp
// Restore the saved registers.
@@ -253,9 +269,22 @@ STATIC_FUNCTION(int_bottom_user):
// Frame pointer is the iframe.
movq %rsp, %rbp
- subq $512, %rsp
- andq $~15, %rsp
- fxsaveq (%rsp)
+ // xsave needs a 64-byte alignment
+ andq $~63, %rsp
+ movq (gFPUSaveLength), %rcx
+ subq %rcx, %rsp
+ leaq (%rsp), %rdi
+ shrq $3, %rcx
+ movq $0, %rax
+ rep stosq
+ movl (gXsaveMask), %eax
+ movl (gXsaveMask+4), %edx
+
+ movq %rsp, %rdi
+ CODEPATCH_START
+ fxsaveq (%rdi)
+ CODEPATCH_END(ALTCODEPATCH_TAG_XSAVE)
+
movq %rsp, IFRAME_fpu(%rbp)
// Set the RF (resume flag) in RFLAGS. This prevents an instruction
@@ -286,7 +315,12 @@ STATIC_FUNCTION(int_bottom_user):
UPDATE_THREAD_KERNEL_TIME()
- fxrstorq (%rsp)
+ movl (gXsaveMask), %eax
+ movl (gXsaveMask+4), %edx
+ movq %rsp, %rdi
+ CODEPATCH_START
+ fxrstorq (%rdi)
+ CODEPATCH_END(ALTCODEPATCH_TAG_XRSTOR)
movq %rbp, %rsp
// Restore the saved registers.
@@ -315,7 +349,12 @@ STATIC_FUNCTION(int_bottom_user):
movq %rbp, %rdi
call x86_init_user_debug_at_kernel_exit
1:
- fxrstorq (%rsp)
+ movl (gXsaveMask), %eax
+ movl (gXsaveMask+4), %edx
+ movq %rsp, %rdi
+ CODEPATCH_START
+ fxrstorq (%rdi)
+ CODEPATCH_END(ALTCODEPATCH_TAG_XRSTOR)
movq %rbp, %rsp
// Restore the saved registers.
@@ -522,8 +561,13 @@ FUNCTION(x86_64_syscall_entry):
jmp .Liret
.Lrestore_fpu:
- movq IFRAME_fpu(%rbp), %rax
- fxrstorq (%rax)
+ movq IFRAME_fpu(%rbp), %rdi
+
+ movl (gXsaveMask), %eax
+ movl (gXsaveMask+4), %edx
+ CODEPATCH_START
+ fxrstorq (%rdi)
+ CODEPATCH_END(ALTCODEPATCH_TAG_XRSTOR)
.Liret:
// Restore the saved registers.
RESTORE_IFRAME()
diff --git a/src/system/kernel/arch/x86/64/thread.cpp b/src/system/kernel/arch/x86/64/thread.cpp
index a29697a03f..e50c3a8fa2 100644
--- a/src/system/kernel/arch/x86/64/thread.cpp
+++ b/src/system/kernel/arch/x86/64/thread.cpp
@@ -68,7 +68,10 @@ class RestartSyscall : public AbstractTraceEntry {
extern "C" void x86_64_thread_entry();
// Initial thread saved state.
-static arch_thread sInitialState;
+static arch_thread sInitialState _ALIGNED(64);
+extern uint64 gFPUSaveLength;
+extern bool gHasXsave;
+extern bool gHasXsavec;
void
@@ -140,12 +143,36 @@ arch_thread_init(kernel_args* args)
{
// Save one global valid FPU state; it will be copied in the arch dependent
// part of each new thread.
- asm volatile (
- "clts;" \
- "fninit;" \
- "fnclex;" \
- "fxsave %0;"
- : "=m" (sInitialState.fpu_state));
+ if (gHasXsave || gHasXsavec) {
+ ASSERT(gFPUSaveLength <= sizeof(sInitialState.fpu_state));
+ memset(sInitialState.fpu_state, 0, gFPUSaveLength);
+ if (gHasXsavec) {
+ asm volatile (
+ "clts;" \
+ "fninit;" \
+ "fnclex;" \
+ "movl $0x7,%%eax;" \
+ "movl $0x0,%%edx;" \
+ "xsavec64 %0"
+ :: "m" (sInitialState.fpu_state));
+ } else {
+ asm volatile (
+ "clts;" \
+ "fninit;" \
+ "fnclex;" \
+ "movl $0x7,%%eax;" \
+ "movl $0x0,%%edx;" \
+ "xsave64 %0"
+ :: "m" (sInitialState.fpu_state));
+ }
+ } else {
+ asm volatile (
+ "clts;" \
+ "fninit;" \
+ "fnclex;" \
+ "fxsaveq %0"
+ :: "m" (sInitialState.fpu_state));
+ }
return B_OK;
}
@@ -309,11 +336,10 @@ arch_setup_signal_frame(Thread* thread, struct sigaction* action,
if (frame->fpu != nullptr) {
memcpy((void*)&signalFrameData->context.uc_mcontext.fpu,
frame->fpu,
- sizeof(signalFrameData->context.uc_mcontext.fpu));
+ gFPUSaveLength);
} else {
memcpy((void*)&signalFrameData->context.uc_mcontext.fpu,
- sInitialState.fpu_state,
- sizeof(signalFrameData->context.uc_mcontext.fpu));
+ sInitialState.fpu_state, gFPUSaveLength);
}
// Fill in signalFrameData->context.uc_stack.
@@ -385,8 +411,7 @@ arch_restore_signal_frame(struct signal_frame_data* signalFrameData)
Thread* thread = thread_get_current_thread();
memcpy(thread->arch_info.fpu_state,
- (void*)&signalFrameData->context.uc_mcontext.fpu,
- sizeof(thread->arch_info.fpu_state));
+ (void*)&signalFrameData->context.uc_mcontext.fpu, gFPUSaveLength);
frame->fpu = &thread->arch_info.fpu_state;
// The syscall return code overwrites frame->ax with the return value of
diff --git a/src/system/kernel/arch/x86/arch_altcodepatch.cpp b/src/system/kernel/arch/x86/arch_altcodepatch.cpp
index b99a7d12c7..958448a12b 100644
--- a/src/system/kernel/arch/x86/arch_altcodepatch.cpp
+++ b/src/system/kernel/arch/x86/arch_altcodepatch.cpp
@@ -52,6 +52,7 @@ arch_altcodepatch_replace(uint16 tag, void* newcodepatch, size_t length)
// disable write after patch
set_area_protection(info->text_region.id, kernelProtection);
- dprintf("arch_altcodepatch_replace found %" B_PRIu32 " altcodepatches\n", count);
+ dprintf("arch_altcodepatch_replace found %" B_PRIu32 " altcodepatches "
+ "for tag %u\n", count, tag);
}
diff --git a/src/system/kernel/arch/x86/arch_cpu.cpp b/src/system/kernel/arch/x86/arch_cpu.cpp
index d6fbd74c10..e62c85e28f 100644
--- a/src/system/kernel/arch/x86/arch_cpu.cpp
+++ b/src/system/kernel/arch/x86/arch_cpu.cpp
@@ -84,6 +84,13 @@ struct set_mtrrs_parameter {
#ifdef __x86_64__
extern addr_t _stac;
extern addr_t _clac;
+extern addr_t _xsave;
+extern addr_t _xsavec;
+extern addr_t _xrstor;
+uint64 gXsaveMask;
+uint64 gFPUSaveLength = 512;
+bool gHasXsave = false;
+bool gHasXsavec = false;
#endif
extern "C" void x86_reboot(void);
@@ -1406,6 +1413,20 @@ enable_smep(void* dummy, int cpu)
{
x86_write_cr4(x86_read_cr4() | IA32_CR4_SMEP);
}
+
+
+static void
+enable_osxsave(void* dummy, int cpu)
+{
+ x86_write_cr4(x86_read_cr4() | IA32_CR4_OSXSAVE);
+}
+
+
+static void
+enable_xsavemask(void* dummy, int cpu)
+{
+ xsetbv(0, gXsaveMask);
+}
#endif
@@ -1459,6 +1480,31 @@ arch_cpu_init_post_vm(kernel_args* args)
} else
dprintf("SMAP disabled per safemode setting\n");
}
+
+ // if available enable XSAVE (XSAVE and extended states)
+ gHasXsave = x86_check_feature(IA32_FEATURE_EXT_XSAVE, FEATURE_EXT);
+ if (gHasXsave) {
+ gHasXsavec = x86_check_feature(IA32_FEATURE_XSAVEC,
+ FEATURE_D_1_EAX);
+
+ call_all_cpus_sync(&enable_osxsave, NULL);
+ gXsaveMask = IA32_XCR0_X87 | IA32_XCR0_SSE;
+ cpuid_info cpuid;
+ get_current_cpuid(&cpuid, 0xd, 0);
+ gXsaveMask |= (cpuid.regs.eax & IA32_XCR0_AVX);
+ call_all_cpus_sync(&enable_xsavemask, NULL);
+ get_current_cpuid(&cpuid, 0xd, 0);
+ gFPUSaveLength = cpuid.regs.ebx;
+
+ arch_altcodepatch_replace(ALTCODEPATCH_TAG_XSAVE,
+ gHasXsavec ? &_xsavec : &_xsave, 4);
+ arch_altcodepatch_replace(ALTCODEPATCH_TAG_XRSTOR,
+ &_xrstor, 4);
+
+ dprintf("enable %s 0x%" B_PRIx64 " %" B_PRId64 "\n",
+ gHasXsavec ? "XSAVEC" : "XSAVE", gXsaveMask, gFPUSaveLength);
+ }
+
#endif
return B_OK;
diff --git a/src/system/kernel/thread.cpp b/src/system/kernel/thread.cpp
index 4ea3f605c7..2fae73aa2f 100644
--- a/src/system/kernel/thread.cpp
+++ b/src/system/kernel/thread.cpp
@@ -2679,9 +2679,9 @@ thread_init(kernel_args *args)
panic("thread_init(): failed to init thread hash table!");
// create the thread structure object cache
- sThreadCache = create_object_cache("threads", sizeof(Thread), 16, NULL,
+ sThreadCache = create_object_cache("threads", sizeof(Thread), 64, NULL,
NULL, NULL);
- // Note: The x86 port requires 16 byte alignment of thread structures.
+ // Note: The x86 port requires 64 byte alignment of thread structures.
if (sThreadCache == NULL)
panic("thread_init(): failed to allocate thread object cache!");