1
0
Fork 0
mirror of https://github.com/beefytech/Beef.git synced 2025-07-08 09:16:00 +02:00

Merge pull request #1824 from MineGame159/simd_improvements

Simd improvements
This commit is contained in:
Brian Fiete 2023-04-17 11:47:11 -07:00 committed by GitHub
commit a1dbea2574
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 374 additions and 25 deletions

View file

@ -27,6 +27,12 @@ namespace System.Numerics
// Swizzle property whose getter and setter are both bound to the `shuffle3210` intrinsic.
// NOTE(review): the name `wzyx` and the index order 3,2,1,0 suggest a full component reversal — confirm against the intrinsic implementation.
public extern float4 wzyx { [Intrinsic("shuffle3210")] get; [Intrinsic("shuffle3210")] set; }
// Bound to the `min` intrinsic; takes two float4 values and returns a float4.
// NOTE(review): presumably a per-component minimum — confirm.
[Intrinsic("min")]
public static extern float4 min(float4 lhs, float4 rhs);
// Bound to the `max` intrinsic; takes two float4 values and returns a float4.
// NOTE(review): presumably a per-component maximum — confirm.
[Intrinsic("max")]
public static extern float4 max(float4 lhs, float4 rhs);
// float4 + float4, bound to the `add` intrinsic.
[Intrinsic("add")]
public static extern float4 operator+(float4 lhs, float4 rhs);
[Intrinsic("add"), Commutable]

View file

@ -2,8 +2,21 @@ namespace System.Numerics.X86
{
static class SSE
{
// Bodiless declaration bound to the `:add_ps` intrinsic.
// NOTE(review): an [Inline] `add_ps` with the same signature appears a few lines below in this listing;
// this extern form looks like the pre-change version of the diff — confirm only one of them is kept.
[Intrinsic(":add_ps")]
public static extern v128 add_ps(v128 a, v128 b);
/// Reports the `SSE` flag of `Runtime.Features`.
public static bool IsSupported
{
	get
	{
		return Runtime.Features.SSE;
	}
}
/// Casts both operands to float4, applies float4's `+` operator and casts the result back to v128.
[Inline]
public static v128 add_ps(v128 a, v128 b)
{
	float4 lhs = (float4) a;
	float4 rhs = (float4) b;
	return (v128) (lhs + rhs);
}
/// Casts both operands to float4, applies float4's `-` operator and casts the result back to v128.
[Inline]
public static v128 sub_ps(v128 a, v128 b)
{
	float4 lhs = (float4) a;
	float4 rhs = (float4) b;
	return (v128) (lhs - rhs);
}
/// Casts both operands to float4, applies float4's `*` operator and casts the result back to v128.
[Inline]
public static v128 mul_ps(v128 a, v128 b)
{
	float4 lhs = (float4) a;
	float4 rhs = (float4) b;
	return (v128) (lhs * rhs);
}
/// Casts both operands to float4, applies float4's `/` operator and casts the result back to v128.
[Inline]
public static v128 div_ps(v128 a, v128 b)
{
	float4 lhs = (float4) a;
	float4 rhs = (float4) b;
	return (v128) (lhs / rhs);
}
/// Casts both operands to float4, passes them to `float4.min` and casts the result back to v128.
[Inline]
public static v128 min_ps(v128 a, v128 b)
{
	float4 lhs = (float4) a;
	float4 rhs = (float4) b;
	return (v128) float4.min(lhs, rhs);
}
/// Casts both operands to float4, passes them to `float4.max` and casts the result back to v128.
[Inline]
public static v128 max_ps(v128 a, v128 b)
{
	float4 lhs = (float4) a;
	float4 rhs = (float4) b;
	return (v128) float4.max(lhs, rhs);
}
[Inline]
public static v128 add_ss(v128 a, v128 b)
@ -99,8 +112,6 @@ namespace System.Numerics.X86
// Bodiless extern declarations; their implementation is not visible in this listing.
// NOTE(review): names follow the Intel `_mm_*` SSE intrinsic naming — presumably each mirrors the
// intrinsic of the same name; confirm against the backend.

// Takes a v128 and returns an int32.
public static extern int32 cvt_ss2si(v128 a);
// NOTE(review): an [Inline] `div_ps` with the same signature appears earlier in this listing;
// this extern form looks like the pre-change version — confirm only one is kept.
public static extern v128 div_ps(v128 a, v128 b);
public static extern v128 div_ss(v128 a, v128 b);
// Takes a raw pointer and returns a v128.
// NOTE(review): the `u` suffix presumably means an unaligned load — confirm.
public static extern v128 loadu_ps(void* ptr);
@ -111,12 +122,8 @@ namespace System.Numerics.X86
// Bodiless extern declarations; their implementation is not visible in this listing.

// Takes a raw pointer and returns a v128.
// NOTE(review): presumably requires a 16-byte aligned pointer like `_mm_load_ps` — confirm.
public static extern v128 load_ps(void* ptr);
// NOTE(review): an [Inline] `max_ps` with the same signature appears earlier in this listing;
// this extern form looks like the pre-change version — confirm only one is kept.
public static extern v128 max_ps(v128 a, v128 b);
public static extern v128 max_ss(v128 a, v128 b);
// NOTE(review): an [Inline] `min_ps` with the same signature appears earlier in this listing;
// this extern form looks like the pre-change version — confirm only one is kept.
public static extern v128 min_ps(v128 a, v128 b);
public static extern v128 min_ss(v128 a, v128 b);
public static extern v128 movehl_ps(v128 a, v128 b);
@ -127,8 +134,6 @@ namespace System.Numerics.X86
// Bodiless extern declarations; their implementation is not visible in this listing.

public static extern v128 move_ss(v128 a, v128 b);
// NOTE(review): an [Inline] `mul_ps` with the same signature appears earlier in this listing;
// this extern form looks like the pre-change version — confirm only one is kept.
public static extern v128 mul_ps(v128 a, v128 b);
public static extern v128 mul_ss(v128 a, v128 b);
// NOTE(review): presumably a bitwise OR of the two vectors (cf. `_mm_or_ps`) — confirm.
public static extern v128 or_ps(v128 a, v128 b);
@ -169,8 +174,6 @@ namespace System.Numerics.X86
// Bodiless extern declarations; their implementation is not visible in this listing.

// Takes a destination address and a v128; returns nothing.
// NOTE(review): presumably a non-temporal store like `_mm_stream_ps` — confirm.
public static extern void stream_ps(void* mem_addr, v128 a);
// NOTE(review): an [Inline] `sub_ps` with the same signature appears earlier in this listing;
// this extern form looks like the pre-change version — confirm only one is kept.
public static extern v128 sub_ps(v128 a, v128 b);
public static extern v128 sub_ss(v128 a, v128 b);
// Receives all four rows by `ref`, so the rows can be modified in place.
// NOTE(review): presumably a 4x4 transpose like `_MM_TRANSPOSE4_PS` — confirm.
public static extern void TRANSPOSE4_PS(ref v128 row0, ref v128 row1, ref v128 row2, ref v128 row3);

View file

@ -2,5 +2,6 @@ namespace System.Numerics.X86
{
/// Entry point for SSE2 functionality; currently only exposes a support query.
static class SSE2
{
	/// Reports the `SSE2` flag of `Runtime.Features`.
	public static bool IsSupported
	{
		get
		{
			return Runtime.Features.SSE2;
		}
	}
}
}

View file

@ -6,6 +6,12 @@ using System.Collections;
namespace System
{
/// Set of flags describing which SIMD instruction-set extensions are available.
/// Filled in by `Runtime.Features`.
struct RuntimeFeatures
{
	// SSE family
	public bool SSE;
	public bool SSE2;

	// AVX family
	public bool AVX;
	public bool AVX2;
	public bool AVX512;
}
[StaticInitPriority(101)]
static class Runtime
{
@ -359,6 +365,9 @@ namespace System
// Registered error handlers; the field destructor releases the list and its items via DeleteContainerAndItems!.
static List<ErrorHandler> sErrorHandlers ~ DeleteContainerAndItems!(_);
static bool sInsideErrorHandler;
// True once `sFeatures` has been populated; lets the `Features` getter skip re-querying.
static bool sQueriedFeatures = false;
// Cached feature flags returned by the `Features` property.
static RuntimeFeatures sFeatures;
public static this()
{
BfRtCallbacks.sCallbacks.Init();
@ -466,5 +475,91 @@ namespace System
}
return .ContinueFailure;
}
/// Feature flags of the current machine.
///
/// The flags are computed on first access and cached in `sFeatures`;
/// later accesses return the cached value. On targets other than
/// x86/x64 a default-constructed `RuntimeFeatures` is cached.
public static RuntimeFeatures Features
{
	get
	{
		// Fast path: already computed.
		if (sQueriedFeatures)
			return sFeatures;

#if BF_MACHINE_X86 || BF_MACHINE_X64
		QueryFeaturesX86();
#else
		sFeatures = .();
		sQueriedFeatures = true;
#endif

		return sFeatures;
	}
}
#if BF_MACHINE_X86 || BF_MACHINE_X64
/// Probes the CPU through `cpuid` / `xgetbv` and records the detected
/// SIMD capabilities in `sFeatures`.
///
/// `sFeatures` is reset and `sQueriedFeatures` is set before any probing,
/// so the query is considered done even when this method returns early.
/// AVX, AVX2 and AVX-512 are only reported when the OS has enabled saving
/// of the corresponding register state (checked through XCR0).
private static void QueryFeaturesX86()
{
	sFeatures = .();
	sQueriedFeatures = true;

	// Sink for cpuid output registers we are not interested in.
	uint32 ignored = 0;

	// Leaf 0: basic information; EAX holds the highest supported basic leaf.
	uint32 maxBasicLeaf = 0;
	cpuid(0, 0, &maxBasicLeaf, &ignored, &ignored, &ignored);

	if (maxBasicLeaf < 1)
	{
		// Earlier Intel 486, CPUID not implemented
		return;
	}

	// Leaf 1: processor info and feature bits.
	uint32 procInfoEcx = 0;
	uint32 procInfoEdx = 0;
	cpuid(1, 0, &ignored, &ignored, &procInfoEcx, &procInfoEdx);

	sFeatures.SSE = (procInfoEdx & (1 << 25)) != 0;
	sFeatures.SSE2 = (procInfoEdx & (1 << 26)) != 0;

	// Leaf 7: extended features. Only query it when the CPU reports it as
	// available: for an out-of-range leaf CPUID returns the data of the
	// highest supported basic leaf, which would be misread as feature bits.
	uint32 extendedFeaturesEbx = 0;
	if (maxBasicLeaf >= 7)
		cpuid(7, 0, &ignored, &extendedFeaturesEbx, &ignored, &ignored);

	// ECX bit 26: the CPU supports `XSAVE`.
	if ((procInfoEcx & (1 << 26)) == 0)
		return;

	// ECX bit 27: `OSXSAVE`, i.e. the OS has enabled XSAVE. Without it
	// `xgetbv` must not be executed and the AVX state is not preserved
	// across context switches.
	if ((procInfoEcx & (1 << 27)) == 0)
		return;

	// The OS signals which register states it saves and restores by setting
	// bits in XCR0: bits 1 and 2 (mask 6) cover the SSE and AVX state,
	// bits 5, 6 and 7 (mask 224) cover the AVX-512 state.
	uint64 xcr0 = xgetbv(0);
	bool avxSupport = (xcr0 & 6) == 6;
	bool avx512Support = (xcr0 & 224) == 224;

	// Only report AVX/AVX2 when both the OS and the CPU support them.
	if (!avxSupport)
		return;

	sFeatures.AVX = (procInfoEcx & (1 << 28)) != 0;
	sFeatures.AVX2 = (extendedFeaturesEbx & (1 << 5)) != 0;

	// For AVX-512 the OS also needs to support saving/restoring
	// the extended state, only then we enable AVX-512 support:
	if (avx512Support)
		sFeatures.AVX512 = (extendedFeaturesEbx & (1 << 16)) != 0;
}
// Bound to the `cpuid` intrinsic. Takes the leaf and subleaf to query and
// four pointers that receive the EAX, EBX, ECX and EDX results.
[Intrinsic("cpuid")]
private static extern void cpuid(uint32 leaf, uint32 subleaf, uint32* eax, uint32* ebx, uint32* ecx, uint32* edx);
// Bound to the `xgetbv` intrinsic. Returns the 64-bit value of the extended
// control register selected by `xcr` (0 selects XCR0 in QueryFeaturesX86).
[Intrinsic("xgetbv")]
private static extern uint64 xgetbv(uint32 xcr);
#endif
}
}