17 union alignas(16)
Simd<float, 4> {
25 res.reg = _mm_mul_ps(lhs.reg, rhs.reg);
31 res.reg = _mm_div_ps(lhs.reg, rhs.reg);
37 res.reg = _mm_add_ps(lhs.reg, rhs.reg);
43 res.reg = _mm_sub_ps(lhs.reg, rhs.reg);
49 __m128 tmp = _mm_set1_ps(rhs);
50 res.reg = _mm_mul_ps(lhs.reg, tmp);
56 __m128 tmp = _mm_set1_ps(rhs);
57 res.reg = _mm_div_ps(lhs.reg, tmp);
63 __m128 tmp = _mm_set1_ps(rhs);
64 res.reg = _mm_add_ps(lhs.reg, tmp);
70 __m128 tmp = _mm_set1_ps(rhs);
71 res.reg = _mm_sub_ps(lhs.reg, tmp);
81 res.reg = _mm_set1_ps(value);
85 static inline Simd set(
float x,
float y,
float z,
float w) {
87 res.reg = _mm_setr_ps(x, y, z, w);
93 static_assert(Count <= 4,
"Number of elements to dot must be smaller or equal to dimension.");
94 static_assert(0 < Count,
"Count must not be zero.");
98 for (
int i = 1; i < Count; ++i) {
104 template<
int i0,
int i1,
int i2,
int i3>
107 ret.regi = _mm_shuffle_epi32(arg.regi, _MM_SHUFFLE(i0, i1, i2, i3));
115 union alignas(16)
Simd<float, 8> {
122 res.reg[0] = _mm_mul_ps(lhs.reg[0], rhs.reg[0]);
123 res.reg[1] = _mm_mul_ps(lhs.reg[1], rhs.reg[1]);
129 res.reg[0] = _mm_div_ps(lhs.reg[0], rhs.reg[0]);
130 res.reg[1] = _mm_div_ps(lhs.reg[1], rhs.reg[1]);
136 res.reg[0] = _mm_add_ps(lhs.reg[0], rhs.reg[0]);
137 res.reg[1] = _mm_add_ps(lhs.reg[1], rhs.reg[1]);
143 res.reg[0] = _mm_sub_ps(lhs.reg[0], rhs.reg[0]);
144 res.reg[1] = _mm_sub_ps(lhs.reg[1], rhs.reg[1]);
150 __m128 tmp = _mm_set1_ps(rhs);
151 res.reg[0] = _mm_mul_ps(lhs.reg[0], tmp);
152 res.reg[1] = _mm_mul_ps(lhs.reg[1], tmp);
158 __m128 tmp = _mm_set1_ps(rhs);
159 res.reg[0] = _mm_div_ps(lhs.reg[0], tmp);
160 res.reg[1] = _mm_div_ps(lhs.reg[1], tmp);
166 __m128 tmp = _mm_set1_ps(rhs);
167 res.reg[0] = _mm_add_ps(lhs.reg[0], tmp);
168 res.reg[1] = _mm_add_ps(lhs.reg[1], tmp);
174 __m128 tmp = _mm_set1_ps(rhs);
175 res.reg[0] = _mm_sub_ps(lhs.reg[0], tmp);
176 res.reg[1] = _mm_sub_ps(lhs.reg[1], tmp);
186 res.reg[0] = _mm_set1_ps(value);
187 res.reg[1] = _mm_set1_ps(value);
191 static inline Simd set(
float a,
float b,
float c,
float d,
float e,
float f,
float g,
float h) {
193 res.reg[0] = _mm_setr_ps(a, b, c, d);
194 res.reg[1] = _mm_setr_ps(e, f, g, h);
201 static_assert(Count <= 8,
"Number of elements to dot must be smaller or equal to dimension.");
202 static_assert(0 < Count,
"Count must not be zero.");
204 reg1 = _mm_mul_ps(lhs.reg[0], rhs.reg[0]);
205 reg2 = _mm_mul_ps(lhs.reg[1], rhs.reg[1]);
207 for (
int i = 7; i >= Count && i >= 4; --i) {
208 reinterpret_cast<float *
>(&reg2)[i] = 0.0f;
210 for (
int i = 3; i >= Count && i >= 0; --i) {
211 reinterpret_cast<float *
>(&reg1)[i] = 0.0f;
215 reg1 = _mm_add_ps(reg1, reg2);
216 sum =
reinterpret_cast<float *
>(&reg1)[0]
217 + reinterpret_cast<float *>(&reg1)[1]
218 +
reinterpret_cast<float *
>(&reg1)[2]
219 + reinterpret_cast<float *>(&reg1)[3];
225 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
228 ret.
v[7] = arg.
v[i0];
229 ret.
v[6] = arg.
v[i1];
230 ret.
v[5] = arg.
v[i2];
231 ret.
v[4] = arg.
v[i3];
232 ret.
v[3] = arg.
v[i4];
233 ret.
v[2] = arg.
v[i5];
234 ret.
v[1] = arg.
v[i6];
235 ret.
v[0] = arg.
v[i7];
249 union alignas(16)
Simd<double, 2> {
256 res.reg = _mm_mul_pd(lhs.reg, rhs.reg);
262 res.reg = _mm_div_pd(lhs.reg, rhs.reg);
268 res.reg = _mm_add_pd(lhs.reg, rhs.reg);
274 res.reg = _mm_sub_pd(lhs.reg, rhs.reg);
280 __m128d tmp = _mm_set1_pd(rhs);
281 res.reg = _mm_mul_pd(lhs.reg, tmp);
287 __m128d tmp = _mm_set1_pd(rhs);
288 res.reg = _mm_div_pd(lhs.reg, tmp);
294 __m128d tmp = _mm_set1_pd(rhs);
295 res.reg = _mm_add_pd(lhs.reg, tmp);
301 __m128d tmp = _mm_set1_pd(rhs);
302 res.reg = _mm_sub_pd(lhs.reg, tmp);
312 res.reg = _mm_set1_pd(value);
316 static inline Simd set(
double x,
double y) {
318 res.reg = _mm_setr_pd(x, y);
324 static_assert(Count <= 2,
"Number of elements to dot must be smaller or equal to dimension.");
325 static_assert(0 < Count,
"Count must not be zero.");
329 for (
int i = 1; i < Count; ++i) {
335 template<
int i0,
int i1>
338 ret.reg = _mm_shuffle_pd(arg.reg, arg.reg, _MM_SHUFFLE2(i0, i1));
347 union alignas(16)
Simd<double, 4> {
354 res.reg[0] = _mm_mul_pd(lhs.reg[0], rhs.reg[0]);
355 res.reg[1] = _mm_mul_pd(lhs.reg[1], rhs.reg[1]);
361 res.reg[0] = _mm_div_pd(lhs.reg[0], rhs.reg[0]);
362 res.reg[1] = _mm_div_pd(lhs.reg[1], rhs.reg[1]);
368 res.reg[0] = _mm_add_pd(lhs.reg[0], rhs.reg[0]);
369 res.reg[1] = _mm_add_pd(lhs.reg[1], rhs.reg[1]);
375 res.reg[0] = _mm_sub_pd(lhs.reg[0], rhs.reg[0]);
376 res.reg[1] = _mm_sub_pd(lhs.reg[1], rhs.reg[1]);
382 __m128d tmp = _mm_set1_pd(rhs);
383 res.reg[0] = _mm_mul_pd(lhs.reg[0], tmp);
384 res.reg[1] = _mm_mul_pd(lhs.reg[1], tmp);
390 __m128d tmp = _mm_set1_pd(rhs);
391 res.reg[0] = _mm_div_pd(lhs.reg[0], tmp);
392 res.reg[1] = _mm_div_pd(lhs.reg[1], tmp);
398 __m128d tmp = _mm_set1_pd(rhs);
399 res.reg[0] = _mm_add_pd(lhs.reg[0], tmp);
400 res.reg[1] = _mm_add_pd(lhs.reg[1], tmp);
406 __m128d tmp = _mm_set1_pd(rhs);
407 res.reg[0] = _mm_sub_pd(lhs.reg[0], tmp);
408 res.reg[1] = _mm_sub_pd(lhs.reg[1], tmp);
418 res.reg[0] = _mm_set1_pd(value);
419 res.reg[1] = _mm_set1_pd(value);
423 static inline Simd set(
double x,
double y,
double z,
double w) {
425 res.reg[0] = _mm_setr_pd(x, y);
426 res.reg[1] = _mm_setr_pd(z, w);
433 static_assert(Count <= 4,
"Number of elements to dot must be smaller or equal to dimension.");
434 static_assert(0 < Count,
"Count must not be zero.");
436 regs[0] = _mm_mul_pd(lhs.reg[0], rhs.reg[0]);
437 regs[1] = _mm_mul_pd(lhs.reg[1], rhs.reg[1]);
439 for (
int i = 3; i >= Count; --i) {
440 reinterpret_cast<double *
>(&regs)[i] = 0.0;
444 regs[0] = _mm_add_pd(regs[0], regs[1]);
445 sum =
reinterpret_cast<double *
>(&regs[0])[0] + reinterpret_cast<double *>(&regs[0])[1];
451 template<
int i0,
int i1,
int i2,
int i3>
454 ret.
v[3] = arg.
v[i0];
455 ret.
v[2] = arg.
v[i1];
456 ret.
v[1] = arg.
v[i2];
457 ret.
v[0] = arg.
v[i3];
static Simd spread(float value)
Definition: Simd_SSE2.hpp:79
static Simd div(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:54
static Simd shuffle(const Simd &arg)
Definition: Simd_SSE2.hpp:226
static Simd mul(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:352
static Simd sub(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:172
static Simd sub(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:141
static Simd add(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:35
static Simd sub(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:41
static Simd mad(const Simd &a, const Simd &b, const Simd &c)
Definition: Simd_SSE2.hpp:75
__m128 reg
Definition: Simd_SSE2.hpp:18
T v[Dim]
Definition: Simd.hpp:23
static Simd spread(float value)
Definition: Simd_SSE2.hpp:184
static double dot(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:323
static Simd sub(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:68
static float dot(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:92
static Simd div(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:260
static Simd add(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:266
static Simd mul(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:254
static Simd add(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:396
static Simd div(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:127
static Simd spread(double value)
Definition: Simd_SSE2.hpp:310
static Simd spread(double value)
Definition: Simd_SSE2.hpp:416
static double dot(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:432
static Simd mul(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:120
static Simd add(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:366
static float dot(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:200
static Simd shuffle(const Simd &arg)
Definition: Simd_SSE2.hpp:336
Definition: Approx.hpp:11
static Simd mul(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:47
static Simd mul(const Simd &lhs, const Simd &rhs)
Definition: Simd.hpp:32
static Simd add(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:164
static Simd mul(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:380
static Simd add(const Simd &lhs, const Simd &rhs)
Definition: Simd.hpp:46
static Simd sub(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:272
static Simd div(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:388
static Simd div(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:156
static Simd mad(const Simd &a, const Simd &b, const Simd &c)
Definition: Simd_SSE2.hpp:306
static Simd mul(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:23
static Simd sub(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:373
__m128d reg
Definition: Simd_SSE2.hpp:250
static Simd div(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:285
static Simd shuffle(const Simd &arg)
Definition: Simd_SSE2.hpp:452
static Simd add(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:134
2,4 or 8 dimension float or double parameters accepted. Uses SSE2 or AVX acceleration if enabled in t...
Definition: Simd.hpp:22
__m128i regi
Definition: Simd_SSE2.hpp:19
static Simd add(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:61
static Simd mad(const Simd &a, const Simd &b, const Simd &c)
Definition: Simd_SSE2.hpp:180
static Simd div(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:359
static Simd shuffle(const Simd &arg)
Definition: Simd_SSE2.hpp:105
static Simd mad(const Simd &a, const Simd &b, const Simd &c)
Definition: Simd_SSE2.hpp:412
static Simd sub(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:299
static Simd sub(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:404
static Simd div(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:29
static Simd add(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:292
static Simd mul(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:148
static Simd mul(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:278