Mathter
A configurable 3D math library for game developers.
Simd_SSE2.hpp
Go to the documentation of this file.
1 //==============================================================================
2 // This software is distributed under The Unlicense.
3 // For more information, please refer to <http://unlicense.org/>
4 //==============================================================================
5 
6 #pragma once
7 
8 #include <emmintrin.h>
9 
10 namespace mathter {
11 //------------------------------------------------------------------------------
12 // FLOAT
13 //------------------------------------------------------------------------------
14 
15 // Specialization for float4, using SSE
/// Four packed single-precision lanes held in one 128-bit SSE register.
/// The same 16 bytes are exposed as a float register (reg), an integer
/// register (regi, used by shuffle) and a plain array (v) for per-lane access.
template<>
union alignas(16) Simd<float, 4> {
    __m128 reg;
    __m128i regi;
    float v[4];

    /// Lane-wise product lhs * rhs.
    static inline Simd mul(const Simd &lhs, const Simd &rhs) {
        Simd product;
        product.reg = _mm_mul_ps(lhs.reg, rhs.reg);
        return product;
    }

    /// Lane-wise quotient lhs / rhs.
    static inline Simd div(const Simd &lhs, const Simd &rhs) {
        Simd quotient;
        quotient.reg = _mm_div_ps(lhs.reg, rhs.reg);
        return quotient;
    }

    /// Lane-wise sum lhs + rhs.
    static inline Simd add(const Simd &lhs, const Simd &rhs) {
        Simd total;
        total.reg = _mm_add_ps(lhs.reg, rhs.reg);
        return total;
    }

    /// Lane-wise difference lhs - rhs.
    static inline Simd sub(const Simd &lhs, const Simd &rhs) {
        Simd difference;
        difference.reg = _mm_sub_ps(lhs.reg, rhs.reg);
        return difference;
    }

    /// Multiplies every lane of lhs by the scalar rhs.
    static inline Simd mul(const Simd &lhs, float rhs) {
        return mul(lhs, spread(rhs));
    }

    /// Divides every lane of lhs by the scalar rhs.
    static inline Simd div(const Simd &lhs, float rhs) {
        return div(lhs, spread(rhs));
    }

    /// Adds the scalar rhs to every lane of lhs.
    static inline Simd add(const Simd &lhs, float rhs) {
        return add(lhs, spread(rhs));
    }

    /// Subtracts the scalar rhs from every lane of lhs.
    static inline Simd sub(const Simd &lhs, float rhs) {
        return sub(lhs, spread(rhs));
    }

    /// Multiply-add: returns a * b + c per lane, as a multiply followed by an add.
    static inline Simd mad(const Simd &a, const Simd &b, const Simd &c) {
        Simd result;
        result.reg = _mm_add_ps(_mm_mul_ps(a.reg, b.reg), c.reg);
        return result;
    }

    /// Returns a value with all four lanes set to `value`.
    static inline Simd spread(float value) {
        Simd broadcast;
        broadcast.reg = _mm_set1_ps(value);
        return broadcast;
    }

    /// Builds a value with v[0] = x, v[1] = y, v[2] = z, v[3] = w.
    static inline Simd set(float x, float y, float z, float w) {
        Simd packed;
        packed.reg = _mm_setr_ps(x, y, z, w);
        return packed;
    }

    /// Dot product over the first `Count` lanes (1 <= Count <= 4).
    /// Products are accumulated in lane order starting from lane 0.
    template<int Count>
    static inline float dot(const Simd &lhs, const Simd &rhs) {
        static_assert(Count <= 4, "Number of elements to dot must be smaller or equal to dimension.");
        static_assert(0 < Count, "Count must not be zero.");
        const Simd products = mul(lhs, rhs);
        float total = products.v[0];
        int lane = 1;
        while (lane < Count) {
            total += products.v[lane];
            ++lane;
        }
        return total;
    }

    /// Lane permutation. Indices are given from the highest lane down:
    /// result.v[3] = arg.v[i0], result.v[2] = arg.v[i1],
    /// result.v[1] = arg.v[i2], result.v[0] = arg.v[i3].
    template<int i0, int i1, int i2, int i3>
    static inline Simd shuffle(const Simd &arg) {
        Simd permuted;
        // The shuffle is done on the integer view of the same register bits.
        permuted.regi = _mm_shuffle_epi32(arg.regi, _MM_SHUFFLE(i0, i1, i2, i3));
        return permuted;
    }
};
111 
112 
113 // Specialization for float8, using SSE
/// Eight packed single-precision lanes stored as two 128-bit SSE registers:
/// reg[0] overlays v[0..3] and reg[1] overlays v[4..7].
template<>
union alignas(16) Simd<float, 8> {
    __m128 reg[2];
    float v[8];

    /// Lane-wise product lhs * rhs.
    static inline Simd mul(const Simd &lhs, const Simd &rhs) {
        Simd res;
        res.reg[0] = _mm_mul_ps(lhs.reg[0], rhs.reg[0]);
        res.reg[1] = _mm_mul_ps(lhs.reg[1], rhs.reg[1]);
        return res;
    }

    /// Lane-wise quotient lhs / rhs.
    static inline Simd div(const Simd &lhs, const Simd &rhs) {
        Simd res;
        res.reg[0] = _mm_div_ps(lhs.reg[0], rhs.reg[0]);
        res.reg[1] = _mm_div_ps(lhs.reg[1], rhs.reg[1]);
        return res;
    }

    /// Lane-wise sum lhs + rhs.
    static inline Simd add(const Simd &lhs, const Simd &rhs) {
        Simd res;
        res.reg[0] = _mm_add_ps(lhs.reg[0], rhs.reg[0]);
        res.reg[1] = _mm_add_ps(lhs.reg[1], rhs.reg[1]);
        return res;
    }

    /// Lane-wise difference lhs - rhs.
    static inline Simd sub(const Simd &lhs, const Simd &rhs) {
        Simd res;
        res.reg[0] = _mm_sub_ps(lhs.reg[0], rhs.reg[0]);
        res.reg[1] = _mm_sub_ps(lhs.reg[1], rhs.reg[1]);
        return res;
    }

    /// Multiplies every lane of lhs by the scalar rhs.
    static inline Simd mul(const Simd &lhs, float rhs) {
        Simd res;
        const __m128 tmp = _mm_set1_ps(rhs);
        res.reg[0] = _mm_mul_ps(lhs.reg[0], tmp);
        res.reg[1] = _mm_mul_ps(lhs.reg[1], tmp);
        return res;
    }

    /// Divides every lane of lhs by the scalar rhs.
    static inline Simd div(const Simd &lhs, float rhs) {
        Simd res;
        const __m128 tmp = _mm_set1_ps(rhs);
        res.reg[0] = _mm_div_ps(lhs.reg[0], tmp);
        res.reg[1] = _mm_div_ps(lhs.reg[1], tmp);
        return res;
    }

    /// Adds the scalar rhs to every lane of lhs.
    static inline Simd add(const Simd &lhs, float rhs) {
        Simd res;
        const __m128 tmp = _mm_set1_ps(rhs);
        res.reg[0] = _mm_add_ps(lhs.reg[0], tmp);
        res.reg[1] = _mm_add_ps(lhs.reg[1], tmp);
        return res;
    }

    /// Subtracts the scalar rhs from every lane of lhs.
    static inline Simd sub(const Simd &lhs, float rhs) {
        Simd res;
        const __m128 tmp = _mm_set1_ps(rhs);
        res.reg[0] = _mm_sub_ps(lhs.reg[0], tmp);
        res.reg[1] = _mm_sub_ps(lhs.reg[1], tmp);
        return res;
    }

    /// Multiply-add: returns a * b + c per lane, as a multiply followed by an add.
    static inline Simd mad(const Simd &a, const Simd &b, const Simd &c) {
        return add(mul(a, b), c);
    }

    /// Returns a value with all eight lanes set to `value`.
    static inline Simd spread(float value) {
        Simd res;
        res.reg[0] = _mm_set1_ps(value);
        res.reg[1] = _mm_set1_ps(value);
        return res;
    }

    /// Builds a value with v[0..7] = a, b, c, d, e, f, g, h.
    static inline Simd set(float a, float b, float c, float d, float e, float f, float g, float h) {
        Simd res;
        res.reg[0] = _mm_setr_ps(a, b, c, d);
        res.reg[1] = _mm_setr_ps(e, f, g, h);
        return res;
    }

    /// Dot product over the first `Count` lanes (1 <= Count <= 8).
    /// Lanes at index Count and above are treated as zero.
    template<int Count>
    static inline float dot(const Simd &lhs, const Simd &rhs) {
        static_assert(Count <= 8, "Number of elements to dot must be smaller or equal to dimension.");
        static_assert(0 < Count, "Count must not be zero.");

        // Spill the lane-wise products into one flat array so lanes can be
        // addressed 0..7. Indexing lanes 4..7 through a pointer to a single
        // four-float register would write past the end of that register.
        alignas(16) float lanes[8];
        _mm_store_ps(lanes, _mm_mul_ps(lhs.reg[0], rhs.reg[0]));
        _mm_store_ps(lanes + 4, _mm_mul_ps(lhs.reg[1], rhs.reg[1]));

        // Lanes beyond the requested count must not contribute.
        for (int i = Count; i < 8; ++i) {
            lanes[i] = 0.0f;
        }

        // Fold the upper half onto the lower half, then sum the four partials.
        alignas(16) float folded[4];
        _mm_store_ps(folded, _mm_add_ps(_mm_load_ps(lanes), _mm_load_ps(lanes + 4)));
        return folded[0] + folded[1] + folded[2] + folded[3];
    }

    /// Lane permutation. Indices are given from the highest lane down:
    /// result.v[7] = arg.v[i0], result.v[6] = arg.v[i1], ..., result.v[0] = arg.v[i7].
    template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
    static inline Simd shuffle(const Simd &arg) {
        static_assert(0 <= i0 && i0 < 8 && 0 <= i1 && i1 < 8 && 0 <= i2 && i2 < 8 && 0 <= i3 && i3 < 8
                          && 0 <= i4 && i4 < 8 && 0 <= i5 && i5 < 8 && 0 <= i6 && i6 < 8 && 0 <= i7 && i7 < 8,
                      "Shuffle indices must be in the range [0, 8).");
        Simd ret;
        ret.v[7] = arg.v[i0];
        ret.v[6] = arg.v[i1];
        ret.v[5] = arg.v[i2];
        ret.v[4] = arg.v[i3];
        ret.v[3] = arg.v[i4];
        ret.v[2] = arg.v[i5];
        ret.v[1] = arg.v[i6];
        ret.v[0] = arg.v[i7];
        return ret;
    }
};
239 
240 
241 //------------------------------------------------------------------------------
242 // DOUBLE
243 //------------------------------------------------------------------------------
244 
245 
246 
247 // Specialization for double2, using SSE2
/// Two packed double-precision lanes held in one 128-bit SSE2 register.
/// `reg` and `v` are two views of the same 16 bytes.
template<>
union alignas(16) Simd<double, 2> {
    __m128d reg;
    // Exactly two lanes: a larger array would grow the union beyond the
    // register and expose elements that no operation here ever computes.
    double v[2];

    /// Lane-wise product lhs * rhs.
    static inline Simd mul(const Simd &lhs, const Simd &rhs) {
        Simd res;
        res.reg = _mm_mul_pd(lhs.reg, rhs.reg);
        return res;
    }

    /// Lane-wise quotient lhs / rhs.
    static inline Simd div(const Simd &lhs, const Simd &rhs) {
        Simd res;
        res.reg = _mm_div_pd(lhs.reg, rhs.reg);
        return res;
    }

    /// Lane-wise sum lhs + rhs.
    static inline Simd add(const Simd &lhs, const Simd &rhs) {
        Simd res;
        res.reg = _mm_add_pd(lhs.reg, rhs.reg);
        return res;
    }

    /// Lane-wise difference lhs - rhs.
    static inline Simd sub(const Simd &lhs, const Simd &rhs) {
        Simd res;
        res.reg = _mm_sub_pd(lhs.reg, rhs.reg);
        return res;
    }

    /// Multiplies both lanes of lhs by the scalar rhs.
    static inline Simd mul(const Simd &lhs, double rhs) {
        Simd res;
        const __m128d tmp = _mm_set1_pd(rhs);
        res.reg = _mm_mul_pd(lhs.reg, tmp);
        return res;
    }

    /// Divides both lanes of lhs by the scalar rhs.
    static inline Simd div(const Simd &lhs, double rhs) {
        Simd res;
        const __m128d tmp = _mm_set1_pd(rhs);
        res.reg = _mm_div_pd(lhs.reg, tmp);
        return res;
    }

    /// Adds the scalar rhs to both lanes of lhs.
    static inline Simd add(const Simd &lhs, double rhs) {
        Simd res;
        const __m128d tmp = _mm_set1_pd(rhs);
        res.reg = _mm_add_pd(lhs.reg, tmp);
        return res;
    }

    /// Subtracts the scalar rhs from both lanes of lhs.
    static inline Simd sub(const Simd &lhs, double rhs) {
        Simd res;
        const __m128d tmp = _mm_set1_pd(rhs);
        res.reg = _mm_sub_pd(lhs.reg, tmp);
        return res;
    }

    /// Multiply-add: returns a * b + c per lane, as a multiply followed by an add.
    static inline Simd mad(const Simd &a, const Simd &b, const Simd &c) {
        return add(mul(a, b), c);
    }

    /// Returns a value with both lanes set to `value`.
    static inline Simd spread(double value) {
        Simd res;
        res.reg = _mm_set1_pd(value);
        return res;
    }

    /// Builds a value with v[0] = x and v[1] = y.
    static inline Simd set(double x, double y) {
        Simd res;
        res.reg = _mm_setr_pd(x, y);
        return res;
    }

    /// Dot product over the first `Count` lanes (1 <= Count <= 2).
    template<int Count>
    static inline double dot(const Simd &lhs, const Simd &rhs) {
        static_assert(Count <= 2, "Number of elements to dot must be smaller or equal to dimension.");
        static_assert(0 < Count, "Count must not be zero.");
        const Simd m = mul(lhs, rhs);
        double sum = m.v[0];
        for (int i = 1; i < Count; ++i) {
            sum += m.v[i];
        }
        return sum;
    }

    /// Lane permutation. Indices are given from the highest lane down:
    /// result.v[1] = arg.v[i0], result.v[0] = arg.v[i1].
    template<int i0, int i1>
    static inline Simd shuffle(const Simd &arg) {
        Simd ret;
        ret.reg = _mm_shuffle_pd(arg.reg, arg.reg, _MM_SHUFFLE2(i0, i1));
        return ret;
    }
};
342 
343 
344 // Specialization for double4, using SSE2
345 //*
/// Four packed double-precision lanes stored as two 128-bit SSE2 registers:
/// reg[0] overlays v[0..1] and reg[1] overlays v[2..3].
template<>
union alignas(16) Simd<double, 4> {
    __m128d reg[2];
    double v[4];

    /// Lane-wise product lhs * rhs.
    static inline Simd mul(const Simd &lhs, const Simd &rhs) {
        Simd product;
        for (int half = 0; half < 2; ++half) {
            product.reg[half] = _mm_mul_pd(lhs.reg[half], rhs.reg[half]);
        }
        return product;
    }

    /// Lane-wise quotient lhs / rhs.
    static inline Simd div(const Simd &lhs, const Simd &rhs) {
        Simd quotient;
        for (int half = 0; half < 2; ++half) {
            quotient.reg[half] = _mm_div_pd(lhs.reg[half], rhs.reg[half]);
        }
        return quotient;
    }

    /// Lane-wise sum lhs + rhs.
    static inline Simd add(const Simd &lhs, const Simd &rhs) {
        Simd total;
        for (int half = 0; half < 2; ++half) {
            total.reg[half] = _mm_add_pd(lhs.reg[half], rhs.reg[half]);
        }
        return total;
    }

    /// Lane-wise difference lhs - rhs.
    static inline Simd sub(const Simd &lhs, const Simd &rhs) {
        Simd difference;
        for (int half = 0; half < 2; ++half) {
            difference.reg[half] = _mm_sub_pd(lhs.reg[half], rhs.reg[half]);
        }
        return difference;
    }

    /// Multiplies every lane of lhs by the scalar rhs.
    static inline Simd mul(const Simd &lhs, double rhs) {
        return mul(lhs, spread(rhs));
    }

    /// Divides every lane of lhs by the scalar rhs.
    static inline Simd div(const Simd &lhs, double rhs) {
        return div(lhs, spread(rhs));
    }

    /// Adds the scalar rhs to every lane of lhs.
    static inline Simd add(const Simd &lhs, double rhs) {
        return add(lhs, spread(rhs));
    }

    /// Subtracts the scalar rhs from every lane of lhs.
    static inline Simd sub(const Simd &lhs, double rhs) {
        return sub(lhs, spread(rhs));
    }

    /// Multiply-add: returns a * b + c per lane, as a multiply followed by an add.
    static inline Simd mad(const Simd &a, const Simd &b, const Simd &c) {
        const Simd product = mul(a, b);
        return add(product, c);
    }

    /// Returns a value with all four lanes set to `value`.
    static inline Simd spread(double value) {
        const __m128d broadcast = _mm_set1_pd(value);
        Simd filled;
        filled.reg[0] = broadcast;
        filled.reg[1] = broadcast;
        return filled;
    }

    /// Builds a value with v[0] = x, v[1] = y, v[2] = z, v[3] = w.
    static inline Simd set(double x, double y, double z, double w) {
        Simd packed;
        packed.reg[0] = _mm_setr_pd(x, y);
        packed.reg[1] = _mm_setr_pd(z, w);
        return packed;
    }

    /// Dot product over the first `Count` lanes (1 <= Count <= 4).
    /// Lanes at index Count and above are treated as zero; the upper register
    /// is folded onto the lower one before the final two-lane sum.
    template<int Count>
    static inline double dot(const Simd &lhs, const Simd &rhs) {
        static_assert(Count <= 4, "Number of elements to dot must be smaller or equal to dimension.");
        static_assert(0 < Count, "Count must not be zero.");

        // Lane-wise products, laid out as a flat array of four doubles.
        alignas(16) double lanes[4];
        _mm_store_pd(lanes, _mm_mul_pd(lhs.reg[0], rhs.reg[0]));
        _mm_store_pd(lanes + 2, _mm_mul_pd(lhs.reg[1], rhs.reg[1]));

        // Discard the contribution of lanes beyond the requested count.
        for (int lane = Count; lane < 4; ++lane) {
            lanes[lane] = 0.0;
        }

        // (l0 + l2) + (l1 + l3)
        alignas(16) double folded[2];
        _mm_store_pd(folded, _mm_add_pd(_mm_load_pd(lanes), _mm_load_pd(lanes + 2)));
        return folded[0] + folded[1];
    }

    /// Lane permutation. Indices are given from the highest lane down:
    /// result.v[3] = arg.v[i0], result.v[2] = arg.v[i1],
    /// result.v[1] = arg.v[i2], result.v[0] = arg.v[i3].
    template<int i0, int i1, int i2, int i3>
    static inline Simd shuffle(const Simd &arg) {
        // Source index for each destination lane, lowest lane first.
        const int source[4] = {i3, i2, i1, i0};
        Simd permuted;
        for (int lane = 0; lane < 4; ++lane) {
            permuted.v[lane] = arg.v[source[lane]];
        }
        return permuted;
    }
};
461 //*/
462 
463 } // namespace mathter
static Simd spread(float value)
Definition: Simd_SSE2.hpp:79
static Simd div(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:54
static Simd shuffle(const Simd &arg)
Definition: Simd_SSE2.hpp:226
static Simd mul(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:352
static Simd sub(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:172
static Simd sub(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:141
static Simd add(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:35
static Simd sub(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:41
static Simd mad(const Simd &a, const Simd &b, const Simd &c)
Definition: Simd_SSE2.hpp:75
__m128 reg
Definition: Simd_SSE2.hpp:18
T v[Dim]
Definition: Simd.hpp:23
static Simd spread(float value)
Definition: Simd_SSE2.hpp:184
static double dot(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:323
static Simd sub(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:68
static float dot(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:92
static Simd div(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:260
static Simd add(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:266
static Simd mul(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:254
static Simd add(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:396
static Simd div(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:127
static Simd spread(double value)
Definition: Simd_SSE2.hpp:310
static Simd spread(double value)
Definition: Simd_SSE2.hpp:416
static double dot(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:432
static Simd mul(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:120
static Simd add(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:366
static float dot(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:200
static Simd shuffle(const Simd &arg)
Definition: Simd_SSE2.hpp:336
Definition: Approx.hpp:11
static Simd mul(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:47
static Simd mul(const Simd &lhs, const Simd &rhs)
Definition: Simd.hpp:32
static Simd add(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:164
static Simd mul(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:380
static Simd add(const Simd &lhs, const Simd &rhs)
Definition: Simd.hpp:46
static Simd sub(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:272
static Simd div(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:388
static Simd div(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:156
static Simd mad(const Simd &a, const Simd &b, const Simd &c)
Definition: Simd_SSE2.hpp:306
static Simd mul(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:23
static Simd sub(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:373
__m128d reg
Definition: Simd_SSE2.hpp:250
static Simd div(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:285
static Simd shuffle(const Simd &arg)
Definition: Simd_SSE2.hpp:452
static Simd add(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:134
2,4 or 8 dimension float or double parameters accepted. Uses SSE2 or AVX acceleration if enabled in t...
Definition: Simd.hpp:22
__m128i regi
Definition: Simd_SSE2.hpp:19
static Simd add(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:61
static Simd mad(const Simd &a, const Simd &b, const Simd &c)
Definition: Simd_SSE2.hpp:180
static Simd div(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:359
static Simd shuffle(const Simd &arg)
Definition: Simd_SSE2.hpp:105
static Simd mad(const Simd &a, const Simd &b, const Simd &c)
Definition: Simd_SSE2.hpp:412
static Simd sub(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:299
static Simd sub(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:404
static Simd div(const Simd &lhs, const Simd &rhs)
Definition: Simd_SSE2.hpp:29
static Simd add(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:292
static Simd mul(const Simd &lhs, float rhs)
Definition: Simd_SSE2.hpp:148
static Simd mul(const Simd &lhs, double rhs)
Definition: Simd_SSE2.hpp:278