Skip to content

Commit c5a6207

Browse files
committed
use neon intrinsics to initialize and retrieve vector elements
1 parent ed04d12 commit c5a6207

1 file changed

Lines changed: 21 additions & 94 deletions

File tree

common/math/emath.h

Lines changed: 21 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -62,20 +62,11 @@ namespace embree
6262
{
6363
#if defined(__aarch64__) || defined(_M_ARM64)
6464
// Move the scalar into a vector register and compute its reciprocal (estimate plus two refinement steps).
65-
__m128 a;
66-
#if !defined(_M_ARM64)
67-
a[0] = x;
68-
#else
69-
a.n128_f32[0] = x;
70-
#endif
65+
__m128 a = vdupq_n_f32(x);
7166
float32x4_t reciprocal = vrecpeq_f32(a);
7267
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
7368
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
74-
#if !defined(_M_ARM64)
75-
return reciprocal[0];
76-
#else
77-
return reciprocal.n128_f32[0];
78-
#endif
69+
return vgetq_lane_f32(reciprocal, 0);
7970
#else
8071

8172
const __m128 a = _mm_set_ss(x);
@@ -98,65 +89,32 @@ namespace embree
9889
__forceinline float signmsk ( const float x ) {
9990
#if defined(__aarch64__) || defined(_M_ARM64)
10091
// FP and Neon share the same vector registers on arm64
101-
__m128 a;
102-
__m128i b;
103-
#if !defined(_M_ARM64)
104-
a[0] = x;
105-
b[0] = 0x80000000;
106-
#else
107-
a.n128_f32[0] = x;
108-
b.n128_i32[0] = 0x80000000;
109-
#endif
92+
__m128 a = vdupq_n_f32(x);
93+
__m128i b = vdupq_n_s32(0x80000000);
11094
a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
111-
#if !defined(_M_ARM64)
112-
return a[0];
113-
#else
114-
return a.n128_f32[0];
115-
#endif
95+
return vgetq_lane_f32(a, 0);
11696
#else
11797
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
11898
#endif
11999
}
120100
__forceinline float xorf( const float x, const float y ) {
121101
#if defined(__aarch64__) || defined(_M_ARM64)
122102
// FP and Neon share the same vector registers on arm64
123-
__m128 a;
124-
__m128 b;
125-
#if !defined(_M_ARM64)
126-
a[0] = x;
127-
b[0] = y;
128-
#else
129-
a.n128_f32[0] = x;
130-
b.n128_f32[0] = y;
131-
#endif
103+
__m128 a = vdupq_n_f32(x);
104+
__m128 b = vdupq_n_f32(y);
132105
a = _mm_xor_ps(a, b);
133-
#if !defined(_M_ARM64)
134-
return a[0];
135-
#else
136-
return a.n128_f32[0];
137-
#endif
106+
return vgetq_lane_f32(a, 0);
138107
#else
139108
return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
140109
#endif
141110
}
142111
__forceinline float andf( const float x, const unsigned y ) {
143112
#if defined(__aarch64__) || defined(_M_ARM64)
144113
// FP and Neon share the same vector registers on arm64
145-
__m128 a;
146-
__m128i b;
147-
#if !defined(_M_ARM64)
148-
a[0] = x;
149-
b[0] = y;
150-
#else
151-
a.n128_f32[0] = x;
152-
b.n128_u32[0] = y;
153-
#endif
114+
__m128 a = vdupq_n_f32(x);
115+
__m128i b = vdupq_n_u32(y);
154116
a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
155-
#if !defined(_M_ARM64)
156-
return a[0];
157-
#else
158-
return a.n128_f32[0];
159-
#endif
117+
return vgetq_lane_f32(a, 0);
160118
#else
161119
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
162120
#endif
@@ -165,20 +123,11 @@ namespace embree
165123
{
166124
#if defined(__aarch64__) || defined(_M_ARM64)
167125
// FP and Neon share the same vector registers on arm64
168-
__m128 a;
169-
#if !defined(_M_ARM64)
170-
a[0] = x;
171-
#else
172-
a.n128_f32[0] = x;
173-
#endif
126+
__m128 a = vdupq_n_f32(x);
174127
__m128 value = _mm_rsqrt_ps(a);
175128
value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
176129
value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
177-
#if !defined(_M_ARM64)
178-
return value[0];
179-
#else
180-
return value.n128_f32[0];
181-
#endif
130+
return vgetq_lane_f32(value, 0);
182131
#else
183132

184133
const __m128 a = _mm_set_ss(x);
@@ -249,22 +198,11 @@ namespace embree
249198

250199
#if defined(__aarch64__) || defined(_M_ARM64)
251200
__forceinline float mini(float a, float b) {
252-
// FP and Neon shares same vector register in arm64
253-
__m128 x;
254-
__m128 y;
255-
#if !defined(_M_ARM64)
256-
x[0] = a;
257-
y[0] = b;
258-
#else
259-
x.n128_f32[0] = a;
260-
y.n128_f32[0] = b;
261-
#endif
262-
x = _mm_min_ps(x, y);
263-
#if !defined(_M_ARM64)
264-
return x[0];
265-
#else
266-
return x.n128_f32[0];
267-
#endif
201+
// FP and Neon share the same vector registers on arm64
202+
__m128 x = vdupq_n_f32(a);
203+
__m128 y = vdupq_n_f32(b);
204+
x = _mm_min_ps(x, y);
205+
return vgetq_lane_f32(x, 0);
268206
}
269207
#elif defined(__SSE4_1__)
270208
__forceinline float mini(float a, float b) {
@@ -278,21 +216,10 @@ namespace embree
278216
#if defined(__aarch64__) || defined(_M_ARM64)
279217
__forceinline float maxi(float a, float b) {
280218
// FP and Neon share the same vector registers on arm64
281-
__m128 x;
282-
__m128 y;
283-
#if !defined(_M_ARM64)
284-
x[0] = a;
285-
y[0] = b;
286-
#else
287-
x.n128_f32[0] = a;
288-
y.n128_f32[0] = b;
289-
#endif
219+
__m128 x = vdupq_n_f32(a);
220+
__m128 y = vdupq_n_f32(b);
290221
x = _mm_max_ps(x, y);
291-
#if !defined(_M_ARM64)
292-
return x[0];
293-
#else
294-
return x.n128_f32[0];
295-
#endif
222+
return vgetq_lane_f32(x, 0);
296223
}
297224
#elif defined(__SSE4_1__)
298225
__forceinline float maxi(float a, float b) {

0 commit comments

Comments
 (0)