@@ -62,20 +62,11 @@ namespace embree
6262 {
6363#if defined(__aarch64__) || defined(_M_ARM64)
6464 // Move scalar to vector register and do rcp.
65- __m128 a;
66- #if !defined(_M_ARM64)
67- a[0 ] = x;
68- #else
69- a.n128_f32 [0 ] = x;
70- #endif
65+ __m128 a = vdupq_n_f32 (x);
7166 float32x4_t reciprocal = vrecpeq_f32 (a);
7267 reciprocal = vmulq_f32 (vrecpsq_f32 (a, reciprocal), reciprocal);
7368 reciprocal = vmulq_f32 (vrecpsq_f32 (a, reciprocal), reciprocal);
74- #if !defined(_M_ARM64)
75- return reciprocal[0 ];
76- #else
77- return reciprocal.n128_f32 [0 ];
78- #endif
69+ return vgetq_lane_f32 (reciprocal, 0 );
7970#else
8071
8172 const __m128 a = _mm_set_ss (x);
@@ -98,65 +89,32 @@ namespace embree
9889 __forceinline float signmsk ( const float x ) {
9990#if defined(__aarch64__) || defined(_M_ARM64)
10091 // FP and Neon shares same vector register in arm64
101- __m128 a;
102- __m128i b;
103- #if !defined(_M_ARM64)
104- a[0 ] = x;
105- b[0 ] = 0x80000000 ;
106- #else
107- a.n128_f32 [0 ] = x;
108- b.n128_i32 [0 ] = 0x80000000 ;
109- #endif
92+ __m128 a = vdupq_n_f32 (x);
93+ __m128i b = vdupq_n_s32 (0x80000000 );
11094 a = _mm_and_ps (a, vreinterpretq_f32_s32 (b));
111- #if !defined(_M_ARM64)
112- return a[0 ];
113- #else
114- return a.n128_f32 [0 ];
115- #endif
95+ return vgetq_lane_f32 (a, 0 );
11696#else
11797 return _mm_cvtss_f32 (_mm_and_ps (_mm_set_ss (x),_mm_castsi128_ps (_mm_set1_epi32 (0x80000000 ))));
11898#endif
11999 }
120100 __forceinline float xorf ( const float x, const float y ) {
121101#if defined(__aarch64__) || defined(_M_ARM64)
122102 // FP and Neon shares same vector register in arm64
123- __m128 a;
124- __m128 b;
125- #if !defined(_M_ARM64)
126- a[0 ] = x;
127- b[0 ] = y;
128- #else
129- a.n128_f32 [0 ] = x;
130- b.n128_f32 [0 ] = y;
131- #endif
103+ __m128 a = vdupq_n_f32 (x);
104+ __m128 b = vdupq_n_f32 (y);
132105 a = _mm_xor_ps (a, b);
133- #if !defined(_M_ARM64)
134- return a[0 ];
135- #else
136- return a.n128_f32 [0 ];
137- #endif
106+ return vgetq_lane_f32 (a, 0 );
138107#else
139108 return _mm_cvtss_f32 (_mm_xor_ps (_mm_set_ss (x),_mm_set_ss (y)));
140109#endif
141110 }
142111 __forceinline float andf ( const float x, const unsigned y ) {
143112#if defined(__aarch64__) || defined(_M_ARM64)
144113 // FP and Neon shares same vector register in arm64
145- __m128 a;
146- __m128i b;
147- #if !defined(_M_ARM64)
148- a[0 ] = x;
149- b[0 ] = y;
150- #else
151- a.n128_f32 [0 ] = x;
152- b.n128_u32 [0 ] = y;
153- #endif
114+ __m128 a = vdupq_n_f32 (x);
115+ __m128i b = vdupq_n_u32 (y);
154116 a = _mm_and_ps (a, vreinterpretq_f32_s32 (b));
155- #if !defined(_M_ARM64)
156- return a[0 ];
157- #else
158- return a.n128_f32 [0 ];
159- #endif
117+ return vgetq_lane_f32 (a, 0 );
160118#else
161119 return _mm_cvtss_f32 (_mm_and_ps (_mm_set_ss (x),_mm_castsi128_ps (_mm_set1_epi32 (y))));
162120#endif
@@ -165,20 +123,11 @@ namespace embree
165123 {
166124#if defined(__aarch64__) || defined(_M_ARM64)
167125 // FP and Neon shares same vector register in arm64
168- __m128 a;
169- #if !defined(_M_ARM64)
170- a[0 ] = x;
171- #else
172- a.n128_f32 [0 ] = x;
173- #endif
126+ __m128 a = vdupq_n_f32 (x);
174127 __m128 value = _mm_rsqrt_ps (a);
175128 value = vmulq_f32 (value, vrsqrtsq_f32 (vmulq_f32 (a, value), value));
176129 value = vmulq_f32 (value, vrsqrtsq_f32 (vmulq_f32 (a, value), value));
177- #if !defined(_M_ARM64)
178- return value[0 ];
179- #else
180- return value.n128_f32 [0 ];
181- #endif
130+ return vgetq_lane_f32 (value, 0 );
182131#else
183132
184133 const __m128 a = _mm_set_ss (x);
@@ -249,22 +198,11 @@ namespace embree
249198
250199#if defined(__aarch64__) || defined(_M_ARM64)
251200 __forceinline float mini (float a, float b) {
252- // FP and Neon shares same vector register in arm64
253- __m128 x;
254- __m128 y;
255- #if !defined(_M_ARM64)
256- x[0 ] = a;
257- y[0 ] = b;
258- #else
259- x.n128_f32 [0 ] = a;
260- y.n128_f32 [0 ] = b;
261- #endif
262- x = _mm_min_ps (x, y);
263- #if !defined(_M_ARM64)
264- return x[0 ];
265- #else
266- return x.n128_f32 [0 ];
267- #endif
201+ // FP and Neon shares same vector register in arm64
202+ __m128 x = vdupq_n_f32 (a);
203+ __m128 y = vdupq_n_f32 (b);
204+ x = _mm_min_ps (x, y);
205+ return vgetq_lane_f32 (x, 0 );
268206 }
269207#elif defined(__SSE4_1__)
270208 __forceinline float mini (float a, float b) {
@@ -278,21 +216,10 @@ namespace embree
278216#if defined(__aarch64__) || defined(_M_ARM64)
279217 __forceinline float maxi (float a, float b) {
280218 // FP and Neon shares same vector register in arm64
281- __m128 x;
282- __m128 y;
283- #if !defined(_M_ARM64)
284- x[0 ] = a;
285- y[0 ] = b;
286- #else
287- x.n128_f32 [0 ] = a;
288- y.n128_f32 [0 ] = b;
289- #endif
219+ __m128 x = vdupq_n_f32 (a);
220+ __m128 y = vdupq_n_f32 (b);
290221 x = _mm_max_ps (x, y);
291- #if !defined(_M_ARM64)
292- return x[0 ];
293- #else
294- return x.n128_f32 [0 ];
295- #endif
222+ return vgetq_lane_f32 (x, 0 );
296223 }
297224#elif defined(__SSE4_1__)
298225 __forceinline float maxi (float a, float b) {
0 commit comments