Skip to content

Commit 76ab1d6

Browse files
authored
Merge branch 'master' into sw/avx10
2 parents e132ff4 + 7ba9359 commit 76ab1d6

13 files changed

Lines changed: 8330 additions & 2839 deletions

File tree

common/sys/vector.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
#include "alloc.h"
77
#include <algorithm>
8+
#include <type_traits>
89

910
namespace embree
1011
{

common/tasking/taskschedulerinternal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "../sys/atomic.h"
1515
#include "../math/range.h"
1616

17+
#include <exception>
1718
#include <list>
1819

1920
namespace embree

kernels/common/device.cpp

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -668,31 +668,35 @@ namespace embree
668668
bool ze_rtas_builder = false;
669669
for (uint32_t i=0; i<extensions.size(); i++)
670670
{
671-
if (strncmp("ZE_experimental_rtas_builder",extensions[i].name,sizeof(extensions[i].name)) == 0)
671+
if (strncmp("ZE_extension_rtas",extensions[i].name,sizeof(extensions[i].name)) == 0) {
672672
ze_rtas_builder = true;
673+
}
674+
if (strncmp("ZE_experimental_relaxed_allocation_limits", extensions[i].name, sizeof(extensions[i].name)) == 0) {
675+
hasRelaxedAllocationLimits = true;
676+
}
673677
}
674678
if (!ze_rtas_builder)
675-
throw_RTCError(RTC_ERROR_LEVEL_ZERO_RAYTRACING_SUPPORT_MISSING, "ZE_experimental_rtas_builder extension not found. Please install a recent driver. On Linux, make sure that the package intel-level-zero-gpu-raytracing is installed");
679+
throw_RTCError(RTC_ERROR_LEVEL_ZERO_RAYTRACING_SUPPORT_MISSING, "ZE_extension_rtas extension not found. Please install a recent driver. On Linux, make sure that the package intel-level-zero-gpu-raytracing is installed");
676680

677681
result = ZeWrapper::initRTASBuilder(hDriver);
678682
if (result == ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE) {
679-
throw_RTCError(RTC_ERROR_LEVEL_ZERO_RAYTRACING_SUPPORT_MISSING, "cannot load ZE_experimental_rtas_builder extension. Please install a recent driver. On Linux, make sure that the package intel-level-zero-gpu-raytracing is installed");
683+
throw_RTCError(RTC_ERROR_LEVEL_ZERO_RAYTRACING_SUPPORT_MISSING, "cannot load ZE_extension_rtas extension. Please install a recent driver. On Linux, make sure that the package intel-level-zero-gpu-raytracing is installed");
680684
}
681685
if (result != ZE_RESULT_SUCCESS)
682-
throw_RTCError(RTC_ERROR_UNKNOWN, "cannot initialize ZE_experimental_rtas_builder extension");
686+
throw_RTCError(RTC_ERROR_UNKNOWN, "cannot initialize ZE_extension_rtas extension");
683687

684688
if (State::verbosity(1))
685689
{
686690
std::cout << " Level Zero RTAS Builder" << std::endl;
687691
}
688692

689693
/* check if extension library can get loaded */
690-
ze_rtas_parallel_operation_exp_handle_t hParallelOperation;
691-
result = ZeWrapper::zeRTASParallelOperationCreateExp(hDriver, &hParallelOperation);
694+
ze_rtas_parallel_operation_ext_handle_t hParallelOperation;
695+
result = ZeWrapper::zeRTASParallelOperationCreateExt(hDriver, &hParallelOperation);
692696
if (result == ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE)
693697
throw_RTCError(RTC_ERROR_UNKNOWN, "Level Zero RTAS Build Extension cannot get loaded");
694698
if (result == ZE_RESULT_SUCCESS)
695-
ZeWrapper::zeRTASParallelOperationDestroyExp(hParallelOperation);
699+
ZeWrapper::zeRTASParallelOperationDestroyExt(hParallelOperation);
696700

697701
gpu_maxWorkGroupSize = getGPUDevice().get_info<sycl::info::device::max_work_group_size>();
698702
gpu_maxComputeUnits = getGPUDevice().get_info<sycl::info::device::max_compute_units>();
@@ -707,7 +711,7 @@ namespace embree
707711
std::cout << std::endl;
708712
}
709713

710-
dispatchGlobalsPtr = zeRTASInitExp(gpu_device, gpu_context);
714+
dispatchGlobalsPtr = zeRTASInit(gpu_device, gpu_context);
711715
}
712716

713717
DeviceGPU::~DeviceGPU()

kernels/common/device.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,8 @@ namespace embree
209209

210210
unsigned int gpu_maxWorkGroupSize;
211211
unsigned int gpu_maxComputeUnits;
212-
212+
bool hasRelaxedAllocationLimits = false;
213+
213214
public:
214215
void* dispatchGlobalsPtr = nullptr;
215216

@@ -218,6 +219,8 @@ namespace embree
218219
inline sycl::context &getGPUContext() { return gpu_context; }
219220

220221
inline unsigned int getGPUMaxWorkGroupSize() { return gpu_maxWorkGroupSize; }
222+
inline bool relaxedAllocationLimitsSupported() const { return hasRelaxedAllocationLimits; }
223+
221224

222225
void init_rthw_level_zero();
223226
void init_rthw_opencl();

kernels/common/rtcore.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1405,7 +1405,7 @@ RTC_NAMESPACE_BEGIN;
14051405
RTC_CATCH_END2(geometry);
14061406
}
14071407

1408-
AffineSpace3fa loadTransform(RTCFormat format, const float* xfm)
1408+
static AffineSpace3fa loadTransform(RTCFormat format, const float* xfm)
14091409
{
14101410
AffineSpace3fa space = one;
14111411
switch (format)

kernels/geometry/curve_intersector_distance.h

Lines changed: 37 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -103,38 +103,49 @@ namespace embree
103103
const NativeCurve3fa curve3Di(v0,v1,v2,v3);
104104
const NativeCurve3fa curve3D = enlargeRadiusToMinWidth(context,geom,ray.org,curve3Di);
105105
const NativeCurve3fa curve2D = curve3D.xfm_pr(pre.ray_space,ray.org);
106-
107-
/* evaluate the bezier curve */
108-
vboolx valid = vfloatx(step) < vfloatx(float(N));
109-
const Vec4vfx p0 = curve2D.template eval0<W>(0,N);
110-
const Vec4vfx p1 = curve2D.template eval1<W>(0,N);
111-
112-
/* approximative intersection with cone */
113-
const Vec4vfx v = p1-p0;
114-
const Vec4vfx w = -p0;
115-
const vfloatx d0 = madd(w.x,v.x,w.y*v.y);
116-
const vfloatx d1 = madd(v.x,v.x,v.y*v.y);
117-
const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one));
118-
const Vec4vfx p = madd(u,v,p0);
119-
const vfloatx t = p.z*pre.depth_scale;
120-
const vfloatx d2 = madd(p.x,p.x,p.y*p.y);
121-
const vfloatx r = p.w;
122-
const vfloatx r2 = r*r;
123-
valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar));
124-
if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
125-
valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections
126106

127-
/* update hit information */
128107
bool ishit = false;
129-
if (unlikely(any(valid))) {
130-
DistanceCurveHit<NativeCurve3fa,W> hit(valid,u,0.0f,t,0,N,curve3D);
131-
ishit = ishit | epilog(valid,hit);
108+
int i = 0;
109+
110+
#if !defined(__SYCL_DEVICE_ONLY__)
111+
/* unrolled first iteration: on CPU SIMD this lets the compiler keep
112+
the first chunk of segments in registers and skip the loop overhead.
113+
On SYCL (W=1) it duplicates code without benefit and just bloats the
114+
kernel, so the SYCL path falls straight into the unified loop. */
115+
{
116+
/* evaluate the bezier curve */
117+
vboolx valid = vfloatx(step) < vfloatx(float(N));
118+
const Vec4vfx p0 = curve2D.template eval0<W>(0,N);
119+
const Vec4vfx p1 = curve2D.template eval1<W>(0,N);
120+
121+
/* approximative intersection with cone */
122+
const Vec4vfx v = p1-p0;
123+
const Vec4vfx w = -p0;
124+
const vfloatx d0 = madd(w.x,v.x,w.y*v.y);
125+
const vfloatx d1 = madd(v.x,v.x,v.y*v.y);
126+
const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one));
127+
const Vec4vfx p = madd(u,v,p0);
128+
const vfloatx t = p.z*pre.depth_scale;
129+
const vfloatx d2 = madd(p.x,p.x,p.y*p.y);
130+
const vfloatx r = p.w;
131+
const vfloatx r2 = r*r;
132+
valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar));
133+
if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
134+
valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections
135+
136+
/* update hit information */
137+
if (unlikely(any(valid))) {
138+
DistanceCurveHit<NativeCurve3fa,W> hit(valid,u,0.0f,t,0,N,curve3D);
139+
ishit = ishit | epilog(valid,hit);
140+
}
141+
i = W;
132142
}
143+
#endif
133144

134-
if (unlikely(W < N))
145+
if (unlikely(i < N))
135146
{
136147
/* process SIMD-size many segments per iteration */
137-
for (int i=W; i<N; i+=W)
148+
for (; i<N; i+=W)
138149
{
139150
/* evaluate the bezier curve */
140151
vboolx valid = vintx(i)+vintx(step) < vintx(N);

kernels/geometry/curve_intersector_sweep.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,16 @@ namespace embree
8282
const Vec3fa dRdu = /*dQdu*/-dPdu;
8383
const Vec3fa dRdt = dQdt;//-dPdt;
8484

85-
const Vec3fa T = normalize(dPdu);
85+
/* share a single rsqrt(dot(dPdu,dPdu)) between the normalize() and the
86+
1/length(dPdu) needed for cos_err. Saves one transcendental per
87+
Jacobian iteration (important on Xe iGPUs where sqrt/rsqrt are
88+
~14-20 cycle latency and there is no ILP to hide them). */
89+
const float dPdu2 = dot(dPdu,dPdu);
90+
const float rcp_len_dPdu = rsqrt(dPdu2);
91+
const Vec3fa T = dPdu * rcp_len_dPdu;
8692
const Vec3fa dTdu = dnormalize(dPdu,ddPdu);
8793
//const Vec3fa dTdt = zero;
88-
const float cos_err = P_err/length(dPdu);
94+
const float cos_err = P_err * rcp_len_dPdu;
8995

9096
/* Error estimate for dot(R,T):
9197
@@ -348,8 +354,10 @@ namespace embree
348354
const Vec3ff dQ2 = abs(3.0f*(P3-P2) - W);
349355
const Vec3ff max_dQ = max(dQ0,dQ1,dQ2);
350356
const float m = max(max_dQ.x,max_dQ.y,max_dQ.z); //,max_dQ.w);
351-
const float l = length(Vec3f(W));
352-
const bool well_behaved = m < 0.2f*l;
357+
/* compare squared values to avoid the sqrt in length(W); m is
358+
non-negative since computed from abs() above, so this is exact. */
359+
const float l2 = dot(Vec3f(W),Vec3f(W));
360+
const bool well_behaved = m*m < (0.2f*0.2f)*l2;
353361

354362
if (!well_behaved && stack.depth < max_depth) {
355363
stack.push();

0 commit comments

Comments
 (0)