Screen Space Acceleration Structure

Screen Space Acceleration Structure

The paper “An Adaptive Acceleration Structure for Screen-space Ray Tracing” recently caught our attention. It promises up to 80% upspeed compared to a classic DDA single layer raymarching.

paper

The raymarching is seen as an improvement to a classic DDA line rasterizer and uses a quadtree as an acceleration structure. The quadtree data is maintained in the MIP-Levels of an RG16F texture, making it really suitable for a common GPU implementation. The tracing itself is decoupled from the quadtree building step and traverses the octree from root to leafs. If necessary, the traversal steps upwards on higher quadtree levels to check for empty space skipping, which is done in a stackless and register-saving manner.

All in all, very elegant. Since it still was a bit of work to go from the GLSL pseudocode from their supplemental material to a working shader, I’d like to share it with you. All intellectual credits go to the team around D. Luebke at NVIDIA.

Our first quadtree level is constructed in the following fragment program. Note, that the depth is not linear and ranges from zero to one, which enables to save a few FPU instructions for converting them. Theoretically, the non-linearity should cause a problem at the plane compression. Practically, it doesn’t. The depth space bending is small enough to be catched by the subsequent merging thresholds.

void main( void )
{
	// read 3x3 depth neighborhood
	float D11 = getDepth(ivec2(0,0));
	float D21 = getDepth(ivec2(1,0));
	float D01 = getDepth(ivec2(-1,0));
	float D12 = getDepth(ivec2(0,1));
	float D10 = getDepth(ivec2(0,-1));
	vec2 pSize = 1.0 / vec2(textureSize(sceneDepth, 0));
	// Discontinuity hint computed via Laplacian Thresholding
	const float lamdaD = 0.005;
	float avgD = (D11 + D21 + D01 + D12 + D10) * 0.2;
	// compute forward and backward differentials
	vec2 df = vec2(D21-D11, D12-D11);
	vec2 db = vec2(D11-D01, D11-D10);
	// enforce smoothness by picking the smallest derivative
	vec2 d = mix(df.xy, db.xy, lessThan(abs(df.xy), abs(db.xy)));
	// zero large derivatives that connect different surfaces: See Trees for example
	d.xy = step(length(d.xy), 0.001)*d.xy;
	// compute normal. - because the desired coordinate system is left handed-> Z goes into the screen
	vec3 N = normalize(cross(vec3(pSize.x, 0, d.x), vec3(0, pSize.y, d.y)));
	//N = cameraRot*GetNormal(texture(sceneNormals, TexCoord));
	// compute plane's top-left corner z-coordinate
	float P = D11 - dot(N.xy/N.z, -0.5*pSize);
	// Output a plane node
	FragColor = outputPlane(Plane(N, P));
}

We then build the quadtree bottom up:

#define normalThresh 0.0003 // Big enough to hide normal generation artifacts on big planes
#define depthTresh 0.0005 // Same calibration routine than the above
void main( void )
{
	vec2 Q[4];
	Q[0]=texelFetch(quadTree, 2*ivec2(gl_FragCoord)+ivec2(0, 0), 0).xy;
	Q[1]=texelFetch(quadTree, 2*ivec2(gl_FragCoord)+ivec2(1, 0), 0).xy;
	Q[2]=texelFetch(quadTree, 2*ivec2(gl_FragCoord)+ivec2(0, 1), 0).xy;
	Q[3]=texelFetch(quadTree, 2*ivec2(gl_FragCoord)+ivec2(1, 1), 0).xy;
	vec2 PixelSize = 1.0 / vec2(textureSize(quadTree, 0).xy);
	if(isPlane(Q[0]) && isPlane(Q[1]) && isPlane(Q[2]) && isPlane(Q[3])) // Contains only planes
	{
		Plane planes[4];
		// get plane normal vector, origin and discontinuity flag
		planes[0] = getPlane(Q[0]);
		planes[1] = getPlane(Q[1]);
		planes[2] = getPlane(Q[2]);
		planes[3] = getPlane(Q[3]);

		// Adjust the position Z to fit in plane[0]'s origin.
		planes[1].depth -= (planes[1].normal.x/planes[1].normal.z) * -PixelSize.x;
		planes[2].depth -= (planes[2].normal.y/planes[2].normal.z) * -PixelSize.y;
		planes[3].depth -= dot(planes[3].normal.xy/(planes[3].normal.z), -PixelSize);

		/* set proxy plane normal to mean of children normals */
		vec3 Nproxy = normalize(planes[0].normal+planes[1].normal+planes[2].normal+planes[3].normal);

		/* compute angle differences via dot-product */
		float d[4];
		d[0] = 1.0 - dot(Nproxy, planes[0].normal);
		d[1] = 1.0 - dot(Nproxy, planes[1].normal);
		d[2] = 1.0 - dot(Nproxy, planes[2].normal);
		d[3] = 1.0 - dot(Nproxy, planes[3].normal);

		/* proxy plane origin Pproxy is least-square fit to child planes with fit errors stored in p0...3 */
		float Pproxy = 0.25*(planes[0].depth+planes[1].depth+planes[2].depth+planes[3].depth);

		float p[4];
		p[0] = abs(Pproxy-planes[0].depth);
		p[1] = abs(Pproxy-planes[1].depth);
		p[2] = abs(Pproxy-planes[2].depth);
		p[3] = abs(Pproxy-planes[3].depth);

		float maxd = max(max(d[0], d[1]), max(d[2],d[3]));
		float maxp = max(max(p[0], p[1]), max(p[2],p[3]));

		/* output a plane node if the proxy is close enough to children in terms of orientation and position */
		if(maxd < normalThresh && maxp < depthTresh)
		{
			FragColor = outputPlane(Plane(Nproxy, Pproxy));
			// For O term: any(equal(O, vec4(1.0))) ? 1.0 : 0.0
			return;
		}
	}
	/* output AABB node that encompasses all children */
	float minZ = min(min(min(getMin(Q[0], PixelSize), getMin(Q[1], PixelSize)), getMin(Q[2], PixelSize)), getMin(Q[3], PixelSize));
	float maxZ = max(max(max(getMax(Q[0], PixelSize), getMax(Q[1], PixelSize)), getMax(Q[2], PixelSize)), getMax(Q[3], PixelSize));
	FragColor = outputAabb(minZ, maxZ);
}

Now, the tracing itself:

const int numLevels = 7;
struct Plane
{
	vec3 normal;
	float depth;
};
struct AABBresult
{
	float near; // Line length to entry point
	float far; // Line length to exit point
	bool hit; 
};
struct Ray
{
	vec3 origin;
	vec3 direction;
	vec3 inv_direction;
	int sign[3];
};
void initRay(inout Ray ray)
{
	ray.inv_direction = 1.0 / ray.direction;
	ray.sign[0] = int(step(0.0, -ray.inv_direction.x));
	ray.sign[1] = int(step(0.0, -ray.inv_direction.y));
	ray.sign[2] = int(step(0.0, -ray.inv_direction.z));
}
vec2 outputAabb(float min, float max)
{
	return vec2(-abs(min), abs(max));
}
vec3 decompressNormal(vec2 input)
{
	return vec3(input.xy, sqrt(1.0-input.x*input.x-input.y*input.y));
}
vec2 outputPlane(Plane plane)
{
	return vec2(abs(plane.depth), uintBitsToFloat(packHalf2x16(plane.normal.xy)));
}
Plane getPlane(vec2 input)
{
	vec2 compressedNormal = unpackHalf2x16(floatBitsToUint(input.y));
	return Plane(decompressNormal(compressedNormal), input.x);
}
bool isPlane(vec2 data)
{
	return data.x > 0;
}
float getMin(vec2 data, vec2 pixelSize)
{
	if(isPlane(data))
	{
		Plane plane = getPlane(data);
		return plane.depth + min(0.0, -pixelSize.x * plane.normal.x/(plane.normal.z)) + min(0.0, -pixelSize.y * plane.normal.y/(plane.normal.z));
	}
	return -data.x;
}
float getMax(vec2 data, vec2 pixelSize)
{
	if(isPlane(data))
	{
		Plane plane = getPlane(data);
		return plane.depth + max(0.0, -pixelSize.x * plane.normal.x/(plane.normal.z)) + max(0.0, -pixelSize.y * plane.normal.y/(plane.normal.z));
	}
	return data.y;
}
// http://github.com/hpicgs/cgsee/wiki/Ray-Box-Intersection-on-the-GPU
void rayAabbIntersection(in Ray ray, in vec3 aabb[2],out float tmin, out float tmax)
{
	float tymin, tymax, tzmin, tzmax;
	tmin = (aabb[ray.sign[0]].x - ray.origin.x) * ray.inv_direction.x;
	tmax = (aabb[1-ray.sign[0]].x - ray.origin.x) * ray.inv_direction.x;
	tymin = (aabb[ray.sign[1]].y - ray.origin.y) * ray.inv_direction.y;
	tymax = (aabb[1-ray.sign[1]].y - ray.origin.y) * ray.inv_direction.y;
	tzmin = (aabb[ray.sign[2]].z - ray.origin.z) * ray.inv_direction.z;
	tzmax = (aabb[1-ray.sign[2]].z - ray.origin.z) * ray.inv_direction.z;
	tmin = max(max(tmin, tymin), tzmin);
	tmax = min(min(tmax, tymax), tzmax);
}
// Only X & Y AABB check
void getFarNearOfNode(vec3 rayPos, vec3 rayDir, ivec2 Q, ivec2 resolution, out vec2 nearFar)
{
	vec2 pixelDelta = 1.0 / vec2(resolution);
	vec2 Aabb[2];
	Aabb[0] = vec2(Q) * pixelDelta;
	Aabb[1] = Aabb[0] + pixelDelta;
	Ray ray;
	ray.origin = rayPos;
	ray.direction = rayDir;
	initRay(ray);
	rayAabbIntersectionNoZ(ray, Aabb, nearFar.x, nearFar.y);
}
void rayIntersectAABB(vec3 rayPos, // TexCoords, DepthBuffer 0..1
vec3 rayDir, // VS normals
vec2 AABBdata,
out AABBresult hitAABB,
ivec2 Q,
ivec2 resolution)
{
	vec2 pixelDelta = 1.0 / vec2(resolution);
	vec3 Aabb[2];
	Aabb[0] = vec3(vec2(Q) * pixelDelta, abs(AABBdata.x));
	Aabb[1] = vec3(Aabb[0].xy + pixelDelta, abs(AABBdata.y));
	Ray ray;
	ray.origin = rayPos;
	ray.direction = rayDir;
	initRay(ray);
	rayAabbIntersection(ray, Aabb, hitAABB.near, hitAABB.far);
	hitAABB.hit = (hitAABB.near < hitAABB.far); } ivec2 getNextNode(vec3 pos, vec3 dir, ivec2 Q, vec2 currResolution)
	{ /* get node exit corner coordinate */
	vec2 B = vec2(Q + step(vec2(0.0), dir.xy)) / currResolution;
	/* get distances to node edges that intersect at Bxy */
	vec2 D = (B - pos.xy) / dir.xy; /* compute position shift */
	ivec2 S = ivec2(sign(dir.xy) * step(D.xy, D.yx));
	if(S == ivec2(0))
	{
		S = ivec2(sign(dir.xy));
	}
	return Q + S; /* return new position */
}
bool rayTraceSS(sampler2D DepthQuadtree, vec3 pos, vec3 dir, vec2 DepthToPos, out float d, out Plane plane)
{
	// current quad-tree level, start at the root
	int Qlevel = numLevels - 1; /* current node position in the quad-tree */
	ivec2 Q = ivec2( floor(pos.xy * textureSize(DepthQuadtree, Qlevel)) );
	while(clamp(Q, ivec2(0), textureSize(DepthQuadtree, Qlevel)-ivec2(1)) == Q)
	{
		vec2 Qdata = texelFetch(DepthQuadtree, Q, Qlevel).xy; // read the node data
		if(isPlane(Qdata))
		{
			plane = getPlane(Qdata);
#if 0 // Modified version from paper
			// Get far and near distances from ray to node AABB
			vec2 FandN;
			getFarNearOfNode(pos, dir, Q, textureSize(DepthQuadtree, Qlevel), FandN);
			vec3 N = plane.normal; // plane normal 
			if(dot(N, dir) > 0.0)
			{
				vec3 P0 = vec3(vec2(Q)/vec2(textureSize(DepthQuadtree, Qlevel)), plane.depth); // and origin
				// compute ray-plane intersection
				d = dot(P0-pos, N)/(dot(dir, N));
				float depthTresh = 1.5/(vec2(textureSize(DepthQuadtree, Qlevel)).x*dot(dir, N));
				if(d > 0 && d >= FandN.x - max(0.0, -depthTresh) && d < FandN.y + max(0.0, depthTresh))
				{
					return true;
				}
			}
#else // Optimized version for one layer
			vec3 N = plane.normal; // plane normal
			vec3 P0 = vec3(vec2(Q)/vec2(textureSize(DepthQuadtree, Qlevel)), plane.depth); // and origin
			// compute ray-plane intersection
			d = dot(P0-pos, N)/dot(dir, N);
			vec2 bounds[2];
			vec2 pixelDelta = 1.0 / vec2(textureSize(DepthQuadtree, Qlevel));
			bounds[0] = vec2(Q) * pixelDelta;
			bounds[1] = bounds[0] + pixelDelta;
			vec3 hitPosition = pos + d*dir;
			if(d > 0 && clamp(hitPosition.xy, bounds[0], bounds[1]) == hitPosition.xy)
				return true;
#endif
		}
		else // compute intersection with both node bounding box and occlusion volume
		{
			AABBresult hitAABB;
			rayIntersectAABB(pos, dir, Qdata, hitAABB, Q, textureSize(DepthQuadtree, Qlevel));
			if(hitAABB.hit)
			{
				// progress down to the next child
				vec2 ip = pos.xy + dir.xy*hitAABB.near;
				vec2 Q2Level = vec2(textureSize(DepthQuadtree, Qlevel));
				Q = Q * 2 + ivec2(step(vec2(0.0), ip - vec2((vec2(Q) + vec2(0.5))/Q2Level)));
				Qlevel--;
				continue;
			}	
		}
		/* plane or AABB miss, progress to the next node */
		/* current node successor position at Qlevel */
		vec2 divisor = vec2(textureSize(DepthQuadtree, Qlevel));
		ivec2 Qstar = getNextNode(pos, dir, Q, divisor);
		/* compute how many levels up we need to go */
		int levelShift = findMSB((Q.x ^ Qstar.x) | (Q.y ^ Qstar.y));
		/* prevent the traversal from going above the quad-tree root */
		int QstarLevel = min(Qlevel + levelShift, numLevels - 1);
		/* update the node location and level to new values */
		Q = ivec2(floor(vec2(Qstar)*pow(2.0, float(Qlevel-QstarLevel))));
		Qlevel = QstarLevel;
	}
	return false;
}

Please note that this code isn't heavily optimized, there is still a lot of potential, but it outlines the general algorithm.

We came to the conclusion, just like Frostbite in their recent presentation about stochastic screen space reflections, that it's a good idea to combine both stride-based raymarching and geometric exact tracing with an acceleration structure. For most cases however, we use the stride based because it scales a lot better to spontaneous needs thanks to the variation of step size and stride size.

Leave a Reply

Your email address will not be published. Required fields are marked *