CUDA | Deferred | Forward |
---|---|---|

17.80 | 57.09 | 9.82 |

Which is pretty discouraging. I still need to implement post-processing effects, so I'm going to hold off on trying to tune the kernel for now. The code is posted below, if anyone has any suggestions though.

__global__ void cudaCalculation41(unsigned int numPointLights, float *plFloats, unsigned int *srBounds, float *invPersp, float *view, int width, int height, uchar4 *outData) { __shared__ float iP[16]; __shared__ float v[16]; __shared__ bool tileInRect[MAX_BLOCK_SIZE]; __shared__ float pLightInfo[8]; int tdx = threadIdx.x + blockDim.x*threadIdx.y; int blockS = blockDim.x * blockDim.y; // Load the inverse perspective and view matrices if(blockS < 32) { if(tdx < 16) { iP[tdx] = invPersp[tdx]; v[tdx] = view[tdx]; } } else { if(tdx < 16) { iP[tdx] = invPersp[tdx]; } else if(tdx < 32) { v[tdx - 16] = view[tdx - 16]; } } __syncthreads(); // Calculate texture read coordinates ushort2 coords; coords.x = blockDim.x * blockIdx.x + threadIdx.x; coords.y = blockDim.y * blockIdx.y + threadIdx.y; // Get the depth and VS position uchar4 txRead = tex2D(depthTexRef, coords.x, coords.y); unsigned int depthUInt = txRead.x + txRead.y*256 + txRead.z*256*256 + txRead.w*256*256*256; float depth = __int_as_float(depthUInt); float3 vpos = vsPosition(depth, (float)coords.x/width, (float)coords.y/height, iP); // Get the normal txRead = tex2D(normalTexRef, coords.x, coords.y); float n1 = txRead.x/(256.0f*255.0f) + txRead.y/255.0f; float n2 = txRead.z/(256.0f*255.0f) + txRead.w/255.0f; n1 = n1*4.0f - 2.0f; n2 = n2*4.0f - 2.0f; float f = n1*n1 + n2*n2; float g = sqrt(1.0f - .25f*f); float3 normal = make_float3(n1*g, n2*g, 1.0f - .5f*f); // Get the tile rect-bounds ushort4 pBounds; pBounds.x = blockIdx.x * blockDim.x; pBounds.y = blockIdx.y * blockDim.y; pBounds.z = pBounds.x + blockDim.x - 1; pBounds.w = pBounds.y + blockDim.y - 1; // Initialize color accumulation buffer float3 color_accum = make_float3(0,0,0); bool onScreen = (coords.x < width) && (coords.y < height); uchar4 diffuse; if(onScreen) { diffuse = tex2D(diffuseTexRef, coords.x, coords.y); } unsigned int numLoops = numPointLights / blockS + (numPointLights % blockS == 0 ? 0 : 1); // Loop through for all of the lights for(unsigned int i = 0; i < numLoops; ++i) { int index = i*blockS + tdx; // Check to see if a light is in bounds if(index < numPointLights) { tileInRect[tdx] = !((srBounds[index] > pBounds.z) || (srBounds[index + numPointLights] > pBounds.w) || (srBounds[index + 2*numPointLights] < pBounds.x) || (srBounds[index + 3*numPointLights] < pBounds.y)); } else tileInRect[tdx] = false; __syncthreads(); // For each light that was in bounds, calculate the light contribution for(unsigned int j = 0; j < blockS; ++j) { if(tileInRect[j]) { // Load the info for a single light into shared memory if(tdx < 8) { pLightInfo[tdx] = plFloats[(i*blockS+j)*8 + tdx]; } __syncthreads(); // Only perform light calculation if the thread corresponds to an on-screen coordinate if(onScreen) { float3 lpos = mMult3(v, make_float4(pLightInfo[0], pLightInfo[1], pLightInfo[2], 1.0f)); float d = dist(lpos, vpos); float attenuate = 1.0f; if(d > pLightInfo[6]) attenuate = .0f; else if(d > pLightInfo[7]) attenuate -= (d - pLightInfo[7])/(pLightInfo[6] - pLightInfo[7]); // N*L*attenuation float3 lDir = normalize(make_float3(lpos.x - vpos.x, lpos.y - vpos.y, lpos.z - vpos.z)); float factor = dMax(dot(lDir,normal), 0.0f)*attenuate; // Accumulate the color contribution from the current light color_accum.x += factor*pLightInfo[3]; color_accum.y += factor*pLightInfo[4]; color_accum.z += factor*pLightInfo[5]; } __syncthreads(); } } } // Multiply light accumulation by the texture color if(onScreen) outData[coords.x + width*coords.y] = make_uchar4(pointFactor*color_accum.x*diffuse.x, pointFactor*color_accum.y*diffuse.y, pointFactor*color_accum.z*diffuse.z, 255); }

Sean, these results are not using the scissor test or screen-space classification, right? Even if you can't improve the kernel's performance, adding screen-space classification with CUDA may still improve the performance compared to using the scissor test with GLSL.

ReplyDeleteIf you want help optimizing your CUDA kernel, let's walk through it together after class on Monday.