Hi, I'm a n00bie in assembler coding and I was thinking that implementing my Catmull-Rom splines using SSE instructions would increase the performance over the c implementation replacing this function with:
inline void
evaluateCubicSpline( float cv0[], float cv1[], float cv2[], float cv3[], const float & b0, const float & b1, const float & b2, const float & b3, float vector[], int vecsize )
{
while( vecsize-- )
{
*vector++ = *cv0++ * b0 + *cv1++ * b1 + *cv2++ * b2 + *cv3++ * b3;
}
}
with this inline assembly:
inline void
evaluateCubicSplineAsm( float cv0[], float cv1[], float cv2[], float cv3[], const float & b0, const float & b1, const float & b2, const float & b3, float vector[], int vecsize )
{
float *vecend = vector + vecsize;
__declspec(align(16)) float B[4];
__asm
{
mov ecx, b0
mov eax, [ecx]
mov [B + 0 ], eax
mov [B + 4 ], eax
mov [B + 8 ], eax
mov [B + 12], eax
movaps xmm1,
mov ecx, b1
mov eax, [ecx]
mov [B + 0 ], eax
mov [B + 4 ], eax
mov [B + 8 ], eax
mov [B + 12], eax
movaps xmm3,
mov ecx, b2
mov eax, [ecx]
mov [B + 0 ], eax
mov [B + 4 ], eax
mov [B + 8 ], eax
mov [B + 12], eax
movaps xmm5,
mov ecx, b3
mov eax, [ecx]
mov [B + 0 ], eax
mov [B + 4 ], eax
mov [B + 8 ], eax
mov [B + 12], eax
movaps xmm7,
mov ecx, vector
mov edi, cv0
mov esi, cv1
mov eax, cv2
mov edx, cv3
_loop:
movaps xmm0, [edi]
mulps xmm0, xmm1
prefetcht0 [edi + 2048];
lea edi, [edi + 16]
movaps xmm2, [esi]
mulps xmm2, xmm3
prefetcht0 [esi + 2048];
lea esi, [esi + 16]
addps xmm0, xmm2
movaps xmm4, [eax]
mulps xmm4, xmm5
prefetcht0 [eax + 2048];
lea eax, [eax + 16]
movaps xmm6, [edx]
mulps xmm6, xmm7
prefetcht0 [edx + 2048];
lea edx, [edx + 16]
addps xmm4, xmm6
addps xmm0, xmm4
movntps [ecx], xmm0
lea ecx, [ecx + 16]
cmp vecend, ecx
jne _loop
}
}
I'm not sure of the prefetcht0 instruction placement or offset, but by trial and error, the current code location made an improvement.
The vector size I'm testing with is 60000 floats and the control point vectors are 16-byte aligned and contiguous in memory (cv0,cv1,cv2,cv3).
I would have expected a larger gain in performance since I was using 4-wide float registers, but I'm getting only 20% improvement.
Is there anything in the loop portion of the code that's really not efficient or just plain dumb?
Thanks in advance,
Chris
inline void
evaluateCubicSpline( float cv0[], float cv1[], float cv2[], float cv3[], const float & b0, const float & b1, const float & b2, const float & b3, float vector[], int vecsize )
{
while( vecsize-- )
{
*vector++ = *cv0++ * b0 + *cv1++ * b1 + *cv2++ * b2 + *cv3++ * b3;
}
}
with this inline assembly:
inline void
evaluateCubicSplineAsm( float cv0[], float cv1[], float cv2[], float cv3[], const float & b0, const float & b1, const float & b2, const float & b3, float vector[], int vecsize )
{
float *vecend = vector + vecsize;
__declspec(align(16)) float B[4];
__asm
{
mov ecx, b0
mov eax, [ecx]
mov [B + 0 ], eax
mov [B + 4 ], eax
mov [B + 8 ], eax
mov [B + 12], eax
movaps xmm1,
mov ecx, b1
mov eax, [ecx]
mov [B + 0 ], eax
mov [B + 4 ], eax
mov [B + 8 ], eax
mov [B + 12], eax
movaps xmm3,
mov ecx, b2
mov eax, [ecx]
mov [B + 0 ], eax
mov [B + 4 ], eax
mov [B + 8 ], eax
mov [B + 12], eax
movaps xmm5,
mov ecx, b3
mov eax, [ecx]
mov [B + 0 ], eax
mov [B + 4 ], eax
mov [B + 8 ], eax
mov [B + 12], eax
movaps xmm7,
mov ecx, vector
mov edi, cv0
mov esi, cv1
mov eax, cv2
mov edx, cv3
_loop:
movaps xmm0, [edi]
mulps xmm0, xmm1
prefetcht0 [edi + 2048];
lea edi, [edi + 16]
movaps xmm2, [esi]
mulps xmm2, xmm3
prefetcht0 [esi + 2048];
lea esi, [esi + 16]
addps xmm0, xmm2
movaps xmm4, [eax]
mulps xmm4, xmm5
prefetcht0 [eax + 2048];
lea eax, [eax + 16]
movaps xmm6, [edx]
mulps xmm6, xmm7
prefetcht0 [edx + 2048];
lea edx, [edx + 16]
addps xmm4, xmm6
addps xmm0, xmm4
movntps [ecx], xmm0
lea ecx, [ecx + 16]
cmp vecend, ecx
jne _loop
}
}
I'm not sure of the prefetcht0 instruction placement or offset, but by trial and error, the current code location made an improvement.
The vector size I'm testing with is 60000 floats and the control point vectors are 16-byte aligned and contiguous in memory (cv0,cv1,cv2,cv3).
I would have expected a larger gain in performance since I was using 4-wide float registers, but I'm getting only 20% improvement.
Is there anything in the loop portion of the code that's really not efficient or just plain dumb?
Thanks in advance,
Chris