Accelerating CPU Skinning with SSE
Published 2018-03-13
These days character-animation skinning is almost always computed on the GPU, but there are still situations where the skinned result is needed on the CPU, for example when the mesh drives cloth physics simulation. In those cases we have to skin on the CPU, and to keep the CPU cost down we use SSE to accelerate the skinning computation.
The vertex structure looks like this:
struct Vertex
{
    float3 Pos;
    float3 Normal;
    int    n;          // number of bones this vertex is skinned to
    int    BoneId[4];
    float  Weight[4];
    ....               // tangent, etc.
};
For each vertex, the computation first builds a blended transform matrix from BoneId and Weight, and then uses that matrix to transform the vertex position, normal, and so on. Building the blended matrix is a weighted sum of several matrices, and that step is a natural fit for SSE.
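For reference, a straightforward scalar version of that weighted sum might look like the sketch below. It assumes each bone matrix is stored as 12 floats (three rows of four), which matches the loads at offsets 0, 4 and 8 in the SSE code that follows; the function name is ours.

// Scalar reference: blend 'count' bone matrices (12 floats each, three rows
// of four) into pR using the per-vertex weights. The SSE functions below do
// the same work four floats at a time.
void CollapseMatScalar(const float* const* pMats, const float* weights, int count, float* pR)
{
    for (int j = 0; j < 12; j++)
        pR[j] = 0.0f;
    for (int i = 0; i < count; i++)
        for (int j = 0; j < 12; j++)
            pR[j] += weights[i] * pMats[i][j];
}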
A vertex may be skinned to 1 to 4 bones, so the weighted sum involves 1 to 4 matrices. With a single bone no blending is needed; for the 2-, 3- and 4-bone cases we write three dedicated functions, one per case.
#include <xmmintrin.h>

__forceinline void LoadFourFloats(float* a0, __m128& res)
{
    res = _mm_load_ps(a0);
}

__forceinline void StoreFourFloats(float* a0, const __m128& src)
{
    _mm_store_ps(a0, src);
}
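Note that _mm_load_ps and _mm_store_ps require 16-byte-aligned pointers. If the bone matrix palette or the destination cannot be guaranteed aligned, the unaligned intrinsics can be substituted at a small cost on older CPUs; a minimal sketch (the U-suffixed helper names are ours, not from the original code):

// Hypothetical unaligned fallbacks, only needed if 16-byte alignment of the
// matrices cannot be guaranteed.
__forceinline void LoadFourFloatsU(const float* a0, __m128& res)
{
    res = _mm_loadu_ps(a0);    // unaligned load
}

__forceinline void StoreFourFloatsU(float* a0, const __m128& src)
{
    _mm_storeu_ps(a0, src);    // unaligned store
}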
__forceinline void MulMatrixFloat(__m128& mo0, __m128& mo1, __m128& mo2,
                                  const __m128& mi0, const __m128& mi1, const __m128& mi2,
                                  float w)
{
    __m128 xmm;
    xmm = _mm_load_ss(&w);
    xmm = _mm_shuffle_ps(xmm, xmm, 0);   // broadcast the weight into all four lanes
    // Scale all three rows of the input matrix by the weight.
    mo0 = _mm_mul_ps(xmm, mi0);
    mo1 = _mm_mul_ps(xmm, mi1);
    mo2 = _mm_mul_ps(xmm, mi2);
}
__forceinline void Collapse2MatSSE(float* pM1, float* pM2, float W1, float W2, float* pR)
{
    __m128 xmm1, xmm2, xmm3;
    __m128 xmm4, xmm5, xmm6;

    // Load matrix 1 and scale it by W1.
    LoadFourFloats(pM1 + 0, xmm1);
    LoadFourFloats(pM1 + 4, xmm2);
    LoadFourFloats(pM1 + 8, xmm3);
    MulMatrixFloat(xmm1, xmm2, xmm3, xmm1, xmm2, xmm3, W1);

    // Load matrix 2 and scale it by W2.
    LoadFourFloats(pM2 + 0, xmm4);
    LoadFourFloats(pM2 + 4, xmm5);
    LoadFourFloats(pM2 + 8, xmm6);
    MulMatrixFloat(xmm4, xmm5, xmm6, xmm4, xmm5, xmm6, W2);

    // Accumulate and store the result.
    xmm1 = _mm_add_ps(xmm1, xmm4);
    xmm2 = _mm_add_ps(xmm2, xmm5);
    xmm3 = _mm_add_ps(xmm3, xmm6);

    StoreFourFloats(pR + 0, xmm1);
    StoreFourFloats(pR + 4, xmm2);
    StoreFourFloats(pR + 8, xmm3);
}
__forceinline void Collapse3MatSSE(float* pM1, float* pM2, float* pM3,
                                   float W1, float W2, float W3, float* pR)
{
    __m128 xmm1, xmm2, xmm3;
    __m128 xmm4, xmm5, xmm6;

    // Load matrix 1 and scale it by W1.
    LoadFourFloats(pM1 + 0, xmm1);
    LoadFourFloats(pM1 + 4, xmm2);
    LoadFourFloats(pM1 + 8, xmm3);
    MulMatrixFloat(xmm1, xmm2, xmm3, xmm1, xmm2, xmm3, W1);

    // Load matrix 2, scale it by W2 and accumulate.
    LoadFourFloats(pM2 + 0, xmm4);
    LoadFourFloats(pM2 + 4, xmm5);
    LoadFourFloats(pM2 + 8, xmm6);
    MulMatrixFloat(xmm4, xmm5, xmm6, xmm4, xmm5, xmm6, W2);
    xmm1 = _mm_add_ps(xmm1, xmm4);
    xmm2 = _mm_add_ps(xmm2, xmm5);
    xmm3 = _mm_add_ps(xmm3, xmm6);

    // Load matrix 3, scale it by W3 and accumulate.
    LoadFourFloats(pM3 + 0, xmm4);
    LoadFourFloats(pM3 + 4, xmm5);
    LoadFourFloats(pM3 + 8, xmm6);
    MulMatrixFloat(xmm4, xmm5, xmm6, xmm4, xmm5, xmm6, W3);
    xmm1 = _mm_add_ps(xmm1, xmm4);
    xmm2 = _mm_add_ps(xmm2, xmm5);
    xmm3 = _mm_add_ps(xmm3, xmm6);

    StoreFourFloats(pR + 0, xmm1);
    StoreFourFloats(pR + 4, xmm2);
    StoreFourFloats(pR + 8, xmm3);
}
__forceinline void Collapse4MatSSE(float* pM1, float* pM2, float* pM3, float* pM4,
                                   float W1, float W2, float W3, float W4, float* pR)
{
    __m128 xmm1, xmm2, xmm3;
    __m128 xmm4, xmm5, xmm6;

    // Load matrix 1 and scale it by W1.
    LoadFourFloats(pM1 + 0, xmm1);
    LoadFourFloats(pM1 + 4, xmm2);
    LoadFourFloats(pM1 + 8, xmm3);
    MulMatrixFloat(xmm1, xmm2, xmm3, xmm1, xmm2, xmm3, W1);

    // Load matrix 2, scale it by W2 and accumulate.
    LoadFourFloats(pM2 + 0, xmm4);
    LoadFourFloats(pM2 + 4, xmm5);
    LoadFourFloats(pM2 + 8, xmm6);
    MulMatrixFloat(xmm4, xmm5, xmm6, xmm4, xmm5, xmm6, W2);
    xmm1 = _mm_add_ps(xmm1, xmm4);
    xmm2 = _mm_add_ps(xmm2, xmm5);
    xmm3 = _mm_add_ps(xmm3, xmm6);

    // Load matrix 3, scale it by W3 and accumulate.
    LoadFourFloats(pM3 + 0, xmm4);
    LoadFourFloats(pM3 + 4, xmm5);
    LoadFourFloats(pM3 + 8, xmm6);
    MulMatrixFloat(xmm4, xmm5, xmm6, xmm4, xmm5, xmm6, W3);
    xmm1 = _mm_add_ps(xmm1, xmm4);
    xmm2 = _mm_add_ps(xmm2, xmm5);
    xmm3 = _mm_add_ps(xmm3, xmm6);

    // Load matrix 4, scale it by W4 and accumulate.
    LoadFourFloats(pM4 + 0, xmm4);
    LoadFourFloats(pM4 + 4, xmm5);
    LoadFourFloats(pM4 + 8, xmm6);
    MulMatrixFloat(xmm4, xmm5, xmm6, xmm4, xmm5, xmm6, W4);
    xmm1 = _mm_add_ps(xmm1, xmm4);
    xmm2 = _mm_add_ps(xmm2, xmm5);
    xmm3 = _mm_add_ps(xmm3, xmm6);

    StoreFourFloats(pR + 0, xmm1);
    StoreFourFloats(pR + 4, xmm2);
    StoreFourFloats(pR + 8, xmm3);
}
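The three Collapse functions are unrolled copies of the same scale-and-accumulate pattern. Purely as an illustration (this is not part of the original code), they could be folded into one loop-driven variant; whether the loop overhead and extra indirection is acceptable would need profiling:

// Hypothetical generalization: blend 'count' (1..4) matrices in one function,
// reusing the helpers above. Each matrix is 12 floats (three rows of four).
__forceinline void CollapseNMatSSE(float* const* pMats, const float* pW, int count, float* pR)
{
    __m128 r0, r1, r2, m0, m1, m2;

    LoadFourFloats(pMats[0] + 0, r0);
    LoadFourFloats(pMats[0] + 4, r1);
    LoadFourFloats(pMats[0] + 8, r2);
    MulMatrixFloat(r0, r1, r2, r0, r1, r2, pW[0]);

    for (int i = 1; i < count; i++)
    {
        LoadFourFloats(pMats[i] + 0, m0);
        LoadFourFloats(pMats[i] + 4, m1);
        LoadFourFloats(pMats[i] + 8, m2);
        MulMatrixFloat(m0, m1, m2, m0, m1, m2, pW[i]);
        r0 = _mm_add_ps(r0, m0);
        r1 = _mm_add_ps(r1, m1);
        r2 = _mm_add_ps(r2, m2);
    }

    StoreFourFloats(pR + 0, r0);
    StoreFourFloats(pR + 4, r1);
    StoreFourFloats(pR + 8, r2);
}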
Computing the per-vertex skinning matrix then looks like this:
// For the SSE path this matrix must be 16-byte aligned. Release builds happen
// to align it, but for Debug builds to run correctly the explicit alignment
// declaration is required.
__declspec(align(16)) Matrix4 matObj;

for (int i = 0; i < vertCount; i++, pVertex++)
{
    if (pVertex->n == 1)
    {
        matObj = *BoneMatrixPalette[pVertex->BoneId[0]];
    }
    else if (pVertex->n == 2)
    {
        float* pMat0 = BoneMatrixPalette[pVertex->BoneId[0]]->ToFloatPtr();
        float* pMat1 = BoneMatrixPalette[pVertex->BoneId[1]]->ToFloatPtr();
        Collapse2MatSSE(pMat0, pMat1,
                        pVertex->Weight[0], pVertex->Weight[1], matObj.ToFloatPtr());
    }
    else if (pVertex->n == 3)
    {
        float* pMat0 = BoneMatrixPalette[pVertex->BoneId[0]]->ToFloatPtr();
        float* pMat1 = BoneMatrixPalette[pVertex->BoneId[1]]->ToFloatPtr();
        float* pMat2 = BoneMatrixPalette[pVertex->BoneId[2]]->ToFloatPtr();
        Collapse3MatSSE(pMat0, pMat1, pMat2,
                        pVertex->Weight[0], pVertex->Weight[1], pVertex->Weight[2],
                        matObj.ToFloatPtr());
    }
    else if (pVertex->n == 4)
    {
        float* pMat0 = BoneMatrixPalette[pVertex->BoneId[0]]->ToFloatPtr();
        float* pMat1 = BoneMatrixPalette[pVertex->BoneId[1]]->ToFloatPtr();
        float* pMat2 = BoneMatrixPalette[pVertex->BoneId[2]]->ToFloatPtr();
        float* pMat3 = BoneMatrixPalette[pVertex->BoneId[3]]->ToFloatPtr();
        Collapse4MatSSE(pMat0, pMat1, pMat2, pMat3,
                        pVertex->Weight[0], pVertex->Weight[1],
                        pVertex->Weight[2], pVertex->Weight[3], matObj.ToFloatPtr());
    }
    else
    {
        assert(0);
    }
}
Once matObj is available, the vertex position, normal, and so on are transformed with it. That step could also be accelerated with SSE, but it requires the vertex positions, normals, etc. to be 16-byte aligned, which would have meant larger changes, so we did not do it. Profiling showed that the SSE path roughly doubled the speed of CPU skinning.
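For completeness, the non-SSE transform step might look roughly like the sketch below. It is only an illustration: it assumes a row-major 3x4 matrix with the translation in the fourth column (which matches the 12-float layout above but is not spelled out in the original post), assumes rigid bone transforms so normals can use the rotation part directly, and the function name and raw-float parameters are ours rather than the engine's float3/Matrix4 types.

// Hypothetical sketch of the final transform. 'm' is matObj.ToFloatPtr(),
// i.e. three rows of four floats; positions use the translation column,
// normals do not (assuming no non-uniform scale in the bone matrices).
void TransformVertex(const float* m, const float* pos, const float* nrm,
                     float* outPos, float* outNrm)
{
    for (int r = 0; r < 3; r++)
    {
        const float* row = m + r * 4;
        outPos[r] = row[0] * pos[0] + row[1] * pos[1] + row[2] * pos[2] + row[3];
        outNrm[r] = row[0] * nrm[0] + row[1] * nrm[1] + row[2] * nrm[2];
    }
}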
Main references
The two articles below, from id Software, describe even more aggressive optimizations:
Fast Skinning
The Skeleton Assembly Line
From: http://blog.csdn.net/garuda/article/details/6539271