如何写一个软渲染(5)-SIMD

原标题：如何写一个软渲染(5)-SIMD

这一篇算是高级主题了，大家一起来学习下。

banner与主题无关。

关于SIMD和SISD：SIMD 即 Single Instruction Multiple Data，单指令多数据流；与之相对，SISD（Single Instruction Single Data）是单指令单数据流。以加法指令为例，单指令单数据（SISD）的CPU对加法指令译码后，执行部件先访问内存，取得第一个操作数；之后再一次访问内存，取得第二个操作数；随后才能进行求和运算。而在SIMD型的CPU中，指令译码后几个执行部件同时访问内存，一次性获得所有操作数进行运算。这个特点使SIMD特别适合于多媒体应用等数据密集型运算。如下图所示：

你可能听过 MMX、SSE、NEON 等各种指令集，它们其实是不同处理器、不同平台对 SIMD 的不同实现。

可以用

命令查看当前处理器的相关信息

SSE 的浮点数运算指令，大致上可以分成两种：packed 和 scalar。Packed 指令是一次对 XMM 暂存器中的四个浮点数（即 DATA0 ~ DATA3）均进行计算，而 scalar 则只对 XMM 暂存器中的 DATA0 进行计算。如下图所示：

SSE 指令和一般的 x86 指令很类似，基本上包括两种定址方式：reg-reg 和 reg-mem。下面是两个例子：

指令的运算结果会覆盖到第一个参数中。例如，以上面的第一个例子来说，xmm0 暂存器会存放最后计算的结果。

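另外，绝大部份需要存取记忆体的 SSE 指令，都要求位址是 16 的倍数（也就是对齐在 16 bytes 的边上）。如果不是的话，就会导致 exception。这是非常重要的。因为，一般的 32 位元浮点数只会对齐在 4 bytes 或 8 bytes 的边上（根据 compiler 的设定而不同）。另外，若是处理阵列中的数字，也需要特别注意这个问题。例如，作为 _mm_load_ps / _mm_store_ps / _mm_storer_ps 读写对象的 float 阵列，应该用 alignas(16)（或 compiler 提供的对齐属性）来宣告；如果无法保证对齐，就要改用不要求对齐的 _mm_loadu_ps / _mm_storeu_ps。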

使用 SSE 指令有两种方式：一种是直接用汇编，另一种是用 intrinsics。使用 intrinsics 可以增加程式的可读性，也比较容易使用。不过，在某些情形下，compiler 可能没办法产生最好的程式码，而且其产生的程式码的效率也会随着 compiler 的不同而有所改变。但是，对于大部份的应用来说，使用 intrinsics 的好处通常是很明显的。因此，我们先只用 intrinsics。

一个简单的例子，给数组中所有的数加10

，gcc编译命令要加 -mmmx（如果例子用的是 SSE intrinsics，即 xmmintrin.h 中的 __m128 / _mm_*_ps，则对应的选项是 -msse）。

Profiler 在源码库里面，可以自行查看。

两种运算的性能差别

性能差别在5倍左右。

在软渲染中加入SIMD优化

还是以LifeOfTriangle为例，优化之前先Profile，在Rasterizer里面打一些桩：

void Rasterizer::RasterizeTriangleLarabee3(VSOutput *pVSOutput0, VSOutput *pVSOutput1, VSOutput *pVSOutput2 , IShader* shader){ PROFILE_BEGIN(RasterizeTriangle_BoundingBox); Vector2 bboxmin( Mathf::Infinity, Mathf::Infinity); Vector2 bboxmax(Mathf::NegativeInfinity, Mathf::NegativeInfinity); Vector2 clamp(mRenderContext->width-1, mRenderContext->height-1); //Screen space clip by bounding box bboxmin.x = Mathf::Max(0.f, Mathf::Min(bboxmin.x , pVSOutput0->position.x)); bboxmin.y = Mathf::Max(0.f, Mathf::Min(bboxmin.y , pVSOutput0->position.y)); bboxmax.x = Mathf::Min(clamp.x, Mathf::Max(bboxmax.x, pVSOutput0->position.x)); bboxmax.y = Mathf::Min(clamp.y, Mathf::Max(bboxmax.y, pVSOutput0->position.y)); bboxmin.x = Mathf::Max(0.f, Mathf::Min(bboxmin.x , pVSOutput1->position.x)); bboxmin.y = Mathf::Max(0.f, Mathf::Min(bboxmin.y , pVSOutput1->position.y)); bboxmax.x = Mathf::Min(clamp.x, Mathf::Max(bboxmax.x, pVSOutput1->position.x)); bboxmax.y = Mathf::Min(clamp.y, Mathf::Max(bboxmax.y, pVSOutput1->position.y)); bboxmin.x = Mathf::Max(0.f, Mathf::Min(bboxmin.x , pVSOutput2->position.x)); bboxmin.y = Mathf::Max(0.f, Mathf::Min(bboxmin.y , pVSOutput2->position.y)); bboxmax.x = Mathf::Min(clamp.x, Mathf::Max(bboxmax.x, pVSOutput2->position.x)); bboxmax.y = Mathf::Min(clamp.y, Mathf::Max(bboxmax.y, pVSOutput2->position.y)); Vector2i P; Color col; PROFILE_END(RasterizeTriangle_BoundingBox); Vector2 bMinusa = Vector2(pVSOutput1->position.x - pVSOutput0->position.x, pVSOutput1->position.y - pVSOutput0->position.y); Vector2 cMinusa = Vector2(pVSOutput2->position.x - pVSOutput0->position.x, pVSOutput2->position.y - pVSOutput0->position.y); bool isInline = false; for (P.x=bboxmin.x; P.xposition.x, P.y - pVSOutput0->position.y); Vector3 barycentricCoord = BarycentricFast3(bMinusa,cMinusa, pMinusa, isInline); float fInvW =1.0f / (barycentricCoord.x * pVSOutput0->position.w+ barycentricCoord.y*pVSOutput1->position.w +barycentricCoord.z * pVSOutput2->position.w); float depth = 
barycentricCoord.x * pVSOutput0->position.z + barycentricCoord.y*pVSOutput1->position.z + barycentricCoord.z * pVSOutput2->position.z; if(isInline) continue; float threshold = -0.000001; if (barycentricCoord.x

depthBuffer[P.x + P.y * mRenderContext->width] uv + barycentricCoord.y * pVSOutput1->uv + barycentricCoord.z * pVSOutput2->uv); Color interpCol = fInvW * (barycentricCoord.x *pVSOutput0->color + barycentricCoord.y * pVSOutput1->color+ barycentricCoord.z * pVSOutput2->color); shader->FragmentInColor = &interpCol; shader->FragmentInUV = &interpUV; if(shader->VaryingsCountBitMask & FirstBitMask) { shader->FragmentVaringOuts[0] = barycentricCoord.x * pVSOutput0->varying[0] + barycentricCoord.y * pVSOutput1->varying[0] + barycentricCoord.z * pVSOutput2->varying[0]; shader->FragmentVaringOuts[0] *= fInvW; } if(shader->VaryingsCountBitMask & SecondBitMask) { shader->FragmentVaringOuts[1] = barycentricCoord.x * pVSOutput0->varying[1] + barycentricCoord.y * pVSOutput1->varying[1] + barycentricCoord.z * pVSOutput2->varying[1]; shader->FragmentVaringOuts[1] *= fInvW; } PROFILE_END(RasterizeTriangle_vertexattrs); PROFILE_BEGIN(RasterizeTriangle_fragment); bool discard = shader->fragment(barycentricCoord, col); PROFILE_END(RasterizeTriangle_fragment); PROFILE_BEGIN(RasterizeTriangle_draw); if (!discard) { mRenderContext->depthBuffer[P.x + P.y * mRenderContext->width] = depth; DrawPixel(P.x, mRenderContext->height - P.y - 1, col); } PROFILE_END(RasterizeTriangle_draw); } }} 跑100帧，数据如下 fps只有20，消耗主要集中在重心坐标计算，顶点属性插值，fragment shader等。直接上用sse处理的代码 void Rasterizer::RasterizeTriangleLarabeeSSE(VSOutput *pVSOutput0, VSOutput *pVSOutput1, VSOutput *pVSOutput2 , IShader* shader){ PROFILE_BEGIN(RasterizeTriangle_BoundingBox); Vector2 bboxmin( Mathf::Infinity, Mathf::Infinity); Vector2 bboxmax(Mathf::NegativeInfinity, Mathf::NegativeInfinity); Vector2 clamp(mRenderContext->width-1, mRenderContext->height-1); //Screen space clip by bounding box bboxmin.x = Mathf::Max(0.f, Mathf::Min(bboxmin.x , pVSOutput0->position.x)); bboxmin.y = Mathf::Max(0.f, Mathf::Min(bboxmin.y , pVSOutput0->position.y)); bboxmax.x = Mathf::Min(clamp.x, Mathf::Max(bboxmax.x, pVSOutput0->position.x)); bboxmax.y = 
Mathf::Min(clamp.y, Mathf::Max(bboxmax.y, pVSOutput0->position.y)); bboxmin.x = Mathf::Max(0.f, Mathf::Min(bboxmin.x , pVSOutput1->position.x)); bboxmin.y = Mathf::Max(0.f, Mathf::Min(bboxmin.y , pVSOutput1->position.y)); bboxmax.x = Mathf::Min(clamp.x, Mathf::Max(bboxmax.x, pVSOutput1->position.x)); bboxmax.y = Mathf::Min(clamp.y, Mathf::Max(bboxmax.y, pVSOutput1->position.y)); bboxmin.x = Mathf::Max(0.f, Mathf::Min(bboxmin.x , pVSOutput2->position.x)); bboxmin.y = Mathf::Max(0.f, Mathf::Min(bboxmin.y , pVSOutput2->position.y)); bboxmax.x = Mathf::Min(clamp.x, Mathf::Max(bboxmax.x, pVSOutput2->position.x)); bboxmax.y = Mathf::Min(clamp.y, Mathf::Max(bboxmax.y, pVSOutput2->position.y)); Vector2i P; Color col; PROFILE_END(RasterizeTriangle_BoundingBox); Vector2 bMinusa = Vector2(pVSOutput1->position.x - pVSOutput0->position.x, pVSOutput1->position.y - pVSOutput0->position.y); Vector2 cMinusa = Vector2(pVSOutput2->position.x - pVSOutput0->position.x, pVSOutput2->position.y - pVSOutput0->position.y); __m128 wPack = _mm_set_ps(pVSOutput0->position.w,pVSOutput1->position.w,pVSOutput2->position.w,0); __m128 zPack = _mm_set_ps(pVSOutput0->position.z,pVSOutput1->position.z,pVSOutput2->position.z,0); __m128 uPack = _mm_set_ps(pVSOutput0->uv.x,pVSOutput1->uv.x,pVSOutput2->uv.x,0); __m128 vPack = _mm_set_ps(pVSOutput0->uv.y,pVSOutput1->uv.y,pVSOutput2->uv.y,0); __m128 colorRPack = _mm_set_ps(pVSOutput0->color.r,pVSOutput1->color.r,pVSOutput2->color.r/*GetUintR()*/,0); __m128 colorGPack = _mm_set_ps(pVSOutput0->color.g,pVSOutput1->color.g,pVSOutput2->color.g/*GetUintG()*/,0); __m128 colorBPack = _mm_set_ps(pVSOutput0->color.b,pVSOutput1->color.b,pVSOutput2->color.b/*GetUintB()*/,0); __m128 colorAPack = _mm_set_ps(pVSOutput0->color.a,pVSOutput1->color.a,pVSOutput2->color.a/*GetUintA()*/,0); float interpWArray[4] = {0}; float interpZArray[4] = {0}; float barycentricArray[4] = {0}; float tmpArrayForSEE[4] = {0}; bool isInline = false; Vector2 interpUV; Color interpCol; float d00 
= bMinusa.x * bMinusa.x + bMinusa.y * bMinusa.y; float d01 = bMinusa.x * cMinusa.x + bMinusa.y * cMinusa.y; float d11 = cMinusa.x * cMinusa.x + cMinusa.y * cMinusa.y; for (P.x=bboxmin.x; P.xposition.x, P.y - pVSOutput0->position.y); Vector3 barycentricCoord = BarycentricFastSSE(d00, d01, d11, bMinusa, cMinusa, pMinusa, isInline); //Vector3 barycentricCoord = BarycentricFast3(bMinusa,cMinusa, pMinusa, isInline); mBarycentricCoord = _mm_set_ps(barycentricCoord.x,barycentricCoord.y,barycentricCoord.z,0); __m128 InterpW = _mm_mul_ps(mBarycentricCoord, wPack); __m128 InterpZ = _mm_mul_ps(mBarycentricCoord, zPack); _mm_storer_ps(interpWArray, InterpW); _mm_storer_ps(interpZArray, InterpZ); //_mm_store_ps(barycentricArray, mBarycentricCoord); //float fInvW =1.0f / (barycentricCoord.x * pVSOutput0->position.w+ barycentricCoord.y*pVSOutput1->position.w +barycentricCoord.z * pVSOutput2->position.w); //float depth = barycentricCoord.x * pVSOutput0->position.z + barycentricCoord.y*pVSOutput1->position.z + barycentricCoord.z * pVSOutput2->position.z; float fInvW =1.0f /(interpWArray[0] + interpWArray[1] + interpWArray[2]); float depth = interpZArray[0] + interpZArray[1] + interpZArray[2]; if(isInline) continue; float threshold = -0.000001; if (barycentricCoord.xdepthBuffer[P.x + P.y * mRenderContext->width] uv + barycentricCoord.y * pVSOutput1->uv + barycentricCoord.z * pVSOutput2->uv); //Color interpCol = fInvW * (barycentricCoord.x *pVSOutput0->color + barycentricCoord.y * pVSOutput1->color+ barycentricCoord.z * pVSOutput2->color); __m128 interpUMM = _mm_mul_ps(mBarycentricCoord, uPack); __m128 interpVMM = _mm_mul_ps(mBarycentricCoord, vPack); __m128 interpRMM = _mm_mul_ps(mBarycentricCoord, colorRPack); __m128 interpGMM = _mm_mul_ps(mBarycentricCoord, colorGPack); __m128 interpBMM = _mm_mul_ps(mBarycentricCoord, colorBPack); __m128 interpAMM = _mm_mul_ps(mBarycentricCoord, colorAPack); _mm_storer_ps(tmpArrayForSEE, interpUMM); interpUV.x = fInvW * (tmpArrayForSEE[0] + 
tmpArrayForSEE[1] + tmpArrayForSEE[2]); _mm_storer_ps(tmpArrayForSEE, interpVMM); interpUV.y = fInvW * (tmpArrayForSEE[0] + tmpArrayForSEE[1] + tmpArrayForSEE[2]); _mm_storer_ps(tmpArrayForSEE, interpRMM); interpCol.r = fInvW * (tmpArrayForSEE[0] + tmpArrayForSEE[1] + tmpArrayForSEE[2]+ tmpArrayForSEE[3]); _mm_storer_ps(tmpArrayForSEE, interpGMM); interpCol.g = fInvW * (tmpArrayForSEE[0] + tmpArrayForSEE[1] + tmpArrayForSEE[2]+ tmpArrayForSEE[3]); _mm_storer_ps(tmpArrayForSEE, interpBMM); interpCol.b = fInvW * (tmpArrayForSEE[0] + tmpArrayForSEE[1] + tmpArrayForSEE[2]+ tmpArrayForSEE[3]); _mm_storer_ps(tmpArrayForSEE, interpAMM); interpCol.a = fInvW * (tmpArrayForSEE[0] + tmpArrayForSEE[1] + tmpArrayForSEE[2]+ tmpArrayForSEE[3]); shader->FragmentInColor = &interpCol; shader->FragmentInUV = &interpUV; if(shader->VaryingsCountBitMask & FirstBitMask) { shader->FragmentVaringOuts[0] = barycentricCoord.x * pVSOutput0->varying[0] + barycentricCoord.y * pVSOutput1->varying[0] + barycentricCoord.z * pVSOutput2->varying[0]; shader->FragmentVaringOuts[0] *= fInvW; } if(shader->VaryingsCountBitMask & SecondBitMask) { shader->FragmentVaringOuts[1] = barycentricCoord.x * pVSOutput0->varying[1] + barycentricCoord.y * pVSOutput1->varying[1] + barycentricCoord.z * pVSOutput2->varying[1]; shader->FragmentVaringOuts[1] *= fInvW; } PROFILE_END(RasterizeTriangle_vertexattrs); PROFILE_BEGIN(RasterizeTriangle_fragment); bool discard = shader->fragment(barycentricCoord, col); PROFILE_END(RasterizeTriangle_fragment); PROFILE_BEGIN(RasterizeTriangle_draw); if (!discard) { mRenderContext->depthBuffer[P.x + P.y * mRenderContext->width] = depth; DrawPixel(P.x, mRenderContext->height - P.y - 1, col); } PROFILE_END(RasterizeTriangle_draw); } }} 虽然没有预想中的那么好，但性能上还是有20%左右的提升 1.由于对数据的内存分布要求很高，函数中存在多次的set和store，这个的调用应该也比较消耗； 2.fragment shader没有优化。 3.simd更加适用的应该是向量的normalise，叉乘等？ 4.最理想的状态应该是一次处理四个像素？有大拿对这方面比较了解的可以一起讨论讨论。参考 Getting started with SSE programming supercomputingblog.com 
https://blog.molecular-matters.com/2011/10/18/simdifying-multi-platform-math/ blog.molecular-matters.com SIMD’ifying multi-platform math blog.molecular-matters.com SIMD’ifying multi-platform math blog.molecular-matters.com Hotball 的小屋 www.csie.ntu.edu.tw ARM和NEON指令 - CSDN博客 blog.csdn.net A quick guide to SSE/SIMD www.cs.virginia.edu SIMD Code Generation in Data-Parallel Programming