Unity Shader:优化GPU代码--用step()代替if else等条件语句
发表于2018-10-25
普通的卡通着色Shader:
先看一个卡通着色的Shader。由于卡通着色需要对不同的渲染区域进行判定,比较适合作为案例,用来介绍怎样用step()代替if else等条件语句。
// Toon shading, branching version.
// Each pixel is classified with if-statements as edge, dark, lit or
// highlight, and the fragment function returns as soon as the class is known.
Shader "Unlit/NewToonShading"
{
    Properties
    {
        _Shininess("Shininess",float)=1
        _Edge("Edge Scale",range(0,1))=0.2
        _FinalColor("Final Color",Color)=(0.5,0.5,0.5,1)
        _EdgeColor("Edge Color",Color)=(0,0,0,1)
    }
    SubShader
    {
        Tags { "RenderType"="Opaque" }
        LOD 100

        Pass
        {
            Tags { "LightMode"="Vertex" }

            CGPROGRAM
            #pragma vertex vert
            #pragma fragment frag
            #include "UnityCG.cginc"

            struct appdata
            {
                float4 vertex : POSITION;
                float3 normal : NORMAL;
            };

            // All direction vectors are passed in world space.
            struct v2f
            {
                float4 vertex : SV_POSITION;
                float3 N : TEXCOORD0;   // surface normal
                float3 L : TEXCOORD1;   // surface -> light
                float3 H : TEXCOORD2;   // half vector between L and V
                float3 V : TEXCOORD3;   // surface -> camera
            };

            float _Shininess;
            float _Edge;
            float4 _FinalColor;
            float4 _EdgeColor;
            float4 _LightPosition_World;   // declared but not read by this pass

            // Builds the world-space N, L, V and H vectors for the fragment stage.
            v2f vert (appdata v)
            {
                v2f o = (v2f)0;
                float4 worldPos = mul(unity_ObjectToWorld, v.vertex);

                // unity_LightPosition[] is stored in view space; the inverse view
                // matrix brings it to world space.
                // NOTE(review): this reads light slot 1, not slot 0 - confirm that
                // is the intended light.
                float4 lightPos_World = mul(UNITY_MATRIX_I_V, unity_LightPosition[1]);

                // Normals need the inverse-transpose of the model matrix, otherwise
                // they skew under non-uniform scale.
                o.N = UnityObjectToWorldNormal(v.normal);

                // w is 0 for directional lights (xyz is then already a direction)
                // and 1 for point/spot lights, so one expression covers both.
                o.L = normalize(lightPos_World.xyz - worldPos.xyz * lightPos_World.w);
                o.V = normalize(_WorldSpaceCameraPos - worldPos.xyz);
                o.H = normalize(o.L + o.V);
                o.vertex = UnityObjectToClipPos(v.vertex);
                return o;
            }

            // Returns the edge colour, the half-brightness base colour, or the base
            // colour with an optional white highlight.
            fixed4 frag (v2f i) : SV_Target
            {
                // Renormalize the vectors received from the vertex stage.
                i.N = normalize(i.N);
                i.L = normalize(i.L);
                i.H = normalize(i.H);
                i.V = normalize(i.V);

                float4 Kd = _FinalColor;
                float4 Ks = 0;
                fixed4 col;

                // Edge test: the normal is nearly perpendicular to the view direction.
                float edge = max(dot(i.N, i.V), 0);
                if (edge < _Edge)
                {
                    return _EdgeColor;
                }

                // Dark-region test. dot(N, L) <= 0 means the angle between N and L
                // exceeds 90 degrees, i.e. the light is behind the surface.
                float diffuseLight = max(dot(i.N, i.L), 0);
                if (diffuseLight <= 0.1f)
                {
                    Kd *= 0.5f;   // half of the lit-region brightness
                    Ks = 0;       // no highlight in the dark region
                    col = Kd + Ks;
                    return col;
                }

                // Highlight test.
                float specularLight = pow(max(dot(i.N, i.H), 0), _Shininess);
                if (specularLight >= 0.95f)
                {
                    Ks = float4(1.0f, 1.0f, 1.0f, 0.0f);   // white highlight
                }

                col = Kd + Ks;
                return col;
            }
            ENDCG
        }
    }
}
(上图:渲染结果)
优化的原理:
在片段着色器中,我以正常CPU编程的逻辑进行了优化。例如if(edge<_Edge){return _EdgeColor;}:如果此像素被判定为边缘,则直接返回边缘颜色,不用再进行之后的运算。以此类推,后面又用if else分别进行了高光、亮光、暗光区的判断。但是这种优化对于GPU编程来讲是无效的。在片段着色器中,每个片段处理器的每条指令同时操作上百个像素,如果有些片段(像素)采取一个分支而另一些片段采取另一个分支,则所有片段都会执行两个分支,只是每个片段仅在自己应该采取的分支上写入寄存器。另外,if/endif等流程控制操作有较高的开销(4个时钟周期,GeForce 6)。因此在GPU编程中,不推荐使用if else、switch case等条件语句和太复杂的逻辑。相应地,可以用step()等函数进行替换,用阶梯函数的思维来构建条件语句。这样,所有的线程都执行完全一样的代码,在很多方面对GPU都是有益的。
优化后的Shader:
上面Shader的Step()函数版本:
// Toon shading, branch-free version.
// Every classification is turned into a 0/1 mask with step() and folded into
// one final colour expression, so the fragment function has no if-statements.
Shader "Unlit/NewToonShading_StepVersion"
{
    Properties
    {
        _Shininess("Shininess",float)=1
        _Edge("Edge Scale",range(0,1))=0.2
        _FinalColor("Final Color",Color)=(0.5,0.5,0.5,1)
        _EdgeColor("Edge Color",Color)=(0,0,0,1)
    }
    SubShader
    {
        Tags { "RenderType"="Opaque" }
        LOD 100

        Pass
        {
            Tags { "LightMode"="Vertex" }

            CGPROGRAM
            #pragma vertex vert
            #pragma fragment frag
            #include "UnityCG.cginc"

            struct appdata
            {
                float4 vertex : POSITION;
                float3 normal : NORMAL;
            };

            // All direction vectors are passed in world space.
            struct v2f
            {
                float4 vertex : SV_POSITION;
                float3 N : TEXCOORD0;   // surface normal
                float3 L : TEXCOORD1;   // surface -> light
                float3 H : TEXCOORD2;   // half vector between L and V
                float3 V : TEXCOORD3;   // surface -> camera
            };

            float _Shininess;
            float _Edge;
            float4 _FinalColor;
            float4 _EdgeColor;
            float4 _LightPosition_World;   // declared but not read by this pass

            // Builds the world-space N, L, V and H vectors for the fragment stage.
            v2f vert (appdata v)
            {
                v2f o = (v2f)0;
                float4 worldPos = mul(unity_ObjectToWorld, v.vertex);

                // unity_LightPosition[] is stored in view space; the inverse view
                // matrix brings it to world space.
                // NOTE(review): this reads light slot 1, not slot 0 - confirm that
                // is the intended light.
                float4 lightPos_World = mul(UNITY_MATRIX_I_V, unity_LightPosition[1]);

                // Normals need the inverse-transpose of the model matrix, otherwise
                // they skew under non-uniform scale.
                o.N = UnityObjectToWorldNormal(v.normal);

                // w is 0 for directional lights (xyz is then already a direction)
                // and 1 for point/spot lights, so one expression covers both.
                o.L = normalize(lightPos_World.xyz - worldPos.xyz * lightPos_World.w);
                o.V = normalize(_WorldSpaceCameraPos - worldPos.xyz);
                o.H = normalize(o.L + o.V);
                o.vertex = UnityObjectToClipPos(v.vertex);
                return o;
            }

            // Computes the toon colour without branching: each test yields a 0/1
            // mask that selects its term in the final expression.
            fixed4 frag (v2f i) : SV_Target
            {
                // Renormalize the vectors received from the vertex stage.
                i.N = normalize(i.N);
                i.L = normalize(i.L);
                i.H = normalize(i.H);
                i.V = normalize(i.V);

                float4 Kd = _FinalColor;
                float4 Ks = 0;
                fixed4 col;

                // Edge test: edge becomes 1 when dot(N, V) <= _Edge, otherwise 0.
                float edge = max(dot(i.N, i.V), 0);
                edge = step(edge, _Edge);
                _EdgeColor *= edge;   // edge colour on edge pixels, zero elsewhere

                // Highlight test: 1 when the specular term is >= 0.95, otherwise 0.
                float specularLight = pow(max(dot(i.N, i.H), 0), _Shininess);
                specularLight = step(0.95f, specularLight);

                // Dark-region test: 1 when dot(N, L) >= 0.1, otherwise 0.
                float diffuseLight = max(dot(i.N, i.L), 0);
                diffuseLight = step(0.1f, diffuseLight);

                // The highlight only survives where the pixel is also lit.
                Ks = specularLight * diffuseLight;
                // Remap the 0/1 mask to a 0.5/1 brightness factor.
                diffuseLight = diffuseLight * 0.5f + 0.5f;

                // Kd*diffuseLight: 0.5*Kd or Kd;  Ks: 1 or 0;
                // (1 - edge): 1 or 0 (edge: 0 or 1);  _EdgeColor: 0 or the edge colour.
                col = (Kd * diffuseLight + Ks) * (1.0f - edge) + _EdgeColor;
                return col;
            }
            ENDCG
        }
    }
}
举例解释:
在HLSL中,step(a,b)即是当b>=a时返回1,否则返回0;换句话说,即是当a<=b时返回1,否则返回0。因此可以把被比较数灵活地放在a或b的位置,完成小于或大于的比较。由于返回值是0或1,它无法直接替代if else逻辑判断,但是可以通过改造算法完成,例如:
// Edge test: return the edge colour immediately for edge pixels.
float edge = max(dot(i.N, i.V), 0);
if (edge < _Edge)
{
    return _EdgeColor;
}
上文中直接返回的_EdgeColor,在下文中将变为一个值为(0,0,0)或保持自身值的rgb变量;edge会变为0或1,并在最后的计算步骤中参与最终颜色的计算:
// Edge test, branch-free: edge becomes a 0/1 mask.
float edge = max(dot(i.N, i.V), 0);
edge = step(edge, _Edge);   // 1 when edge <= _Edge, otherwise 0
_EdgeColor *= edge;         // edge colour on edge pixels, zero elsewhere
// ... intermediate steps omitted ...
// Kd*diffuseLight: 0.5*Kd or Kd;  Ks: 1 or 0;
// (1 - edge): 1 or 0 (edge: 0 or 1);  _EdgeColor: 0 or the edge colour.
col = (Kd * diffuseLight + Ks) * (1.0f - edge) + _EdgeColor;
如果此像素为边缘,edge为1,那么在最终颜色计算中,不论其他变量如何,结果都会变为0+_EdgeColor,即边缘颜色。如果此像素为非边缘地带,edge为0,_EdgeColor为0,那么最终颜色为“其他颜色”*1+0,边缘颜色被剔除。
以此类推,原版中高光,亮光与暗光区域判断的返回值也都变成了变量放入最终颜色计算中。具体推理分析请借助step()版本各行后面注释。
测试
两个版本的FPS基本相同,只有小幅波动。原因之一可能是计算量太小,未构成性能瓶颈;原因之二可能是step版本虽然省去了3个if判断,但是增加了3个step函数以及几次计算,相互抵消后优化效果过于微弱。
汇编版本:
汇编后的片段着色器代码(部分截取):
if else版本:
 0: dp3 r0.x, v1.xyzx, v1.xyzx
 1: rsq r0.x, r0.x
 2: mul r0.xyz, r0.xxxx, v1.xyzx
 3: dp3 r0.w, v4.xyzx, v4.xyzx
 4: rsq r0.w, r0.w
 5: mul r1.xyz, r0.wwww, v4.xyzx
 6: dp3 r0.w, r0.xyzx, r1.xyzx
 7: max r0.w, r0.w, l(0.000000)
 8: lt r0.w, r0.w, cb0[2].y
 9: if_nz r0.w
10:   mov o0.xyzw, cb0[4].xyzw
11:   ret
12: endif
13: dp3 r0.w, v2.xyzx, v2.xyzx
14: rsq r0.w, r0.w
15: mul r1.xyz, r0.wwww, v2.xyzx
16: dp3 r0.w, r0.xyzx, r1.xyzx
17: max r0.w, r0.w, l(0.000000)
18: ge r0.w, l(0.100000), r0.w
19: if_nz r0.w
20:   mul o0.xyzw, cb0[3].xyzw, l(0.500000, 0.500000, 0.500000, 0.500000)
21:   ret
22: endif
23: dp3 r0.w, v3.xyzx, v3.xyzx
24: rsq r0.w, r0.w
25: mul r1.xyz, r0.wwww, v3.xyzx
26: dp3 r0.x, r0.xyzx, r1.xyzx
27: max r0.x, r0.x, l(0.000000)
28: log r0.x, r0.x
29: mul r0.x, r0.x, cb0[2].x
30: exp r0.x, r0.x
31: ge r0.x, r0.x, l(0.950000)
32: and r0.xyzw, r0.xxxx, l(0x3f800000, 0x3f800000, 0x3f800000, 0)
33: add o0.xyzw, r0.xyzw, cb0[3].xyzw
34: ret
step()版本:
 0: dp3 r0.x, v3.xyzx, v3.xyzx
 1: rsq r0.x, r0.x
 2: mul r0.xyz, r0.xxxx, v3.xyzx
 3: dp3 r0.w, v1.xyzx, v1.xyzx
 4: rsq r0.w, r0.w
 5: mul r1.xyz, r0.wwww, v1.xyzx
 6: dp3 r0.x, r1.xyzx, r0.xyzx
 7: max r0.x, r0.x, l(0.000000)
 8: log r0.x, r0.x
 9: mul r0.x, r0.x, cb0[2].x
10: exp r0.x, r0.x
11: ge r0.x, r0.x, l(0.950000)
12: dp3 r0.y, v2.xyzx, v2.xyzx
13: rsq r0.y, r0.y
14: mul r0.yzw, r0.yyyy, v2.xxyz
15: dp3 r0.y, r1.xyzx, r0.yzwy
16: max r0.y, r0.y, l(0.000000)
17: ge r0.y, r0.y, l(0.100000)
18: and r0.xz, r0.xxyx, l(0x3f800000, 0, 0x3f800000, 0)
19: movc r0.y, r0.y, l(1.000000), l(0.500000)
20: mul r0.x, r0.z, r0.x
21: mad r0.xyzw, cb0[3].xyzw, r0.yyyy, r0.xxxx
22: dp3 r1.w, v4.xyzx, v4.xyzx
23: rsq r1.w, r1.w
24: mul r2.xyz, r1.wwww, v4.xyzx
25: dp3 r1.x, r1.xyzx, r2.xyzx
26: max r1.x, r1.x, l(0.000000)
27: ge r1.x, cb0[2].y, r1.x
28: movc r0.xyzw, r1.xxxx, l(0, 0, 0, 0), r0.xyzw
29: and r1.x, r1.x, l(0x3f800000)
30: mad o0.xyzw, cb0[4].xyzw, r1.xxxx, r0.xyzw
31: ret
来自:https://blog.csdn.net/liu_if_else/article/details/77455639