Unity Shader:优化GPU代码--用step()代替if else等条件语句

发表于2018-10-25
评论0 8.1k浏览
普通的卡通着色Shader:

先看一个Shader,卡通着色。由于卡通着色需要对不同渲染区域进行判定,比较适合做案例来给大家介绍是怎么用step()代替if else等条件语句。
// Toon (cel) shading, branching version.
// The fragment stage classifies each pixel as rim, dark band, lit band or
// specular highlight using if/return statements.
Shader "Unlit/NewToonShading"
{
    Properties
    {
        // Exponent applied to dot(N,H) in the specular term.
        _Shininess("Shininess",float)=1
        // Pixels whose dot(N,V) is below this threshold are drawn as rim.
        _Edge("Edge Scale",range(0,1))=0.2
        // Base (diffuse) colour of the surface.
        _FinalColor("Final Color",Color)=(0.5,0.5,0.5,1)
        // Colour returned for rim pixels.
        _EdgeColor("Edge Color",Color)=(0,0,0,1)
    }
    SubShader
    {
        Tags { "RenderType"="Opaque"}
        LOD 100
        Pass
        {
            Tags {"LightMode"="Vertex" }
            CGPROGRAM
            #pragma vertex vert
            #pragma fragment frag
            #include "UnityCG.cginc"
            // Per-vertex input: object-space position and normal.
            struct appdata
            {
                float4 vertex : POSITION;
                float3 normal:NORMAL;
            };
            // Vertex-to-fragment data; N, L, H, V are world-space unit vectors
            // (re-normalized in frag after interpolation).
            struct v2f
            {
                float4 vertex : SV_POSITION;
                float3 N:TEXCOORD0;     // surface normal
                float3 L:TEXCOORD1;     // direction towards the light
                float3 H:TEXCOORD2;     // half vector between L and V
                float3 V:TEXCOORD3;     // direction towards the camera
            };
            float _Shininess;
            float _Edge;
            float4 _FinalColor;
            float4 _EdgeColor;
            float4 _LightPosition_World;    // declared but not referenced by vert/frag below
            // Builds the world-space lighting vectors and the clip-space position.
            v2f vert (appdata v)
            {
                v2f o=(v2f)0;
                float3 worldPos=mul(unity_ObjectToWorld,v.vertex).xyz;
                // unity_LightPosition is given in view space; bring it to world space.
                // NOTE(review): index 1 selects the second entry of the light array - confirm this is intended.
                float4 lightPos_World=mul(UNITY_MATRIX_I_V,unity_LightPosition[1]);
                // Normals must not be transformed like positions: the helper uses the
                // inverse-transpose so non-uniformly scaled objects stay correct.
                o.N=UnityObjectToWorldNormal(v.normal);
                // w is 1 for point/spot lights (position) and 0 for directional lights
                // (xyz already points towards the light), so this handles both cases.
                o.L=normalize(lightPos_World.xyz-worldPos*lightPos_World.w);
                o.V=normalize(_WorldSpaceCameraPos-worldPos);
                o.H=normalize(o.L+o.V);
                o.vertex = UnityObjectToClipPos(v.vertex);
                return o;
            }
            // Picks the toon band for this pixel with early returns:
            // rim -> _EdgeColor, dark -> 0.5*Kd, otherwise Kd plus an optional white highlight.
            fixed4 frag (v2f i) : SV_Target
            {
                // Interpolated vectors are no longer unit length.
                i.N=normalize(i.N);
                i.L=normalize(i.L);
                i.H=normalize(i.H);
                i.V=normalize(i.V);
                float4 Kd=_FinalColor;
                float4 Ks=0;
                fixed4 col;
                // Rim test: the normal is nearly perpendicular to the view direction.
                float edge=max(dot(i.N,i.V),0);
                if(edge<_Edge){
                    return _EdgeColor;
                }
                // Dark-band test.
                float diffuseLight=max(dot(i.N,i.L),0);
                if(diffuseLight<=0.1f){     // dark region
                    Kd*=0.5f;               // halve the base colour in the dark region
                    Ks=0;                   // no highlight; dot(N,L)<=0 means the light is behind the surface
                    col=Kd+Ks;
                    return col;
                }
                // Highlight test.
                float specularLight=pow(max(dot(i.N,i.H),0),_Shininess);
                if(specularLight>=0.95f){
                    Ks=float4(1.0f,1.0f,1.0f,0.0f);     // white highlight, alpha untouched
                }
                col=Kd+Ks;
                return col;
            }
            ENDCG
        }
    }
}
(上图:渲染结果)

优化的原理:

在片段着色器中,我以正常CPU编程的逻辑进行了优化,例如,if(edge<_Edge){return _EdgeColor;},如果此像素被判定为边缘,则直接返回边缘颜色,那么就不用再进行之后的运算了。以此类推,后面又用if else分别进行了高光,亮光,暗光区的判断。但是这种优化对于GPU编程来讲是无效的。在片段着色器中,每个片段处理器每条指令操作上百个像素,如果有些片段(像素)采取一个分支而另一些片段采取另一个分支,则所有片段都会执行两个分支,但只在每个片段应该采取的分支上写入寄存器。另外,if/endif等流程控制操作有较高的开销(4个时钟周期,Geforce6)。因此在GPU编程中,if else ,switch case等条件语句和太复杂的逻辑是不推荐的。相应的,可以用step()等函数进行替换,用阶梯函数的思维来构建条件语句。这样,所有的线程都执行完全一样的代码,在很多方面对GPU都是有益的。

优化后的Shader:

上面Shader的Step()函数版本:
// Toon (cel) shading, branch-free version.
// The fragment stage turns every band test into a 0/1 mask with step() and
// blends the candidate colours arithmetically instead of using if/return.
Shader "Unlit/NewToonShading_StepVersion"
{
    Properties
    {
        // Exponent applied to dot(N,H) in the specular term.
        _Shininess("Shininess",float)=1
        // Pixels whose dot(N,V) is at or below this threshold are drawn as rim.
        _Edge("Edge Scale",range(0,1))=0.2
        // Base (diffuse) colour of the surface.
        _FinalColor("Final Color",Color)=(0.5,0.5,0.5,1)
        // Colour used for rim pixels.
        _EdgeColor("Edge Color",Color)=(0,0,0,1)
    }
    SubShader
    {
        Tags { "RenderType"="Opaque"}
        LOD 100
        Pass
        {
            Tags {"LightMode"="Vertex" }
            CGPROGRAM
            #pragma vertex vert
            #pragma fragment frag
            #include "UnityCG.cginc"
            // Per-vertex input: object-space position and normal.
            struct appdata
            {
                float4 vertex : POSITION;
                float3 normal:NORMAL;
            };
            // Vertex-to-fragment data; N, L, H, V are world-space unit vectors
            // (re-normalized in frag after interpolation).
            struct v2f
            {
                float4 vertex : SV_POSITION;
                float3 N:TEXCOORD0;     // surface normal
                float3 L:TEXCOORD1;     // direction towards the light
                float3 H:TEXCOORD2;     // half vector between L and V
                float3 V:TEXCOORD3;     // direction towards the camera
            };
            float _Shininess;
            float _Edge;
            float4 _FinalColor;
            float4 _EdgeColor;
            float4 _LightPosition_World;    // declared but not referenced by vert/frag below
            // Builds the world-space lighting vectors and the clip-space position.
            v2f vert (appdata v)
            {
                v2f o=(v2f)0;
                float3 worldPos=mul(unity_ObjectToWorld,v.vertex).xyz;
                // unity_LightPosition is given in view space; bring it to world space.
                // NOTE(review): index 1 selects the second entry of the light array - confirm this is intended.
                float4 lightPos_World=mul(UNITY_MATRIX_I_V,unity_LightPosition[1]);
                // Normals must not be transformed like positions: the helper uses the
                // inverse-transpose so non-uniformly scaled objects stay correct.
                o.N=UnityObjectToWorldNormal(v.normal);
                // w is 1 for point/spot lights (position) and 0 for directional lights
                // (xyz already points towards the light), so this handles both cases.
                o.L=normalize(lightPos_World.xyz-worldPos*lightPos_World.w);
                o.V=normalize(_WorldSpaceCameraPos-worldPos);
                o.H=normalize(o.L+o.V);
                o.vertex = UnityObjectToClipPos(v.vertex);
                return o;
            }
            // Computes the same toon bands as a chain of if/return tests would, but
            // as arithmetic on 0/1 masks so every pixel executes the same instructions.
            fixed4 frag (v2f i) : SV_Target
            {
                // Interpolated vectors are no longer unit length.
                i.N=normalize(i.N);
                i.L=normalize(i.L);
                i.H=normalize(i.H);
                i.V=normalize(i.V);
                float4 Kd=_FinalColor;
                float4 Ks=0;
                fixed4 col;
                // Rim mask.
                float edge=max(dot(i.N,i.V),0);
                edge=step(edge,_Edge); // edge = 1 when edge<=_Edge, else 0 (a strict '<' test differs only at equality)
                // NOTE(review): this writes to a material uniform; presumably it only compiles because
                // the shader compiler tolerates modifying globals - a local copy would be more portable.
                _EdgeColor*=edge;
                // Highlight mask.
                float specularLight=pow(max(dot(i.N,i.H),0),_Shininess);
                specularLight=step(0.95f,specularLight);        // 1 when specularLight>=0.95, else 0
                // Lit/dark mask.
                float diffuseLight=max(dot(i.N,i.L),0);
                diffuseLight=step(0.1f,diffuseLight); // 1 when diffuseLight>=0.1 (lit), else 0 (dark)
                Ks=specularLight*diffuseLight;      // highlight only survives in the lit band; the scalar fills all four channels
                diffuseLight=diffuseLight*0.5f+0.5f;      // remap the mask: 1 -> 1 (full Kd), 0 -> 0.5 (half Kd)
                // Terms of the blend below:
                //   Kd*diffuseLight : 0.5*Kd or Kd
                //   Ks              : 0 or 1
                //   (1.0f-edge)     : 0 on the rim, 1 elsewhere
                //   _EdgeColor      : edge colour on the rim, 0 elsewhere
                col=(Kd*diffuseLight+Ks)*(1.0f-edge)+_EdgeColor;        
                return col;
            }
            ENDCG
        }
    }
}

举例解释:

在HLSL中, step(a,b)即当b>=a时返回1,否则返回0,换句话说即当a<=b时返回1,否则返回0。因此可以把被比较数灵活的插入a或b的位置,完成小于或大于的比较。由于返回值是0或1,它无法直接替代if else逻辑判断,但是可以通过改造算法完成,例如:
//边缘判定  
float edge=max(dot(i.N,i.V),0);  
if(edge<_Edge){  
    return _EdgeColor;  
}  

上文中,直接返回的_EdgeColor,将在下文中变为一个000或保持自身值的rgb变量,edge会变为0或1,并在最后的计算步骤中参与最终颜色的计算:
//边缘判定  
float edge=max(dot(i.N,i.V),0);  
edge=step(edge,_Edge); //if(edge<=_Edge) edge=1 , else edge=0  
_EdgeColor*=edge;  
//...中间过程略...  
//0.5Kd or Kd  1or0     1or0    0or1    0orEdgeColor  
col=(Kd*diffuseLight+Ks)*(1.0f-edge)+_EdgeColor;  
如果此像素为边缘,edge为1,那么在最终颜色计算中,不论其他变量如何,它都会变为一个0+_EdgeColor的值,即边缘颜色。如果此像素为非边缘地带,edge为0,_EdgeColor为0,那么最终颜色为 “其他颜色”*1+0,边缘颜色被剔除。

以此类推,原版中高光,亮光与暗光区域判断的返回值也都变成了变量放入最终颜色计算中。具体推理分析请借助step()版本各行后面注释。

测试


两个版本的FPS小幅波动基本相同, 第一可能是计算量太小,未对性能造成瓶颈。第二可能是step版本虽省去了3个if判断,但是增加了3个step函数以及几次计算,抵消后优化效果过于微弱。

汇编版本:

汇编后的片段着色器代码(部分截取):

if else版本:
0: dp3 r0.x, v1.xyzx, v1.xyzx
1: rsq r0.x, r0.x
2: mul r0.xyz, r0.xxxx, v1.xyzx
3: dp3 r0.w, v4.xyzx, v4.xyzx
4: rsq r0.w, r0.w
5: mul r1.xyz, r0.wwww, v4.xyzx
6: dp3 r0.w, r0.xyzx, r1.xyzx
7: max r0.w, r0.w, l(0.000000)
8: lt r0.w, r0.w, cb0[2].y
9: if_nz r0.w
10: mov o0.xyzw, cb0[4].xyzw
11: ret
12: endif
13: dp3 r0.w, v2.xyzx, v2.xyzx
14: rsq r0.w, r0.w
15: mul r1.xyz, r0.wwww, v2.xyzx
16: dp3 r0.w, r0.xyzx, r1.xyzx
17: max r0.w, r0.w, l(0.000000)
18: ge r0.w, l(0.100000), r0.w
19: if_nz r0.w
20: mul o0.xyzw, cb0[3].xyzw, l(0.500000, 0.500000, 0.500000, 0.500000)
21: ret
22: endif
23: dp3 r0.w, v3.xyzx, v3.xyzx
24: rsq r0.w, r0.w
25: mul r1.xyz, r0.wwww, v3.xyzx
26: dp3 r0.x, r0.xyzx, r1.xyzx
27: max r0.x, r0.x, l(0.000000)
28: log r0.x, r0.x
29: mul r0.x, r0.x, cb0[2].x
30: exp r0.x, r0.x
31: ge r0.x, r0.x, l(0.950000)
32: and r0.xyzw, r0.xxxx, l(0x3f800000, 0x3f800000, 0x3f800000, 0)
33: add o0.xyzw, r0.xyzw, cb0[3].xyzw
34: ret

step()版本:
0: dp3 r0.x, v3.xyzx, v3.xyzx
1: rsq r0.x, r0.x
2: mul r0.xyz, r0.xxxx, v3.xyzx
3: dp3 r0.w, v1.xyzx, v1.xyzx
4: rsq r0.w, r0.w
5: mul r1.xyz, r0.wwww, v1.xyzx
6: dp3 r0.x, r1.xyzx, r0.xyzx
7: max r0.x, r0.x, l(0.000000)
8: log r0.x, r0.x
9: mul r0.x, r0.x, cb0[2].x
10: exp r0.x, r0.x
11: ge r0.x, r0.x, l(0.950000)
12: dp3 r0.y, v2.xyzx, v2.xyzx
13: rsq r0.y, r0.y
14: mul r0.yzw, r0.yyyy, v2.xxyz
15: dp3 r0.y, r1.xyzx, r0.yzwy
16: max r0.y, r0.y, l(0.000000)
17: ge r0.y, r0.y, l(0.100000)
18: and r0.xz, r0.xxyx, l(0x3f800000, 0, 0x3f800000, 0)
19: movc r0.y, r0.y, l(1.000000), l(0.500000)
20: mul r0.x, r0.z, r0.x
21: mad r0.xyzw, cb0[3].xyzw, r0.yyyy, r0.xxxx
22: dp3 r1.w, v4.xyzx, v4.xyzx
23: rsq r1.w, r1.w
24: mul r2.xyz, r1.wwww, v4.xyzx
25: dp3 r1.x, r1.xyzx, r2.xyzx
26: max r1.x, r1.x, l(0.000000)
27: ge r1.x, cb0[2].y, r1.x
28: movc r0.xyzw, r1.xxxx, l(0, 0, 0, 0), r0.xyzw
29: and r1.x, r1.x, l(0x3f800000)
30: mad o0.xyzw, cb0[4].xyzw, r1.xxxx, r0.xyzw
31: ret
来自:https://blog.csdn.net/liu_if_else/article/details/77455639

如社区发表内容存在侵权行为,您可以点击这里查看侵权投诉指引