快速FastPosChar算法比目前算法快500%以上具有实用价值!解决方案

快速FastPosChar算法比目前算法快500%以上具有实用价值!
应群友cjc要求改写了一个poschar的快速算法.
实测CGPosChar比系统自带的Pos快40%-60%,
而CGPosCharSSE比系统自带的Pos快500%以上,这主要归功于SSE指令集。
SSE版本需要CPU支持SSE2,文末附带一个检测CPU是否支持SSE的函数。
Delphi(Pascal) code


SSE优化算法:
// Returns the 1-based position of SubChar inside the first Len bytes of
// SrcString, or 0 when it does not occur, SrcString is nil, or Len <= 0.
//   Order = True  : search from the start, return the first occurrence.
//   Order = False : search from the end, return the last occurrence.
// Uses SSE2 instructions (movd/pshufd/pcmpeqb/pmovmskb on XMM registers).
// Every load stays inside [SrcString, SrcString + Len): 16-byte blocks are
// compared only while a whole block fits, the remaining bytes one at a time.
// NOTE(review): the comparison is byte-wise, so this assumes Char is a
// single-byte character (AnsiChar / pre-Unicode Delphi) - confirm for the
// compiler version in use.
function CGPosCharSSE(SubChar: Char ; SrcString: PChar; Len: Integer; Order:Boolean=True): Integer;
                // SubChar -> AL; SrcString -> EDX; Len -> ECX  Order -> [ebp+8]
asm
      push   esi
      push   ebx
      test   ecx, ecx
      jle    @NotFound             // nothing to search for Len <= 0
      test   edx, edx
      jz     @NotFound             // nil buffer
      mov    bl, al                // BL keeps the byte for the scalar tail loops
      mov    ah, al
      movd     xmm1, eax
      pshuflw  xmm1, xmm1, 0       // replicate the low word (AL:AL) over the low qword
      pshufd   xmm1, xmm1, 0       // ... and over all 16 bytes of XMM1
      cmp    byte ptr [ebp+8], 0   // Order is one byte; upper bytes of the slot are undefined
      je     @Reverse              // False -> search backwards

    {--------------- forward search ------------------}
      xor    esi, esi              // ESI = offset of the next unsearched byte
    @OrderCmp:
      mov    eax, ecx
      sub    eax, esi              // EAX = bytes still unsearched
      cmp    eax, $10
      jl     @OrderTail            // less than one full block left
      movups   xmm0, [edx+esi]
      pcmpeqb  xmm0, xmm1
      pmovmskb eax, xmm0           // one mask bit per byte that equals SubChar
      test   eax, eax
      jnz    @OrderFound
      add    esi, $10
      jmp    @OrderCmp
    @OrderTail:
      cmp    esi, ecx
      jge    @NotFound
      cmp    bl, [edx+esi]
      je     @ByteFound
      inc    esi
      jmp    @OrderTail

    {--------------- reverse search ------------------}
    @Reverse:
      mov    esi, ecx              // ESI = end (exclusive) of the unsearched region
    @ReverseCmp:
      cmp    esi, $10
      jl     @ReverseTail          // less than one full block left at the front
      sub    esi, $10
      movups   xmm0, [edx+esi]
      pcmpeqb  xmm0, xmm1
      pmovmskb eax, xmm0
      test   eax, eax
      jz     @ReverseCmp
      bsr    eax, eax              // highest set bit = last match inside the block
      jmp    @SetRet
    @ReverseTail:
      test   esi, esi
      jle    @NotFound
      dec    esi
      cmp    bl, [edx+esi]
      jne    @ReverseTail

    @ByteFound:
      lea    eax, [esi+1]          // 0-based offset -> 1-based position
      jmp    @Exit
    @OrderFound:
      bsf    eax, eax              // lowest set bit = first match inside the block
    @SetRet:
      lea    eax, [eax+esi+1]      // block offset + index in block, 1-based
      jmp    @Exit
    @NotFound:
      xor    eax, eax
    @Exit:
      pop    ebx
      pop    esi
end;

常规优化算法:

// Returns the 1-based position of SubChar inside the first Len bytes of
// SrcString, or 0 when it does not occur, SrcString is nil, or Len <= 0.
//   Order = True  : search from the start, return the first occurrence.
//   Order = False : search from the end, return the last occurrence.
// Plain integer code: four bytes are tested per step by XOR-ing a dword with
// SubChar replicated into every byte and looking for a zero byte. Every load
// stays inside [SrcString, SrcString + Len): dwords are read only while four
// bytes fit, the remaining bytes are compared one at a time.
// NOTE(review): the comparison is byte-wise, so this assumes Char is a
// single-byte character (AnsiChar / pre-Unicode Delphi) - confirm for the
// compiler version in use.
function CGPosChar(SubChar: Char ; SrcString: PChar; Len: Integer; Order:Boolean=True): Integer;
                //  SubChar -> AL; SrcString -> EDX; Len -> ECX  Order -> [ebp+8]
asm
      push  esi
      push  ebx
      push  edi
      test  ecx, ecx
      jle   @NotFound              // nothing to search for Len <= 0
      test  edx, edx
      jz    @NotFound              // nil buffer
      mov   ah, al
      movzx ebx, ax                // EBX = 0000cccc (c = SubChar byte)
      shl   eax, $10
      or    ebx, eax               // EBX = cccccccc; BL alone is still SubChar
      cmp   byte ptr [ebp+8], 0    // Order is one byte; upper bytes of the slot are undefined
      je    @Reverse               // False -> search backwards

    {--------------- forward search ------------------}
      xor   esi, esi               // ESI = offset of the next unsearched byte
    @OrderCmp:
      mov   eax, ecx
      sub   eax, esi               // EAX = bytes still unsearched
      cmp   eax, 4
      jl    @OrderTail             // less than one full dword left
      mov   eax, [edx+esi]
      xor   eax, ebx               // matching bytes become $00
      lea   edi, [eax-$01010101]
      not   eax
      and   eax, edi
      and   eax, $80808080         // lowest flagged byte is always a real match,
      jnz   @OrderFound            // which is all the forward search needs
      add   esi, 4
      jmp   @OrderCmp
    @OrderTail:
      cmp   esi, ecx
      jge   @NotFound
      cmp   bl, [edx+esi]
      je    @ByteFound
      inc   esi
      jmp   @OrderTail

    {--------------- reverse search ------------------}
    @Reverse:
      mov   esi, ecx               // ESI = end (exclusive) of the unsearched region
    @ReverseCmp:
      cmp   esi, 4
      jl    @ReverseTail           // less than one full dword left at the front
      sub   esi, 4
      mov   eax, [edx+esi]
      xor   eax, ebx               // matching bytes become $00
      // Exact per-byte zero test. The subtract-based test used for the forward
      // search can also flag a $01 byte sitting above a real match (borrow
      // propagation), which would make BSR report the wrong byte.
      mov   edi, eax
      and   edi, $7F7F7F7F
      add   edi, $7F7F7F7F         // bit 7 set when the low 7 bits are non-zero; no carry between bytes
      or    eax, edi               // bit 7 set for every non-zero byte
      or    eax, $7F7F7F7F
      not   eax                    // $80 exactly in the bytes that were zero
      test  eax, eax               // NOT does not update the flags
      jz    @ReverseCmp
      bsr   eax, eax               // highest flagged byte = last match in the dword
      jmp   @SetRet
    @ReverseTail:
      test  esi, esi
      jle   @NotFound
      dec   esi
      cmp   bl, [edx+esi]
      jne   @ReverseTail

    @ByteFound:
      lea   eax, [esi+1]           // 0-based offset -> 1-based position
      jmp   @Exit
    @OrderFound:
      bsf   eax, eax               // lowest flagged byte = first match in the dword
    @SetRet:
      shr   eax, 3                 // bit index (7/15/23/31) -> byte index (0..3)
      lea   eax, [eax+esi+1]       // dword offset + byte index, 1-based
      jmp   @Exit
    @NotFound:
      xor   eax, eax
    @Exit:
      pop   edi
      pop   ebx
      pop   esi
end;

CPU SSE支持检测函数:

// Reports whether the CPU advertises one of the SSE instruction-set levels
// in CPUID leaf 1.
//   SupportFlag: 1 = SSE, 2 = SSE2, 3 = SSE3, 4 = SSSE3, 5 = SSE4.1, 6 = SSE4.2.
//   Returns False for any other SupportFlag value.
// SSE and SSE2 are reported in EDX; SSE3, SSSE3, SSE4.1 and SSE4.2 in ECX.
// NOTE(review): only the CPUID feature bits are examined; availability of the
// CPUID instruction itself and operating-system support are not checked.
function CheckSupportSSE(SupportFlag: Byte): Boolean;

  // CPUID leaf 1, EDX feature bits.
  function GetFeatureEDX: DWORD;
  asm
    push ebx          // CPUID overwrites EBX, which the routine must preserve
    mov  eax, 1
    cpuid
    mov  eax, edx     // function result
    pop  ebx
  end;

  // CPUID leaf 1, ECX feature bits.
  function GetFeatureECX: DWORD;
  asm
    push ebx          // CPUID overwrites EBX, which the routine must preserve
    mov  eax, 1
    cpuid
    mov  eax, ecx     // function result
    pop  ebx
  end;

const
  // Bit masks inside the respective feature register, indexed by SupportFlag.
  _EDX_FLAG: array[1..2] of DWORD = ($02000000, $04000000);                       // SSE, SSE2
  _ECX_FLAG: array[3..6] of DWORD = ($00000001, $00000200, $00080000, $00100000); // SSE3, SSSE3, SSE4.1, SSE4.2
begin
  case SupportFlag of
    1..2: Result := (GetFeatureEDX and _EDX_FLAG[SupportFlag]) <> 0;
    3..6: Result := (GetFeatureECX and _ECX_FLAG[SupportFlag]) <> 0;
  else
    Result := False;  // outside the documented 1..6 range
  end;
end;




------解决方案--------------------
膜拜一下汇编牛人
------解决方案--------------------