求好手参照C过程用SSE2写个内联汇编函数
C代码如下:
DWORD * pSrc;
int nSize;
for(int i = 0; i < nSize; i ++)
{
if(*pSrc && *pSrc < 0x01000000)
*pSrc |= 0xff000000;
pSrc++;
}
请汇编高手用SSE2写一个上述过程的内联汇编代码
------解决思路----------------------
//-----------------------------------------------------------------------
// void __cdecl fun(void* src, int nsize)
//
// SSE2 version of the reference loop:
//     for each of the nsize DWORDs at src:
//         if (v != 0 && v < 0x01000000) v |= 0xff000000;
//
// Target: 32-bit x86, MSVC inline assembler (Intel syntax), __cdecl.
// In:     [esp+4] = src   (array of 32-bit values; no alignment required,
//                          unaligned loads/stores are used)
//         [esp+8] = nsize (element count; <= 0 means "do nothing")
// Out:    nothing; the array is modified in place.
// Clobb:  eax, ecx, edx, xmm2-xmm5, xmm7, flags.
// Stack:  none used - the function is naked and touches only registers
//         that the caller does not expect to be preserved.
//-----------------------------------------------------------------------
__declspec(naked)
void __cdecl fun(void* src, int nsize)
{
    __asm {
        mov     edx, [esp+4]            // edx = cursor into the array
        mov     ecx, [esp+8]            // ecx = nsize
        test    ecx, ecx
        jle     all_done                // nsize <= 0: same as the C loop, no work

        mov     eax, ecx
        and     ecx, 3                  // ecx = leftover DWORDs for the scalar tail (0..3)
        shr     eax, 2                  // eax = number of full 4-DWORD groups
        jz      scalar_tail             // fewer than 4 elements: tail only

        // Build the constants in registers so no aligned static data is needed.
        pcmpeqd xmm7, xmm7              // xmm7 = all ones
        pslld   xmm7, 24                // xmm7 = 0xFF000000 in every lane
        pxor    xmm5, xmm5              // xmm5 = 0 in every lane

        align 16
    sse_loop:
        movdqu  xmm2, [edx]             // xmm2 = four source values (kept for the final OR)
        movdqa  xmm3, xmm2
        movdqa  xmm4, xmm2
        pand    xmm3, xmm7              // keep only the top byte of each lane
        pcmpeqd xmm3, xmm5              // lane = FFFFFFFF where top byte == 0 (v < 0x01000000)
        pcmpeqd xmm4, xmm5              // lane = FFFFFFFF where v == 0
        pandn   xmm4, xmm3              // lane = FFFFFFFF where v != 0 && v < 0x01000000
        pand    xmm4, xmm7              // selected lanes -> 0xFF000000, others -> 0
        por     xmm2, xmm4              // set the top byte only in the selected lanes
        movdqu  [edx], xmm2             // write the group back
        add     edx, 16
        dec     eax
        jnz     sse_loop

        // Handle the remaining nsize % 4 elements one at a time.
    scalar_tail:
        test    ecx, ecx
        jz      all_done
    scalar_loop:
        mov     eax, [edx]
        test    eax, eax
        jz      scalar_next             // v == 0: leave untouched
        cmp     eax, 01000000h
        jae     scalar_next             // unsigned v >= 0x01000000: leave untouched
        or      eax, 0FF000000h         // test BEFORE modifying, then set the top byte
        mov     [edx], eax
    scalar_next:
        add     edx, 4
        dec     ecx
        jnz     scalar_loop

    all_done:
        ret
    }
}
------解决思路----------------------
我估计这代码要比C代码慢。
------解决思路----------------------
可以考虑使用intrinsics实现,这样不用考虑32位->64位的移植问题,另外算法还有优化的余地。