Click to See Complete Forum and Search --> : I don't see what I am doing wrong...


JamesSchumacher
May 2nd, 2008, 05:42 PM
__declspec(naked) void __stdcall MemCopy(void * lpDest,const void * lpSource,unsigned long dwSize)
{
#define _lpDest 8
#define _lpSource 12
#define _dwSize 16

_asm
{
push esi
push edi

mov esi,DWORD PTR _lpSource[ebp]
mov edi,DWORD PTR _lpDest[ebp]
mov eax,DWORD PTR _dwSize[ebp]
mov ecx,esi
add ecx,eax

CopyLoop:
mov al,BYTE PTR[esi]
mov BYTE PTR[edi],al
inc esi
inc edi
cmp esi,ecx
jne CopyLoop

pop edi
pop esi
xor eax,eax
ret 12
}

#undef _lpDest
#undef _lpSource
#undef _dwSize
}


What is wrong with that code?

S_M_A
May 2nd, 2008, 05:50 PM
What are we looking for, i.e what are you trying to acheive and what's going wrong?

JamesSchumacher
May 2nd, 2008, 06:17 PM
This one I solved myself. For some reason, this code works and the other does not.


__declspec(naked) void __stdcall MemCopy(void * lpDest,const void * lpSource,unsigned long dwSize)
{
#define _lpDest 8
#define _lpSource 12
#define _dwSize 16

_asm
{
push ebp
mov ebp,esp

push ebx
mov ecx,DWORD PTR _lpDest[ebp]
mov edx,DWORD PTR _lpSource[ebp]
mov ebx,ecx
add ebx,DWORD PTR _dwSize[ebp]
CopyLoop:
mov al,BYTE PTR[edx]
mov BYTE PTR[ecx],al
inc edx
inc ecx
cmp ecx,ebx
jne CopyLoop

pop ebx
pop ebp
xor eax,eax
ret 12
}

#undef _lpDest
#undef _lpSource
#undef _dwSize
}


Although, unless I add some push/pops for EBX in the loop, longer operations cause a crash.


__declspec(naked) void __stdcall MemCopy(void * lpDest,const void * lpSource,unsigned long dwSize)
{
#define _lpDest 8
#define _lpSource 12
#define _dwSize 16
#define _dwEnd 4

_asm
{
push ebp
mov ebp,esp

mov esi,DWORD PTR _lpSource[ebp]
mov edi,DWORD PTR _lpDest[ebp]
mov ecx,DWORD PTR _dwSize[ebp]
rep movsb

pop ebp
ret 12
}

#undef _lpDest
#undef _lpSource
#undef _dwSize
}


I know about that. I was just trying to get better speed. (Some cases, it was running much faster in the prior code)

TheCPUWizard
May 2nd, 2008, 06:26 PM
Two quick (hopefully helpful) observations...

1) Are you sure that NO context switches are occuring while the loop is running? IIRC, there are cases where the ESI and/or EDI registers may be impacted... This could account fo rthe difference

2) Any reason why you are bot using the build in REP... instructions? In many cases these micro instructions can provide significantly higher performance...

3) [hey I have always had trouble counting]... I am sure you ae aware (but other reader might not be) that this construct has issues if the source and destination overlap......

JamesSchumacher
May 2nd, 2008, 06:37 PM
See my edit of my previous post.

S_M_A
May 2nd, 2008, 08:32 PM
Please do not alter any code in your existing posts. Altering posts make thread comments invalid and also make the tread unuseful to new readers.