Click to See Complete Forum and Search --> : What's wrong with my code conversion of SSEMemcpy Intel Version to GCC Inline version


whocaresit
March 9th, 2009, 06:04 PM
I just learned Assembly by yesterday midnight and managed to come up with this code:

The following is giving me SEG FAULT.


void memcpy2(void* dest, const void* src, const unsigned long size_t)
{
asm("mov %0, %%esi\n\t"
: :"r"(src)); //src pointer


asm("mov %0, %%edi\n\t"
:
: "r"(dest)); //dest pointer

asm("mov %0, %%ebx\n\t" //ebx is our counter
:
: "r"(size_t));

asm("shr $7, %ebx\n\t" //divide by 128 (8 * 128bit registers)

"loop_copy:\n\t"
"prefetchnta 128(%esi)\n\t" //SSE2 prefetch
"prefetchnta 160(%esi)\n\t"
"prefetchnta 192(%esi)\n\t"
"prefetchnta 224(%esi)\n\t"

"movdqa %xmm0, 0(%esi)\n\t" //move data from src to registers
"movdqa %xmm1, 16(%esi)\n\t"
"movdqa %xmm2, 32(%esi)\n\t"
"movdqa %xmm3, 48(%esi)\n\t"
"movdqa %xmm4, 64(%esi)\n\t"
"movdqa %xmm5, 80(%esi)\n\t"
"movdqa %xmm6, 96(%esi)\n\t"
"movdqa %xmm7, 112(%esi)\n\t"

"movntdq %xmm0, 0(%edi)\n\t" //move data from registers to dest
"movntdq %xmm1, 16(%edi)\n\t"
"movntdq %xmm2, 32(%edi)\n\t"
"movntdq %xmm3, 48(%edi)\n\t"
"movntdq %xmm4, 64(%edi)\n\t"
"movntdq %xmm5, 80(%edi)\n\t"
"movntdq %xmm6, 96(%edi)\n\t"
"movntdq %xmm7, 112(%edi)\n\t"

"add %esi, 128\n"
"add %edi, 128\n"
"dec %ebx\n"

"jnz loop_copy\n" //loop please
"loop_copy_end:\n"
);
}


Please help!

rxbagain
March 10th, 2009, 03:09 AM
GCC inline assembly takes the source and destination operands in reversed positions. In normal way (intel), the destination comes first before the source but for GCC the destination is placed as the last operand. For example if we do something like "EAX = EBX", in intel way, it would be "mov eax, ebx" but for gcc inline, it becomes "mov %ebx, %eax".

Here's the modification I made. Note also that immediate values should be preceded by "$" (e.g. "add $128, %esi" in the code).
void memcpy2(void* dest, const void* src, const unsigned long size_t)
{
asm("mov %0, %%esi\n\t"
: :"r"(src)); //src pointer


asm("mov %0, %%edi\n\t"
:
: "r"(dest)); //dest pointer

asm("mov %0, %%ebx\n\t" //ebx is our counter
:
: "r"(size_t));

asm("shr $7, %ebx\n\t" //divide by 128 (8 * 128bit registers)

"loop_copy:\n\t"
"prefetchnta 128(%esi)\n\t" //SSE2 prefetch
"prefetchnta 160(%esi)\n\t"
"prefetchnta 192(%esi)\n\t"
"prefetchnta 224(%esi)\n\t"

"movdqa 0(%esi), %xmm0\n\t" //move data from src to registers
"movdqa 16(%esi), %xmm1\n\t"
"movdqa 32(%esi), %xmm2\n\t"
"movdqa 48(%esi), %xmm3\n\t"
"movdqa 64(%esi), %xmm4\n\t"
"movdqa 80(%esi), %xmm5\n\t"
"movdqa 96(%esi), %xmm6\n\t"
"movdqa 112(%esi), %xmm7\n\t"

"movntdq %xmm0, 0(%edi)\n\t" //move data from registers to dest
"movntdq %xmm1, 16(%edi)\n\t"
"movntdq %xmm2, 32(%edi)\n\t"
"movntdq %xmm3, 48(%edi)\n\t"
"movntdq %xmm4, 64(%edi)\n\t"
"movntdq %xmm5, 80(%edi)\n\t"
"movntdq %xmm6, 96(%edi)\n\t"
"movntdq %xmm7, 112(%edi)\n\t"

"add $128, %esi\n"
"add $128, %edi\n"
"dec %ebx\n"

"jnz loop_copy\n" //loop please
"loop_copy_end:\n"
);
}
Hope it will help you :)

whocaresit
March 10th, 2009, 06:10 PM
I am trying to use it as a memcpy replacement; but still, I am getting STATUS access Violation error / sigfault.


void memcpy2(void* dest, const void* src, const unsigned long size_t)

if (size_t >= 128)
{
asm("mov %0, %%esi\n\t"
: :"r"(src)); //src pointer


asm("mov %0, %%edi\n\t"
:
: "r"(dest)); //dest pointer

asm("mov %0, %%ebx\n\t" //ebx is our counter
:
: "r"(size_t));

asm("shr $7, %ebx\n\t" //divide by 128 (8 * 128bit registers)

"loop_copy:\n\t"
"prefetchnta 128(%esi)\n\t" //SSE2 prefetch
"prefetchnta 160(%esi)\n\t"
"prefetchnta 192(%esi)\n\t"
"prefetchnta 224(%esi)\n\t"

"movdqa 0(%esi), %xmm0\n\t" //move data from src to registers
"movdqa 16(%esi), %xmm1\n\t"
"movdqa 32(%esi), %xmm2\n\t"
"movdqa 48(%esi), %xmm3\n\t"
"movdqa 64(%esi), %xmm4\n\t"
"movdqa 80(%esi), %xmm5\n\t"
"movdqa 96(%esi), %xmm6\n\t"
"movdqa 112(%esi), %xmm7\n\t"

"movntdq %xmm0, 0(%edi)\n\t" //move data from registers to dest
"movntdq %xmm1, 16(%edi)\n\t"
"movntdq %xmm2, 32(%edi)\n\t"
"movntdq %xmm3, 48(%edi)\n\t"
"movntdq %xmm4, 64(%edi)\n\t"
"movntdq %xmm5, 80(%edi)\n\t"
"movntdq %xmm6, 96(%edi)\n\t"
"movntdq %xmm7, 112(%edi)\n\t"

"add $128, %esi\n"
"add $128, %edi\n"
"dec %ebx\n"

"jnz loop_copy\n" //loop please
"loop_copy_end:\n"
);
}
else
{
memcpy(dest,src,size_t);
}
}



Any ideas on fix?

rxbagain
March 10th, 2009, 06:17 PM
How are you calling the function? As you can see in the code, it copies by 128-byte blocks. You might be passing a pointer that can result to overflow or maybe the pointers are not aligned.

whocaresit
March 10th, 2009, 06:30 PM
How are you calling the function? As you can see in the code, it copies by 128-byte blocks. You might be passing a pointer that can result to overflow or maybe the pointers are not aligned.

Is it possible to force alignment upon data, incase it not already aligned?
If so how?

rxbagain
March 10th, 2009, 06:43 PM
There can be 2 cases when your pointers are not aligned. The first one are those that can be aligned and the other one are those that cannot.

With the first one, what you can do is to check the pointer first if you can align them. For example, if source address is 0x00400001 and destination address is 0x00600001, you can align them by processing the first 7 bytes with memcpy (fixing their addresses to 0x00400008 and 0x00600008) and then use your memcpy2 for the succeeding data. Make sure that the last part should be taken care also by making sure it does not overflow. It would be like a 3 steps process

1. if not aligned, copy the first several bytes using memcpy
2. copy using memcpy2 (but make sure it will not overflow)
3. copy the last bytes (if there is any) using memcpy

An example when it cannot be aligned is when the source and destination cannot be synchronized. For example source is 0x00400001 and destination is 0x00400002. We cannot make them align at the same time since making the other aligned will misalign the other. In this case we have no option than simply to use memcpy.

If you can't solve it, I'll send you some code when I get home :)

Edit: I have to correct myself. I was wrong when I gave the implication on my pointer sample that the alignment is 8 bytes. It should be 16 bytes. Sorry guys :)

rxbagain
March 10th, 2009, 11:25 PM
Hi whocaresit,

Here's the code changes I made.


void memcpy2(void* dest, const void* src, const unsigned long size_t)
{
unsigned long size1, size2 = 0, size3;
// can we synchronize src and dst pointers?
if (((reinterpret_cast<unsigned long>(dest) ^ reinterpret_cast<unsigned long>(src)) & 0xf) == 0) {
// if yes, compute the part1 and part 2 - part 2 is the count of 128 byte block
size1 = (0x10 - (reinterpret_cast<long>(dest) & 0xf)) & 0xf;
size2 = (size_t - size1) & ~0x7F;
}

// if the size that can be copied by 128 blocks (size2) is > 0 then use fast copy
if (size2)
{
if (size1) memcpy(dest, src, size1);

asm ("push %esi\n\t"); // preserve ESI to the stack
asm ("push %edi\n\t"); // preserve EDI to the stack

asm("mov %0, %%esi\n\t"
: :"r"(src)); //src pointer

asm("mov %0, %%edi\n\t"
:
: "r"(dest)); //dest pointer

asm("mov %0, %%ebx\n\t" //ebx is our counter
:
: "r"(size2));

asm("add %0, %%esi\n\t" :: "r"(size1)); // add the copied bytes count to src pointer
asm("add %0, %%edi\n\t" :: "r"(size1)); // add the copied bytes count to dest pointer

asm("shr $7, %ebx\n\t" //divide by 128 (8 * 128bit registers)

"loop_copy:\n\t"
"prefetchnta 128(%esi)\n\t" //SSE2 prefetch
"prefetchnta 160(%esi)\n\t"
"prefetchnta 192(%esi)\n\t"
"prefetchnta 224(%esi)\n\t"

"movdqa 0(%esi), %xmm0\n\t" //move data from src to registers
"movdqa 16(%esi), %xmm1\n\t"
"movdqa 32(%esi), %xmm2\n\t"
"movdqa 48(%esi), %xmm3\n\t"
"movdqa 64(%esi), %xmm4\n\t"
"movdqa 80(%esi), %xmm5\n\t"
"movdqa 96(%esi), %xmm6\n\t"
"movdqa 112(%esi), %xmm7\n\t"

"movntdq %xmm0, 0(%edi)\n\t" //move data from registers to dest
"movntdq %xmm1, 16(%edi)\n\t"
"movntdq %xmm2, 32(%edi)\n\t"
"movntdq %xmm3, 48(%edi)\n\t"
"movntdq %xmm4, 64(%edi)\n\t"
"movntdq %xmm5, 80(%edi)\n\t"
"movntdq %xmm6, 96(%edi)\n\t"
"movntdq %xmm7, 112(%edi)\n\t"

"add $128, %esi\n"
"add $128, %edi\n"
"dec %ebx\n"

"jnz loop_copy\n" //loop please
"loop_copy_end:\n"
);

asm ("pop %edi\n\t"); // restore EDI
asm ("pop %esi\n\t"); // restore ESI

size3 = size_t - size2 - size1;
if (size3)
memcpy(&reinterpret_cast<char*>(dest)[size1 + size2],
&reinterpret_cast<const char*>(src)[size1 + size2], size3);
}
else
{
printf("memcpy used\n");
memcpy(dest,src,size_t);
}
}

whocaresit
March 11th, 2009, 09:14 AM
Hi whocaresit,

Here's the code changes I made.


void memcpy2(void* dest, const void* src, const unsigned long size_t)
{
unsigned long size1, size2 = 0, size3;
// can we synchronize src and dst pointers?
if (((reinterpret_cast<unsigned long>(dest) ^ reinterpret_cast<unsigned long>(src)) & 0xf) == 0) {
// if yes, compute the part1 and part 2 - part 2 is the count of 128 byte block
size1 = (0x10 - (reinterpret_cast<long>(dest) & 0xf)) & 0xf;
size2 = (size_t - size1) & ~0x7F;
}

// if the size that can be copied by 128 blocks (size2) is > 0 then use fast copy
if (size2)
{
if (size1) memcpy(dest, src, size1);

asm ("push %esi\n\t"); // preserve ESI to the stack
asm ("push %edi\n\t"); // preserve EDI to the stack

asm("mov %0, %%esi\n\t"
: :"r"(src)); //src pointer

asm("mov %0, %%edi\n\t"
:
: "r"(dest)); //dest pointer

asm("mov %0, %%ebx\n\t" //ebx is our counter
:
: "r"(size2));

asm("add %0, %%esi\n\t" :: "r"(size1)); // add the copied bytes count to src pointer
asm("add %0, %%edi\n\t" :: "r"(size1)); // add the copied bytes count to dest pointer

asm("shr $7, %ebx\n\t" //divide by 128 (8 * 128bit registers)

"loop_copy:\n\t"
"prefetchnta 128(%esi)\n\t" //SSE2 prefetch
"prefetchnta 160(%esi)\n\t"
"prefetchnta 192(%esi)\n\t"
"prefetchnta 224(%esi)\n\t"

"movdqa 0(%esi), %xmm0\n\t" //move data from src to registers
"movdqa 16(%esi), %xmm1\n\t"
"movdqa 32(%esi), %xmm2\n\t"
"movdqa 48(%esi), %xmm3\n\t"
"movdqa 64(%esi), %xmm4\n\t"
"movdqa 80(%esi), %xmm5\n\t"
"movdqa 96(%esi), %xmm6\n\t"
"movdqa 112(%esi), %xmm7\n\t"

"movntdq %xmm0, 0(%edi)\n\t" //move data from registers to dest
"movntdq %xmm1, 16(%edi)\n\t"
"movntdq %xmm2, 32(%edi)\n\t"
"movntdq %xmm3, 48(%edi)\n\t"
"movntdq %xmm4, 64(%edi)\n\t"
"movntdq %xmm5, 80(%edi)\n\t"
"movntdq %xmm6, 96(%edi)\n\t"
"movntdq %xmm7, 112(%edi)\n\t"

"add $128, %esi\n"
"add $128, %edi\n"
"dec %ebx\n"

"jnz loop_copy\n" //loop please
"loop_copy_end:\n"
);

asm ("pop %edi\n\t"); // restore EDI
asm ("pop %esi\n\t"); // restore ESI

size3 = size_t - size2 - size1;
if (size3)
memcpy(&reinterpret_cast<char*>(dest)[size1 + size2],
&reinterpret_cast<const char*>(src)[size1 + size2], size3);
}
else
{
printf("memcpy used\n");
memcpy(dest,src,size_t);
}
}


Seg fault... :\

---

Perhaps we are forgetting something? :confused:

THIS (http://www.koders.com/c/fidDED82105F7530C7CC14FEF782C5A791EE2134B24.aspx?s=cdef%3Aparser
) one works but the performance is worser than standard memcpy:

(SSE_Memcpy) function. I had to make some changes like

from = (const unsigned char *&)from + 64;
to = (unsigned char *&)to + 64; for it to be compatible with ISO C++.

rxbagain
March 11th, 2009, 09:46 AM
Maybe there are some other problems on the pointer you are passing to the function. The code is working fine in my testing.

How are you allocating the source and destination and how are you passing them?

Edit: For my testing I am using this code.
int main(int argc, char *argv[])
{
#define CPSIZE (1024 * 1024 * 256)

// pv1 and pv2 holds the real pointers. we need some extra memory in case
// the pointer are aligned in 8 byte boundary instead of 16
void *pv1 = malloc(CPSIZE + 8);
void *pv2 = malloc(CPSIZE + 8);
if (pv1 && pv2)
{
// use p1 and p2 to work with the pointers. we have to do some adjustments
// to align it in 16 byte boundary.
char* p1 = reinterpret_cast<char*>(pv1);
char* p2 = reinterpret_cast<char*>(pv2);
if (reinterpret_cast<int>(p1) & 0xf) p1 += 8;
if (reinterpret_cast<int>(p2) & 0xf) p2 += 8;
// at this point, the pointers p1 and p2 are aligned and we can now use memcpy2
// p1 and p2 holds the pointer we are working with
// pv1 and pv2 still holds the real allocated pointers (we use them in free())

printf("real = (0x%.8x, 0x%.8x) aligned = (0x%.8x, 0x%.8x)\n", pv1, pv2, p1, p2);

printf("initializing some data on p1... ");
for (int i = 0; i < (CPSIZE + 32); i++) p1[i] = (i % 10);
printf("done\n");

printf("copying p1 contents to p2 ...");
memcpy2(p2, p1, CPSIZE);
printf("done\n");

printf("copying count - 100 from p1 to p2 ...");
memcpy2(p2, p1, CPSIZE - 100);
printf("done\n");

printf("copying from &p1[101] to &p2[101] ...");
memcpy2(&p2[101], &p1[101], CPSIZE - 101);
printf("done\n");

printf("copying from &p1[17] to &p2[1] ...");
memcpy2(&p2[1], &p1[17], CPSIZE - 17);
printf("done\n");

printf("copying from &p1[101] to p2 ...");
memcpy2(p2, &p1[101], CPSIZE - 101);
printf("done\n");

printf("copying 128 bytes ...");
memcpy2(p2, p1, 128);
printf("done\n");

printf("copying 128 bytes (misaligned)...");
memcpy2(&p2[1], &p1[1], 128);
printf("done\n");

printf("copying 150 bytes ...");
memcpy2(p2, p1, 150);
printf("done\n");

printf("copying 150 bytes (misaligned)...");
memcpy2(&p2[1], &p1[1], 150);
printf("done\n");

printf("copying 150 bytes (misaligned and unsync)...");
memcpy2(&p2[1], &p1[2], 150);
printf("done\n");

printf("copying 30 bytes ...");
memcpy2(p2, p1, 30);
printf("done\n");

}
else fprintf(stderr, "out of memory\n");
if (pv1) free(pv1);
if (pv2) free(pv2);

return EXIT_SUCCESS;
}