C++ 类 tom
如何在下面的 tom 类中得到正确的 B2[4][4]?
代码可以正常编译,但输出是错误的。预期的输出是 B1[4][4] 的转置。
How do I obtain the correct B2[4][4] in the tom class below?
The code compiles fine, but the output is wrong. The expected output is the transpose of B1[4][4].
//tom.h
// Include guard. The name deliberately avoids a leading underscore followed by
// an uppercase letter, because such identifiers are reserved to the implementation.
#ifndef TOM_H
#define TOM_H
// Small stateless class exposing a 4x4 matrix transpose for 16-bit integers.
class tom
{
public:
    // Constructs a tom object; the class declares no data members.
    tom();
    // Intended to write the transpose of the 4x4 matrix B1 into B2.
    // Both parameters are array parameters, so the function receives pointers
    // to the caller's first rows, not copies of the arrays.
    void transpose1(short B1[4][4],short B2[4][4]);
};
#endif // TOM_H
//tom.cpp
#include "stdafx.h"
#include "tom.h"
#include "emmintrin.h"
#include <iostream>
#include <iomanip>
using namespace std;
// Default constructor: tom declares no data members, so there is nothing to set up.
tom::tom() {}
// Intended to store the transpose of the 4x4 matrix B1 into B2 using MMX
// unpack instructions inside an inline-assembly block.
//
// NOTE(review): as written this does not produce the transpose. B1 and B2 are
// function parameters, so inside the function they are pointers to the
// caller's arrays. Using the names directly as memory operands
// (`movq mm1, B1`, `movq B2, mm1`) addresses the parameter slots themselves,
// not the arrays they point to. The pointer values have to be loaded into
// general-purpose registers first (e.g. `mov eax, B1`) and then dereferenced
// (e.g. `movq mm1, [eax]`).
void tom::transpose1(short B1[4][4],short B2[4][4])
{
__asm
{
// Meant to load the four 8-byte rows of B1; because B1 is a pointer
// parameter these operands read from the parameter area instead.
movq mm1, B1
movq mm2, B1+8
movq mm3, B1+16
movq mm4, B1+24
// Step one: interleave the low words of the row pairs, then combine the
// resulting doublewords to form the first two output rows.
punpcklwd mm1, mm2
punpcklwd mm3, mm4
// Keep a copy of mm1, since the next instruction overwrites it.
movq mm5, mm1
punpckldq mm1, mm3
punpckhdq mm5, mm3
// Meant to store output rows 0 and 1; these operands likewise target the
// parameter area rather than the caller's array.
movq B2, mm1
movq B2+8, mm5
// Reload the four source rows (mm1 and mm3 were overwritten above).
movq mm1, B1
movq mm2, B1+8
movq mm3, B1+16
movq mm4, B1+24
// Step two: same sequence on the high words, forming the last two output rows.
punpckhwd mm1, mm2
punpckhwd mm3, mm4
movq mm5, mm1
punpckldq mm1, mm3
punpckhdq mm5, mm3
// Meant to store output rows 2 and 3 (same addressing problem as above).
movq B2+16, mm1
movq B2+24, mm5
// Leave MMX state so that x87 floating-point code can run afterwards.
emms
}
}
//tommain.cpp
#include "stdafx.h"
#include "tom.h"
#include "emmintrin.h"
#include <iostream>
#include <iomanip>
using namespace std;
// Entry point: transposes a fixed 4x4 matrix with tom::transpose1 and prints
// the result, one row per line.
int _tmain(int argc, _TCHAR* argv[])
{
    // Source matrix holding 1..16 in row-major order.
    short B1[4][4] = {
        { 1,  2,  3,  4},
        { 5,  6,  7,  8},
        { 9, 10, 11, 12},
        {13, 14, 15, 16}
    };
    // Destination matrix; left uninitialised because transpose1 is expected to fill it.
    short B2[4][4];

    tom X;
    X.transpose1(B1, B2);

    // Each value is followed by a single space; each row ends with endl.
    for (int row = 0; row < 4; ++row)
    {
        const short* values = B2[row];
        for (int col = 0; col < 4; ++col)
        {
            cout << values[col] << " ";
        }
        cout << endl;
    }
    return 0;
}
请按如下方式修改 tom::transpose1:
Change your tom::transpose1 this way:
// Writes the transpose of the 4x4 matrix B1 into B2 using MMX unpack
// instructions (MSVC x86 inline assembler).
//
// B1 and B2 are parameters, i.e. pointers to the caller's arrays, so their
// values are first loaded into general-purpose registers and then
// dereferenced; using the names directly as memory operands would address the
// parameter slots instead of the arrays.
//
// All four source rows are loaded before anything is stored, so the result is
// also correct when B1 and B2 refer to the same array, and no row is loaded twice.
//
// In the comments below the source rows are a, b, c, d and x0..x3 are the four
// 16-bit elements of row x, lowest address first.
void tom::transpose1(short B1[4][4], short B2[4][4])
{
    __asm
    {
        mov eax, B1             // eax = address of the source matrix
        mov ecx, B2             // ecx = address of the destination matrix

        movq mm0, [eax]         // mm0 = a0 a1 a2 a3
        movq mm1, [eax+8]       // mm1 = b0 b1 b2 b3
        movq mm2, [eax+16]      // mm2 = c0 c1 c2 c3
        movq mm3, [eax+24]      // mm3 = d0 d1 d2 d3

        // Copies of rows a and c: the low-word unpack overwrites mm0 and mm2,
        // but the high-word unpack still needs the original rows.
        movq mm4, mm0
        movq mm5, mm2

        // Interleave 16-bit words of the row pairs (a,b) and (c,d).
        punpcklwd mm0, mm1      // mm0 = a0 b0 a1 b1
        punpcklwd mm2, mm3      // mm2 = c0 d0 c1 d1
        punpckhwd mm4, mm1      // mm4 = a2 b2 a3 b3
        punpckhwd mm5, mm3      // mm5 = c2 d2 c3 d3

        // mm1 and mm3 are no longer needed as source rows; reuse them as copies.
        movq mm1, mm0
        movq mm3, mm4

        // Interleave 32-bit doublewords to assemble the output rows.
        punpckldq mm0, mm2      // mm0 = a0 b0 c0 d0  (output row 0)
        punpckhdq mm1, mm2      // mm1 = a1 b1 c1 d1  (output row 1)
        punpckldq mm4, mm5      // mm4 = a2 b2 c2 d2  (output row 2)
        punpckhdq mm3, mm5      // mm3 = a3 b3 c3 d3  (output row 3)

        // Store only after every source row has been consumed.
        movq [ecx], mm0
        movq [ecx+8], mm1
        movq [ecx+16], mm4
        movq [ecx+24], mm3

        // Leave MMX state so that x87 floating-point code can run afterwards.
        emms
    }
}
使用内联汇编时应当非常小心:它有很多陷阱!
您复制并粘贴了上一篇帖子中的代码(http://www.codeproject.com/Questions/92909/Matrix-transpose.aspx),在那里 B1 和 B2 数组以及 asm 代码都位于 _tmain 内部。
有理由认为您所做的应该可行,但在幕后多了一层间接寻址,因为现在您是把数组作为参数传递给函数。因此,您应该先从堆栈中取得它们的地址,然后再解引用。
You should take care in what you do with inline assembly: it has a lot of pitfalls!
You copied and pasted the code from a previous post (http://www.codeproject.com/Questions/92909/Matrix-transpose.aspx), where both the B1 and B2 arrays and the asm code were inside _tmain.
It is reasonable to think that what you have done should work, but behind the scenes there is an additional level of indirection, because now you are passing the arrays as parameters to a function. You therefore have to fetch their addresses from the stack and then dereference them.