我正在尝试将 SSE float4 加法与标准 float4 加法进行比较。作为演示,我分别在使用和不使用 SSE 的情况下计算相加后各分量的总和:
#include <iostream>
#include <vector>
// Plain holder for four float components.
// A default-constructed Point4 has every component equal to 0.
struct Point4
{
    float data[4];

    // Value-initializing the array member zeroes all four components.
    Point4() : data() {}
};
// Scalar baseline: builds two Point4 values, then repeats the component-wise
// addition one billion times, folding every per-component sum straight into a
// single float accumulator. Prints the accumulator as "total: <value>".
void Standard()
{
    const float lhsValues[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    const float rhsValues[4] = {1.0f, 6.0f, 3.0f, 5.0f};

    Point4 a;
    Point4 b;
    for (unsigned int k = 0; k < 4; ++k)
    {
        a.data[k] = lhsValues[k];
        b.data[k] = rhsValues[k];
    }

    // NOTE(review): a single float accumulator presumably cannot represent the
    // running sum exactly at this iteration count - confirm before comparing
    // the printed value against other implementations.
    float total = 0.0f;
    unsigned int remaining = 1000000000u; // same count as the 1e9 loop bound
    while (remaining != 0)
    {
        --remaining;
        // Components are visited in index order 0..3 on every pass.
        for (unsigned int lane = 0; lane < 4; ++lane)
        {
            total += a.data[lane] + b.data[lane];
        }
    }

    std::cout << "total: " << total << std::endl;
}
// Vector-extension variant of the benchmark: adds two 4-float vectors one
// billion times, accumulating lane by lane into a vector, and only afterwards
// sums the four lanes into a scalar. Prints the result as "total: <value>".
void Vectorized()
{
    // GCC/Clang vector extension type holding four floats.
    typedef float v4sf __attribute__ (( vector_size(4*sizeof(float)) ));

    // Initialize and read lanes through the extension's own brace-init and
    // subscript syntax instead of reinterpreting the objects through a
    // C-style (float*) cast, which relied on type punning.
    v4sf a = {1.0f, 2.0f, 3.0f, 4.0f};
    v4sf b = {1.0f, 6.0f, 3.0f, 5.0f};
    v4sf result = {0.0f, 0.0f, 0.0f, 0.0f};

    // Integer bound (1e9) so the loop condition is not an unsigned-to-double
    // comparison on every iteration.
    const unsigned int iterations = 1000000000u;
    for (unsigned int i = 0; i < iterations; ++i)
    {
        result += a + b; // one vector add + one vector accumulate per pass
    }

    // Sum the lanes of the result (Standard() does this inside its loop with
    // "total +=").
    float total = 0.0f;
    for (unsigned int component = 0; component < 4; ++component)
    {
        total += result[component];
    }
    std::cout << "total: " << total << std::endl;
}
// Program entry point. Runs the Vectorized() benchmark; the Standard() call is
// commented out, so only one variant executes per run. Returns 0.
int main()
{
// Standard();
Vectorized();
return 0;
}然而,使用标准方法的代码似乎比矢量化方法快(~.2秒) (~.4秒)。是因为for循环对v4sf值求和吗?有没有更好的操作,我可以用来计算这两种技术之间的差异,并仍然比较输出,以确保这两种技术之间没有差异?
发布于 2012-08-30 05:07:30
你的 SSE 版本更慢的原因是:每次迭代都必须把数据从 SSE 寄存器解包到标量寄存器 4 次,这部分开销超过了矢量化加法带来的收益。看一下反汇编,你应该会有更清楚的认识。
我认为你想做的事情如下(使用SSE会更快):
// Fragment: result, a, b, total and resultPointer are not declared here.
// Accumulate a + b into result for 1e6 iterations.
for(unsigned int i = 0; i < 1e6; ++i)
{
result += a + b; // Vectorized operation
}
// After the loop, sum the four components of the result into total
// (the Standard() loop does this with its "total +=" on every iteration).
for(unsigned int component = 0; component < 4; ++component)
{
total += resultPointer[component];
}另外,下面的代码可能会更快:
// Fragment: result0..result3, resultPointer0..resultPointer3, a, b and total
// are not declared here.
// Runs 1e6/4 iterations, adding a + b into four separate accumulators per pass.
// NOTE(review): presumably this is meant to shorten the dependency chain on a
// single accumulator - confirm by measuring.
for(unsigned int i = 0; i < 1e6/4; ++i)
{
result0 += a + b; // Vectorized operation
result1 += a + b; // Vectorized operation
result2 += a + b; // Vectorized operation
result3 += a + b; // Vectorized operation
}
// Sum the components of all four accumulators into total
// (the Standard() loop does this with its "total +=" on every iteration).
for(unsigned int component = 0; component < 4; ++component)
{
total += resultPointer0[component];
total += resultPointer1[component];
total += resultPointer2[component];
total += resultPointer3[component];
}https://stackoverflow.com/questions/12186193
复制相似问题