上次比较队列性能,SGI-STL比我的高了一倍还多(是我的2.3倍,而且还是夹杂在一堆别的代码中测出来的,估计实际上3倍都挡不住),于是下决心改善代码质量。但这需要不断地积累,现在只能一点点地来——哎,什么时候才能看见大师的背影呢。就从短循环开始吧,短循环对CPU的开销非常大,在程序中出现的几率又比较高,因此对它的优化能带来效率的大幅提升。为了说明问题,做了如下的测试:(我把我写的Timer.h拷到Include目录里了)测试环境:C500、192MB RAM、Win2000 SP3、关掉其他前台程序
#include <iostream.h>
#include <Timer.h>
// Baseline for the unrolling benchmark: adds the integers 1..630000, one
// term per loop iteration.
//
// Returns the total. The true sum does not fit in 32 bits, so the value is
// reduced modulo 2^N (N = width of unsigned); using an unsigned accumulator
// makes that wrap-around well defined. Returning the value gives callers a
// way to consume it: when the result is unused, an optimizer may remove the
// whole loop and the measured time becomes meaningless. Callers that ignore
// the return value keep working unchanged.
unsigned sum1()
{
    unsigned total = 0;
    for (unsigned i = 1; i < 630001; i++) total += i;
    return total;
}
// Adds the integers 1..630000 with the loop body hand-unrolled 2 times.
// 630000 is a multiple of 2, so i reaches 630001 exactly at the end of an
// iteration and no term beyond 630000 is added.
//
// Returns the total modulo 2^N (N = width of unsigned); the unsigned
// accumulator keeps the wrap-around well defined. The value is returned so
// that callers can consume it and the loop is not dead code; callers that
// ignore it keep working unchanged.
unsigned sum2()
{
    unsigned total = 0;
    for (unsigned i = 1; i < 630001;)
    {
        total += i++;
        total += i++;
    }
    return total;
}
// Adds the integers 1..630000 with the loop body hand-unrolled 3 times.
// 630000 is a multiple of 3, so i reaches 630001 exactly at the end of an
// iteration and no term beyond 630000 is added.
//
// Returns the total modulo 2^N (N = width of unsigned); the unsigned
// accumulator keeps the wrap-around well defined. The value is returned so
// that callers can consume it and the loop is not dead code; callers that
// ignore it keep working unchanged.
unsigned sum3()
{
    unsigned total = 0;
    for (unsigned i = 1; i < 630001;)
    {
        total += i++;
        total += i++;
        total += i++;
    }
    return total;
}
// Adds the integers 1..630000 with the loop body hand-unrolled 4 times.
// 630000 is a multiple of 4, so i reaches 630001 exactly at the end of an
// iteration and no term beyond 630000 is added.
//
// Returns the total modulo 2^N (N = width of unsigned); the unsigned
// accumulator keeps the wrap-around well defined. The value is returned so
// that callers can consume it and the loop is not dead code; callers that
// ignore it keep working unchanged.
unsigned sum4()
{
    unsigned total = 0;
    for (unsigned i = 1; i < 630001;)
    {
        total += i++;
        total += i++;
        total += i++;
        total += i++;
    }
    return total;
}
// Adds the integers 1..630000 with the loop body hand-unrolled 5 times.
// 630000 is a multiple of 5, so i reaches 630001 exactly at the end of an
// iteration and no term beyond 630000 is added.
//
// Returns the total modulo 2^N (N = width of unsigned); the unsigned
// accumulator keeps the wrap-around well defined. The value is returned so
// that callers can consume it and the loop is not dead code; callers that
// ignore it keep working unchanged.
unsigned sum5()
{
    unsigned total = 0;
    for (unsigned i = 1; i < 630001;)
    {
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
    }
    return total;
}
// Adds the integers 1..630000 with the loop body hand-unrolled 6 times.
// 630000 is a multiple of 6, so i reaches 630001 exactly at the end of an
// iteration and no term beyond 630000 is added.
//
// Returns the total modulo 2^N (N = width of unsigned); the unsigned
// accumulator keeps the wrap-around well defined. The value is returned so
// that callers can consume it and the loop is not dead code; callers that
// ignore it keep working unchanged.
unsigned sum6()
{
    unsigned total = 0;
    for (unsigned i = 1; i < 630001;)
    {
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
    }
    return total;
}
// Adds the integers 1..630000 with the loop body hand-unrolled 7 times.
// 630000 is a multiple of 7, so i reaches 630001 exactly at the end of an
// iteration and no term beyond 630000 is added.
//
// Returns the total modulo 2^N (N = width of unsigned); the unsigned
// accumulator keeps the wrap-around well defined. The value is returned so
// that callers can consume it and the loop is not dead code; callers that
// ignore it keep working unchanged.
unsigned sum7()
{
    unsigned total = 0;
    for (unsigned i = 1; i < 630001;)
    {
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
    }
    return total;
}
// Adds the integers 1..630000 with the loop body hand-unrolled 8 times.
// 630000 is a multiple of 8, so i reaches 630001 exactly at the end of an
// iteration and no term beyond 630000 is added.
//
// Returns the total modulo 2^N (N = width of unsigned); the unsigned
// accumulator keeps the wrap-around well defined. The value is returned so
// that callers can consume it and the loop is not dead code; callers that
// ignore it keep working unchanged.
unsigned sum8()
{
    unsigned total = 0;
    for (unsigned i = 1; i < 630001;)
    {
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
    }
    return total;
}
// Adds the integers 1..630000 with the loop body hand-unrolled 9 times.
// 630000 is a multiple of 9, so i reaches 630001 exactly at the end of an
// iteration and no term beyond 630000 is added.
//
// Returns the total modulo 2^N (N = width of unsigned); the unsigned
// accumulator keeps the wrap-around well defined. The value is returned so
// that callers can consume it and the loop is not dead code; callers that
// ignore it keep working unchanged.
unsigned sum9()
{
    unsigned total = 0;
    for (unsigned i = 1; i < 630001;)
    {
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
    }
    return total;
}
// Adds the integers 1..630000 with the loop body hand-unrolled 10 times.
// 630000 is a multiple of 10, so i reaches 630001 exactly at the end of an
// iteration and no term beyond 630000 is added.
//
// Returns the total modulo 2^N (N = width of unsigned); the unsigned
// accumulator keeps the wrap-around well defined. The value is returned so
// that callers can consume it and the loop is not dead code; callers that
// ignore it keep working unchanged.
unsigned sum10()
{
    unsigned total = 0;
    for (unsigned i = 1; i < 630001;)
    {
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
        total += i++;
    }
    return total;
}
int main()
{
Timer timer; double t;
for (unsigned k = 0; k < 3; k++)
{
timer.Start();
sum1();
t = timer.GetTime();
cout <<"sum1: " << t << endl;
timer.Start();
sum2();
t = timer.GetTime();
cout <<"sum2: " << t << endl;
timer.Start();
sum3();
t = timer.GetTime();
cout <<"sum3: " << t << endl;
timer.Start();
sum4();
t = timer.GetTime();
cout <<"sum4: " << t << endl;
timer.Start();
sum5();
t = timer.GetTime();
cout <<"sum5: " << t << endl;
timer.Start();
sum6();
t = timer.GetTime();
cout <<"sum6: " << t << endl;
timer.Start();
sum7();
t = timer.GetTime();
cout <<"sum7: " << t << endl;
timer.Start();
sum8();
t = timer.GetTime();
cout <<"sum8: " << t << endl;
timer.Start();
sum9();
t = timer.GetTime();
cout <<"sum9: " << t << endl;
timer.Start();
sum10();
t = timer.GetTime();
cout <<"sum10: " << t << endl;
}
return 0;
}
Sum1 | Sum2 | Sum3 | Sum4 | Sum5 | Sum6 | Sum7 | Sum8 | Sum9 | Sum10 |
VC6 Release版本,生成文件大小57,344B,以下时间单位ms | |||||||||
3.83457 | 2.59251 | 1.936 | 1.68038 | 1.51304 | 1.81448 | 1.71642 | 1.71418 | 1.44795 | 1.49488 |
3.82311 | 2.51848 | 1.93628 | 1.71474 | 1.51276 | 1.75022 | 1.7167 | 1.77564 | 1.44711 | 1.42728 |
3.82479 | 2.51848 | 1.93544 | 1.7315 | 1.51304 | 1.74994 | 1.7167 | 1.81196 | 1.44739 | 1.49488 |
BCC32,生成文件大小141,312B,以下时间单位ms | |||||||||
2.57491 | 1.9645 | 1.936 | 1.75218 | 1.51276 | 1.75078 | 1.74827 | 1.41862 | 1.44851 | 1.43035 |
2.51987 | 1.92119 | 1.93656 | 1.68178 | 1.5136 | 1.75022 | 1.75022 | 1.41862 | 1.44795 | 1.42951 |
2.57491 | 1.89074 | 2.03154 | 1.6815 | 1.5122 | 1.75078 | 1.74715 | 1.90639 | 1.44795 | 1.4974 |
可以看出,对于连加短循环,展开确实能带来性能上的提升,在VC6中展开5项时的速度是不展开时的2.5倍;由于BCC32对短循环做了优化,提高不是很明显,只有约1.7倍——BCB的拥护者乐了,在这方面他们有福了,在不刻意优化的情况下,BCC32编译的代码的速度是VC6的1.5倍;如果大家都写优化代码,速度持平。
很奇怪最小代价展开的峰值是5项,而不是预计的4项,可能是第一次加法的耗时过多,这个谁能解释一下。另外,预计浮点运算的展开应该得到更大的性能提升,实际情况并不是这样(连1倍都没有提高),可能是我代码写得不好,谁有例子提供一下。
选用VC6的人,很多是因为它的编译器编译的C++代码是Win32下效率最高的——有些人只是用它的IDE环境,根本不用MFC。我不知道有没有能自动展开循环的优化器,如果没有,每次都要自己手工展开,太麻烦了。