Optlab
实验目标
优化程序运行效率
大体分为:
- 有条件:1个整数加法,2个浮点数乘法、加法
- 无条件:1个整数加法,2个浮点数乘法、加法
设计思路
一、结合书上内容的尝试
代码移动
将循环中多次使用的数据用变量保存,尽可能使其存储在寄存器中
// Copy the row count into a local once, outside the loop, so the loop bound is a plain local variable.
int rows = lineorder_table_info.rows;
减少过程调用
将一些常用的指针,在循环外赋值给临时变量,方便从寄存器中使用
// Fetch each column's base pointer from the table once, before the loop,
// so the loop body indexes plain local pointers.
int *LO_QUANTITY = lineorder_table_info.table->lo_quantity;
int *LO_EXTENDEDPRICE = lineorder_table_info.table->lo_extendedprice;
int *ORDER_DATE = lineorder_table_info.table->lo_orderdate;
double *LO_DISCOUNT = lineorder_table_info.table->lo_discount;
double *LO_TAX = lineorder_table_info.table->lo_tax;
消除内存引用
原代码本身已经采用该策略,将求和结果保存在变量中而非采用指针不断引用内存
// Running totals over every row.
unsigned int quantity_sum = 0u;
double discount_total_price = 0.0;
double tax_discount_total_price = 0.0;
// Running totals over the rows that satisfy the order-date condition.
unsigned int quantity_sum_with_condition = 0u;
double discount_total_price_with_condition = 0.0;
double tax_discount_total_price_with_condition = 0.0;
以下为只使用了上述3种优化方法的代码
int rows = lineorder_table_info.rows;
int * LO_QUANTITY = lineorder_table_info.table->lo_quantity;
int * LO_EXTENDEDPRICE = lineorder_table_info.table -> lo_extendedprice;
int * ORDER_DATE = lineorder_table_info.table -> lo_orderdate;
double * LO_DISCOUNT = lineorder_table_info.table -> lo_discount;
double * LO_TAX = lineorder_table_info.table -> lo_tax;
for (int i = 0; i < rows; ++i) {
quantity_sum = quantity_sum + LO_QUANTITY[i];
discount_total_price = discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]);
tax_discount_total_price = tax_discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) *
(1 + LO_TAX[i] );
if (ORDER_DATE[i] <= limit_orderdate) {
quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i];
}
if (ORDER_DATE[i] <= limit_orderdate) {
discount_total_price_with_condition = discount_total_price_with_condition
+ LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]);
}
if (ORDER_DATE[i] <= limit_orderdate) {
tax_discount_total_price_with_condition = tax_discount_total_price_with_condition
+ LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) * (1 + LO_TAX[i] );
}
}
下图为该程序运行的结果
下图为原始query.cpp在服务器上的运行结果
在学校的服务器上运行的结果,使用的测试数据是自己生成后上传到服务器的
循环展开
循环展开并不是展开次数越多效率就越高,因此我分别尝试了 2×1、3×1、4×1 三种循环展开
从最终结果来看,3✖️1的循环展开效果最好
- 2✖️1
// 2x1 loop unrolling: two rows per iteration, one accumulator per aggregate.
// Additions stay in row order, so the double results match the sequential loop.
int rows = lineorder_table_info.rows;
int limit = rows - 1; // a full pair (i, i+1) fits only while i < rows - 1
int * LO_QUANTITY = lineorder_table_info.table->lo_quantity;
int * LO_EXTENDEDPRICE = lineorder_table_info.table -> lo_extendedprice;
int * ORDER_DATE = lineorder_table_info.table -> lo_orderdate;
double * LO_DISCOUNT = lineorder_table_info.table -> lo_discount;
double * LO_TAX = lineorder_table_info.table -> lo_tax;
// The index is declared outside the loops so the cleanup loop resumes exactly
// where the unrolled loop stopped. Restarting at `limit` would process a row
// twice whenever rows is odd.
int i = 0;
for (; i < limit; i = i + 2) {
    quantity_sum = quantity_sum + LO_QUANTITY[i] + LO_QUANTITY[i+1];
    discount_total_price = discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) +
        LO_EXTENDEDPRICE[i+1] * (1 - LO_DISCOUNT[i+1]);
    tax_discount_total_price = tax_discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) *
        (1 + LO_TAX[i]) + LO_EXTENDEDPRICE[i+1] * (1 - LO_DISCOUNT[i+1]) * (1 + LO_TAX[i+1]);
    if (ORDER_DATE[i] <= limit_orderdate) {
        quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i];
        discount_total_price_with_condition = discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]);
        tax_discount_total_price_with_condition = tax_discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) * (1 + LO_TAX[i]);
    }
    if (ORDER_DATE[i+1] <= limit_orderdate) {
        quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i+1];
        discount_total_price_with_condition = discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i+1] * (1 - LO_DISCOUNT[i+1]);
        tax_discount_total_price_with_condition = tax_discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i+1] * (1 - LO_DISCOUNT[i+1]) * (1 + LO_TAX[i+1]);
    }
}
// Cleanup: at most one leftover row.
for (; i < rows; i++) {
    quantity_sum = quantity_sum + LO_QUANTITY[i];
    discount_total_price = discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]);
    tax_discount_total_price = tax_discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) *
        (1 + LO_TAX[i]);
    if (ORDER_DATE[i] <= limit_orderdate) {
        quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i];
        discount_total_price_with_condition = discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]);
        tax_discount_total_price_with_condition = tax_discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) * (1 + LO_TAX[i]);
    }
}
2✖️1的运行结果
可以看出比之前原代码和不使用循环展开的优化的程序效率更高
- 3✖️1
// 3x1 loop unrolling: three rows per iteration, one accumulator per aggregate.
// The double additions stay in row order, so the results match the sequential loop.
int rows = lineorder_table_info.rows;
int limit = rows - 2; // a full triple (i, i+1, i+2) fits only while i < rows - 2
int * LO_QUANTITY = lineorder_table_info.table->lo_quantity;
int * LO_EXTENDEDPRICE = lineorder_table_info.table -> lo_extendedprice;
int * ORDER_DATE = lineorder_table_info.table -> lo_orderdate;
double * LO_DISCOUNT = lineorder_table_info.table -> lo_discount;
double * LO_TAX = lineorder_table_info.table -> lo_tax;
// The index is declared outside the loops so the cleanup loop resumes exactly
// where the unrolled loop stopped. Restarting at `limit` would process rows
// twice unless rows % 3 == 2.
int i = 0;
for (; i < limit; i = i + 3) {
    quantity_sum = quantity_sum + LO_QUANTITY[i] + ( LO_QUANTITY[i+1] + LO_QUANTITY[i+2] );
    discount_total_price = discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) +
        LO_EXTENDEDPRICE[i+1] * (1 - LO_DISCOUNT[i+1]) + LO_EXTENDEDPRICE[i+2] * (1 - LO_DISCOUNT[i+2]);
    tax_discount_total_price = tax_discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) *
        (1 + LO_TAX[i]) + LO_EXTENDEDPRICE[i+1] * (1 - LO_DISCOUNT[i+1]) * (1 + LO_TAX[i+1])
        + LO_EXTENDEDPRICE[i+2] * (1 - LO_DISCOUNT[i+2]) * (1 + LO_TAX[i+2]);
    if (ORDER_DATE[i] <= limit_orderdate) {
        quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i];
        discount_total_price_with_condition = discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]);
        tax_discount_total_price_with_condition = tax_discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) * (1 + LO_TAX[i]);
    }
    if (ORDER_DATE[i+1] <= limit_orderdate) {
        quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i+1];
        discount_total_price_with_condition = discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i+1] * (1 - LO_DISCOUNT[i+1]);
        tax_discount_total_price_with_condition = tax_discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i+1] * (1 - LO_DISCOUNT[i+1]) * (1 + LO_TAX[i+1]);
    }
    if (ORDER_DATE[i+2] <= limit_orderdate) {
        quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i+2];
        discount_total_price_with_condition = discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i+2] * (1 - LO_DISCOUNT[i+2]);
        tax_discount_total_price_with_condition = tax_discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i+2] * (1 - LO_DISCOUNT[i+2]) * (1 + LO_TAX[i+2]);
    }
}
// Cleanup: at most two leftover rows.
for (; i < rows; i++) {
    quantity_sum = quantity_sum + LO_QUANTITY[i];
    discount_total_price = discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]);
    tax_discount_total_price = tax_discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) *
        (1 + LO_TAX[i]);
    if (ORDER_DATE[i] <= limit_orderdate) {
        quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i];
        discount_total_price_with_condition = discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]);
        tax_discount_total_price_with_condition = tax_discount_total_price_with_condition
            + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) * (1 + LO_TAX[i]);
    }
}
下图为3✖️1循环展开的结果
可以看出3✖️1的循环展开是目前效果最好的优化
-
4✖️1
int rows = lineorder_table_info.rows; int limit = rows - 3; int * LO_QUANTITY = lineorder_table_info.table->lo_quantity; int * LO_EXTENDEDPRICE = lineorder_table_info.table -> lo_extendedprice; int * ORDER_DATE = lineorder_table_info.table -> lo_orderdate; double * LO_DISCOUNT = lineorder_table_info.table -> lo_discount; double * LO_TAX = lineorder_table_info.table -> lo_tax; for (int i = 0; i < limit; i = i + 4) { quantity_sum = quantity_sum + LO_QUANTITY[i] ; t1 += LO_QUANTITY[i+1]; t2 += LO_QUANTITY[i+2] ; t3 += LO_QUANTITY[i+3];//modified discount_total_price = discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]); te1 += LO_EXTENDEDPRICE[i+1] * (1 - LO_DISCOUNT[i+1]); te2 += LO_EXTENDEDPRICE[i+2] * (1 - LO_DISCOUNT[i+2]); te3 += LO_EXTENDEDPRICE[i+3] * (1 - LO_DISCOUNT[i+3]); // modified tax_discount_total_price = tax_discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) * (1 + LO_TAX[i] ) + LO_EXTENDEDPRICE[i+1] * (1 - LO_DISCOUNT[i+1]) * (1 + LO_TAX[i+1]) + LO_EXTENDEDPRICE[i+2] * (1 - LO_DISCOUNT[i+2]) * (1 + LO_TAX[i+2]) + LO_EXTENDEDPRICE[i+3] * (1 - LO_DISCOUNT[i+3]) * (1 + LO_TAX[i+3]);//modified if (ORDER_DATE[i] <= limit_orderdate) { quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i]; discount_total_price_with_condition = discount_total_price_with_condition + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]); tax_discount_total_price_with_condition = tax_discount_total_price_with_condition + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) * (1 + LO_TAX[i] ); } if (ORDER_DATE[i+1] <= limit_orderdate) { quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i+1]; discount_total_price_with_condition = discount_total_price_with_condition + LO_EXTENDEDPRICE[i+1] * (1 - LO_DISCOUNT[i+1]); tax_discount_total_price_with_condition = tax_discount_total_price_with_condition + LO_EXTENDEDPRICE[i+1] * (1 - LO_DISCOUNT[i+1]) * (1 + LO_TAX[i+1] ); } if (ORDER_DATE[i+2] <= limit_orderdate) { 
quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i+2]; discount_total_price_with_condition = discount_total_price_with_condition + LO_EXTENDEDPRICE[i+2] * (1 - LO_DISCOUNT[i+2]); tax_discount_total_price_with_condition = tax_discount_total_price_with_condition + LO_EXTENDEDPRICE[i+2] * (1 - LO_DISCOUNT[i+2]) * (1 + LO_TAX[i+2] ); } if (ORDER_DATE[i+3] <= limit_orderdate) { quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i+3]; discount_total_price_with_condition = discount_total_price_with_condition + LO_EXTENDEDPRICE[i+3] * (1 - LO_DISCOUNT[i+3]); tax_discount_total_price_with_condition = tax_discount_total_price_with_condition + LO_EXTENDEDPRICE[i+3] * (1 - LO_DISCOUNT[i+3]) * (1 + LO_TAX[i+3] ); } } for(int i = limit;i < rows;i++) { quantity_sum = quantity_sum + LO_QUANTITY[i]; discount_total_price = discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]); tax_discount_total_price = tax_discount_total_price + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) * (1 + LO_TAX[i] ); if (ORDER_DATE[i] <= limit_orderdate) { quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i]; discount_total_price_with_condition = discount_total_price_with_condition + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]); tax_discount_total_price_with_condition = tax_discount_total_price_with_condition + LO_EXTENDEDPRICE[i] * (1 - LO_DISCOUNT[i]) * (1 + LO_TAX[i] ); } } quantity_sum = quantity_sum + t1 + t2 + t3; discount_total_price = discount_total_price+ te1 + te2 + te3;
可以看出4✖️1的优化效果不如3✖️1
多个累积变量
-
对于 double 型数据来说,浮点加法不满足结合律:采用多个累积变量会改变各项的相加顺序,舍入误差随之不同,最终结果就可能与顺序计算的结果不一致,产生会扣 16 分的严重后果!所以不能采用多个累积变量的方法
-
在实际实验以后,多个累积变量也没有对运行时间有明显帮助
这是将4✖️1变成4✖️4以后的结果
重新结合变换
- 这个方法对于 double 类型数据来说也是一样:重新结合变换会改变浮点运算的顺序,导致答案错误
- 应用到int类型的数据上,并没有对于结果有什么明显区别
总结
综上所述,综合书上提到的各种优化方法,在使用代码移动、减少过程调用、消除内存引用的基础上,再进行 3×1 的循环展开,得到的结果最优
二、结合SIMD的尝试
#include <cstdio>
#include "dataload.h"
#include <immintrin.h>
#include <emmintrin.h>
// Inclusive upper bound compared against ORDER_DATE when accumulating the "with condition" sums.
#define limit_orderdate 19950630
// Number of rows handled per iteration of the outer SIMD loop in main.
#define block 12
// Path of the table file that main opens and passes to loadTable.
const char lineorder_name[] = "lineorder.tbl";
/*
 * Reads the processor time-stamp counter with the rdtsc instruction.
 * rdtsc delivers the counter in two halves (EAX = low, EDX = high);
 * the halves are combined into one 64-bit tick count.
 */
static __inline__ uint64_t curtick() {
    unsigned long low_half, high_half;
    __asm__ __volatile__ ("rdtsc" : "=a"(low_half), "=d"(high_half));
    return ((uint64_t) high_half << 32) | low_half;
}
/* Starts a measurement: stores the current tick count in *t. */
static __inline__ void startTimer(uint64_t *t) {
    const uint64_t now = curtick();
    *t = now;
}
/*
 * Ends a measurement: *t must hold the value written by startTimer and is
 * overwritten with the number of ticks elapsed since then.
 */
static __inline__ void stopTimer(uint64_t *t) {
    const uint64_t started_at = *t;
    *t = curtick() - started_at;
}
int main() {
table_info lineorder_table_info;
FILE * lineorder_file;
//load lineorder table from file
lineorder_file = fopen(lineorder_name,"r");
loadTable(lineorder_file, &lineorder_table_info);
unsigned int quantity_sum = 0;
double discount_total_price = 0;
double tax_discount_total_price = 0;
unsigned int quantity_sum_with_condition = 0;
double discount_total_price_with_condition = 0;
double tax_discount_total_price_with_condition = 0;
uint64_t beg;
startTimer(&beg);
//you should editor the following the part to accelerate the calculation
/*--------------------------------*/
int rows = lineorder_table_info.rows;
int * LO_QUANTITY = lineorder_table_info.table->lo_quantity;
int * LO_EXTENDEDPRICE = lineorder_table_info.table -> lo_extendedprice;
int * ORDER_DATE = lineorder_table_info.table -> lo_orderdate;
double * LO_DISCOUNT = lineorder_table_info.table -> lo_discount;
double * LO_TAX = lineorder_table_info.table -> lo_tax;
int cycle = rows;
int residule = cycle % block;
cycle = cycle - residule;
__m128i quantity_sum_0={0,0},quantity_sum_1={0,0},quantity_sum_2={0,0};
__m128i int_load_0,int_load_1,int_load_2;
__m256d discount_sum_0=_mm256_setzero_pd(),discount_sum_1=_mm256_setzero_pd(),discount_sum_2=_mm256_setzero_pd();
__m256d tax_sum_0=_mm256_setzero_pd(),tax_sum_1=_mm256_setzero_pd(),tax_sum_2=_mm256_setzero_pd();
__m256d double_load_0,double_load_1,double_load_2;
__m256d double_load_3,double_load_4,double_load_5;
for(int i = 0;i < cycle; i+=block)
{
for(int j = i; j < i+block;j++)
{
if(ORDER_DATE[j] <= limit_orderdate)
{
quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[j];
discount_total_price_with_condition = discount_total_price_with_condition + LO_EXTENDEDPRICE[j]*(1-LO_DISCOUNT[j]);
tax_discount_total_price_with_condition = tax_discount_total_price_with_condition + LO_EXTENDEDPRICE[j]*(1 - LO_DISCOUNT[j])*(1 + LO_TAX[j]);
}
LO_DISCOUNT[j] = LO_EXTENDEDPRICE[j] * (1 - LO_DISCOUNT[j]);
LO_TAX[j] = 1 + LO_TAX[j];
}
int_load_0=_mm_load_si128((__m128i*)(LO_QUANTITY+i));
int_load_1=_mm_load_si128((__m128i*)(LO_QUANTITY+i+4));
int_load_2=_mm_load_si128((__m128i*)(LO_QUANTITY+i+8));
quantity_sum_0=_mm_add_epi32(int_load_0,quantity_sum_0);
quantity_sum_1=_mm_add_epi32(int_load_1,quantity_sum_1);
quantity_sum_2=_mm_add_epi32(int_load_2,quantity_sum_2);
double_load_0=_mm256_loadu_pd(LO_DISCOUNT+i);
double_load_1=_mm256_loadu_pd(LO_DISCOUNT+i+4);
double_load_2=_mm256_loadu_pd(LO_DISCOUNT+i+8);
discount_sum_0=_mm256_add_pd(discount_sum_0,double_load_0);
discount_sum_1=_mm256_add_pd(discount_sum_1,double_load_1);
discount_sum_2=_mm256_add_pd(discount_sum_2,double_load_2);
double_load_3=_mm256_loadu_pd(LO_TAX+i);
double_load_4=_mm256_loadu_pd(LO_TAX+i+4);
double_load_5=_mm256_loadu_pd(LO_TAX+i+8);
double_load_3=_mm256_mul_pd(double_load_0,double_load_3);
double_load_4=_mm256_mul_pd(double_load_1,double_load_4);
double_load_5=_mm256_mul_pd(double_load_2,double_load_5);
tax_sum_0=_mm256_add_pd(tax_sum_0,double_load_3);
tax_sum_1=_mm256_add_pd(tax_sum_1,double_load_4);
tax_sum_2=_mm256_add_pd(tax_sum_2,double_load_5);
}
int* quantity_sum_a=(int*)&quantity_sum_0;
int* quantity_sum_b=(int*)&quantity_sum_1;
int* quantity_sum_c=(int*)&quantity_sum_2;
quantity_sum=quantity_sum+(quantity_sum_a[0]+quantity_sum_a[1]+quantity_sum_a[2]+quantity_sum_a[3]);
quantity_sum=quantity_sum+(quantity_sum_b[0]+quantity_sum_b[1]+quantity_sum_b[2]+quantity_sum_b[3]);
quantity_sum=quantity_sum+(quantity_sum_c[0]+quantity_sum_c[1]+quantity_sum_c[2]+quantity_sum_c[3]);
double* discount_sum_a=(double*)&discount_sum_0;
double* discount_sum_b=(double*)&discount_sum_1;
double* discount_sum_c=(double*)&discount_sum_2;
discount_total_price=discount_total_price+(discount_sum_a[0]+discount_sum_a[1]+discount_sum_a[2]+discount_sum_a[3]);
discount_total_price=discount_total_price+(discount_sum_b[0]+discount_sum_b[1]+discount_sum_b[2]+discount_sum_b[3]);
discount_total_price=discount_total_price+(discount_sum_c[0]+discount_sum_c[1]+discount_sum_c[2]+discount_sum_c[3]);
double* tax_sum_a=(double*)&tax_sum_0;
double* tax_sum_b=(double*)&tax_sum_1;
double* tax_sum_c=(double*)&tax_sum_2;
tax_discount_total_price=tax_discount_total_price+(tax_sum_a[0]+tax_sum_a[1]+tax_sum_a[2]+tax_sum_a[3]);
tax_discount_total_price=tax_discount_total_price+(tax_sum_b[0]+tax_sum_b[1]+tax_sum_b[2]+tax_sum_b[3]);
tax_discount_total_price=tax_discount_total_price+(tax_sum_c[0]+tax_sum_c[1]+tax_sum_c[2]+tax_sum_c[3]);
for(int i=cycle;i<cycle+residule;i++)
{
double temp=LO_EXTENDEDPRICE[i]*(1-LO_DISCOUNT[i]);
quantity_sum = quantity_sum + LO_QUANTITY[i];
discount_total_price = discount_total_price + temp;
tax_discount_total_price=tax_discount_total_price+temp*(1+LO_TAX[i]);
if (ORDER_DATE[i]<=limit_orderdate)
{
quantity_sum_with_condition = quantity_sum_with_condition + LO_QUANTITY[i];
discount_total_price_with_condition = discount_total_price_with_condition + temp;
tax_discount_total_price_with_condition = tax_discount_total_price_with_condition + temp*(1+LO_TAX[i]);
}
}
/*--------------------------------*/
stopTimer(&beg);
//output
printf("%d\n",quantity_sum);
printf("%0.6lf\n",discount_total_price);
printf("%0.6lf\n",tax_discount_total_price);
printf("%d\n",quantity_sum_with_condition);
printf("%0.6lf\n",discount_total_price_with_condition);
printf("%0.6lf\n",tax_discount_total_price_with_condition);
printf("running time is %ld\n", (long)(beg));
}
用SIMD技术优化以后跑得更慢了…原因可能是:内层的标量循环仍然逐行完成了全部乘法和条件求和,还要把每行的中间结果先写回内存、再用 SIMD 指令重新加载,SIMD 实际上只替代了其中的加法,反而多了一轮内存读写。