(CSAPP第三版系列)导航篇传送门
5.14编写5.13的6*1循环展开版本
代码如下:
/*
 * Inner product of u and v with 6x1 loop unrolling: the main loop consumes six
 * element pairs per pass and folds every product into a single accumulator.
 *
 * The element count is taken from vec_length(u) only; v is indexed over the
 * same range, so it is presumably expected to be at least as long as u
 * (not checked here). The result is written through dest once, at the end.
 */
void inner4(vec_ptr u, vec_ptr v, data_t *dest)
{
    long n = vec_length(u);
    long unrolled_end = n - 5;      /* k < unrolled_end implies k + 5 < n */
    data_t *a = get_vec_start(u);
    data_t *b = get_vec_start(v);
    data_t acc = (data_t)0;
    long k = 0;

    /* Main loop: six products per pass, added to acc one after another. */
    while (k < unrolled_end) {
        acc += a[k] * b[k];
        acc += a[k + 1] * b[k + 1];
        acc += a[k + 2] * b[k + 2];
        acc += a[k + 3] * b[k + 3];
        acc += a[k + 4] * b[k + 4];
        acc += a[k + 5] * b[k + 5];
        k += 6;
    }

    /* Tail: whatever the unrolled loop left over, one element at a time. */
    while (k < n) {
        acc += a[k] * b[k];
        k++;
    }

    *dest = acc;
}
A. 原因:6*1循环展开标量版本的内积过程的关键路径仍然是n个加法操作,而在Haswell架构上整数加法的延迟为1个周期,所以这种方式编写程序的CPE不可能小于1.00。
B. 原因:在Haswell架构上浮点数加法操作的延迟为3个周期,而在使用6*1循环展开之前程序的CPE已经达到了3.01,所以即使使用6*1循环展开也是不可能突破这个下界的。
5.16 编写5.13的6*1a循环展开版本
代码如下:
/*
 * Inner product of u and v with 6x1a unrolling (reassociation): each pass of
 * the main loop first adds six products together and only then folds that
 * partial sum into the accumulator, so the accumulator is updated once per
 * six elements.
 *
 * The element count is taken from vec_length(u) only; v is indexed over the
 * same range, so it is presumably expected to be at least as long as u
 * (not checked here). The result is written through dest once, at the end.
 */
void inner4(vec_ptr u, vec_ptr v, data_t *dest)
{
    long n = vec_length(u);
    long unrolled_end = n - 5;      /* k < unrolled_end implies k + 5 < n */
    data_t *a = get_vec_start(u);
    data_t *b = get_vec_start(v);
    data_t acc = (data_t)0;
    long k = 0;

    /*
     * Main loop. The six products are kept in one parenthesised expression so
     * that they are summed left to right before touching acc.
     */
    while (k < unrolled_end) {
        acc += (a[k] * b[k]
                + a[k + 1] * b[k + 1]
                + a[k + 2] * b[k + 2]
                + a[k + 3] * b[k + 3]
                + a[k + 4] * b[k + 4]
                + a[k + 5] * b[k + 5]);
        k += 6;
    }

    /* Tail: whatever the unrolled loop left over, one element at a time. */
    while (k < n) {
        acc += a[k] * b[k];
        k++;
    }

    *dest = acc;
}
若要使CPE接近于机器的吞吐量界限,可以使用10*10(浮点乘法延迟为5,容量为2)的并行累积方式编写程序。
代码如下:
/*
 * Evaluate the polynomial a[0] + a[1]*x + ... + a[degree]*x^degree by direct
 * evaluation, unrolled 10 ways with 10 independent accumulators (10x10).
 *
 * a       coefficient array; a[0] .. a[degree] are read
 * x       point at which to evaluate
 * degree  polynomial degree; a[0] is read unconditionally, so degree is
 *         expected to be >= 0
 *
 * Returns the polynomial value. The terms are added in a different order than
 * in a plain left-to-right loop, so for inputs that are not exactly
 * representable the result may differ from such a loop in the last bits.
 */
double poly(double a[], double x, long degree)
{
    long i;
    long limit = degree - 9;    /* i <= limit guarantees a[i + 9] is in range */

    /* One accumulator per lane; lane 0 also carries the constant term. */
    double result = a[0];
    double result_1 = 0;
    double result_2 = 0;
    double result_3 = 0;
    double result_4 = 0;
    double result_5 = 0;
    double result_6 = 0;
    double result_7 = 0;
    double result_8 = 0;
    double result_9 = 0;

    /* Per-lane powers, initially x^1 .. x^10. */
    double xpwr = x;
    double xpwr_1 = xpwr * x;
    double xpwr_2 = xpwr_1 * x;
    double xpwr_3 = xpwr_2 * x;
    double xpwr_4 = xpwr_3 * x;
    double xpwr_5 = xpwr_4 * x;
    double xpwr_6 = xpwr_5 * x;
    double xpwr_7 = xpwr_6 * x;
    double xpwr_8 = xpwr_7 * x;
    double xpwr_9 = xpwr_8 * x;

    /*
     * Fixed stride multiplier x^10. It must live in its own variable: xpwr_9
     * is itself advanced every pass, so using it as the multiplier would
     * scale the lanes by x^10 only on the first pass.
     */
    const double x10 = xpwr_9;

    /* Each pass consumes a[i] .. a[i + 9], hence the stride of 10. */
    for (i = 1; i <= limit; i += 10) {
        result   += a[i]     * xpwr;
        result_1 += a[i + 1] * xpwr_1;
        result_2 += a[i + 2] * xpwr_2;
        result_3 += a[i + 3] * xpwr_3;
        result_4 += a[i + 4] * xpwr_4;
        result_5 += a[i + 5] * xpwr_5;
        result_6 += a[i + 6] * xpwr_6;
        result_7 += a[i + 7] * xpwr_7;
        result_8 += a[i + 8] * xpwr_8;
        result_9 += a[i + 9] * xpwr_9;

        xpwr   *= x10;
        xpwr_1 *= x10;
        xpwr_2 *= x10;
        xpwr_3 *= x10;
        xpwr_4 *= x10;
        xpwr_5 *= x10;
        xpwr_6 *= x10;
        xpwr_7 *= x10;
        xpwr_8 *= x10;
        xpwr_9 *= x10;
    }

    /* Tail: at this point xpwr == x^i, so the remaining terms continue from it. */
    for (; i <= degree; i++) {
        result += a[i] * xpwr;
        xpwr *= x;
    }

    return (result + result_1 + result_2 + result_3 + result_4
            + result_5 + result_6 + result_7 + result_8 + result_9);
}