CSAPP第五章家庭作业参考答案

(CSAPP第三版系列)导航篇传送门


5.14编写5.13的6*1循环展开版本

代码如下:

/*
 * Inner product of u and v using 6x1 loop unrolling.
 *
 * Six element products are folded into a single accumulator per pass of the
 * main loop; a cleanup loop handles the remaining 0-5 elements. The element
 * count is taken from u only (assumes v holds at least as many elements --
 * TODO confirm against callers). The result is written through dest.
 */
void inner4(vec_ptr u,vec_ptr v,data_t *dest)
{
    long n = vec_length(u);
    long unrolled_end = n - 5;      /* last start index with 6 elements left */
    data_t *a = get_vec_start(u);
    data_t *b = get_vec_start(v);
    data_t acc = (data_t)0;
    long k = 0;

    /* Main loop: six products per pass, all chained through one accumulator. */
    while (k < unrolled_end) {
        acc += a[k] * b[k];
        acc += a[k+1] * b[k+1];
        acc += a[k+2] * b[k+2];
        acc += a[k+3] * b[k+3];
        acc += a[k+4] * b[k+4];
        acc += a[k+5] * b[k+5];
        k += 6;
    }

    /* Cleanup: elements that did not fill a complete group of six. */
    while (k < n) {
        acc += a[k] * b[k];
        k++;
    }

    *dest = acc;
}

A.  原因:6*1循环展开的标量版本中,内积计算的关键路径仍然是n个串行的加法操作,而在Haswell架构上整数加法的延迟为1个周期,所以用这种方式编写的程序的CPE不可能小于1.00。

B.  原因:在Haswell架构上浮点数加法操作的延迟为3个周期,而6*1循环展开之后关键路径上仍然是n个串行的浮点加法操作;在使用6*1循环展开之前程序的CPE已经达到了3.01,已接近这个延迟界限,所以即使使用6*1循环展开也不可能突破这个下界。


5.16 编写5.13的6*1a循环展开版本

代码如下:

/*
 * Inner product of u and v using 6x1a loop unrolling (reassociated form).
 *
 * Each pass of the main loop first adds six element products together, left
 * to right, and only then adds that partial sum to the accumulator, so the
 * accumulator is updated once per six elements. A cleanup loop handles the
 * remaining 0-5 elements. The element count is taken from u only (assumes v
 * holds at least as many elements -- TODO confirm against callers). The
 * result is written through dest.
 */
void inner4(vec_ptr u,vec_ptr v,data_t *dest)
{
    long count = vec_length(u);
    long group_end = count - 5;     /* last start index with 6 elements left */
    data_t *x = get_vec_start(u);
    data_t *y = get_vec_start(v);
    data_t total = (data_t)0;
    long j = 0;

    while (j < group_end) {
        /* Parenthesised so the six products are combined before touching total. */
        total += (x[j] * y[j]
                  + x[j+1] * y[j+1]
                  + x[j+2] * y[j+2]
                  + x[j+3] * y[j+3]
                  + x[j+4] * y[j+4]
                  + x[j+5] * y[j+5]);
        j += 6;
    }

    /* Cleanup: elements that did not fill a complete group of six. */
    while (j < count) {
        total += x[j] * y[j];
        j++;
    }

    *dest = total;
}


5.18优化多项式求值任务

若要使CPE接近于机器的吞吐量界限,可以使用10*10循环展开的并行累积方式编写程序(浮点乘法延迟为5个周期,容量为2,因此至少需要5*2=10路并行操作才能填满流水线)。

代码如下:

/*
 * Evaluate a[0] + a[1]*x + ... + a[degree]*x^degree by direct evaluation,
 * using 10x10 loop unrolling (ten independent accumulators).
 *
 * a      - coefficient array; elements a[0]..a[degree] are read
 * x      - point at which the polynomial is evaluated
 * degree - highest power; for degree <= 0 only a[0] is returned
 *
 * Each pass of the main loop consumes ten consecutive coefficients, so the
 * index advances by 10 and every running power is multiplied by the constant
 * x^10. (Advancing by 1, or multiplying by a power variable that is itself
 * updated in the loop, revisits coefficients and produces wrong powers for
 * degree >= 10.) The partial sums are combined at the end, so the rounding
 * may differ slightly from a strictly sequential evaluation.
 */
double poly(double a[],double x,long degree)
{
    long i;
    long limit = degree - 9;    /* last i for which a[i+9] is still valid */

    /* Ten independent accumulators; the first also carries the constant term. */
    double result = a[0];
    double result_1 = 0;
    double result_2 = 0;
    double result_3 = 0;
    double result_4 = 0;
    double result_5 = 0;
    double result_6 = 0;
    double result_7 = 0;
    double result_8 = 0;
    double result_9 = 0;

    /* Running powers x^i .. x^(i+9), starting at i = 1. */
    double xpwr = x;
    double xpwr_1 = xpwr * x;
    double xpwr_2 = xpwr_1 * x;
    double xpwr_3 = xpwr_2 * x;
    double xpwr_4 = xpwr_3 * x;
    double xpwr_5 = xpwr_4 * x;
    double xpwr_6 = xpwr_5 * x;
    double xpwr_7 = xpwr_6 * x;
    double xpwr_8 = xpwr_7 * x;
    double xpwr_9 = xpwr_8 * x;

    /* Loop-invariant stride x^10; must not be modified inside the loop. */
    const double x10 = xpwr_9;

    for (i = 1; i <= limit; i += 10)
    {
        result += a[i] * xpwr;
        result_1 += a[i+1] * xpwr_1;
        result_2 += a[i+2] * xpwr_2;
        result_3 += a[i+3] * xpwr_3;
        result_4 += a[i+4] * xpwr_4;
        result_5 += a[i+5] * xpwr_5;
        result_6 += a[i+6] * xpwr_6;
        result_7 += a[i+7] * xpwr_7;
        result_8 += a[i+8] * xpwr_8;
        result_9 += a[i+9] * xpwr_9;

        /* Advance every power by ten exponents for the next group. */
        xpwr *= x10;
        xpwr_1 *= x10;
        xpwr_2 *= x10;
        xpwr_3 *= x10;
        xpwr_4 *= x10;
        xpwr_5 *= x10;
        xpwr_6 *= x10;
        xpwr_7 *= x10;
        xpwr_8 *= x10;
        xpwr_9 *= x10;
    }

    /* Remaining 0-9 terms; xpwr equals x^i on entry. */
    for (; i <= degree; i++)
    {
        result += a[i] * xpwr;
        xpwr = x * xpwr;
    }

    return (result + result_1 + result_2 + result_3 + result_4 + result_5 + result_6 + result_7 + result_8 + result_9);
}



猜你喜欢

转载自blog.csdn.net/qq512028505/article/details/79129033
今日推荐