NVDLA内核态驱动代码整理八


前言

本系列内容力求将nvdla的内核态驱动整理清楚,如果有分析不对的请指出。
前面已经分析了一大块代码了,链接分别如下:
系列文章1:NVDLA内核态驱动代码整理一
系列文章2:NVDLA内核态驱动代码整理二
系列文章3:NVDLA内核态驱动代码整理三
系列文章4:NVDLA内核态驱动代码整理四
系列文章5:NVDLA内核态驱动代码整理五
系列文章6:NVDLA内核态驱动代码整理六
系列文章7:NVDLA内核态驱动代码整理七

欢迎阅读硬件信号和架构分析系列文章1:
架构开篇介绍文章:NVDLA内核态驱动代码整理三
系列文章1:NVDLA硬件信号和架构设计整理一
系列文章2:NVDLA硬件信号和架构设计整理二
系列文章3:NVDLA硬件信号和架构设计整理三

本章分析conv.c代码以及相关的寄存器;架构中卷积实现的细节已经在《NVDLA硬件信号和架构设计整理二》中做了介绍,这里不再重复。提示:建议先阅读系列文章1《NVDLA硬件信号和架构设计整理一》中的寄存器部分,然后再阅读本篇。

先把前面整理好的函数搬过来!

函数 功能
dla_conv_stat_data函数 dla_conv_stat_data函数和打印相关
get_in_format函数 get_in_format函数是为了获取输入格式是Feature还是Pixel
dla_conv_set_producer函数 dla_conv_set_producer函数是为了根据选择好的乒乓寄存器组编号来配置Convolution Core的四个子模块cacc、cmac、csc、cdma的S_POINTER寄存器,这里的S_POINTER是供CSB Master和数据路径选择所访问的寄存器组(Groups)的指针(Pointer)
dla_conv_enable函数 dla_conv_enable函数是为了在确认CBUF和Convolution Core的数据流动已经结束了以后,启动CDMA的性能计数器,然后使能所有子模块,使能的方式就是把CACC_D_OP_ENABLE_0_OP_EN_ENABLE这个宏常量写入cacc、cmac、csc、cdma的D_OP_ENABLE寄存器,然后就可以启动了。
dla_conv_rdma_check函数 dla_conv_rdma_check函数用于判断是否需要启动remote DMA

本文接着整理大篇幅的processor_conv_program函数。


一. conv.c函数解读二

1.1 初始化

继续读代码,processor_conv_program函数是conv.c最核心的一块。为了搞明白Convolution Core到底是怎么运行的,还得一个接着一个捋清楚寄存器是怎么被读写的!还是一块接着一块啃。
首先是

static int32_t
processor_conv_program(struct dla_processor_group *group)
{
    
    
	int32_t ret = 0;
	uint32_t reg, high, low, shift, mask;
	uint32_t stride_x, stride_y, pad_x, pad_y;
	uint64_t weight_address = 0;
	uint64_t wmb_address = 0;
	uint64_t wgs_address = 0;
	uint64_t input_address = 0;
	uint64_t output_address = 0;
	uint32_t atom_size = 0;
	bool weight_compress_support = false;
	struct dla_engine *engine = dla_get_engine();
	struct dla_conv_op_desc *conv_op;
	struct dla_conv_surface_desc *conv_surface;

	dla_trace("Enter: %s", __func__);

	weight_compress_support = engine->config_data->weight_compress_support;
	atom_size = engine->config_data->atom_size;
	conv_op = &group->operation_desc->conv_op;
	conv_surface = &group->surface_desc->conv_surface;

还是搬出几个提到的结构体:

# dla_engine结构体定义如下:
/*
 * Top-level engine state, obtained by callers through dla_get_engine().
 *
 * The convolution programming path reads the capability flags through
 * config_data, and passes driver_context together with task->task_data
 * to the address-lookup helpers.
 */
struct dla_engine {
    
    
	/* Current task; its task_data, surface_addr and roi_array_addr feed the address helpers. */
	struct dla_task *task;
	/* Engine capabilities (atom size, weight compression support, ...). */
	struct dla_config *config_data;
	/* Network descriptor; dynamic_roi and input_layer are consulted when resolving input addresses. */
	struct dla_network_desc *network;
	/* One slot per DLA_OP_NUM entry. NOTE(review): presumably indexed by operation type - confirm. */
	struct dla_processor processors[DLA_OP_NUM];

	/* NOTE(review): the meaning of the three fields below is not visible in this excerpt. */
	uint16_t num_proc_hwl;
	int32_t status;
	uint32_t stat_enable;

	/* Opaque handle forwarded unchanged to dla_get_dma_cube_address() and dla_data_read(). */
	void *driver_context;
};

dla_engine结构体包含了dla_task、dla_config、dla_processor等重要的结构体。

# dla_config结构体定义如下:
/**
 * @brief			Configuration parameters supported by the engine
 *
 * atom_size			Memory smallest access size
 * bdma_enable			Defines whether bdma is supported
 * rubik_enable			Defines whether rubik is supported
 * weight_compress_support	Defines whether weight data compression is supported
 *
 * processor_conv_program() uses atom_size as the alignment passed to
 * CHECK_ALIGN for DMA addresses and strides, and rejects compressed
 * weights when weight_compress_support is false.
 */
struct dla_config {
    
    
	uint32_t atom_size;
	bool bdma_enable;
	bool rubik_enable;
	bool weight_compress_support;
};

dla_config结构体包含了是否使能bdma、rubik,是否支持权重压缩,以及atom size。

# dla_processor_group结构体定义如下:
/*
 * State of one register group of a processor.
 *
 * processor_conv_program() receives a pointer to this structure and takes
 * the convolution operation and surface descriptors from operation_desc
 * and surface_desc.
 */
struct dla_processor_group {
    
    
	/* Group number; zero / non-zero selects the STATUS_0 / STATUS_1 field of S_STATUS. */
	uint8_t id;
	/* NOTE(review): semantics of rdma_id, active and events are not visible here. */
	uint8_t rdma_id;
	uint8_t active;
	uint8_t events;
	/* Index of the ROI descriptor, forwarded to dla_read_input_address(). */
	uint8_t roi_index;
	/* NOTE(review): presumably set by the per-processor rdma_check hook - confirm. */
	uint8_t is_rdma_needed;
	/* NOTE(review): semantics of the four fields below are not visible here. */
	uint8_t pending;
	int32_t lut_index;
	uint8_t programming;
	uint64_t start_time;

	/* Common descriptor of the operation; op_desc->index identifies the layer. */
	struct dla_common_op_desc *op_desc;
	/* NOTE(review): dependency bookkeeping, not used in the visible code. */
	struct dla_common_op_desc *consumers[DLA_OP_NUM];
	struct dla_common_op_desc *fused_parent;
	/* Operation parameters; the conv path reads operation_desc->conv_op. */
	union dla_operation_container *operation_desc;
	/* Surface (data cube) parameters; the conv path reads surface_desc->conv_surface. */
	union dla_surface_container *surface_desc;
};

dla_processor_group结构体包含了重要的dla_operation_container和dla_surface_container联合体,这两个联合体与6个子模块的操作与输入相关。

#dla_operation_container union定义如下:
/*
 * Operation descriptor storage shared by all processor types.
 *
 * Being a union, only one member is meaningful at a time; the convolution
 * programming code accesses it as conv_op.
 */
union dla_operation_container {
    
    
	struct dla_bdma_op_desc bdma_op;
	struct dla_conv_op_desc conv_op;
	struct dla_sdp_op_desc sdp_op;
	struct dla_pdp_op_desc pdp_op;
	struct dla_cdp_op_desc cdp_op;
	struct dla_rubik_op_desc rubik_op;
};

dla_operation_container联合体包含了bdma、conv、sdp、pdp、cdp、rubik等子模块的操作。

#dla_surface_container union定义如下:
/*
 * Surface descriptor storage shared by all processor types.
 *
 * Being a union, only one member is meaningful at a time; the convolution
 * programming code accesses it as conv_surface.
 */
union dla_surface_container {
    
    
	struct dla_bdma_surface_desc bdma_surface;
	struct dla_conv_surface_desc conv_surface;
	struct dla_sdp_surface_desc sdp_surface;
	struct dla_pdp_surface_desc pdp_surface;
	struct dla_cdp_surface_desc cdp_surface;
	struct dla_rubik_surface_desc rubik_surface;
};

dla_surface_container联合体包含了bdma、conv、sdp、pdp、cdp、rubik等子模块的surface,每个surface下主要是该阶段的输入和输出数据,比如conv下的权重、WMB、WGS、输入数据、输出数据。

#dla_conv_surface_desc结构体定义如下:
/*
 * Data cubes consumed and produced by one convolution operation.
 *
 * The structure is packed with 4-byte alignment, so field order and sizes
 * are part of its binary layout and must not be changed.
 */
struct dla_conv_surface_desc {
    
    
	/* Data cube */
	/* Weight cube; its address is resolved only when weight_data.address != -1. */
	struct dla_data_cube weight_data;
	/* WMB and WGS cubes; resolved only for WEIGHT_FORMAT_COMPRESSED weights. */
	struct dla_data_cube wmb_data;
	struct dla_data_cube wgs_data;
	/* Input cube; its address is obtained through dla_read_input_address(). */
	struct dla_data_cube src_data;
	/* Output cube; its address is resolved only when dst_data.address != -1. */
	struct dla_data_cube dst_data;

	/**
	 * u_addr = input_data.source_addr + offset_u
	 * this field should be set when YUV is not interleave format
	 *
	 * The sum input_address + offset_u is what gets written to the
	 * second CDMA input address register pair (D_DAIN_ADDR_*_1).
	 */
	int64_t offset_u;

	/* line stride for 2nd plane, must be 32bytes aligned */
	/* Written to CDMA D_LINE_UV_STRIDE as-is. */
	uint32_t in_line_uv_stride;
} __packed __aligned(4);

dla_conv_surface_desc结构体包含权重、WMB、WGS、输入数据、输出数据这几类data cube。

#dla_conv_op_desc结构体定义如下:
/*
 * Parameters of one convolution operation.
 *
 * processor_conv_program() translates these fields into CACC, CMAC, CSC
 * and CDMA register values. Several count-like fields are written to the
 * hardware as (value - 1); this is noted on the fields concerned.
 * The structure is packed with 4-byte alignment, so field order and sizes
 * are part of its binary layout and must not be changed.
 */
struct dla_conv_op_desc {
    
    
	/* Performance parameters */

	/* dla_conv_mode */
	/* Used as index into map_conv[]; compared against CONV_MODE_DIRECT. */
	uint8_t conv_mode;
	/* The next four flags go into the matching D_MISC_CFG fields of CSC and CDMA. */
	uint8_t data_reuse;
	uint8_t weight_reuse;
	uint8_t skip_data_rls;

	uint8_t skip_weight_rls;
	uint8_t reserved0;
	/* Programmed as entry_per_slice - 1 into the ENTRIES field of CSC and CDMA. */
	uint16_t entry_per_slice;

	/* dla_data_format */
	/* Used as index into map_img_fmt[] and as argument of get_in_format(). */
	uint8_t data_format;
	/* dla_pixel_mapping */
	/* Used as index into map_pixel[]. */
	uint8_t pixel_mapping;
	/* number of free slices before fetch */
	/* Programmed as fetch_grain - 1 into CDMA D_FETCH_GRAIN. */
	uint16_t fetch_grain;

	uint8_t reserved_b[8];

	/* batch_num */
	/* Programmed as batch - 1 into the batch-number registers. */
	uint8_t batch;
	/* dla_weight_format */
	/* WEIGHT_FORMAT_UNCOMPRESSED or WEIGHT_FORMAT_COMPRESSED; index into map_weight_fmt[]. */
	uint8_t weight_format;
	/* Both bank counts are programmed as (value - 1) into CSC D_BANK. */
	uint8_t data_bank;
	uint8_t weight_bank;

	/* the offset in bytes of each data cube in a batch */
	uint32_t batch_stride;

	/* Written to the Y_EXTENSION field of CSC D_POST_Y_EXTENSION. */
	uint8_t post_extension;
	/* Written to the PIXEL_SIGN_OVERRIDE field of CDMA D_DATAIN_FORMAT. */
	uint8_t pixel_override;
	/* number of slices need to be released */
	/* Programmed as release - 1 into CSC D_RELEASE. */
	uint16_t release;

	 /* The input cube dimension for CSC */
	/* All CSC dimensions below are programmed as (value - 1). */
	uint16_t input_width_csc;
	uint16_t input_height_csc;

	uint16_t input_channel_csc;
	uint16_t kernel_width_csc;

	uint16_t kernel_height_csc;
	uint16_t kernel_channel_csc;

	/* The input cube dimension for CMAC */
	/* Programmed as (value - 1) into CSC D_DATAOUT_SIZE_0. */
	uint16_t input_width_cmac;
	uint16_t input_height_cmac;

	/* actual size in bytes */
	/* Programmed as bytes_per_kernel - 1 into CDMA D_WEIGHT_SIZE_0. */
	uint32_t bytes_per_kernel;

	/* Algorithm parameters */

	/* The mean values are written only when mean_format == MEAN_FORMAT_ENABLE. */
	int16_t mean_ry; /* mean value for red in RGB or Y in YUV */
	int16_t mean_gu; /* mean value for green in RGB or U in YUV */

	int16_t mean_bv; /* mean value for blue in RGB or V in YUV */
	int16_t mean_ax;

	uint8_t mean_format; /* dla_mean_format */
	/* Strides are programmed as (value - 1); CSC uses 0 when not in direct mode. */
	uint8_t conv_stride_x;
	uint8_t conv_stride_y;
	/* Left/top padding reach CSC D_ZERO_PADDING only in CONV_MODE_DIRECT, else 0 is written. */
	uint8_t pad_x_left;

	/* NOTE(review): pad_x_right / pad_y_bottom are not referenced in the visible code. */
	uint8_t pad_x_right;
	uint8_t pad_y_top;
	uint8_t pad_y_bottom;
	/* Dilations are programmed as (value - 1) into CSC D_DILATION_EXT. */
	uint8_t dilation_x;

	uint8_t dilation_y;
	uint8_t reserved2[2];

	/* Precision parameters */
	/* Written to CSC D_PRA_CFG as-is. */
	uint8_t pra_truncate;

	/* Index into map_precision[] for the CDMA IN_PRECISION field. */
	uint8_t in_precision;
	/* The output precision from CONV, it's the MAC processing precision */
	uint8_t out_precision;
	/* Shifted and masked into the PAD_VALUE field of CSC D_ZERO_PADDING_VALUE. */
	int16_t pad_val;

	/* input converter parameters */
	struct dla_cvt_param in_cvt;
	/* output converter parameters, support truncate only */
	/* The programming code rejects out_cvt.scale != 1 or out_cvt.offset != 0. */
	struct dla_cvt_param out_cvt;

} __packed __aligned(4);

dla_conv_op_desc结构体包含卷积操作的不同配置和拓扑参数等。
开头这段代码完成赋值操作,包括Convolution Core运行需要的权重、激活值等数据,以及相关的操作及其可配置选项。

1.2 WMB/WGS/Weight Data/Destination总线地址的获取

继续读代码,如下:

	if (conv_op->weight_format == WEIGHT_FORMAT_COMPRESSED) {
    
    
		ASSERT_GOTO((weight_compress_support), ret, ERR(INVALID_INPUT), exit);
		ASSERT_GOTO((conv_surface->wmb_data.address != -1),
			ret, ERR(INVALID_INPUT), exit);
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->wmb_data.address,
					conv_surface->wmb_data.offset,
					(void *)&wmb_address,
					DESTINATION_DMA);
		CHECK_ALIGN(wmb_address, atom_size);
		CHECK_ALIGN(conv_surface->wmb_data.size, 128);

		ASSERT_GOTO((conv_surface->wgs_data.address != -1),
			ret, ERR(INVALID_INPUT), exit);
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->wgs_data.address,
					conv_surface->wgs_data.offset,
					(void *)&wgs_address,
					DESTINATION_DMA);
		CHECK_ALIGN(wgs_address, atom_size);
		CHECK_ALIGN(conv_surface->wgs_data.size, 4);
	}

可以看下WEIGHT_FORMAT_COMPRESSED宏的定义:

/**
 * @ingroup Convolution
 * @name Weight formats
 * @brief Weight data formats supported in Convolution
 *
 * Values of dla_conv_op_desc.weight_format. The convolution programming
 * code resolves and programs the WMB and WGS cubes only when the format is
 * WEIGHT_FORMAT_COMPRESSED, and requires weight_compress_support for it.
 * @{
 */
#define WEIGHT_FORMAT_UNCOMPRESSED	0
#define WEIGHT_FORMAT_COMPRESSED	1

再来看看dla_get_dma_cube_address函数:

/**
 * Resolve the address of a data cube and apply the cube's byte offset.
 *
 * The base address is looked up with dla_get_dma_address() and stored
 * through dst_ptr, which is then treated as a uint64_t and advanced by
 * offset. Nothing is added when the lookup fails.
 *
 * @param driver_context	forwarded unchanged to dla_get_dma_address()
 * @param task_data		forwarded unchanged to dla_get_dma_address()
 * @param index			address index forwarded to the lookup
 * @param offset		byte offset added to the resolved address
 * @param dst_ptr		destination; must point to a uint64_t
 * @param destination		lookup kind, e.g. DESTINATION_DMA
 * @return			0 on success, otherwise the lookup's error code
 */
int32_t
dla_get_dma_cube_address(void *driver_context, void *task_data,
					int16_t index, uint32_t offset, void *dst_ptr,
					uint32_t destination)
{
	uint64_t *cube_addr = (uint64_t *)dst_ptr;
	int32_t status;

	status = dla_get_dma_address(driver_context, task_data, index,
				     dst_ptr, destination);

	/* Only a successfully resolved base address gets the offset applied. */
	if (status == 0)
		*cube_addr += offset;

	return status;
}
  ||
  ||
  \/
dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->wgs_data.address,
					conv_surface->wgs_data.offset,
					(void *)&wgs_address,
					DESTINATION_DMA);
dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->wmb_data.address,
					conv_surface->wmb_data.offset,
					(void *)&wmb_address,
					DESTINATION_DMA);

这里提到一个很熟悉的函数dla_get_dma_address。dla_get_dma_address函数将dla_read_dma_address和dla_read_cpu_address合并,便于使用统一的destination变量来获取地址:前者dla_read_dma_address函数的核心在于利用nvdla_gem_dma_addr函数来获取dma_addr,注意这个地址是总线地址,也就是从设备角度看到的地址;后者dla_read_cpu_address读取的地址是CPU视角的地址。在这里由于传入的参数是DESTINATION_DMA,因此调用dla_read_dma_address函数来获取DMA地址,并将从设备视角看到的地址存储在wgs_address和wmb_address中。这里的wgs和wmb,就是这里提到的一段话:如果用了权重的稀疏性,那么稀疏算法会使用1bit的标记来指示权重元素是否为零。那么很显然要存储两份东西,第一份是所有的标记,也就是tag,存储tag专用存储被称为WMB;第二份是具体的数值,这个交给了第三方存储,这个存储被称为WGS。(注:按照NVDLA官方文档,WMB是Weight Mask Bits,WGS是Weight Group Size,后者记录的是每个权重组压缩后的大小;非零权重的具体数值仍然存放在weight data中,这也是后面还要单独获取weight_address的原因。)

关于CHECK_ALIGN宏,定义如下:

/*
 * CHECK_ALIGN(val, align) - debug-only alignment check.
 *
 * With DEBUG defined, asserts that (val & (align - 1)) == 0. The test is
 * only meaningful when align is a power of two. Both arguments are
 * parenthesized so that expression arguments are checked as a whole:
 * without the parentheses, CHECK_ALIGN(a | b, n) would test
 * a | (b & (n - 1)) because & binds tighter than |.
 *
 * Without DEBUG the macro expands to a no-op expression and its arguments
 * are not evaluated, so they must not carry required side effects.
 */
#ifdef DEBUG
#define CHECK_ALIGN(val, align)		assert((((val) & ((align) - 1)) == 0))
#else
#define CHECK_ALIGN(val, align)		((void)0)
#endif /* DEBUG */

上面两段代码用CHECK_ALIGN分别检查了WMB和WGS的地址以及数据大小是否满足对齐要求(地址按atom_size对齐,WMB的size按128对齐,WGS的size按4对齐):

		CHECK_ALIGN(wmb_address, atom_size);
		CHECK_ALIGN(conv_surface->wmb_data.size, 128);
		CHECK_ALIGN(wgs_address, atom_size);
		CHECK_ALIGN(conv_surface->wgs_data.size, 4);

所以这个if完成的工作是:如果权重采用了压缩模式,那么获取和稀疏化权重息息相关的WMB和WGS,将外部设备视角的总线地址分别交给wmb_address和wgs_address两个变量进行存储。

接着看下一个if

	if (conv_surface->weight_data.address != -1) {
    
    
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->weight_data.address,
					conv_surface->weight_data.offset,
					(void *)&weight_address,
					DESTINATION_DMA);
		CHECK_ALIGN(weight_address, atom_size);
		CHECK_ALIGN(conv_surface->weight_data.size, 128);
	}

如果存在权重数据,那么用一样的方式去获取总线视角的权重数据地址,并将其存储到weight_address指针中。

接着看下一个if

	if (conv_surface->dst_data.address != -1) {
    
    
		dla_get_dma_cube_address(engine->driver_context,
					engine->task->task_data,
					conv_surface->dst_data.address,
					conv_surface->dst_data.offset,
					(void *)&output_address,
					DESTINATION_DMA);
		CHECK_ALIGN(output_address, atom_size);
		CHECK_ALIGN(conv_surface->dst_data.size, atom_size);
		CHECK_ALIGN(conv_surface->dst_data.line_stride, atom_size);
		CHECK_ALIGN(conv_surface->dst_data.surf_stride, atom_size);
	}

可能会有点纳闷,因为conv_surface结构体下有如下和data cube相关的结构体,这里的if用到最后也只是把目标数据的总线地址赋值给output_address指针,那么激活值或者说输入数据呢?

	/* Data cube */
	struct dla_data_cube weight_data;
	struct dla_data_cube wmb_data;
	struct dla_data_cube wgs_data;
	struct dla_data_cube src_data;
	struct dla_data_cube dst_data;

1.3 Input Data与众不同的地址读取方式

我们继续往下看代码,这个if结束后调用了dla_read_input_address函数,输入数据地址的获取方式和前面那四位有点不太一样。

	ret = dla_read_input_address(&conv_surface->src_data, &input_address,
					group->op_desc->index,
					group->roi_index,
					map_img_fmt[conv_op->data_format][1]);
	if (ret)
		goto exit;

这是个重要的函数,原型如下:

/**
 * Resolve the address of an input data cube.
 *
 * Three cases are handled, in this order:
 *  - memory type DLA_MEM_HW: no address is needed, returns 0;
 *  - data->address != -1: the address is taken from the address list via
 *    dla_get_dma_cube_address(), unless this operation is the input layer
 *    of a dynamic-ROI network, which is reported as an error;
 *  - input layer of a dynamic-ROI network: the ROI descriptor selected by
 *    roi_index is read and the address is computed from the task's
 *    surface address, the ROI top/left and the line stride.
 * Any other combination yields ERR(INVALID_INPUT).
 *
 * @param data		input data cube descriptor
 * @param address	receives the resolved address
 * @param op_index	index of the operation, compared with the network's input layer
 * @param roi_index	index of the ROI descriptor to read
 * @param bpp		multiplier applied to the ROI left coordinate
 * @return		0 on success, otherwise an error code
 */
int dla_read_input_address(struct dla_data_cube *data,
		       uint64_t *address,
		       int16_t op_index,
		       uint8_t roi_index,
		       uint8_t bpp)
{
	uint64_t roi_desc_addr;
	int32_t ret = ERR(INVALID_INPUT);
	struct dla_engine *en = dla_get_engine();
	int dynamic_input;

	/* If memory type is HW then no address required */
	if (data->type == DLA_MEM_HW) {
		ret = 0;
		goto exit;
	}

	/* True when this operation is the input layer of a dynamic-ROI network */
	dynamic_input = en->network->dynamic_roi &&
				en->network->input_layer == op_index;

	/*
	 * An address list index other than -1 means the address has to be
	 * read from the address list; combining that with a dynamic-ROI
	 * input layer is an error and leaves ret at INVALID_INPUT.
	 */
	if (data->address != -1) {
		if (!dynamic_input)
			ret = dla_get_dma_cube_address(en->driver_context,
							en->task->task_data,
							data->address,
							data->offset,
							(void *)address,
							DESTINATION_DMA);
		goto exit;
	}

	/* Without an address index only the dynamic-ROI input layer is valid */
	if (!dynamic_input || !en->task->surface_addr)
		goto exit;

	/* Calculate address of ROI descriptor in array */
	roi_desc_addr = en->task->roi_array_addr;

	/* Read ROI descriptor */
	ret = dla_data_read(en->driver_context,
			en->task->task_data,
			roi_desc_addr,
			(void *)&roi_desc,
			sizeof(roi_desc),
			sizeof(struct dla_roi_array_desc) +
			roi_index * sizeof(struct dla_roi_desc));
	if (ret)
		goto exit;

	/* Calculate ROI address */
	*address = en->task->surface_addr;
	*address += (roi_desc.top * data->line_stride) +
					(bpp * roi_desc.left);

exit:
	RETURN(ret);
}

dla_read_input_address函数内出现了俩函数:dla_get_dma_cube_address函数和dla_data_read函数。dla_get_dma_cube_address函数前面已经提到过,而后面的dla_data_read函数此前也提到过,它和dla_data_write函数很相似。dla_data_write函数的功能就是让CPU访问dma_buf(其中访问流程的DMA一致性由dma_buf_begin_cpu_access和dma_buf_end_cpu_access来完成和保证),并按照给定的数据源地址src和数据长度size,把数据写入由CPU申请好的dma_buf映射到内核态地址空间内的一段空间(这个映射由dma_buf_vmap和dma_buf_vunmap来完成),注意还得按照dla_data_write给定的内核态地址空间偏移量来写入。注意获取dma_buf是由文件描述符fd来完成的(dma_buf_get函数),而dla_data_read就是把写变成读。

解释完2个函数的作用以后,我们看一下整体。对于输入层,如果是静态ROI,则从地址列表中读取该地址,索引在data cube中指定,也就是常规的dla_get_dma_cube_address函数:

	/**
	 * If address list index is not -1 means this address has to
	 * be read from address list
	 */
	if (data->address != -1) {
    
    

		/**
		 * But if other parameters indicate that this is input layer
		 * for dynamic ROI then it is an error
		 */
		if (en->network->dynamic_roi &&
			en->network->input_layer == op_index)
			goto exit;
		ret = dla_get_dma_cube_address(en->driver_context,
						en->task->task_data,
						data->address,
						data->offset,
						(void *)address,
						DESTINATION_DMA);
		goto exit;
	}

对于动态ROI,根据ROI信息和使用的Surface Address读取。

	/**
	 * Check if it is dynamic ROI and this is input layer
	 */
	if (en->network->dynamic_roi && en->network->input_layer == op_index) {
    
    
		if (!en->task->surface_addr)
			goto exit;

		/* Calculate address of ROI descriptor in array */
		roi_desc_addr = en->task->roi_array_addr;

		/* Read ROI descriptor */
		ret = dla_data_read(en->driver_context,
				en->task->task_data,
				roi_desc_addr,
				(void *)&roi_desc,
				sizeof(roi_desc),
				sizeof(struct dla_roi_array_desc) +
				roi_index * sizeof(struct dla_roi_desc));
		if (ret)
			goto exit;

		/* Calculate ROI address */
		*address = en->task->surface_addr;
		*address += (roi_desc.top * data->line_stride) +
						(bpp * roi_desc.left);
	}

动态ROI并非连续地址,而是采用感兴趣的地址。而dla_data_read函数可以按照给定的数据源地址src和数据长度size,读取由CPU申请好的dma_buf映射到内核态地址空间内的一段空间,在本例中读出的数据存放到roi_desc中。

所以输入数据的读取会采用两种方式,一种是无视地址,直接采用dla_get_dma_cube_address读取;另一种和感兴趣地址相关,需要给定感兴趣地址列表,然后使用dla_data_read函数读取。

1.4 一个会引出下一篇博客的点

继续读代码,上面卷积需要的数据都齐活儿了,接下来按照道理是配置Convolution Core相关的寄存器了,但是,来了这么一段代码:

	ASSERT_GOTO((conv_op->out_cvt.scale  == 1),
		ret, ERR(INVALID_INPUT), exit);
	ASSERT_GOTO((conv_op->out_cvt.offset == 0),
		ret, ERR(INVALID_INPUT), exit);

很纳闷out_cvt究竟是什么?做了一点追溯:

struct dla_conv_op_desc {
    
    

       ......
       
	/* Precision parameters */
	uint8_t pra_truncate;

	uint8_t in_precision;
	/* The output precision from CONV, it's the MAC processing precison */
	uint8_t out_precision;
	int16_t pad_val;

	/* input converter parameters */
	struct dla_cvt_param in_cvt;
	/* output converter parameters, support truncate only */
	struct dla_cvt_param out_cvt;

} __packed __aligned(4);

dla_cvt_param结构体的定义如下:

/*
 * Converter parameters, used for both the input converter (in_cvt) and
 * the output converter (out_cvt) of a convolution operation.
 *
 * For in_cvt, enable selects the CVT_EN field of CDMA D_CVT_CFG, and
 * truncate, offset and scale are written to CVT_TRUNCATE, D_CVT_OFFSET and
 * D_CVT_SCALE when the converter is enabled. For out_cvt only truncate is
 * used (written to CACC D_CLIP_CFG); scale must be 1 and offset must be 0.
 * The structure is packed with 4-byte alignment; do not reorder fields.
 */
struct dla_cvt_param {
    
    
	int16_t  scale;
	uint8_t  truncate;
	uint8_t  enable;

	int32_t  offset;
} __packed __aligned(4);

这一部分和数据的处理相关,官方给出的解释文档在此处——传送门。之后有空把这篇文档解释一下!

1.5 检查Register Group是否idle

代码如下:

	/* check if the register group is idle */
	reg = cacc_reg_read(S_STATUS);
	mask = group->id ? MASK(CACC_S_STATUS_0, STATUS_1) :
		MASK(CACC_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CACC_S_STATUS_0, STATUS_1) :
		SHIFT(CACC_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CACC_S_STATUS_0, STATUS_0, IDLE)),
		ret, ERR(INVALID_INPUT), exit);

	reg = cmac_a_reg_read(S_STATUS);
	mask = group->id ? MASK(CMAC_A_S_STATUS_0, STATUS_1) :
        MASK(CMAC_A_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CMAC_A_S_STATUS_0, STATUS_1) :
		SHIFT(CMAC_A_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CMAC_A_S_STATUS_0, STATUS_0, IDLE)),
		ret, ERR(INVALID_INPUT), exit);

	reg = cmac_b_reg_read(S_STATUS);
	mask = group->id ? MASK(CMAC_B_S_STATUS_0, STATUS_1) :
		MASK(CMAC_B_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CMAC_B_S_STATUS_0, STATUS_1) :
		SHIFT(CMAC_B_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CMAC_B_S_STATUS_0, STATUS_0, IDLE)),
		ret, ERR(INVALID_INPUT), exit);

	reg = csc_reg_read(S_STATUS);
	mask = group->id ? MASK(CSC_S_STATUS_0, STATUS_1) :
		MASK(CSC_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CSC_S_STATUS_0, STATUS_1) :
		SHIFT(CSC_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CSC_S_STATUS_0, STATUS_0, IDLE)),
		ret, ERR(INVALID_INPUT), exit);

	reg = cdma_reg_read(S_STATUS);
	mask = group->id ? MASK(CDMA_S_STATUS_0, STATUS_1) :
		MASK(CDMA_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CDMA_S_STATUS_0, STATUS_1) :
		SHIFT(CDMA_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CDMA_S_STATUS_0, STATUS_0, IDLE)),
		ret, ERR(INVALID_INPUT), exit);

继续不厌其烦地对其中一块进行解释:

	reg = cacc_reg_read(S_STATUS);
	mask = group->id ? MASK(CACC_S_STATUS_0, STATUS_1) :
		MASK(CACC_S_STATUS_0, STATUS_0);
	shift = group->id ? SHIFT(CACC_S_STATUS_0, STATUS_1) :
		SHIFT(CACC_S_STATUS_0, STATUS_0);
	reg = (reg & mask) >> shift;
	ASSERT_GOTO((reg == FIELD_ENUM(CACC_S_STATUS_0, STATUS_0, IDLE)),
		ret, ERR(INVALID_INPUT), exit);

我们还是老规矩,一步一步追溯:

reg = cacc_reg_read(S_STATUS);
=> #define cacc_reg_read(reg)          reg_read(CACC_REG(reg))
=> #define CACC_REG(name)               CACC_##name##_0
=> 所以等价于 reg = reg_read(CACC_S_STATUS_0)
=> 顺带查一下该寄存器的地址 #define CACC_S_STATUS_0                                 (_MK_ADDR_CONST(0x9000))

这里的S_STATUS表示的含义是2个Register Groups的状态,请注意S_STATUS寄存器会出现在CDMA、CSC、CMAC_A、CMAC_B、CACC、SDP_RDMA、SDP、PDP_RDMA、PDP、CDP、RUBIK等子模块中。所以每个子模块都有自己的S_STATUS寄存器,用来反映该子模块2个Register Groups的状态。
继续往下读:

mask = group->id ? MASK(CACC_S_STATUS_0, STATUS_1) :
		MASK(CACC_S_STATUS_0, STATUS_0);
=> #define MASK(reg, field)		(reg##_##field##_FIELD)
=> 所以等价于 mask = group->id ? CACC_S_STATUS_0_STATUS_1_FIELD :
		CACC_S_STATUS_0_STATUS_0_FIELD;
		
=> 其中CACC_S_STATUS_0_STATUS_1_FIELD为
#define CACC_S_STATUS_0_STATUS_1_SHIFT                     (_MK_SHIFT_CONST(16))
#define CACC_S_STATUS_0_STATUS_1_FIELD \
	(_MK_FIELD_CONST(0x3, CACC_S_STATUS_0_STATUS_1_SHIFT))
=> #define _MK_FIELD_CONST(_mask_, _shift_) \
	((_MK_MASK_CONST(_mask_) << _MK_SHIFT_CONST(_shift_)))
=> 所以CACC_S_STATUS_0_STATUS_1_FIELD为0x3 << 16

同理
=> 所以CACC_S_STATUS_0_STATUS_0_FIELD为0x3 << 0

按照初始设置的数值,group下的id只有0或者1,那么当id为1时,此时mask掩码取值为CACC_S_STATUS_0_STATUS_1_FIELD,如果当id为0时,此时掩码为CACC_S_STATUS_0_STATUS_0_FIELD

继续往下读:

shift = group->id ? SHIFT(CACC_S_STATUS_0, STATUS_1) :
		SHIFT(CACC_S_STATUS_0, STATUS_0);
=> #define SHIFT(reg, field)		(reg##_##field##_SHIFT)
=> 所以等价于 shift = group->id ? CACC_S_STATUS_0_STATUS_1_SHIFT : CACC_S_STATUS_0_STATUS_0_SHIFT
=> 
#define CACC_S_STATUS_0_STATUS_1_SHIFT                     (_MK_SHIFT_CONST(16))
#define CACC_S_STATUS_0_STATUS_0_SHIFT                      (_MK_SHIFT_CONST(0))

这无非就是上面的偏移数值。同样道理,当id为1时,此时shift取值为CACC_S_STATUS_0_STATUS_1_SHIFT,如果当id为0时,此时为CACC_S_STATUS_0_STATUS_0_SHIFT

继续往下读:

reg = (reg & mask) >> shift;
ASSERT_GOTO((reg == FIELD_ENUM(CACC_S_STATUS_0, STATUS_0, IDLE)),
		ret, ERR(INVALID_INPUT), exit);
=> #define FIELD_ENUM(r, f, e)		(r##_##f##_##e)
=> 等效为 reg == CACC_S_STATUS_0_STATUS_0_IDLE
=> #define CACC_S_STATUS_0_STATUS_0_IDLE                        (_MK_ENUM_CONST(0))

这里主要检查CACC对应的Register Group是否处于IDLE状态。同理也检查了CMAC_A、CMAC_B、CSC、CDMA子模块。

1.6 配置寄存器之CACC

代码如下:

	reg = (map_conv[conv_op->conv_mode]
		<< SHIFT(CACC_D_MISC_CFG_0, CONV_MODE)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CACC_D_MISC_CFG_0, PROC_PRECISION));
	cacc_reg_write(D_MISC_CFG, reg);
	
# SHIFT(CACC_D_MISC_CFG_0, CONV_MODE) 等效 CACC_D_MISC_CFG_0_CONV_MODE_SHIFT
# SHIFT(CACC_D_MISC_CFG_0, PROC_PRECISION) 等效 CACC_D_MISC_CFG_0_PROC_PRECISION_SHIFT
# cacc_reg_write(D_MISC_CFG, reg) 等效 reg_write(CACC_D_MISC_CFG_0, reg)
解释:`CACC_D_MISC_CFG_0`是指Register Group 0内关于CACC的卷积mode、数据精度、权重是否复用、输入数据是否复用做了一系列的配置。


	reg = ((conv_surface->dst_data.width - 1)
		<< SHIFT(CACC_D_DATAOUT_SIZE_0_0, DATAOUT_WIDTH)) |
		((conv_surface->dst_data.height - 1)
		<< SHIFT(CACC_D_DATAOUT_SIZE_0_0, DATAOUT_HEIGHT));
	cacc_reg_write(D_DATAOUT_SIZE_0, reg);

# SHIFT(CACC_D_DATAOUT_SIZE_0_0, DATAOUT_WIDTH) 等效 CACC_D_DATAOUT_SIZE_0_0_DATAOUT_WIDTH_SHIFT
# cacc_reg_write(D_DATAOUT_SIZE_0, reg) 等效 reg_write(CACC_D_DATAOUT_SIZE_0_0, reg)
解释:`CACC_D_DATAOUT_SIZE_0`指的是输出cube的宽和高。

	reg = ((conv_surface->dst_data.channel - 1)
		<< SHIFT(CACC_D_DATAOUT_SIZE_1_0, DATAOUT_CHANNEL));
	cacc_reg_write(D_DATAOUT_SIZE_1, reg);

# SHIFT(CACC_D_DATAOUT_SIZE_1_0, DATAOUT_CHANNEL) 等效 CACC_D_DATAOUT_SIZE_1_0_DATAOUT_CHANNEL_SHIFT
# cacc_reg_write(D_DATAOUT_SIZE_1, reg) 等效 reg_write(CACC_D_DATAOUT_SIZE_1, reg)
解释:`CACC_D_DATAOUT_SIZE_1`指的是输出cube的通道数。

	low = LOW32BITS(output_address);
	cacc_reg_write(D_DATAOUT_ADDR, low);
	cacc_reg_write(D_BATCH_NUMBER, conv_op->batch - 1);
	cacc_reg_write(D_LINE_STRIDE, conv_surface->dst_data.line_stride);
	cacc_reg_write(D_SURF_STRIDE, conv_surface->dst_data.surf_stride);
# `CACC_D_DATAOUT_ADDR`指的是输出cube的地址。
# `CACC_D_BATCH_NUMBER`指的是batch数目。
# `CACC_D_LINE_STRIDE`指的是输出cube的line stride。
# `CACC_D_SURF_STRIDE`指的是输出cube的surface stride。

	if (conv_surface->dst_data.width == 1 &&
				conv_surface->dst_data.height == 1) {
    
    
		ASSERT_GOTO((((uint32_t)conv_surface->dst_data.line_stride ==
			(uint32_t)(conv_surface->dst_data.width * atom_size))),
			ret, ERR(INVALID_INPUT), exit);
		reg = (CACC_D_DATAOUT_MAP_0_LINE_PACKED_TRUE <<
				SHIFT(CACC_D_DATAOUT_MAP_0, LINE_PACKED));
		reg |= (CACC_D_DATAOUT_MAP_0_SURF_PACKED_TRUE <<
				SHIFT(CACC_D_DATAOUT_MAP_0, SURF_PACKED));
	} else {
    
    
		reg = (FIELD_ENUM(CACC_D_DATAOUT_MAP_0, LINE_PACKED, FALSE) <<
				SHIFT(CACC_D_DATAOUT_MAP_0, LINE_PACKED));
		reg |= (FIELD_ENUM(CACC_D_DATAOUT_MAP_0, SURF_PACKED, FALSE) <<
				SHIFT(CACC_D_DATAOUT_MAP_0, SURF_PACKED));
	}
	cacc_reg_write(D_DATAOUT_MAP, reg);
	# cacc_reg_write(D_DATAOUT_MAP, reg) 等效 reg_write(CACC_D_DATAOUT_MAP, reg)
	# 解释:`CACC_D_DATAOUT_MAP`指的是output cube是否为line packed以及是否为surface packed。

	cacc_reg_write(D_CLIP_CFG, conv_op->out_cvt.truncate);
	# cacc_reg_write(D_CLIP_CFG, conv_op->out_cvt.truncate) 等效 reg_write(CACC_D_CLIP_CFG, xxx)
	# 解释:`CACC_D_CLIP_CFG`指的是在发送到SDP之前数据截断的bit数。

1.7 配置寄存器之CMAC

代码如下:

	reg = (map_conv[conv_op->conv_mode]
		<< SHIFT(CMAC_A_D_MISC_CFG_0, CONV_MODE)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CMAC_A_D_MISC_CFG_0, PROC_PRECISION));
	cmac_a_reg_write(D_MISC_CFG, reg);
	cmac_b_reg_write(D_MISC_CFG, reg);
	
	# reg_write(CMAC_A_D_MISC_CFG, reg)
	#解释: `CMAC_A_D_MISC_CFG`指的是卷积模式、数据精度等的配置。

1.8 配置寄存器之CSC

代码如下:

	reg = (map_conv[conv_op->conv_mode]
		<< SHIFT(CSC_D_MISC_CFG_0, CONV_MODE)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CSC_D_MISC_CFG_0, IN_PRECISION)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CSC_D_MISC_CFG_0, PROC_PRECISION)) |
		(conv_op->data_reuse
		<< SHIFT(CSC_D_MISC_CFG_0, DATA_REUSE)) |
		(conv_op->weight_reuse
		<< SHIFT(CSC_D_MISC_CFG_0, WEIGHT_REUSE)) |
		(conv_op->skip_data_rls
		<< SHIFT(CSC_D_MISC_CFG_0, SKIP_DATA_RLS)) |
		(conv_op->skip_weight_rls
		<< SHIFT(CSC_D_MISC_CFG_0, SKIP_WEIGHT_RLS));
	csc_reg_write(D_MISC_CFG, reg);
	#解释: `CSC_D_MISC_CFG`与`IN_PRECISION`、`卷积mode`、`PROC_PRECISION`、`数据复用`、`权重复用`、`skip_data_rls`和`skip_weight_rls`相关。

	reg = (get_in_format(conv_op->data_format) <<
		SHIFT(CSC_D_DATAIN_FORMAT_0, DATAIN_FORMAT));
	csc_reg_write(D_DATAIN_FORMAT, reg);
	#解释:`CSC_D_DATAIN_FORMAT`指的是输入数据的format和pixel的format。

	reg = ((conv_op->input_width_csc - 1)
		<< SHIFT(CSC_D_DATAIN_SIZE_EXT_0_0, DATAIN_WIDTH_EXT)) |
		((conv_op->input_height_csc - 1)
		<< SHIFT(CSC_D_DATAIN_SIZE_EXT_0_0, DATAIN_HEIGHT_EXT));
	csc_reg_write(D_DATAIN_SIZE_EXT_0, reg);
	#解释:`CSC_D_DATAIN_SIZE_EXT_0`指的是在extension后的输入cube的宽和高。

	reg = ((conv_op->input_channel_csc - 1)
		<< SHIFT(CSC_D_DATAIN_SIZE_EXT_1_0, DATAIN_CHANNEL_EXT));
	csc_reg_write(D_DATAIN_SIZE_EXT_1, reg);
	#解释:`CSC_D_DATAIN_SIZE_EXT_1`指的是在extension后的输入cube的通道。

	reg = ((conv_op->batch - 1)
		<< SHIFT(CSC_D_BATCH_NUMBER_0, BATCHES));
	csc_reg_write(D_BATCH_NUMBER, reg);
	#解释:`CSC_D_BATCH_NUMBER`指的是batch数。
	
	reg = ((conv_op->post_extension)
		<< SHIFT(CSC_D_POST_Y_EXTENSION_0, Y_EXTENSION));
	csc_reg_write(D_POST_Y_EXTENSION, reg);
	#解释:`CSC_D_POST_Y_EXTENSION`指的是针对image-in的后extension系数。

	reg = ((conv_op->entry_per_slice - 1)
		<< SHIFT(CSC_D_ENTRY_PER_SLICE_0, ENTRIES));
	csc_reg_write(D_ENTRY_PER_SLICE, reg);
	#解释:`CSC_D_ENTRY_PER_SLICE`指的是用于一个输入slice的CBUF entry数目。

	reg = (map_weight_fmt[conv_op->weight_format]
		<< SHIFT(CSC_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT));
	csc_reg_write(D_WEIGHT_FORMAT, reg);
	#解释:`CSC_D_WEIGHT_FORMAT`指的是权重是否压缩。

	reg = ((conv_op->kernel_width_csc - 1)
		<< SHIFT(CSC_D_WEIGHT_SIZE_EXT_0_0, WEIGHT_WIDTH_EXT)) |
		((conv_op->kernel_height_csc - 1)
		<< SHIFT(CSC_D_WEIGHT_SIZE_EXT_0_0, WEIGHT_HEIGHT_EXT));
	csc_reg_write(D_WEIGHT_SIZE_EXT_0, reg);
	#解释:`CSC_D_WEIGHT_SIZE_EXT_0`指的是extension后的weight的宽和高。

	reg = ((conv_op->kernel_channel_csc - 1)
		<< SHIFT(CSC_D_WEIGHT_SIZE_EXT_1_0, WEIGHT_CHANNEL_EXT)) |
		((conv_surface->dst_data.channel - 1)
		<< SHIFT(CSC_D_WEIGHT_SIZE_EXT_1_0, WEIGHT_KERNEL));
	csc_reg_write(D_WEIGHT_SIZE_EXT_1, reg);
	#解释:`CSC_D_WEIGHT_SIZE_EXT_1`指的是extension后的weight的通道。

	csc_reg_write(D_WEIGHT_BYTES, conv_surface->weight_data.size);
	csc_reg_write(D_WMB_BYTES, conv_surface->wmb_data.size);
	#解释:CSC_D_WEIGHT_BYTES和CSC_D_WMB_BYTES指的是权重和WMB的总bytes数。

	reg = ((conv_op->input_width_cmac - 1)
		<< SHIFT(CSC_D_DATAOUT_SIZE_0_0, DATAOUT_WIDTH)) |
		((conv_op->input_height_cmac - 1)
		<< SHIFT(CSC_D_DATAOUT_SIZE_0_0, DATAOUT_HEIGHT));
	csc_reg_write(D_DATAOUT_SIZE_0, reg);
###后续不再解读: 参考NVDLA硬件信号和架构设计整理一 2.3 表格

	reg = ((conv_surface->dst_data.channel - 1)
		<< SHIFT(CSC_D_DATAOUT_SIZE_1_0, DATAOUT_CHANNEL));
	csc_reg_write(D_DATAOUT_SIZE_1, reg);

	reg = ((conv_surface->dst_data.width *
				conv_surface->dst_data.height - 1)
		<< SHIFT(CSC_D_ATOMICS_0, ATOMICS));
	csc_reg_write(D_ATOMICS, reg);
	reg = ((conv_op->release - 1)
		<< SHIFT(CSC_D_RELEASE_0, RLS_SLICES));
	csc_reg_write(D_RELEASE, reg);

	if (conv_op->conv_mode == CONV_MODE_DIRECT) {
    
    
		stride_x = conv_op->conv_stride_x - 1;
		stride_y = conv_op->conv_stride_y - 1;
		pad_x = conv_op->pad_x_left;
		pad_y = conv_op->pad_y_top;
	} else {
    
    
		stride_x = 0;
		stride_y = 0;
		pad_x = 0;
		pad_y = 0;
	}

	reg = (stride_x
		<< SHIFT(CSC_D_CONV_STRIDE_EXT_0, CONV_X_STRIDE_EXT)) |
		(stride_y
		<< SHIFT(CSC_D_CONV_STRIDE_EXT_0, CONV_Y_STRIDE_EXT));
	csc_reg_write(D_CONV_STRIDE_EXT, reg);
	# 解释:reg_write(CSC_D_CONV_STRIDE_EXT)

	reg = ((conv_op->dilation_x - 1)
		<< SHIFT(CSC_D_DILATION_EXT_0, X_DILATION_EXT)) |
		((conv_op->dilation_y - 1)
		<< SHIFT(CSC_D_DILATION_EXT_0, Y_DILATION_EXT));
	csc_reg_write(D_DILATION_EXT, reg);

	reg = (pad_x
		<< SHIFT(CSC_D_ZERO_PADDING_0, PAD_LEFT)) |
		(pad_y
		<< SHIFT(CSC_D_ZERO_PADDING_0, PAD_TOP));
	csc_reg_write(D_ZERO_PADDING, reg);

	reg = (conv_op->pad_val
		<< SHIFT(CSC_D_ZERO_PADDING_VALUE_0, PAD_VALUE)) &
		MASK(CSC_D_ZERO_PADDING_VALUE_0, PAD_VALUE);
	csc_reg_write(D_ZERO_PADDING_VALUE, reg);

	reg = ((conv_op->data_bank - 1)
		<< SHIFT(CSC_D_BANK_0, DATA_BANK)) |
		((conv_op->weight_bank - 1)
		<< SHIFT(CSC_D_BANK_0, WEIGHT_BANK));
	csc_reg_write(D_BANK, reg);
	csc_reg_write(D_PRA_CFG, conv_op->pra_truncate);

后续不再解读: 参考NVDLA硬件信号和架构设计整理一 2.3 表格

1.9 配置寄存器之CDMA

代码如下:

	reg = (map_conv[conv_op->conv_mode]
		<< SHIFT(CDMA_D_MISC_CFG_0, CONV_MODE)) |
		(map_precision[conv_op->in_precision]
		<< SHIFT(CDMA_D_MISC_CFG_0, IN_PRECISION)) |
		(map_precision[conv_op->out_precision]
		<< SHIFT(CDMA_D_MISC_CFG_0, PROC_PRECISION)) |
		(conv_op->data_reuse
		<< SHIFT(CDMA_D_MISC_CFG_0, DATA_REUSE)) |
		(conv_op->weight_reuse
		<< SHIFT(CDMA_D_MISC_CFG_0, WEIGHT_REUSE)) |
		(conv_op->skip_data_rls
		<< SHIFT(CDMA_D_MISC_CFG_0, SKIP_DATA_RLS)) |
		(conv_op->skip_weight_rls
		<< SHIFT(CDMA_D_MISC_CFG_0, SKIP_WEIGHT_RLS));
	cdma_reg_write(D_MISC_CFG, reg);

	reg = (get_in_format(conv_op->data_format) <<
		SHIFT(CDMA_D_DATAIN_FORMAT_0, DATAIN_FORMAT)) |
		(map_img_fmt[conv_op->data_format][0]
		<< SHIFT(CDMA_D_DATAIN_FORMAT_0, PIXEL_FORMAT)) |
		(map_pixel[conv_op->pixel_mapping]
		<< SHIFT(CDMA_D_DATAIN_FORMAT_0, PIXEL_MAPPING)) |
		(conv_op->pixel_override
		<< SHIFT(CDMA_D_DATAIN_FORMAT_0, PIXEL_SIGN_OVERRIDE));
	cdma_reg_write(D_DATAIN_FORMAT, reg);

	reg = ((conv_surface->src_data.width - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_0_0, DATAIN_WIDTH)) |
		((conv_surface->src_data.height - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_0_0, DATAIN_HEIGHT));
	cdma_reg_write(D_DATAIN_SIZE_0, reg);

	reg = ((conv_surface->src_data.channel - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_1_0, DATAIN_CHANNEL));
	cdma_reg_write(D_DATAIN_SIZE_1, reg);

	reg = ((conv_op->input_width_csc - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_EXT_0_0, DATAIN_WIDTH_EXT)) |
		((conv_op->input_height_csc - 1)
		<< SHIFT(CDMA_D_DATAIN_SIZE_EXT_0_0, DATAIN_HEIGHT_EXT));
	cdma_reg_write(D_DATAIN_SIZE_EXT_0, reg);

	reg = (map_ram[conv_surface->src_data.type]
		<< SHIFT(CDMA_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE));
	cdma_reg_write(D_DAIN_RAM_TYPE, reg);

	high = HIGH32BITS(input_address);
	low = LOW32BITS(input_address);
	cdma_reg_write(D_DAIN_ADDR_HIGH_0, high);
	cdma_reg_write(D_DAIN_ADDR_LOW_0, low);

	high = HIGH32BITS((input_address + conv_surface->offset_u));
	low = LOW32BITS(input_address + conv_surface->offset_u);
	cdma_reg_write(D_DAIN_ADDR_HIGH_1, high);
	cdma_reg_write(D_DAIN_ADDR_LOW_1, low);

	cdma_reg_write(D_LINE_STRIDE, conv_surface->src_data.line_stride);
	cdma_reg_write(D_SURF_STRIDE, conv_surface->src_data.surf_stride);
	cdma_reg_write(D_LINE_UV_STRIDE, conv_surface->in_line_uv_stride);

	reg = ((conv_surface->src_data.line_stride ==
			((uint32_t)conv_surface->src_data.width * atom_size))
		<< SHIFT(CDMA_D_DAIN_MAP_0, LINE_PACKED));
	reg |= ((conv_surface->src_data.surf_stride ==
			((uint32_t)(conv_surface->src_data.width *
			conv_surface->src_data.height) * atom_size))
		<< SHIFT(CDMA_D_DAIN_MAP_0, SURF_PACKED));
	cdma_reg_write(D_DAIN_MAP, reg);

	reg = ((conv_op->batch - 1)
		<< SHIFT(CDMA_D_BATCH_NUMBER_0, BATCHES));
	cdma_reg_write(D_BATCH_NUMBER, reg);

	cdma_reg_write(D_BATCH_STRIDE, conv_op->batch_stride);

	reg = ((conv_op->entry_per_slice - 1)
		<< SHIFT(CDMA_D_ENTRY_PER_SLICE_0, ENTRIES));
	cdma_reg_write(D_ENTRY_PER_SLICE, reg);

	reg = ((conv_op->fetch_grain - 1)
		<< SHIFT(CDMA_D_FETCH_GRAIN_0, GRAINS));
	cdma_reg_write(D_FETCH_GRAIN, reg);

	reg = (map_weight_fmt[conv_op->weight_format]
		<< SHIFT(CDMA_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT));
	cdma_reg_write(D_WEIGHT_FORMAT, reg);

	reg = ((conv_op->bytes_per_kernel - 1)
		<< SHIFT(CDMA_D_WEIGHT_SIZE_0_0, BYTE_PER_KERNEL));
	cdma_reg_write(D_WEIGHT_SIZE_0, reg);

	reg = ((conv_surface->dst_data.channel - 1)
		<< SHIFT(CDMA_D_WEIGHT_SIZE_1_0, WEIGHT_KERNEL));
	cdma_reg_write(D_WEIGHT_SIZE_1, reg);

	reg = (map_ram[conv_surface->weight_data.type]
		<< SHIFT(CDMA_D_WEIGHT_RAM_TYPE_0, WEIGHT_RAM_TYPE));
	cdma_reg_write(D_WEIGHT_RAM_TYPE, reg);

	high = HIGH32BITS(weight_address);
	low = LOW32BITS(weight_address);
	cdma_reg_write(D_WEIGHT_ADDR_HIGH, high);
	cdma_reg_write(D_WEIGHT_ADDR_LOW, low);
	cdma_reg_write(D_WEIGHT_BYTES, conv_surface->weight_data.size);

	if (conv_op->weight_format == WEIGHT_FORMAT_COMPRESSED) {
    
    
		high = HIGH32BITS(wgs_address);
		low = LOW32BITS(wgs_address);
		cdma_reg_write(D_WGS_ADDR_HIGH, high);
		cdma_reg_write(D_WGS_ADDR_LOW, low);

		high = HIGH32BITS(wmb_address);
		low = LOW32BITS(wmb_address);
		cdma_reg_write(D_WMB_ADDR_HIGH, high);
		cdma_reg_write(D_WMB_ADDR_LOW, low);
		cdma_reg_write(D_WMB_BYTES, conv_surface->wmb_data.size);
	}

	reg = (map_mean[conv_op->mean_format]
		<< SHIFT(CDMA_D_MEAN_FORMAT_0, MEAN_FORMAT));
	cdma_reg_write(D_MEAN_FORMAT, reg);

	if (conv_op->mean_format == MEAN_FORMAT_ENABLE) {
    
    
		reg = ((conv_op->mean_ry
			<< SHIFT(CDMA_D_MEAN_GLOBAL_0_0, MEAN_RY)) &
			MASK(CDMA_D_MEAN_GLOBAL_0_0, MEAN_RY)) |
			((conv_op->mean_gu
			<< SHIFT(CDMA_D_MEAN_GLOBAL_0_0, MEAN_GU)) &
			MASK(CDMA_D_MEAN_GLOBAL_0_0, MEAN_GU));
		cdma_reg_write(D_MEAN_GLOBAL_0, reg);

		reg = ((conv_op->mean_bv
			<< SHIFT(CDMA_D_MEAN_GLOBAL_1_0, MEAN_BV))&
			MASK(CDMA_D_MEAN_GLOBAL_1_0, MEAN_BV)) |
			((conv_op->mean_ax
			<< SHIFT(CDMA_D_MEAN_GLOBAL_1_0, MEAN_AX))&
			MASK(CDMA_D_MEAN_GLOBAL_1_0, MEAN_AX));
		cdma_reg_write(D_MEAN_GLOBAL_1, reg);
	}

	if (conv_op->in_cvt.enable) {
    
    
		reg = ((FIELD_ENUM(CDMA_D_CVT_CFG_0, CVT_EN, ENABLE))
			<< SHIFT(CDMA_D_CVT_CFG_0, CVT_EN)) |
			(conv_op->in_cvt.truncate
			<< SHIFT(CDMA_D_CVT_CFG_0, CVT_TRUNCATE));
		cdma_reg_write(D_CVT_CFG, reg);
		cdma_reg_write(D_CVT_OFFSET, conv_op->in_cvt.offset);
		cdma_reg_write(D_CVT_SCALE, conv_op->in_cvt.scale);
	} else {
    
    
		reg = ((FIELD_ENUM(CDMA_D_CVT_CFG_0, CVT_EN, DISABLE))
			<< SHIFT(CDMA_D_CVT_CFG_0, CVT_EN));
		cdma_reg_write(D_CVT_CFG, reg);
	}

	reg = ((conv_op->conv_stride_x - 1)
		<< SHIFT(CDMA_D_CONV_STRIDE_0, CONV_X_STRIDE)) |
		((conv_op->conv_stride_y - 1)
		<< SHIFT(CDMA_D_CONV_STRIDE_0, CONV_Y_STRIDE));
	cdma_reg_write(D_CONV_STRIDE, reg);

	reg = (conv_op->pad_x_left <<
		SHIFT(CDMA_D_ZERO_PADDING_0, PAD_LEFT)) |
		(conv_op->pad_x_right
		<< SHIFT(CDMA_D_ZERO_PADDING_0, PAD_RIGHT)) |
		(conv_op->pad_y_top
		<< SHIFT(CDMA_D_ZERO_PADDING_0, PAD_TOP)) |
		(conv_op->pad_y_bottom
		<< SHIFT(CDMA_D_ZERO_PADDING_0, PAD_BOTTOM));
	cdma_reg_write(D_ZERO_PADDING,   reg);

	reg = conv_op->pad_val <<
		SHIFT(CDMA_D_ZERO_PADDING_VALUE_0, PAD_VALUE) &
		MASK(CDMA_D_ZERO_PADDING_VALUE_0, PAD_VALUE);
	cdma_reg_write(D_ZERO_PADDING_VALUE, reg);
	reg = ((conv_op->weight_bank - 1)
		<< SHIFT(CDMA_D_BANK_0, WEIGHT_BANK)) |
		((conv_op->data_bank - 1)
		<< SHIFT(CDMA_D_BANK_0, DATA_BANK));
	cdma_reg_write(D_BANK, reg);

后续不再解读: 参考NVDLA硬件信号和架构设计整理一 2.3 表格

二. conv.c的函数整理二

函数 功能
dla_read_input_address函数 dla_read_input_address函数内出现了两个函数:dla_get_dma_cube_address函数和dla_data_read函数。dla_get_dma_cube_address函数前面已经提到过,而后面的dla_data_read函数此前也提到过,和dla_data_write函数很相似。dla_data_write函数的功能就是CPU访问dma_buf(其中访问流程的DMA一致性由dma_buf_begin_cpu_access和dma_buf_end_cpu_access来完成和保证),并希望按照给定的数据源地址src和数据长度size,写入由CPU申请好的dma_buf映射到内核态地址空间内的一段空间(这个映射功能由dma_buf_vmap和dma_buf_vunmap来完成),注意还得按照dla_data_write给定的内核态地址空间偏移量来写入。注意获取dma_buf是由文件描述符fd来完成的(dma_buf_get函数),而dla_data_read就是把写变成读。解释完2个函数的作用以后,我们看一下整体。对于输入层,如果是静态ROI,则从地址列表中读取该地址,索引在data cube中指定,也就是常规的dla_get_dma_cube_address函数。对于动态ROI,根据ROI信息和使用的Surface Address读取。所以输入数据的读取会采用两种方式,一种是无视地址,直接采用dla_get_dma_cube_address读取;另一种和感兴趣地址相关,需要给定感兴趣地址列表,然后使用dla_data_read函数读取。
processor_conv_program函数 主要完成convolution core运行所需要的各种配置,这些配置均是通过寄存器来完成,和硬件设计息息相关。

三. conv.c的结构体和联合体整理二

结构体或者联合体 功能
dla_engine结构体 dla_engine结构体包含了dla_task、dla_config、dla_processor等重要的结构体。
dla_config结构体 dla_config结构体包含了是否使能bdma、rubik,是否支持权重压缩,以及atom size。
dla_processor_group结构体 dla_processor_group结构体包含了重要的dla_operation_container和dla_surface_container联合体,这两个联合体与6个子模块的操作与输入相关。
dla_operation_container联合体 dla_operation_container联合体包含了bdma、conv、sdp、pdp、cdp、rubik等子模块的操作。
dla_surface_container联合体 dla_surface_container联合体包含了bdma、conv、sdp、pdp、cdp、rubik等子模块的surface,每个surface下主要是该阶段的输入和输出数据,比如conv下的权重、WMB、WGS、输入数据、输出数据。
dla_conv_surface_desc结构体 dla_conv_surface_desc结构体包含权重、WMB、WGS、输入数据、输出数据等类型。
dla_conv_op_desc结构体 dla_conv_op_desc结构体包含卷积操作的不同配置和拓扑参数等。

四、conv.c的寄存器整理二

寄存器 功能
S_STATUS S_STATUS表示的含义是2个Register Groups的状态,请注意S_STATUS寄存器会出现在CDMA、CSC、CMAC_A、CMAC_B、CACC、SDP_RDMA、SDP、PDP_RDMA、PDP、CDP、RUBIK等子模块中。所以2个Register Groups内包含各个子模块的状态。
与之相关的寄存器实例1 group下的id只有0或者1,那么当id为1时,此时mask掩码取值为CACC_S_STATUS_0_STATUS_1_FIELD,如果当id为0时,此时掩码为CACC_S_STATUS_0_STATUS_0_FIELD
与之相关的寄存器实例2 同样道理,当id为1时,此时shift取值为CACC_S_STATUS_0_STATUS_1_SHIFT,如果当id为0时,此时为CACC_S_STATUS_0_STATUS_0_SHIFT
D_MISC_CFG 实例:CACC_D_MISC_CFG_0是指Register Group 0内关于CACC的卷积mode、数据精度、权重是否复用、输入数据是否复用做了一系列的配置。
D_DATAOUT_SIZE_0 实例:CACC_D_DATAOUT_SIZE_0指的是CACC子模块输出cube的宽和高
D_DATAOUT_SIZE_1 实例:CACC_D_DATAOUT_SIZE_1指的是CACC子模块输出cube的通道数。
D_DATAOUT_ADDR 实例:CACC_D_DATAOUT_ADDR指的是CACC子模块输出cube的地址。
D_BATCH_NUMBER 实例:CACC_D_BATCH_NUMBER指的是CACC子模块batch数目。
D_LINE_STRIDE 实例:CACC_D_LINE_STRIDE指的是CACC子模块输出cube的line stride。
D_SURF_STRIDE 实例:CACC_D_SURF_STRIDE指的是CACC子模块输出cube的surface stride。
D_DATAOUT_MAP 实例:CACC_D_DATAOUT_MAP指的是output cube是line packed还是surface packed。
D_CLIP_CFG 实例:CACC_D_CLIP_CFG指的是在发送到SDP之前数据截断的bit数。
D_MISC_CFG 实例1:CMAC_A_D_MISC_CFG指的是卷积模式、数据精度等的配置。实例2:CMAC_B_D_MISC_CFG指的是卷积模式、数据精度等的配置。
D_MISC_CFG 实例:CSC_D_MISC_CFG与IN_PRECISION、卷积mode、PROC_PRECISION、数据复用、权重复用、skip_data_rls、skip_weight_rls相关。
D_DATAIN_FORMAT 实例:CSC_D_DATAIN_FORMAT指的是输入数据的format和pixel的format。
D_DATAIN_SIZE_EXT_0 实例:CSC_D_DATAIN_SIZE_EXT_0指的是在extension后的输入cube的宽和高。
D_DATAIN_SIZE_EXT_1 实例:CSC_D_DATAIN_SIZE_EXT_1指的是在extension后的输入cube的通道。
D_BATCH_NUMBER 实例:CSC_D_BATCH_NUMBER指的是batch数。
D_POST_Y_EXTENSION 实例:CSC_D_POST_Y_EXTENSION指的是针对image-in的后extension系数。
D_ENTRY_PER_SLICE 实例:CSC_D_ENTRY_PER_SLICE指的是用于一个输入slice的CBUF entry数目。
D_WEIGHT_FORMAT 实例:CSC_D_WEIGHT_FORMAT指的是权重是否压缩。
D_WEIGHT_SIZE_EXT_0 实例:CSC_D_WEIGHT_SIZE_EXT_0指的是extension后的weight的宽和高。
D_WEIGHT_SIZE_EXT_1 实例:CSC_D_WEIGHT_SIZE_EXT_1指的是extension后的weight的通道。
D_WEIGHT_BYTES、CSC_D_WMB_BYTES 实例:CSC_D_WEIGHT_BYTES和CSC_D_WMB_BYTES分别指的是权重和WMB的总bytes数。

总结

本文主要针对conv.c的processor_conv_program函数进行解释,不再针对后续的子模块进行分析,因为分析方式类似。

猜你喜欢

转载自blog.csdn.net/weixin_41029027/article/details/134470211