Three basic data quality evaluation methods, with Python implementations

Basic data quality assessment methods

1. Analyze the saturation (null-value rate) of table data
2. Analyze the correctness rate of code values by comparing against a code (dictionary) table (the association between the fact table and the dimension table)
3. Analyze time saturation (data continuity)
4. Analyze the association between the subject table and the behavior table (no example here; design it according to the specific scenario)

Null-value rate (saturation) of table data

# Summary: generate a SQL script for per-field saturation (null-value rate) of a table; it can be run manually or on a schedule
# Author: 王振东
# Date: 2021-02-07
from odps import ODPS

o = ODPS('ak', 'sk', 'project_name', endpoint='http://xxxxx/api')  # AccessKey ID, AccessKey secret, project name, endpoint

def check_data_by_execute_sql(table_name, partition):
    """Assemble and run a null-rate query covering every commented column of table_name."""
    ta = o.get_table(table_name)
    sql_str = 'select \n'
    for col in ta.schema.columns:
        col_name = col.name
        col_comm = col.comment
        # Skip columns without a usable comment; the comment is used as the output alias.
        if col_comm == 'null' or col_comm is None or col_comm == '':
            continue
        # A value counts as null if it is NULL, blank, or a common placeholder ('null', 'NULL', '-').
        sql_str += "sum(case when (%s is null) or (%s in ('', 'null', 'NULL', '-')) or " \
                   "(trim(%s) = '') then 1 else 0 end)/count(1) as `%s`,\n" % \
                   (col_name, col_name, col_name, col_comm)
    sql_str += "count(1) as total_cnt \nfrom %s where %s" % (table_name, partition)
    # print(sql_str)
    print('|Field|Null rate|\n|----|----|\n')
    # Histogram of null rates, bucketed to one decimal place.
    statistic = {'0.0': 0, '0.1': 0, '0.2': 0, '0.3': 0, '0.4': 0,
                 '0.5': 0, '0.6': 0, '0.7': 0, '0.8': 0, '0.9': 0, '1.0': 0}
    with o.execute_sql(sql_str).open_reader() as rec:
        rf = rec.to_result_frame()
        n = rf.names
        v = rf.values[0]
        for i in range(len(n) - 1):  # the last column is total_cnt
            print("|%s|%.2f|" % (n[i], v[i]))
            statistic['%.1f' % v[i]] = statistic['%.1f' % v[i]] + 1
        print('Total rows:', v[-1])
    for i in statistic.keys():
        print(i, statistic[i])

def main():
    # Replace with real table names and partition filters (e.g. "pt = '20210207'").
    check_data_by_execute_sql('table_name', "partition_name")
    check_data_by_execute_sql('table_name', "partition_name")
    check_data_by_execute_sql('table_name', "partition_name")

if __name__ == '__main__':
    main()
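
For reference, here is a minimal sketch of the query the script assembles, assuming a hypothetical table demo_table partitioned by pt with two commented columns patient_id and diag_code (all names here are illustrative, not from the original script):

-- Sketch of the generated null-rate query; the backtick-quoted aliases come from the column comments
select 
sum(case when (patient_id is null) or (patient_id in ('', 'null', 'NULL', '-')) or (trim(patient_id) = '') then 1 else 0 end)/count(1) as `patient_id_comment`,
sum(case when (diag_code is null) or (diag_code in ('', 'null', 'NULL', '-')) or (trim(diag_code) = '') then 1 else 0 end)/count(1) as `diag_code_comment`,
count(1) as total_cnt 
from demo_table where pt = '20210207';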

Analyze the correctness rate of code values through code-table comparison

# Summary: assemble SQL to screen the results of a code-table (dictionary) comparison
# Author: 王振东
# Date: 2021-01-07
"""
把某个字段的字典导入标准字典表模版,字典表结构:
    create table dim_gjlcb2d0_zdbm_all (
        code string,
        name string)
    comment '诊断代码表';
    create table dim_gjlcb2d0_ssbm_all (
        code string,
        name string)
    comment '手术代码表';
统计内容:码表对照对照失败的 name/code 对,以及对应的数据条数
"""

"""
var:
    code_field 代码字段
    name_field 名称字段
    code_ch 代码字段注释
    name_ch 名称字段注释
    table_name 表名称
    dim_table 字典表名称
    partition 指定数据分区
"""
def fill_sql(var):
    code_field, name_field, code_ch, name_ch, table_name, dim_table, partition = var
    sql_model = """
-- Anomaly summary
select '%s_%s', count(1) unmatched_name_code_pairs, sum(cnt) affected_row_cnt
from (select code, name, cnt 
      from (select %s code, %s name, count(1) cnt from %s 
            where %s and %s is not null and %s is not null and %s != '-' and %s != '-' group by %s, %s) t_1  -- filter out null values
      left anti join %s t_2
      on t_1.code = t_2.code and t_1.name = t_2.name) t_3;
"""
    detail_sql_model = """
-- Anomaly detail
select code %s, name %s, cnt data_cnt
from (select %s code, %s name, count(1) cnt from %s 
      where %s and %s is not null and %s is not null and %s != '-' and %s != '-' group by %s, %s) t_1  -- filter out null values
left anti join %s t_2
on t_1.code = t_2.code and t_1.name = t_2.name;
"""
    # Both templates take the same arguments in the same order.
    args = (code_ch, name_ch, code_field, name_field, table_name, partition, code_field, name_field,
            code_field, name_field, code_field, name_field, dim_table)
    return sql_model % args + detail_sql_model % args

def main():
    var_list = [('c', 'n', 'diagnosis_code', 'diagnosis_name', 'xxxxxxxxx',
                 'dim_gjlcb2d0_zdbm_all', "partition_name")]
    for v in var_list:
        print(fill_sql(v))

if __name__ == '__main__':
    main()
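
For reference, a minimal sketch of the summary query fill_sql would emit for the diagnosis-code example above, assuming the partition placeholder is replaced with a real filter such as pt = '20210107' (the table name stays a placeholder, as in the script):

-- Anomaly summary as emitted for the sample vars (the partition filter is a hypothetical example)
select 'diagnosis_code_diagnosis_name', count(1) unmatched_name_code_pairs, sum(cnt) affected_row_cnt
from (select code, name, cnt 
      from (select c code, n name, count(1) cnt from xxxxxxxxx 
            where pt = '20210107' and c is not null and n is not null and c != '-' and n != '-' group by c, n) t_1  -- filter out null values
      left anti join dim_gjlcb2d0_zdbm_all t_2
      on t_1.code = t_2.code and t_1.name = t_2.name) t_3;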

Time saturation (data continuity)

-- Note: if the table contains dirty data (e.g. timestamps such as 9999-01-01 00:00:00), add a where condition to filter it out
-- Look at the time range of the data
select 
	max(time_field), min(time_field) 
from table_name
where pt = 'partition_name'; 
-- Total time span of the data, in days
select 
	datediff(max(time_field), min(time_field), 'dd') total_days
from table_name
where pt = 'partition_name'
  and time_field between 'start_time' and 'end_time'; 
-- Day-level time saturation per organization: organization, days with data, time saturation, total row count
-- (replace <total_days> with the value returned by the previous query)
select org_code org, count(1) days_with_data, count(1)/<total_days> time_saturation, sum(cnt) total_cnt
from (
	select 
		org_code, to_char(time_field, 'yyyy-mm-dd') dd, count(1) cnt
	from table_name
	where pt = 'partition_name'
	  and time_field between 'start_time' and 'end_time'
	group by org_code, to_char(time_field, 'yyyy-mm-dd')
) t
where cnt > 0
group by org_code;
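
The day-level query above needs the total-days figure from the second query pasted in by hand. Below is a minimal sketch that computes the denominator inline instead, assuming the SQL engine allows a cross join (on MaxCompute the Cartesian product may need a mapjoin hint); all table, partition, and time-bound names remain placeholders:

-- Same day-level saturation, with the total-days denominator computed inline
select t.org_code org, count(1) days_with_data,
       count(1)/max(d.total_days) time_saturation, sum(t.cnt) total_cnt
from (
	select org_code, to_char(time_field, 'yyyy-mm-dd') dd, count(1) cnt
	from table_name
	where pt = 'partition_name'
	  and time_field between 'start_time' and 'end_time'
	group by org_code, to_char(time_field, 'yyyy-mm-dd')
) t
cross join (
	select datediff(max(time_field), min(time_field), 'dd') total_days
	from table_name
	where pt = 'partition_name'
	  and time_field between 'start_time' and 'end_time'
) d
group by t.org_code;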

Origin: blog.csdn.net/ManWZD/article/details/112425577