Python Excel 处理脚本

0、

1、编程环境

python3

  • 编辑器:PyCharm
  • 环境:anaconda3

使用的工具包

#用于读写excel文件的包
import openpyxl
#用于正则比较的包
import re

2、主函数

# Script entry point: loads the statistics workbook and the data workbook,
# fills in the per-row recognition statistics and saves the statistics workbook.
#
# NOTE: every bare `n` below is a placeholder kept from the original write-up;
# replace each occurrence with the actual row/column number before running.
# NOTE: the helper functions called here (getWorksheets, targetColumn,
# dataColumn, param_number_return, Statistics) must already be defined when
# this block executes.
if __name__ == "__main__":

    # targetSheet is the worksheet that receives the statistics,
    # dataSheet is the worksheet that holds the data.
    # Load the Excel workbooks.
    wb_targetSheet = openpyxl.load_workbook('D:/target.xlsx')
    wb_dataSheet = openpyxl.load_workbook('D:/data.xlsx')
    # Pick the worksheets by name.
    ws_targetSheet = getWorksheets(wb_targetSheet, "Sheetname")
    ws_dataSheet = getWorksheets(wb_dataSheet, "Sheetname")

    # column_origin_targetSheet: statistics sheet, column holding the original results
    # column_output_targetSheet: statistics sheet, column receiving the misrecognition summary
    target_column = targetColumn(n, n)  # column_origin_targetSheet, column_output_targetSheet

    # column_origin_dataSheet: data sheet, column holding the original results
    # column_tag_dataSheet: data sheet, column holding the annotated results
    data_column = dataColumn(n, n)  # column_origin_dataSheet, column_tag_dataSheet

    # Columns of the statistics sheet that receive the numeric statistics:
    # all       = column_number[0]  total number of occurrences
    # true      = column_number[1]  number recognised correctly
    # true_per  = column_number[2]  share recognised correctly
    # false     = column_number[3]  number recognised wrongly
    # false_per = column_number[4]  share recognised wrongly
    # none      = column_number[5]  number of invalid entries
    # none_per  = column_number[6]  share of invalid entries
    column_number = param_number_return(n, n, n, n, n, n, n)

    # Run the statistics over the whole sheet; the return value is a string
    # listing the statistics-sheet rows for which no data row was found.
    # Arguments: ws_targetSheet, ws_dataSheet,
    #            targetSheet_row (last row of original results in the statistics sheet),
    #            dataSheet_row (last row of annotated results in the data sheet),
    #            target_column, data_column, column_number
    err_str = Statistics(ws_targetSheet, ws_dataSheet, n, n, target_column, data_column, column_number)

    # Write the list of unmatched rows into row `err_statistics` of the statistics sheet.
    err_statistics = n
    ws_targetSheet.cell(err_statistics, column=1).value = "原始结果无法识别行:"
    ws_targetSheet.cell(err_statistics, column=2).value = err_str

    # Save the statistics workbook; nothing generated or modified above is
    # persisted until this call runs.
    wb_targetSheet.save('D:/target.xlsx')

3、整表处理函数

#输出全部误识别结果统计到指定xlsx的指定Sheet
#targetSheet_row:输出表的行数
#dataSheet_row:数据表的行数
def Statistics(ws_targetSheet, ws_dataSheet, targetSheet_row, dataSheet_row, target_column, data_column, column_number):
    """Fill in the statistics for every row of the statistics sheet.

    Calls ``getIdentificationNumber`` once for each statistics-sheet row from
    2 up to and including ``targetSheet_row``.

    Args:
        ws_targetSheet: worksheet that receives the statistics.
        ws_dataSheet: worksheet that holds the data.
        targetSheet_row: last statistics-sheet row to process.
        dataSheet_row: last data-sheet row to scan.
        target_column: indexable pair (original-result column, output column)
            of the statistics sheet.
        data_column: indexable pair (original-result column, annotated-result
            column) of the data sheet.
        column_number: indexable sequence of the seven statistics columns in
            the order all, true, true_per, false, false_per, none, none_per.

    Returns:
        The row numbers reported as unmatched by ``getIdentificationNumber``,
        joined with '、'. An empty string when there are none.
    """
    column_origin_targetSheet = target_column[0]
    column_output_targetSheet = target_column[1]
    column_origin_dataSheet = data_column[0]
    column_tag_dataSheet = data_column[1]

    # Seven statistics columns, forwarded positionally in their defined order.
    stat_columns = tuple(column_number[position] for position in range(7))

    # Collects the unmatched rows; kept as an int-keyed dict because
    # getIdentificationNumber stores entries with err[err_index] = ...
    err = {}
    err_index = 0

    for row in range(2, targetSheet_row + 1):
        err, err_index = getIdentificationNumber(
            ws_targetSheet, ws_dataSheet, row, dataSheet_row,
            column_origin_targetSheet, column_output_targetSheet,
            column_origin_dataSheet, column_tag_dataSheet,
            *stat_columns, err, err_index)

    # err_index is the number of stored entries, so keys 0..err_index-1 exist.
    return '、'.join(str(err[k]) for k in range(err_index))

4、统计表单条数据分析

#单条原始识别结果的误识别分析
def getIdentificationNumber(ws_targetSheet, ws_dataSheet, targetSheet_row, dataSheet_row, column_origin_targetSheet, column_output_targetSheet, column_origin_dataSheet, column_tag_dataSheet, all, true, true_per, false, false_per, none, none_per, err, err_index):
    """Analyse one row of the statistics sheet against the whole data sheet.

    Every data-sheet row (2..``dataSheet_row``) whose original result equals
    the original result of statistics row ``targetSheet_row`` (compared as
    strings) is classified as:

    * correct  - the annotated result equals the original result,
    * invalid  - the annotated result is "/",
    * wrong    - anything else; the annotated value is tallied.

    The counts and their shares are written to the columns ``all``, ``true``,
    ``true_per``, ``false``, ``false_per``, ``none`` and ``none_per`` of the
    statistics row, and a summary of the wrong annotations, formatted as
    ``value(count)`` items joined with '、' and ordered by descending count,
    is written to ``column_output_targetSheet``.

    Args:
        ws_targetSheet: worksheet that receives the statistics.
        ws_dataSheet: worksheet that holds the data.
        targetSheet_row: statistics-sheet row being analysed.
        dataSheet_row: last data-sheet row to scan.
        column_origin_targetSheet: statistics-sheet column of the original result.
        column_output_targetSheet: statistics-sheet column for the summary text.
        column_origin_dataSheet: data-sheet column of the original result.
        column_tag_dataSheet: data-sheet column of the annotated result.
        all, true, true_per, false, false_per, none, none_per: statistics-sheet
            columns for the seven numeric results.
        err: int-keyed dict of row numbers (as strings) without any match.
        err_index: number of entries already stored in ``err``.

    Returns:
        The tuple ``(err, err_index)``, extended by this row when no data row
        matched it.
    """
    origin_cell = ws_targetSheet.cell(row=targetSheet_row, column=column_origin_targetSheet)

    # Text that looks like a time ("hh:mm:ss") is rewritten before matching.
    # NOTE(review): only the second character before the first colon is kept,
    # so "08:30:00" becomes "8:30" but "12:30:00" becomes "2:30" - confirm
    # that two-digit hours cannot occur in the data.
    m = re.search(r'(\w)(\w)(:)(\w+)(:)(\w+)', str(origin_cell.value))
    if m is not None:
        origin_cell.value = m.group(2) + m.group(3) + m.group(4)

    target_text = str(origin_cell.value)
    print('【' + str(targetSheet_row) + '】' + target_text)

    # Tally of wrong annotations: annotated value -> number of occurrences.
    # A dict keeps first-seen order, which is used to break ties below.
    wrong_counts = {}

    number_true = 0
    number_false = 0
    number_none = 0

    # Scan all annotated results that belong to this original result.
    for i in range(2, dataSheet_row + 1):
        if str(ws_dataSheet.cell(row=i, column=column_origin_dataSheet).value) != target_text:
            continue
        tag_value = ws_dataSheet.cell(row=i, column=column_tag_dataSheet).value
        tag_text = str(tag_value)
        if tag_text == target_text:
            number_true += 1
        elif tag_text == "/":
            number_none += 1
        else:
            number_false += 1
            wrong_counts[tag_value] = wrong_counts.get(tag_value, 0) + 1

    number_all = number_true + number_none + number_false
    print(number_all, number_true, number_false, number_none)

    if number_all != 0:
        number_true_per = number_true / number_all
        number_false_per = number_false / number_all
        number_none_per = number_none / number_all
    else:
        # No data row matched: remember the row and report zeros everywhere.
        err[err_index] = str(targetSheet_row)
        err_index += 1
        number_true_per = number_false_per = number_none_per = 0

    results = (
        (all, number_all),
        (true, number_true),
        (true_per, number_true_per),
        (false, number_false),
        (false_per, number_false_per),
        (none, number_none),
        (none_per, number_none_per),
    )
    for column, value in results:
        ws_targetSheet.cell(row=targetSheet_row, column=column).value = value

    # Most frequent wrong annotation first; sorted() is stable, so equal
    # counts stay in first-seen order.
    ranked = sorted(wrong_counts.items(), key=lambda item: item[1], reverse=True)
    a_str = '、'.join(str(tag) + '(' + str(count) + ')' for tag, count in ranked)
    print(a_str + '\n')
    ws_targetSheet.cell(row=targetSheet_row, column=column_output_targetSheet).value = a_str

    return err, err_index

5、快速排序算法

#快速排序算法

#快速排序算法
def quickSort(arr, left=None, right=None):
    """Sort the (value, count) entries of ``arr`` in place by ascending count.

    ``arr`` stores two slots per entry: slot ``2 * k`` holds the value and
    slot ``2 * k + 1`` holds its count. ``left`` and ``right`` are entry
    indices (not slot indices) delimiting the inclusive range to sort.

    Args:
        arr: indexable container in the paired layout described above.
        left: first entry index; anything non-numeric means 0.
        right: last entry index; anything non-numeric means the last entry.

    Returns:
        ``arr`` itself, after sorting.
    """
    if not isinstance(left, (int, float)):
        left = 0
    if not isinstance(right, (int, float)):
        # Two slots per entry, so the last entry index is len(arr) // 2 - 1;
        # len(arr) - 1 would index past the stored data in partition().
        right = len(arr) // 2 - 1
    if left < right:
        partitionIndex = partition(arr, left, right)
        quickSort(arr, left, partitionIndex - 1)
        quickSort(arr, partitionIndex + 1, right)
    return arr

#分治

#分治法(配合实现本程序中的快速排序)
def partition(arr, left, right):
    """Partition entries ``left..right`` of ``arr`` around the entry at ``left``.

    ``arr`` keeps each entry in two slots (``2 * k`` and ``2 * k + 1``); the
    comparison uses the odd slot. Entries whose odd-slot value is smaller than
    the pivot's are moved in front of it, both slots travelling together.

    Returns:
        The entry index at which the pivot ends up.
    """
    boundary = left + 1  # first entry not known to be smaller than the pivot
    scan = boundary
    while scan <= right:
        if arr[2 * scan + 1] < arr[2 * left + 1]:
            # Move the smaller entry to the boundary, slot by slot.
            arr[2 * scan], arr[2 * boundary] = arr[2 * boundary], arr[2 * scan]
            arr[2 * scan + 1], arr[2 * boundary + 1] = arr[2 * boundary + 1], arr[2 * scan + 1]
            boundary += 1
        scan += 1
    resting = boundary - 1
    # Drop the pivot entry into its final position.
    arr[2 * left + 1], arr[2 * resting + 1] = arr[2 * resting + 1], arr[2 * left + 1]
    arr[2 * left], arr[2 * resting] = arr[2 * resting], arr[2 * left]
    return resting

#交换

#交换数组元素值(配合实现本程序中的快速排序)
def swap(arr, i, j):
    """Exchange the values stored at positions ``i`` and ``j`` of ``arr`` in place."""
    held = arr[i]
    arr[i] = arr[j]
    arr[j] = held

6、其他辅助函数

#倒序处理,从大到小排列
def reverseSort(arr, left, right):
    """Reverse the order of entries ``left..right`` of ``arr`` in place.

    ``arr`` keeps each entry in two slots (``2 * k`` and ``2 * k + 1``); both
    slots of an entry are moved together. Returns ``arr`` itself.
    """
    lo, hi = left, right
    while lo < hi:
        # Exchange the outermost remaining pair of entries, slot by slot.
        arr[2 * lo], arr[2 * hi] = arr[2 * hi], arr[2 * lo]
        arr[2 * lo + 1], arr[2 * hi + 1] = arr[2 * hi + 1], arr[2 * lo + 1]
        lo, hi = lo + 1, hi - 1
    return arr


#从xlsx获得单张Sheet
def getWorksheets(wb, sheetname):
    """Return the entry of ``wb`` stored under the key ``sheetname``."""
    return wb[sheetname]


#返回统计表的文本所处第几列
def targetColumn(column_origin_targetSheet, column_output_targetSheet):
    """Bundle the two statistics-sheet column numbers into a tuple.

    Returns:
        ``(column_origin_targetSheet, column_output_targetSheet)``.
    """
    columns = (column_origin_targetSheet, column_output_targetSheet)
    return columns


#返回数据表的文本所处第几列
def dataColumn(column_origin_dataSheet, column_tag_dataSheet):
    """Bundle the two data-sheet column numbers into a tuple.

    Returns:
        ``(column_origin_dataSheet, column_tag_dataSheet)``.
    """
    columns = (column_origin_dataSheet, column_tag_dataSheet)
    return columns


#返回统计表的各类数字统计所在列
def param_number_return(all, true, true_per, false, false_per, none, none_per):
    """Bundle the seven statistics column numbers into a tuple.

    Returns:
        ``(all, true, true_per, false, false_per, none, none_per)`` in exactly
        that order.
    """
    columns = (all, true, true_per, false, false_per, none, none_per)
    return columns

猜你喜欢

转载自blog.csdn.net/weixin_44576259/article/details/119894432