关于文件格式判断改进

发现目前文件格式判断都是通过 split 文件名拿到后缀去判断。但是最近遇到一件挺搞笑的事情：客户把一个文件的后缀名改了，导致我们错误地判断了格式，使得代码报错。

李逵还是李鬼，傻傻分不清。
在这里插入图片描述

后期打算在接口中加一个基于文件头部信息判断文件实际格式的功能，给 API 服务使用，保证后面业务通过 split 方法取后缀时也能正常工作。

后续写了一个基于代码的判断：

import binascii
import os
from typing import List, Tuple
 
# Extensions that get_file_format() trusts as-is: plain-text formats carry no
# magic number, so their header is never inspected.
WhiteFormatList = ['csv', 'txt']

# Groups of extensions that _fuzzy_match() treats as interchangeable when
# fuzzy matching is requested.
FuzzyMatchType = [
    ['jpg', 'jpeg', 'png', 'tif', 'tiff', 'bmp', 'jfif'],
    ['doc', 'docx', 'wps', 'rtf', 'ppt', 'pptx', 'dps'],
    ['xls', 'xlsx', 'et'],
]

# Hex-encoded header signature -> candidate formats.
# Insertion order matters: get_file_format() stops at the first signature it
# finds, so a longer/more specific signature must precede any shorter
# signature it begins with.
FormatType = {
    '255044462d312e': ['pdf'],    # "%PDF-1."
    # Full 8-byte OLE2 compound-file magic. Every legacy MS Office file starts
    # with these bytes as well, so this entry has to list doc/xls/ppt too;
    # otherwise it shadows the shorter entry below and a genuine .doc/.xls/.ppt
    # is never reported as such.
    'd0cf11e0a1b11ae1': ['doc', 'xls', 'ppt', 'wps', 'et', 'dps'],
    # Kept for backward compatibility; only reachable when the header holds
    # the first four magic bytes but not the full eight.
    'd0cf11e0': ['doc', 'xls', 'ppt'],
    # ZIP local-file header followed by version 0x0014 and zeroed flag bytes.
    # NOTE(review): presumably a heuristic to tell OFD packages apart from
    # OOXML packages -- confirm against real samples.
    '504b0304140000000': ['ofd'],
    '504b0304': ['docx', 'xlsx', 'pptx'],    # "PK\x03\x04" (ZIP container)
    '7b5c727466': ['rtf'],    # "{\rtf"

    'ffd8ff': ['jpg', 'jpeg', 'jfif'],
    '89504e47': ['png'],    # "\x89PNG"
    '49492a00': ['tif', 'tiff'],    # "II*\0" (little-endian TIFF)
    '4d4d002a': ['tif', 'tiff'],    # "MM\0*" (big-endian TIFF)
    '424d': ['bmp'],    # Windows Bitmap ("BM")

    "68746d6c3e": ['html'],    # "html>" -- not expected at offset 0
    "3c3f786d6c": ['xml'],    # "<?xml"
}
 
 
def get_file_format(file_path: str) -> List[str]:
    """Guess the real format(s) of a file from its extension or leading bytes.

    Extensions listed in ``WhiteFormatList`` are trusted as-is and the file is
    not opened. For any other file the first 28 bytes are read and
    hex-encoded, and the signatures in ``FormatType`` are tried in insertion
    order; the first signature found in the header wins.

    :param file_path: path of the file to inspect
    :return: candidate format names (a fresh list); empty when no known
        signature is found in the header
    :raises FileNotFoundError: if ``file_path`` does not exist
    """
    # Raise explicitly instead of asserting: asserts are stripped under
    # ``python -O``, which would silently disable this check.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f'{file_path} is not exist')

    ori_file_format = os.path.splitext(file_path)[1][1:].lower()
    if ori_file_format in WhiteFormatList:
        return [ori_file_format]

    with open(file_path, 'rb') as f:
        header_bytes = f.read(28)
    # Decode to plain hex text; str() on the bytes object would embed the
    # "b'...'" repr wrapper in the searched string.
    hex_header = binascii.b2a_hex(header_bytes).decode('ascii')

    for format_content, format_list in FormatType.items():
        signature = format_content.lower()
        # The signature may sit anywhere in the header (e.g. "html>" follows
        # leading markup), but only at a byte boundary: an odd hex offset
        # means the match straddles two half-bytes and is spurious.
        offset = hex_header.find(signature)
        while offset != -1:
            if offset % 2 == 0:
                # Copy so callers cannot mutate the shared signature table.
                return list(format_list)
            offset = hex_header.find(signature, offset + 1)
    return []
 
 
 
def check_file_format(file_path: str, fuzzy_match: bool = False) -> Tuple[bool, List[str]]:
    """Check whether a file's extension agrees with its detected format.

    The extension is accepted when it is one of the candidates returned by
    ``get_file_format``. With ``fuzzy_match`` enabled it is also accepted when
    it shares a ``FuzzyMatchType`` group with at least one candidate.

    :param file_path: absolute path of the input file
    :param fuzzy_match: whether to accept extensions from the same fuzzy group
    :return: ``(is_right, formats)``, where ``formats`` is the detected
        candidate list, or a copy of ``WhiteFormatList`` when nothing was
        detected (in which case ``is_right`` is ``False``)
    """
    file_format = os.path.splitext(file_path)[1][1:].lower()
    format_candidate = get_file_format(file_path)
    if not format_candidate:
        # Hand out a copy so callers cannot mutate the module-level whitelist.
        return False, list(WhiteFormatList)

    is_right = file_format in format_candidate
    if not is_right and fuzzy_match:
        # Compare the extension with each candidate on its own. Candidate
        # lists such as docx/xlsx/pptx span several fuzzy groups, so requiring
        # the whole list plus the extension to fit one group can never succeed.
        is_right = any(
            _fuzzy_match({file_format, candidate}, fuzzy_match)
            for candidate in format_candidate
        )
    return is_right, list(format_candidate)
 
 
def _fuzzy_match(check_types: set, fuzzy_match: bool):
    """Tell whether all given types belong to one ``FuzzyMatchType`` group.

    :param check_types: set of format names that must share a group
    :param fuzzy_match: when falsy, fuzzy matching is disabled and the result
        is always ``False``
    :return: ``True`` if fuzzy matching is enabled and some group contains
        every name in ``check_types``; ``False`` otherwise
    """
    if not fuzzy_match:
        return False
    return any(check_types.issubset(group) for group in FuzzyMatchType)

关于文件格式判断改进

关于文件格式判断改进

猜你喜欢