Batch-converting file encodings with Python 3
| Background: I am a novice programmer. One day I suddenly found myself working on a very messy project in which the files were saved in a confusing mix of encodings. What to do? It was urgent, so I asked online and waited.
Unfortunately, no expert ever got around to recommending a good tool to me, so I decided to write a batch solution myself.
Preparation
- python3
- pip install chardet (used for encoding detection)
Detecting file encodings
"Forewarned is forearmed; without preparation, failure follows." With so many files in inconsistent encodings, a sensible plan is to first detect the current encoding of every file, and only then start correcting them.
To detect a file's encoding, we can use chardet.
This open-source library is very simple to use: pass the raw bytes
directly to it:
import chardet
# Read the file's raw bytes; `path` is expected to be supplied by the surrounding code.
f_file = open(path, "rb")
content = f_file.read()
# The result is a dict containing the guessed encoding and its confidence
guess_encode = chardet.detect(content)
Collecting all files whose encoding needs to be detected
"My sons will have grandsons, and those grandsons will have sons of their own; sons and grandsons, generation after generation, without end" - some folders really are like that, with a very deep directory structure.
Whether we are detecting encodings or fixing them, we first have to find all of the files. How do we find them?
Recursion usually comes to mind, but for this case Python's os module already has what we need: os.walk.
import os
import re
def walk_files(path, regex=r"."):
    """Recursively collect the files under *path* whose full path matches *regex*.

    Args:
        path: Directory to walk. If it is not a directory, it is returned
            as the only entry without being checked against *regex*.
        regex: Pattern applied with ``match`` (anchored at the start of the
            string) to each joined ``root``/file name path.

    Returns:
        A list of the matching file paths, in ``os.walk`` order.
    """
    if not os.path.isdir(path):
        return [path]
    # Compile once instead of handing the raw pattern to re.match per file.
    pattern = re.compile(regex)
    file_list = []
    for root, _dirs, files in os.walk(path):
        for name in files:
            file_path = os.path.join(root, name)
            if pattern.match(file_path):
                file_list.append(file_path)
    return file_list
Regular expressions (the re module) are used here to make filtering convenient, because usually only some of the files need to be detected or modified.
Once we have the file list, reading each file and detecting its encoding is not difficult: just add a loop, and inside the loop record the guessed encoding, either by printing it or by writing it to a report file. I will not go into the details here.
Modifying the file encoding
Python 2's string design was arguably rather poor: a str could be treated as a binary byte string, which led to a great deal of confusion.
Python 3 has improved on this; converting bytes from one encoding to another is simply done as follows:
# Decode the bytes into a str using the original encoding
contentStr = content.decode(original)
# Encode the str into bytes in the target encoding
targetBytes = bytes(contentStr, target)
Of course, remember to wrap this in a try block:
bytes must be decoded with the correct codec, otherwise an exception is thrown. It is much like decryption: the wrong key will not open the door (for example, content that was actually encoded as utf-8
being decoded with the wrong gbk
decoder).
After obtaining the bytes in the new encoding, we still need to save them back to the file:
# Move the file position back to the start of the file
f_file.seek(0)
# Truncate at the current position (0), discarding the old content
f_file.truncate()
# Write the re-encoded bytes
f_file.write(targetBytes)
First move the file pointer to the beginning, then use f_file.truncate()
to clear everything after the pointer, and finally write the new bytes.
Final chapter (sample code and screenshots)
Most of the above describes ideas, and the code shown is not complete. The most important thing, however, is this: back up before carrying out any batch operation. I have not implemented that here; consider using shutil.copytree(source_folder, backup_folder)
to make a backup.
As shown above, chardet's
guess is not necessarily correct, so we need a backup, and certain files need some manual fine-tuning before the IDE can display them or they work properly.
Here is the complete test code:
# -*- coding: utf-8 -*-
# @Date:2020/1/12 19:04
# @Author: Lu
# @Description
import os
import copy
import re
import chardet
class FileUtil():
    """Namespace for file-system helper functions."""

    @staticmethod
    def walk_files(path, regex=None):
        """Recursively collect the files under *path* whose full path matches *regex*.

        Args:
            path: Directory to walk. If it is not a directory, it is returned
                as the only entry without being checked against *regex*.
            regex: Pattern applied with ``match`` (anchored at the start of
                the string) to each joined ``root``/file name path. ``None``
                disables filtering, so every file is returned.

        Returns:
            A list of the matching file paths, in ``os.walk`` order.
        """
        if not os.path.isdir(path):
            return [path]
        # re.match(None, ...) raises TypeError, so None must mean "no filter".
        pattern = re.compile(regex) if regex is not None else None
        file_list = []
        for root, _dirs, files in os.walk(path):
            for name in files:
                file_path = os.path.join(root, name)
                if pattern is None or pattern.match(file_path):
                    file_list.append(file_path)
        return file_list
class EncodeTask():
    """Batch task that reports on, or converts, the encoding of a set of files.

    Configure the task with ``update`` (which builds ``work_files``), then
    call ``check_encoding`` or ``change_encoding``.
    """

    def __init__(self):
        # Defaults: scan the current directory and accept every file path.
        self.default_config = {
            "workpaths": [u"./"],
            "filefilter": r"."
        }
        self.config = copy.deepcopy(self.default_config)
        self.work_files = []
        self.workpaths = []

    def update(self, config, fill_default_value=False):
        """Merge *config* into the current configuration and rebuild the file list.

        Only keys present in ``default_config`` are considered. A truthy value
        in *config* overrides the current one; otherwise, when
        *fill_default_value* is true, the key is reset to its default.

        Returns:
            ``self``, so calls can be chained.
        """
        cache = copy.deepcopy(config)
        for k in self.default_config.keys():
            if cache.get(k):
                self.config[k] = cache[k]
            elif fill_default_value:
                # Copy so later changes to config never alter the defaults.
                self.config[k] = copy.deepcopy(self.default_config[k])
        self.__gen_files(self.config["workpaths"])
        return self

    def __gen_files(self, workpaths):
        """Replace ``work_files`` with the files found under each work path."""
        self.work_files.clear()
        for workpath in workpaths:
            self.work_files += FileUtil.walk_files(workpath, self.config["filefilter"])

    def check_encoding(self):
        """Guess the encoding of every work file and report the results.

        Writes the per-encoding counts followed by one line per file to
        ``./encoding_report.txt`` (UTF-8) and prints the report dict.
        """
        encoding_report = {"stat": {}, "reports": []}
        stat = encoding_report["stat"]
        for path in self.work_files:
            # The context manager closes the handle even if reading fails.
            with open(path, "rb") as f_file:
                content = f_file.read()
            guess_encode = chardet.detect(content)
            encoding = guess_encode.get("encoding")
            encoding_report["reports"].append([path, guess_encode])
            stat[encoding] = stat.get(encoding, 0) + 1
        report_parts = [u"{}\n".format(stat)]
        report_parts.extend(
            u"\n{} {}".format(item[0], item[1])
            for item in encoding_report["reports"]
        )
        with open(u"./encoding_report.txt", "w", encoding="utf-8") as reportfile:
            reportfile.write(u"".join(report_parts))
        print(encoding_report)

    def change_encoding(self, original, target):
        """Re-encode every work file in place from *original* to *target*.

        A file whose content cannot be decoded or re-encoded is left
        untouched; the error is printed and processing continues with the
        next file.
        """
        for path in self.work_files:
            print(u"\n{}\nchange {} to {}".format(path, original, target))
            with open(path, "rb+") as f_file:
                content = f_file.read()
                try:
                    # Convert fully before touching the file, so a failed
                    # conversion never truncates the original content.
                    target_bytes = content.decode(original).encode(target)
                    f_file.seek(0)
                    f_file.truncate()
                    f_file.write(target_bytes)
                except Exception as e:
                    # Deliberately best-effort: report and move on.
                    print(u"Error:可能编码有误\n{}".format(e))
def task():
    """Print a usage example showing how to drive ``EncodeTask``."""
    # The backslash in the sample regex is doubled so the printed text still
    # shows a single backslash without relying on an invalid string escape.
    print("""You can use it like this code:
# -*- coding: utf-8 -*-
from conver_encode import EncodeTask
EncodeTask().update({
"workpaths": [u"./test"],
"filefilter": r".*\\.(?:java)"
}).check_encoding()
EncodeTask().update({
"workpaths": [u"./test"],
"filefilter": r".*\\.(?:java)"
}).change_encoding("gb18030", "utf-8")
# }).change_encoding("utf-8", "gb18030")
# }).change_encoding("Windows-1252", "utf-8")
""")
# Script entry point: print the usage example when the file is run directly.
if __name__ == '__main__':
    task()