Encoding bug:
UnicodeEncodeError: 'gbk' codec can't encode character '\uff62' in position 34: illegal multibyte sequence :
solution
import sys
import io
# Change the encoding used by standard output: re-wrap the underlying byte
# stream of stdout in a text layer that encodes printed text as GB18030.
sys.stdout = io.TextIOWrapper(
    sys.stdout.buffer,
    encoding='gb18030',
)
My computer can't read other computers
Cause Analysis:
cmd is not fully compatible with UTF-8, but IDLE is: when running under IDLE you do not even need to change the default encoding of standard output, because it already defaults to UTF-8. If you must run the script under cmd, change the encoding in the code; for example, after changing it to "gb18030" the output is displayed normally:
The print() function itself has limitations and cannot print all unicode characters completely
solution:
In fact, the limitation of the print() function is really a limitation of Python's default output encoding: because the system is Windows 7, the default encoding Python uses for standard output is not 'utf-8'. Changing the standard-output encoding to 'utf-8' removes the limitation.
import io
import sys
import urllib.request
# Change the encoding used by standard output: re-wrap the underlying byte
# stream of stdout so printed text is encoded as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf8')
# Use the response as a context manager so the HTTP connection is closed
# as soon as the body has been read, even if read() raises.
with urllib.request.urlopen('http://www.baidu.com') as res:
    htmlBytes = res.read()
# Decode the raw bytes as UTF-8 before printing through the re-wrapped stdout.
print(htmlBytes.decode('utf-8'))
###
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2022/4/20 9:21
# @Author : wenjing
# @File : 商品_BIO——NER.py
import sys
import io
# Change the encoding used by standard output: re-wrap the underlying byte
# stream of stdout in a text layer that encodes printed text as GB18030.
sys.stdout = io.TextIOWrapper(
    sys.stdout.buffer,
    encoding='gb18030',
)
import os
# Directory that contains this script; the dataset is resolved relative to it.
BASE_DIRL = os.path.dirname(__file__)
# Full path of the raw NER dataset file. `path` is bound to the same string.
raw_data_path = os.path.join(BASE_DIRL, "ner_dataset.txt")
path = raw_data_path
def text2idx(text, data):
    """Locate *data* inside *text*.

    Returns:
        A ``(start, length)`` tuple. ``start`` is the index of the first
        occurrence of ``data`` in ``text`` as reported by ``str.find``
        (``-1`` when ``data`` does not occur), and ``length`` is
        ``len(data)``.
    """
    return text.find(data), len(data)
def label_storage(text, index_start, len_, y):
    """Build a per-character BIO label list for one entity span in *text*.

    Args:
        text: The string being labelled; the result has one label per
            character of it.
        index_start: Index of the first character of the entity. A negative
            value (such as the ``-1`` that ``str.find`` returns for a
            missing substring) is treated as "no entity".
        len_: Number of characters in the entity.
        y: Label name; the first entity character gets ``'B_' + y`` and the
            remaining ones ``'I_' + y``.

    Returns:
        A list of exactly ``len(text)`` labels, ``'O'`` outside the span.
    """
    labels = ['O'] * len(text)

    # Nothing to mark: a missing entity or an empty span. Without this guard
    # the slice assignment below would insert extra items and change the
    # list length, so labels would no longer line up with characters.
    if index_start < 0 or len_ <= 0:
        return labels

    # Clip the span to the text so the assignment can never extend the list.
    end = min(index_start + len_, len(text))
    span = end - index_start
    if span <= 0:
        return labels

    labels[index_start:end] = ['B_' + y] + ['I_' + y] * (span - 1)
    return labels
def text_label():
    """Read the raw NER dataset and merge all entity labels per text.

    Every non-blank line of the file at ``raw_data_path`` must contain three
    space-separated fields: the text, the label name, and the entity
    substring. Each line yields one per-character label row (via
    ``text2idx`` and ``label_storage``); rows that belong to the same text
    are then merged position by position.

    Returns:
        A dict mapping each text to a list of labels. For every position the
        merged label is the first non-``'O'`` label among that text's rows
        (in file order), or ``'O'`` when no row marks the position.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields.
    """
    rows_by_text = {}
    with open(raw_data_path, 'r', encoding='UTF-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.replace('\n', "")
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # annotation; skip them instead of failing on the unpack below.
            if not line.strip():
                continue
            fields = line.split(" ")
            if len(fields) != 3:
                raise ValueError(
                    "line %d of %s: expected 3 space-separated fields, got %d"
                    % (line_no, raw_data_path, len(fields))
                )
            text, y, node_text = fields
            index_start, len_ = text2idx(text, node_text)  # position of the entity
            if index_start < 0:
                # Entity substring not found in the text: it contributes no
                # labels, but the text itself is still registered.
                labels = ['O'] * len(text)
            else:
                labels = label_storage(text, index_start, len_, y)
            rows_by_text.setdefault(text, []).append(labels)

    merged = {}
    for text, rows in rows_by_text.items():
        # zip(*rows) walks the rows column by column, i.e. one tuple of
        # candidate labels per character position. Taking the first non-'O'
        # candidate gives the same result as before for the non-conflicting
        # cases and no longer raises when annotations overlap.
        merged[text] = [
            next((tag for tag in column if tag != 'O'), 'O')
            for column in zip(*rows)
        ]
    return merged
import json
def exls_label():
    """Print the entire contents of ``./ner_dev.json`` (read as UTF-8)."""
    with open('./ner_dev.json', 'r', encoding='utf-8') as f:
        content = f.read()
    print(content)
if __name__ == '__main__':
    # '\u0061' is the escape sequence for the letter 'a', so this prints True.
    same_char = 'a' == '\u0061'
    print(same_char)
    # Dump the dev-set JSON file to standard output.
    exls_label()
    # Disabled: build the merged BIO labels and print each text with its labels.
    # exls_label()
    # res = text_label()
    # for k in res:
    #     print(k)
    #     print(res[k])