import os

# Recursively collect the path of every ".txt" file found under init_dir.
txt_files = []
init_dir = "../data"
for root, dirs, files in os.walk(init_dir):
    for file in files:
        # Only the file-name suffix is checked; matching is case-sensitive.
        if file.endswith(".txt"):
            file_path = os.path.join(root, file)
            txt_files.append(file_path)
# NOTE(review): the original indentation was lost; the print is assumed to
# run once, after the walk has finished.
print(txt_files)
import chardet
from tqdm import tqdm
# Merge every file listed in txt_files into a single UTF-8 file,
# ../summary.txt, detecting each source file's encoding with chardet.
read_files = []      # files that were decoded and copied into the summary
not_read_files = []  # files that could not be decoded with the guessed encoding
# The with-statement closes the output file even if the loop raises.
with open('../summary.txt', 'w', encoding='utf8') as fs:
    for txt_file in tqdm(txt_files):
        # Read the raw bytes so chardet can guess the source encoding.
        with open(txt_file, 'rb') as t:
            t_d = t.read()
        # NOTE(review): chardet may report None when it cannot guess; open()
        # then falls back to the platform default encoding - confirm that
        # this fallback is acceptable.
        encode_name = chardet.detect(t_d)['encoding']
        # GBK is a superset of GB2312, so decoding with it fails less often.
        if encode_name == 'GB2312':
            encode_name = 'gbk'
        try:
            # Read the whole file before writing anything, so a file that
            # fails to decode contributes nothing to the summary.
            with open(txt_file, 'r', encoding=encode_name) as ft:
                temp_list = ft.readlines()
        except (OSError, UnicodeError, LookupError):
            # Unreadable file, wrong encoding guess, or unknown codec name.
            not_read_files.append(txt_file)
            continue
        fs.writelines(temp_list)
        # NOTE(review): the original indentation was lost; this newline is
        # assumed to be a separator written once per merged file.
        fs.write('\n')
        read_files.append(txt_file)
# The original printed the undefined name `no_read_files` (NameError).
print(f"不能读取的文件:{not_read_files}")
`not_read_files` 是未能读取的文件列表。在代码中,将检测到的 `GB2312` 编码类型转换为 `gbk`,因为 `gbk` 是 `GB2312` 的超集(父级编码),使用 `gbk` 解码可以尽量减少错误。
import os
import chardet
from tqdm import tqdm
# Recursively collect the path of every ".txt" file found under init_dir.
txt_files = []
init_dir = "../data"
for root, dirs, files in os.walk(init_dir):
    for file in files:
        # Only the file-name suffix is checked; matching is case-sensitive.
        if file.endswith(".txt"):
            file_path = os.path.join(root, file)
            txt_files.append(file_path)
# Merge every file listed in txt_files into a single UTF-8 file,
# ../summary.txt, detecting each source file's encoding with chardet.
read_files = []      # files that were decoded and copied into the summary
not_read_files = []  # files that could not be decoded with the guessed encoding
# The with-statement closes the output file even if the loop raises.
with open('../summary.txt', 'w', encoding='utf8') as fs:
    for txt_file in tqdm(txt_files):
        # Read the raw bytes so chardet can guess the source encoding.
        with open(txt_file, 'rb') as t:
            t_d = t.read()
        # NOTE(review): chardet may report None when it cannot guess; open()
        # then falls back to the platform default encoding - confirm that
        # this fallback is acceptable.
        encode_name = chardet.detect(t_d)['encoding']
        # GBK is a superset of GB2312, so decoding with it fails less often.
        if encode_name == 'GB2312':
            encode_name = 'gbk'
        try:
            # Read the whole file before writing anything, so a file that
            # fails to decode contributes nothing to the summary.
            with open(txt_file, 'r', encoding=encode_name) as ft:
                temp_list = ft.readlines()
        except (OSError, UnicodeError, LookupError):
            # Unreadable file, wrong encoding guess, or unknown codec name.
            not_read_files.append(txt_file)
            continue
        fs.writelines(temp_list)
        # NOTE(review): the original indentation was lost; this newline is
        # assumed to be a separator written once per merged file.
        fs.write('\n')
        read_files.append(txt_file)
# The original printed the undefined name `no_read_files` (NameError).
print(f"不能读取的文件:{not_read_files}")
[1] https://zhuanlan.zhihu.com/p/643279312
[2] https://www.jb51.net/article/211203.htm
[3] https://blog.csdn.net/weixin_48030475/article/details/126311995?spm=1001.2014.3001.5502