验证中...
私信发送成功
denovel.py
原始数据 复制代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import chardet
import glob
import os
import re
import sys
import zhconv
# 2017-12-10 更新
#
# 使用生成器改写
# Full-width ASCII variants (U+FF01..U+FF5E) mapped onto their half-width forms.
_FULL_TO_HALF = str.maketrans(
    ''.join(chr(i) for i in range(65281, 65375)),
    ''.join(chr(i) for i in range(33, 127)),
)
# Western and traditional-style punctuation mapped onto the preferred marks.
# Every double/single quote becomes an *opening* quote here; the pairing
# patterns below turn every second one into the matching closing quote.
_PUNCTUATION = str.maketrans('!"『』\'「」(),.:;<>?[]', '!“““‘‘‘(),。:;《》?【】')

# A URL runs until the first whitespace or CJK ideograph.  The optional
# leading [^=] also consumes the single character right before the URL
# unless that character is '=' (kept from the original pattern).
_URL_RE = re.compile(r'[^=]?(https?://[^\u4e00-\u9fa5\s]+)')
_SPACES_RE = re.compile(r'\s+')
_PERIODS_RE = re.compile('。{2,}')
# An opening quote, text containing no further quote marks, then another
# opening quote: the second one should have been a closing quote.
_DOUBLE_QUOTE_RE = re.compile('(“[^“”]*)“')
_SINGLE_QUOTE_RE = re.compile('(‘[^‘’]*)‘')
_TITLE_RE = re.compile(
    r'^(?:推荐序|译者序|作者的话|序[章篇幕]?|楔子|第[零一二三四五六七八九十百千\d]+[章节篇幕回卷部集]|最?终[章篇回卷]|大?结局|尾声|番外|后记|写在最后|Chapter\s*\d+).*$'
)
_SYMBOLS_RE = re.compile(r'^\W+$')
# "完" / "全文完", optionally wrapped in ASCII or full-width parentheses.
_THE_END_RE = re.compile(r'^[((]?(?:全文)?完[))]?$')


def denoise(text):
    """Yield the cleaned lines of a novel, one at a time.

    Each input line is stripped, has full-width ASCII converted to
    half-width, punctuation normalised, URLs removed, whitespace runs
    collapsed to one space, runs of two or more "。" turned into "……" and
    quotes paired up.

    :param text: the full text of the novel.
    :returns: a generator of cleaned lines.  Chapter-title lines are yielded
        as ``'\\n' + title + '\\n'`` so that joining the output with newlines
        leaves a blank line before and after each title.  Empty lines and
        lines made only of non-word characters are dropped.  Iteration stops
        at the first end-of-text marker line (e.g. "全文完").
    """
    for line in text.splitlines():
        line = line.strip()
        line = line.translate(_FULL_TO_HALF)
        line = line.translate(_PUNCTUATION)
        line = _URL_RE.sub('', line)
        line = _SPACES_RE.sub(' ', line)
        line = _PERIODS_RE.sub('……', line)
        line = _DOUBLE_QUOTE_RE.sub('\\1”', line)
        line = _SINGLE_QUOTE_RE.sub('\\1’', line)
        if _TITLE_RE.match(line):
            # Surround titles with newlines so they stand apart once joined.
            line = '\n' + line + '\n'
        elif not line or _SYMBOLS_RE.match(line):
            # The line became empty or is decoration only (e.g. "-----").
            continue
        elif _THE_END_RE.match(line):
            # Everything after the end-of-text marker is discarded.
            break
        yield line
def denovel(filename):
    """Clean one novel file and write the result next to it.

    The file is read as bytes, its encoding is guessed with ``chardet``, the
    text is converted to Simplified Chinese with ``zhconv`` and passed through
    ``denoise``.  The cleaned text is written to ``<root>_denovel<ext>``.

    :param filename: path of the file to process.
    :raises OSError: if the input cannot be read or the output cannot be
        written.
    :raises UnicodeDecodeError: if the bytes do not decode with the guessed
        encoding.
    """
    with open(filename, 'rb') as source:
        data = source.read()

    # The detected encoding may be None (e.g. for an empty file); fall back
    # to UTF-8 instead of passing None to bytes.decode().
    encoding = chardet.detect(data)['encoding'] or 'utf-8'
    if encoding.upper() == 'GB2312':
        # Decode as GBK whenever GB2312 is reported (as the original did).
        encoding = 'gbk'

    text = zhconv.convert(data.decode(encoding), 'zh-cn')  # to Simplified Chinese
    # Strip so the output neither starts nor ends with newlines.
    cleaned = '\n'.join(denoise(text)).strip()

    # Open the output only after all processing has succeeded, so a failure
    # above never leaves an empty "_denovel" file behind.  The encoding is
    # explicit so the result does not depend on the platform locale.
    root, ext = os.path.splitext(filename)
    with open(root + '_denovel' + ext, 'w', encoding='utf-8') as target:
        target.write(cleaned)
if __name__ == '__main__':
    # Without arguments, process the directory that contains this script.
    # The fallback must be a list: a bare string would be iterated character
    # by character by the loop below.
    targets = sys.argv[1:] or [os.path.dirname(os.path.abspath(__file__))]
    for target in targets:
        # A directory contributes the .txt files inside it; a single .txt
        # file is taken as-is; anything else is skipped.
        if os.path.isdir(target):
            # Search inside the given directory, not the working directory.
            candidates = glob.glob(os.path.join(glob.escape(target), '*.txt'))
        elif os.path.splitext(target)[1].lower() == '.txt':
            candidates = [target]
        else:
            continue
        for filename in candidates:
            # Skip files this script already produced.
            if os.path.splitext(filename)[0].endswith('_denovel'):
                continue
            try:
                denovel(filename)
            except Exception as exc:
                # Keep going with the remaining files, but report the failure
                # instead of hiding it.
                print('denovel: failed to process {}: {}'.format(filename, exc),
                      file=sys.stderr)

评论列表( 0 )

你可以在登录后,对此项目发表评论

4_float_left_people 4_float_left_close