1 Star 0 Fork 0

iyuedu / iyuedu_pypc

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
shuju.py 5.96 KB
一键复制 编辑 原始数据 按行查看 历史
zcc123 提交于 2023-11-28 16:22 . 更新
import os
import re
import time
import requests
from bs4 import BeautifulSoup
# Target site identifier and output locations for scraped data.
siteName = 'zongheng'
file_root = './app_site/{}'.format(siteName)
image_root = './app_site/{}/image'.format(siteName)
def getHtml(url):
    """Fetch `url` and return the page parsed as a BeautifulSoup document.

    Raises the usual `requests` exceptions on network errors or timeout.
    """
    # A timeout keeps the crawler from hanging forever on a dead connection.
    html = requests.get(url, timeout=30)
    htmlCode = html.text
    # Parse with the stdlib html.parser backend (no extra dependency).
    return BeautifulSoup(htmlCode, 'html.parser')
# Get books from a listing page.
def getBook(url):
    """Scrape a store listing page and return a list of book-metadata dicts.

    Each dict has keys: bookId, bookName, bookUrl, bookImg, bookIntro,
    bookType, author, status, updateTime.
    """
    soup = getHtml(url)
    # Pre-compile the bookId pattern once instead of re-building it per book.
    pattern = re.compile(r'//book.zongheng.com/book/(.*?).html')
    bookLists = []
    # Each ".bookbox" div holds one book entry.
    for item in soup.find_all('div', class_="bookbox"):
        # Query each sub-div once instead of repeatedly.
        imgBox = item.find('div', class_="bookimg")
        linkBox = item.find('div', class_="bookilnk")
        linkAnchors = linkBox.find_all('a')
        linkSpans = linkBox.find_all('span')
        bookUrl = imgBox.find('a')['href']
        match = pattern.search(bookUrl)
        if not match:
            # Unexpected URL shape: skip the entry rather than crash on .group(1).
            continue
        bookLists.append({
            'bookId': match.group(1),
            'bookName': item.find('div', class_="bookname").find('a').string,
            'bookUrl': bookUrl,
            'bookImg': imgBox.find('img')['src'],
            'bookIntro': item.find('div', class_="bookintro").string,
            'bookType': linkAnchors[1].string,
            'author': linkAnchors[0].string,
            'status': linkSpans[0].string,
            'updateTime': linkSpans[1].string
        })
    return bookLists
# Fetch all chapters of one book and append them to a single text file.
def getBookCapture(book):
    fileBookDir = '%s/books/'%file_root
    # Make sure the per-site books directory exists before writing.
    checkBookDir(fileBookDir)
    # Zongheng's chapter-list API returns every chapter for a bookId.
    rsp = requests.post('https://bookapi.zongheng.com/api/chapter/getChapterList', {
        'bookId': book['bookId']
    })
    listBox = []
    if rsp.status_code == 200:
        res = rsp.json()
        if res['code'] == 0:
            listBox = res['result']['chapterList']
    # One `item` per volume ("tome"); each holds its chapter list.
    for item in listBox:
        titleName = item['tome']['tomeName']
        # Extract each chapter's content.
        for index,v in enumerate(item['chapterViewList']):
            # Aggregate file that accumulates every chapter of the book.
            fileBookTextAllPath = '{}/books/{}_all.txt'.format(file_root, book['bookName'])
            # Throttle requests between chapters.
            time.sleep(2)
            # Read the chapter content page.
            contentUrl = 'https://read.zongheng.com/chapter/{}/{}.html'.format(book['bookId'], v['chapterId'])
            txt = getBookCaptureContent(contentUrl)
            # Book-level header and per-chapter header text.
            boxTitle = '书名:{}\n作者:{}\n简介:\n{}\n\n'.format(book['bookName'], book['author'], book['bookIntro'])
            txtTitle = '章节:{}\n本章字数:{}\n更新时间:{}\n\n'.format(txt['title'], txt['count'], txt['updateTime'])
            if index == 0:
                # First chapter of a volume: prepend the volume header.
                # NOTE(review): index is always 0 here, so index + 1 is always 1;
                # likely intended to be a running volume counter — confirm.
                boxTitle = '第{}卷:{}\n{}'.format(index + 1, titleName, boxTitle)
            #
            if not os.path.isfile(fileBookTextAllPath):
                # Aggregate file does not exist yet: create it with full headers.
                allText = '{}{}{}'.format(boxTitle, txtTitle, txt['text'])
                writeTxtContent(fileBookTextAllPath, allText, 'w')
            # Re-read the aggregate file to skip chapters already written.
            txtReadAllContent = readTxtContent(fileBookTextAllPath)
            if txtTitle not in txtReadAllContent:
                txtContent = txtTitle + txt['text']
                writeTxtContent(fileBookTextAllPath, txtContent)
# Get the content of a single chapter page.
def getBookCaptureContent(url):
    """Fetch one chapter page and return its metadata and body text.

    Returns a dict with keys: title, author, count, updateTime, text.
    """
    soup = getHtml(url)
    info = soup.find('div', class_='bookinfo')
    infoSpans = info.find_all('span')
    paragraphs = (
        soup.find('div', class_='reader-box')
            .find('div', class_='content')
            .find_all('p')
    )
    # Keep only paragraphs that carry plain text, one per line.
    body = ''.join('{}\n'.format(p.string) for p in paragraphs if p.string)
    return {
        'title': soup.find('div', class_='title').find('div', class_='title_txtbox').string,
        'author': infoSpans[0].find('i').string,
        'count': infoSpans[1].find('i').string,
        'updateTime': infoSpans[2].find('i').string,
        'text': body
    }
# Write content to a file.
def writeTxtContent(filePath, txt, mode='a'):
    """Write `txt` to `filePath` using `mode` ('a' = append by default)."""
    # The `with` block closes the file automatically; the original's
    # explicit f.close() inside it was redundant and has been removed.
    with open(filePath, mode, encoding='utf-8') as f:
        f.write(txt)
# Read the content of a file.
def readTxtContent(filePath):
    """Return the full text of `filePath`, or '' if the file is missing."""
    # Guard clause: a missing file yields an empty string, not an error.
    if not os.path.isfile(filePath):
        return ''
    with open(filePath, 'r', encoding='utf-8') as f:
        return f.read()
# Create a directory.
def checkBookDir(dirPath):
    """Create `dirPath` (and parents) if missing; abort the script on failure."""
    # makedirs(exist_ok=True) is already a no-op for existing directories,
    # so the original's convoluted os.path.exists(...) pre-check is unneeded.
    try:
        os.makedirs(dirPath, exist_ok=True)
    except OSError as error:
        print(f"创建文件夹失败: {os.path.dirname(dirPath)}")
        exit()
# Download an image.
def downloadImage(image_url, image_path):
    """Download `image_url` and save it to image_root + image_path."""
    response = requests.get(image_url, stream=True)
    if response.status_code == 200:
        # Ensure the image directory exists before writing.
        checkBookDir(image_root)
        with open(image_root + image_path, 'wb') as f:
            # Stream in chunks so large images are not held fully in memory
            # (the original read response.content, defeating stream=True);
            # `with` closes the file, so no explicit close() is needed.
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    else:
        print(f"Unable to download image. HTTP response code: {response.status_code}")
# Start the crawler.
def starter(url):
    """Crawl the listing page at `url` and archive every book found."""
    books = getBook(url)
    print('数据抓取中...')
    for book in books:
        # downloadImage(book['bookImg'], '/%s.jpg'%(book['bookId']))
        # Pause between books to stay polite to the server.
        time.sleep(2)
        getBookCapture(book)
    print('数据加载完成!')
# Entry point: crawl the Zongheng Chinese literature store listing page.
starter('https://book.zongheng.com/store/c1/c1003/b0/u0/p0/v0/s1/t0/u0/i1/ALL.html')
1
https://gitee.com/iyuedu/iyuedu_pypc.git
git@gitee.com:iyuedu/iyuedu_pypc.git
iyuedu
iyuedu_pypc
iyuedu_pypc
master

搜索帮助