1 Star 0 Fork 0

iyuedu / iyuedu_pypc

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
shuju.py 5.96 KB
一键复制 编辑 原始数据 按行查看 历史
zcc123 提交于 2023-11-28 16:22 . 更新
import os
import re
import time
import requests
from bs4 import BeautifulSoup
# Target site identifier and output locations for scraped data.
siteName = 'zongheng'
file_root = './app_site/{}'.format(siteName)
image_root = './app_site/{}/image'.format(siteName)
def getHtml(url):
    """Fetch `url` and return the page parsed as a BeautifulSoup document.

    Raises the usual `requests` exceptions on network errors or timeout.
    """
    # A timeout keeps the crawler from hanging forever on a dead connection.
    html = requests.get(url, timeout=30)
    htmlCode = html.text
    # Parse with the stdlib html.parser backend (no extra dependency).
    return BeautifulSoup(htmlCode, 'html.parser')
# Get books from a listing page.
def getBook(url):
    """Scrape a store listing page and return a list of book-metadata dicts.

    Each dict has keys: bookId, bookName, bookUrl, bookImg, bookIntro,
    bookType, author, status, updateTime.
    """
    soup = getHtml(url)
    # Pre-compile the bookId pattern once instead of re-building it per book.
    pattern = re.compile(r'//book.zongheng.com/book/(.*?).html')
    bookLists = []
    # Each ".bookbox" div holds one book entry.
    for item in soup.find_all('div', class_="bookbox"):
        # Query each sub-div once instead of repeatedly.
        imgBox = item.find('div', class_="bookimg")
        linkBox = item.find('div', class_="bookilnk")
        linkAnchors = linkBox.find_all('a')
        linkSpans = linkBox.find_all('span')
        bookUrl = imgBox.find('a')['href']
        match = pattern.search(bookUrl)
        if not match:
            # Unexpected URL shape: skip the entry rather than crash on .group(1).
            continue
        bookLists.append({
            'bookId': match.group(1),
            'bookName': item.find('div', class_="bookname").find('a').string,
            'bookUrl': bookUrl,
            'bookImg': imgBox.find('img')['src'],
            'bookIntro': item.find('div', class_="bookintro").string,
            'bookType': linkAnchors[1].string,
            'author': linkAnchors[0].string,
            'status': linkSpans[0].string,
            'updateTime': linkSpans[1].string
        })
    return bookLists
# Fetch all chapters of one book and append them to a single text file.
def getBookCapture(book):
    fileBookDir = '%s/books/'%file_root
    # Make sure the per-site books directory exists before writing.
    checkBookDir(fileBookDir)
    # Zongheng's chapter-list API returns every chapter for a bookId.
    rsp = requests.post('https://bookapi.zongheng.com/api/chapter/getChapterList', {
        'bookId': book['bookId']
    })
    listBox = []
    if rsp.status_code == 200:
        res = rsp.json()
        if res['code'] == 0:
            listBox = res['result']['chapterList']
    # One `item` per volume ("tome"); each holds its chapter list.
    for item in listBox:
        titleName = item['tome']['tomeName']
        # Extract each chapter's content.
        for index,v in enumerate(item['chapterViewList']):
            # Aggregate file that accumulates every chapter of the book.
            fileBookTextAllPath = '{}/books/{}_all.txt'.format(file_root, book['bookName'])
            # Throttle requests between chapters.
            time.sleep(2)
            # Read the chapter content page.
            contentUrl = 'https://read.zongheng.com/chapter/{}/{}.html'.format(book['bookId'], v['chapterId'])
            txt = getBookCaptureContent(contentUrl)
            # Book-level header and per-chapter header text.
            boxTitle = '书名:{}\n作者:{}\n简介:\n{}\n\n'.format(book['bookName'], book['author'], book['bookIntro'])
            txtTitle = '章节:{}\n本章字数:{}\n更新时间:{}\n\n'.format(txt['title'], txt['count'], txt['updateTime'])
            if index == 0:
                # First chapter of a volume: prepend the volume header.
                # NOTE(review): index is always 0 here, so index + 1 is always 1;
                # likely intended to be a running volume counter — confirm.
                boxTitle = '第{}卷:{}\n{}'.format(index + 1, titleName, boxTitle)
            #
            if not os.path.isfile(fileBookTextAllPath):
                # Aggregate file does not exist yet: create it with full headers.
                allText = '{}{}{}'.format(boxTitle, txtTitle, txt['text'])
                writeTxtContent(fileBookTextAllPath, allText, 'w')
            # Re-read the aggregate file to skip chapters already written.
            txtReadAllContent = readTxtContent(fileBookTextAllPath)
            if txtTitle not in txtReadAllContent:
                txtContent = txtTitle + txt['text']
                writeTxtContent(fileBookTextAllPath, txtContent)
# Get the content of a single chapter page.
def getBookCaptureContent(url):
    """Fetch one chapter page and return its metadata and body text.

    Returns a dict with keys: title, author, count, updateTime, text.
    """
    soup = getHtml(url)
    info = soup.find('div', class_='bookinfo')
    infoSpans = info.find_all('span')
    paragraphs = (
        soup.find('div', class_='reader-box')
            .find('div', class_='content')
            .find_all('p')
    )
    # Keep only paragraphs that carry plain text, one per line.
    body = ''.join('{}\n'.format(p.string) for p in paragraphs if p.string)
    return {
        'title': soup.find('div', class_='title').find('div', class_='title_txtbox').string,
        'author': infoSpans[0].find('i').string,
        'count': infoSpans[1].find('i').string,
        'updateTime': infoSpans[2].find('i').string,
        'text': body
    }
# Write content to a file.
def writeTxtContent(filePath, txt, mode='a'):
    """Write `txt` to `filePath` using `mode` ('a' = append by default)."""
    # The `with` block closes the file automatically; the original's
    # explicit f.close() inside it was redundant and has been removed.
    with open(filePath, mode, encoding='utf-8') as f:
        f.write(txt)
# Read the content of a file.
def readTxtContent(filePath):
    """Return the full text of `filePath`, or '' if the file is missing."""
    # Guard clause: a missing file yields an empty string, not an error.
    if not os.path.isfile(filePath):
        return ''
    with open(filePath, 'r', encoding='utf-8') as f:
        return f.read()
# Create a directory.
def checkBookDir(dirPath):
    """Create `dirPath` (and parents) if missing; abort the script on failure."""
    # makedirs(exist_ok=True) is already a no-op for existing directories,
    # so the original's convoluted os.path.exists(...) pre-check is unneeded.
    try:
        os.makedirs(dirPath, exist_ok=True)
    except OSError as error:
        print(f"创建文件夹失败: {os.path.dirname(dirPath)}")
        exit()
# Download an image.
def downloadImage(image_url, image_path):
    """Download `image_url` and save it to image_root + image_path."""
    response = requests.get(image_url, stream=True)
    if response.status_code == 200:
        # Ensure the image directory exists before writing.
        checkBookDir(image_root)
        with open(image_root + image_path, 'wb') as f:
            # Stream in chunks so large images are not held fully in memory
            # (the original read response.content, defeating stream=True);
            # `with` closes the file, so no explicit close() is needed.
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    else:
        print(f"Unable to download image. HTTP response code: {response.status_code}")
# Start the crawler.
def starter(url):
    """Crawl the listing page at `url` and archive every book found."""
    books = getBook(url)
    print('数据抓取中...')
    for book in books:
        # downloadImage(book['bookImg'], '/%s.jpg'%(book['bookId']))
        # Pause between books to stay polite to the server.
        time.sleep(2)
        getBookCapture(book)
    print('数据加载完成!')
# Entry point: crawl the Zongheng Chinese literature store listing page.
starter('https://book.zongheng.com/store/c1/c1003/b0/u0/p0/v0/s1/t0/u0/i1/ALL.html')
1
https://gitee.com/iyuedu/iyuedu_pypc.git
git@gitee.com:iyuedu/iyuedu_pypc.git
iyuedu
iyuedu_pypc
iyuedu_pypc
master

搜索帮助