Downloading a novel with a Python crawler

import os
import re
from lxml import etree
import requests
from fake_useragent import UserAgent

url = "http://www.17k.com"
headers = {"User-Agent": UserAgent().chrome}

def getHtml(url):
    # Fetch a page and return its decoded HTML, or None on any failure
    try:
        response = requests.get(url, headers=headers)
        html = response.content.decode("utf-8")
    except Exception:
        html = None
    return html

def getPUrl(html):
    # Parse the chapter list page: return the chapter URLs and the novel title
    chapters = []
    elements = etree.HTML(html)
    links = elements.xpath("//div[@class='Main List']//dd/a/@href")  # chapter links
    name = elements.xpath("//div[@class='Main List']//h1/text()")[0]  # novel title
    for link in links:
        chapters.append(url + link)  # join relative links with the site's base URL
    return chapters, name

def saveT(chapters, name):
    cd = "E:/" + name + "/"  # save the novel under E:/name/
    if not os.path.exists(cd):  # create the folder on E: if it does not exist yet
        os.mkdir(cd)
    for i in chapters:  # visit each chapter link and fetch its content
        response = requests.get(i, headers=headers)
        con = response.content.decode("utf-8")
        elem = etree.HTML(con)
        conText = elem.xpath("//div[@class='p']/text()")  # chapter body text
        ts = elem.xpath("//h1/text()")[0]  # chapter title
        ts = re.sub(u"[^\u4e00-\u9fa5\u0030-\u0039]", "", ts)  # keep only Chinese characters and digits in the title
        with open(cd + ts + ".txt", "a+", encoding="utf-8") as f:  # append the chapter text to its own file
            for val in conText:
                f.write(val + "\n")
        print("%s —— %s download complete: %s" % (name, ts, i))  # print progress so problems are easy to spot

if __name__ == '__main__':
    html = getHtml("http://www.17k.com/list/2816298.html")
    links, name = getPUrl(html)
    saveT(links, name)

Execution screenshot
Result screenshot