网页解析之 XPath

一、简介

二、语法

三、XPath 练习：

爬取全书网玄幻魔法分类中的完本小说

import requests
from lxml import etree
import re
import time
from threading import Thread
from multiprocessing.pool import ThreadPool


def my_session(url, headers=None, timeout=30):
    """Fetch *url* with a GET request and return the body decoded as GBK.

    :param url: address of the page to download
    :param headers: optional mapping of extra HTTP request headers; ``None``
        sends only the defaults of ``requests``
    :param timeout: seconds to wait for the server before ``requests``
        raises a timeout error instead of blocking forever
    :return: the page source as a ``str``
    """
    # The context manager closes the session (and its pooled connections)
    # once the response body has been downloaded.
    with requests.session() as session:
        html = session.get(url, headers=headers, timeout=timeout)
    # The response is decoded as GBK regardless of what the server declares.
    html.encoding = "gbk"
    return html.text


def save_story(chapter_urls, titles, number, max_retries=3):
    """
    Download every chapter of one book and append it to the book's text file.

    :param chapter_urls: URLs of the book's chapters, a list
    :param titles: titles of all books, a list
    :param number: index into ``titles`` of the book being saved; the title
        found there is used as the output file name
    :param max_retries: how many extra times a chapter page is re-downloaded
        when it cannot be parsed, before that chapter is skipped
    :return: None
    """
    # All chapters of a book go into the same file, so build the path once.
    story_path = r"D:\Python学院学习环境\pachong\story\{}.txt".format(titles[number])
    for url in chapter_urls:
        selector_chapter = etree.HTML(my_session(url))
        # etree.HTML can yield None for an unparseable page; retry a bounded
        # number of times instead of looping forever on a dead page.
        retries_left = max_retries
        while selector_chapter is None and retries_left > 0:
            retries_left -= 1
            selector_chapter = etree.HTML(my_session(url))
        if selector_chapter is None:
            print("章节解析失败，已跳过：{}".format(url))
            continue
        chapter_title = selector_chapter.xpath('//h1/strong/text()')  # chapter heading
        chapter_content = selector_chapter.xpath('//div[@id="content"]/text()')  # chapter body
        print(chapter_title)
        chapter_content_msg = "".join(chapter_content)  # text nodes -> one string
        # Strips non-breaking spaces and every letter "d", together with any
        # dots directly in front of them.
        # NOTE(review): removing every "d" looks unintended (perhaps "\d" was
        # meant) - confirm before changing; the pattern is kept as it was.
        chapter_content_res = re.sub(r"\.*?\xa0|\.*?d", "", chapter_content_msg)
        # Collapse CRLF line endings into a bare carriage return.
        chapter_content_result = chapter_content_res.replace("\r\n", "\r")
        # Append chapter by chapter so finished chapters are on disk even if a
        # later download fails.
        with open(story_path, "a") as f:
            f.write("".join(chapter_title) + "\n")  # chapter heading
            f.write(chapter_content_result + "\n")  # chapter body


if __name__ == "__main__":
    # Script entry point: read the book index page, write a header file
    # (author + synopsis) for books 5-9 of the listing, then download each
    # book's chapters on a thread pool and report the elapsed time.
    pool = ThreadPool(5)  # pool of 5 worker threads
    start_time = time.time()
    index_url = "http://www.quanshuwang.com/all/allvisit_1_0_0_0_1_0_1.html"
    html = my_session(index_url)
    selector = etree.HTML(html)
    title = selector.xpath('//div[@class="yd-book-item yd-book-item-pull-left"]/a/h2/text()')  # book titles
    author = selector.xpath('//div[@class="yd-book-item yd-book-item-pull-left"]/div[@class="author-container"]/dl/dd/p/text()')  # book authors
    story_url = selector.xpath('//div[@class="yd-book-item yd-book-item-pull-left"]/a/@href')  # book detail-page URLs
    print(title)
    # The three lists are indexed in parallel below; stop at the shortest one
    # so a short index page cannot raise IndexError.
    book_count = min(len(title), len(author), len(story_url))
    pending = []  # (book title, AsyncResult) pairs, checked after the pool drains
    for i in range(5, min(10, book_count)):
        html = my_session(story_url[i])  # book detail page
        selector_url = etree.HTML(html)  # a fresh selector is needed for every page
        synopsis = selector_url.xpath('//div[@id="waa"]/text()')  # book synopsis
        read_url = selector_url.xpath('//div[@class="detail"]/a/@href')  # link to the chapter list
        if not read_url:
            # Without a chapter-list link there is nothing to download.
            print("未找到阅读链接，已跳过：{}".format(title[i]))
            continue
        html_content = my_session(read_url[0])  # chapter-list page
        selector_content_url = etree.HTML(html_content)
        chapter_url = selector_content_url.xpath('//div[@class="clearfix dirconone"]/li/a/@href')  # every chapter URL
        print(chapter_url, len(chapter_url))
        synopsis_str = "".join(synopsis)  # text nodes -> one string
        print(synopsis_str)
        synopsis_res = synopsis_str.replace("\xa0\xa0\xa0\xa0", "    ")  # nbsp indent -> plain spaces
        # Create (or truncate) the book's file and write its header;
        # save_story appends the chapters to the same file afterwards.
        with open(r"D:\Python学院学习环境\pachong\story\{}.txt".format(title[i]), "w") as file:
            file.write(author[i] + "\n")
            file.write(synopsis_res)
        # One pool task per book; keep the result handle so a failure inside
        # the worker is not silently lost.
        pending.append((title[i], pool.apply_async(save_story, args=(chapter_url, title, i))))
    pool.close()
    pool.join()
    # All tasks are finished here, so get() returns at once or re-raises the
    # exception that ended the worker.
    for book_title, result in pending:
        try:
            result.get()
        except Exception as exc:
            print("下载失败：{}，原因：{!r}".format(book_title, exc))
    print("耗时：{}".format(time.time()-start_time))


# Timing notes kept as bare string literals (no-op expression statements).
# Each records the elapsed time of one run variant: without threads, with
# threads started from a single for loop, and with two threads.
# The values are presumably seconds, matching the time.time() difference the
# script prints - TODO confirm.
"不使用线程耗时：514.7513475418091"

"一个for循环启动线程（相当於单线程，多个线程顺序执行）耗时：526.5542323589325"

"双线程耗时：291.0101172924042"

效果图展示