Python抓取漫畫

import html
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from lxml import etree

# HTTP request headers passed as ``headers=headers`` to the requests calls in
# this file.
headers = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1295.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.5 WindowsWechat',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    # Fixed: the first tag was the typo "h-CN" and the last range carried two
    # q-values ("en;q=0.5;q=0.4"), which is not a valid Accept-Language value.
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5',
    # NOTE(review): hard-coded session cookie; presumably copied from a browser
    # session -- confirm it is still valid and acceptable to keep in source.
    'Cookie': 'token=5300230093002300730053009300160057008600E6001600D600; sid=9572924; UM_distinctid=271545721543f4-06ddf5f30aa90b-8011274-1ea000-17154572159ce'
}


def findImg(url, tags):
    """Collect ``<img>`` tags from a page and from every page that follows it.

    Starting at *url*, each page is fetched and one ``<img src="...">`` string
    is appended to *tags* (and printed) for every ``<img class="lazy">``
    element, using the value of its ``data-original`` attribute.  The link
    under ``<td class="next">`` is then followed, and the walk stops when a
    page has no such link or the link leads to a page that was already read.

    Args:
        url: Address of the first page to read.
        tags: List that receives the generated tags; it is mutated in place.

    Returns:
        The same *tags* list that was passed in.

    Raises:
        requests.RequestException: If a page cannot be fetched, including
            when the 30 second timeout expires.
    """
    visited = set()
    # Iterate instead of recursing so a long series cannot exhaust the call
    # stack, and remember visited pages so a looping "next" link terminates.
    while url and url not in visited:
        visited.add(url)
        # NOTE(review): this page is requested with POST while the other
        # requests in this file use GET; kept as-is -- confirm it is required.
        response = requests.post(url, headers=headers, timeout=30)
        page_doc = etree.HTML(response.text)
        for img in page_doc.xpath('//img[@class="lazy"]'):
            src = str(img.xpath('string(./@data-original)'))
            # The address comes from the remote page, so escape it before
            # embedding it in an HTML attribute.
            img_tag = '<img src="%s">' % html.escape(src, quote=True)
            print(img_tag)
            tags.append(img_tag)
        next_href = str(page_doc.xpath('string(//td[@class="next"]/a/@href)'))
        # urljoin resolves relative names against the current page exactly as
        # the old string slicing did, and also copes with absolute hrefs.
        url = urljoin(url, next_href) if next_href else None
    return tags


def find_books(index_url='http://c1190.w3592.s5672712.iscqg.cn/manhua/#gohome'):
    """Yield an absolute URL for every book link found on the index page.

    The index page is downloaded once and every ``<a>`` matched by the XPath
    ``//div/ul/li/a`` contributes its ``href``, resolved against *index_url*.
    Anchors without an ``href`` are skipped, because resolving an empty value
    would only yield the index page itself.

    Args:
        index_url: Address of the index page; defaults to the site the script
            was written for.

    Yields:
        Absolute URLs of the linked pages, in document order.

    Raises:
        requests.RequestException: If the index page cannot be fetched,
            including when the 30 second timeout expires.
    """
    response = requests.get(index_url, headers=headers, timeout=30)
    page_doc = etree.HTML(response.text)
    for anchor in page_doc.xpath('//div/ul/li/a'):
        href = str(anchor.xpath('string(./@href)'))
        if not href:
            continue
        # urljoin drops the "#gohome" fragment and resolves relative hrefs
        # against the ".../manhua/" directory, matching the old concatenation.
        yield urljoin(index_url, href)


# Skeleton of the generated page; filled via str.format with the escaped book
# title and the concatenated <img> tags.  Literal CSS braces are doubled.
PAGE_TEMPLATE = """<!DOCTYPE html>
                <html lang="en">
                <head>
                    <meta charset="UTF-8">
                    <title>{}</title>
                    <style>
                        img {{
                            width: 100%;
                            padding: 0px;
                            margin: 0px;
                            overflow: hidden;
                            display: block;
                            max-width: 100%;
                        }}
                    </style>
                </head>
                <body>
                    {}
                </body></html>"""


def safe_filename(name, fallback='untitled'):
    """Return *name* reduced to something usable as a single file name.

    Runs of whitespace are collapsed to one space, path separators, control
    characters and the characters ``: * ? " < > |`` become ``_``, and leading
    or trailing spaces and dots are removed.  *fallback* is returned when
    nothing is left.
    """
    collapsed = ' '.join(name.split())
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', collapsed).strip(' .')
    return cleaned or fallback


def render_book_html(title, tags):
    """Return the full HTML page for a book titled *title* containing *tags*.

    *title* is HTML-escaped; *tags* are expected to be ready-made markup and
    are concatenated unchanged.
    """
    return PAGE_TEMPLATE.format(html.escape(title), "".join(tags))


def main():
    """Download every book found by ``find_books`` into ``./<title>.html``."""
    for bookUrl in find_books():
        doc = etree.HTML(requests.get(bookUrl, headers=headers, timeout=30).text)
        first_href = str(doc.xpath(
            'string(//div[@class="detailMenuList"]/div[@class="titleDiv"][1]/a/@href)'))
        book_name = str(doc.xpath('string(//title/text())'))
        if not first_href:
            # Without a first-chapter link there is nothing to crawl.
            print('no first chapter found, skipping', bookUrl)
            continue
        first_chapter_url = urljoin(bookUrl, first_href)
        print(first_chapter_url, book_name)

        tags = findImg(first_chapter_url, [])

        # The page declares UTF-8, so write it as UTF-8 regardless of the
        # platform's default encoding.
        with open('./%s.html' % safe_filename(book_name), mode='w', encoding='utf-8') as f:
            f.write(render_book_html(book_name, tags))


if __name__ == '__main__':
    main()

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章