import requests
from bs4 import BeautifulSoup
from lxml import etree
# HTTP request headers passed to every requests.get / requests.post call in
# this script. They make the requests look like they come from a desktop
# browser session.
headers = {
    'Upgrade-Insecure-Requests': '1',
    # The User-Agent value contains two browser identifications back to back
    # (a QQBrowser one and a WeChat-for-Windows one).
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1295.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.5 WindowsWechat',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    # NOTE(review): 'h-CN' looks like a truncated 'zh-CN', and the last entry
    # carries two q-values ('en;q=0.5;q=0.4') - confirm against a real browser
    # capture before changing, since the server may or may not care.
    'Accept-Language': 'h-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4',
    # NOTE(review): hard-coded session cookie (token / sid / analytics id).
    # Presumably copied from a logged-in browser session; it will likely
    # expire and should not live in source control - confirm and externalize.
    'Cookie': 'token=5300230093002300730053009300160057008600E6001600D600; sid=9572924; UM_distinctid=271545721543f4-06ddf5f30aa90b-8011274-1ea000-17154572159ce'
}
def findImg(url, tags):
    """Collect ``<img>`` tag strings for a page and every page chained after it.

    Starting at *url*, each page is fetched and an ``<img src="...">`` string
    is built from the ``data-original`` attribute of every ``<img>`` whose
    class is exactly ``lazy``. The link inside ``<td class="next">`` is then
    followed, and the walk stops when a page has no such link.

    Args:
        url: Address of the first page to fetch.
        tags: List the generated tag strings are appended to (mutated in
            place).

    Returns:
        The same *tags* list that was passed in.
    """
    # Pages are walked with a loop rather than recursion: one stack frame per
    # page would hit Python's recursion limit on long chains. The visited set
    # stops the walk if a "next" link ever points back to an earlier page.
    visited = set()
    while url not in visited:
        visited.add(url)
        # NOTE(review): the page is requested with POST, as in the original
        # code - confirm the site really requires POST here.
        pageDoc = etree.HTML(requests.post(url, headers=headers).text)
        for img in pageDoc.xpath('//img[@class="lazy"]'):
            img_tag = '<img src="%s">' % str(img.xpath('string(./@data-original)'))
            print(img_tag)
            tags.append(img_tag)
        # The "next" href is treated as relative to the current page's
        # directory (everything up to the last '/').
        base_url = '/'.join(url.split('/')[:-1]) + '/'
        next_href = pageDoc.xpath('string(//td[@class="next"]/a/@href)')
        if not next_href:
            break
        url = base_url + next_href
    return tags
def find_books():
    """Yield one URL per book link found on the comic index page.

    The index page is downloaded with the module-level ``headers``; every
    ``<a>`` matched by ``//div/ul/li/a`` contributes its ``href`` value,
    appended to the site's ``/manhua/`` base address.

    Yields:
        The base address concatenated with each link's ``href``.
    """
    index_url = 'http://c1190.w3592.s5672712.iscqg.cn/manhua/#gohome'
    book_prefix = 'http://c1190.w3592.s5672712.iscqg.cn/manhua/'

    response = requests.get(index_url, headers=headers)
    index_doc = etree.HTML(response.text)
    anchors = index_doc.xpath('//div/ul/li/a')
    for anchor in anchors:
        relative_link = anchor.xpath('string(./@href)')
        yield book_prefix + relative_link
if __name__ == '__main__':
    # Script entry point: for every book on the index page, gather the image
    # tags of its chapter pages and write them into one standalone HTML file
    # named after the book's <title>.
    import html  # local import: only the script entry point needs it

    # Skeleton of the generated page. The first placeholder receives the
    # (escaped) book title, the second the concatenated <img> tags. Doubled
    # braces are literal CSS braces for str.format.
    page_template = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{}</title>
<style>
img {{
width: 100%;
padding: 0px;
margin: 0px;
overflow: hidden;
display: block;
max-width: 100%;
}}
</style>
</head>
<body>
{}
</body></html>"""

    # Characters that cannot appear in a file name on Windows and/or POSIX
    # (plus line breaks and tabs) are replaced with '_' so that a page title
    # can be used safely as a file name.
    unsafe_chars = dict.fromkeys(map(ord, '\\/:*?"<>|\r\n\t'), '_')

    for bookUrl in find_books():
        doc = etree.HTML(requests.get(bookUrl, headers=headers).text)
        # The first chapter link is relative to the book page's directory.
        first_chapter_url = '/'.join(bookUrl.split('/')[:-1]) + '/' + doc.xpath(
            'string(//div[@class="detailMenuList"]/div[@class="titleDiv"][1]/a/@href)')
        book_name = str(doc.xpath('string(//title/text())'))
        print(first_chapter_url, book_name)
        tags = findImg(first_chapter_url, [])
        # Escape the title: the parsed text may contain '&' or '<'.
        lines = page_template.format(html.escape(book_name), "".join(tags))
        file_name = book_name.translate(unsafe_chars).strip() or 'untitled'
        # Write as UTF-8 explicitly so the bytes match the page's
        # <meta charset="UTF-8"> regardless of the platform's default encoding.
        with open('./%s.html' % file_name, mode='w', encoding='utf-8') as f:
            f.write(lines)
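# --- Web-page text captured together with the script (article title and comment section) ---
# Python抓取漫畫
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.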