很多网页的源码存在编码声明错误或标签不规范的问题。这里提供一个通用的处理方法:先修正响应的编码,再用 html5lib 将 HTML 补全并规范化,便于取到源码后快速进行 XPath 解析。
直接上源码:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File : 正确的取网页源码方式.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date : 2020/1/24
import requests
from bs4 import BeautifulSoup #pip install bs4/ pip install html5lib
def redecode(r):
    """Re-decode a fetched response and return normalised, pretty-printed HTML.

    The encoding detected from the body (``r.apparent_encoding``) is used to
    decode the response. If nothing was detected, or the detected name
    contains "gb" (e.g. GB2312, GBK), ``gb18030`` is used instead.

    Args:
        r: A response object exposing ``apparent_encoding``, a writable
            ``encoding`` and ``text`` -- presumably a ``requests.Response``.

    Returns:
        The page markup as a string, after parsing it with BeautifulSoup's
        ``html5lib`` parser and serialising it with ``prettify()``.

    Note:
        ``r.encoding`` is overwritten as a side effect.
    """
    # Read the detected encoding exactly once: on a requests.Response every
    # access to ``apparent_encoding`` re-runs charset detection over the body.
    detected = r.apparent_encoding
    if detected and "gb" not in detected.lower():
        r.encoding = detected
    else:
        # Presumably chosen because gb18030 is a superset of GBK/GB2312, so it
        # decodes pages that declare or are detected as the narrower charsets.
        r.encoding = "gb18030"
    # Parse the stripped text with html5lib and return the re-serialised markup.
    soup = BeautifulSoup(r.text.strip(), "html5lib")
    return soup.prettify()
if __name__ == '__main__':
    # Demo: fetch a page, re-decode/normalise it, and print the result.
    # requests applies no timeout by default, so pass one explicitly to keep
    # the script from hanging forever on an unresponsive server.
    r = requests.get("https://www.baidu.com/", timeout=10)
    page_source = redecode(r)
    print(page_source)