很多網頁的源碼存在問題(例如編碼聲明錯誤導致亂碼、標籤不規範),這裏提供一個通用的處理方法,便於取到源碼以後快速進行 xpath 解析。
直接上源碼:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File : 正確的取網頁源碼方式.py
# Author: DaShenHan&道長-----先苦後甜,任憑晚風拂柳顏------
# Date : 2020/1/24
import requests
from bs4 import BeautifulSoup #pip install bs4/ pip install html5lib
def redecode(r) -> str:
    """Re-decode a fetched response and return normalized, pretty-printed HTML.

    The response's encoding is overridden with the encoding detected from the
    body bytes. If the detected name contains "gb" (case-insensitive), or if
    nothing was detected at all, ``gb18030`` is used instead. The decoded
    text is then parsed by BeautifulSoup with the ``html5lib`` parser and
    re-serialized, so the result is easier to feed to an XPath-based parser.

    Args:
        r: A ``requests`` response object. Its ``encoding`` attribute is
            overwritten as a side effect.

    Returns:
        The page source as re-serialized by ``BeautifulSoup.prettify()``.
    """
    # ``apparent_encoding`` runs charset detection over the whole body each
    # time it is accessed, so read it once instead of up to three times.
    detected = r.apparent_encoding
    if detected and "gb" not in detected.lower():
        r.encoding = detected
    else:
        # gb18030 is a superset of GB2312/GBK, so it decodes any GB-family
        # page; it is also the fallback when detection yields nothing.
        r.encoding = "gb18030"

    # ``r.text`` must be read only after ``r.encoding`` has been set above.
    soup = BeautifulSoup(r.text.strip(), "html5lib")
    return soup.prettify()
if __name__ == '__main__':
    # Demo: fetch a page, normalize its source with redecode() and print it.
    # requests applies no timeout by default, so an unresponsive server would
    # block this script forever; bound the wait explicitly.
    r = requests.get("https://www.baidu.com/", timeout=10)
    page_source = redecode(r)
    print(page_source)