目錄:
文章目錄
Day03筆記
requests模塊
常用方法
- get() : 發起請求,獲取響應對象response
-
response對象
response.text
: 字符串
字符編碼 :ISO-8859-1
response.encoding = "utf-8"
2.response.content
: bytes
3.response.status_code
: 返回響應碼
-
沒有查詢參數
res = requests.get(url,headers=headers) -
有查詢參數
params={"wd":"美女"}
res = requests.get(url,params=params,headers=headers) -
for example:
import requests

# Baidu search demo: the params argument of get() must be a dict;
# requests URL-encodes it automatically.
url = "http://www.baidu.com/s?"
headers = {"User-Agent": "Mozilla5.0/"}
keyword = input("請輸入要搜索的內容:")
query = {"wd": keyword}
response = requests.get(url, params=query, headers=headers)
response.encoding = "utf-8"
print(response.text)
-
- post() : 參數名data
-
data={} #data參數爲字典,不用轉爲 bytes 數據類型
-
for example:
import requests
import json

# Youdao translate demo: form fields go into a dict and are
# form-encoded by requests.post() via the data argument.
word = input("請輸入要翻譯的內容:")
data = {
    "i": word,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "1536648321283",
    "sign": "1e7948e25551448dbfb7184f23dc126c",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTIME",
    "typoResult": "false",
}
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {"User-Agent": "Mozilla5.0/"}
response = requests.post(url, data=data, headers=headers)
response.encoding = "utf-8"
# json.loads: JSON-formatted string -> Python dict
result_dict = json.loads(response.text)
print(result_dict["translateResult"][0][0]["tgt"])
-
代理 :proxies
- 爬蟲和反爬蟲鬥爭的第二步
獲取代理IP的網站 - 西刺代理
- 快代理
- 全網代理
- 普通代理 :proxies={"協議":"IP地址:端口號"}
-
proxies={"HTTP":"222.221.11.119:3128"}
-
for example:
import requests

# Plain (public) proxy demo: route the request through an HTTP proxy.
url = "http://www.taobao.com/"
proxies = {"HTTP": "222.221.11.119:3128"}
headers = {"User-Agent": "Mozilla5.0/"}
response = requests.get(url, proxies=proxies, headers=headers)
response.encoding = "utf-8"
print(response.text)
-
- 私密代理 :
-
proxies={"HTTP":"http://309435365:[email protected]:16819"}
-
for example :
import requests

# Private (authenticated) proxy demo: credentials are embedded in the
# proxy URL as user:password@host:port.
url = "http://www.taobao.com/"
headers = {"User-Agent": "Mozilla5.0/"}
proxies = {"HTTP": "http://309435365:[email protected]:16819"}  # 114.67.228.126:16819
response = requests.get(url, proxies=proxies, headers=headers)
response.encoding = "utf-8"
print(response.status_code)
-
爬取鏈家MongoDb
import requests
import re
import pymongo


class LianJiaSpider:
    """Scrape Lianjia second-hand-house listings and store them in MongoDB."""

    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.headers = {"User-Agent": "Mozilla5.0/"}
        self.proxies = {"HTTP": "http://309435365:[email protected]:16819"}
        self.page = 1  # current page number, 1-based
        # MongoDB connection / database / collection handles
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.Lianjia        # database name
        self.myset = self.db.housePrice    # collection (table) name

    def getPage(self, url):
        """Fetch one listing page and hand the HTML to the parser."""
        res = requests.get(url, proxies=self.proxies, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    def parsePage(self, html):
        """Extract (house name, total price) tuples with a regex and save them."""
        p = re.compile('<div class="houseInfo">.*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>', re.S)
        r_list = p.findall(html)  # e.g. [("首科花園", "595"), ...]
        self.writeToMogo(r_list)

    def writeToMogo(self, r_list):
        """Insert one document per listing; price is converted from 萬元 to 元.

        Uses insert_one(): Collection.insert() was deprecated and removed
        in PyMongo 4.x.
        """
        for r_tuple in r_list:
            d = {"houseName": r_tuple[0].strip(),
                 "housePrice": float(r_tuple[1].strip()) * 10000}
            self.myset.insert_one(d)
        print("存入mongodb數據庫成功")
        # Inspect the data in the mongo shell:
        # show dbs;
        # use Lianjia;
        # show tables;
        # db.housePrice.find().pretty();

    def workOn(self):
        """Main loop: fetch pages until the user answers anything but 'y'."""
        while True:
            print("正在爬取%d頁" % self.page)
            url = self.baseurl + str(self.page) + "/"
            self.getPage(url)
            print("第%d頁爬取成功" % self.page)
            c = input("是否繼續爬取(y/n):")
            if c.strip().lower() == "y":
                self.page += 1
            else:
                print("爬取結束,謝謝使用!")
                break


if __name__ == "__main__":
    spider = LianJiaSpider()
    spider.workOn()
-
爬取鏈家MySql
import requests
import re
import pymysql
import warnings


class LianJiaSpider:
    """Scrape Lianjia second-hand-house listings and save them to MySQL."""

    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.headers = {"User-Agent": "Mozilla5.0/"}
        self.proxies = {"HTTP": "http://309435365:[email protected]:16819"}
        self.page = 1  # current page number, 1-based
        # Keyword arguments: PyMySQL 1.0 removed positional connect() args,
        # and keywords also work on older versions.
        self.db = pymysql.connect(host="localhost",
                                  user="root",
                                  password="123456",
                                  charset="utf8")
        self.cursor = self.db.cursor()

    def getPage(self, url):
        """Fetch one listing page and hand the HTML to the parser."""
        res = requests.get(url, proxies=self.proxies, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        print("頁面已獲取,正在解析頁面...")
        self.parsePage(html)

    def parsePage(self, html):
        """Extract (house name, total price) tuples with a regex and save them."""
        p = re.compile('<div class="houseInfo">.*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>', re.S)
        r_list = p.findall(html)  # e.g. [("首科花園", "595"), ...]
        print("正在存入mysql數據庫...")
        self.writeToMysql(r_list)

    def writeToMysql(self, r_list):
        """Create the schema if needed, then insert the scraped rows."""
        c_db = "create database if not exists spider;"
        u_db = "use spider;"
        c_tab = "create table if not exists lianjia(\
                 id int primary key auto_increment,\
                 name varchar(30),\
                 price decimal(20,2))charset=utf8;"
        # "if not exists" statements emit warnings when the object already
        # exists; promote them to exceptions and ignore exactly those.
        warnings.filterwarnings("error")
        try:
            self.cursor.execute(c_db)
        except Warning:
            pass
        self.cursor.execute(u_db)
        try:
            self.cursor.execute(c_tab)
        except Warning:
            pass
        # Parameterized INSERT: the driver escapes values itself, instead of
        # building SQL with % string formatting (avoids SQL injection and
        # breakage on names containing quotes).
        s_insert = "insert into lianjia(name,price) values(%s,%s);"
        for r_tuple in r_list:
            self.cursor.execute(s_insert,
                                (r_tuple[0].strip(),
                                 float(r_tuple[1].strip()) * 10000))
        self.db.commit()
        print("第%d頁存入數據庫成功" % self.page)

    def workOn(self):
        """Main loop: fetch pages until the user answers anything but 'y'."""
        while True:
            print("正在爬取%d頁" % self.page)
            url = self.baseurl + str(self.page) + "/"
            self.getPage(url)
            print("第%d頁爬取成功" % self.page)
            c = input("是否繼續爬取(y/n):")
            if c.strip().lower() == "y":
                self.page += 1
            else:
                print("爬取結束,謝謝使用!")
                break


if __name__ == "__main__":
    spider = LianJiaSpider()
    spider.workOn()
-
鏈家信息保存到本地
import requests
import re


class LianJiaSpider:
    """Scrape Lianjia second-hand-house listings and append them to a text file."""

    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.headers = {"User-Agent": "Mozilla5.0/"}
        self.proxies = {"HTTP": "http://309435365:[email protected]:16819"}
        self.page = 1  # current page number, 1-based

    def getPage(self, url):
        """Fetch one listing page and hand the HTML to the parser."""
        res = requests.get(url, proxies=self.proxies, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    def parsePage(self, html):
        """Extract (house name, total price) tuples with a regex and save them."""
        p = re.compile('<div class="houseInfo">.*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>', re.S)
        r_list = p.findall(html)  # e.g. [("首科花園", "595"), ...]
        self.writePage(r_list)

    def writePage(self, r_list):
        """Append one line per listing, fields separated by a space.

        The original reopened the file once per field inside the inner
        loop; open it a single time per page instead. Encoding is pinned
        to utf-8 so the Chinese text is written consistently on every
        platform.
        """
        with open("鏈家二手房.txt", "a", encoding="utf-8") as f:
            for r_tuple in r_list:  # e.g. ("首科花園", "595")
                f.write("".join(field.strip() + " " for field in r_tuple) + "\n")

    def workOn(self):
        """Main loop: fetch pages until the user answers anything but 'y'."""
        while True:
            print("正在爬取%d頁" % self.page)
            url = self.baseurl + str(self.page) + "/"
            self.getPage(url)
            print("第%d頁爬取成功" % self.page)
            c = input("是否繼續爬取(y/n):")
            if c.strip().lower() == "y":
                self.page += 1
            else:
                print("爬取結束,謝謝使用!")
                break


if __name__ == "__main__":
    spider = LianJiaSpider()
    spider.workOn()
-
Web客戶端驗證 :auth
- auth = ("用戶名","密碼")
auth = ("lht4815","123456789lht")
requests.get(url,auth=auth,headers=headers)
SSL證書認證 : verify
-
verify=True : 默認,做SSL證書認證
-
verify=False : 忽略證書認證
-
for example:
import requests

# SSL demo: verify=False skips certificate validation, needed here
# because the site's certificate cannot be verified by default.
url = "https://www.12306.cn/mormhweb/"
headers = {"User-Agent": "Mozilla5.0/"}
response = requests.get(url, verify=False, headers=headers)
response.encoding = "utf-8"
print(response.text)
Handler處理器(urllib.request)
定義
自定義的urlopen()方法,urlopen方法是一個特殊的opener
常用方法
build_opener(Handler處理器對象)
opener.open(url)
使用流程
- 創建相關的Handler處理器對象
http_handler = urllib.request.HTTPHandler()
- 創建自定義opener對象
opener = urllib.request.build_opener(http_handler)
- 利用opener對象的open方法發請求
Handler處理器分類
- HTTPHandler()
import urllib.request

# HTTPHandler demo: build a custom opener and use its open() method
# instead of the module-level urlopen().
url = "http://www.baidu.com/"
# 1. create the HTTPHandler object
handler = urllib.request.HTTPHandler()
# 2. build a custom opener from it
opener = urllib.request.build_opener(handler)
# 3. send the request through the opener
request = urllib.request.Request(url)
response = opener.open(request)
print(response.read().decode("utf-8"))
- ProxyHandler(代理IP) : 普通代理
import urllib.request

# ProxyHandler demo: plain (unauthenticated) proxy via a custom opener.
url = "http://www.baidu.com/"
proxy = {"HTTP": "120.78.196.33:3128"}
# 1. create the handler with the proxy mapping
proxy_handler = urllib.request.ProxyHandler(proxy)
# 2. build a custom opener from it
opener = urllib.request.build_opener(proxy_handler)
# 3. send the request through the opener
request = urllib.request.Request(url)
response = opener.open(request)
print(response.read().decode("utf-8"))
- ProxyBasicAuthHandler(密碼管理器對象) : 私密代理