Python Crawler, Day 03

Table of contents:

Day 03 notes

The requests module

Common methods

  1. get() : send a request and get the response object
    1. The response object (a short attribute sketch follows the example below)

      1. response.text : string
        default character encoding : ISO-8859-1
        response.encoding = "utf-8"
      2. response.content : bytes
      3. response.status_code : the HTTP response status code
    2. Without query parameters
      res = requests.get(url,headers=headers)

    3. With query parameters
      params = {"wd":"美女"}
      res = requests.get(url,params=params,headers=headers)

    4. for example:

      import requests
      
      url = "http://www.baidu.com/s?"
      headers = {"User-Agent":"Mozilla5.0/"}
      s = input("請輸入要搜索的內容:")
      # The params argument of get() must be a dict; requests URL-encodes it automatically
      wd = {"wd":s}
      res = requests.get(url,params=wd,headers=headers)
      res.encoding = "utf-8"
      print(res.text)
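
    5. The response attributes from point 1 can be checked with a short sketch like the one below (a minimal demo; www.baidu.com is only used as a reachable test URL):

      import requests
      
      url = "http://www.baidu.com/"
      headers = {"User-Agent":"Mozilla5.0/"}
      
      res = requests.get(url,headers=headers)
      print(res.status_code)      # HTTP status code, e.g. 200
      res.encoding = "utf-8"      # set the text encoding explicitly (see the ISO-8859-1 note above)
      print(type(res.text))       # <class 'str'>
      print(type(res.content))    # <class 'bytes'>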
      
      
  2. post() : the request body goes in the data parameter
    1. data={}  # the data argument is a dict; no need to convert it to bytes

    2. for example:

      import requests
      import json
      
      # Handle the form data
      # The form fields go into a dict, which requests encodes automatically
      word = input("請輸入要翻譯的內容:")
      data = {"i":word,
              "from":"AUTO",
              "to":"AUTO",
              "smartresult":"dict",
              "client":"fanyideskweb",
              "salt":"1536648321283",
              "sign":"1e7948e25551448dbfb7184f23dc126c",
              "doctype":"json",
              "version":"2.1",
              "keyfrom":"fanyi.web",
              "action":"FY_BY_REALTIME",
              "typoResult":"false"
          }
      
      url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
      headers = {"User-Agent":"Mozilla5.0/"}
      res = requests.post(url,data=data,headers=headers)
      res.encoding = "utf-8"
      result = res.text
      
      # Convert the JSON-formatted string into a Python dict
      # json.loads : JSON string -> Python dict
      result_dict = json.loads(result)
      r = result_dict["translateResult"][0][0]["tgt"]
      print(r)
      
      

Proxies : proxies

  1. The second step in the fight between crawlers and anti-crawler measures
    Websites that provide proxy IPs:
  2. 西刺代理
  3. 快代理
  4. 全網代理
  5. Regular proxy : proxies={"protocol":"IP:port"}
    (use lowercase scheme keys such as "http"/"https"; requests matches them against the request URL's scheme)
    • proxies={"http":"222.221.11.119:3128"}

    • for example:

      import requests
      
      url = "http://www.taobao.com/"
      proxies = {"HTTP":"222.221.11.119:3128"}
      headers = {"User-Agent":"Mozilla5.0/"}
      
      res = requests.get(url,proxies=proxies,headers=headers)
      res.encoding = "utf-8"
      print(res.text)
      
      
  6. Private (authenticated) proxy :
    • proxies={"http":"http://309435365:[email protected]:16819"}

    • for example :

      import requests
      
      url = "http://www.taobao.com/"
      headers = {"User-Agent":"Mozilla5.0/"}
      proxies={"HTTP":"http://309435365:[email protected]:16819"}
      
      #114.67.228.126:16819
      res = requests.get(url,proxies=proxies,headers=headers)
      res.encoding = "utf-8"
      print(res.status_code)
      
      
    • Scraping Lianjia listings into MongoDB

      import requests
      import re
      import pymongo
      
      class LianJiaSpider:
          def __init__(self):
              self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
              self.headers = {"User-Agent":"Mozilla5.0/"}
              self.proxies = {"HTTP":"http://309435365:[email protected]:16819"}
              self.page = 1
              # Create the MongoDB connection object
              self.conn = pymongo.MongoClient("localhost",27017)
              # Create the database object
              self.db = self.conn.Lianjia    # database name
              # Create the collection object
              self.myset = self.db.housePrice    # collection name
      
          # Fetch the page
          def getPage(self,url):
              res = requests.get(url,proxies=self.proxies,headers=self.headers)
              res.encoding = "utf-8"
              html = res.text
              self.parsePage(html)
          
          # Parse the page with a regular expression
          def parsePage(self,html):
              p = re.compile('<div class="houseInfo">.*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>',re.S)
              r_list = p.findall(html)
              # [("首科花園","595"),(),()]
              self.writeToMogo(r_list)
              
          # Save the records to MongoDB
          def writeToMogo(self,r_list):
              for r_tuple in r_list:
                  d = {"houseName":r_tuple[0].strip(),
                       "housePrice":float(r_tuple[1].strip())*10000}
                  self.myset.insert_one(d)    # insert_one(); the old insert() was removed in newer pymongo
              print("存入mongodb數據庫成功")
              # mongo
              # show dbs;
              # use Lianjia;
              # show tables;
              # db.housePrice.find().pretty();
      
      
      
          # Main loop
          def workOn(self):
              while True:
                  print("正在爬取%d頁" % self.page)
                  # Build the URL for this page
                  url = self.baseurl + str(self.page) + "/"
                  self.getPage(url)
                  print("第%d頁爬取成功" % self.page)
                  
                  c = input("是否繼續爬取(y/n):")
                  if c.strip().lower() == "y":
                      self.page += 1
                  else:
                      print("爬取結束,謝謝使用!")
                      break
                  
      if __name__ == "__main__":
          spider = LianJiaSpider()
          spider.workOn()
      
      
    • Scraping Lianjia listings into MySQL

      import requests
      import re
      import pymysql
      import warnings
      
      class LianJiaSpider:
          def __init__(self):
              self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
              self.headers = {"User-Agent":"Mozilla5.0/"}
              self.proxies = {"HTTP":"http://309435365:[email protected]:16819"}
              self.page = 1
              # Create the database connection object (keyword arguments; positional args were dropped in newer pymysql)
              self.db = pymysql.connect(host="localhost",user="root",
                                        password="123456",charset="utf8")
              # Create the cursor object
              self.cursor = self.db.cursor()
          
          # Fetch the page
          def getPage(self,url):
              res = requests.get(url,proxies=self.proxies,headers=self.headers)
              res.encoding = "utf-8"
              html = res.text
              print("頁面已獲取,正在解析頁面...")
              self.parsePage(html)
          
          # Parse the page with a regular expression
          def parsePage(self,html):
              p = re.compile('<div class="houseInfo">.*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>',re.S)
              r_list = p.findall(html)
              # [("首科花園","595"),(),()]
              print("正在存入mysql數據庫...")
              self.writeToMysql(r_list)
              
          # Save the records to MySQL
          def writeToMysql(self,r_list):
              c_db = "create database if not exists spider;"
              u_db = "use spider;"
              c_tab = "create table if not exists lianjia(\
                       id int primary key auto_increment,\
                       name varchar(30),\
                       price decimal(20,2))charset=utf8;"
              # Turn MySQL warnings into exceptions so they can be caught and ignored below
              warnings.filterwarnings("error")
              try:
                  self.cursor.execute(c_db)
              except Warning:
                  pass
      
              self.cursor.execute(u_db)
      
              try:
                  self.cursor.execute(c_tab)
              except Warning:
                  pass
              
              # r_list : [("首科花園","595"),(),()]
              for r_tuple in r_list:
                  # Parameterized query: let pymysql escape the values
                  s_insert = "insert into lianjia(name,price) values(%s,%s);"
                  self.cursor.execute(s_insert,
                                      (r_tuple[0].strip(),
                                       float(r_tuple[1].strip())*10000))
                  self.db.commit()
              print("第%d頁存入數據庫成功" % self.page)
      
          # Main loop
          def workOn(self):
              while True:
                  print("正在爬取%d頁" % self.page)
                  # Build the URL for this page
                  url = self.baseurl + str(self.page) + "/"
                  self.getPage(url)
                  print("第%d頁爬取成功" % self.page)
                  
                  c = input("是否繼續爬取(y/n):")
                  if c.strip().lower() == "y":
                      self.page += 1
                  else:
                      print("爬取結束,謝謝使用!")
                      break
                  
      if __name__ == "__main__":
          spider = LianJiaSpider()
          spider.workOn()
      
      
    • Saving Lianjia listings to a local file

      import requests
      import re
      
      class LianJiaSpider:
          def __init__(self):
              self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
              self.headers = {"User-Agent":"Mozilla5.0/"}
              self.proxies = {"HTTP":"http://309435365:[email protected]:16819"}
              self.page = 1
          
          # Fetch the page
          def getPage(self,url):
              res = requests.get(url,proxies=self.proxies,headers=self.headers)
              res.encoding = "utf-8"
              html = res.text
              self.parsePage(html)
          
          # Parse the page with a regular expression
          def parsePage(self,html):
              p = re.compile('<div class="houseInfo">.*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>',re.S)
              r_list = p.findall(html)
              # [("首科花園","595"),(),()]
              self.writePage(r_list)
              
          # Save the listings to a local text file
          def writePage(self,r_list):
              for r_tuple in r_list:    # r_tuple is e.g. ("首科花園","595")
                  for r_str in r_tuple:
                      with open("鏈家二手房.txt","a",encoding="utf-8") as f:
                          f.write(r_str.strip() + "  ")
                  
                  with open("鏈家二手房.txt","a",encoding="utf-8") as f:
                      f.write("\n")
          # Main loop
          def workOn(self):
              while True:
                  print("正在爬取%d頁" % self.page)
                  # Build the URL for this page
                  url = self.baseurl + str(self.page) + "/"
                  self.getPage(url)
                  print("第%d頁爬取成功" % self.page)
                  
                  c = input("是否繼續爬取(y/n):")
                  if c.strip().lower() == "y":
                      self.page += 1
                  else:
                      print("爬取結束,謝謝使用!")
                      break
                  
      if __name__ == "__main__":
          spider = LianJiaSpider()
          spider.workOn()
      

Web client authentication : auth

  1. auth = ("username","password")
    auth = ("lht4815","123456789lht")
    requests.get(url,auth=auth,headers=headers)
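
  2. for example (a minimal sketch; the URL and the credentials below are placeholders, not working values):

    import requests
    
    url = "http://example.com/protected/"
    headers = {"User-Agent":"Mozilla5.0/"}
    # placeholder basic-auth credentials; replace with real ones
    auth = ("username","password")
    
    res = requests.get(url,auth=auth,headers=headers)
    res.encoding = "utf-8"
    print(res.status_code)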

SSL certificate verification : verify

  1. verify=True : the default; the SSL certificate is verified

  2. verify=False : skip certificate verification

  3. for example:

    import requests
    
    url = "https://www.12306.cn/mormhweb/"
    headers = {"User-Agent":"Mozilla5.0/"}
    
    res = requests.get(url,verify=False,headers=headers)
    res.encoding = "utf-8"
    print(res.text)
    

Handler processors (urllib.request)

Definition

Handlers let you build a custom opener that replaces the default one; the built-in urlopen() is itself just a special, pre-configured opener.

Common methods

  1. build_opener(handler object)
  2. opener.open(url)

Usage workflow

  1. Create the appropriate Handler object
    http_handler = urllib.request.HTTPHandler()
  2. Create a custom opener object
    opener = urllib.request.build_opener(http_handler)
  3. Send the request with the opener's open() method

Handler types

  1. HTTPHandler()
    import urllib.request
    
    url = "http://www.baidu.com/"
    # 1. Create the HTTPHandler object
    http_handler = urllib.request.HTTPHandler()
    # 2. Create the custom opener object
    opener = urllib.request.build_opener(http_handler)
    # 3. Send the request with the opener's open() method
    req = urllib.request.Request(url)
    res = opener.open(req)
    print(res.read().decode("utf-8"))
    
  2. ProxyHandler(proxy dict) : regular proxy
    import urllib.request
    
    url = "http://www.baidu.com/"
    proxy = {"HTTP":"120.78.196.33:3128"}
    # 1.創建Handler
    proxy_handler = urllib.request.ProxyHandler(proxy)
    # 2.創建自定義opener
    opener = urllib.request.build_opener(proxy_handler)
    # 3.利用open方法發請求
    req = urllib.request.Request(url)
    res = opener.open(req)
    print(res.read().decode("utf-8"))
    
  3. ProxyBasicAuthHandler(password manager object) : private (authenticated) proxy; a minimal sketch follows below
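    A minimal sketch of this handler (the proxy address, user name and password simply reuse the placeholder values from the requests private-proxy example above):

    import urllib.request
    
    url = "http://www.baidu.com/"
    proxy_address = "114.67.228.126:16819"    # placeholder private proxy
    # 1. Create a password manager and register the proxy credentials on it
    pwd_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    pwd_mgr.add_password(None,proxy_address,"309435365","szayclhp")
    # 2. Create the ProxyBasicAuthHandler (authentication) and a ProxyHandler (routing)
    proxy_auth_handler = urllib.request.ProxyBasicAuthHandler(pwd_mgr)
    proxy_handler = urllib.request.ProxyHandler({"http":"http://" + proxy_address})
    # 3. Build the custom opener and send the request with its open() method
    opener = urllib.request.build_opener(proxy_handler,proxy_auth_handler)
    req = urllib.request.Request(url)
    res = opener.open(req)
    print(res.read().decode("utf-8"))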