爬蟲
程序
目標url
內容提取
表現形式
爬蟲 why
大數據
Google
垂直行業搜索
Qunar
房子
車子
比價網
爬蟲 how
爬蟲 lib
pip install beautifulsoup4(將 HTML 解析成樹狀結構)
pip install requests
pip install selenium
腳本
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Print the text and target of selected links on one Baidu Baike page.

The page at PAGE_URL is downloaded and parsed with BeautifulSoup. Every
``<a>`` element whose ``href`` matches HREF_PATTERN and which does not
contain an ``<img>`` is printed as ``<link text> <href>``.
"""
__author__ = 'teng'

import re
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

from bs4 import BeautifulSoup

PAGE_URL = 'http://baike.baidu.com/view/284853.htm'

# Raw string so the ``\.`` and ``\w`` escapes reach the regex engine
# unchanged instead of being treated as (invalid) string escapes.
HREF_PATTERN = re.compile(r"tousu\.baidu\.com\w?")

# BeautifulSoup search API cheat sheet:
#   find_all(tag, attributes, recursive, text, limit, keywords)
#   find(tag, attributes, recursive, text, keywords)
# recursive=False searches direct children only; otherwise the whole
# subtree is searched (the default is True).
# Examples:
#   find_all("a")
#   find_all("a", href="")
#   find_all("div", class_="")
#   find_all("button", id="")


def main():
    """Fetch PAGE_URL and print ``text href`` for each matching link."""
    # closing() also works on Python 2, where the object returned by
    # urlopen() is not a context manager; the document is parsed while
    # the response is still open.
    with closing(urlopen(PAGE_URL)) as response:
        soup = BeautifulSoup(response, "html.parser")

    for link in soup.find_all("a", href=HREF_PATTERN):
        if link.find("img"):
            continue  # links that wrap an image are not listed
        href = link.attrs.get('href')
        if href:
            print(u"%s %s" % (link.text, href))


if __name__ == "__main__":
    main()
循環訪問url
# -*- coding: utf-8 -*-
# CopyRight by heibanke
"""Follow a chain of pages by appending a number found on each page.

Starting from BASE_URL, each page's first ``<h3>`` element is searched for
runs of digits. The first run is printed, appended to BASE_URL and fetched
next. The loop stops when no digits are found (or there is no ``<h3>``),
or once more than MAX_HOPS numbers have been followed; the text of the
last fetched page is then printed.
"""
import re
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

from bs4 import BeautifulSoup

BASE_URL = 'http://www.heibanke.com/lesson/crawler_ex00/'
MAX_HOPS = 100  # safety cap so a cyclic chain cannot loop forever


def main():
    """Walk the page chain and print each number, then the final page."""
    suffix = ''  # the first request goes to BASE_URL itself
    hops = 0
    while True:
        # closing() also works on Python 2, where the urlopen() result
        # is not a context manager.
        with closing(urlopen(BASE_URL + suffix)) as response:
            soup = BeautifulSoup(response, "html.parser")

        heading = soup.find("h3")
        # find() returns None when the page has no <h3>; treat that the
        # same as "no number found" instead of raising AttributeError.
        if heading is None:
            numbers = []
        else:
            numbers = re.findall(r'\d+', heading.get_text())

        if not numbers or hops > MAX_HOPS:
            break

        suffix = numbers[0]
        print(suffix)
        hops += 1

    print(soup.text)


if __name__ == "__main__":
    main()
post數據,表單提交
requests
支持各種request類型
HTTP request types: GET,POST,PUT,DELETE,HEAD and OPTIONS
支持各種POST,如上傳文件
支持自定義header
支持json數據解析
支持訪問Cookies
支持重定向地址
支持設置timeout
#!/usr/bin/env python
# coding: utf-8
# copyRight by heibanke
"""Try the passwords 0..MAX_PASSWORD against a login form using requests.

Each candidate is POSTed together with a fixed username. While the
response body contains WRONG_PASSWORD_MARKER the attempt is reported as
wrong and the next number is tried; the first response without the marker
is printed in full and the search stops.
"""
import requests

URL = "http://www.heibanke.com/lesson/crawler_ex01/"
# NOTE(review): this must match the server's error text verbatim - confirm
# against a real response.
WRONG_PASSWORD_MARKER = u"輸入的密碼錯誤"
MAX_PASSWORD = 30
TIMEOUT_SECONDS = 10  # requests waits indefinitely unless a timeout is set


def main():
    """Post candidate passwords until one is not rejected."""
    for number in range(MAX_PASSWORD + 1):
        params = {'username': 'heibanke', 'password': str(number)}
        r = requests.post(URL, data=params, timeout=TIMEOUT_SECONDS)
        # ``in`` rather than ``find(...) > 0``: find() returns 0 for a
        # match at the very start of the text, which ``> 0`` would miss.
        if WRONG_PASSWORD_MARKER in r.text:
            print(u"%s %d %s" % (u"輸入的密碼", number, u"錯誤"))
        else:
            print(r.text)
            break


if __name__ == "__main__":
    main()
Selenium:解決跨站請求偽造(CSRF)防護及 JS 動態頁面的問題
模擬用戶瀏覽器操作,Selenium IDE可錄製測試動作
Function Test,自動測試
支持多種語言,python,java,ruby,C#,php
Webdriver支持多種瀏覽器,最方便是Firefox
pip install -U selenium 更新
圖片下載
urllib.urlretrieve
selenium的實例
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Try the passwords 0..29 against a login-protected form using Selenium.

A Firefox session first signs in on LOGIN_URL, then repeatedly opens
FORM_URL, fills in a fixed username plus the candidate password and
submits. While the resulting page text contains WRONG_PASSWORD_MARKER the
attempt is reported as wrong; the first page without the marker is printed
and the search stops.
"""
__author__ = 'teng'

import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

LOGIN_URL = "http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex02/"
FORM_URL = "http://www.heibanke.com/lesson/crawler_ex02/"
# NOTE(review): this must match the server's error text verbatim - confirm
# against a real response.
WRONG_PASSWORD_MARKER = u"輸入的密碼錯誤"
PASSWORD_COUNT = 30


def login(driver):
    """Open the login page and submit the site account credentials."""
    driver.get(LOGIN_URL)
    time.sleep(3)  # fixed wait for the page to load
    # find_element(By..., value) replaces the find_element_by_* helpers,
    # which newer Selenium releases no longer provide.
    driver.find_element(By.ID, "id_username").send_keys("test")
    driver.find_element(By.ID, "id_password").send_keys("test123")
    driver.find_element(By.ID, "id_submit").click()
    time.sleep(1)


def try_password(driver, number):
    """Submit one candidate password and return the resulting page text."""
    driver.get(FORM_URL)
    time.sleep(2)  # fixed wait for the form to load
    driver.find_element(By.NAME, "username").send_keys("flysmoke")
    driver.find_element(By.NAME, "password").send_keys(str(number))
    driver.find_element(By.ID, "id_submit").click()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    return BeautifulSoup(driver.page_source, "html.parser").text


def main():
    """Log in, then try candidate passwords until one is not rejected."""
    driver = webdriver.Firefox()
    try:
        login(driver)
        for number in range(PASSWORD_COUNT):
            page_text = try_password(driver, number)
            # ``in`` rather than ``find(...) > 0``: find() returns 0 for
            # a match at the very start of the text.
            if WRONG_PASSWORD_MARKER in page_text:
                print(u"%s %d %s" % (u"輸入的密碼", number, u"錯誤"))
            else:
                print(page_text)
                break
        # NOTE(review): the original formatting was lost, so it is unclear
        # whether this pause sat inside the loop or before closing the
        # browser; it is placed after the loop - confirm.
        time.sleep(2)
    finally:
        # Always end the browser session, even when a step above raises.
        driver.quit()


if __name__ == "__main__":
    main()
requests.Session 的實例
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Try the passwords 0..29 against a CSRF-protected form using requests.

A single requests.Session is used so that cookies persist between calls.
The session first signs in on ``url_login`` and then posts each candidate
password to ``url_form``. While the response contains
WRONG_PASSWORD_MARKER the attempt is reported as not correct; the first
response without the marker is printed and the search stops.
"""
__author__ = 'teng'

import requests

url_login = "http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex02/"
url_form = "http://www.heibanke.com/lesson/crawler_ex02/"
# NOTE(review): this must match the server's error text verbatim - confirm
# against a real response.
WRONG_PASSWORD_MARKER = u"輸入的密碼錯誤"
PASSWORD_COUNT = 30


def post_data_django(s, url, data, timeout=10):
    """POST ``data`` to ``url`` together with the session's CSRF token.

    The URL is fetched with GET first; afterwards the session's
    ``csrftoken`` cookie is read and sent back as the
    ``csrfmiddlewaretoken`` form field alongside ``data``.

    Args:
        s: the requests.Session to use.
        url: address used for both the GET and the POST.
        data: dict of form fields to submit.
        timeout: seconds to wait for each request.

    Returns:
        A ``(response, session)`` tuple; the session is the same object
        that was passed in.
    """
    s.get(url, timeout=timeout)
    params = {'csrfmiddlewaretoken': s.cookies.get('csrftoken')}
    params.update(data)
    r = s.post(url, data=params, timeout=timeout)
    return r, s


def main():
    """Log in, then try candidate passwords until one is not rejected."""
    # The context manager closes the session's connections on exit.
    with requests.Session() as session:
        login_response, _ = post_data_django(
            session, url_login, {'username': 'test', 'password': 'test123'})
        print("%s %s" % ('login ', login_response.status_code))

        # find the password
        for number in range(PASSWORD_COUNT):
            response, _ = post_data_django(
                session, url_form,
                {'username': 'heibanke', 'password': str(number)})
            # ``in`` rather than ``find(...) > 0``: find() returns 0 for
            # a match at the very start of the text.
            if WRONG_PASSWORD_MARKER in response.text:
                print("%d %s" % (number, "not correct"))
            else:
                print(response.text)
                break


if __name__ == "__main__":
    main()