反爬蟲操作1

例:抓取豆瓣電影評論

# 基礎代碼
from lxml import etree
import requests

def _first(nodes, default=""):
    """Return the first XPath match, or *default* when nothing matched."""
    return nodes[0] if nodes else default


# Fetch the movie comment page. A timeout is passed explicitly because
# requests otherwise waits indefinitely on an unresponsive server.
r = requests.get("http://movie.douban.com/", timeout=10)
s = etree.HTML(r.text)

# Select every comment node on the page.
comments = s.xpath("//div[@class='comment']")
for comment in comments:
    # Each field falls back to "" when its node is absent, so a single
    # incomplete comment no longer aborts the whole loop with IndexError.
    # User name of the current comment.
    username = _first(comment.xpath("./h3/span[2]/a/text()"))
    # Text of the current comment.
    content = _first(comment.xpath("./p/text()"))
    # Star rating, read from the class attribute of the rating <span>.
    stars = _first(comment.xpath("./h3/span[2]/span[2]/@class"))
    # Publication time, read from the title attribute.
    comment_time = _first(comment.xpath("./h3/span[2]/span[3]/@title"))
    print("%s %s %s: \n%s" % (username, stars, comment_time, content))
 反爬蟲操作:

登錄豆瓣賬號,獲取網站 cookie,複製瀏覽器的 header 信息,並將 header 信息貼到 Postman,以便自動生成可用代碼。複製生成後的代碼,並將文件修改如下:

# lxml exposes the parser module as ``etree``; the code below calls
# etree.HTML, so the import must use that name.
from lxml import etree
import requests


def _first(nodes, default=""):
    """Return the first XPath match, or *default* when nothing matched."""
    return nodes[0] if nodes else default


# Request headers copied from a logged-in browser session so the request
# looks like ordinary browser traffic.
# NOTE(review): the Cookie value is a captured login session; it is
# presumably account-specific and will expire -- replace it with your own
# and avoid committing real session cookies.
headers = {
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    # "br" is intentionally not advertised: requests only decodes Brotli
    # when an optional Brotli package is installed, and an undecoded body
    # would make r.text unreadable.
    'Accept-Encoding': "gzip, deflate",
    'Accept-Language': "zh-CN,zh;q=0.8",
    'Connection': "keep-alive",
    'Cookie': "ll=\"108288\"; bid=0tRrpebS00A; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1517215204%2C%22https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D1%26tn%3Dbaidu%26wd%3Ddouban%26rsv_pq%3D820efaeb000212e5%26rsv_t%3D5e10T10IN7m%252FhExSLYKlG7FzVzlifyXlCVcMJ7JYdiV07Du67xniyhzxdA4%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_sug3%3D7%26rsv_sug1%3D6%26rsv_sug7%3D100%26rsv_sug2%3D0%26inputT%3D1385%26rsv_sug4%3D13002%22%5D; __yadk_uid=iDvXJyG4hcWMCIyBOFEFG3TcQ1M3Ltl4; _vwo_uuid_v2=278263CB26D65EE4B8F07AB811643B1F|a9d23a84e2d88b5e098ae55b9af343fe; __utmt_t1=1; dbcl2=\"173260058:bJn0N7OcoSQ\"; ck=y6hk; ps=y; _pk_id.100001.4cf6=6d939baa385028b2.1517215204.1.1517216374.1517215204.; _pk_ses.100001.4cf6=*; __utma=223695111.1564140964.1517215200.1517215200.1517215200.1; __utmb=223695111.0.10.1517215200; __utmc=223695111; __utmz=223695111.1517215200.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=douban; __utma=30149280.310849476.1517215200.1517215200.1517215201.2; __utmb=30149280.21.8.1517216374354; __utmc=30149280; __utmz=30149280.1517215201.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=douban; push_noty_num=0; push_doumail_num=0; RT=s=1517216403280&r=https%3A%2F%2Fmovie.douban.com%2Fsubject%2F26752852%2F%3Ftag%3D%25E7%2583%25AD%25E9%2597%25A8%26from%3Dgaia",
    'Host': "movie.douban.com",
    'Referer': "https://movie.douban.com/subject/26752852/?tag=%E7%83%AD%E9%97%A8&from=gaia",
    'Upgrade-Insecure-Requests': "1",
    'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36",
    'Cache-Control': "no-cache",
}

# Fetch the movie comment page with the browser headers attached. A timeout
# is passed explicitly because requests otherwise waits indefinitely on an
# unresponsive server.
r = requests.get("https://movie.douban.com/", headers=headers, timeout=10)
s = etree.HTML(r.text)

# Select every comment node on the page.
comments = s.xpath("//div[@class='comment']")
for comment in comments:
    # Each field falls back to "" when its node is absent, so a single
    # incomplete comment no longer aborts the whole loop with IndexError.
    # User name of the current comment.
    username = _first(comment.xpath("./h3/span[2]/a/text()"))
    # Text of the current comment.
    content = _first(comment.xpath("./p/text()"))
    # Star rating, read from the class attribute of the rating <span>.
    stars = _first(comment.xpath("./h3/span[2]/span[2]/@class"))
    # Publication time, read from the title attribute.
    comment_time = _first(comment.xpath("./h3/span[2]/span[3]/@title"))
    print("%s %s %s: \n%s" % (username, stars, comment_time, content))




發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章