網絡爬蟲之前奏總結

1.首先在電腦安裝requests庫:命令:pip install requests

2.爬取網頁內容:

一:爬取頁面內容(文字信息)

代碼:

添加headers是爲了對應網站的反爬技術

# Fetch a JD product page and print its HTML.
# Import the requests library
import requests

# Product page URL
url = "https://item.jd.com/100012014970.html#crumb-wrap"
# Request headers (user-agent + cookie) so the request looks like a normal
# browser session and gets past basic anti-crawler checks.
# Bug fix: the original wrote `kv:{...}`, which is a SyntaxError at module
# level; building a dict requires assignment: `kv = {...}`.
kv = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'cookie': '__jdc=122270672; __jdu=1810527096; shshshfpa=5c31968b-41e5-cba8-dba2-b39569a132c9-1587205677; shshshfpb=um3kANbAFjMv5MDiflvoSBQ%3D%3D; 3AB9D23F7A4B3C9B=D35J7U2PGKXJ2GUPPEPRNBKPJWDZYS34NGT3TOIN5D7WXWYROAMHFI2GO6BGTEFJHQO6BPSSQX7BTCQ35PFLX64BUY; ipLoc-djd=6-379-388-0; areaId=6; unpl=V2_ZzNtbUZSSkAmCUcAKBAIVWJXQV8SUBdGcA1AVHpLDgFhBhJaclRCFnQUR1FnGVQUZwMZX0ZcRhxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zRAVHHCFbE1MvSwhVYgIbXhFSQhJwC0IAeB9dVmdQG1RyZ0AVRQhHZHsQXARjABRcSlBzJXI4dmRzHFsCbwIiXHJWc1chVENWeRtVDSoDG11DU0ATdABBZHopXw%3d%3d; CCC_SE=ADC_LUhr3Gpk6dUhMBXmlMT059uWbSTppMnzzoFgllKO8miSEYo7BL5I4hnaqbGjEwHhz%2fEo2lKBNZn%2bFp1uc7wVILPE3i0IBmKm7XzZu64l3UGtE9jpMDLztOl%2bym64STPHquyhmll8ZfPQYMF5PM4ph9GwtNqIJqatj%2buOwJ7mfkBmVJpEP3V%2flqMq6steWkziR37gkJtvctgWHjK0smHnG0VhGl9O2m2NEQYfuS4Z%2bw%2fpgL7cCn6y0RabFkAsHvfSQ1uMiv4xuOL4ckpSSSbRkXbMnXPbkr4T%2bztPJyOvGYK1x%2btteEww8i9sDBi8To39bdsFKYuW8d9Lz6OvRWFufLhgpOB%2fRIKsjAVoKtvUJmXJw%2bbK10tMyNqGrFMrmvdEnShschC55e9Cc41XCbPDXW99dppmSPUjBD%2fz5mfBozIL9g5fGoQpQKh1M65v2oPA0AEQyga8rOWk9RSmdg2wjKtr7Z9oIygTnNITE9wrxC08%2fmkEBr%2bNVnyzN05zbAifUbhhiyYIMLnw4dgIzeJFili56D0LmAL%2fXHUWFldGdRcEpxOl1kLestC8SpXbAqRE%2b5RCCr4E7%2fhthS4LuInQtA%3d%3d; __jda=122270672.1810527096.1583840132.1587207769.1587267350.3; __jdv=122270672|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|7c58ebd6ecea4082b406425e270b1b88_0_548db00eb8ea4eb3a6eb44710cc57416|1587267349684; shshshfp=0ca00c61b8d65ee5c54c217cb3fe41ca; shshshsID=4c284260dcd8d8512daca7f2d018bbba_2_1587267356019; __jdb=122270672.2.1810527096|3.1587267350'
}
try:
    # GET returns a Response object; a timeout keeps the script from hanging
    r = requests.get(url, headers=kv, timeout=10)
    # Raise an HTTPError if the status code is not in the 2xx range
    r.raise_for_status()
    # Decode using the encoding sniffed from the body so text is not garbled
    r.encoding = r.apparent_encoding
    # Print the page content
    print(r.text)
# Catch only requests-level failures instead of a bare `except:` that
# would also swallow KeyboardInterrupt and real programming errors.
except requests.RequestException:
    print("爬取失敗!")

實驗運行結果:

需要材料:

二:網絡爬蟲的限制:

1.來源審查:服務器檢查HTTP請求頭中的User-Agent字段,判斷請求是否來自瀏覽器

2.發佈公告:發佈robots協議

三:if __name__ == "__main__"   的理解:

# const.py
# Module-level constant.
PI = 3.14

def main(PI):
    """Print the value passed in as PI."""
    print("PI:", PI)

# Bug fix: `main` requires one argument; the original `main()` raised
# TypeError. Note this call runs whenever the module is imported, which
# is exactly what an `if __name__ == "__main__"` guard would prevent.
main(PI)
# area.py — demonstrates how the `if __name__ == "__main__"` guard works.
# Importing const still executes const.py's top-level statements; only the
# name PI is bound here.
from const import PI

def area(r):
    """Return the area of a circle with radius r."""
    return PI * r * r

def main():
    """Entry point: print the area of a circle of radius 2."""
    print("area:", area(2))

# An unguarded `main()` here would also run when this module is imported.
# `__name__` equals "__main__" only when the file is executed directly,
# so the call below is skipped on import and runs only as a script.
if __name__ == "__main__":
    main()

四:百度搜索的實現:

# Perform a Baidu search via the `wd` query-string parameter and print the
# resulting HTML page.
import requests

# Search term
keyword = "python"
url = "http://www.baidu.com/s"
# Query-string parameters: produces ?wd=<keyword>
kv = {'wd': keyword}
# Browser-like headers so the request is not rejected as a bot
hd = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',    
    'Cookie': 'BAIDUID=CDEA8223FE89F3DB111133A716519675:FG=1; BIDUPSID=CDEA8223FE89F3DB111133A716519675; PSTM=1569998960; BD_UPN=12314753; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDUSS=hqYjVyWnI3RGR0dUtLS3FVZW9PWTBrLTh5N09MYjV4WDZLaGZLaTBYfi1JYmxlRVFBQUFBJCQAAAAAAAAAAAEAAACsI7jL1tCxsdGns6Q2NgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP6UkV7-lJFeal; BDSFRCVID=oPusJeCCxG3HgeTu9rP95KxRKW2mEWAH1xJt3J; H_BDCLCKID_SF=tJkO_DKKJK-3fP36q4Qh5-4ObqbWetJya4o2WDvu-xJcOR5Jj65WWMIB5l5tqx0fMGTfbI5y5lvYEqrv3MA--t4fXPnT0hLtJ26iKtTcBP5msq0x0-nYe-bQypoaLUnyaDOMahvX5l7xO-5sQlPK5JkgMx6MqpQJQeQ-5KQN3KJmfbL9bT3tjjTXjHuet5kfJJ3fL-08MJnEJ45v-4rHhnI_Mq7054CXKKOLVh3Ktp7keq8CDxbjjPuy0q_80nKqQ6vT_4nFWq5RHnc2y5jHhpkXQnni0TbHXa6t0RrzyJQpsIJMhPDWbT8ULf5q-MjzaKvia-THBMb1fqRDBT5h2M4qMxtOLR3pWDTm_q5TtUJMeCnTD-Dhe6jWjG_8JT0Df5rBsDIyMR7hJbIk-PnVeUo-MPnZKxtqtJvtXtjtQPKBjRul3JJmjJkm5-RbLt6nWncKWhD-J4JH_Itxbf6T5Mv-hbQ405OTX5-O0KJcbRoJ8RvbhPJvynF8XnO7-x7lXbrtXp7_2J0WStbKy4oTjxL1Db3JKjvMtIFtVD85tCKMhKPr-PbjqJvHMx8X5-RLfKOTL4OF5lOTJh0Ry-Jq3bQ-KR08txnOLe3UoJ6NQhj1htn_55bke4tX-NFDJ5tHtU5; delPer=0; BD_CK_SAM=1; PSINO=1; COOKIE_SESSION=797_0_8_7_20_4_0_0_7_4_0_2_0_0_0_0_1587183915_0_1587215105%7C9%234367_100_1587091058%7C9; H_PS_PSSID=1466_31169_21114_31342_30903_30823_31085_31164; BD_HOME=1'
    }

try:
    r = requests.get(url, headers=hd, params=kv, timeout=10)
    # Bug fix: the original wrote `r.raise_for_status` without parentheses,
    # which only referenced the method and never called it, so HTTP errors
    # were silently ignored. Call it, and do so before decoding the body.
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text)
# Narrowed from a bare `except:` so real bugs are not hidden
except requests.RequestException:
    print("爬取失敗")

五:圖片下載的實現:

# Download an image to a local directory, skipping the download when the
# file already exists on disk.
import requests
import os

# URL of the image to fetch
url = "https://images-cn.ssl-images-amazon.com/images/I/81M5fmmHKbL._AC_SL1500_.jpg"
# Directory the image is saved into
root = "E://移動後的桌面//爬蟲//image//"
# Local path = directory + file name taken from the last URL segment
path = root + url.split('/')[-1]
try:
    # Create the target directory if it does not exist.
    # Bug fix: os.mkdir fails when intermediate directories are missing;
    # os.makedirs creates the whole chain.
    if not os.path.exists(root):
        os.makedirs(root)
    # Only download when the file is not already present
    if not os.path.exists(path):
        r = requests.get(url, timeout=30)
        # Fail early instead of saving an error page as the "image"
        r.raise_for_status()
        # 'wb': open for writing in binary mode.
        # Bug fix: the original had a bare (non-comment) text line inside
        # this `with` body, which is a SyntaxError; it is now a comment.
        with open(path, 'wb') as f:
            # Write the raw image bytes; `with` closes the file itself,
            # so the original's redundant f.close() was removed.
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已經存在")
# Narrowed from a bare `except:`; OSError covers directory/file failures
except (requests.RequestException, OSError):
    print("爬取失敗")

六:IP地址的查詢:

方法一:

# Query geolocation info for an IP address by appending it to the URL.
import requests

url = "https://ip38.com/ip.php?ip="
try:
    # String concatenation builds ...ip.php?ip=202.204.80.112
    r = requests.get(url + "202.204.80.112", timeout=10)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text)
# Bug fix: narrowed the bare `except:` (which even caught KeyboardInterrupt
# and typos) to requests' own exception hierarchy.
except requests.RequestException:
    print("爬取失敗")

方法二:

# Query geolocation info for an IP address, passing it via `params` so
# requests builds (and URL-encodes) the query string itself.
import requests

url = "https://ip38.com/ip.php?"
# requests appends this as ?ip=202.204.80.112
kv = {'ip': "202.204.80.112"}
try:
    r = requests.get(url, params=kv, timeout=10)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text)
# Bug fix: narrowed the bare `except:` (which even caught KeyboardInterrupt
# and typos) to requests' own exception hierarchy.
except requests.RequestException:
    print("爬取失敗")

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章