Web Crawler Prelude: A Summary

1. First, install the requests library on your computer with the command: pip install requests
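
To confirm the installation worked, you can print the library's version (a quick sanity check; the exact version will vary by machine):

#Check that requests is importable and see which version is installed
import requests
print(requests.__version__)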

2. Scrape web page content:

I. Scraping page content (text information)

Code:

The headers are added to get past the website's anti-crawler checks.

#Scrape a JD product page
#Import the requests library
import requests
#Product URL
url="https://item.jd.com/100012014970.html#crumb-wrap"
#User-Agent and cookie information
kv = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'cookie':'__jdc=122270672; __jdu=1810527096; shshshfpa=5c31968b-41e5-cba8-dba2-b39569a132c9-1587205677; shshshfpb=um3kANbAFjMv5MDiflvoSBQ%3D%3D; 3AB9D23F7A4B3C9B=D35J7U2PGKXJ2GUPPEPRNBKPJWDZYS34NGT3TOIN5D7WXWYROAMHFI2GO6BGTEFJHQO6BPSSQX7BTCQ35PFLX64BUY; ipLoc-djd=6-379-388-0; areaId=6; unpl=V2_ZzNtbUZSSkAmCUcAKBAIVWJXQV8SUBdGcA1AVHpLDgFhBhJaclRCFnQUR1FnGVQUZwMZX0ZcRhxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zRAVHHCFbE1MvSwhVYgIbXhFSQhJwC0IAeB9dVmdQG1RyZ0AVRQhHZHsQXARjABRcSlBzJXI4dmRzHFsCbwIiXHJWc1chVENWeRtVDSoDG11DU0ATdABBZHopXw%3d%3d; CCC_SE=ADC_LUhr3Gpk6dUhMBXmlMT059uWbSTppMnzzoFgllKO8miSEYo7BL5I4hnaqbGjEwHhz%2fEo2lKBNZn%2bFp1uc7wVILPE3i0IBmKm7XzZu64l3UGtE9jpMDLztOl%2bym64STPHquyhmll8ZfPQYMF5PM4ph9GwtNqIJqatj%2buOwJ7mfkBmVJpEP3V%2flqMq6steWkziR37gkJtvctgWHjK0smHnG0VhGl9O2m2NEQYfuS4Z%2bw%2fpgL7cCn6y0RabFkAsHvfSQ1uMiv4xuOL4ckpSSSbRkXbMnXPbkr4T%2bztPJyOvGYK1x%2btteEww8i9sDBi8To39bdsFKYuW8d9Lz6OvRWFufLhgpOB%2fRIKsjAVoKtvUJmXJw%2bbK10tMyNqGrFMrmvdEnShschC55e9Cc41XCbPDXW99dppmSPUjBD%2fz5mfBozIL9g5fGoQpQKh1M65v2oPA0AEQyga8rOWk9RSmdg2wjKtr7Z9oIygTnNITE9wrxC08%2fmkEBr%2bNVnyzN05zbAifUbhhiyYIMLnw4dgIzeJFili56D0LmAL%2fXHUWFldGdRcEpxOl1kLestC8SpXbAqRE%2b5RCCr4E7%2fhthS4LuInQtA%3d%3d; __jda=122270672.1810527096.1583840132.1587207769.1587267350.3; __jdv=122270672|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|7c58ebd6ecea4082b406425e270b1b88_0_548db00eb8ea4eb3a6eb44710cc57416|1587267349684; shshshfp=0ca00c61b8d65ee5c54c217cb3fe41ca; shshshsID=4c284260dcd8d8512daca7f2d018bbba_2_1587267356019; __jdb=122270672.2.1810527096|3.1587267350'
}
#Use try/except to catch exceptions
try:
    #get() builds a Request object and returns a Response object
    r = requests.get(url,headers=kv)
    #Raise an exception if the status code is not 200
    r.raise_for_status()
    #Decode with the encoding inferred from the content so the text is not garbled
    r.encoding = r.apparent_encoding
    #Print the page content
    print(r.text)
except:
    print("Scraping failed!")

Run result: (screenshot in the original post)

Required materials: (screenshot in the original post)

II. Restrictions on web crawlers:

1. Source review: the server inspects the User-Agent field of each request

2. Announcement: the site publishes a robots protocol declaring what may be crawled (see the sketch after this list)
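
A site's robots protocol can be fetched like any other page; by convention robots.txt lives at the site root, so a minimal sketch using JD's assumed address:

#Fetch and print a site's robots protocol
import requests
try:
    r = requests.get("https://www.jd.com/robots.txt")
    r.raise_for_status()
    print(r.text)
except:
    print("Scraping failed")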

III. Understanding if __name__ == "__main__":

#const.py
PI = 3.14
def main():
    print("PI:", PI)
#main()    an unconditional call here would run every time const.py is imported
#With the guard below, importing const.py only brings in PI and does not run main()
if __name__ == "__main__":
    main()

#A second file (e.g. area.py) that imports from const.py
from const import PI
def area(r):
    return PI*r**2
def main():
    print("area:", area(2))
#Understanding if __name__ == "__main__": the block runs when the module is
#executed directly, and does not run when the module is imported
if __name__ == "__main__":
    main()
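
With the guards in place, running python const.py prints only "PI: 3.14", running python area.py prints only "area: 12.56" (3.14 * 2**2), and importing either module prints nothing.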

IV. Implementing a Baidu search:

import requests

keyword="python"
url="http://www.baidu.com/s"
kv={'wd':keyword}
hd={
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',    
    'Cookie': 'BAIDUID=CDEA8223FE89F3DB111133A716519675:FG=1; BIDUPSID=CDEA8223FE89F3DB111133A716519675; PSTM=1569998960; BD_UPN=12314753; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDUSS=hqYjVyWnI3RGR0dUtLS3FVZW9PWTBrLTh5N09MYjV4WDZLaGZLaTBYfi1JYmxlRVFBQUFBJCQAAAAAAAAAAAEAAACsI7jL1tCxsdGns6Q2NgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP6UkV7-lJFeal; BDSFRCVID=oPusJeCCxG3HgeTu9rP95KxRKW2mEWAH1xJt3J; H_BDCLCKID_SF=tJkO_DKKJK-3fP36q4Qh5-4ObqbWetJya4o2WDvu-xJcOR5Jj65WWMIB5l5tqx0fMGTfbI5y5lvYEqrv3MA--t4fXPnT0hLtJ26iKtTcBP5msq0x0-nYe-bQypoaLUnyaDOMahvX5l7xO-5sQlPK5JkgMx6MqpQJQeQ-5KQN3KJmfbL9bT3tjjTXjHuet5kfJJ3fL-08MJnEJ45v-4rHhnI_Mq7054CXKKOLVh3Ktp7keq8CDxbjjPuy0q_80nKqQ6vT_4nFWq5RHnc2y5jHhpkXQnni0TbHXa6t0RrzyJQpsIJMhPDWbT8ULf5q-MjzaKvia-THBMb1fqRDBT5h2M4qMxtOLR3pWDTm_q5TtUJMeCnTD-Dhe6jWjG_8JT0Df5rBsDIyMR7hJbIk-PnVeUo-MPnZKxtqtJvtXtjtQPKBjRul3JJmjJkm5-RbLt6nWncKWhD-J4JH_Itxbf6T5Mv-hbQ405OTX5-O0KJcbRoJ8RvbhPJvynF8XnO7-x7lXbrtXp7_2J0WStbKy4oTjxL1Db3JKjvMtIFtVD85tCKMhKPr-PbjqJvHMx8X5-RLfKOTL4OF5lOTJh0Ry-Jq3bQ-KR08txnOLe3UoJ6NQhj1htn_55bke4tX-NFDJ5tHtU5; delPer=0; BD_CK_SAM=1; PSINO=1; COOKIE_SESSION=797_0_8_7_20_4_0_0_7_4_0_2_0_0_0_0_1587183915_0_1587215105%7C9%234367_100_1587091058%7C9; H_PS_PSSID=1466_31169_21114_31342_30903_30823_31085_31164; BD_HOME=1'
    }

try:
    #params appends wd=python to the URL as a query string
    r = requests.get(url,headers=hd,params=kv)
    #Raise an exception if the status code is not 200
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text)
except:
    print("Scraping failed")

V. Implementing an image download:

import requests
import os
#URL of the image to scrape
url = "https://images-cn.ssl-images-amazon.com/images/I/81M5fmmHKbL._AC_SL1500_.jpg"
#Directory where the image will be stored
root="E://移动后的桌面//爬虫//image//"
#Save path: the directory plus the image's file name taken from the URL
path=root+url.split('/')[-1]
try:
    #Create the root directory if it does not exist
    if not os.path.exists(root):
        os.mkdir(root)
    #Download only if the image has not already been saved
    if not os.path.exists(path):
        #Request the image
        r=requests.get(url)
        #'wb': open the file in binary mode, for writing only
        with open(path,'wb') as f:
            #Write the image's binary content; the with block closes the file automatically
            f.write(r.content)
            print("File saved successfully")
    else:
        print("File already exists")
except:
    print("Scraping failed")

VI. Looking up an IP address:

Method 1:

import requests
#The IP to look up is concatenated directly onto the URL
url="https://ip38.com/ip.php?ip="
try:
    r=requests.get(url+"202.204.80.112")
    r.raise_for_status()
    r.encoding=r.apparent_encoding
    print(r.text)
except:
    print("Scraping failed")

Method 2:

import requests
#Same query, but requests builds the query string from the params dictionary
url="https://ip38.com/ip.php?"
kv={'ip':"202.204.80.112"}
try:
    r=requests.get(url,params=kv)
    r.raise_for_status()
    r.encoding=r.apparent_encoding
    print(r.text)
except:
    print("Scraping failed")
