1.首先在电脑安装requests库:命令:pip install requests
2.爬取网页内容:
一:爬取页面内容(文字信息)
代码:
添加headers是为了对应网站的反爬技术
# Scrape a JD.com product page (text content).
# Custom headers (User-Agent + Cookie) are sent to get past the site's
# anti-scraping checks (user-agent source inspection).
import requests

# Product page URL
url = "https://item.jd.com/100012014970.html#crumb-wrap"

# Request headers: user-agent and cookie.
# BUG FIX: the original wrote `kv:{...}` — a colon instead of `=` —
# which is a SyntaxError; a dict assignment needs `kv = {...}`.
kv = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'cookie': '__jdc=122270672; __jdu=1810527096; shshshfpa=5c31968b-41e5-cba8-dba2-b39569a132c9-1587205677; shshshfpb=um3kANbAFjMv5MDiflvoSBQ%3D%3D; 3AB9D23F7A4B3C9B=D35J7U2PGKXJ2GUPPEPRNBKPJWDZYS34NGT3TOIN5D7WXWYROAMHFI2GO6BGTEFJHQO6BPSSQX7BTCQ35PFLX64BUY; ipLoc-djd=6-379-388-0; areaId=6; unpl=V2_ZzNtbUZSSkAmCUcAKBAIVWJXQV8SUBdGcA1AVHpLDgFhBhJaclRCFnQUR1FnGVQUZwMZX0ZcRhxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zRAVHHCFbE1MvSwhVYgIbXhFSQhJwC0IAeB9dVmdQG1RyZ0AVRQhHZHsQXARjABRcSlBzJXI4dmRzHFsCbwIiXHJWc1chVENWeRtVDSoDG11DU0ATdABBZHopXw%3d%3d; CCC_SE=ADC_LUhr3Gpk6dUhMBXmlMT059uWbSTppMnzzoFgllKO8miSEYo7BL5I4hnaqbGjEwHhz%2fEo2lKBNZn%2bFp1uc7wVILPE3i0IBmKm7XzZu64l3UGtE9jpMDLztOl%2bym64STPHquyhmll8ZfPQYMF5PM4ph9GwtNqIJqatj%2buOwJ7mfkBmVJpEP3V%2flqMq6steWkziR37gkJtvctgWHjK0smHnG0VhGl9O2m2NEQYfuS4Z%2bw%2fpgL7cCn6y0RabFkAsHvfSQ1uMiv4xuOL4ckpSSSbRkXbMnXPbkr4T%2bztPJyOvGYK1x%2btteEww8i9sDBi8To39bdsFKYuW8d9Lz6OvRWFufLhgpOB%2fRIKsjAVoKtvUJmXJw%2bbK10tMyNqGrFMrmvdEnShschC55e9Cc41XCbPDXW99dppmSPUjBD%2fz5mfBozIL9g5fGoQpQKh1M65v2oPA0AEQyga8rOWk9RSmdg2wjKtr7Z9oIygTnNITE9wrxC08%2fmkEBr%2bNVnyzN05zbAifUbhhiyYIMLnw4dgIzeJFili56D0LmAL%2fXHUWFldGdRcEpxOl1kLestC8SpXbAqRE%2b5RCCr4E7%2fhthS4LuInQtA%3d%3d; __jda=122270672.1810527096.1583840132.1587207769.1587267350.3; __jdv=122270672|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|7c58ebd6ecea4082b406425e270b1b88_0_548db00eb8ea4eb3a6eb44710cc57416|1587267349684; shshshfp=0ca00c61b8d65ee5c54c217cb3fe41ca; shshshsID=4c284260dcd8d8512daca7f2d018bbba_2_1587267356019; __jdb=122270672.2.1810527096|3.1587267350'
}

try:
    # requests.get() builds a Request and returns a Response object
    r = requests.get(url, headers=kv)
    # Raise HTTPError if the status code is not 200
    r.raise_for_status()
    # Use the content-sniffed encoding so the decoded text is not garbled
    r.encoding = r.apparent_encoding
    # Print the page body
    print(r.text)
except requests.RequestException:
    # BUG FIX: was a bare `except:` which silently swallows every error
    # (including KeyboardInterrupt); catch only request failures.
    print("爬取失败!")
实验运行结果:
需要材料:
二:网络爬虫的限制:
1.来源审查:对user-agent域审查
2.发布公告:发布robots协议
三:if __name__ == "__main__" 的理解:
# const.py
# Module-level constant used by other tutorial snippets.
PI = 3.14

def main(PI):
    """Print the given value of PI."""
    print("PI:", PI)

# BUG FIX: the original called `main()` with no argument, but main()
# requires one positional parameter, so it raised TypeError.
# Pass the module constant explicitly.
main(PI)
# area.py — demonstrates the `if __name__ == "__main__"` guard.
# Importing only PI still executes const.py's top-level code once;
# the guard below keeps THIS module's entry point from running on import.
from const import PI


def area(r):
    """Return the area of a circle with radius r."""
    return PI * r ** 2


def main():
    """Entry point: print the area of a circle of radius 2."""
    print("area:", area(2))


# Run main() only when this file is executed directly; when the module
# is imported from elsewhere, __name__ != "__main__" and nothing runs.
if __name__ == "__main__":
    main()
四:百度搜索的实现:
# Perform a Baidu search by passing the keyword as the `wd` query parameter.
import requests

keyword = "python"
url = "http://www.baidu.com/s"
# Query-string parameters: requests encodes this as ?wd=<keyword>
kv = {'wd': keyword}
# Headers to look like a normal browser session
hd = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
    'Cookie': 'BAIDUID=CDEA8223FE89F3DB111133A716519675:FG=1; BIDUPSID=CDEA8223FE89F3DB111133A716519675; PSTM=1569998960; BD_UPN=12314753; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDUSS=hqYjVyWnI3RGR0dUtLS3FVZW9PWTBrLTh5N09MYjV4WDZLaGZLaTBYfi1JYmxlRVFBQUFBJCQAAAAAAAAAAAEAAACsI7jL1tCxsdGns6Q2NgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP6UkV7-lJFeal; BDSFRCVID=oPusJeCCxG3HgeTu9rP95KxRKW2mEWAH1xJt3J; H_BDCLCKID_SF=tJkO_DKKJK-3fP36q4Qh5-4ObqbWetJya4o2WDvu-xJcOR5Jj65WWMIB5l5tqx0fMGTfbI5y5lvYEqrv3MA--t4fXPnT0hLtJ26iKtTcBP5msq0x0-nYe-bQypoaLUnyaDOMahvX5l7xO-5sQlPK5JkgMx6MqpQJQeQ-5KQN3KJmfbL9bT3tjjTXjHuet5kfJJ3fL-08MJnEJ45v-4rHhnI_Mq7054CXKKOLVh3Ktp7keq8CDxbjjPuy0q_80nKqQ6vT_4nFWq5RHnc2y5jHhpkXQnni0TbHXa6t0RrzyJQpsIJMhPDWbT8ULf5q-MjzaKvia-THBMb1fqRDBT5h2M4qMxtOLR3pWDTm_q5TtUJMeCnTD-Dhe6jWjG_8JT0Df5rBsDIyMR7hJbIk-PnVeUo-MPnZKxtqtJvtXtjtQPKBjRul3JJmjJkm5-RbLt6nWncKWhD-J4JH_Itxbf6T5Mv-hbQ405OTX5-O0KJcbRoJ8RvbhPJvynF8XnO7-x7lXbrtXp7_2J0WStbKy4oTjxL1Db3JKjvMtIFtVD85tCKMhKPr-PbjqJvHMx8X5-RLfKOTL4OF5lOTJh0Ry-Jq3bQ-KR08txnOLe3UoJ6NQhj1htn_55bke4tX-NFDJ5tHtU5; delPer=0; BD_CK_SAM=1; PSINO=1; COOKIE_SESSION=797_0_8_7_20_4_0_0_7_4_0_2_0_0_0_0_1587183915_0_1587215105%7C9%234367_100_1587091058%7C9; H_PS_PSSID=1466_31169_21114_31342_30903_30823_31085_31164; BD_HOME=1'
}
try:
    r = requests.get(url, headers=hd, params=kv)
    # BUG FIX: the original wrote `r.raise_for_status` without the call
    # parentheses — that only accesses the bound method and never calls
    # it, so HTTP errors were silently ignored. Check the status first,
    # before decoding the body.
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text)
except requests.RequestException:
    # BUG FIX: narrowed from a bare `except:` to request failures only.
    print("爬取失败")
五:图片下载的实现:
# Download an image to a local directory, skipping the download
# when the file already exists.
import requests
import os

# URL of the image to fetch
url = "https://images-cn.ssl-images-amazon.com/images/I/81M5fmmHKbL._AC_SL1500_.jpg"
# Local directory where images are stored
root = "E://移动后的桌面//爬虫//image//"
# Destination path: directory + the image's filename from the URL
path = root + url.split('/')[-1]

try:
    # Create the target directory if it does not exist yet
    if not os.path.exists(root):
        os.mkdir(root)
    # Only download when the file is not already present
    if not os.path.exists(path):
        r = requests.get(url)
        # Robustness: fail early on a non-200 response instead of
        # writing an error page to disk (the original skipped this).
        r.raise_for_status()
        # 'wb': open the file in binary mode for writing
        with open(path, 'wb') as f:
            # BUG FIX: the original had a bare (un-commented) Chinese
            # line here, which is a SyntaxError. Write the raw bytes.
            f.write(r.content)
        # NOTE: the explicit f.close() was removed — `with` already
        # closes the file on exit.
        print("文件保存成功")
    else:
        print("文件已经存在")
except (requests.RequestException, OSError):
    # BUG FIX: narrowed from a bare `except:`; still covers both the
    # HTTP failure and filesystem errors (mkdir/open) the original
    # intended to absorb.
    print("爬取失败")
六:IP地址的查询:
方法一:
# IP geolocation lookup, method 1: append the IP to the URL by hand.
import requests

url = "https://ip38.com/ip.php?ip="
try:
    # Query string is built by plain concatenation
    r = requests.get(url + "202.204.80.112")
    # Raise HTTPError for non-200 responses
    r.raise_for_status()
    # Decode with the content-sniffed encoding to avoid mojibake
    r.encoding = r.apparent_encoding
    print(r.text)
except requests.RequestException:
    # BUG FIX: narrowed from a bare `except:` which would also swallow
    # KeyboardInterrupt/SystemExit and hide programming errors.
    print("爬取失败")
方法二:
# IP geolocation lookup, method 2: let requests build the query string
# from a params dict (preferred — handles URL-encoding automatically).
import requests

url = "https://ip38.com/ip.php?"
kv = {'ip': "202.204.80.112"}
try:
    r = requests.get(url, params=kv)
    # Raise HTTPError for non-200 responses
    r.raise_for_status()
    # Decode with the content-sniffed encoding to avoid mojibake
    r.encoding = r.apparent_encoding
    print(r.text)
except requests.RequestException:
    # BUG FIX: narrowed from a bare `except:` which would also swallow
    # KeyboardInterrupt/SystemExit and hide programming errors.
    print("爬取失败")