Python爬蟲——bs4

from bs4 import BeautifulSoup
import  requests
# Target page for the scraping demo.
url = "https://www.douban.com/?p=1"
# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
}
# Raw Cookie header string copied from a browser session.
# NOTE(review): this looks like a real logged-in session (dbcl2 / ck); avoid
# committing live credentials and load them from the environment instead.
cookies = 'bid=CaE0F8bS1Fo; __gads=ID=909dc33bfcc2076a:T=1581476387:S=ALNI_MbZ4jrzEUZ-XOtc_K2keuVnphlCdA; push_noty_num=0; push_doumail_num=0; ll="108288"; _vwo_uuid_v2=DCA8B8479EC9962DA9583DB109414C6BC|2b6f125f8df85eaa8526a461fb2f8d70; __yadk_uid=5gtjCnoJMeEhSDeRjRByUx4Zz8ViPcCi; __utmv=30149280.21093; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1581867944%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPlE9kQOeLGCLm0IBa75DE_QSqt08PlSJ3AAFebIZSxUiescU8j0_1K20xwHD0q8f%26wd%3D%26eqid%3Db1c87716012299e2000000035e4963a3%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1767680939.1581476455.1581743959.1581867945.7; __utmc=30149280; __utmz=30149280.1581867945.7.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; dbcl2="210933638:HvbCs6Bvse4"; ck=oxtR; ap_v=0,6.0; _pk_id.100001.8cb4=cb3b8107db3f7bda.1581735902.5.1581868163.1581863253.; __utmb=30149280.5.10.1581867945'
# Convert the "name=value; name=value" header string into the dict form that
# requests accepts for its `cookies` argument.
cook_dict = {}
cookies_list = cookies.split("; ")
for cookie in cookies_list:
    # Split on the FIRST "=" only: values such as __gads ("ID=...:T=...") and
    # __utmz ("...utmcsr=baidu|...") contain "=" themselves and would be
    # truncated by a plain split("=")[1].
    cookie_name, _, cookie_value = cookie.partition("=")
    cook_dict[cookie_name] = cookie_value

# Fetch the page, sending the browser-like headers and the parsed cookies.
# requests has no default timeout, so without one a stalled server would
# block this script forever.
response = requests.get(url, headers=headers, cookies=cook_dict, timeout=10)
# Decode the raw response bytes as UTF-8.
data = response.content.decode("utf-8")
# Build the parse tree using the lxml parser backend.
soup = BeautifulSoup(data, "lxml")
# Attribute-style access (soup.<tag name>) yields a Tag object, but only the
# first matching one. Each demo below overwrites `result`.
result = soup.head
# Get the text of a tag; drawback: only the first <a> is consulted.
# (.string is None when the tag has more than one child.)
result = soup.a.string
# Get comment content: .string returns a Comment object when the tag's only
# child is an HTML comment.
# NOTE(review): confirm the first <p> on this page actually wraps a comment.
result = soup.p.string
# Get an attribute value; again only from the first <a>.
result =soup.a['href']
####################################################
# General-purpose search methods
# find() returns the first tag that satisfies the query (None if no match).
result = soup.find(name="a")
# `attrs` must be a dict of attribute name -> value. The previous
# {"class=global-nav"} was a set literal, which bs4 treats as a search for
# the literal class name "class=global-nav" and therefore never matches.
result = soup.find(attrs={"class": "global-nav"})
# Tag name and attribute filters can be combined.
result = soup.find(
    name="div",
    attrs={"class": "global-nav"},
)

# find_all also returns tag objects; the difference from find above is that
# the result is a list of every match.
result =soup.find_all('a')
# limit=1 stops after the first match (still returned as a list).
result =soup.find_all('a',limit=1)


# select() takes a CSS selector and returns a list of every matching tag.
result = soup.select("head title")
# A comma-separated selector group matches either selector: here, <title>
# tags plus any element with class "title".
result = soup.select("title,.title")
# select_one() also takes a CSS selector but returns only the first match
# (None when nothing matches).
result = soup.select_one(".global-nav")
# Text wrapped by a tag. select_one() plus a None check replaces
# select(...)[0], which raises IndexError when the page has no such element.
first_bold = soup.select_one("b")
result = first_bold.get_text() if first_bold is not None else None
# An attribute of a tag, guarded the same way.
# NOTE(review): "#link1" looks like the id used in the Beautiful Soup
# documentation's sample document; confirm it exists on the fetched page.
first_link = soup.select_one("#link1")
result = first_link.get("href") if first_link is not None else None

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章