二十、python xpath介紹和新聞內容爬蟲
Xpath介紹
用xpath提取感興趣的內容
一個網頁文檔是一個半結構化的數據,其實html文檔就是一個樹形結構。根節點是html
用正則表達式也可以提取,但是不如xpath方便
1、路徑表示法
/ 從根節點定位;// 選取文檔中所有匹配的節點(不限位置)
/text(): 提取文本的內容
/@attr:提取屬性的內容
2、篩選條件
/div[@id]
/div[@id="content_id"]
/book[price>100] #按照節點的值
問題:需要先在 Windows 中安裝 lxml
前提是先裝好:pip
C:\Users\lyd>pip install lxml
Xpath使用
#獲取新聞列表
import requests
from lxml import etree
import datetime
def getNewsUrlList(baseUrl):
    """Yield (title, url, ptime) for every news item on the list page at baseUrl."""
    response = requests.get(baseUrl)
    # The site serves its pages in a gbk-family encoding, so decode explicitly.
    document = etree.HTML(response.content.decode('gbk'))
    # Only <li> elements that contain a <div> are real news entries.
    items = document.xpath('//div[@id="content_right"]/div[@class="content_list"]/ul/li[div]')
    for item in items:
        link = item.xpath('div/a/@href')[0]
        headline = item.xpath('div/a/text()')[0]
        published = item.xpath('div[@class="dd_time"]/text()')[0]
        yield (headline, link, published)
# Fetch the article body for each news entry.
def getNewsContent(newsUrlList):
    """Yield (title, url, ptime, news) for every entry in newsUrlList.

    newsUrlList: iterable of (title, url, ptime) tuples, as produced by
    getNewsUrlList.
    """
    # Bug fix: the original iterated the misspelled global 'newUrlList'
    # instead of the 'newsUrlList' parameter, raising NameError when the
    # caller's variable was named differently.
    for title, url, ptime in newsUrlList:
        x = requests.get(url)
        html = x.content.decode('gbk')  # page encoding is gb2312/gbk
        selector = etree.HTML(html)
        paragraphs = selector.xpath('//div[@class="left_zw"]/p/text()')
        # Join with CRLF so each <p> paragraph ends on its own line.
        news = '\r\n'.join(paragraphs)
        yield title, url, ptime, news
# Date helper for building list-page URLs.
def getYesterday(i):
    """Return the date i days before today, formatted as 'MMDD' (no year)."""
    target = datetime.date.today() - datetime.timedelta(days=i)
    return target.strftime("%m%d")
if __name__ == "__main__":
    # List-page URL pattern: .../mil/{year}/{month}{day}/news.shtml, e.g.
    #   http://www.chinanews.com/scroll-news/2017/0719/news.shtml
    #   http://www.chinanews.com/scroll-news/mil/2017/0717/news.shtml
    urlTemplate = 'http://www.chinanews.com/scroll-news/mil/{0}/{1}{2}/news.shtml'
    # Month/day must be zero-padded (the original '7', '20' built a broken URL).
    testurl = urlTemplate.format('2017', '07', '20')
    # print(testurl)

    # Homework: crawl the news from the last n days.
    for i in range(0, 10):
        # Bug fix: derive the full date here instead of hard-coding "2017/";
        # the original combined a fixed year with getYesterday()'s MMDD,
        # which broke in any other year and across year boundaries.
        day = datetime.date.today() - datetime.timedelta(days=i)
        urls = 'http://www.chinanews.com/scroll-news/mil/%s/news.shtml' % day.strftime('%Y/%m%d')
        newUrlList = getNewsUrlList(urls)
        for title, url, ptime in newUrlList:
            print(title, url, ptime)  # print() works on both Python 2 and 3
        # Uncomment to also fetch and persist the article bodies:
        # newsConstant = getNewsContent(newUrlList)
        # with open('news.txt', 'wb') as f:  # open for writing
        #     w = lambda x: f.write((x + u'\r\n').encode('utf-8'))
        #     for title, url, ptime, news in newsConstant:
        #         w(u'~' * 100)
        #         w(title)
        #         w(url)
        #         w(news)
-----------------------------------------------------------------------------------------------------------------------