#encoding=utf8
import requests
from lxml import etree
class QiuShi(object):
    """Scrape the text-joke listing pages of qiushibaike.com.

    For every joke found on a listing page a row of six values is printed:
    author, gender, age, joke text, vote count and comment count (the same
    order as the header list printed by ``__init__``).
    """

    # Browser-like User-Agent header sent with every request.
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
    }
    # Base URL of the text section; listing-page URLs are built from it.
    url = 'http://www.qiushibaike.com/text/'
    # Seconds to wait on the server before a request is abandoned, so a
    # stalled connection cannot hang the whole crawl.
    timeout = 10

    def __init__(self):
        # Column headers, in the same order as the values of each row.
        filed = ['作者','性別','年齡','段子內容','好笑','評論']
        # NOTE: no CSV writer is wired up; rows are only printed.
        print(filed)

    def totalUrl(self):
        """Fetch listing pages 1 through 35 in order.

        Prints a progress line with the page number before each page is
        fetched and parsed by ``getInfo``.
        """
        for page in range(1, 36):
            url = self.url + 'page/{}?s=4985075'.format(page)
            # Report the actual page number; splitting the URL on '/'
            # yields the literal path segment "page", not the number.
            print(u'正在獲取:' + str(page) + u'頁')
            self.getInfo(url)

    def getInfo(self, url):
        """Download one listing page and print a row for every joke on it.

        Args:
            url: Absolute URL of the listing page to fetch.
        """
        html = requests.get(
            url, headers=self.headers, timeout=self.timeout
        ).text
        data = etree.HTML(html)
        infos = data.xpath('//*[@class="article block untagged mb15"]')
        # Debug output: the matched article elements.
        print(infos)
        for info in infos:
            print(self._parseArticle(info))

    @staticmethod
    def _firstText(node, path, default=None):
        """Return the first XPath result of *path* on *node*, else *default*."""
        found = node.xpath(path)
        return found[0] if found else default

    @classmethod
    def _parseArticle(cls, info):
        """Build the six-value row for one article element.

        Args:
            info: Element for a single joke; only its ``xpath`` method is
                used, and every query is relative to this element.

        Returns:
            A list ``[author, gender, age, content, votes, comments]``.
            Values that cannot be found are replaced by placeholders
            instead of raising.
        """
        unknown = u'不詳'

        author = cls._firstText(info, 'div[1]/a[2]/h2/text()')
        if author is None:
            # No author link was found: treat the post as anonymous.
            author, gender, age = u'匿名用戶', unknown, unknown
        else:
            age = cls._firstText(
                info, 'div[1]/div[@class="articleGender womenIcon"]/text()')
            if age is not None:
                gender = u'女'
            else:
                age = cls._firstText(
                    info, 'div[1]/div[@class="articleGender manIcon"]/text()')
                if age is not None:
                    gender = u'男'
                else:
                    # Keep the author name even when no gender icon exists.
                    gender, age = unknown, unknown

        content = cls._firstText(info, 'a/div/span/text()', unknown).strip()
        votes = cls._firstText(info, 'div[2]/span[1]/i/text()', unknown)
        # Query relative to this article; a document-wide query would give
        # every row the comment count of the first joke on the page.
        comments = cls._firstText(
            info, './/*[@class="qiushi_comments"]/i/text()', unknown)
        return [author, gender, age, content, votes, comments]
# with open('C:\\QiuShiBaiKe.cvs', 'w+') as f:
# # f.write('{},{},{},{},{}'.format(row, work_year, money, palace, '\n'))
# f.write(row+"")
if __name__ == "__main__":
    # Script entry point: start the crawl only when this file is executed
    # directly, not when it is imported as a module.
    QiuShi().totalUrl()
python3 [入門基礎實戰] 爬蟲入門之爬取糗事百科
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.