#!/usr/bin/python
import requests
from bs4 import BeautifulSoup
import sys
import re
import threading
reload(sys)
sys.setdefaultencoding('utf-8')
def deal_i(tag):
    """Extract one article's (title, description) pair from a list-item tag.

    ``tag`` must expose an ``a`` child whose text is the title, and at
    least one descendant with class ``article_description`` whose text
    is the summary.  Both strings are returned whitespace-stripped.
    """
    description_nodes = tag.find_all(class_='article_description')
    return (tag.a.text.strip(), description_nodes[0].text.strip())
def deal_url(url):
    """Fetch one blog list page and return [(title, description), ...].

    Raises requests.RequestException (incl. Timeout) on network failure,
    so a stalled connection cannot hang a scraper thread forever.
    """
    headers = {
        # Spoof a desktop browser UA so CSDN serves the normal page.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
    }
    # Explicit timeout: the original call could block indefinitely.
    r = requests.get(url, headers=headers, timeout=10)
    s = BeautifulSoup(r.text, 'html.parser')
    # Materialise a real list (not a lazy map) so callers can safely
    # concatenate per-page results with `+`.
    return [deal_i(item)
            for item in s.find_all('div', attrs={'class': 'list_item article_item'})]
class th_htl(threading.Thread):
    """Worker thread that scrapes a batch of list-page URLs.

    Results accumulate in ``self.lst`` and should be read back via
    ``get_lst()`` only after ``join()`` has returned.
    """
    def __init__(self, lst_url):
        threading.Thread.__init__(self)
        self.lst = []           # (title, description) tuples filled in by run()
        self.lst_url = lst_url  # list-page URLs this worker is responsible for
    def run(self):
        # Concatenate the per-page results.  Unlike the original
        # reduce() with no initial value, this does not raise
        # TypeError when lst_url is empty (an empty chunk is possible).
        result = []
        for url in self.lst_url:
            result.extend(deal_url(url))
        self.lst = result
    def get_lst(self):
        """Return the scraped (title, description) list; call after join()."""
        return self.lst
if __name__ == '__main__':
url = 'http://blog.csdn.net/lanphaday/article/list/PAGE'
# genarate url list
lst_url = []
for i in range(1,14):
lst_url.append(re.sub('PAGE', '%d' % i, url))
print lst_url
l = len(lst_url)
n = l
p = l / n
n_2 = n * p < l and n+1 or n
print n_2
lst_all = []
th = []
# genarate thread and start
for i in range(0,n_2):
th_ = th_htl(lst_url[i*p:(i+1)*p])
th_.start()
th.append(th_)
continue
lst = reduce(lambda x, y : x+y, map(deal_url, lst_url[i*p:(i+1)*p]))
lst_all += lst
# wait thread
for item in th:
item.join()
# get thread value
for item in th:
lst_all += item.get_lst()
for item in lst_all:
print 'TITLE:', item[0]
print 'DESCRIPTION:', item[1]## 標題 ##
# NOTE(review): the lines below are blog-page boilerplate that was pasted
# into the source file and made it invalid Python; kept here as comments.
# ## Title ## PYTHON: fetch CSDN blog article titles and descriptions
# Post a comment / All comments
# No comments yet. Want to be the first? Type in the comment box above and click publish.