PYTHON 獲取csdn的博客文章標題和描述

#!/usr/bin/python

import requests
from bs4 import BeautifulSoup
import sys
import re
import threading

# Python 2-only idiom: reloading sys makes sys.setdefaultencoding callable
# again (it is normally removed at interpreter startup), and the call then
# makes UTF-8 the default codec for implicit str/unicode conversions.
# NOTE(review): neither the builtin reload() nor sys.setdefaultencoding
# exists on Python 3 -- confirm the target interpreter before porting.
reload(sys)
sys.setdefaultencoding('utf-8')

def deal_i(tag):
    """Extract the title and description from one article entry.

    Args:
        tag: a parsed element exposing ``.a.text`` (the title link) and
            ``find_all(class_=...)``; presumably a BeautifulSoup Tag for one
            article ``div`` -- verify against the caller.

    Returns:
        A ``(title, description)`` tuple, both stripped of surrounding
        whitespace.

    Raises:
        AttributeError: if the entry has no ``a`` child.
        IndexError: if no ``article_description`` element is found.
    """
    # Resolve the title first so a malformed entry fails on the link lookup
    # before the description lookup.
    heading = tag.a.text.strip()
    description_nodes = tag.find_all(class_='article_description')
    summary = description_nodes[0].text.strip()
    return heading, summary

def deal_url(url):
    """Download one article-list page and extract every article on it.

    Args:
        url: address of the list page to fetch.

    Returns:
        A list with one ``deal_i`` result (a ``(title, description)`` tuple)
        per ``div`` whose class is ``list_item article_item``; empty when the
        page contains no such element.

    Raises:
        requests.exceptions.RequestException: on connection failure or when
            the server does not answer within the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
    }
    # Without a timeout requests waits indefinitely, so one stalled server
    # would block the caller forever.
    r = requests.get(url, headers=headers, timeout=30)

    s = BeautifulSoup(r.text, 'html.parser')
    items = s.find_all('div', attrs={'class': 'list_item article_item'})

    # Return a real list (not a lazy map object) because callers concatenate
    # the results of several pages with "+".
    return [deal_i(item) for item in items]

class th_htl(threading.Thread):
    """Worker thread that scrapes a batch of list-page URLs.

    After the thread has finished, ``get_lst()`` returns the combined
    ``deal_url`` results for every URL in the batch, in batch order.
    """

    def __init__(self, lst_url):
        """Store the batch of URLs to process.

        Args:
            lst_url: list of page URLs handled by this thread.
        """
        threading.Thread.__init__(self)
        # Results collected by run(); stays empty until the thread finishes.
        self.lst = []
        self.lst_url = lst_url

    def run(self):
        """Fetch every URL of the batch and store the concatenated results."""
        # Accumulate into a local list instead of reduce(): this also works
        # for an empty batch and needs no builtin that Python 3 removed.
        collected = []
        for url in self.lst_url:
            collected.extend(deal_url(url))
        # Publish only once the whole batch succeeded, so a failure midway
        # leaves self.lst empty rather than partially filled.
        self.lst = collected

    def get_lst(self):
        """Return the results gathered by run() (empty before completion)."""
        return self.lst

if __name__ == '__main__':
    # Script entry point: build the list-page URLs, scrape them with th_htl
    # worker threads, then print every collected title and description.
    url = 'http://blog.csdn.net/lanphaday/article/list/PAGE'

    # Generate the URL list by substituting page numbers 1..13 for PAGE.
    lst_url = [re.sub('PAGE', '%d' % page, url) for page in range(1, 14)]
    print(lst_url)

    total = len(lst_url)
    wanted_threads = total  # currently one thread per URL
    # Ceiling division for both values so every URL lands in some batch even
    # when wanted_threads does not divide the URL count evenly.
    batch_size = -(-total // wanted_threads)
    thread_count = -(-total // batch_size)
    print(thread_count)

    # Generate the threads and start them.
    workers = []
    for index in range(thread_count):
        batch = lst_url[index * batch_size:(index + 1) * batch_size]
        worker = th_htl(batch)
        worker.start()
        workers.append(worker)

    # Wait for every thread to finish.
    for worker in workers:
        worker.join()

    # Collect the results of each thread, in start order.
    lst_all = []
    for worker in workers:
        lst_all += worker.get_lst()

    for item in lst_all:
        # Single-argument print calls give the same output on Python 2 and 3.
        print('TITLE: %s' % item[0])
        print('DESCRIPTION: %s' % item[1])
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章