# ---------------------------------------------------------------------------------------------
# [版權申明:本文系作者原創,轉載請註明出處]
# 文章出處:http://blog.csdn.net/sdksdk0/article/details/76208980
# 作者:朱培 ID:sdksdk0
# ---------------------------------------------------------------------------------------------
import re  # link-pattern matching (used below but missing from the original imports)
import time  # timestamp for the collection time
import urllib.request  # HTTP access

import bs4  # DOM parsing
import pymysql  # MySQL database driver
# Crawler configuration.
maxcount = 100  # stop after this many articles have been stored
home = 'https://www.tianfang1314.cn/index.html'  # crawl entry point

# Database connection parameters.
# NOTE(review): 'username', 'password' and 'database' were missing in the
# original dict, so building the connection raised KeyError — placeholders
# added; fill in real credentials before running.
db_config = {
    'host': 'localhost',
    'port': '3306',
    'username': 'root',
    'password': '',
    'database': 'test',
    'charset': 'utf8'
}

url_set = set()  # relative article paths still to crawl
url_old = set()  # paths already processed (dedup within this run)
# Fetch the start page and seed url_set with the article links found on it.
request = urllib.request.Request(home)
response = urllib.request.urlopen(request)
html = response.read()
# The site serves UTF-8; decode once at the I/O boundary.
html = html.decode('utf-8')
soup = bs4.BeautifulSoup(html, 'html.parser')
# Article links are relative paths like /blog/articles/<category>/<name>.html.
# Raw string and escaped dot: the original non-raw '\w' pattern relied on a
# deprecated escape and its bare '.' matched any character.
pattern = r'/blog/articles/\w+/\w+\.html'
links = soup.find_all('a', href=re.compile(pattern))
for link in links:
    url_set.add(link['href'])
class Article(object):
    """One scraped blog article and its metadata.

    All fields start as ``None``; the crawl loop fills them in after
    parsing the article page.
    """

    def __init__(self):
        self.url = None      # article address
        self.title = None    # headline text
        self.author = None   # author user name
        self.date = None     # publication time (as shown on the page)
        self.content = None  # article body text
        self.zq_date = None  # timestamp of when the article was scraped
# Open the MySQL connection described by db_config and grab a cursor.
_conn_kwargs = {
    'host': db_config['host'],
    'port': int(db_config['port']),   # pymysql wants an int port
    'user': db_config['username'],
    'passwd': db_config['password'],
    'db': db_config['database'],
    'charset': db_config['charset'],
}
connect = pymysql.Connect(**_conn_kwargs)
cursor = connect.cursor()
# Main crawl loop: pop relative article paths from url_set, fetch each page
# once, harvest new links, parse the article fields, and insert a row.
# Stops when url_set is empty or `maxcount` articles have been stored.
count = 0
base = 'https://www.tianfang1314.cn'
# Links on article pages are relative paths, same form as the seed pattern.
# (The original matched absolute URLs here and then prepended the base a
# second time, producing invalid URLs.)  Compiled once, outside the loop.
link_pattern = re.compile(r'/blog/articles/\w+/\w+\.html')
while len(url_set) != 0:
    try:
        # Mark the path visited BEFORE fetching: the original stored the
        # absolute URL in url_old but compared relative hrefs against it,
        # so dedup never matched and visited pages were re-queued forever.
        path = url_set.pop()
        url_old.add(path)
        url = base + path

        # Database-level duplicate check.  Parameterized query: the
        # original interpolated the URL into the SQL string.
        cursor.execute("SELECT id FROM news WHERE url = %s", (url,))
        if cursor.rowcount != 0:
            raise Exception('重複數據: ' + url)

        # Fetch the article page exactly once.  (The original re-fetched
        # the HOME page here and then fetched the article separately.)
        response = urllib.request.urlopen(urllib.request.Request(url))
        html = response.read()
        html = html.decode('utf-8')
        soup = bs4.BeautifulSoup(html, 'html.parser')

        # Harvest further, not-yet-visited article links from this page.
        for link in soup.find_all('a', href=link_pattern):
            href = link['href']
            if href not in url_old:
                url_set.add(href)

        # Extract the article fields from the detail markup.
        article = Article()
        article.url = url
        page = soup.find('div', {'class': 'data_list'})
        article.title = page.find('div', {'class': 'blog_title'}).get_text()
        # Info line looks like: 發佈時間:『 2016-12-14 11:26 』 用戶名:sdksdk0 閱讀(938) 評論(3)
        info = page.find('div', {'class': 'blog_info'}).get_text()
        info = info.rsplit('『', 1)
        info = info[1].rsplit('』', 1)
        article.date = info[0]  # text between the 『 』 brackets
        article.author = info[1].rsplit('\xa0\xa0', 1)[0].rsplit('用戶名:', 1)[1]
        article.content = page.find('div', {'class': 'blog_content'}).get_text()
        article.zq_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

        # Store the row.  Parameterized: the original built the INSERT via
        # '%' interpolation, which breaks on any quote in the title/content
        # and is SQL-injectable.
        cursor.execute(
            "INSERT INTO news( url, title, author, date, content, zq_date ) "
            "VALUES (%s, %s, %s, %s, %s, %s)",
            (article.url, article.title, article.author, article.date,
             article.content, article.zq_date),
        )
        connect.commit()
    except Exception as e:
        # Best-effort crawler: log the failure and move on to the next URL.
        print(e)
        continue
    else:
        print(article.title)
        count += 1
    finally:
        # Stop once enough articles have been collected.
        if count == maxcount:
            break
# Release database resources: the original closed only the cursor and
# leaked the connection.
cursor.close()
connect.close()