本文主要總結了爬取時光網的方法,僅供學習爬蟲使用:
1.爬取首頁熱點資訊和新聞圖片:
# 1. Crawl the front-page hot news and news images.
url = "http://www.mtime.com/"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# Collect the hot news entries.
news_dic = {}
news_list = soup.find_all('div', attrs={'class': 'newsitem'})
for news in news_list:
    title = news.find('a')['title']
    content = news.find('a')['href']
    # Hot-news dict: key = news title, value = news link.
    news_dic[title] = content

# Collect the hot images.  The image URL sits inside the inline CSS
# "style" attribute as url(...), so extract the text between the
# parentheses (raw string avoids the invalid '\.' escape warning).
hotpicture_list = soup.find_all('div', attrs={'class': 'over-a'})
for hotpicture in hotpicture_list:
    picture_url = re.findall(r"(?<=[(])[^()]+\.[^()]+(?=[)])", hotpicture['style'])[0].replace(' ', '')
    # NOTE(review): urllib.urlretrieve is the Python 2 API; on Python 3
    # use urllib.request.urlretrieve.  picture_path must be set by the caller.
    urllib.urlretrieve(picture_url, picture_path)
2.爬取搜索到的電影資訊
# 2. Crawl movie info found by a search.
# quote() URL-encodes the search keyword so it is safe inside the URL.
searchcontent = quote(searchcontent)
url_search = 'http://service.channel.mtime.com/Search.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Channel.Services&Ajax_CallBackMethod=GetSearchResult&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fsearch.mtime.com%2Fsearch%2F'+quote('/?q='+searchcontent+'&t=1&i=0&c=791')+'&Ajax_CallBackArgument0='+searchcontent+'&Ajax_CallBackArgument1=1&Ajax_CallBackArgument2=791&Ajax_CallBackArgument3=0&Ajax_CallBackArgument4=1'
# Use .text (str) instead of .content (bytes) so the str regex patterns
# below also work on Python 3.
moviesearch_result = requests.get(url_search).text
movie_dic = {}
# Each movie record in the JSONP reply looks like {"movieId":...}.
movie_list = re.findall(r'{"movieId":(.*?)}', moviesearch_result)
for movie in movie_list:
    movie_title = re.findall(r'"movieTitle":"(.*?)",', movie)[0]
    movie_url = re.findall(r'"movieUrl":"(.*?)",', movie)[0]
    # Search-result dict: key = movie title, value = movie link.
    movie_dic[movie_title] = movie_url
# 3. Crawl filmmaker (person) info found by a search.
searchcontent = quote(searchcontent)
url_search='http://service.channel.mtime.com/Search.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Channel.Services&Ajax_CallBackMethod=GetSearchResult&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fsearch.mtime.com%2Fsearch%2F'+quote('?q='+searchcontent+'&t=3&i=0&c=791')+'&Ajax_CallBackArgument0='+searchcontent+'&Ajax_CallBackArgument1=3&Ajax_CallBackArgument2=791&Ajax_CallBackArgument3=0&Ajax_CallBackArgument4=1'
# .text gives str, so the regexes below work on both Python 2 and 3.
moviersearch_result = requests.get(url_search).text
# Take the first (and only) person record: {"personId":...}]
movirer_list = re.findall(r'{"personId":(.*?)}]', moviersearch_result)[0]
# Portrait photo.
movierpicture_url = re.findall(r'"cover":"(.*?)",', movirer_list)[0]
# NOTE(review): Python 2 API; use urllib.request.urlretrieve on Python 3.
urllib.urlretrieve(movierpicture_url, movierpicture_path)
# Name.
moviertitle = re.findall(r'"personTitle":"(.*?)",', movirer_list)[0]
# Profession / filmography summary.
movierwork = re.findall(r'"personFilmography":"(.*?)",', movirer_list)[0]
# Birthday.
movierbirth = re.findall(r'"birth":"(.*?)",', movirer_list)[0]
# Popularity ("love") score.
movierlove = re.findall(r'"love":(.*?),', movirer_list)[0]
# Representative works, joined as 《title》《title》... ("".join avoids the
# quadratic += concatenation loop).
moviermovielist = re.findall(r'"title":"(.*?)",', movirer_list)
movielist = "".join("《" + m + "》" for m in moviermovielist)
# Link to the full profile page.
url_movier = re.findall(r'"personUrl":"(.*?)",', movirer_list)[0]
4.根據用戶ID爬取用戶所有影評
# 4. Crawl all of a user's reviews and ratings, given the user id.
review_dict = dict()  # serial number -> [movie, review text, review time]
point_dict = dict()   # serial number -> [movie, rating, rating time]
review_No = 1
point_No = 1
page_num = 1
url_user = 'http://sandbox.my.mtime.com/Service/callback.mc?Ajax_CallBack=true&Ajax_CallBackType=Mtime.MemberCenter.Pages.CallbackService&Ajax_CallBackMethod=RemoteLoad&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fmy.mtime.com%2F'+ str(userId) + '%2F&Ajax_CallBackArgument0=t&Ajax_CallBackArgument1=' + str(userId) + '%2F%3F%242'
while True:
    # The endpoint returns JavaScript; the page HTML is embedded inside
    # a content:"..." string, so use .text (str, not bytes) and cut it out.
    cont = requests.get(url_user).text
    cont_list = re.findall(r'content:"\s+(.+?)<!--content end -->', cont)[0]
    # Strip the backslash escaping the JS string carries.
    txt = re.sub(r'\\', '', cont_list)
    soup = BeautifulSoup(txt, 'html.parser')
    # User info is only scraped from the first page.
    if page_num == 1:
        member_info_name = soup.find('div', attrs={'class': 't_memberinfo'}).find('a')['title']
        member_info = member_info_name + u'說'
    # Walk every review/rating module on the page.
    Content_list = soup.find_all('div', attrs={'class': 't_module'})
    for content in Content_list:
        review = content.find('dt', attrs={'class': 'normal'})
        point = content.find('strong', attrs={'class': 'c_green fl'})
        if review:
            # This module is a written review.
            try:
                review_movie = content.find('dd', attrs={'class': 'clearfix mt9 tl_link'}).get_text()
            except AttributeError:
                # Movie node missing: keep the previously seen movie name.
                pass
            review_content = review.get_text()
            # Drop the leading "<user>說" prefix.  re.escape is required
            # because re.sub treats its first argument as a regex pattern.
            review_content = re.sub(re.escape(member_info), '', review_content)
            review_time = content.find('span', attrs={'class': 'mt3 fl'}).find('a').get_text()
            # Review dict: key = serial number,
            # value = [movie, review text, review time].
            review_detail = [review_movie, review_content, review_time]
            review_dict[review_No] = review_detail
            review_No = review_No + 1
        elif point:
            # This module is a numeric rating; only record it when the
            # movie-title node is present (otherwise point_movie would
            # be unbound).
            if content.find('div', attrs={'class': 'clearfix mt9 px14 tl_link lh16'}):
                point_movie = content.find('div', attrs={'class': 'clearfix mt9 px14 tl_link lh16'}).get_text()
                point_content = point.get_text()
                point_time = content.find('span', attrs={'class': 'mt3 fl'}).find('a').get_text()
                # Rating dict: key = serial number,
                # value = [movie, rating, rating time].
                point_detail = [point_movie, point_content, point_time]
                point_dict[point_No] = point_detail
                point_No = point_No + 1
    # Pagination: follow the "next" link until it disappears.
    page_list = soup.find('div', attrs={'class': 'my_page'})
    next_page = page_list.find('a', attrs={'class': 'ml10 next'})
    if next_page:
        if page_num == 1:
            max_page = page_list.find_all('a', attrs={'class': 'num'})
        page_num = page_num + 1
        url_user = 'http://sandbox.my.mtime.com/Service/callback.mc?Ajax_CallBack=true&Ajax_CallBackType=Mtime.MemberCenter.Pages.CallbackService&Ajax_CallBackMethod=RemoteLoad&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fmy.mtime.com%2F' + str(userId) + '%2F%3Ffilter%3D0%26pageIndex%3D' + str(page_num) + '&Ajax_CallBackArgument0=t&Ajax_CallBackArgument1=' + str(userId) + '%2F%3Ffilter%3D0%26pageIndex%3D' + str(page_num)
    else:
        break
5.模擬登陸併發布影評
# 5. Simulated login, then post a micro-review.
# Build the POST payload for the login form.
login_postdata = {
    'loginEmailText': 'username',  # the username is sent in plain text
    'loginPasswordText': '',
    'inputVcode': '',
    'isvcode': 'true',
    'isAutoSign': 'true'
}
# The password field carries the MD5 hash of the password.
login_postdata['loginPasswordText'] = self.md5(password)
url_login = 'https://passport.mtime.com/member/signinLogin'
# A Session keeps the login cookies for the follow-up request.
s = requests.Session()
login_result = json.loads(s.post(url_login, data=login_postdata).content)
# Map the server's result code to a UI message.
if login_result['result']['code'] == 91880012:
    self.label_inform.setText(u'用戶名無效')
elif login_result['result']['code'] == 91880013:
    self.label_inform.setText(u'密碼錯誤')
elif login_result['result']['code'] == 0:
    self.label_inform.setText(u'登錄成功!')
    # After a successful login, post the micro-review.  URL-encode the
    # user-supplied content so spaces/CJK/special characters do not
    # break the request URL.
    url_send = 'http://service.mtime.com/Service/Twitter.msi?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Service.Pages.TwitterService&Ajax_CallBackMethod=PostTweetCrossDomainByFlash&Ajax_CrossDomain=1&Ajax_RequestUrl=http://my.mtime.com/&Ajax_CallBackArgument0=' + quote(content)
    s.get(url_send)
# Password hashing helper.
def md5(self, text):
    """Return the hex MD5 digest of *text*.

    Accepts str (encoded as UTF-8 before hashing) or bytes.  Fixes the
    original, which shadowed the builtin ``str``, required bytes input on
    Python 3, and computed the digest twice.
    """
    if isinstance(text, str):
        text = text.encode('utf-8')
    m = hashlib.md5()
    m.update(text)
    return m.hexdigest()