一、分頁的兩種方法
(一)for循環
這種方法的使用限制是,需要知道最大頁碼。
# Walk a fixed, known page range (pages 1 through 19); ``url`` is a
# %-style template defined outside this fragment.
for i in range(1, 20):
    response = requests.get(url % i)
(二)while True循環
使用這種方法,需要限定跳出循環的邊界。
# Offset-based paging: keep requesting until the endpoint returns an
# empty JSON array, which is the loop's only exit condition.
i = 0
while True:
    json_str = get_content(url.format(type_, i), headers=headers)
    print(json_str)
    if json_str == '[]':
        break
    json_data = json.loads(json_str)
    parse_json(json_data)
    # Advance the record offset by one page.
    i += 20
二、項目
豆瓣電影項目
import requests
import json,re
from lxml import etree
from urllib import parse
def get_content(url, headers, timeout=10):
    """Send a GET request and return the response body.

    :param url: URL to request.
    :param headers: HTTP headers sent with the request.
    :param timeout: Seconds to wait for the server before giving up.
    :return: The response body as text (``response.text``), not parsed JSON.
    """
    # Without an explicit timeout a stalled connection would block forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def parse_data(list_):
    """Print the fields of interest for every movie record.

    :param list_: Iterable of dicts, each holding at least the keys
        ``title``, ``url``, ``release_date``, ``vote_count`` and ``score``.
    :raises KeyError: If a record lacks one of those keys.
    """
    fields = ('title', 'url', 'release_date', 'vote_count', 'score')
    for one in list_:
        # Keep only the selected keys, in a fixed order.
        item = {field: one[field] for field in fields}
        print(item)
def parse_ajax(type_, type_name):
    """Download and print every ranking page of one movie category.

    :param type_: Category id, used as the ``type`` query parameter.
    :param type_name: Category display name; only used to build the Referer.
    """
    params = {
        'type_name': type_name,
        'type': type_,
        'interval_id': '100:90',
        'action': '',
    }
    referer = 'https://movie.douban.com/typerank?' + parse.urlencode(params)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': referer,
    }
    page_size = 20
    start = 0
    while True:
        url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100%3A90&action=&start={}&limit={}'.format(
            type_, start, page_size)
        json_str = get_content(url, headers=headers)
        list_ = json.loads(json_str)
        # An empty list means the previous page was the last one.
        if not list_:
            break
        parse_data(list_)
        # ``start`` is a record offset, so step by exactly one page.
        # (The earlier ``(i + 1) * 20`` produced 0, 20, 420, ... and
        # skipped most records.)
        start += page_size
def main():
    """Read the category links from the chart page and crawl each category."""
    base_url = 'https://movie.douban.com/chart'
    # Request the chart page to discover each category's ``type`` value.
    headers = {
        'Referer': 'https://movie.douban.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    }
    html_str = get_content(base_url, headers)
    html = etree.HTML(html_str)
    type_urls = html.xpath('//div[@class="types"]/span/a/@href')
    type_names = html.xpath('//div[@class="types"]/span/a/text()')
    # Example href:
    # /typerank?type_name=<category>&type=11&interval_id=100:90&action=
    # Compiled once, outside the loop.
    type_pattern = re.compile(r'&type=(.*?)&')
    for type_url, type_name in zip(type_urls, type_names):
        match = type_pattern.search(type_url)
        if match is None:
            # Skip links that carry no ``type`` parameter instead of crashing.
            continue
        parse_ajax(match.group(1), type_name)
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
封裝,面向對象
import requests
import json,re
from lxml import etree
from urllib import parse
import time
class Douban_movie(object):
    """Single-threaded crawler for Douban movie category rankings.

    Creating an instance immediately runs the whole crawl, because
    ``__init__`` calls :meth:`main`.
    """

    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    # Extracts the ``type`` value from a category link such as
    # /typerank?type_name=<category>&type=11&interval_id=100:90&action=
    TYPE_PATTERN = re.compile(r'&type=(.*?)&')
    PAGE_SIZE = 20

    def __init__(self, url):
        """Store the chart page URL and start crawling.

        :param url: URL of the chart page listing the categories.
        """
        self.url = url
        self.main()

    def get_content(self, url, headers, timeout=10):
        """Send a GET request and return the response body.

        :param url: URL to request.
        :param headers: HTTP headers sent with the request.
        :param timeout: Seconds to wait for the server before giving up.
        :return: The response body as text (``response.text``).
        """
        # Without an explicit timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.text

    def parse_data(self, list_):
        """Print the fields of interest for every movie record.

        :param list_: Iterable of dicts, each holding at least the keys
            ``title``, ``url``, ``release_date``, ``vote_count`` and ``score``.
        :raises KeyError: If a record lacks one of those keys.
        """
        fields = ('title', 'url', 'release_date', 'vote_count', 'score')
        for one in list_:
            item = {field: one[field] for field in fields}
            print(item)

    def parse_ajax(self, type_, type_name):
        """Download and print every ranking page of one movie category.

        :param type_: Category id, used as the ``type`` query parameter.
        :param type_name: Category display name; only used for the Referer.
        """
        params = {
            'type_name': type_name,
            'type': type_,
            'interval_id': '100:90',
            'action': '',
        }
        referer = 'https://movie.douban.com/typerank?' + parse.urlencode(params)
        headers = {
            'User-Agent': self.USER_AGENT,
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': referer,
        }
        start = 0
        while True:
            url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100%3A90&action=&start={}&limit={}'.format(
                type_, start, self.PAGE_SIZE)
            json_str = self.get_content(url, headers=headers)
            list_ = json.loads(json_str)
            # An empty list means the previous page was the last one.
            if not list_:
                break
            self.parse_data(list_)
            # ``start`` is a record offset, so step by exactly one page.
            # (The earlier ``(i + 1) * 20`` produced 0, 20, 420, ... and
            # skipped most records.)
            start += self.PAGE_SIZE

    def main(self):
        """Read the category links from ``self.url`` and crawl each one."""
        # Request the chart page to discover each category's ``type`` value.
        headers = {
            'Referer': 'https://movie.douban.com/',
            'User-Agent': self.USER_AGENT,
        }
        html_str = self.get_content(self.url, headers)
        html = etree.HTML(html_str)
        type_urls = html.xpath('//div[@class="types"]/span/a/@href')
        type_names = html.xpath('//div[@class="types"]/span/a/text()')
        for type_url, type_name in zip(type_urls, type_names):
            match = self.TYPE_PATTERN.search(type_url)
            if match is None:
                # Skip links that carry no ``type`` parameter.
                continue
            self.parse_ajax(match.group(1), type_name)
if __name__ == '__main__':
    start = time.time()
    base_url = 'https://movie.douban.com/chart'
    # Instantiating the crawler runs the whole crawl (``__init__`` calls
    # ``main``), so the elapsed time covers the complete run.
    Douban_movie(base_url)
    print(time.time() - start)  # a sample run printed 21.504230260849
多線程
import requests
import json,re
from lxml import etree
from urllib import parse
import threading
import time
from queue import Queue
class Douban_movie(threading.Thread):
    """Worker thread that crawls Douban movie categories taken from a queue.

    An instance built with ``url`` is used to call :meth:`get_type` and
    collect the tasks; instances built with ``q`` are started as workers
    that drain the shared queue.
    """

    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    # Extracts the ``type`` value from a category link such as
    # /typerank?type_name=<category>&type=11&interval_id=100:90&action=
    TYPE_PATTERN = re.compile(r'&type=(.*?)&')
    PAGE_SIZE = 20

    def __init__(self, url=None, q=None):
        """Store the chart URL and/or the task queue.

        :param url: URL of the chart page; read by :meth:`get_type`.
        :param q: Queue of ``(type_, type_name)`` tasks; read by :meth:`main`.
        """
        super().__init__()
        self.url = url
        self.q = q

    def run(self):
        """Thread entry point: process tasks until the queue is empty."""
        self.main()

    def get_type(self):
        """Collect the categories that will be put on the task queue.

        :return: ``[(type_, type_name), (type_, type_name), ...]``
        """
        # Request the chart page to discover each category's ``type`` value.
        headers = {
            'Referer': 'https://movie.douban.com/',
            'User-Agent': self.USER_AGENT,
        }
        html_str = self.get_content(self.url, headers)
        html = etree.HTML(html_str)
        type_urls = html.xpath('//div[@class="types"]/span/a/@href')
        type_names = html.xpath('//div[@class="types"]/span/a/text()')
        type_list = []
        for type_url, type_name in zip(type_urls, type_names):
            match = self.TYPE_PATTERN.search(type_url)
            if match is None:
                # Skip links that carry no ``type`` parameter.
                continue
            type_list.append((match.group(1), type_name))
        return type_list

    def get_content(self, url, headers, timeout=10):
        """Send a GET request and return the response body.

        :param url: URL to request.
        :param headers: HTTP headers sent with the request.
        :param timeout: Seconds to wait for the server before giving up.
        :return: The response body as text (``response.text``).
        """
        # Without an explicit timeout a stalled connection would block the
        # worker forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.text

    def parse_data(self, list_):
        """Print the fields of interest for every movie record.

        :param list_: Iterable of dicts, each holding at least the keys
            ``title``, ``url``, ``release_date``, ``vote_count`` and ``score``.
        :raises KeyError: If a record lacks one of those keys.
        """
        fields = ('title', 'url', 'release_date', 'vote_count', 'score')
        for one in list_:
            item = {field: one[field] for field in fields}
            print(item)

    def parse_ajax(self, type_, type_name):
        """Download and print every ranking page of one movie category.

        :param type_: Category id, used as the ``type`` query parameter.
        :param type_name: Category display name; only used for the Referer.
        """
        params = {
            'type_name': type_name,
            'type': type_,
            'interval_id': '100:90',
            'action': '',
        }
        referer = 'https://movie.douban.com/typerank?' + parse.urlencode(params)
        headers = {
            'User-Agent': self.USER_AGENT,
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': referer,
        }
        start = 0
        while True:
            url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100%3A90&action=&start={}&limit={}'.format(
                type_, start, self.PAGE_SIZE)
            json_str = self.get_content(url, headers=headers)
            list_ = json.loads(json_str)
            # An empty list means the previous page was the last one.
            if not list_:
                break
            self.parse_data(list_)
            # ``start`` is a record offset, so step by exactly one page.
            # (The earlier ``(i + 1) * 20`` produced 0, 20, 420, ... and
            # skipped most records.)
            start += self.PAGE_SIZE

    def main(self):
        """Take categories off the queue and crawl them until none remain."""
        from queue import Empty

        while True:
            # Checking ``empty()`` and then calling a blocking ``get()`` is
            # racy: another worker may take the last task in between and
            # leave this thread waiting forever. A non-blocking get avoids it.
            try:
                type_, type_name = self.q.get_nowait()
            except Empty:
                break
            self.parse_ajax(type_, type_name)
if __name__ == '__main__':
    start = time.time()
    base_url = 'https://movie.douban.com/chart'

    # Main thread: collect the (type_, type_name) tasks and queue them all
    # before any worker starts, so workers can stop once the queue is empty.
    q = Queue()
    type_list = Douban_movie(url=base_url).get_type()
    for one in type_list:
        q.put(one)

    # Start four workers that share the queue.
    workers = []
    for i in range(4):
        t = Douban_movie(q=q)
        t.start()
        workers.append(t)

    # Wait for every worker so the elapsed time covers the complete crawl.
    for t in workers:
        t.join()
    print(time.time() - start)  # a sample run printed 8.396480321884155
將項目修改爲多線程時,最重要的是要決定將什麼放入任務隊列中。
在這個項目中,我們決定將電影的type放入隊列。那麼就要單獨提供一個方法,可以在主線程中獲取電影的type,放入任務隊列。也就是項目中的get_type()方法。
這個方法爲什麼要返回一個[(type,type_name),(type,type_name),…]結構?
因爲該項目中,我們需要type值和type_name,構造referer,封裝到請求頭中。
三、MongoDB
(一)安裝步驟
1.安裝
安裝時需要選擇自定義安裝,修改安裝路徑,推薦將MongoDB安裝到C盤頂層,注意路徑中不能有空格或中文。
2.環境變量的配置
3.驗證
mongod
4.新建一個存放數據庫的文件夾
新建data文件夾
再進入data新建一個db
5.啓動
mongod --dbpath C:\MongoDB\Server\3.4\data\db
(二)將MongoDB配置成一個服務
1.創建日誌目錄
在data目錄下創建一個logs目錄
2.以管理員身份進入cmd
3.進入MongoDB的bin目錄下
4.運行命令
mongod --bind_ip 0.0.0.0 --logpath C:\MongoDB\Server\3.4\data\logs\mongo.log --logappend --dbpath C:\MongoDB\Server\3.4\data\db --port 27017 --serviceName "MongoDB" --serviceDisplayName "MongoDB" --install
5.啓動服務
(三)客戶端和服務器
mongod–啓動MongoDB服務器。
mongo–啓動客戶端,客戶端是用來做增刪改查這些操作的。
(四)mongo的文檔document
一個文檔就相當於一個字典
{name:'zhangsan',age:'6',grade:'5'}
四、MongoDB操作
新建的默認有兩個數據庫
admin–配置權限
local–固定的重要數據
(一)基本操作
查看數據庫
show dbs
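創建數據庫(隱式創建)
use dbName // 只執行這一步,show dbs 中還看不到新庫
沒有成功,需要兩步:先 use dbName,再向集合中插入一條文檔
db.collectionName.insert({name:'zhangsan'})
還可以顯式創建集合,庫同樣會被創建
db.createCollection('collectionName')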
查看集合
show tables
show collections
刪除庫和集合
db.collectionName.drop() // 刪除集合
db.dropDatabase() // 刪除數據庫
庫內沒有集合了,庫也會自動刪除