一、分页的两种方法
(一)for循环
这种方法的使用限制是,需要知道最大页码。
# Request pages 1 through 19 in turn (range() excludes its upper bound).
for page in range(1, 20):
    response = requests.get(url % page)
(二)while True循环
使用这种方法,需要限定跳出循环的边界。
# Keep requesting with a growing offset until the server answers with an
# empty JSON array, which is the loop's exit condition.
i = 0
while True:
    json_str = get_content(url.format(type_, i), headers=headers)
    print(json_str)
    if json_str == '[]':
        break
    json_data = json.loads(json_str)
    parse_json(json_data)
    # Advance the offset by 20 for the next request.
    i += 20
二、项目
豆瓣电影项目
import requests
import json,re
from lxml import etree
from urllib import parse
def get_content(url, headers, timeout=10):
    """Send an HTTP GET request and return the response body as text.

    :param url: URL to request.
    :param headers: dict of HTTP request headers.
    :param timeout: seconds to wait for the server before ``requests`` gives
        up; without it a stalled connection would block the caller forever.
    :return: the response body as a ``str`` (not parsed).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def parse_data(list_):
    """Print a trimmed-down record for every movie in ``list_``.

    :param list_: iterable of dicts, each containing at least the keys
        ``title``, ``url``, ``release_date``, ``vote_count`` and ``score``.
    :return: None; each trimmed record is written to stdout.
    :raises KeyError: if a movie dict lacks one of the required keys.
    """
    # Only these keys are kept, in this order, for each printed record.
    fields = ('title', 'url', 'release_date', 'vote_count', 'score')
    for one in list_:
        item = {key: one[key] for key in fields}
        print(item)
def parse_ajax(type_, type_name):
    """Fetch and print every page of the ranking list for one category.

    Requests the ``top_list`` endpoint 20 entries at a time, passing each
    non-empty page to ``parse_data``, and stops at the first empty page.

    :param type_: category id placed in the ``type`` query parameter.
    :param type_name: category display name, used only to build the Referer.
    """
    params = {
        'type_name': type_name,
        'type': type_,
        'interval_id': '100:90',
        'action': ''
    }
    referer = 'https://movie.douban.com/typerank?' + parse.urlencode(params)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': referer
    }
    start = 0
    while True:
        url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100%3A90&action=&start={}&limit=20'.format(type_, start)
        json_str = get_content(url, headers=headers)
        list_ = json.loads(json_str)
        if not list_:
            break
        parse_data(list_)
        # Step by the page size (limit=20). The previous (i + 1) * 20 jumped
        # from offset 20 straight to 420 and skipped most of the results.
        start += 20
def main():
    """Crawl the ranking list of every category linked from the chart page.

    Downloads the chart home page, extracts each category link's ``type``
    value and display name, and passes both to ``parse_ajax``.
    """
    base_url = 'https://movie.douban.com/chart'
    # Request the home page to discover each category's type value.
    headers = {
        'Referer': 'https://movie.douban.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    html_str = get_content(base_url, headers)
    html = etree.HTML(html_str)
    type_urls = html.xpath('//div[@class="types"]/span/a/@href')
    type_names = html.xpath('//div[@class="types"]/span/a/text()')
    # Compiled once: the pattern is identical for every link.
    type_pattern = re.compile(r'&type=(.*?)&')
    for type_url, type_name in zip(type_urls, type_names):
        # Example href: /typerank?type_name=剧情&type=11&interval_id=100:90&action=
        match = type_pattern.search(type_url)
        if match is None:
            # The href carries no type value, so there is nothing to request.
            continue
        parse_ajax(match.group(1), type_name)
# Start the crawl only when this file is executed directly as a script.
if __name__ == "__main__":
    main()
封装,面向对象
import requests
import json,re
from lxml import etree
from urllib import parse
import time
class Douban_movie(object):
    """Crawler that prints the Douban ranking list of every movie category.

    Constructing an instance immediately runs the whole crawl, because
    ``__init__`` calls ``main``.
    """

    def __init__(self, url):
        """Store the chart page URL and start crawling.

        :param url: URL of the chart home page that links to each category.
        """
        self.url = url
        self.main()

    def get_content(self, url, headers, timeout=10):
        """Send an HTTP GET request and return the response body as text.

        :param url: URL to request.
        :param headers: dict of HTTP request headers.
        :param timeout: seconds to wait for the server before ``requests``
            gives up; without it a stalled connection would block forever.
        :return: the response body as a ``str`` (not parsed).
        """
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.text

    def parse_data(self, list_):
        """Print a trimmed-down record for every movie in ``list_``.

        :param list_: iterable of dicts, each containing at least the keys
            ``title``, ``url``, ``release_date``, ``vote_count``, ``score``.
        :raises KeyError: if a movie dict lacks one of the required keys.
        """
        fields = ('title', 'url', 'release_date', 'vote_count', 'score')
        for one in list_:
            item = {key: one[key] for key in fields}
            print(item)

    def parse_ajax(self, type_, type_name):
        """Fetch and print every page of the ranking list for one category.

        Requests the ``top_list`` endpoint 20 entries at a time, passing each
        non-empty page to ``parse_data``, and stops at the first empty page.

        :param type_: category id placed in the ``type`` query parameter.
        :param type_name: category display name, used only for the Referer.
        """
        params = {
            'type_name': type_name,
            'type': type_,
            'interval_id': '100:90',
            'action': ''
        }
        referer = 'https://movie.douban.com/typerank?' + parse.urlencode(params)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': referer
        }
        start = 0
        while True:
            url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100%3A90&action=&start={}&limit=20'.format(
                type_, start)
            json_str = self.get_content(url, headers=headers)
            list_ = json.loads(json_str)
            if not list_:
                break
            self.parse_data(list_)
            # Step by the page size (limit=20). The previous (i + 1) * 20
            # jumped from offset 20 to 420 and skipped most of the results.
            start += 20

    def main(self):
        """Read the category links from the chart page and crawl each one."""
        # Request the home page to discover each category's type value.
        headers = {
            'Referer': 'https://movie.douban.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        html_str = self.get_content(self.url, headers)
        html = etree.HTML(html_str)
        type_urls = html.xpath('//div[@class="types"]/span/a/@href')
        type_names = html.xpath('//div[@class="types"]/span/a/text()')
        # Compiled once: the pattern is identical for every link.
        type_pattern = re.compile(r'&type=(.*?)&')
        for type_url, type_name in zip(type_urls, type_names):
            # Example href: /typerank?type_name=剧情&type=11&interval_id=100:90&action=
            match = type_pattern.search(type_url)
            if match is None:
                # The href carries no type value, so nothing to request.
                continue
            self.parse_ajax(match.group(1), type_name)
# Script entry: run one full crawl and report how long it took.
if __name__ == "__main__":
    base_url = 'https://movie.douban.com/chart'
    start = time.time()
    # The constructor itself performs the crawl.
    Douban_movie(base_url)
    elapsed = time.time() - start
    print(elapsed)  # 21.504230260849
多线程
import requests
import json,re
from lxml import etree
from urllib import parse
import threading
import time
from queue import Queue
class Douban_movie(threading.Thread):
    """Worker thread that crawls Douban ranking lists taken from a queue.

    An instance created with ``url`` can call ``get_type`` to list the
    categories; an instance created with ``q`` drains ``(type_, type_name)``
    tasks from that queue when started.
    """

    def __init__(self, url=None, q=None):
        """Store the chart page URL and/or the task queue.

        :param url: URL of the chart home page (needed by ``get_type``).
        :param q: queue of ``(type_, type_name)`` tuples (needed by ``main``).
        """
        super().__init__()
        self.url = url
        self.q = q

    def run(self):
        """Thread body: process queued tasks until none are left."""
        self.main()

    def get_type(self):
        """Collect the categories so they can be queued as tasks.

        :return: [(type_, type_name), (type_, type_name), ...]
        """
        type_list = []
        # Request the home page to discover each category's type value.
        headers = {
            'Referer': 'https://movie.douban.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        html_str = self.get_content(self.url, headers)
        html = etree.HTML(html_str)
        type_urls = html.xpath('//div[@class="types"]/span/a/@href')
        type_names = html.xpath('//div[@class="types"]/span/a/text()')
        # Compiled once: the pattern is identical for every link.
        type_pattern = re.compile(r'&type=(.*?)&')
        for type_url, type_name in zip(type_urls, type_names):
            # Example href: /typerank?type_name=剧情&type=11&interval_id=100:90&action=
            match = type_pattern.search(type_url)
            if match is None:
                # The href carries no type value, so it is not a task.
                continue
            type_list.append((match.group(1), type_name))
        return type_list

    def get_content(self, url, headers, timeout=10):
        """Send an HTTP GET request and return the response body as text.

        :param url: URL to request.
        :param headers: dict of HTTP request headers.
        :param timeout: seconds to wait for the server before ``requests``
            gives up; without it a stalled connection would block forever.
        :return: the response body as a ``str`` (not parsed).
        """
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.text

    def parse_data(self, list_):
        """Print a trimmed-down record for every movie in ``list_``.

        :param list_: iterable of dicts, each containing at least the keys
            ``title``, ``url``, ``release_date``, ``vote_count``, ``score``.
        :raises KeyError: if a movie dict lacks one of the required keys.
        """
        fields = ('title', 'url', 'release_date', 'vote_count', 'score')
        for one in list_:
            item = {key: one[key] for key in fields}
            print(item)

    def parse_ajax(self, type_, type_name):
        """Fetch and print every page of the ranking list for one category.

        Requests the ``top_list`` endpoint 20 entries at a time, passing each
        non-empty page to ``parse_data``, and stops at the first empty page.

        :param type_: category id placed in the ``type`` query parameter.
        :param type_name: category display name, used only for the Referer.
        """
        params = {
            'type_name': type_name,
            'type': type_,
            'interval_id': '100:90',
            'action': ''
        }
        referer = 'https://movie.douban.com/typerank?' + parse.urlencode(params)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': referer
        }
        start = 0
        while True:
            url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100%3A90&action=&start={}&limit=20'.format(
                type_, start)
            json_str = self.get_content(url, headers=headers)
            list_ = json.loads(json_str)
            if not list_:
                break
            self.parse_data(list_)
            # Step by the page size (limit=20). The previous (i + 1) * 20
            # jumped from offset 20 to 420 and skipped most of the results.
            start += 20

    def main(self):
        """Take tasks from the queue and crawl them until the queue is empty."""
        from queue import Empty

        while True:
            # A non-blocking get avoids the empty()/get() race: another
            # worker could take the last task between the two calls and
            # leave this thread blocked in get() forever.
            try:
                type_, type_name = self.q.get_nowait()
            except Empty:
                break
            self.parse_ajax(type_, type_name)
# Script entry: list the categories in the main thread, queue them as tasks,
# and let four worker threads drain the queue.
if __name__ == '__main__':
    start = time.time()
    base_url = 'https://movie.douban.com/chart'
    q = Queue()
    # Every task must be queued before the workers start, because a worker
    # exits as soon as it finds the queue empty.
    type_list = Douban_movie(url=base_url).get_type()
    for one in type_list:
        q.put(one)
    workers = [Douban_movie(q=q) for _ in range(4)]
    for t in workers:
        t.start()
    # Wait for every worker so the elapsed time covers the whole crawl.
    for t in workers:
        t.join()
    print(time.time() - start)  # 8.396480321884155
将项目修改为多线程时,最重要的是确定要将什么放入任务队列中
在这个项目中,我们决定将电影的type放入队列。那么就要单独提供一个方法,可以在主线程中获取电影的type,放入任务队列。也就是项目中的get_type()方法。
这个方法为什么要返回一个[(type,type_name),(type,type_name),…]结构?
因为该项目中,我们需要type值和type_name,构造referer,封装到请求头中。
三、MongoDB
(一)安装步骤
1.安装
安装时需要选择自定义安装,修改安装路径,推荐将MongoDB安装到C盘顶层,注意路径中不能有空格或中文。
2.环境变量的配置
3.验证
mongod
4.新建一个存放数据库的文件夹
新建data文件夹
再进入data新建一个db
5.启动
mongod --dbpath C:\MongoDB\Server\3.4\data\db
(二)将MongoDB配置成一个服务
1.创建日志目录
在data目录下创建一个logs目录
2.以管理员身份进入cmd
3.进入MongoDB的bin目录下
4.运行命令
mongod --bind_ip 0.0.0.0 --logpath C:\MongoDB\Server\3.4\data\logs\mongo.log --logappend --dbpath C:\MongoDB\Server\3.4\data\db --port 27017 --serviceName "MongoDB" --serviceDisplayName "MongoDB" --install
5.启动服务
(三)客户端和服务器
mongod–启动MongoDB服务器。
mongo–启动客户端,客户端是用来做增删改查这些操作的。
(四)mongo的文档document
一个文档就相当于一个字典
{name:'zhangsan',age:'6',grade:'5'}
四、MongoDB操作
新建的默认有两个数据库
admin–配置权限
local–固定的重要数据
(一)基本操作
查看数据库
show dbs
创建数据库(隐式创建)
仅执行 use 数据库名 并不会真正创建数据库(show dbs 中看不到),需要两步:先 use 数据库名,再在其中创建集合或插入文档
还可以
查看集合
show tables
show collections
删除库和集合
db.collectionName.drop() // 删除集合
db.dropDatabase() // 删除数据库
库内没有集合了,库也会自动删除