By default, Ceph's failure domain is the host level: two replicas of the same object never land on disks in the same host, so a single machine failure does not make data unavailable. But what if two machines fail at the same time? Data could be lost, with serious consequences. Worse still, if one rack suddenly loses power, the cluster may become unusable. The fix is to raise the failure domain to the rack level; to survive a major disaster that destroys an entire machine room, it can be raised to the room level or even higher.
In the example below we raise the failure domain to the rack level.
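The CRUSH map is edited offline: export the binary map, decompile it to text, modify the text, recompile it, and inject it back into the cluster. A minimal sketch of that round trip, assuming the file names crushmap.bin and newcrushmap.bin (decrushmap is the decompiled text shown below):
[root@ceph-node1 opt]# ceph osd getcrushmap -o crushmap.bin        # export the compiled CRUSH map
[root@ceph-node1 opt]# crushtool -d crushmap.bin -o decrushmap     # decompile it into editable text
[root@ceph-node1 opt]# vi decrushmap                               # add rack buckets and rack-level rules
[root@ceph-node1 opt]# crushtool -c decrushmap -o newcrushmap.bin  # recompile the edited text
[root@ceph-node1 opt]# ceph osd setcrushmap -i newcrushmap.bin     # inject the new map into the cluster
The edited map looks like this: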
[root@ceph-node1 opt]# cat decrushmap
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54
# devices
device 0 osd.0 class hdd
device 1 osd.1 class ssd
device 2 osd.2 class hdd
device 3 osd.3 class ssd
device 4 osd.4 class hdd
device 5 osd.5 class ssd
device 6 osd.6 class hdd
device 7 osd.7 class hdd
device 8 osd.8 class hdd
device 9 osd.9 class hdd
device 10 osd.10 class hdd
device 11 osd.11 class hdd
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root
# buckets
host ceph-node1 {
id -3 # do not change unnecessarily
id -4 class hdd # do not change unnecessarily
id -15 class ssd # do not change unnecessarily
# weight 0.058
alg straw2
hash 0 # rjenkins1
item osd.0 weight 0.029
item osd.1 weight 0.029
}
host ceph-node2 {
id -5 # do not change unnecessarily
id -6 class hdd # do not change unnecessarily
id -16 class ssd # do not change unnecessarily
# weight 0.058
alg straw2
hash 0 # rjenkins1
item osd.2 weight 0.029
item osd.3 weight 0.029
}
host ceph-node3 {
id -7 # do not change unnecessarily
id -8 class hdd # do not change unnecessarily
id -17 class ssd # do not change unnecessarily
# weight 0.058
alg straw2
hash 0 # rjenkins1
item osd.4 weight 0.029
item osd.5 weight 0.029
}
host ceph-node4 {
id -9 # do not change unnecessarily
id -10 class hdd # do not change unnecessarily
id -18 class ssd # do not change unnecessarily
# weight 0.058
alg straw2
hash 0 # rjenkins1
item osd.6 weight 0.029
item osd.7 weight 0.029
}
host ceph-node5 {
id -11 # do not change unnecessarily
id -12 class hdd # do not change unnecessarily
id -19 class ssd # do not change unnecessarily
# weight 0.058
alg straw2
hash 0 # rjenkins1
item osd.8 weight 0.029
item osd.9 weight 0.029
}
host ceph-node6 {
id -13 # do not change unnecessarily
id -14 class hdd # do not change unnecessarily
id -20 class ssd # do not change unnecessarily
# weight 0.058
alg straw2
hash 0 # rjenkins1
item osd.10 weight 0.029
item osd.11 weight 0.029
}
# rack
rack rack01 {
id -101 # do not change unnecessarily
id -102 class hdd # do not change unnecessarily
id -103 class ssd # do not change unnecessarily
# weight 0.116
alg straw2
hash 0 # rjenkins1
item ceph-node1 weight 0.058
item ceph-node2 weight 0.058
}
rack rack02 {
id -104 # do not change unnecessarily
id -105 class hdd # do not change unnecessarily
id -106 class ssd # do not change unnecessarily
# weight 0.116
alg straw2
hash 0 # rjenkins1
item ceph-node3 weight 0.058
item ceph-node4 weight 0.058
}
rack rack03 {
id -107 # do not change unnecessarily
id -108 class hdd # do not change unnecessarily
id -109 class ssd # do not change unnecessarily
# weight 0.116
alg straw2
hash 0 # rjenkins1
item ceph-node5 weight 0.058
item ceph-node6 weight 0.058
}
root default {
id -110 # do not change unnecessarily
id -111 class hdd # do not change unnecessarily
id -112 class ssd # do not change unnecessarily
# weight 0.354
alg straw2
hash 0 # rjenkins1
item rack01 weight 0.118
item rack02 weight 0.118
item rack03 weight 0.118
}
# rules
rule replicated_rule {
id 0
type replicated
min_size 1
max_size 10
step take default class hdd
step chooseleaf firstn 0 type rack
step emit
}
rule replicated_ssd {
id 1
type replicated
min_size 1
max_size 10
step take default class ssd
step chooseleaf firstn 0 type rack
step emit
}
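Before injecting the recompiled map, it is worth simulating placements offline, and any existing pools must be pointed at the desired rule. A sketch, assuming the compiled map is newcrushmap.bin; the pool name stat comes from the verification step further down, and <ssd-pool> stands for a hypothetical pool backed by SSDs:
[root@ceph-node1 opt]# crushtool -i newcrushmap.bin --test --rule 0 --num-rep 3 --show-mappings --min-x 0 --max-x 9   # simulate 10 placements with the hdd rule
[root@ceph-node1 opt]# ceph osd pool set stat crush_rule replicated_rule     # hdd-backed pool, rack failure domain
[root@ceph-node1 opt]# ceph osd pool set <ssd-pool> crush_rule replicated_ssd
With the new map in place, the hierarchy now shows three racks: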
[root@ceph-node1 opt]# ceph osd tree
ID   CLASS WEIGHT  TYPE NAME               STATUS REWEIGHT PRI-AFF
-110       0.35399 root default
-101       0.11800     rack rack01
  -3       0.05800         host ceph-node1
   0   hdd 0.02899             osd.0           up  1.00000 1.00000
   1   ssd 0.02899             osd.1           up  1.00000 1.00000
  -5       0.05800         host ceph-node2
   2   hdd 0.02899             osd.2           up  1.00000 1.00000
   3   ssd 0.02899             osd.3           up  1.00000 1.00000
-104       0.11800     rack rack02
  -7       0.05800         host ceph-node3
   4   hdd 0.02899             osd.4           up  1.00000 1.00000
   5   ssd 0.02899             osd.5           up  1.00000 1.00000
  -9       0.05800         host ceph-node4
   6   hdd 0.02899             osd.6           up  1.00000 1.00000
   7   hdd 0.02899             osd.7           up  1.00000 1.00000
-107       0.11800     rack rack03
 -11       0.05800         host ceph-node5
   8   hdd 0.02899             osd.8           up  1.00000 1.00000
   9   hdd 0.02899             osd.9           up  1.00000 1.00000
 -13       0.05800         host ceph-node6
  10   hdd 0.02899             osd.10          up  1.00000 1.00000
  11   hdd 0.02899             osd.11          up  1.00000 1.00000
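Because the map uses device classes, CRUSH also maintains a shadow hierarchy per class; that is what the extra "id ... class hdd" / "id ... class ssd" lines in every bucket are for. To display those shadow buckets, one option is:
[root@ceph-node1 opt]# ceph osd crush tree --show-shadow   # include the per-class shadow buckets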
Testing confirms the desired effect:
[root@ceph-node1 opt]# ceph osd map stat rbd_data.10ab6b8b4567.0000000000000042
osdmap e73 pool 'stat' (1) object 'rbd_data.10ab6b8b4567.0000000000000042' -> pg 1.fa3e81bf (1.3f) -> up ([11,2,4], p11) acting ([11,2,4], p11)
# the 3 replicas of this PG sit in different racks: osd.11 is in rack03, osd.2 in rack01, osd.4 in rack02
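A single object is only a spot check. To eyeball every PG's up/acting set at once, and to confirm which rack a given OSD belongs to, the stock commands below are enough; a minimal sketch:
[root@ceph-node1 opt]# ceph pg dump pgs_brief   # lists every PG with its up/acting OSD sets
[root@ceph-node1 opt]# ceph osd find 11         # prints osd.11's crush_location (host, rack, root)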
Higher failure-domain levels (room, datacenter, and so on) work the same way and are not demonstrated here.