1)復現流程及lxc的處理
demo1程序與執行結果如下,此時在容器內部看不到執行的程序。
/*
 * demo1: print getpid() once before and once after asking the /dev/ns
 * device to switch namespaces, so the two values can be compared.
 *
 * Returns 0 on success, 1 if /dev/ns cannot be opened or the ioctl fails.
 */
int main(void)
{
	int fd;

	printf("father pid old:%d\n", getpid());

	fd = open("/dev/ns", O_RDWR);
	if (fd < 0) {
		perror("open /dev/ns");
		return 1;
	}

	/*
	 * parm is dst ns process's pid
	 * NOTE(review): 24635 is passed as the ioctl request number; presumably
	 * the /dev/ns driver interprets it as the target pid - confirm against
	 * the driver.
	 */
	if (ioctl(fd, 24635) < 0) {
		perror("ioctl /dev/ns");
		return 1;
	}

	printf("father pid old:%d\n", getpid());

	/* Stay alive for a while so the process can be looked for from outside. */
	sleep(5);
	return 0;
}
# ./a.out
father pid old:26169
father pid old:26169
demo2程序與執行結果如下,此時容器內還是看不到執行程序,但是這裏getpid()獲取到的值就爲0了,對比上邊,世界上沒有這麼玄乎的事,或許是緩存的問題?這是第一個疑問。
/*
 * demo2: same as demo1 but WITHOUT calling getpid() before the namespace
 * switch, so the only getpid() call happens after the ioctl.
 *
 * Returns 0 on success, 1 if /dev/ns cannot be opened or the ioctl fails.
 */
int main(void)
{
	int fd;

	fd = open("/dev/ns", O_RDWR);
	if (fd < 0) {
		perror("open /dev/ns");
		return 1;
	}

	/*
	 * parm is dst ns process's pid
	 * NOTE(review): 24635 is passed as the ioctl request number; presumably
	 * the /dev/ns driver interprets it as the target pid - confirm against
	 * the driver.
	 */
	if (ioctl(fd, 24635) < 0) {
		perror("ioctl /dev/ns");
		return 1;
	}

	/* First and only getpid() call: made after the switch on purpose. */
	printf("father pid old:%d\n", getpid());

	/* Stay alive for a while so the process can be looked for from outside. */
	sleep(5);
	return 0;
}
# ./a.out
father pid old:0
demo3程序與執行結果如下,這時在容器內部能看到子進程,爲什麼子進程實現了pid ns的切換,父進程卻沒有實現?這是第二個疑問。
/*
 * demo3: switch namespaces through /dev/ns, then fork. Both parent and
 * child print getpid() so their views can be compared.
 *
 * Returns 0 on success, 1 if open, ioctl or fork fails.
 */
int main(void)
{
	int fd;
	pid_t pid;

	fd = open("/dev/ns", O_RDWR);
	if (fd < 0) {
		perror("open /dev/ns");
		return 1;
	}

	/*
	 * parm is dst ns process's pid
	 * NOTE(review): 24635 is passed as the ioctl request number; presumably
	 * the /dev/ns driver interprets it as the target pid - confirm against
	 * the driver.
	 */
	if (ioctl(fd, 24635) < 0) {
		perror("ioctl /dev/ns");
		return 1;
	}

	printf("father pid old:%d\n", getpid());

	pid = fork();
	if (pid < 0) {
		perror("fork");
		return 1;
	}

	if (0 == pid) {
		printf("son pid:%d\n", getpid());
		/* Keep the child alive long enough to be observed. */
		sleep(5);
	} else {
		printf("father pid:%d\n", getpid());
		/*
		 * wait() requires a status pointer; NULL means "status not
		 * wanted". Calling it with no argument hands the kernel a
		 * garbage pointer and the call fails with EFAULT instead of
		 * waiting for the child.
		 */
		wait(NULL);
	}
	return 0;
}
# ./a.out
father pid old:0
father pid:0
son pid:87
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
1 root 20 0 22036 2144 1624 S 0.0 0.1 0:00.01 bash
57 root 20 0 23672 1516 1160 R 0.0 0.1 0:00.66 top
87 root 20 0 3760 204 120 S 0.0 0.0 0:00.00 a.out
lxc attach流程如下,實現思路和我們上邊的測試demo一致,看來這確實是linux機制的問題。
setns()
pid = fork();
if(!pid) { // son
execve(bash);
}
2)pid緩存問題
編寫了demo4:
/*
 * demo4: call getpid() several times before the namespace switch, then
 * switch through /dev/ns, call getpid() again and finally fork. Used to
 * compare the values printed before the switch, after the switch and in
 * the child.
 *
 * Returns 0 on success, 1 if open, ioctl or fork fails.
 */
int main(void)
{
	int fd;
	pid_t pid;

	/* Repeated on purpose: shows whether later calls reach the kernel. */
	printf("father pid:%d\n", getpid());
	printf("father pid:%d\n", getpid());
	printf("father pid:%d\n", getpid());
	printf("father pid:%d\n", getpid());

	fd = open("/dev/ns", O_RDWR);
	if (fd < 0) {
		perror("open /dev/ns");
		return 1;
	}

	/*
	 * parm is dst ns process's pid
	 * NOTE(review): 24635 is passed as the ioctl request number; presumably
	 * the /dev/ns driver interprets it as the target pid - confirm against
	 * the driver.
	 */
	if (ioctl(fd, 24635) < 0) {
		perror("ioctl /dev/ns");
		return 1;
	}

	printf("father pid old:%d\n", getpid());

	pid = fork();
	if (pid < 0) {
		perror("fork");
		return 1;
	}

	if (0 == pid) {
		printf("son pid:%d\n", getpid());
		/* Keep the child alive long enough to be observed. */
		sleep(5);
	} else {
		printf("father pid:%d\n", getpid());
		/*
		 * wait() requires a status pointer; NULL means "status not
		 * wanted". Calling it with no argument hands the kernel a
		 * garbage pointer and the call fails with EFAULT instead of
		 * waiting for the child.
		 */
		wait(NULL);
	}
	return 0;
}
看下結果,fork前後打印一致
# ./a.out
father pid:23246
father pid:23246
father pid:23246
father pid:23246
father pid old:23246
father pid:23246
son pid:96
strace看下,果然和我們猜的一樣:只有第一次getpid真正調用了syscall,之後所有的返回值均是從glibc的緩存中獲取的。這就能解釋爲什麼切換ns之前沒有執行過getpid時,切換後執行getpid獲取到的返回值是0(此時才第一次真正進入內核);而切換ns之前執行過getpid時,切換後(包括fork之後在父進程中)再執行getpid獲取到的返回值不變(直接返回緩存值)。
# strace ./a.out
execve("./a.out", ["./a.out"], [/* 29 vars */]) = 0
brk(0) = 0x18f4000
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa04d79c000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=105026, ...}) = 0
mmap(NULL, 105026, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7fa04d782000
close(3) = 0
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
open("/lib/libc.so.6", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0@\356\1\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1478056, ...}) = 0
mmap(NULL, 3586120, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fa04d215000
mprotect(0x7fa04d377000, 2097152, PROT_NONE) = 0
mmap(0x7fa04d577000, 20480, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x162000) = 0x7fa04d577000
mmap(0x7fa04d57c000, 18504, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa04d57c000
close(3) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa04d781000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa04d780000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa04d77f000
arch_prctl(ARCH_SET_FS, 0x7fa04d780700) = 0
mprotect(0x7fa04d577000, 16384, PROT_READ) = 0
mprotect(0x7fa04d79e000, 4096, PROT_READ) = 0
munmap(0x7fa04d782000, 105026) = 0
getpid() = 23253
fstat(1, {st_mode=S_IFCHR|0600, st_rdev=makedev(136, 4), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa04d79b000
write(1, "father pid:23253\n", 17father pid:23253
) = 17
write(1, "father pid:23253\n", 17father pid:23253
) = 17
write(1, "father pid:23253\n", 17father pid:23253
) = 17
write(1, "father pid:23253\n", 17father pid:23253
) = 17
open("/dev/ns", O_RDWR) = 3
ioctl(3, 0x603b, 0x7fa04d57cdf0) = 0
write(1, "father pid old:23253\n", 21father pid old:23253
) = 21
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fa04d7809d0) = 97
write(1, "father pid:23253\n", 17father pid:23253
) = 17
wait4(-1, son pid:97
0xffffffff, 0, NULL) = -1 EFAULT (Bad address)
--- SIGCHLD (Child exited) @ 0 (0) ---
3)爲什麼子進程實現了pid ns的切換,父進程卻沒有實現?
先來看看setns的實現。在2.6.32中要支持setns很簡單:首先通過不創建新NS的方式調用copy_namespaces並傳入dst ns,這會增加dst ns的引用;之後通過switch_task_namespaces傳入current task與dst ns,在函數中會首先進行nsproxy指針的交換,將當前task切換到dst ns中,之後對src ns減引用,這樣就能保證引用數量的正確。因此這裏實際上只是切換了current task的nsproxy。
copy_namespaces->switch_task_namespaces
/*
 * Excerpt of copy_namespaces(); the remainder of the body is elided in
 * this listing, so only the early-exit paths are visible.
 *
 * @flags: clone flags; only the CLONE_NEW* namespace bits are tested here.
 * @tsk:   task whose nsproxy is inspected.
 *
 * Returns 0 when @tsk has no nsproxy, or when none of the CLONE_NEW*
 * namespace flags is set.
 */
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
struct nsproxy *old_ns = tsk->nsproxy;
struct nsproxy *new_ns;
int err = 0;
/* Nothing to do for a task without an nsproxy. */
if (!old_ns)
return 0;
/* presumably takes a reference on old_ns - confirm against get_nsproxy() */
get_nsproxy(old_ns);
/*
 * No new namespace requested: return with the get_nsproxy() call above
 * already done and tsk->nsproxy left untouched.
 */
if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET)))
return 0;
。。。
}
/*
 * Point task @p at the nsproxy @new and drop one count on the nsproxy it
 * used before; the old nsproxy is freed when that count reaches zero.
 *
 * @p:   task whose nsproxy pointer is replaced.
 * @new: nsproxy to install.
 */
void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
{
struct nsproxy *ns;
/* The free path below calls synchronize_rcu(), so this may sleep. */
might_sleep();
/* Remember the old nsproxy before it is overwritten. */
ns = p->nsproxy;
/* Install the new nsproxy pointer. */
rcu_assign_pointer(p->nsproxy, new);
/* Drop the count on the old nsproxy; free it if this was the last one. */
if (ns && atomic_dec_and_test(&ns->count)) {
/*
* wait for others to get what they want from this nsproxy.
*
* cannot release this nsproxy via the call_rcu() since
* put_mnt_ns() will want to sleep
*/
synchronize_rcu();
free_nsproxy(ns);
}
}
再看看getpid的調用流程(sys_getpid->task_tgid_vnr->pid_vnr->pid_nr_ns)。這裏考慮到線程問題,通過task_tgid獲取該task的tgid:應用層和內核對PID的定義不同,內核中進程與線程都用相同的結構體struct task_struct描述,因此進程與線程在內核中均擁有自己獨立的PID;這時候找到“用戶態PID”的關鍵是找到根進程(線程組的主線程),而根進程的pid與tgid是一致的(如下圖),因此在這裏用tgid來代表用戶態看到的PID。在pid_nr_ns中會進行兩個判斷:一是pid的level應大於等於指定pid ns的level(即ns->level <= pid->level);二是pid在該level上記錄的upid->ns必須就是指定的pid ns。兩者都滿足纔會返回upid->nr,否則返回0。pid ns的level是一個樹狀結構:default pid ns的level爲0,而通過CLONE_NEWPID創建出的新pid ns,其level在父pid ns的基礎上加1。因此這裏的第一個判斷會出現問題:setns只修改了current->nsproxy,如果demo程序以及lxc程序是在default pid ns中執行的,那麼切換後current->nsproxy->pid_ns->level = 1,而pid->level = 0,這會導致判斷失敗直接返回0,這也是我們在demo程序中通過getpid()得到0的原因。
 <-- PID 43 --> <----------------- PID 42 ----------------->
                     +---------+
                     | process |
                    _| pid=42  |_
                  _/ | tgid=42 | \_ (new thread) _
       _ (fork) _/   +---------+                  \
      /                                        +---------+
+---------+                                    | process |
| process |                                    | pid=44  |
| pid=43  |                                    | tgid=42 |
| tgid=43 |                                    +---------+
+---------+
 <-- PID 43 --> <--------- PID 42 --------> <--- PID 44 --->
/*
 * getpid system call: returns the value of task_tgid_vnr() for the
 * calling task.
 */
SYSCALL_DEFINE0(getpid)
{
	pid_t tgid_nr = task_tgid_vnr(current);

	return tgid_nr;
}
/*
 * Look up the tgid of @tsk via task_tgid() and convert it to a number
 * with pid_vnr().
 */
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
	struct pid *tgid = task_tgid(tsk);

	return pid_vnr(tgid);
}
/*
 * Number of @pid as seen from the pid namespace referenced by the
 * calling task's nsproxy (current->nsproxy->pid_ns).
 */
pid_t pid_vnr(struct pid *pid)
{
	struct pid_namespace *active_ns = current->nsproxy->pid_ns;

	return pid_nr_ns(pid, active_ns);
}
/*
 * Number of @pid as seen from namespace @ns.
 *
 * Returns 0 when @pid is NULL, when @ns is deeper than @pid's own level
 * (ns->level > pid->level), or when the entry recorded for @pid at
 * ns->level belongs to a different namespace. Otherwise returns the nr
 * stored in that entry.
 */
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
	struct upid *entry;

	/* @pid has no entry at a level deeper than its own. */
	if (!pid || ns->level > pid->level)
		return 0;

	entry = &pid->numbers[ns->level];

	/* Same depth but a different namespace: not visible from @ns. */
	if (entry->ns != ns)
		return 0;

	return entry->nr;
}
如下,這裏有對upid->nr的一個很好的解釋,也就是說upid只在它所屬的pid ns中有效。另外,這個實驗是在2.6.32.5中實現的,而2.6.32.5本身不支持setns(我對它進行了移植);或許在3.x的某些支持setns的版本中也能復現,但4.9.0中pid_nr_ns的實現不同,導致實驗效果也不同:在4.9.0中demo程序會返回default ns中的pid。
/*
* struct upid is used to get the id of the struct pid, as it is
* seen in particular namespace. Later the struct pid is found with
* find_pid_ns() using the int nr and struct pid_namespace *ns.
*/
struct upid {
/* Try to keep pid_chain in the same cacheline as nr for find_vpid */
/* Numeric id of the struct pid as seen in namespace ns. */
int nr;
/* Namespace in which nr is the id of the struct pid. */
struct pid_namespace *ns;
/* Hash-list linkage; presumably the chain walked by find_pid_ns() - confirm. */
struct hlist_node pid_chain;
};
最後再補充下,前面可以看到setns實現其實只修改了task_struct中的nsproxy,因此fork會通過調用鏈do_fork->copy_process->alloc_pid(p->nsproxy->pid_ns)也就是利用父進程的nsproxy->pid_ns來創建自己的pid,這樣就說明了爲什麼lxc中必須fork出子進程來執行execve。