setns對當前進程無效問題的排查(getpid獲取值不變)

1）復現流程及lxc的處理

demo1程序與執行結果如下，此時在容器內部看不到執行的程序。

/*
 * Demo 1: print getpid() once before and once after the ioctl on /dev/ns,
 * then sleep for 5 seconds before exiting.
 *
 * NOTE(review): the results of open() and ioctl() are never checked, `ret`
 * is only assigned and `pid` is never used. The snippet is kept as-is
 * because the recorded output depends on this exact call sequence.
 */
int main()
{
	int ret, fd, pid;
	// First getpid() call: issued BEFORE the namespace-switch request.
	printf("father pid old:%d\n", getpid());
	fd = open("/dev/ns", O_RDWR);
	// 24635 == 0x603b. Presumably this request enters the author's ported
	// setns via the /dev/ns device -- confirm against the driver.
	// NOTE(review): the inline note mentions a pid parameter, but no third
	// argument is actually passed to ioctl() here.
	ret = ioctl(fd, 24635); // parm is dst ns process's pid
	// Second getpid() call: issued AFTER the ioctl, printed with the same
	// "father pid old" label as the first one.
	printf("father pid old:%d\n", getpid());
	sleep(5);
	return 0;
}

# ./a.out 
father pid old:26169
father pid old:26169

demo2程序與執行結果如下，此時容器內還是看不到執行程序，但是這裏getpid()獲取到的值就爲0了，對比上邊，世界上沒有這麼玄乎的事，或許是緩存的問題？這是第一個疑問。

/*
 * Demo 2: same as demo 1 but WITHOUT the getpid() call before the ioctl,
 * so the only getpid() in the program runs after the namespace-switch
 * request.
 *
 * NOTE(review): open()/ioctl() results are not checked; `ret` is only
 * assigned and `pid` is unused.
 */
int main()
{
	int ret, fd, pid;
	fd = open("/dev/ns", O_RDWR);
	// 24635 == 0x603b. NOTE(review): the inline note mentions a pid
	// parameter, but no third argument is passed to ioctl().
	ret = ioctl(fd, 24635); // parm is dst ns process's pid
	// First and only getpid() call of this process, made after the ioctl.
	printf("father pid old:%d\n", getpid());
	sleep(5);
	return 0;
}

# ./a.out 
father pid old:0

demo3程序與執行結果如下，這時在容器內部能看到子進程，爲什麼子進程實現了pid ns的切換，父進程卻沒有實現？這是第二個疑問。

/*
 * Demo 3: issue the ioctl on /dev/ns, print getpid(), then fork().
 * The child prints its own getpid() and sleeps for 5 seconds; the parent
 * prints getpid() again and waits for the child.
 *
 * NOTE(review): open()/ioctl()/fork() results are not checked for errors
 * and `ret` is only assigned.
 */
int main()
{
	int ret, fd, pid;
	fd = open("/dev/ns", O_RDWR);
	// 24635 == 0x603b. NOTE(review): the inline note mentions a pid
	// parameter, but no third argument is passed to ioctl().
	ret = ioctl(fd, 24635); // parm is dst ns process's pid
	// First getpid() call of the parent, made after the ioctl.
	printf("father pid old:%d\n", getpid());
	pid = fork();
	if(0 == pid) {
		// Child branch: fork() returned 0.
		printf("son pid:%d\n", getpid());
		sleep(5);
	} else {
		// Parent branch (also taken if fork() failed and returned -1).
		printf("father pid:%d\n", getpid());
		// NOTE(review): wait() is called without the status-pointer
		// argument its standard prototype requires.
		wait();
	}
	return 0;
}

# ./a.out 
father pid old:0
father pid:0
son pid:87

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND                                         
    1 root      20   0   22036   2144   1624 S   0.0  0.1   0:00.01 bash                                            
   57 root      20   0   23672   1516   1160 R   0.0  0.1   0:00.66 top                                             
   87 root      20   0    3760    204    120 S   0.0  0.0   0:00.00 a.out

lxc attach流程如下，實現思路和我們上邊的測試demo一致，看來這確實是linux機制的問題。

// Pseudocode (not compilable C) of the lxc attach flow described in the
// surrounding text: switch namespaces first, then fork, and let the child
// exec the shell.
setns()
pid = fork();
if(!pid) { // son
// Child only: replace the process image with bash.
execve(bash);
}

2）pid緩存問題

編寫了demo4：

/*
 * Demo 4: identical to demo 3 except for four getpid() calls placed BEFORE
 * the ioctl on /dev/ns. Used to compare what getpid() reports before the
 * ioctl, after the ioctl, and after fork() in both parent and child.
 *
 * NOTE(review): open()/ioctl()/fork() results are not checked for errors
 * and `ret` is only assigned.
 */
int main()
{
	int ret, fd, pid;
	// Four getpid() calls before the namespace-switch request.
	printf("father pid:%d\n", getpid());
	printf("father pid:%d\n", getpid());
	printf("father pid:%d\n", getpid());
	printf("father pid:%d\n", getpid());
	fd = open("/dev/ns", O_RDWR);
	// 24635 == 0x603b (the strace transcript shows the same request value).
	// NOTE(review): the inline note mentions a pid parameter, but no third
	// argument is passed to ioctl().
	ret = ioctl(fd, 24635); // parm is dst ns process's pid
	// getpid() after the ioctl, still in the parent.
	printf("father pid old:%d\n", getpid());
	pid = fork();
	if(0 == pid) {
		// Child branch: fork() returned 0.
		printf("son pid:%d\n", getpid());
		sleep(5);
	} else {
		// Parent branch (also taken if fork() failed and returned -1).
		printf("father pid:%d\n", getpid());
		// NOTE(review): wait() is called without its status-pointer
		// argument; the strace transcript shows the resulting wait4()
		// failing with EFAULT.
		wait();
	}
	return 0;
}

看下結果，fork前後打印一致

# ./a.out 
father pid:23246
father pid:23246
father pid:23246
father pid:23246
father pid old:23246
father pid:23246
son pid:96

strace看下，果然和我們猜的一樣：只有第一次getpid真正執行了syscall，之後所有的返回值均是從緩存中獲取的（glibc的getpid封裝會緩存pid）。所以這就能解釋爲什麼在setns之前沒有執行過getpid時，setns（以及fork）之後執行getpid獲取到的返回值是0；而在setns之前執行過getpid時，setns（以及fork）之後再執行getpid獲取到的返回值不變。

# strace ./a.out 
execve("./a.out", ["./a.out"], [/* 29 vars */]) = 0
brk(0)                                  = 0x18f4000
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa04d79c000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=105026, ...}) = 0
mmap(NULL, 105026, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7fa04d782000
close(3)                                = 0
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
open("/lib/libc.so.6", O_RDONLY)        = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0@\356\1\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1478056, ...}) = 0
mmap(NULL, 3586120, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fa04d215000
mprotect(0x7fa04d377000, 2097152, PROT_NONE) = 0
mmap(0x7fa04d577000, 20480, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x162000) = 0x7fa04d577000
mmap(0x7fa04d57c000, 18504, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa04d57c000
close(3)                                = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa04d781000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa04d780000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa04d77f000
arch_prctl(ARCH_SET_FS, 0x7fa04d780700) = 0
mprotect(0x7fa04d577000, 16384, PROT_READ) = 0
mprotect(0x7fa04d79e000, 4096, PROT_READ) = 0
munmap(0x7fa04d782000, 105026)          = 0
getpid()                                = 23253
fstat(1, {st_mode=S_IFCHR|0600, st_rdev=makedev(136, 4), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa04d79b000
write(1, "father pid:23253\n", 17father pid:23253
)      = 17
write(1, "father pid:23253\n", 17father pid:23253
)      = 17
write(1, "father pid:23253\n", 17father pid:23253
)      = 17
write(1, "father pid:23253\n", 17father pid:23253
)      = 17
open("/dev/ns", O_RDWR)                 = 3
ioctl(3, 0x603b, 0x7fa04d57cdf0)        = 0
write(1, "father pid old:23253\n", 21father pid old:23253
)  = 21
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fa04d7809d0) = 97
write(1, "father pid:23253\n", 17father pid:23253
)      = 17
wait4(-1, son pid:97
0xffffffff, 0, NULL)          = -1 EFAULT (Bad address)
--- SIGCHLD (Child exited) @ 0 (0) ---

3）爲什麼子進程實現了pid ns的切換，父進程卻沒有實現？

先來看看setns的實現。在2.6.32中要支持setns很簡單：首先通過不創建新NS的方式調用copy_namespaces並傳入dst ns，這會增加dst ns的引用；之後通過switch_task_namespaces傳入current task與dst ns，在函數中會首先進行nsproxy指針的交換，將當前task切換到dst ns中，之後src ns減引用，這樣就能保證引用數量的正確。因此這裏實際上切換了current task的nsproxy。

copy_namespaces->switch_task_namespaces

/*
 * Excerpt of copy_namespaces(); the rest of the body is elided by the
 * "。。。" line, so only the visible prologue is described here.
 *
 * @flags: clone flags; the visible part only tests the CLONE_NEW* bits.
 * @tsk:   task whose current nsproxy is examined.
 *
 * Visible behaviour: returns 0 when @tsk has no nsproxy. Otherwise it takes
 * a reference on the existing nsproxy and, if none of the CLONE_NEW* flags
 * is set, returns 0 with that extra reference held.
 */
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
    struct nsproxy *old_ns = tsk->nsproxy;
    struct nsproxy *new_ns;
    int err = 0;

    /* Nothing to do for a task without an nsproxy. */
    if (!old_ns)
        return 0;

    /* Take a reference on the nsproxy the task already uses. */
    get_nsproxy(old_ns);

    /* No new namespace requested: keep the existing nsproxy. */
    if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
                CLONE_NEWPID | CLONE_NEWNET)))
        return 0;
。。。
}

/*
 * Install @new as the nsproxy of task @p and drop one reference on the
 * nsproxy @p was using before. The old nsproxy is freed when that was its
 * last reference.
 *
 * @p:   task whose nsproxy pointer is replaced.
 * @new: nsproxy to install.
 *
 * May sleep: it is annotated with might_sleep() and can call
 * synchronize_rcu().
 */
void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
{
    struct nsproxy *ns;

    might_sleep();

    /* Remember the old nsproxy so its reference can be dropped below. */
    ns = p->nsproxy;

    /* Publish the new nsproxy pointer. */
    rcu_assign_pointer(p->nsproxy, new);

    /* Drop the task's reference; free only if it was the last one. */
    if (ns && atomic_dec_and_test(&ns->count)) {
        /*
         * wait for others to get what they want from this nsproxy.
         *
         * cannot release this nsproxy via the call_rcu() since
         * put_mnt_ns() will want to sleep
         */
        synchronize_rcu();
        free_nsproxy(ns);
    }
}

再看看getpid的調用流程(sys_getpid->task_tgid_vnr->pid_vnr->pid_nr_ns)。這裏考慮到線程問題，通過task_tgid獲取該task的tgid：應用層和內核對PID的定義不同，內核中進程與線程都用相同的結構體struct task_struct來描述，因此進程與線程在內核中均擁有自己獨立的PID。這時候找到“用戶態PID”的關鍵是找到根進程，因爲根進程的pid與tgid是一致的(如下)，所以在這裏傳入tgid來代替根進程的PID。在pid_nr_ns中會進行兩個判斷：一是該pid的level應大於等於指定pid ns的level(ns->level <= pid->level)；二是pid->numbers[ns->level]中記錄的ns必須就是指定的pid ns(upid->ns == ns)。這裏的pid ns level如下圖所示是一個樹狀結構，default pid ns中的pid level爲0，而基於某進程創建的CLONE_NEWPID的進程，其pid ns level + 1。因此這裏的第一個判斷會出現問題：因爲setns只修改了current->nsproxy，如果是在default pid ns中執行的demo程序以及lxc程序，那麼current->nsproxy->pid_ns->level = 1，而pid->level = 0，這會導致判斷失敗直接返回0，這也是我們在demo程序中通過getpid()得到0的原因。

 <-- PID 43 --> <----------------- PID 42 ----------------->
                     +---------+
                     | process |
                    _| pid=42  |_
                  _/ | tgid=42 | \_ (new thread) _
       _ (fork) _/   +---------+                  \
      /                                        +---------+
+---------+                                    | process |
| process |                                    | pid=44  |
| pid=43  |                                    | tgid=42 |
| tgid=43 |                                    +---------+
+---------+
 <-- PID 43 --> <--------- PID 42 --------> <--- PID 44 --->

/*
 * getpid() system call entry: takes no arguments and returns
 * task_tgid_vnr() of the calling task (current).
 */
SYSCALL_DEFINE0(getpid)
{
    return task_tgid_vnr(current);
}

/*
 * Return the number pid_vnr() reports for the struct pid obtained from
 * task_tgid(@tsk).
 */
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
    return pid_vnr(task_tgid(tsk));
}

/*
 * Translate @pid into its number as seen from the pid namespace referenced
 * by the CURRENT task's nsproxy (current->nsproxy->pid_ns).
 *
 * Note that the namespace is taken from current->nsproxy, not from @pid.
 */
pid_t pid_vnr(struct pid *pid)
{
    return pid_nr_ns(pid, current->nsproxy->pid_ns);
}

/*
 * Look up the number of @pid as seen in pid namespace @ns.
 *
 * @pid: pid to translate; may be NULL.
 * @ns:  namespace to look the number up in.
 *
 * Returns the matching upid's nr, or 0 when @pid is NULL, when ns->level is
 * greater than pid->level, or when the entry stored at index ns->level of
 * pid->numbers belongs to a different namespace.
 */
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
    struct upid *upid;
    pid_t nr = 0;

    /* Check 1: @ns must not be deeper than the level recorded in @pid. */
    if (pid && ns->level <= pid->level) {
        /* pid->numbers is indexed by namespace level. */
        upid = &pid->numbers[ns->level];
        /* Check 2: the entry at that level must be for exactly @ns. */
        if (upid->ns == ns)
            nr = upid->nr;
    }
    return nr;
}

如下，這裏有對獲取到的upid->nr的一個很好的解釋，也就是說upid只在它所屬的那個pid ns中有效。另外，這個實驗是在2.6.32.5中實現的，而2.6.32.5本身不支持setns(我對它進行了移植)；或許在3.x的某些支持setns的版本中也能復現，但4.9.0中pid_nr_ns的實現不同，導致實驗效果也不同：在4.9.0中demo程序會返回default ns中的pid。

/*
 * struct upid is used to get the id of the struct pid, as it is
 * seen in particular namespace. Later the struct pid is found with
 * find_pid_ns() using the int nr and struct pid_namespace *ns.
 */
struct upid {
    /* Try to keep pid_chain in the same cacheline as nr for find_vpid */
    int nr;                     /* numeric id as seen in namespace ns */
    struct pid_namespace *ns;   /* namespace in which nr is valid */
    /* NOTE(review): presumably the hash-chain link used by the find_pid_ns() lookup -- confirm. */
    struct hlist_node pid_chain;
};

最後再補充下，前面可以看到setns實現其實只修改了task_struct中的nsproxy，因此fork會通過調用鏈do_fork->copy_process->alloc_pid(p->nsproxy->pid_ns)也就是利用父進程的nsproxy->pid_ns來創建自己的pid，這樣就說明了爲什麼lxc中必須fork出子進程來執行execve。

setns對當前進程無效問題的排查(getpid獲取值不變)

1）復現流程及lxc的處理

2）pid緩存問題

3）爲什麼子進程實現了pid ns的切換，父進程卻沒有實現？

apisix~helm方式的部署到k8s

firmeye - IoT固件漏洞挖掘工具

解決virtio-gpu對framebuffer支持及VT切換等問題

setns對當前進程無效問題的排查(getpid獲取值不變)

解決qemu虛擬機圖形界面卡死問題

解決qemu虛擬機中內存偏小的問題

kubernetes中的CLUSTER-IP和EXTERNAL-IP無法ping通，但是curl可以獲取到頁面

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結