LXC tool commands, one by one
Part 0: Common code
1. Command-line parsing
Parsing entry points
lxc_arguments_parse: note that some options are parsed by each tool's own my_parser(struct lxc_arguments *args, int c, char *arg)
callback defined in that command's source.
confile.c wraps the set/get handlers for the configuration items.
Important lxc-start options:
- 1) lxcpath: the root directory of the container's configuration files
lxcpath can be given with the -P option; it is normally set at build time via the configure script's --with-config-path
option and ultimately passed to the object files as the macro definition -DLXCPATH.
See lxc_arguments_parse: if lxcpath was fixed at build time, it is resolved like this:
/* If no lxcpaths were given, use default */
if (!args->lxcpath_cnt) {
ret = lxc_arguments_lxcpath_add(
args, lxc_get_global_config_item("lxc.lxcpath"));
if (ret < 0)
return ret;
}
//lxc_get_global_config_item lives in lxccontainer.c
//it in turn calls lxc_global_config_value in initutils.c, which reads the lxcpath value from the macro
- 2) args->argv: the target program the container will execute
Data structures
lxc_arguments
→argv: stores the command's launch arguments; e.g. for lxc-start -- /init it is /init.
Setting the log location and level
--logfile ./lxc_attach.log --logpriority trace
Custom LXC configuration items
https://www.cnblogs.com/lisperl/archive/2012/04/16/2451215.html
For lxc-start, configuration items are passed with the -s option as KEY=VALUE pairs, handled in my_parser:
case 's':
return lxc_config_define_add(&defines, arg);
2. Log initialization
lxc_log_init
The default log level is ERROR:
int lxc_log_init(struct lxc_log *log)
{
int ret;
int lxc_priority = LXC_LOG_LEVEL_ERROR;
...
if (log->level)
lxc_priority = lxc_log_priority_to_int(log->level);
}
3. Log output
The LXC authors clearly lean on macros at every turn:
#define INFO(format, ...) do { \
struct lxc_log_locinfo locinfo = LXC_LOG_LOCINFO_INIT; \
LXC_INFO(&locinfo, format, ##__VA_ARGS__); \
} while (0)
__lxc_unused static inline void LXC_##LEVEL(struct lxc_log_locinfo* locinfo, \
const char* format, ...)
Part I: lxc-start
For lxc-start, before command-line parsing there is a snippet that initializes the config-entry data structure:
static struct lxc_list defines;
...
//initialize the doubly-linked list
lxc_list_init(&defines);
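For reference, lxc_list is LXC's circular doubly-linked list. A minimal sketch of its shape and of lxc_list_init / lxc_list_add_tail, paraphrased from LXC's list.h (check the source for the exact definitions):

struct lxc_list {
	void *elem;
	struct lxc_list *next;
	struct lxc_list *prev;
};

//an empty list points back at itself in both directions
static inline void lxc_list_init(struct lxc_list *list)
{
	list->elem = NULL;
	list->next = list->prev = list;
}

//append a node at the tail; this is how each -s KEY=VALUE define is queued
static inline void lxc_list_add_tail(struct lxc_list *head, struct lxc_list *list)
{
	struct lxc_list *last = head->prev;

	list->prev = last;
	list->next = head;
	last->next = list;
	head->prev = list;
}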
1. Creating the container object
Calls lxc_container_new in lxccontainer.c, which:
Records config_path
c->config_path = strdup(configpath);
set_config_filename: saves the config file path into c->configfile
Initializes lxc_conf
if (file_exists(c->configfile) && !lxcapi_load_config(c, NULL)) {
fprintf(stderr, "Failed to load config for %s\n", name);
goto err;
}
//note that lxcapi_load_config is a macro-generated function:
WRAP_API_1(bool, lxcapi_load_config, const char *)
#define WRAP_API_1(rettype, fnname, t1) \
static rettype fnname(struct lxc_container *c, t1 a1) \
{ \
rettype ret; \
bool reset_config = false; \
\
if (!current_config && c && c->lxc_conf) { \
current_config = c->lxc_conf; \
reset_config = true; \
} \
\
ret = do_##fnname(c, a1); \
if (reset_config) \
current_config = NULL; \
\
return ret; \
}
//lxcapi_load_config ends up calling do_lxcapi_load_config, which leads to load_config_locked:
static bool load_config_locked(struct lxc_container *c, const char *fname)
{
if (!c->lxc_conf)
c->lxc_conf = lxc_conf_init();
if (!c->lxc_conf)
return false;
//read the configuration items from the config file into c->lxc_conf
if (lxc_config_read(fname, c->lxc_conf, false) != 0)
return false;
c->lxc_conf->name = c->name;
return true;
}
//lxc_config_read in turn calls parse_line
static int parse_line(char *buffer, void *data){
...
//returns the lxc_config_t entry from the config_jump_table array, which defines each key's get/set/clr handlers
config = lxc_get_config(key);
if (!config) {
ERROR("Unknown configuration key \"%s\"", key);
goto on_error;
}
ret = config->set(key, value, plc->conf, NULL);
...
}
struct lxc_config_t {
char *name;
config_set_cb set;
config_get_cb get;
config_clr_cb clr;
};
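To make the dispatch concrete, here is a self-contained miniature of the jump-table pattern (all demo_* names are invented for illustration; the real table in confile.c is larger and its lookup matches key prefixes so that indexed keys such as lxc.net.0.type also resolve):

#include <stdio.h>
#include <string.h>

typedef int (*config_set_cb)(const char *key, const char *value);

struct demo_config_t {
	const char *name;
	config_set_cb set;
};

static int demo_set_rootfs(const char *key, const char *value)
{
	printf("set %s = %s\n", key, value);
	return 0;
}

//one row per configuration key, mirroring config_jump_table's idea
static struct demo_config_t demo_jump_table[] = {
	{ "lxc.rootfs.path", demo_set_rootfs },
};

//mimics lxc_get_config(): scan the table for the matching key
static struct demo_config_t *demo_get_config(const char *key)
{
	for (size_t i = 0; i < sizeof(demo_jump_table) / sizeof(demo_jump_table[0]); i++)
		if (strcmp(demo_jump_table[i].name, key) == 0)
			return &demo_jump_table[i];
	return NULL;
}

int main(void)
{
	struct demo_config_t *config = demo_get_config("lxc.rootfs.path");
	if (config)
		config->set("lxc.rootfs.path", "/var/lib/lxc/android1/rootfs");
	return 0;
}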
- lxc.net.N.xx
Note that there may be several network sections; they are appended via lxc_network_add.
Function pointer assignment
...
c->start = lxcapi_start;
//containers are started as daemons by default
c->daemonize = true;
...
2. Validating the container
e.g. checking whether it is already running
3. Running the container
Calls the container's start method from lxccontainer.c, i.e. lxcapi_start, which then calls:
→do_lxcapi_start
3-1) Creating the container process
3-1-1) Some initialization work
→→ongoing_create
→→container_mem_lock
→→lxc_init_handler (important)
Creates and initializes the lxc_handler struct
//records the pid of the lxc-start command process; "transient" because the lxc-start process will be killed once it has forked off the container's daemon process
handler->transient_pid = lxc_raw_getpid();
//creates a pair of connected sockets, stored in lxc_handler->state_socket_pair; the container process uses this pair to tell the lxc-start process about container state (see the sketch below)
socketpair
//Created abstract unix socket "/data/media/build_lxc/containers/android1/command"
lxc_cmd_init
sets up the "command" socket shown in the log above, over which client tools reach the container
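A minimal, self-contained sketch of the state_socket_pair idea: the pair is created before fork() so both sides inherit an end, and the child can report back. Variable names and the message format are invented for illustration; LXC's real state reporting goes through lxc_set_state and its own protocol.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int state_pair[2];

	if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, state_pair) < 0)
		return 1;

	pid_t pid = fork();
	if (pid == 0) {              /* child: plays the container side */
		close(state_pair[0]);
		const char *msg = "STARTING";
		write(state_pair[1], msg, strlen(msg));
		_exit(0);
	}

	close(state_pair[1]);        /* parent: plays the lxc-start side */
	char buf[32] = {0};
	read(state_pair[0], buf, sizeof(buf) - 1);
	printf("container state: %s\n", buf);
	waitpid(pid, NULL, 0);
	return 0;
}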
→→container_mem_unlock
3-1-2) Creating the daemon (container) process
→→pid_first = fork();
→/* first parent */ (the parent, i.e. the lxc-start command process)
/* Set to NULL because we don't want father unlink
* the PID file, child will do the free and unlink.
*/
c->pidfile = NULL;
/*
Wait for container to tell us whether it started successfully.
*/
//the lxc-start process just waits for pid_first to fork pid_second, then finishes
started = wait_on_daemonized_start(handler, pid_first);
//the lxc-start process ends here
return started;
→/* first child */ (the child process)
//sets the child's process title (this failed for me, still unresolved; it involves a very low-level system call)
setproctitle
→→pid_second = fork();
/* second parent */
frees memory and exits the current process
/* second child */
//change the working directory to /
chdir
//close inherited fds in the daemon, ignoring a few
lxc_check_inherited
/* redirect std{in,out,err} to /dev/null */
null_stdfds();
/* become session leader */
ret = setsid();
Note: from this point on, everything runs in the daemon process!
Although ps | grep lxc still shows a process named lxc-start, that process is in fact the daemon from the second fork. (It is precisely because setproctitle failed in the child that this misconception arises!)
//if lxc-start was given -p to specify a pid file path, record the daemon's pid there
if (c->pidfile) {
...
}
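Condensed into one function, the double fork that 3-1-2) walks through looks roughly like this (a sketch mirroring the order above, with error handling omitted; this is not the LXC code itself):

#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

static void daemonize(void)
{
	if (fork() > 0)      /* first parent: lxc-start waits for the daemon, then exits */
		exit(0);

	if (fork() > 0)      /* second parent: frees resources and exits */
		exit(0);

	/* second child: the daemon (monitor) process */
	chdir("/");                          /* change working directory to / */
	int fd = open("/dev/null", O_RDWR);  /* redirect std{in,out,err} */
	dup2(fd, STDIN_FILENO);
	dup2(fd, STDOUT_FILENO);
	dup2(fd, STDERR_FILENO);
	if (fd > STDERR_FILENO)
		close(fd);
	setsid();                            /* become session leader */
}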
3-2) The daemon (container) process's start logic
→→lxc_start
→→→__lxc_start
3-2-1) Initializing the container (main CGROUP handling)
→→→→Key function 1: lxc_init (initializes the container)
//get the daemon's pid
handler->monitor_pid = lxc_raw_getpid();
//open the daemon's status file
status_fd = open("/proc/self/status", O_RDONLY | O_CLOEXEC);
//initialize the security framework (LSM)
lsm_init
//set the container state to STARTING, using the socket pair created earlier for inter-process communication
ret = lxc_set_state(name, handler, STARTING);
//set the various LXC_XX environment variables
...
//run the pre-start hook scripts
ret = run_lxc_hooks(name, "pre-start", conf, NULL);
//create a signal file descriptor via signalfd(); lxc_poll uses it later
setup_signal_fd
//with the default (empty) config this does nothing
lxc_terminal_setup
goto 3-2-1-0)
//with the default (empty) config this does nothing
lxc_terminal_map_ids
3-2-1-0) lxc_terminal_setup
→lxc_terminal_create
a. create a pseudoterminal pair with the openpty function
ret = openpty(&terminal->master, &terminal->slave, NULL, NULL, NULL);
b. get the slave device's name
ret = ttyname_r(terminal->slave, terminal->name, sizeof(terminal->name));
once obtained it is stored in the console field of lxc_conf
c. close the pty pair
//?
lxc_terminal_peer_default
goto on_success
→lxc_terminal_create_ringbuf
there is no buffer by default
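Steps a and b can be reproduced standalone like this (a demo, not lxc_terminal_create itself; compile with -lutil):

#include <pty.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int master, slave;
	char name[64];

	/* allocate a master/slave pseudoterminal pair */
	if (openpty(&master, &slave, NULL, NULL, NULL) < 0)
		return 1;

	/* read back the slave device's name, e.g. /dev/pts/3 */
	if (ttyname_r(slave, name, sizeof(name)) != 0)
		return 1;

	printf("slave pty: %s\n", name);
	close(master);
	close(slave);
	return 0;
}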
3-2-1-1) cgroup initialization
→→→→→cgroup_init
(cgroup.c)
The function returns a cgroup_ops struct, stored in handler->cgroup_ops for later use. Note that cgroup_ops matters: it holds all the cgroup-related state and operation functions.
handler->cgroup_ops = cgroup_init(handler->conf);
→→→→→→cgfsng_ops_init (cgfsng.c)
allocates the cgfsng_ops struct on the heap
-----TAG_A-START-----
→→→→→→→cg_init
- TAG_A-1) cgroup v2 initialization
→→→→→→→→cg_unified_init (taken for v2)
→→→→→→→→→unified_cgroup_hierarchy (checks whether we are in unified mode, i.e. cgroup2)
It checks what type of cgroup DEFAULT_CGROUP_MOUNTPOINT, i.e. /sys/fs/cgroup, is mounted as
- TAG_A-2) cgroup v1 initialization
→→→→→→→→cg_hybrid_init (taken when not pure v2)
→→→→→→→→→get_existing_subsystems (collects the existing cgroup subsystems)
parses the /proc/self/cgroup file
→→→→→→→→→lxc_cgfsng_print_basecg_debuginfo (prints basic info about the cgroup subsystems)
On my device the output is:
6:schedtune:/
5:cpuset:/
4:cpuacct:/
3:cpu:/
2:blkio:/
1:memory:/
0::/
→→→→→→→→→parses the /proc/self/mountinfo file
cgroup2 entries are filtered out
cg_hybrid_get_controllers (checks whether the hierarchy lives under DEFAULT_CGROUP_MOUNTPOINT)
//TODO, to be continued
-----TAG_A-END-----
the function pointers of the cgfsng_ops struct are then assigned
3-2-1-2) Initializing the security frameworks
ret = lxc_read_seccomp_config(conf);
ret = lsm_process_prepare(conf, handler->lxcpath);
3-2-1-3) Recording the daemon's status file fd
handler->monitor_status_fd = move_fd(status_fd);
Records the fd of /proc/self/status for later use.
-------------------------------------------------------------------------------------------------
After lxc_init, the monitor-related methods of cgroup_ops are called:
1) monitor_create
Creates the monitor cgroup. If the container's name is android1, this group is named lxc.monitor.android1.
See create_cgroup_tree:
if (payload) {
//record the cgroup fd for later use
h->cgfd_con = lxc_open_dirfd(path);
if (h->cgfd_con < 0)
return log_error_errno(false, errno, "Failed to open %s", path);
h->container_full_path = move_ptr(path);
} else {
h->cgfd_mon = lxc_open_dirfd(path);
if (h->cgfd_mon < 0)
return log_error_errno(false, errno, "Failed to open %s", path);
h->monitor_full_path = move_ptr(path);
}
2) monitor_enter
Enters the monitor cgroup: writes the container process's pid into the monitor cgroup's cgroup.procs file (see the sketch after this list)
3) monitor_delegate_controllers
-------------------------------------------------------------------------------------------------
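What monitor_enter boils down to can be sketched as follows (the path and helper name follow the android1 example above and are illustrative only):

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

//write pid into <cgroup_dir>/cgroup.procs, which moves it into that cgroup
static int enter_cgroup(const char *cgroup_dir, pid_t pid)
{
	char path[256], buf[32];
	int fd, len, ret = 0;

	snprintf(path, sizeof(path), "%s/cgroup.procs", cgroup_dir);
	fd = open(path, O_WRONLY | O_CLOEXEC);
	if (fd < 0)
		return -1;

	len = snprintf(buf, sizeof(buf), "%d", pid);
	if (write(fd, buf, len) != len)
		ret = -1;
	close(fd);
	return ret;
}

/* e.g. enter_cgroup("/sys/fs/cgroup/lxc.monitor.android1", getpid()); */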
//TODO, to be continued
3-2-2) Spawning the container (main NAMESPACE handling)
→→→→Key function 2: lxc_spawn (spawns the container by creating another child, the payload process)
The function's own description:
/* lxc_spawn() performs crucial setup tasks and clone()s the new process which
* exec()s the requested container binary.
* Note that lxc_spawn() runs in the parent namespaces. Any operations performed
* right here should be double checked if they'd pose a security risk. (For
* example, any {u}mount() operations performed here will be reflected on the
* host!)
*/
//create a socket pair, stored in handler->sync_sock (used to synchronize the monitor and payload processes)
lxc_sync_init
//create a socket pair, stored in handler->data_sock
//resolve the namespace clone flags
resolve_clone_flags
which mainly yields:
Cloned CLONE_NEWNS
Cloned CLONE_NEWPID
Cloned CLONE_NEWUTS
Cloned CLONE_NEWIPC
Cloned CLONE_NEWNET
int resolve_clone_flags(struct lxc_handler *handler)
{
int i;
struct lxc_conf *conf = handler->conf;
for (i = 0; i < LXC_NS_MAX; i++) {
if (i == LXC_NS_USER && lxc_list_empty(&handler->conf->id_map))
continue;
//the network namespace is only cloned if the config defines a network
if (i == LXC_NS_NET && lxc_requests_empty_network(handler))
continue;
//cloning the CGROUP namespace is only supported if /proc/self/ns/cgroup exists
if (i == LXC_NS_CGROUP && !cgns_supported())
continue;
handler->ns_clone_flags |= ns_info[i].clone_flag;
}
return 0;
}
//create the payload cgroup
payload_create
//share_ns is false
//clone the new process via inline assembly (the child that lxc-start forked twice now clones yet another process)
//note: the cloned child lives in several new namespaces, so it communicates with its parent over sockets that cross namespace boundaries
handler->pid = lxc_raw_clone_cb(do_start, handler,
CLONE_PIDFD | handler->ns_on_clone_flags,
&handler->pidfd);
## lxc-start has created quite a few processes by now; to keep them apart, from here on the daemon is called the monitor process, and the process it clones is called the payload process. ##
The lxc_raw_clone_cb function:
-------------------------------------------------------------------------------------------------
pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, unsigned long flags,
int *pidfd)
{
pid_t pid;
pid = lxc_raw_clone(flags, pidfd);
if (pid < 0)
return -1;
/*
* exit() is not thread-safe and might mess with the parent's signal
* handlers and other stuff when exec() fails.
*/
if (pid == 0)
_exit(fn(args));
return pid;
}
So the payload child executes the do_start(void *data) function.
-------------------------------------------------------------------------------------------------
3-2-2-1) The monitor process continues
Everything below runs in the monitor process!
//record the cloned child's pid in an environment variable
ret = setenv("LXC_PID", pidstr, 1);
/* lxc_try_preserve_namespaces: open /proc/@pid/ns/@ns for each namespace
* specified in ns_clone_flags.
* Return true on success, false on failure.
*/
//the following opens the namespace files under the child's /proc/payload_pid/ns and stores the fds in the handler->nsfd array for later use
lxc_try_preserve_namespaces
//close the parent's sync_sock[0]
lxc_sync_fini_child
//send status file descriptor to child process
//releases the child from its socket blocking point
lxc_abstract_unix_send_fds(handler->data_sock[0], &handler->monitor_status_fd, 1, NULL, 0)
//releases the child from its socket blocking point (mechanism: the payload process blocks in a socket recv; sending a message here unblocks it)
ret = lxc_sync_wake_child(handler, LXC_SYNC_STARTUP);
//the parent now blocks, waiting to be woken by the child; a good moment to look at what the child does before waking the parent
ret = lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE);
//next come some cgroup operations
//enroll the payload process: write its pid into the cgroup.procs file of lxc.payload.android1
cgroup_ops->payload_enter(cgroup_ops, handler)
...
//create the network and notify the child
//set the network ID of the container's network namespace
ret = lxc_netns_set_nsid(handler->nsfd[LXC_NS_NET]);
//1. create the network
→ret = lxc_create_network(handler);
//configure the network via the netlink interface
→→lxc_create_network_priv(handler);
//move the created network devices into the payload process's network namespace
→→lxc_network_move_created_netdev_priv(handler);
//2. send the network info to the child; why does it need to be sent?
→ret = lxc_network_send_to_child(handler);
//set up the /proc filesystem (optional)
setup_proc_filesystem
//apply the configured resource limits
setup_resource_limits
/* Tell the child to continue its initialization. We'll get
* LXC_SYNC_CGROUP when it is ready for us to setup cgroups.
*/
ret = lxc_sync_barrier_child(handler, LXC_SYNC_POST_CONFIGURE);
//block until the child tells us to set up CGROUPs
ret = lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE);
...
//log that the init program has started
ret = handler->ops->post_start(handler, handler->data);
//set the container state to RUNNING
ret = lxc_set_state(name, handler, RUNNING);
//close the sync_sock pair; the parent/child handshake is finished
lxc_sync_fini
3-2-2-2) What the payload process does
Note: after clone, the child must not call the system getpid directly; it has to use lxc_raw_getpid!
//close sync_sock[1]
lxc_sync_fini_parent(handler);
//block, waiting for the parent to wake us via the socket
lxc_abstract_unix_recv_fds
//make sure this process dies with its parent: if the parent is killed, this process is killed too
lxc_set_death_signal
//set the signal mask
pthread_sigmask
//close the pinfd file descriptor
close_prot_errno_disarm(handler->pinfd)
//block, waiting for the parent to wake us via the socket
ret = lxc_sync_wait_parent(handler, LXC_SYNC_STARTUP);
/* Tell the parent task it can begin to configure the container and wait
* for it to finish.
*/
ret = lxc_sync_barrier_parent(handler, LXC_SYNC_CONFIGURE);
//receive the network device names sent by the parent
lxc_network_recv_from_parent
/* Ask father to setup cgroups and wait for him to finish. */
ret = lxc_sync_barrier_parent(handler, LXC_SYNC_CGROUP);
//move the child into the newly created CGROUP namespace
//the child was already cloned with the CLONE_NEWCGROUP flag, so why unshare again here? The source explains this in detail.
ret = unshare(CLONE_NEWCGROUP);
Note that parent and child block on socket-based synchronization at several points:
1) (child blocks) lxc_abstract_unix_recv_fds --
(parent wakes it) lxc_abstract_unix_send_fds
2) (child blocks) lxc_sync_wait_parent LXC_SYNC_STARTUP --
(parent wakes it) lxc_sync_wake_child LXC_SYNC_STARTUP
3) (parent blocks) lxc_sync_wait_child LXC_SYNC_CONFIGURE --
(child wakes it, then waits itself) lxc_sync_barrier_parent LXC_SYNC_CONFIGURE
(parent wakes it) lxc_sync_barrier_child LXC_SYNC_POST_CONFIGURE --
which releases the child's LXC_SYNC_CONFIGURE wait above.
...
enum {
LXC_SYNC_STARTUP,
LXC_SYNC_CONFIGURE,
LXC_SYNC_POST_CONFIGURE,
LXC_SYNC_CGROUP,
LXC_SYNC_CGROUP_UNSHARE,
LXC_SYNC_CGROUP_LIMITS,
LXC_SYNC_READY_START,
LXC_SYNC_RESTART,
LXC_SYNC_POST_RESTART,
LXC_SYNC_ERROR = -1 /* Used to report errors from another process */
};
//this function drives a precise, lock-step handshake via the LXC_SYNC_XX enum
//when tracing the code, searching for these enum values is the quickest way to find your place
static int __sync_barrier(int fd, int sequence)
{
if (__sync_wake(fd, sequence))
return -1;
return __sync_wait(fd, sequence+1);
}
Even if the parent runs first and sends its wakeup early, the child's later blocking wait will still receive the data and unblock (the message simply sits in the socket buffer; see the sketch below)!
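A hedged sketch of the wait/wake primitives that __sync_barrier builds on: each side just writes or reads one int over the sync socketpair, so an early wake is queued rather than lost. Simplified and renamed demo_*; the real sync.c also handles LXC_SYNC_ERROR and reports sequence mismatches.

#include <unistd.h>

//wake the peer by queueing the sequence number in the socket buffer
static int demo_sync_wake(int fd, int sequence)
{
	int sync = sequence;

	if (write(fd, &sync, sizeof(sync)) != sizeof(sync))
		return -1;
	return 0;
}

//block until the peer's sequence number arrives
static int demo_sync_wait(int fd, int sequence)
{
	int sync = -1;

	if (read(fd, &sync, sizeof(sync)) != sizeof(sync))
		return -1;
	//accept only the step we were told to wait for
	return sync == sequence ? 0 : -1;
}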
1. lxc_setup (child-side setup, very important) --------------------------------
/* Setup the container, ip, names, utsname, ... */
→1.lxc_setup_rootfs_prepare_root
→→remount_all_slave
Remounts the mount points listed in this process's /proc/self/mountinfo with MS_SLAVE propagation (reading the mount info by copying /proc/self/mountinfo via sendfile), as sketched below.
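The essence of that propagation change fits in one call (requires CAP_SYS_ADMIN inside a mount namespace; a sketch of the effect, whereas the LXC implementation walks mountinfo entry by entry):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* recursively mark everything below / as a slave mount, so mount
	 * events in the container no longer propagate back to the host */
	if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
		perror("mount MS_SLAVE");
		return 1;
	}
	return 0;
}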
→→ ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
ultimately runs the hook commands via exec
→→ ret = lxc_mount_rootfs(conf);
//initialize the lxc_storage struct, which holds the root filesystem information
→→→ 1)bdev = storage_init(conf);
/* When lxc is mounting a rootfs, then src will be the "lxc.rootfs.path" value,
* dest will be the mount dir (i.e. "<libdir>/lxc") When clone or create is
* doing so, then dest will be "<lxcpath>/<lxcname>/rootfs", since we may need
* to rsync from one to the other.
*/
struct lxc_storage {
//
const struct lxc_storage_ops *ops;
const char *type;
char *src;
char *dest;
char *mntopts;
/* Turn the following into a union if need be. */
/* lofd is the open fd for the mounted loopback file. */
int lofd;
/* index for the connected nbd device. */
int nbd_idx;
int flags;
};
static const struct lxc_storage_type bdevs[] = {
{ .name = "dir", .ops = &dir_ops, },
{ .name = "zfs", .ops = &zfs_ops, },
{ .name = "lvm", .ops = &lvm_ops, },
{ .name = "rbd", .ops = &rbd_ops, },
{ .name = "btrfs", .ops = &btrfs_ops, },
{ .name = "overlay", .ops = &ovl_ops, },
{ .name = "overlayfs", .ops = &ovl_ops, },
{ .name = "loop", .ops = &loop_ops, },
{ .name = "nbd", .ops = &nbd_ops, },
};
→→→→q = storage_query(conf);
q ends up being bdevs[0], the "dir" lxc_storage_type
→→→→allocates an lxc_storage on the heap, copying q's data
→→→ 2)ret = bdev->ops->mount(bdev);
Calls dir_mount, which mounts the root filesystem (really just a directory) using a bind mount.
On success it logs: Mounted "/data/media/build_lxc/containers/android1/rootfs" on "/data/media/build_lxc/lib/lxc/rootfs"
Yet /data/media/build_lxc/lib/lxc/rootfs looks empty from outside, because we are still in the parent namespace; the mount view is only visible via nsenter -t pid -m sh (see the bind-mount sketch below).
→→→ 3)storage_put(bdev);
frees the memory
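The core of dir_mount is a single bind mount; with the paths from the log above it is roughly this (error handling trimmed, run as root; a sketch, not dir_mount itself):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	const char *src = "/data/media/build_lxc/containers/android1/rootfs";
	const char *dst = "/data/media/build_lxc/lib/lxc/rootfs";

	/* make the rootfs directory appear at the target mount dir */
	if (mount(src, dst, "none", MS_BIND, NULL) < 0) {
		perror("bind mount");
		return 1;
	}
	return 0;
}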
→2.setup_utsname
lxc_conf->utsname is NULL, so nothing is actually set.
→3.lxc_setup_keyring
the kernel key-retention service
/* Try to allocate a new session keyring for the container to prevent
 * information leaks.
 */
keyctl(KEYCTL_JOIN_SESSION_KEYRING)
//child-side NIC setup
→4.lxc_setup_network_in_child_namespaces
//sets the MAC address, IP address, and so on
→→lxc_network_setup_in_child_namespaces_common
→5.mount_autodev
Creates a dev directory under the rootfs mounted in the container process and mounts a tmpfs (in-memory filesystem) there, used among other things for logs.
After /dev is mounted, a pts directory is created beneath it.
→6.lxc_mount_auto_mounts
corresponds to lxc.mount.auto in the config file
/* Do automatic mounts (mainly /proc and /sys), but exclude those that
 * need to wait until other stuff has finished.
 */
Mainly mounts /proc and /sys, though when I debugged it nothing seemed to get mounted here?
https://zhuanlan.zhihu.com/p/633338117
→7.setup_mount
lxc_conf->rootfs is NULL, so in practice this does nothing.
→8.setup_mount_entries
corresponds to lxc.mount.entry in the config file; mainly bind-maps some host directories into the container process's mnt namespace (hereafter simply: mounted into the container's root filesystem)
→→make_anonymous_mount_file
creates an in-memory file and writes the lxc.mount.entry mount information into it
→→mount_file_entries
performs the mounts listed above
→9.lxc_mount_auto_mounts
mounts the cgroups
→10.lxc_fill_autodev
creates some device nodes under /dev via the mknod function:
static const struct lxc_device_node lxc_devices[] = {
{ "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
{ "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
{ "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
{ "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
{ "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
{ "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
};
→11.lxc_setup_console
ttydir corresponds to lxc.tty.dir in the config file; it names the directory under /dev in which the tty devices are created later.
static int lxc_setup_console(const struct lxc_rootfs *rootfs,
const struct lxc_terminal *console, char *ttydir)
{
if (!ttydir)
return lxc_setup_dev_console(rootfs, console);
return lxc_setup_ttydir_console(rootfs, console, ttydir);
}
→→lxc_setup_ttydir_console
1) create the ttydir directory
2) create the console device under ttydir
ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
ret = mknod(lxcpath, S_IFREG | 0000, 0);
3) if the rootfs/dev/console device node exists, perform a lazy unmount
ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
if (ret < 0 || (size_t)ret >= sizeof(path))
return -1;
if (file_exists(path)) {
ret = lxc_unstack_mountpoint(path, false);
}
4) create the rootfs/dev/console device node
ret = mknod(path, S_IFREG | 0000, 0);
5) bind the host's console to the container's console
/* bind mount console->name to 'rootfs/dev/<ttydir>/console' */
e.g. Mounted "/dev/pts/1" onto "/data/media/build_lxc/lib/lxc/rootfs/dev/lxc/console"
6) bind the container's /dev/<ttydir>/console onto /dev/console
/* bind mount '/dev/<ttydir>/console' to '/dev/console' */
→12.lxc_setup_dev_symlinks
creates symlinks under /dev:
/dev/fd -> /proc/self/fd
/dev/stderr -> /proc/self/fd/2
/dev/stdin -> /proc/self/fd/0
/dev/stdout -> /proc/self/fd/1
→13.lxc_create_tmp_proc_mount
mounts the host's /proc onto the container's /proc
→14.lxc_setup_rootfs_switch_root
//if the rootfs is ramfs-based (which does not support pivot_root)
if (detect_ramfs_rootfs())
return lxc_chroot(rootfs);
//normally this path is taken
return lxc_pivot_root(rootfs->mount);
→→lxc_pivot_root
After this succeeds, nsenter with the -m flag fails with: nsenter: exec sh: Too many symbolic links encountered. Why?
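For context, the pivot_root sequence can be sketched like this: modern LXC pivots the new root onto itself (the pivot_root(2) man page allows new_root and put_old to be the same directory), so no temporary old-root directory is needed, then lazily detaches the old root. A simplified sketch to run inside a fresh mount namespace, not lxc_pivot_root verbatim:

#define _GNU_SOURCE
#include <sys/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

static int pivot_into(const char *new_root)
{
	/* new_root must itself be a mount point (e.g. the bind mount above) */
	if (chdir(new_root) < 0)
		return -1;

	/* pivot the root onto itself: "." becomes the new root, and the
	 * old root is stacked on top of the same directory */
	if (syscall(SYS_pivot_root, ".", ".") < 0)
		return -1;

	/* detach the old root lazily */
	if (umount2(".", MNT_DETACH) < 0)
		return -1;

	return chdir("/");
}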
-----------------------------------------------------------------
→15.lxc_setup_boot_id
→16.lxc_setup_devpts
//unmount the pre-existing /dev/pts mount point
(void)umount2("/dev/pts", MNT_DETACH);
//mount a new devpts instance
for (ret = -1, opts = mntopt_sets; opts && *opts; opts++) {
/* mount new devpts instance */
ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, *opts);
if (ret == 0)
break;
}
/* Remove any pre-existing /dev/ptmx file. */
ret = remove("/dev/ptmx");
/* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
ret = mknod("/dev/ptmx", S_IFREG | 0000, 0);
/* Main option: bind mount /dev/pts/ptmx onto the dummy /dev/ptmx. */
ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
//on failure it logs: Failed to bind mount "/dev/pts/ptmx" to "/dev/ptmx"
/* Remove the dummy /dev/ptmx file we created above. */
ret = remove("/dev/ptmx");
/* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
→17.lxc_create_ttys
→→ret = lxc_allocate_ttys(conf);
creates as many tty pairs as lxc.tty.max in the config specifies, e.g. /dev/pts/0
→→ret = lxc_send_ttys_to_parent(handler);
sends the created tty device information to the parent process
→→lxc_setup_ttys
e.g. Bind mounted "/dev/pts/0" onto "/dev/lxc/tty1"
2. A few other steps --------------------------------
ret = set_stdfds(handler->conf->console.slave);
/* If we mounted a temporary proc, then unmount it now. */
tmp_proc_unmount(handler->conf);
//the payload child becomes a session leader (daemon-like)
setsid();
ret = lxc_sync_barrier_parent(handler, LXC_SYNC_CGROUP_LIMITS);
ret = clearenv();
By default no custom new_uid or new_gid are configured, so they are assigned:
new_uid = LXC_INVALID_UID;
new_gid = LXC_INVALID_GID;
3. Starting the init program --------------------------------
Finally the process calls the start op, which uses execvp to launch the init program.
handler->ops->start(handler, handler->data);
static struct lxc_operations start_ops = {
.start = start,
.post_start = post_start
};
static int start(struct lxc_handler *handler, void* data)
{
struct start_args *arg = data;
NOTICE("Exec'ing \"%s\"", arg->argv[0]);
execvp(arg->argv[0], arg->argv);
SYSERROR("Failed to exec \"%s\"", arg->argv[0]);
return 0;
}
static int post_start(struct lxc_handler *handler, void* data)
{
struct start_args *arg = data;
NOTICE("Started \"%s\" with pid \"%d\"", arg->argv[0], handler->pid);
return 0;
}
3-2-3) Monitoring the container
→→→→Key function 3: lxc_poll (the container event loop)
Uses the epoll mechanism to listen for container events, such as the init process crashing.
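Conceptually the loop looks like this: block the interesting signals, receive them via the signalfd created in setup_signal_fd, and multiplex with epoll. An illustrative sketch only; the real lxc_poll watches more fds and its signal handler does more than exit the loop.

#include <signal.h>
#include <sys/epoll.h>
#include <sys/signalfd.h>
#include <unistd.h>

static int poll_signals(void)
{
	sigset_t mask;
	sigemptyset(&mask);
	sigaddset(&mask, SIGCHLD);
	sigprocmask(SIG_BLOCK, &mask, NULL);   /* deliver via signalfd instead */

	int sfd = signalfd(-1, &mask, SFD_CLOEXEC);
	int epfd = epoll_create1(EPOLL_CLOEXEC);

	struct epoll_event ev = { .events = EPOLLIN, .data.fd = sfd };
	epoll_ctl(epfd, EPOLL_CTL_ADD, sfd, &ev);

	for (;;) {
		struct epoll_event out;
		if (epoll_wait(epfd, &out, 1, -1) <= 0)
			continue;

		struct signalfd_siginfo si;
		read(out.data.fd, &si, sizeof(si));
		if (si.ssi_signo == SIGCHLD)
			break;                 /* e.g. the container's init exited */
	}
	close(sfd);
	close(epfd);
	return 0;
}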
Part II: lxc-attach
Creates the lxc_container object
struct lxc_container *c = lxc_container_new(my_args.name, my_args.lxcpath[0]);
Calls the lxc_container object's attach method, i.e. lxcapi_attach
ret = c->attach(c, lxc_attach_run_shell, NULL, &attach_options, &pid);
lxcapi_attach
→lxc_attach
//check whether the namespace feature is supported
ret = access("/proc/self/ns", X_OK);
//get the pid of the container's init process (implemented underneath via unix domain socket communication; see the sketch below)
init_pid = lxc_cmd_get_init_pid(name, lxcpath);
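How such a command reaches the monitor can be sketched as follows: the client connects to the abstract unix "command" socket logged earlier (lxcpath/name/command, with a leading NUL byte since it is abstract) and exchanges a small request/response. The helper below is invented for illustration; see commands.c for the real lxc_cmd protocol.

#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

static int connect_command_socket(const char *lxcpath, const char *name)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX };
	int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
	if (fd < 0)
		return -1;

	/* abstract socket: sun_path[0] stays '\0', the name follows it */
	snprintf(addr.sun_path + 1, sizeof(addr.sun_path) - 1,
		 "%s/%s/command", lxcpath, name);

	/* for abstract sockets the address length must cover exactly
	 * the leading NUL plus the name, not the whole struct */
	socklen_t len = offsetof(struct sockaddr_un, sun_path) + 1 +
			strlen(addr.sun_path + 1);

	if (connect(fd, (struct sockaddr *)&addr, len) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}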