tool命令各个击破

零、通用代码

1.命令行解析

解析方法

lxc_arguments_parse:注意部分参数是通过各个命令工具源码中的my_parser(struct lxc_arguments *args, int c, char *arg)方法解析的
confile.c中封装了配置项的set/get方法

lxc-start命令重要的参数:

  • 1)lxcpath:即容器配置文件的根目录

lxcpath可以通过-P参数指定,一般在编译时通过configure脚本的--with-config-path参数指定,最终在编译时通过宏定义-DLXCPATH传递给目标文件。见lxc_arguments_parse方法,如果lxcpath在编译时指定,则解析如下:

    /* If no lxcpaths were given, use default */
    if (!args->lxcpath_cnt) {
        ret = lxc_arguments_lxcpath_add(
            args, lxc_get_global_config_item("lxc.lxcpath"));
        if (ret < 0)
            return ret;
    }

//lxc_get_global_config_item位于lxccontainer.c中
//lxc_get_global_config_item再调用initutils.c中的lxc_global_config_value从宏中获取lxcpath参数
  • 2)args->argv:即容器要执行的目标程序

数据结构

lxc_arguments
→argv:存储命令的启动参数,如lxc-start -- /init,即是/init.

日志位置及等级设定

--logfile ./lxc_attach.log --logpriority trace

自定义lxc配置项

https://www.cnblogs.com/lisperl/archive/2012/04/16/2451215.html
对于lxc-start命令,配置项是通过-s参数以KEY=VALUE的形式指定的,在my_parser方法中实现。

    case 's':
        return lxc_config_define_add(&defines, arg);

2.日志初始化

lxc_log_init
可知日志的默认等级为ERROR

/* Excerpt: log subsystem initialization. The default priority is
 * LXC_LOG_LEVEL_ERROR; it is overridden only when log->level is set
 * (e.g. via --logpriority on the command line). Body elided in the notes. */
int lxc_log_init(struct lxc_log *log)
{
    int ret;
    int lxc_priority = LXC_LOG_LEVEL_ERROR;
    ...
    if (log->level)
        lxc_priority = lxc_log_priority_to_int(log->level);
}

3.日志打印

看来作者对宏定义情有独钟,日志接口大量借助宏生成

/* Per-level logging macro: capture the call site into a locinfo struct,
 * then forward format + varargs to the generated LXC_INFO() helper.
 * do/while(0) makes the multi-statement macro safe in if/else bodies. */
#define INFO(format, ...) do {                      \
    struct lxc_log_locinfo locinfo = LXC_LOG_LOCINFO_INIT;      \
    LXC_INFO(&locinfo, format, ##__VA_ARGS__);          \
} while (0)

__lxc_unused static inline void LXC_##LEVEL(struct lxc_log_locinfo* locinfo,    \
                       const char* format, ...) 

一、lxc-start

对于lxc-start在命令行解析之前有一段初始化配置文件数据结构的代码:

static struct lxc_list defines;
...
//初始化双向链表
lxc_list_init(&defines);

1.创建容器

调用lxccontainer.c的lxc_container_new方法

记录config_path

c->config_path = strdup(configpath);

set_config_filename:将config文件路径保存到c->configfile

lxc_conf初始化

    if (file_exists(c->configfile) && !lxcapi_load_config(c, NULL)) {
        fprintf(stderr, "Failed to load config for %s\n", name);
        goto err;
    }

//注意lxcapi_load_config这个方法,是宏定义的函数。
WRAP_API_1(bool, lxcapi_load_config, const char *)  

/* Generate a one-argument public API wrapper "fnname" around do_##fnname().
 * If the global current_config is unset and the container has a loaded
 * config, it is installed for the duration of the call and reset to NULL
 * afterwards (reset_config tracks whether this wrapper installed it). */
#define WRAP_API_1(rettype, fnname, t1)                 \
static rettype fnname(struct lxc_container *c, t1 a1)           \
{                                   \
    rettype ret;                            \
    bool reset_config = false;                  \
                                    \
    if (!current_config && c && c->lxc_conf) {           \
        current_config = c->lxc_conf;                \
        reset_config = true;                    \
    }                               \
                                    \
    ret = do_##fnname(c, a1);                   \
    if (reset_config)                       \
        current_config = NULL;                  \
                                    \
    return ret;                         \
}

//lxcapi_load_config最后会调用do_lxcapi_load_config方法
/* Load the container's config file @fname into c->lxc_conf, allocating
 * the conf structure on first use.
 * Returns true on success, false on allocation or parse failure.
 * NOTE(review): the "_locked" suffix suggests the caller must already hold
 * the container lock — confirm at call sites. */
static bool load_config_locked(struct lxc_container *c, const char *fname)
{
    if (!c->lxc_conf)
        c->lxc_conf = lxc_conf_init();

    if (!c->lxc_conf)
        return false;

    /* Parse each config item from the file into c->lxc_conf. */
    if (lxc_config_read(fname, c->lxc_conf, false) != 0)
        return false;

    c->lxc_conf->name = c->name;
    return true;
}

//lxc_config_read会调用parse_line方法
static int parse_line(char *buffer, void *data){
...
//返回config_jump_table数组中的lxc_config_t结构体,定义了各个参数的get/set/clr方法
config = lxc_get_config(key);
    if (!config) {
        ERROR("Unknown configuration key \"%s\"", key);
        goto on_error;
    }
    ret = config->set(key, value, plc->conf, NULL);
...
}

/* Dispatch entry for one configuration key: the key name plus the
 * set/get/clr callbacks looked up by lxc_get_config() in parse_line(). */
struct lxc_config_t {
    char *name;        /* config key, e.g. "lxc.net" */
    config_set_cb set; /* store a parsed value into lxc_conf */
    config_get_cb get; /* read the current value back */
    config_clr_cb clr; /* clear/reset this key */
};
  • lxc.net.N.xx
    注意网络配置可能会有多组,通过lxc_network_add方法追加。

函数指针赋值

...
c->start = lxcapi_start;

//默认以守护进程的方式创建容器
c->daemonize = true;
...

2.校验容器

如容器有没有在运行等

3.运行容器

调用容器lxccontainer.c的start方法,也就是lxcapi_start方法,接着调用:
→do_lxcapi_start

3-1)创建容器进程

3-1-1)一些初始化工作

→→ongoing_create
→→container_mem_lock
→→lxc_init_handler(重要)

创建并初始化lxc_handler结构体
//记录lxc-start命令进程,transient临时的意思。因为最终lxc-start进程fork出容器的守护进程之后是会被杀死的。
handler->transient_pid = lxc_raw_getpid();
//创建一对相互通信的socket,存储在lxc_handler->state_socket_pair结构体中,容器进程就是通过这对socket告知lxc-start进程容器状态的。
socketpair
//Created abstract unix socket "/data/media/build_lxc/containers/android1/command"
lxc_cmd_init

拿到lxc-start的启动程序

→→container_mem_unlock

3-1-2)创建守护进程(容器进程)

→→pid_first = fork();

→/* first parent */ (父进程,即lxc-start命令进程。)

/* Set to NULL because we don't want father unlink
 * the PID file, child will do the free and unlink.
*/
c->pidfile = NULL;

/*
Wait for container to tell us whether it started successfully.
*/
//lxc-start进程等待pid_first fork完pid_second进程就结束了
started = wait_on_daemonized_start(handler, pid_first);

//lxc-start进程至此就结束了
return started;
→/* first child */ (子进程)
//设置子进程的进程名 (失败了,待解决。涉及非常底层的系统调用)
setproctitle

→→pid_second = fork();

/* second parent */
释放内存并退出当前进程

/* second child */
//修改进程的工作目录为/
chdir 

//关闭守护进程fd,并忽略部分fd。
lxc_check_inherited

/* redirect std{in,out,err} to /dev/null */
null_stdfds();

/* become session leader */
ret = setsid();

注意自此开始,都是守护进程在执行!!!!!

虽然ps | grep lxc仍然显示lxc-start进程,但是这个名字为lxc-start的进程实际为第二个fork的守护进程。(正是因为更改子进程setproctitle方法执行失败了,才会有这个误解!)

//lxc-start通过-p参数指定了pid写入路径,就记录守护进程的pid。
if (c->pidfile) {
...
}

3-2)守护进程(容器进程)start逻辑

→→lxc_start
→→→__lxc_start

3-2-1)初始化容器(主CGROUP逻辑处理)

→→→→重要方法1:lxc_init(初始化容器)

//获取守护进程的进程id
handler->monitor_pid = lxc_raw_getpid();

//读取守护进程的status状态
status_fd = open("/proc/self/status", O_RDONLY | O_CLOEXEC);

//初始化安全框架
lsm_init

//设置云机状态为STARTING,就是利用前面的pair socket来实现进程通讯。
ret = lxc_set_state(name, handler, STARTING);

//设置各种LXC_XX环境变量
...

//运行pre-start的hook脚本
ret = run_lxc_hooks(name, "pre-start", conf, NULL);

//利用signalfd函数创建信号文件描述符,后面lxc_poll方法里会用到。
setup_signal_fd

//默认配置为空,do nothing
lxc_terminal_setup
goto 3-2-1-0)

//默认配置为空,do nothing
lxc_terminal_map_ids
3-2-1-0)lxc_terminal_setup

→lxc_terminal_create
a.利用openpty函数创建伪终端对
ret = openpty(&terminal->master, &terminal->slave, NULL, NULL, NULL);

b.获取从设备的设备名称
ret = ttyname_r(terminal->slave, terminal->name, sizeof(terminal->name));
获取后会存储在lxc_conf的console变量中

c.关闭伪终端对

//?
lxc_terminal_peer_default
goto on_success

→lxc_terminal_create_ringbuf
默认是没有buffer的

3-2-1-1)初始化cgroup

→→→→→cgroup_init(cgroup.c)
函数返回cgroup_ops结构体存储于handler->cgroup_ops,后面会用到。注意cgroup_ops结构体很重要,存储cgroup相关的各种信息和操作函数。
handler->cgroup_ops = cgroup_init(handler->conf);

→→→→→→cgfsng_ops_init(cgfsng.c)

使用动态内存创建cgfsng_ops结构体

-----TAG_A-START-----
→→→→→→→cg_init

  • TAG_A-1) CGROUP2初始化

→→→→→→→→cg_unified_init (v2走此方法)
→→→→→→→→→unified_cgroup_hierarchy (判断是否是unified模式,也就是cgroup2)
会检查DEFAULT_CGROUP_MOUNTPOINT也就是/sys/fs/cgroup是什么类型的cgroup

  • TAG_A-2) CGROUP1初始化

→→→→→→→→cg_hybrid_init (非v2才会走此方法)
→→→→→→→→→get_existing_subsystems (获取存在的cgroup子系统)
解析/proc/self/cgroup文件
→→→→→→→→→lxc_cgfsng_print_basecg_debuginfo (打印cgroup子系统基本信息)
结果为:

6:schedtune:/
5:cpuset:/
4:cpuacct:/
3:cpu:/
2:blkio:/
1:memory:/
0::/

→→→→→→→→→解析/proc/self/mountinfo文件
会过滤掉cgroup2类型
cg_hybrid_get_controllers (判断hierarchy是否在DEFAULT_CGROUP_MOUNTPOINT下)
//TODO,未完待续
-----TAG_A-END-----

cgfsng_ops结构体函数指针赋值

3-2-1-2)初始化安全框架
ret = lxc_read_seccomp_config(conf);
ret = lsm_process_prepare(conf, handler->lxcpath);
3-2-1-3)记录守护进程status文件fd

handler->monitor_status_fd = move_fd(status_fd);
记录/proc/self/status文件的fd值,后面会用到。

-------------------------------------------------------------------------------------------------

lxc_init方法之后会调用cgroup_ops的monitor相关方法:
1) monitor_create
创建monitor group。假如容器name为android1,则此group名称为lxc.monitor.android1.

具体见create_cgroup_tree方法:

if (payload) {
//记录cgroup的fd值,后面会用到。
        h->cgfd_con = lxc_open_dirfd(path);
        if (h->cgfd_con < 0)
            return log_error_errno(false, errno, "Failed to open %s", path);
        h->container_full_path = move_ptr(path);
    } else {
        h->cgfd_mon = lxc_open_dirfd(path);
        if (h->cgfd_mon < 0)
            return log_error_errno(false, errno, "Failed to open %s", path);
        h->monitor_full_path = move_ptr(path);
    }

2) monitor_enter
进入monitor cgroup,将容器进程的pid写入monitor cgroup的cgroup.procs文件里
3)monitor_delegate_controllers

-------------------------------------------------------------------------------------------------

//TODO,未完待续

3-2-2)孵化容器(主NAMESPACE逻辑处理)

→→→→重要方法2:lxc_spawn(孵化容器,再次创建子进程payload)
方法说明:

/* lxc_spawn() performs crucial setup tasks and clone()s the new process which
 * exec()s the requested container binary.
 * Note that lxc_spawn() runs in the parent namespaces. Any operations performed
 * right here should be double checked if they'd pose a security risk. (For
 * example, any {u}mount() operations performed here will be reflected on the
 * host!)
 */
//创建socket对,存于handler->sync_sock。(用于monitor与payload进程同步)
lxc_sync_init

//创建socket对,存于handler->data_sock

//解析命名空间克隆标记

resolve_clone_flags
主要有
Cloned CLONE_NEWNS
Cloned CLONE_NEWPID
Cloned CLONE_NEWUTS
Cloned CLONE_NEWIPC
Cloned CLONE_NEWNET

/* Decide which namespaces the container should unshare and accumulate the
 * corresponding CLONE_NEW* bits into handler->ns_clone_flags.
 * Skips LXC_NS_USER when no id map is configured, LXC_NS_NET when no
 * network is requested, and LXC_NS_CGROUP when the kernel lacks cgroup
 * namespace support (/proc/self/ns/cgroup missing).
 * Returns 0. (The quoted original omitted the return statement entirely,
 * which is undefined behavior if the caller uses the result; it also
 * declared an unused local `conf`, removed here.) */
int resolve_clone_flags(struct lxc_handler *handler)
{
    int i;

    for (i = 0; i < LXC_NS_MAX; i++) {
        /* Only clone a user namespace when an id mapping exists. */
        if (i == LXC_NS_USER && lxc_list_empty(&handler->conf->id_map))
            continue;

        /* Only clone a network namespace when networking is configured. */
        if (i == LXC_NS_NET && lxc_requests_empty_network(handler))
            continue;

        /* Cloning a cgroup namespace requires kernel support. */
        if (i == LXC_NS_CGROUP && !cgns_supported())
            continue;

        handler->ns_clone_flags |= ns_info[i].clone_flag;
    }

    return 0;
}
//创建payload cgroup
payload_create

//share_ns为false
//调用内联汇编进行进程的克隆(lxc-start 2次fork出来的子进程再clone个新进程)
//注意:clone的子进程会有诸多的命名空间,所以和父进程之间利用socket跨命名空间通讯。
handler->pid = lxc_raw_clone_cb(do_start, handler,
                        CLONE_PIDFD | handler->ns_on_clone_flags,
                        &handler->pidfd);

## 到此为止,lxc-start命令创建的进程太多了,为了便于区分,后面统一规定:守护进程叫做monitor进程,由monitor进程clone出来的这个进程叫做payload进程##

lxc_raw_clone_cb函数:
-------------------------------------------------------------------------------------------------

/* Clone a new process via lxc_raw_clone() and run @fn(@args) in the child.
 * Returns the child's pid to the parent, or -1 on failure; the child never
 * returns (it exits with fn's return value). @pidfd receives a pid file
 * descriptor when CLONE_PIDFD is included in @flags (see the call site
 * passing &handler->pidfd). */
pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, unsigned long flags,
               int *pidfd)
{
    pid_t pid;

    pid = lxc_raw_clone(flags, pidfd);
    if (pid < 0)
        return -1;

    /* Use _exit(), not exit():
     * exit() is not thread-safe and might mess with the parent's signal
     * handlers and other stuff when exec() fails.
     */
    if (pid == 0)
        _exit(fn(args));

    return pid;
}

那么子进程payload进程将执行do_start(void *data)方法

-------------------------------------------------------------------------------------------------

3-2-2-1)monitor进程继续执行
后面执行的都是monitor进程!!

//记录clone出来的进程id到环境变量
ret = setenv("LXC_PID", pidstr, 1);

/* lxc_try_preserve_namespaces: open /proc/@pid/ns/@ns for each namespace
 * specified in ns_clone_flags.
 * Return true on success, false on failure.
 */
//下面方法会将子进程/proc/payload_pid/ns下的命名空间文件打开,并且fd存储于handler->nsfd数组中,后面会用到。
lxc_try_preserve_namespaces

//关闭父进程的sync_sock[0]
lxc_sync_fini_child

//send status file descriptor to child process
//解除子进程的socket阻塞点
lxc_abstract_unix_send_fds(handler->data_sock[0], &handler->monitor_status_fd, 1, NULL, 0)

//解除子进程的socket阻塞点。(原理:payload进程利用socket recv阻塞,这里发个消息解除阻塞。)
ret = lxc_sync_wake_child(handler, LXC_SYNC_STARTUP);

//父进程阻塞,等待被子进程唤醒,此时最好看看子进程唤醒父进程之前干了些啥。
ret = lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE);

//下面就是对cgroup的一些操作
//管理payload进程,将其进程id写入lxc.payload.android1的cgroup.procs文件中。
cgroup_ops->payload_enter(cgroup_ops, handler)
...

//创建网络并通知子进程

//设置容器网络命名空间的网络ID
ret = lxc_netns_set_nsid(handler->nsfd[LXC_NS_NET]);

//1.创建网络
→ret = lxc_create_network(handler);
//调用netlink接口进行网络的配置
→→lxc_create_network_priv(handler);
//将创建的网络设备移至payload进程的网络命名空间
→→lxc_network_move_created_netdev_priv(handler);

//2.将网络信息发送给子进程,为什么要发?      
→ret = lxc_network_send_to_child(handler);
//设置/proc文件系统(可选)
setup_proc_filesystem

//
setup_resource_limits

/* Tell the child to continue its initialization. We'll get
* LXC_SYNC_CGROUP when it is ready for us to setup cgroups.
*/
ret = lxc_sync_barrier_child(handler, LXC_SYNC_POST_CONFIGURE);

//
setup_resource_limits

//阻塞等待子进程通知设置CGROUP
ret = lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE);

...

//打印init程序已经启动的日志
ret = handler->ops->post_start(handler, handler->data);

//设置容器状态为RUNNING
ret = lxc_set_state(name, handler, RUNNING);

//关闭sync_sock对,至此,父子进程交互完毕。
lxc_sync_fini
3-2-2-2)payload进程干了些啥

注意clone之后的子进程获取pid不能直接调用系统的getpid方法,要调用lxc_raw_getpid方法!

//关闭sync_sock[1]
lxc_sync_fini_parent(handler);

//阻塞进程,等待父进程通过socket唤醒。
lxc_abstract_unix_recv_fds

//防止进程变成僵尸进程,如果父进程被kill掉,当前进程也会被kill.
lxc_set_death_signal

//设置线程掩码
pthread_sigmask

//关闭pinfd文件
close_prot_errno_disarm(handler->pinfd)

//阻塞进程,等待父进程通过socket唤醒。
ret = lxc_sync_wait_parent(handler, LXC_SYNC_STARTUP);

/* Tell the parent task it can begin to configure the container and wait
* for it to finish.
*/
ret = lxc_sync_barrier_parent(handler, LXC_SYNC_CONFIGURE);

//接收父进程发送的网络设备名称
lxc_network_recv_from_parent

/* Ask father to setup cgroups and wait for him to finish. */
ret = lxc_sync_barrier_parent(handler, LXC_SYNC_CGROUP);

//将子进程加入新建CGROUP命名空间
//子进程clone时已经带了CLONE_NEWCGROUP的flag,怎么这里还要再次unshare?源码作了详尽的说明
ret = unshare(CLONE_NEWCGROUP);

注意父子进程有多处socket阻塞异步通信的地方:
1)(子阻塞等待)lxc_abstract_unix_recv_fds -- (父解阻塞唤醒)lxc_abstract_unix_send_fds
2)(子阻塞等待)lxc_sync_wait_parent LXC_SYNC_STARTUP -- (父解阻塞唤醒)lxc_sync_wake_child LXC_SYNC_STARTUP
3)(父阻塞等待)lxc_sync_wait_child LXC_SYNC_CONFIGURE -- (子解阻塞唤醒并等待)lxc_sync_barrier_parent LXC_SYNC_CONFIGURE
(父解阻塞唤醒)lxc_sync_barrier_child LXC_SYNC_POST_CONFIGURE -- 解除上面子进程的LXC_SYNC_CONFIGURE的等待。
...

/* Synchronization sequence points exchanged over handler->sync_sock between
 * the monitor process and the payload process (consumed by __sync_barrier,
 * lxc_sync_wake_child, lxc_sync_wait_parent, ...). */
enum {
    LXC_SYNC_STARTUP,        /* initial handshake right after clone */
    LXC_SYNC_CONFIGURE,      /* parent may start configuring the container */
    LXC_SYNC_POST_CONFIGURE, /* configuration finished */
    LXC_SYNC_CGROUP,         /* child asks parent to set up cgroups */
    LXC_SYNC_CGROUP_UNSHARE,
    LXC_SYNC_CGROUP_LIMITS,
    LXC_SYNC_READY_START,
    LXC_SYNC_RESTART,
    LXC_SYNC_POST_RESTART,
    LXC_SYNC_ERROR = -1 /* Used to report errors from another process */
};

//此方法通过LXC_SYNC_XX枚举实现精密的流程交互与推进
//代码追踪时通过这些枚举定位会更快
/* Barrier primitive over the sync socketpair: wake the peer with @sequence,
 * then block until the peer answers with @sequence + 1.
 * Returns 0 on success, -1 on failure. */
static int __sync_barrier(int fd, int sequence)
{
    int ret;

    ret = __sync_wake(fd, sequence);
    if (ret != 0)
        return -1;

    return __sync_wait(fd, sequence + 1);
}

即使父进程抢先运行,先发送阻塞唤醒,子进程起来之后阻塞等待依然会收到数据解除阻塞!

1.lxc_setup(子进程设置,非常重要)--------------------------------
/* Setup the container, ip, names, utsname, ... */

→1.lxc_setup_rootfs_prepare_root
→→remount_all_slave
将本进程的/proc/self/mountinfo下的挂载点重新以MS_SLAVE的传播类型挂载,利用sendfile技术拷贝读取/proc/self/mountinfo文件里的挂载信息。

→→ ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
底层调用exec执行命令

→→ ret = lxc_mount_rootfs(conf);
//初始化lxc_storage结构体,存储根文件系统相关信息。
→→→ 1)bdev = storage_init(conf);

/* When lxc is mounting a rootfs, then src will be the "lxc.rootfs.path" value,
 * dest will be the mount dir (i.e. "<libdir>/lxc")  When clone or create is
 * doing so, then dest will be "<lxcpath>/<lxcname>/rootfs", since we may need
 * to rsync from one to the other.
 */
struct lxc_storage {
    //
    const struct lxc_storage_ops *ops;
    const char *type;
    char *src;
    char *dest;
    char *mntopts;
    /* Turn the following into a union if need be. */
    /* lofd is the open fd for the mounted loopback file. */
    int lofd;
    /* index for the connected nbd device. */
    int nbd_idx;
    int flags;
};

/* Registry of storage backends; storage_query() matches the container's
 * storage type against .name. The notes above show "dir" (bdevs[0]) being
 * selected in the traced run. */
static const struct lxc_storage_type bdevs[] = {
    { .name = "dir",       .ops = &dir_ops,   },
    { .name = "zfs",       .ops = &zfs_ops,   },
    { .name = "lvm",       .ops = &lvm_ops,   },
    { .name = "rbd",       .ops = &rbd_ops,   },
    { .name = "btrfs",     .ops = &btrfs_ops, },
    { .name = "overlay",   .ops = &ovl_ops,   },
    { .name = "overlayfs", .ops = &ovl_ops,   },
    { .name = "loop",      .ops = &loop_ops,  },
    { .name = "nbd",       .ops = &nbd_ops,   },
};

→→→→q = storage_query(conf);
最终q为bdevs[0]也就是dir类型lxc_storage_type结构体

→→→→动态内存创建lxc_storage,会复制q的数据。

→→→ 2)ret = bdev->ops->mount(bdev);
调用dir_mount方法,利用bind挂载技术挂载根文件系统(实际是个目录)
最终挂载成功提示: Mounted "/data/media/build_lxc/containers/android1/rootfs" on "/data/media/build_lxc/lib/lxc/rootfs"

但是查看/data/media/build_lxc/lib/lxc/rootfs下为空,因为现在处于父进程命名空间,要通过nsenter -t pid -m sh才能看到挂载视图。

→→→ 3)storage_put(bdev);
释放内存

→2.setup_utsname
lxc_conf->utsname为NULL,所以实际没有设置。

→3.lxc_setup_keyring
密钥保留服务
/* Try to allocate a new session keyring for the container to prevent

  • information leaks.
    */
    keyctl(KEYCTL_JOIN_SESSION_KEYRING)

//子进程网卡设置
→4.lxc_setup_network_in_child_namespaces

//设置mac地址,ip地址等信息。
→→lxc_network_setup_in_child_namespaces_common

→5.mount_autodev
在容器进程挂载好的rootfs下创建一个dev目录,然后挂载一个tmpfs内存文件系统,用于保存日志。
挂载好/dev之后,又在其下创建了一个pts目录。

→6.lxc_mount_auto_mounts
对应配置文件里的lxc.mount.auto
/* Do automatic mounts (mainly /proc and /sys), but exclude those that

→7.setup_mount
lxc_conf->rootfs为NULL,实际什么也没做。

→8.setup_mount_entries
对应配置文件里的lxc.mount.entry,主要是把宿主机里的一些目录映射挂载到容器进程mnt命名空间里。(后面统一简称挂载到容器根文件系统)

→→make_anonymous_mount_file
创建内存文件,并将lxc.mount.entry的挂载信息写入文件。

→→mount_file_entries
执行上面的挂载映射

→9.lxc_mount_auto_mounts
用于挂载cgroups

→9.lxc_fill_autodev
在/dev下创建一些设备文件节点,利用mknod函数。

/* Device nodes created under the container's /dev by lxc_fill_autodev()
 * via mknod(): { name, mode, major, minor }. */
static const struct lxc_device_node lxc_devices[] = {
    { "full",    S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
    { "null",    S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
    { "random",  S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
    { "tty",     S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
    { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
    { "zero",    S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
};

→10.lxc_setup_console
ttydir即对应配置文件lxc.tty.dir,用于后续在/dev/ttydir目录下创建tty设备。

/* Set up the container console. When lxc.tty.dir (@ttydir) is configured,
 * create the console under /dev/<ttydir>/ and bind it back to /dev/console;
 * otherwise wire the console directly at /dev/console. */
static int lxc_setup_console(const struct lxc_rootfs *rootfs,
                 const struct lxc_terminal *console, char *ttydir)
{
    if (ttydir)
        return lxc_setup_ttydir_console(rootfs, console, ttydir);

    return lxc_setup_dev_console(rootfs, console);
}

→→lxc_setup_ttydir_console
1)创建ttydir目录
2)ttydir目录下创建console设备

ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
ret = mknod(lxcpath, S_IFREG | 0000, 0);

3)如果rootfs/dev/console设备节点存在,执行lazy卸载。

ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
    if (ret < 0 || (size_t)ret >= sizeof(path))
        return -1;

    if (file_exists(path)) {
        ret = lxc_unstack_mountpoint(path, false);
    }

4)创建rootfs/dev/console设备节点
ret = mknod(path, S_IFREG | 0000, 0);

5)将宿主机里console绑定到容器里的console
/* bind mount console->name to 'rootfs/dev/<ttydir>/console' */
Mounted "/dev/pts/1" onto "/data/media/build_lxc/lib/lxc/rootfs/dev/lxc/console"

6) 将容器里的/dev/&lt;ttydir&gt;/console挂载到/dev/console
/* bind mount '/dev/<ttydir>/console' to '/dev/console' */

→11.lxc_setup_dev_symlinks
在/dev下创建链接文件

/dev/fd -> /proc/self/fd
/dev/stderr -> /proc/self/fd/2
/dev/stdin -> /proc/self/fd/0
/dev/stdout -> /proc/self/fd/1

→12.lxc_create_tmp_proc_mount
将宿主机里的/proc目录挂载到容器/proc下

→13.lxc_setup_rootfs_switch_root

    //如果rootfs是基于ramfs的(不支持pivot_root)
    if (detect_ramfs_rootfs())
        return lxc_chroot(rootfs);
    //会走这里
    return lxc_pivot_root(rootfs->mount);

→→lxc_pivot_root

执行成功之后,通过nsenter 指定-m参数报错:nsenter: exec sh: Too many symbolic links encountered,why?
-----------------------------------------------------------------
→14.lxc_setup_boot_id

→15.lxc_setup_devpts

//卸载原有/dev/pts挂载点
(void)umount2("/dev/pts", MNT_DETACH);

//挂载新devpts实例
    for (ret = -1, opts = mntopt_sets; opts && *opts; opts++) {
        /* mount new devpts instance */
        ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, *opts);
        if (ret == 0)
            break;
    }

/* Remove any pre-existing /dev/ptmx file. */
    ret = remove("/dev/ptmx");

/* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
    ret = mknod("/dev/ptmx", S_IFREG | 0000, 0);

/* Fallback option: create symlink /dev/ptmx -> /dev/pts/ptmx  */
    ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
 Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

/* Remove the dummy /dev/ptmx file we created above. */
    ret = remove("/dev/ptmx");

/* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
    ret = symlink("/dev/pts/ptmx", "/dev/ptmx");

→16.lxc_create_ttys
→→ret = lxc_allocate_ttys(conf);
根据配置文件lxc.tty.max创建若干个tty对,如/dev/pts/0

→→ret = lxc_send_ttys_to_parent(handler);
将创建tty设备信息发送给父进程

→→lxc_setup_ttys
Bind mounted "/dev/pts/0" onto "/dev/lxc/tty1"

2.一些其它执行--------------------------------

ret = set_stdfds(handler->conf->console.slave);

/* If we mounted a temporary proc, then unmount it now. */
tmp_proc_unmount(handler->conf);

//子进程payload变成守护进程
setsid();

ret = lxc_sync_barrier_parent(handler, LXC_SYNC_CGROUP_LIMITS);

ret = clearenv();

默认是没有自定义new_uid与new_gid,会赋值
new_uid = LXC_INVALID_UID;
new_gid = LXC_INVALID_GID;

3.启动init程序--------------------------------
最终进程调用start方法,start调用execvp方法启动init程序。

handler->ops->start(handler, handler->data);

/* Operations handed to __lxc_start(): start() exec()s the container's init
 * binary in the payload process, post_start() logs the resulting pid. */
static struct lxc_operations start_ops = {
    .start = start,
    .post_start = post_start
};

/* Final step in the payload process: replace this process image with the
 * container's init binary via execvp(). On success this never returns.
 * NOTE(review): when execvp() fails, only SYSERROR is logged and 0 is still
 * returned — the caller cannot detect the failure from the return value. */
static int start(struct lxc_handler *handler, void* data)
{
    struct start_args *arg = data;

    NOTICE("Exec'ing \"%s\"", arg->argv[0]);

    execvp(arg->argv[0], arg->argv);
    /* Only reached if execvp() failed. */
    SYSERROR("Failed to exec \"%s\"", arg->argv[0]);
    return 0;
}

/* Invoked after a successful spawn: log the init program's name and the
 * pid it runs as (handler->pid). Always returns 0. */
static int post_start(struct lxc_handler *handler, void* data)
{
    struct start_args *arg = data;

    NOTICE("Started \"%s\" with pid \"%d\"", arg->argv[0], handler->pid);
    return 0;
}
3-2-3) 容器监听

→→→→重要方法3:lxc_poll(监听容器)
利用epoll机制监听容器的事件,如init崩溃事件。

二、lxc-attach

创建lxc_container对象
struct lxc_container *c = lxc_container_new(my_args.name, my_args.lxcpath[0]);
调用lxc_container对象的attach方法,也即是lxcapi_attach方法
ret = c->attach(c, lxc_attach_run_shell, NULL, &attach_options, &pid);
lxcapi_attach
→lxc_attach

//判断是否支持namespace特性
ret = access("/proc/self/ns", X_OK);
//获取容器的初始进程的pid(底层是通过域socket通讯实现)
init_pid = lxc_cmd_get_init_pid(name, lxcpath);

0 条评论

发表回复

您的电子邮箱地址不会被公开。