Home [快速笔记] proc文件系统的pid细节
Post
Cancel

[快速笔记] proc文件系统的pid细节

/proc/pid

每一位Linux开发者都应该了解过proc文件系统,它位于/proc,通常的一个用处就是列出pid:

caturra@DESKTOP-P4DDLG1:/proc$ ls -F
1/     181/  264/  641/  acpi/      devices      ioports      kpageflags  net@          sysvipc/
120/   192/  271/  648/  buddyinfo  diskstats    irq/         loadavg     pagetypeinfo  thread-self@
121/   193/  288/  663/  bus/       dma          kallsyms     locks       partitions    timer_list
122/   194/  4/    682/  cgroups    driver/      kcore        mdstat      schedstat     tty/
123/   201/  630/  7/    cmdline    execdomains  key-users    meminfo     self@         uptime
128/   202/  631/  724/  config.gz  filesystems  keys         misc        softirqs      version
1335/  203/  632/  781/  consoles   fs/          kmsg         modules     stat          vmallocinfo
1396/  209/  639/  8/    cpuinfo    interrupts   kpagecgroup  mounts@     swaps         vmstat
1577/  246/  640/  9/    crypto     iomem        kpagecount   mtrr        sys/          zoneinfo

其中,数字开头的是当前系统用到的pid。这里使用-F可以区分出它们是一个目录,内部以文件形式展示对应pid的相关信息。

/proc/tid

需要注意的是,如果一个进程(线程组)有多个线程,那么/proc只展示不重复的tgid。

虽然对/proc的读操作并不显示线程目录,但仍可通过/proc/[tid]的方式进行访问。继续上述示例:

# 上述ls示例中并没有"6/"
caturra@DESKTOP-P4DDLG1:/proc$ cd 6
# 但是可以定位
caturra@DESKTOP-P4DDLG1:/proc/6$

caturra@DESKTOP-P4DDLG1:/proc/6$ sudo ls
arch_status      cpuset   limits      net            root          stat            uid_map
attr             cwd      loginuid    ns             sched         statm           wchan
auxv             environ  map_files   oom_adj        schedstat     status
cgroup           exe      maps        oom_score      sessionid     syscall
clear_refs       fd       mem         oom_score_adj  setgroups     task
cmdline          fdinfo   mountinfo   pagemap        smaps         timens_offsets
comm             gid_map  mounts      personality    smaps_rollup  timers
coredump_filter  io       mountstats  projid_map     stack         timerslack_ns

/proc/pid/task/tid

每个进程(线程组)所持有的线程ID可以通过/proc/[pid]/task/得到。

实际上,前面示例的数字6是init产生的一个线程:

caturra@DESKTOP-P4DDLG1:/proc$ cd 1
caturra@DESKTOP-P4DDLG1:/proc/1$ cd task
caturra@DESKTOP-P4DDLG1:/proc/1/task$ ls -F
1/  6/

目录内容与上述/proc/6一致,这里就不列出了。


各种目录细节见man 5 proc。

proc root

为什么ls /proc明明只列出tgid,/proc/[tid]却仍能访问?因为/proc是伪文件系统(pseudo-filesystem),其内容在每次操作时动态生成;并且目录列举(readdir)与路径查找(lookup)由两套独立的实现负责,二者覆盖的ID范围并不相同。

通过strace ls /proc跟踪系统调用,节选关键部分:

# 打开/proc,fd对应3
openat(AT_FDCWD, "/proc", O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_DIRECTORY) = 3
# 获取fd=3的信息
newfstatat(3, "", {st_mode=S_IFDIR|0555, st_size=0, ...}, AT_EMPTY_PATH) = 0
# 类似调用readdir(3),参考man 2 getdents:
#   These are not the interfaces you are interested in.  Look at
#   readdir(3) for the POSIX-conforming C library interface.
getdents64(3, 0x559cc061f4f0 /* 63 entries */, 32768) = 1832
getdents64(3, 0x559cc061f4f0 /* 0 entries */, 32768) = 0
close(3)                                = 0
newfstatat(1, "", {st_mode=S_IFCHR|0620, st_rdev=makedev(0x88, 0), ...}, AT_EMPTY_PATH) = 0
write(1, "1    8\t\tcgroups    crypto     ex"..., 1161 # 后面都是标准输出,略

其中getdents64()内部途经iterate_dir()并调用VFS接口file->f_op->iterate_shared:

/*
 * Excerpt (parts elided with "// ..."): directory-iteration helper that
 * dispatches to the file's ->iterate_shared() callback.
 *
 * In the visible code, res starts as -ENOTDIR and control jumps to the
 * "out" label (not shown in this excerpt) when the file has no
 * ->iterate_shared operation.
 */
int iterate_dir(struct file *file, struct dir_context *ctx)
{
        struct inode *inode = file_inode(file);
        int res = -ENOTDIR;

        /* No directory iterator on this file: leave with res == -ENOTDIR. */
        if (!file->f_op->iterate_shared)
                goto out;

        // ...

        /* Only iterate when IS_DEADDIR(inode) is false. */
        if (!IS_DEADDIR(inode)) {
                /* Start from the file's current offset. */
                ctx->pos = file->f_pos;
                /* For the /proc root this callback is proc_root_readdir(). */
                res = file->f_op->iterate_shared(file, ctx);
        }

        // ...
}

iterate_shared接口参考内核文档说明:called when the VFS needs to read the directory contents when filesystem supports concurrent dir iterators.

通过接口找实现,proc的根目录对应fop如下:

/*
 * The root /proc directory is special, as it has the
 * <pid> directories. Thus we don't use the generic
 * directory handling functions for that..
 */
static const struct file_operations proc_root_operations = {
        .read            = generic_read_dir,
        /* Reached from iterate_dir() through file->f_op->iterate_shared. */
        .iterate_shared  = proc_root_readdir,
        .llseek         = generic_file_llseek,
};

/*
 * ->iterate_shared handler for the /proc root directory.
 *
 * While ctx->pos is below FIRST_PROCESS_ENTRY it first calls
 * proc_readdir(); a result <= 0 from that call is returned as-is.
 * Otherwise ctx->pos is moved to FIRST_PROCESS_ENTRY and the result of
 * proc_pid_readdir() is returned.
 */
static int proc_root_readdir(struct file *file, struct dir_context *ctx)
{
        // pos 0 is ".", pos 1 is "..", and [2, FIRST_PROCESS_ENTRY) covers
        // the fixed (non-process) entries.
        // The same ctx is carried across the repeated calls, so ctx->pos
        // records how far the listing has progressed.
        if (ctx->pos < FIRST_PROCESS_ENTRY) {
                int error = proc_readdir(file, ctx);
                if (unlikely(error <= 0))
                        return error;
                ctx->pos = FIRST_PROCESS_ENTRY;
        }

        // Key part: the per-process entries are produced here.
        return proc_pid_readdir(file, ctx);
}

/*
 * for the /proc/ directory itself, after non-process stuff has been done
 *
 * Excerpt (parts elided with "// ..."). Starting from the tgid encoded in
 * ctx->pos, walks the tasks returned by next_tgid() and emits one entry
 * per visible task, named after its decimal tgid. Every return path shown
 * here returns 0.
 */
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
        struct tgid_iter iter;
        struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
        struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb);
        loff_t pos = ctx->pos;

        // ...

        /* Directory position and tgid differ by the constant TGID_OFFSET. */
        iter.tgid = pos - TGID_OFFSET;
        iter.task = NULL;
        // Key loop: iterate over tgids via next_tgid() until no task is left.
        for (iter = next_tgid(ns, iter);
             iter.task;
             iter.tgid += 1, iter = next_tgid(ns, iter)) {
                /* Buffer for the name: 10 characters plus the terminating NUL. */
                char name[10 + 1];
                unsigned int len;

                cond_resched();
                /* Tasks failing the HIDEPID_INVISIBLE permission check are not listed. */
                if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
                        continue;

                // Key step: the entry name is the tgid printed in decimal.
                len = snprintf(name, sizeof(name), "%u", iter.tgid);
                ctx->pos = iter.tgid + TGID_OFFSET;
                if (!proc_fill_cache(file, ctx, name, len,
                                     proc_pid_instantiate, iter.task, NULL)) {
                        /*
                         * proc_fill_cache() returned false: stop early.
                         * NOTE(review): presumably next_tgid() hands back
                         * iter.task with a reference held, released here.
                         */
                        put_task_struct(iter.task);
                        return 0;
                }
        }
        /* Loop ran out of tasks: park pos at PID_MAX_LIMIT + TGID_OFFSET. */
        ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
        return 0;
}

其核心函数就是遍历tgid的循环,通过复杂的proc_fill_cache()建立dentry。

因此结论是ls /proc只显示tgid。

proc lookup

访问/proc/[tgid or tid]需要VFS调用proc root的lookup接口。

/*
 * proc root can do almost nothing..
 *
 * Inode operations for the /proc root; only lookup and getattr are set.
 */
static const struct inode_operations proc_root_inode_operations = {
        /* Name resolution under /proc goes through proc_root_lookup(). */
        .lookup	        = proc_root_lookup,
        .getattr        = proc_root_getattr,
};

/*
 * ->lookup for the /proc root: try the name as a numeric id first via
 * proc_pid_lookup(), then fall back to proc_lookup().
 */
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
{
        /*
         * proc_pid_lookup() returns ERR_PTR(-ENOENT) on its failure paths,
         * so NULL can only come from proc_pid_instantiate(); in that case
         * the lookup ends here with NULL.
         * NOTE(review): presumably NULL means the dentry was instantiated.
         */
        if (!proc_pid_lookup(dentry, flags))
                return NULL;

        /* Any non-NULL result falls through to the generic proc lookup. */
        return proc_lookup(dir, dentry, flags);
}

/*
 * Resolve a numeric path component directly under /proc to a task.
 *
 * Returns ERR_PTR(-ENOENT) when the name does not parse as an integer,
 * when no task is found for it in the pid namespace, or when the
 * hide_pid permission check fails; otherwise returns the result of
 * proc_pid_instantiate().
 */
struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
{
        struct task_struct *task;
        unsigned tgid;
        struct proc_fs_info *fs_info;
        struct pid_namespace *ns;
        struct dentry *result = ERR_PTR(-ENOENT);

        // The name "tgid" is misleading: this is just the dentry's path
        // component converted to an integer.
        tgid = name_to_int(&dentry->d_name);
        /* ~0U is treated as "not a valid number". */
        if (tgid == ~0U)
                goto out;

        fs_info = proc_sb_info(dentry->d_sb);
        ns = fs_info->pid_ns;
        rcu_read_lock();
        // Find the task using the integer parsed from the path component.
        task = find_task_by_pid_ns(tgid, ns);
        /* Take a reference inside the RCU section so task can be used after unlock. */
        if (task)
                get_task_struct(task);
        rcu_read_unlock();
        if (!task)
                goto out;

        /* Limit procfs to only ptraceable tasks */
        if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) {
                if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
                        goto out_put_task;
        }

        result = proc_pid_instantiate(dentry, task, NULL);
out_put_task:
        /* Drop the reference taken by get_task_struct() above. */
        put_task_struct(task);
out:
        return result;
}

/*
 * Must be called under rcu_read_lock().
 *
 * Returns the task obtained by passing find_pid_ns(nr, ns) to pid_task()
 * with type PIDTYPE_PID.
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "find_task_by_pid_ns() needs rcu_read_lock() protection");
        // Key point: the lookup uses PIDTYPE_PID, not PIDTYPE_TGID.
        return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}

这里面的ID类型是有所区别的:

/* Kinds of ID that a pid lookup can be resolved against. */
enum pid_type
{
        /* Type used by find_task_by_pid_ns(). */
        PIDTYPE_PID,
        /* Thread-group id type. */
        PIDTYPE_TGID,
        /* NOTE(review): presumably process-group id - confirm. */
        PIDTYPE_PGID,
        /* NOTE(review): presumably session id - confirm. */
        PIDTYPE_SID,
        /* Last enumerator; NOTE(review): presumably a count, not a real type. */
        PIDTYPE_MAX,
};

从内核视角来看,线程与进程同样作为task_struct,均具有唯一的pid(不同于用户视角)。

因此结论是:访问/proc/[tid]实际上是按内核视角的pid(PIDTYPE_PID)查找对应的task。

附:glibc的「坑」

glibc提供系统调用的封装getpid()和gettid(),这其实在命名上非常令人混淆。getpid()实际返回内核视角的tgid,而gettid()实际返回内核视角的pid。从上面的pid_type也可以知道,内核根本不关心所谓的tid。

getpid() returns the process ID (PID) of the calling process…From a kernel perspective, the PID (which is shared by all of the threads in a multithreaded process) is sometimes also known as the thread group ID (TGID).

References

Linux source code (v6.4.8) – Bootlin
proc(5) – Linux manual page
getpid(2) – Linux manual page
getdents(2) – Linux manual page
Overview of the Linux Virtual File System – The Linux Kernel Archives

This post is licensed under CC BY 4.0 by the author.
Contents