[快速笔记] proc 文件系统的 pid 细节

/proc/pid

每一位 Linux 开发者都应该了解过 proc 文件系统，它位于 /proc，通常的一个用处就是列出 pid：

caturra@DESKTOP-P4DDLG1:/proc$ ls -F
1/     181/  264/  641/  acpi/      devices      ioports      kpageflags  net@          sysvipc/
120/   192/  271/  648/  buddyinfo  diskstats    irq/         loadavg     pagetypeinfo  thread-self@
121/   193/  288/  663/  bus/       dma          kallsyms     locks       partitions    timer_list
122/   194/  4/    682/  cgroups    driver/      kcore        mdstat      schedstat     tty/
123/   201/  630/  7/    cmdline    execdomains  key-users    meminfo     self@         uptime
128/   202/  631/  724/  config.gz  filesystems  keys         misc        softirqs      version
1335/  203/  632/  781/  consoles   fs/          kmsg         modules     stat          vmallocinfo
1396/  209/  639/  8/    cpuinfo    interrupts   kpagecgroup  mounts@     swaps         vmstat
1577/  246/  640/  9/    crypto     iomem        kpagecount   mtrr        sys/          zoneinfo

其中，以数字命名的条目对应当前系统中正在使用的 pid。这里使用 -F 可以看出它们都是目录，目录内部以文件形式展示对应 pid 的相关信息。

/proc/tid

需要注意的是，如果一个进程（线程组）有多个线程，那么 /proc 只展示不重复的 tgid。

虽然列出 /proc 目录（例如 ls）时并不显示线程对应的目录，但仍可通过 /proc/[tid] 这样的路径直接访问。继续上述示例：

# 上述ls示例中并没有"6/"
caturra@DESKTOP-P4DDLG1:/proc$ cd 6
# 但是可以定位
caturra@DESKTOP-P4DDLG1:/proc/6$

caturra@DESKTOP-P4DDLG1:/proc/6$ sudo ls
arch_status      cpuset   limits      net            root          stat            uid_map
attr             cwd      loginuid    ns             sched         statm           wchan
auxv             environ  map_files   oom_adj        schedstat     status
cgroup           exe      maps        oom_score      sessionid     syscall
clear_refs       fd       mem         oom_score_adj  setgroups     task
cmdline          fdinfo   mountinfo   pagemap        smaps         timens_offsets
comm             gid_map  mounts      personality    smaps_rollup  timers
coredump_filter  io       mountstats  projid_map     stack         timerslack_ns

/proc/pid/task/tid

每个进程（线程组）所持有的线程 ID 可以通过 /proc/[pid]/task/ 得到。

实际上，前面示例的数字 6 是 init 产生的一个线程：

caturra@DESKTOP-P4DDLG1:/proc$ cd 1
caturra@DESKTOP-P4DDLG1:/proc/1$ cd task
caturra@DESKTOP-P4DDLG1:/proc/1/task$ ls -F
1/  6/

/proc/1/task/6 的目录内容与上述 /proc/6 一致，这里就不列出了。

各种目录细节见 man 5 proc。

proc root

为什么 ls /proc 明明只输出 tgid，/proc/[tid] 却又能访问到？因为 /proc 是伪文件系统（pseudo-filesystem），目录列表与路径查找的结果都是在每次操作时动态生成的，而且两者走的是不同的内核接口。

通过 strace ls /proc 跟踪系统调用，节选关键部分：

# 打开/proc，fd对应3
openat(AT_FDCWD, "/proc", O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_DIRECTORY) = 3
# 获取fd=3的信息
newfstatat(3, "", {st_mode=S_IFDIR|0555, st_size=0, ...}, AT_EMPTY_PATH) = 0
# 类似调用readdir(3)，参考man 2 getdents：
#   These are not the interfaces you are interested in.  Look at
#   readdir(3) for the POSIX-conforming C library interface.
getdents64(3, 0x559cc061f4f0 /* 63 entries */, 32768) = 1832
getdents64(3, 0x559cc061f4f0 /* 0 entries */, 32768) = 0
close(3)                                = 0
newfstatat(1, "", {st_mode=S_IFCHR|0620, st_rdev=makedev(0x88, 0), ...}, AT_EMPTY_PATH) = 0
write(1, "1    8\t\tcgroups    crypto     ex"..., 1161 # 后面都是标准输出，略

其中 getdents64() 内部途经 iterate_dir() 并调用 VFS 接口 file->f_op->iterate_shared。

/*
 * Excerpt of the VFS helper that reads directory entries for an open
 * directory by delegating to the filesystem's ->iterate_shared callback.
 * Sections marked "// ..." are elided from the original, including the
 * "out" label that the goto below jumps to.
 */
int iterate_dir(struct file *file, struct dir_context *ctx)
{
        struct inode *inode = file_inode(file);
        /* Initial result; still in effect when the goto below is taken. */
        int res = -ENOTDIR;

        /* No ->iterate_shared callback: bail out with the initial result. */
        if (!file->f_op->iterate_shared)
                goto out;

        // ...

        if (!IS_DEADDIR(inode)) {
                /* Start the iteration at the file's current offset. */
                ctx->pos = file->f_pos;
                /* Filesystem-specific directory read (for /proc: proc_root_readdir). */
                res = file->f_op->iterate_shared(file, ctx);
        }

        // ...
}

iterate_shared 接口参考内核文档说明：called when the VFS needs to read the directory contents when filesystem supports concurrent dir iterators.

通过接口找实现，proc 的根目录对应 fop 如下：

/*
 * The root /proc directory is special, as it has the
 * <pid> directories. Thus we don't use the generic
 * directory handling functions for that..
 *
 * File operations table for the /proc root directory: directory
 * iteration is routed to proc_root_readdir via ->iterate_shared.
 */
static const struct file_operations proc_root_operations = {
        .read            = generic_read_dir,
        .iterate_shared  = proc_root_readdir,
        .llseek         = generic_file_llseek,
};

/*
 * ->iterate_shared implementation for the /proc root directory.
 * Emits the non-process entries first (via proc_readdir), then hands
 * over to proc_pid_readdir for the per-process directories.
 */
static int proc_root_readdir(struct file *file, struct dir_context *ctx)
{
        // pos 0 stands for ".", pos 1 for "..", and [2, FIRST_PROCESS_ENTRY)
        // for a set of fixed (non-process) entries.
        // The same ctx is carried through the whole listing, across repeated calls.
        if (ctx->pos < FIRST_PROCESS_ENTRY) {
                int error = proc_readdir(file, ctx);
                /* Zero or negative result from proc_readdir: return it as-is. */
                if (unlikely(error <= 0))
                        return error;
                /* Non-process part is done; continue from the first process slot. */
                ctx->pos = FIRST_PROCESS_ENTRY;
        }

        // The key step: list the per-process (tgid) directories.
        return proc_pid_readdir(file, ctx);
}

/* for the /proc/ directory itself, after non-process stuff has been done */
/*
 * Walks thread-group IDs starting from the position stored in ctx and emits
 * one directory entry per visible tgid. Always returns 0 in the code shown;
 * the section marked "// ..." is elided from the original.
 */
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
        struct tgid_iter iter;
        struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
        struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb);
        loff_t pos = ctx->pos;

        // ...

        /* Directory positions are tgid values shifted by TGID_OFFSET. */
        iter.tgid = pos - TGID_OFFSET;
        iter.task = NULL;
        // The key traversal: advance tgid by tgid until next_tgid() yields no task.
        for (iter = next_tgid(ns, iter);
             iter.task;
             iter.tgid += 1, iter = next_tgid(ns, iter)) {
                /* Room for up to 10 characters plus the terminating NUL. */
                char name[10 + 1];
                unsigned int len;

                cond_resched();
                /* Skip tasks the permission check (HIDEPID_INVISIBLE) rejects. */
                if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
                        continue;

                // The key operation: the entry name is the tgid printed in decimal.
                len = snprintf(name, sizeof(name), "%u", iter.tgid);
                /* Record the position of this entry before trying to emit it. */
                ctx->pos = iter.tgid + TGID_OFFSET;
                if (!proc_fill_cache(file, ctx, name, len,
                                     proc_pid_instantiate, iter.task, NULL)) {
                        /*
                         * Entry could not be emitted: drop the task reference
                         * (presumably taken by next_tgid() - confirm) and stop.
                         */
                        put_task_struct(iter.task);
                        return 0;
                }
        }
        /* Traversal finished: park the position past the last possible tgid. */
        ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
        return 0;
}

其核心就是遍历 tgid 的 for 循环：每找到一个 tgid，就通过（内部较为复杂的）proc_fill_cache() 建立对应的 dentry。

因此结论是 ls /proc 只显示 tgid。

proc lookup

访问 /proc/[tgid or tid] 需要 VFS 调用 proc root 的 lookup 接口。

/*
 * proc root can do almost nothing..
 *
 * Inode operations table for the /proc root: name lookups go through
 * proc_root_lookup, attribute queries through proc_root_getattr.
 */
static const struct inode_operations proc_root_inode_operations = {
        .lookup	        = proc_root_lookup,
        .getattr        = proc_root_getattr,
};

/*
 * ->lookup implementation for the /proc root directory.
 * Tries the name as a process/thread ID first; if proc_pid_lookup()
 * returns NULL the lookup ends here with NULL, otherwise it falls back
 * to proc_lookup() for the remaining entries.
 */
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
{
        /* A NULL result from the pid lookup is returned to the caller as NULL. */
        if (!proc_pid_lookup(dentry, flags))
                return NULL;

        /* Any non-NULL result (e.g. an ERR_PTR) leads to the generic lookup. */
        return proc_lookup(dir, dentry, flags);
}

/*
 * Resolves a numeric path component under /proc to a task and instantiates
 * its dentry. Returns the result of proc_pid_instantiate() on success, or
 * ERR_PTR(-ENOENT) when the name is not a number, no task matches, or the
 * hide_pid permission check fails.
 */
struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
{
        struct task_struct *task;
        unsigned tgid;
        struct proc_fs_info *fs_info;
        struct pid_namespace *ns;
        struct dentry *result = ERR_PTR(-ENOENT);

        // The name "tgid" is misleading: it is just the dentry's path
        // component converted to an integer.
        tgid = name_to_int(&dentry->d_name);
        /* ~0U from name_to_int() is treated as failure here. */
        if (tgid == ~0U)
                goto out;

        fs_info = proc_sb_info(dentry->d_sb);
        ns = fs_info->pid_ns;
        rcu_read_lock();
        // Find the task using the integer obtained from the path component.
        task = find_task_by_pid_ns(tgid, ns);
        /* Take a reference inside the RCU section so the task can be used after it. */
        if (task)
                get_task_struct(task);
        rcu_read_unlock();
        if (!task)
                goto out;

        /* Limit procfs to only ptraceable tasks */
        if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) {
                if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
                        goto out_put_task;
        }

        result = proc_pid_instantiate(dentry, task, NULL);
out_put_task:
        /* Drop the reference taken above, on both success and failure paths. */
        put_task_struct(task);
out:
        return result;
}

/*
 * Must be called under rcu_read_lock().
 *
 * Looks up the pid numbered nr in namespace ns and returns the task
 * attached to it as PIDTYPE_PID (the result of pid_task()).
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
        /* Lockdep warning if the caller does not hold the RCU read lock. */
        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "find_task_by_pid_ns() needs rcu_read_lock() protection");
        // Key point: the lookup uses PIDTYPE_PID, not PIDTYPE_TGID.
        return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}

这里面的 ID 类型是有所区别的：

/*
 * Kinds of ID a pid can be looked up as. The per-value notes below are
 * inferred from the names - confirm against the kernel headers.
 */
enum pid_type
{
        PIDTYPE_PID,    /* presumably the per-task ID (what userspace calls a tid) */
        PIDTYPE_TGID,   /* presumably the thread-group ID (what userspace calls a pid) */
        PIDTYPE_PGID,   /* presumably the process-group ID */
        PIDTYPE_SID,    /* presumably the session ID */
        PIDTYPE_MAX,    /* presumably a count/sentinel, not a real type */
};

从内核视角来看，线程与进程同样作为 task_struct，均具有唯一的 pid（不同于用户视角）。

因此结论是：访问 /proc/[tid] 时，内核按 PIDTYPE_PID（即内核视角的 pid）查找对应的 task，所以线程对应的目录同样可以被定位到。

附：glibc 的「坑」

glibc 提供系统调用的封装 getpid() 和 gettid()，这其实在命名上非常令人混淆。getpid() 实际返回内核视角的 tgid，而 gettid() 实际返回内核视角的 pid。从上面的 pid_type 也可以知道，内核根本不关心所谓的 tid。

getpid() returns the process ID (PID) of the calling process…From a kernel perspective, the PID (which is shared by all of the threads in a multithreaded process) is sometimes also known as the thread group ID (TGID).

References

Linux source code (v6.4.8) – Bootlin
proc(5) – Linux manual page
getpid(2) – Linux manual page
getdents(2) – Linux manual page
Overview of the Linux Virtual File System – The Linux Kernel Archives