Skip to content

Commit d8b47d0

Browse files
committed
Merge patch series "pidfs: file handle preliminaries"
Christian Brauner <[email protected]> says: This reworks the inode number allocation for pidfs in order to support file handles properly. Recently we received a patchset that aims to enable file handle encoding and decoding via name_to_handle_at(2) and open_by_handle_at(2). A crucial step in the patch series is how to go from inode number to struct pid without leaking information into unprivileged contexts. The issue is that in order to find a struct pid, the pid number in the initial pid namespace must be encoded into the file handle via name_to_handle_at(2). This can be used by containers using a separate pid namespace to learn what the pid number of a given process in the initial pid namespace is. While this is a weak information leak, it could be used in various exploits and in general is an ugly wart in the design. To solve this problem a new way is needed to look up a struct pid based on the inode number allocated for that struct pid. The other part is to remove the custom inode number allocation on 32bit systems that is also an ugly wart that should go away. So, a new scheme is used that I was discussing with Tejun some time back. A cyclic ida is used for the lower 32 bits and the high 32 bits are used for the generation number. This gives a 64 bit inode number that is unique on both 32 bit and 64 bit. The lower 32 bit number is recycled slowly and can be used to look up struct pids. * patches from https://lore.kernel.org/r/[email protected]: pidfs: support FS_IOC_GETVERSION pidfs: remove 32bit inode number handling pidfs: rework inode number allocation Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Christian Brauner <[email protected]>
2 parents 40384c8 + 230536f commit d8b47d0

File tree

3 files changed

+95
-48
lines changed

3 files changed

+95
-48
lines changed

fs/pidfs.c

+86-41
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,79 @@
2323
#include "internal.h"
2424
#include "mount.h"
2525

26+
static DEFINE_IDR(pidfs_ino_idr);
27+
28+
static u32 pidfs_ino_upper_32_bits = 0;
29+
30+
#if BITS_PER_LONG == 32
31+
/*
32+
* On 32 bit systems the lower 32 bits are the inode number and
33+
* the higher 32 bits are the generation number. The starting
34+
* value for the inode number and the generation number is one.
35+
*/
36+
static u32 pidfs_ino_lower_32_bits = 1;
37+
38+
static inline unsigned long pidfs_ino(u64 ino)
39+
{
40+
return lower_32_bits(ino);
41+
}
42+
43+
/* On 32 bit the generation number is the upper 32 bits. */
44+
static inline u32 pidfs_gen(u64 ino)
45+
{
46+
return upper_32_bits(ino);
47+
}
48+
49+
#else
50+
51+
static u32 pidfs_ino_lower_32_bits = 0;
52+
53+
/* On 64 bit simply return ino. */
54+
static inline unsigned long pidfs_ino(u64 ino)
55+
{
56+
return ino;
57+
}
58+
59+
/* On 64 bit the generation number is 1. */
60+
static inline u32 pidfs_gen(u64 ino)
61+
{
62+
return 1;
63+
}
64+
#endif
65+
66+
/*
67+
* Construct an inode number for struct pid in a way that we can use the
68+
* lower 32bit to lookup struct pid independent of any pid numbers that
69+
* could be leaked into userspace (e.g., via file handle encoding).
70+
*/
71+
int pidfs_add_pid(struct pid *pid)
72+
{
73+
u32 upper;
74+
int lower;
75+
76+
/*
77+
* Inode numbering for pidfs starts at 2. This avoids collisions
78+
* with the root inode which is 1 for pseudo filesystems.
79+
*/
80+
lower = idr_alloc_cyclic(&pidfs_ino_idr, pid, 2, 0, GFP_ATOMIC);
81+
if (lower >= 0 && lower < pidfs_ino_lower_32_bits)
82+
pidfs_ino_upper_32_bits++;
83+
upper = pidfs_ino_upper_32_bits;
84+
pidfs_ino_lower_32_bits = lower;
85+
if (lower < 0)
86+
return lower;
87+
88+
pid->ino = ((u64)upper << 32) | lower;
89+
pid->stashed = NULL;
90+
return 0;
91+
}
92+
93+
/* The idr number to remove is the lower 32 bits of the inode. */
94+
void pidfs_remove_pid(struct pid *pid)
95+
{
96+
idr_remove(&pidfs_ino_idr, lower_32_bits(pid->ino));
97+
}
98+
2699
#ifdef CONFIG_PROC_FS
27100
/**
28101
* pidfd_show_fdinfo - print information about a pidfd
@@ -198,6 +271,14 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
198271
struct ns_common *ns_common = NULL;
199272
struct pid_namespace *pid_ns;
200273

274+
if (cmd == FS_IOC_GETVERSION) {
275+
if (!arg)
276+
return -EINVAL;
277+
278+
__u32 __user *argp = (__u32 __user *)arg;
279+
return put_user(file_inode(file)->i_generation, argp);
280+
}
281+
201282
task = get_pid_task(pid, PIDTYPE_PID);
202283
if (!task)
203284
return -ESRCH;
@@ -318,40 +399,6 @@ struct pid *pidfd_pid(const struct file *file)
318399

319400
static struct vfsmount *pidfs_mnt __ro_after_init;
320401

321-
#if BITS_PER_LONG == 32
322-
/*
323-
* Provide a fallback mechanism for 32-bit systems so processes remain
324-
* reliably comparable by inode number even on those systems.
325-
*/
326-
static DEFINE_IDA(pidfd_inum_ida);
327-
328-
static int pidfs_inum(struct pid *pid, unsigned long *ino)
329-
{
330-
int ret;
331-
332-
ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1,
333-
UINT_MAX, GFP_ATOMIC);
334-
if (ret < 0)
335-
return -ENOSPC;
336-
337-
*ino = ret;
338-
return 0;
339-
}
340-
341-
static inline void pidfs_free_inum(unsigned long ino)
342-
{
343-
if (ino > 0)
344-
ida_free(&pidfd_inum_ida, ino);
345-
}
346-
#else
347-
static inline int pidfs_inum(struct pid *pid, unsigned long *ino)
348-
{
349-
*ino = pid->ino;
350-
return 0;
351-
}
352-
#define pidfs_free_inum(ino) ((void)(ino))
353-
#endif
354-
355402
/*
356403
* The vfs falls back to simple_setattr() if i_op->setattr() isn't
357404
* implemented. Let's reject it completely until we have a clean
@@ -403,7 +450,6 @@ static void pidfs_evict_inode(struct inode *inode)
403450

404451
clear_inode(inode);
405452
put_pid(pid);
406-
pidfs_free_inum(inode->i_ino);
407453
}
408454

409455
static const struct super_operations pidfs_sops = {
@@ -429,17 +475,16 @@ static const struct dentry_operations pidfs_dentry_operations = {
429475

430476
static int pidfs_init_inode(struct inode *inode, void *data)
431477
{
478+
const struct pid *pid = data;
479+
432480
inode->i_private = data;
433481
inode->i_flags |= S_PRIVATE;
434482
inode->i_mode |= S_IRWXU;
435483
inode->i_op = &pidfs_inode_operations;
436484
inode->i_fop = &pidfs_file_operations;
437-
/*
438-
* Inode numbering for pidfs start at RESERVED_PIDS + 1. This
439-
* avoids collisions with the root inode which is 1 for pseudo
440-
* filesystems.
441-
*/
442-
return pidfs_inum(data, &inode->i_ino);
485+
inode->i_ino = pidfs_ino(pid->ino);
486+
inode->i_generation = pidfs_gen(pid->ino);
487+
return 0;
443488
}
444489

445490
static void pidfs_put_data(void *data)

include/linux/pidfs.h

+2
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,7 @@
44

55
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
66
void __init pidfs_init(void);
7+
int pidfs_add_pid(struct pid *pid);
8+
void pidfs_remove_pid(struct pid *pid);
79

810
#endif /* _LINUX_PID_FS_H */

kernel/pid.c

+7-7
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,6 @@ int pid_max = PID_MAX_DEFAULT;
6464

6565
int pid_max_min = RESERVED_PIDS + 1;
6666
int pid_max_max = PID_MAX_LIMIT;
67-
/*
68-
* Pseudo filesystems start inode numbering after one. We use Reserved
69-
* PIDs as a natural offset.
70-
*/
71-
static u64 pidfs_ino = RESERVED_PIDS;
7267

7368
/*
7469
* PID-map pages start out as NULL, they get allocated upon
@@ -158,6 +153,7 @@ void free_pid(struct pid *pid)
158153

159154
idr_remove(&ns->idr, upid->nr);
160155
}
156+
pidfs_remove_pid(pid);
161157
spin_unlock_irqrestore(&pidmap_lock, flags);
162158

163159
call_rcu(&pid->rcu, delayed_put_pid);
@@ -273,22 +269,26 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
273269
INIT_HLIST_HEAD(&pid->inodes);
274270

275271
upid = pid->numbers + ns->level;
272+
idr_preload(GFP_KERNEL);
276273
spin_lock_irq(&pidmap_lock);
277274
if (!(ns->pid_allocated & PIDNS_ADDING))
278275
goto out_unlock;
279-
pid->stashed = NULL;
280-
pid->ino = ++pidfs_ino;
276+
retval = pidfs_add_pid(pid);
277+
if (retval)
278+
goto out_unlock;
281279
for ( ; upid >= pid->numbers; --upid) {
282280
/* Make the PID visible to find_pid_ns. */
283281
idr_replace(&upid->ns->idr, pid, upid->nr);
284282
upid->ns->pid_allocated++;
285283
}
286284
spin_unlock_irq(&pidmap_lock);
285+
idr_preload_end();
287286

288287
return pid;
289288

290289
out_unlock:
291290
spin_unlock_irq(&pidmap_lock);
291+
idr_preload_end();
292292
put_pid_ns(ns);
293293

294294
out_free:

0 commit comments

Comments
 (0)