/* SPDX-License-Identifier: LGPL-2.1+ */ /*** This file is part of systemd. Copyright 2010 Lennart Poettering systemd is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. systemd is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with systemd; If not, see . ***/ #include #include #include #include #include #include #include "dirent-util.h" #include "fd-util.h" #include "fileio.h" #include "fs-util.h" #include "macro.h" #include "memfd-util.h" #include "missing.h" #include "parse-util.h" #include "path-util.h" #include "process-util.h" #include "socket-util.h" #include "stdio-util.h" #include "util.h" int close_nointr(int fd) { assert(fd >= 0); if (close(fd) >= 0) return 0; /* * Just ignore EINTR; a retry loop is the wrong thing to do on * Linux. * * http://lkml.indiana.edu/hypermail/linux/kernel/0509.1/0877.html * https://bugzilla.gnome.org/show_bug.cgi?id=682819 * http://utcc.utoronto.ca/~cks/space/blog/unix/CloseEINTR * https://sites.google.com/site/michaelsafyan/software-engineering/checkforeintrwheninvokingclosethinkagain */ if (errno == EINTR) return 0; return -errno; } int safe_close(int fd) { /* * Like close_nointr() but cannot fail. Guarantees errno is * unchanged. Is a NOP with negative fds passed, and returns * -1, so that it can be used in this syntax: * * fd = safe_close(fd); */ if (fd >= 0) { PROTECT_ERRNO; /* The kernel might return pretty much any error code * via close(), but the fd will be closed anyway. The * only condition we want to check for here is whether * the fd was invalid at all... */ assert_se(close_nointr(fd) != -EBADF); } return -1; } void safe_close_pair(int p[]) { assert(p); if (p[0] == p[1]) { /* Special case pairs which use the same fd in both * directions... */ p[0] = p[1] = safe_close(p[0]); return; } p[0] = safe_close(p[0]); p[1] = safe_close(p[1]); } void close_many(const int fds[], unsigned n_fd) { unsigned i; assert(fds || n_fd <= 0); for (i = 0; i < n_fd; i++) safe_close(fds[i]); } int fclose_nointr(FILE *f) { assert(f); /* Same as close_nointr(), but for fclose() */ if (fclose(f) == 0) return 0; if (errno == EINTR) return 0; return -errno; } FILE* safe_fclose(FILE *f) { /* Same as safe_close(), but for fclose() */ if (f) { PROTECT_ERRNO; assert_se(fclose_nointr(f) != EBADF); } return NULL; } DIR* safe_closedir(DIR *d) { if (d) { PROTECT_ERRNO; assert_se(closedir(d) >= 0 || errno != EBADF); } return NULL; } int fd_nonblock(int fd, bool nonblock) { int flags, nflags; assert(fd >= 0); flags = fcntl(fd, F_GETFL, 0); if (flags < 0) return -errno; if (nonblock) nflags = flags | O_NONBLOCK; else nflags = flags & ~O_NONBLOCK; if (nflags == flags) return 0; if (fcntl(fd, F_SETFL, nflags) < 0) return -errno; return 0; } int fd_cloexec(int fd, bool cloexec) { int flags, nflags; assert(fd >= 0); flags = fcntl(fd, F_GETFD, 0); if (flags < 0) return -errno; if (cloexec) nflags = flags | FD_CLOEXEC; else nflags = flags & ~FD_CLOEXEC; if (nflags == flags) return 0; if (fcntl(fd, F_SETFD, nflags) < 0) return -errno; return 0; } void stdio_unset_cloexec(void) { (void) fd_cloexec(STDIN_FILENO, false); (void) fd_cloexec(STDOUT_FILENO, false); (void) fd_cloexec(STDERR_FILENO, false); } _pure_ static bool fd_in_set(int fd, const int fdset[], unsigned n_fdset) { unsigned i; assert(n_fdset == 0 || fdset); for (i = 0; i < n_fdset; i++) if (fdset[i] == fd) return true; return false; } int close_all_fds(const int except[], unsigned n_except) { _cleanup_closedir_ DIR *d = NULL; struct dirent *de; int r = 0; assert(n_except == 0 || except); d = opendir("/proc/self/fd"); if (!d) { int fd; struct rlimit rl; /* When /proc isn't available (for example in chroots) * the fallback is brute forcing through the fd * table */ assert_se(getrlimit(RLIMIT_NOFILE, &rl) >= 0); for (fd = 3; fd < (int) rl.rlim_max; fd ++) { int q; if (fd_in_set(fd, except, n_except)) continue; q = close_nointr(fd); if (q < 0 && q != -EBADF && r >= 0) r = q; } return r; } FOREACH_DIRENT(de, d, return -errno) { int fd = -1, q; if (safe_atoi(de->d_name, &fd) < 0) /* Let's better ignore this, just in case */ continue; if (fd < 3) continue; if (fd == dirfd(d)) continue; if (fd_in_set(fd, except, n_except)) continue; q = close_nointr(fd); if (q < 0 && q != -EBADF && r >= 0) /* Valgrind has its own FD and doesn't want to have it closed */ r = q; } return r; } int same_fd(int a, int b) { struct stat sta, stb; pid_t pid; int r, fa, fb; assert(a >= 0); assert(b >= 0); /* Compares two file descriptors. Note that semantics are * quite different depending on whether we have kcmp() or we * don't. If we have kcmp() this will only return true for * dup()ed file descriptors, but not otherwise. If we don't * have kcmp() this will also return true for two fds of the same * file, created by separate open() calls. Since we use this * call mostly for filtering out duplicates in the fd store * this difference hopefully doesn't matter too much. */ if (a == b) return true; /* Try to use kcmp() if we have it. */ pid = getpid_cached(); r = kcmp(pid, pid, KCMP_FILE, a, b); if (r == 0) return true; if (r > 0) return false; if (errno != ENOSYS) return -errno; /* We don't have kcmp(), use fstat() instead. */ if (fstat(a, &sta) < 0) return -errno; if (fstat(b, &stb) < 0) return -errno; if ((sta.st_mode & S_IFMT) != (stb.st_mode & S_IFMT)) return false; /* We consider all device fds different, since two device fds * might refer to quite different device contexts even though * they share the same inode and backing dev_t. */ if (S_ISCHR(sta.st_mode) || S_ISBLK(sta.st_mode)) return false; if (sta.st_dev != stb.st_dev || sta.st_ino != stb.st_ino) return false; /* The fds refer to the same inode on disk, let's also check * if they have the same fd flags. This is useful to * distinguish the read and write side of a pipe created with * pipe(). */ fa = fcntl(a, F_GETFL); if (fa < 0) return -errno; fb = fcntl(b, F_GETFL); if (fb < 0) return -errno; return fa == fb; } void cmsg_close_all(struct msghdr *mh) { struct cmsghdr *cmsg; assert(mh); CMSG_FOREACH(cmsg, mh) if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) close_many((int*) CMSG_DATA(cmsg), (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int)); } bool fdname_is_valid(const char *s) { const char *p; /* Validates a name for $LISTEN_FDNAMES. We basically allow * everything ASCII that's not a control character. Also, as * special exception the ":" character is not allowed, as we * use that as field separator in $LISTEN_FDNAMES. * * Note that the empty string is explicitly allowed * here. However, we limit the length of the names to 255 * characters. */ if (!s) return false; for (p = s; *p; p++) { if (*p < ' ') return false; if (*p >= 127) return false; if (*p == ':') return false; } return p - s < 256; } int fd_get_path(int fd, char **ret) { char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)]; int r; xsprintf(procfs_path, "/proc/self/fd/%i", fd); r = readlink_malloc(procfs_path, ret); if (r == -ENOENT) /* If the file doesn't exist the fd is invalid */ return -EBADF; return r; } int move_fd(int from, int to, int cloexec) { int r; /* Move fd 'from' to 'to', make sure FD_CLOEXEC remains equal if requested, and release the old fd. If * 'cloexec' is passed as -1, the original FD_CLOEXEC is inherited for the new fd. If it is 0, it is turned * off, if it is > 0 it is turned on. */ if (from < 0) return -EBADF; if (to < 0) return -EBADF; if (from == to) { if (cloexec >= 0) { r = fd_cloexec(to, cloexec); if (r < 0) return r; } return to; } if (cloexec < 0) { int fl; fl = fcntl(from, F_GETFD, 0); if (fl < 0) return -errno; cloexec = !!(fl & FD_CLOEXEC); } r = dup3(from, to, cloexec ? O_CLOEXEC : 0); if (r < 0) return -errno; assert(r == to); safe_close(from); return to; } int acquire_data_fd(const void *data, size_t size, unsigned flags) { char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int)]; _cleanup_close_pair_ int pipefds[2] = { -1, -1 }; char pattern[] = "/dev/shm/data-fd-XXXXXX"; _cleanup_close_ int fd = -1; int isz = 0, r; ssize_t n; off_t f; assert(data || size == 0); /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more * complex than I wish it was. But here's why: * * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14. * * b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged * clients can only bump their size to a system-wide limit, which might be quite low. * * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via * /proc/self/. Unfortunately O_TMPFILE is not available on older kernels on tmpfs. * * d) Finally, we try creating a regular file in /dev/shm, which we then delete. * * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I * figure. */ if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) { /* As a special case, return /dev/null if we have been called for an empty data block */ r = open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY); if (r < 0) return -errno; return r; } if ((flags & ACQUIRE_NO_MEMFD) == 0) { fd = memfd_new("data-fd"); if (fd < 0) goto try_pipe; n = write(fd, data, size); if (n < 0) return -errno; if ((size_t) n != size) return -EIO; f = lseek(fd, 0, SEEK_SET); if (f != 0) return -errno; r = memfd_set_sealed(fd); if (r < 0) return r; r = fd; fd = -1; return r; } try_pipe: if ((flags & ACQUIRE_NO_PIPE) == 0) { if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0) return -errno; isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); if (isz < 0) return -errno; if ((size_t) isz < size) { isz = (int) size; if (isz < 0 || (size_t) isz != size) return -E2BIG; /* Try to bump the pipe size */ (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz); /* See if that worked */ isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0); if (isz < 0) return -errno; if ((size_t) isz < size) goto try_dev_shm; } n = write(pipefds[1], data, size); if (n < 0) return -errno; if ((size_t) n != size) return -EIO; (void) fd_nonblock(pipefds[0], false); r = pipefds[0]; pipefds[0] = -1; return r; } try_dev_shm: if ((flags & ACQUIRE_NO_TMPFILE) == 0) { fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500); if (fd < 0) goto try_dev_shm_without_o_tmpfile; n = write(fd, data, size); if (n < 0) return -errno; if ((size_t) n != size) return -EIO; /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */ xsprintf(procfs_path, "/proc/self/fd/%i", fd); r = open(procfs_path, O_RDONLY|O_CLOEXEC); if (r < 0) return -errno; return r; } try_dev_shm_without_o_tmpfile: if ((flags & ACQUIRE_NO_REGULAR) == 0) { fd = mkostemp_safe(pattern); if (fd < 0) return fd; n = write(fd, data, size); if (n < 0) { r = -errno; goto unlink_and_return; } if ((size_t) n != size) { r = -EIO; goto unlink_and_return; } /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */ r = open(pattern, O_RDONLY|O_CLOEXEC); if (r < 0) r = -errno; unlink_and_return: (void) unlink(pattern); return r; } return -EOPNOTSUPP; }