diff --git a/distrib/sets/lists/minix-base/mi b/distrib/sets/lists/minix-base/mi index e2ee8526a..8463f888f 100644 --- a/distrib/sets/lists/minix-base/mi +++ b/distrib/sets/lists/minix-base/mi @@ -179,6 +179,7 @@ ./etc/system.conf.d/ipc minix-base ./etc/system.conf.d/lwip minix-base ./etc/system.conf.d/random minix-base +./etc/system.conf.d/uds minix-base ./etc/system.conf.d/usb_hub minix-base ./etc/system.conf.d/usb_storage minix-base ./etc/termcap minix-base diff --git a/distrib/sets/lists/minix-man/mi b/distrib/sets/lists/minix-man/mi index 6ce70fc75..895353269 100644 --- a/distrib/sets/lists/minix-man/mi +++ b/distrib/sets/lists/minix-man/mi @@ -477,7 +477,7 @@ ./usr/man/man2/getgid.2 minix-man ./usr/man/man2/getitimer.2 minix-man ./usr/man/man2/getnucred.2 minix-man obsolete -./usr/man/man2/getpeereid.2 minix-man +./usr/man/man2/getpeereid.2 minix-man obsolete ./usr/man/man2/getpeername.2 minix-man ./usr/man/man2/getpid.2 minix-man ./usr/man/man2/getpriority.2 minix-man @@ -3463,7 +3463,7 @@ ./usr/man/man8/syslogd.8 minix-man ./usr/man/man8/tcpd.8 minix-man ./usr/man/man8/traceroute.8 minix-man -./usr/man/man8/uds.8 minix-man +./usr/man/man8/uds.8 minix-man obsolete ./usr/man/man8/unix.8 minix-man ./usr/man/man8/unlink.8 minix-man ./usr/man/man8/unstr.8 minix-man diff --git a/etc/system.conf b/etc/system.conf index d89a6d3df..38c43f24f 100644 --- a/etc/system.conf +++ b/etc/system.conf @@ -494,14 +494,6 @@ service vnd uid 0; # only for copyfd(2) }; -service uds -{ - ipc - SYSTEM vfs rs vm - ; - uid 0; # only for checkperms(2) and copyfd(2) -}; - service pty { system diff --git a/etc/usr/rc b/etc/usr/rc index 69dfb1607..1d5bb3e84 100644 --- a/etc/usr/rc +++ b/etc/usr/rc @@ -201,7 +201,7 @@ start) # pty needs to know the "tty" group ID up pty -dev /dev/ptmx -args "gid=`stat -f '%g' /dev/ptmx`" - up uds -dev /dev/uds + up uds up -n ipc diff --git a/external/bsd/tmux/dist/client.c b/external/bsd/tmux/dist/client.c index d790ea8e8..ce88e5448 100644 --- 
a/external/bsd/tmux/dist/client.c +++ b/external/bsd/tmux/dist/client.c @@ -107,11 +107,7 @@ client_connect(char *path, int start_server) } retry: -#ifndef __minix if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) -#else - if ((fd = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1) -#endif /* !defined(__minix) */ fatal("socket failed"); if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == -1) { diff --git a/external/bsd/tmux/dist/server.c b/external/bsd/tmux/dist/server.c index 5682afe66..33b8576c2 100644 --- a/external/bsd/tmux/dist/server.c +++ b/external/bsd/tmux/dist/server.c @@ -84,11 +84,7 @@ server_create_socket(void) } unlink(sa.sun_path); -#ifndef __minix if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) -#else - if ((fd = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1) -#endif /* !defined(__minix) */ fatal("socket failed"); mask = umask(S_IXUSR|S_IXGRP|S_IRWXO); @@ -114,11 +110,7 @@ server_start(int lockfd, char *lockfile) char *cause; /* The first client is special and gets a socketpair; create it. */ -#ifndef __minix if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, pair) != 0) -#else - if (socketpair(AF_UNIX, SOCK_SEQPACKET, PF_UNSPEC, pair) != 0) -#endif /* !defined(__minix) */ fatal("socketpair failed"); switch (fork()) { diff --git a/lib/libc/gen/syslog.c b/lib/libc/gen/syslog.c index f07434811..5ece67ddd 100644 --- a/lib/libc/gen/syslog.c +++ b/lib/libc/gen/syslog.c @@ -59,10 +59,6 @@ __RCSID("$NetBSD: syslog.c,v 1.54 2014/09/18 13:58:20 christos Exp $"); #include "reentrant.h" #include "extern.h" -#if defined(__minix) -#include -#endif /* defined(__minix) */ - #ifdef __weak_alias __weak_alias(closelog,_closelog) __weak_alias(openlog,_openlog) @@ -452,11 +448,7 @@ vsyslogp_r(int pri, struct syslog_data *data, const char *msgid, * to give syslogd a chance to empty its socket buffer. 
*/ for (tries = 0; tries < MAXTRIES; tries++) { -#if defined(__minix) - if (write(data->log_file, tbuf, cnt) != -1) -#else if (send(data->log_file, tbuf, cnt, 0) != -1) -#endif /* defined(__minix) */ break; if (errno != ENOBUFS) { disconnectlog_r(data); @@ -513,9 +505,7 @@ connectlog_r(struct syslog_data *data) /* AF_UNIX address of local logger */ static const struct sockaddr_un sun = { .sun_family = AF_LOCAL, -#if !defined(__minix) .sun_len = sizeof(sun), -#endif /* !defined(__minix) */ .sun_path = _PATH_LOG, }; @@ -526,14 +516,9 @@ connectlog_r(struct syslog_data *data) data->log_connected = 0; } if (!data->log_connected) { -#if defined(__minix) - if(ioctl(data->log_file, NWIOSUDSTADDR, __UNCONST(&sun)) < 0) - -#else if (connect(data->log_file, (const struct sockaddr *)(const void *)&sun, (socklen_t)sizeof(sun)) == -1) -#endif /* defined(__minix) */ { (void)close(data->log_file); data->log_file = -1; diff --git a/lib/libc/net/Makefile.inc b/lib/libc/net/Makefile.inc index bf65717a1..fa29213c5 100644 --- a/lib/libc/net/Makefile.inc +++ b/lib/libc/net/Makefile.inc @@ -2,13 +2,6 @@ # @(#)Makefile.inc 8.2 (Berkeley) 9/5/93 # net sources -.if defined(__MINIX) -.PATH: ${NETBSDSRCDIR}/minix/lib/libc/net - -CPPFLAGS.getpeereid.c+= -D_MINIX_SYSTEM=1 -CPPFLAGS.getsockopt.c+= -D_MINIX_SYSTEM=1 -CPPFLAGS.setsockopt.c+= -D_MINIX_SYSTEM=1 -.endif .PATH: ${ARCHDIR}/net ${.CURDIR}/net SRCS+= base64.c ethers.c gethnamaddr.c getifaddrs.c \ diff --git a/minix/commands/DESCRIBE/DESCRIBE.sh b/minix/commands/DESCRIBE/DESCRIBE.sh index e826052ab..09f56ecf5 100644 --- a/minix/commands/DESCRIBE/DESCRIBE.sh +++ b/minix/commands/DESCRIBE/DESCRIBE.sh @@ -192,9 +192,6 @@ do 17,0) des="hello" dev=hello ;; - 18,0) - des="UNIX domain socket" dev=uds - ;; 5[6-9],0|6[0-3],0) drive=`expr $major - 56` des="vnode disk $drive" dev=vnd$drive diff --git a/minix/commands/MAKEDEV/MAKEDEV.sh b/minix/commands/MAKEDEV/MAKEDEV.sh index 78d647dcb..8c8a98b12 100755 --- a/minix/commands/MAKEDEV/MAKEDEV.sh +++ 
b/minix/commands/MAKEDEV/MAKEDEV.sh @@ -49,7 +49,6 @@ STD_DEVICES=" ttypa ttypb ttypc ttypd ttype ttypf ttyq0 ttyq1 ttyq2 ttyq3 ttyq4 ttyq5 ttyq6 ttyq7 ttyq8 ttyq9 ttyqa ttyqb ttyqc ttyqd ttyqe ttyqf - uds vnd0 vnd0p0 vnd0p0s0 vnd1 vnd1p0 vnd1p0s0 vnd2 vnd3 vnd4 vnd5 vnd6 vnd7 " @@ -134,7 +133,6 @@ Where key is one of the following: klog # Make /dev/klog ptmx # Make /dev/ptmx random # Make /dev/random, /dev/urandom - uds # Make /dev/uds filter # Make /dev/filter fbd # Make /dev/fbd hello # Make /dev/hello @@ -438,10 +436,6 @@ do makedev ${dev} c 4 ${minor} ${uname} tty ${permissions} ;; - uds) - # Unix domain sockets device - makedev ${dev} c 18 0 ${uname} ${gname} 666 - ;; vnd[0-7]) # Whole vnode disk devices. makedev ${dev} b ${major} 0 ${uname} ${gname} ${permissions} diff --git a/minix/include/minix/dmap.h b/minix/include/minix/dmap.h index 8c6560d3b..c02731747 100644 --- a/minix/include/minix/dmap.h +++ b/minix/include/minix/dmap.h @@ -36,8 +36,8 @@ #define LOG_MAJOR 15 /* 15 = /dev/klog (log driver) */ #define RANDOM_MAJOR 16 /* 16 = /dev/random (random driver) */ #define HELLO_MAJOR 17 /* 17 = /dev/hello (hello driver) */ -#define UDS_MAJOR 18 /* 18 = /dev/uds (pfs) */ -#define FB_MAJOR 19 /* 18 = /dev/fb0 (fb driver) */ + /* 18 = (unused) */ +#define FB_MAJOR 19 /* 19 = /dev/fb0 (fb driver) */ #define I2C0_MAJOR 20 /* 20 = /dev/i2c-1 (i2c-dev) */ #define I2C1_MAJOR 21 /* 21 = /dev/i2c-2 (i2c-dev) */ #define I2C2_MAJOR 22 /* 22 = /dev/i2c-3 (i2c-dev) */ diff --git a/minix/include/minix/syslib.h b/minix/include/minix/syslib.h index 98f58cbd5..1856c41da 100644 --- a/minix/include/minix/syslib.h +++ b/minix/include/minix/syslib.h @@ -273,11 +273,10 @@ uid_t getnuid(endpoint_t proc_ep); gid_t getngid(endpoint_t proc_ep); int getsockcred(endpoint_t proc_ep, struct sockcred * sockcred, gid_t * groups, int ngroups); -int socketpath(endpoint_t endpt, char *path, size_t size, int what, dev_t *dev, - ino_t *ino); +int socketpath(endpoint_t endpt, const char *path, 
size_t size, int what, + dev_t *dev, ino_t *ino); #define SPATH_CHECK 0 /* check user permissions on socket path */ #define SPATH_CREATE 1 /* create socket file at given path */ -#define SPATH_CANONIZE 0x8000 /* copy back canonized path (legacy support) */ int copyfd(endpoint_t endpt, int fd, int what); #define COPYFD_FROM 0 /* copy file descriptor from remote process */ #define COPYFD_TO 1 /* copy file descriptor to remote process */ diff --git a/minix/lib/libc/net/getpeereid.c b/minix/lib/libc/net/getpeereid.c deleted file mode 100644 index 7638c12ba..000000000 --- a/minix/lib/libc/net/getpeereid.c +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include -#include -#include -#include - -/* - * get the effective user ID and effective group ID of a peer - * connected through a Unix domain socket. - */ -int getpeereid(int sd, uid_t *euid, gid_t *egid) { - int rc; - struct uucred cred; - socklen_t ucred_length; - - /* Initialize Data Structures */ - ucred_length = sizeof(struct uucred); - memset(&cred, '\0', ucred_length); - - /* Validate Input Parameters */ - if (euid == NULL || egid == NULL) { - errno = EFAULT; - return -1; - } /* getsockopt will handle validating 'sd' */ - - /* Get the credentials of the peer at the other end of 'sd' */ - rc = getsockopt(sd, SOL_SOCKET, SO_PEERCRED, &cred, &ucred_length); - if (rc == 0) { - /* Success - return the results */ - *euid = cred.cr_uid; - *egid = cred.cr_gid; - return 0; - } else { - /* Failure - getsockopt takes care of setting errno */ - return -1; - } -} diff --git a/minix/lib/libc/sys/getsockopt.c b/minix/lib/libc/sys/getsockopt.c index 7edea530e..7c000fb05 100644 --- a/minix/lib/libc/sys/getsockopt.c +++ b/minix/lib/libc/sys/getsockopt.c @@ -244,6 +244,7 @@ static int _uds_getsockopt(int sock, int level, int option_name, return 0; } +#ifdef SO_PEERCRED if (level == SOL_SOCKET && option_name == SO_PEERCRED) { struct uucred cred; @@ -257,6 +258,7 @@ static int _uds_getsockopt(int sock, int level, int option_name, 
option_len); return 0; } +#endif if (level == SOL_SOCKET && option_name == SO_REUSEADDR) @@ -269,12 +271,14 @@ static int _uds_getsockopt(int sock, int level, int option_name, return 0; } +#ifdef SO_PASSCRED if (level == SOL_SOCKET && option_name == SO_PASSCRED) { i = 1; /* option is always 'on' */ getsockopt_copy(&i, sizeof(i), option_value, option_len); return 0; } +#endif #if DEBUG fprintf(stderr, "_uds_getsocketopt: level %d, name %d\n", diff --git a/minix/lib/libc/sys/setsockopt.c b/minix/lib/libc/sys/setsockopt.c index 04c0d311d..cbe29ed03 100644 --- a/minix/lib/libc/sys/setsockopt.c +++ b/minix/lib/libc/sys/setsockopt.c @@ -267,6 +267,7 @@ static int _uds_setsockopt(int sock, int level, int option_name, return 0; } +#ifdef SO_PASSCRED if (level == SOL_SOCKET && option_name == SO_PASSCRED) { if (option_len != sizeof(i)) @@ -283,6 +284,7 @@ static int _uds_setsockopt(int sock, int level, int option_name, } return 0; } +#endif #if DEBUG fprintf(stderr, "_uds_setsocketopt: level %d, name %d\n", diff --git a/minix/lib/libsys/socketpath.c b/minix/lib/libsys/socketpath.c index 2473a7815..e4634bfb0 100644 --- a/minix/lib/libsys/socketpath.c +++ b/minix/lib/libsys/socketpath.c @@ -5,22 +5,22 @@ #include int -socketpath(endpoint_t endpt, char * path, size_t size, int what, dev_t * dev, - ino_t * ino) +socketpath(endpoint_t endpt, const char * path, size_t size, int what, + dev_t * dev, ino_t * ino) { cp_grant_id_t grant; message m; int r; if ((grant = cpf_grant_direct(VFS_PROC_NR, (vir_bytes)path, size, - CPF_READ | CPF_WRITE)) == GRANT_INVALID) + CPF_READ)) == GRANT_INVALID) return ENOMEM; memset(&m, 0, sizeof(m)); m.m_lsys_vfs_socketpath.endpt = endpt; m.m_lsys_vfs_socketpath.grant = grant; m.m_lsys_vfs_socketpath.count = size; - m.m_lsys_vfs_socketpath.what = what | SPATH_CANONIZE; + m.m_lsys_vfs_socketpath.what = what; r = _taskcall(VFS_PROC_NR, VFS_SOCKETPATH, &m); diff --git a/minix/man/man2/Makefile b/minix/man/man2/Makefile index 3db88c945..1342b77ec 100644 
--- a/minix/man/man2/Makefile +++ b/minix/man/man2/Makefile @@ -1,6 +1,6 @@ MAN= accept.2 access.2 bind.2 brk.2 chdir.2 chmod.2 chown.2 \ chroot.2 close.2 connect.2 creat.2 dup.2 execve.2 exit.2 fcntl.2 \ - fork.2 getgid.2 getitimer.2 getpeereid.2 \ + fork.2 getgid.2 getitimer.2 \ getpeername.2 getpid.2 getpriority.2 getsockname.2 getsockopt.2 \ gettimeofday.2 getuid.2 intro.2 ioctl.2 kill.2 link.2 listen.2 \ lseek.2 mkdir.2 mknod.2 mount.2 open.2 ptrace.2 \ diff --git a/minix/man/man2/getpeereid.2 b/minix/man/man2/getpeereid.2 deleted file mode 100644 index 2c0a15f07..000000000 --- a/minix/man/man2/getpeereid.2 +++ /dev/null @@ -1,42 +0,0 @@ -.TH GETPEEREID 2 -.SH NAME -getpeereid \- get the effective user ID and effective group ID of a peer -connected through a Unix domain socket. -.SH SYNOPSIS -.ft B -#include - -.in +5 -.ti -5 -int getpeereid(int \fIsd\fP, uid_t *\fIeuid\fP, gid_t *\fIegid\fP); -.br -.ft P -.SH DESCRIPTION -getpeereid() is often used to authenticate clients connecting to a -server through a Unix domain socket. The server can call this function -with a socket descriptor \fIsd\fP and this function will fill\-in -\fIeuid\fP and \fIegid\fP with the effective user ID and the effective -group ID of the client process. -.SH RETURN VALUES -On success, this function returns 0, \fIeuid\fP is set to the effective -user ID of the peer connected through Unix domain socket \fIsd\fP, and -\fIegid\fP is set to the effective group ID of the peer connected -through Unix domain socket \fIsd\fP. On error, -1 is returned and -\fIerrno\fP is set. -.SH ERRORS -.TP 15 -[EBADF] -The argument \fIsd\fP is not a descriptor. -.TP 15 -[ENOTSOCK] -The argument \fIsd\fP is a descriptor, but not a socket descriptor. -.TP 15 -[EFAULT] -The address pointed to by \fIeuid\fP and/or \fIegid\fP is not in a -valid part of the process address space. -.SH SEE ALSO -.BR socket(2), -.BR socketpair(2), -.BR unix(8) -.SH HISTORY -This function first appeared in Minix 3.1.8. 
diff --git a/minix/net/uds/Makefile b/minix/net/uds/Makefile index 8ae35c943..a85c4483e 100644 --- a/minix/net/uds/Makefile +++ b/minix/net/uds/Makefile @@ -1,9 +1,15 @@ # Makefile for the UNIX Domain Sockets driver (UDS) PROG= uds -SRCS= uds.c ioc_uds.c -MAN= uds.8 unix.8 +SRCS= uds.c io.c stat.c +MAN= unix.8 -DPADD+= ${LIBCHARDRIVER} ${LIBSYS} -LDADD+= -lchardriver -lsys +FILES=${PROG}.conf +FILESNAME=${PROG} +FILESDIR= /etc/system.conf.d + +DPADD+= ${LIBSOCKEVENT} ${LIBSOCKDRIVER} ${LIBSYS} ${LIBTIMERS} +LDADD+= -lsockevent -lsockdriver -lsys -ltimers + +WARNS?= 5 .include diff --git a/minix/net/uds/io.c b/minix/net/uds/io.c new file mode 100644 index 000000000..1b8de37b8 --- /dev/null +++ b/minix/net/uds/io.c @@ -0,0 +1,1795 @@ +/* UNIX Domain Sockets - io.c - sending and receiving */ + +#include "uds.h" +#include + +/* + * Our UDS sockets do not have a send buffer. They only have a receive buffer. + * This receive buffer, when not empty, is split up in segments. Each segment + * may contain regular data, ancillary data, both, or (for SOCK_SEQPACKET and + * SOCK_DGRAM) neither. There are two types of ancillary data: in-flight file + * descriptors and sender credentials. In addition, for SOCK_DGRAM sockets, + * the segment may contain the sender's socket path (if the sender's socket is + * bound). Each segment has a header, containing the full segment size, + * the size of the actual data in the segment (if any), and a flags field that + * states which ancillary data are associated with the segment (if any). For + * SOCK_STREAM type sockets, new data may be merged into a previous segment, + * but only if it has no ancillary data. For the other two socket types, each + * packet has its own header. The resulting behavior should be in line with + * the POSIX "Socket Receive Queue" specification. 
+ * + * More specifically, each segment consists of the following parts: + * - always a five-byte header, containing a two-byte segment length (including + * the header, so always non-zero), a two-byte regular data length (zero or + * more), and a one-byte flags field which is a bitwise combination of + * UDS_HAS_{FDS,CRED,PATH} flags; + * - next, if UDS_HAS_CRED is set in the segment header: a sockcred structure; + * since this structure is variable-size, the structure is prepended by a + * single byte that contains the length of the structure (excluding the byte + * itself, thus ranging from sizeof(struct sockcred) to UDS_MAXCREDLEN); + * - next, if UDS_HAS_PATH is set in the segment header: + * - next, if the data length is non-zero, the actual regular data. + * If the segment is not the last in the receive buffer, it is followed by the + * next segment immediately afterward. There is no alignment. + * + * It is the sender's responsibility to merge new data into the last segment + * whenever possible, so that the receiver side never needs to consider more + * than one segment at once. In order to allow such merging, each receive + * buffer has not only a tail and in-use length (pointing to the head when + * combined) but also an offset from the tail to the last header, if any. Note + * that the receiver may over time still look at multiple segments for a single + * request: this happens when a MSG_WAITALL request empties the buffer and then + * blocks - the next piece of arriving data can then obviously not be merged. + * + * If a segment has the UDS_HAS_FDS flag set, then one or more in-flight file + * descriptors are associated with the segment. These are stored in a separate + * data structure, mainly to simplify cleaning up when the socket is shut down + * for reading or closed. That structure also contains the number of file + * descriptors associated with the current segment, so this is not stored in + * the segment itself. 
As mentioned later, this may be changed in the future. + * + * On the sender side, there is a trade-off between fully utilizing the receive + * buffer, and not repeatedly performing expensive actions for the same call: + * it may be costly to determine exactly how many in-flight file descriptors + * there will be (if any) and/or how much space is needed to store credentials. + * We currently use the policy that we rather block/reject a send request that + * may (just) have fit in the remaining part of the receive buffer, than obtain + * the same information multiple times or keep state between callbacks. In + * practice this is not expected to make a difference, especially since + * transfer of ancillary data should be rare anyway. + */ +/* + * The current layout of the segment header is as follows. + * + * The first byte contains the upper eight bits of the total segment length. + * The second byte contains the lower eight bits of the total segment length. + * The third byte contains the upper eight bits of the data length. + * The fourth byte contains the lower eight bits of the data length. + * The fifth byte is a bitmask for ancillary data associated with the segment. + */ +#define UDS_HDRLEN 5 + +#define UDS_HAS_FDS 0x01 /* segment has in-flight file descriptors */ +#define UDS_HAS_CRED 0x02 /* segment has sender credentials */ +#define UDS_HAS_PATH 0x04 /* segment has source socket path */ + +#define UDS_MAXCREDLEN SOCKCREDSIZE(NGROUPS_MAX) + +#define uds_get_head(uds) \ + ((size_t)((uds)->uds_tail + (uds)->uds_len) % UDS_BUF) +#define uds_get_last(uds) \ + ((size_t)((uds)->uds_tail + (uds)->uds_last) % UDS_BUF) +#define uds_advance(pos,add) (((pos) + (add)) % UDS_BUF) + +/* + * All in-flight file descriptors are (co-)owned by the UDS driver itself, as + * local open file descriptors. Like any other process, the UDS driver can not + * have more than OPEN_MAX open file descriptors at any time. 
Thus, this is + * also the inherent maximum number of in-flight file descriptors. Therefore, + * we maintain a single pool of in-flight FD structures, and we associate these + * structures with sockets as needed. + */ +static struct uds_fd uds_fds[OPEN_MAX]; +static SIMPLEQ_HEAD(uds_freefds, uds_fd) uds_freefds; + +static char uds_ctlbuf[UDS_CTL_MAX]; +static int uds_ctlfds[UDS_CTL_MAX / sizeof(int)]; + +/* + * Initialize the input/output part of the UDS service. + */ +void +uds_io_init(void) +{ + unsigned int slot; + + SIMPLEQ_INIT(&uds_freefds); + + for (slot = 0; slot < __arraycount(uds_fds); slot++) + SIMPLEQ_INSERT_TAIL(&uds_freefds, &uds_fds[slot], ufd_next); +} + +/* + * Set up all input/output state for the given socket, which has just been + * allocated. As part of this, allocate memory for the receive buffer of the + * socket. Return OK or a negative error code. + */ +int +uds_io_setup(struct udssock * uds) +{ + + /* TODO: decide if we should preallocate the memory. */ + if ((uds->uds_buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED) + return ENOMEM; + + uds->uds_tail = 0; + uds->uds_len = 0; + uds->uds_last = 0; + + SIMPLEQ_INIT(&uds->uds_fds); + + return OK; +} + +/* + * Clean up the input/output state for the given socket, which is about to be + * freed. As part of this, deallocate memory for the receive buffer and close + * any file descriptors still in flight on the socket. + */ +void +uds_io_cleanup(struct udssock * uds) +{ + + /* Close any in-flight file descriptors. */ + uds_io_reset(uds); + + /* Free the receive buffer memory. */ + if (munmap(uds->uds_buf, UDS_BUF) != 0) + panic("UDS: munmap failed: %d", errno); +} + +/* + * The socket is being closed or shut down for reading. If there are still any + * in-flight file descriptors, they will never be received anymore, so close + * them now. 
+ */ +void +uds_io_reset(struct udssock * uds) +{ + struct uds_fd *ufd; + + /* + * The UDS service may have the last and only reference to any of these + * file descriptors here. For that reason, we currently disallow + * transfer of UDS file descriptors, because the close(2) here could + * block on a socket close operation back to us, leading to a deadlock. + * Also, we use a non-blocking variant of close(2), to prevent that we + * end up hanging on sockets with SO_LINGER turned on. + */ + SIMPLEQ_FOREACH(ufd, &uds->uds_fds, ufd_next) { + dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd)); + + closenb(ufd->ufd_fd); + } + + SIMPLEQ_CONCAT(&uds_freefds, &uds->uds_fds); + + /* + * If this reset happens as part of a shutdown, it might be done + * again on close, so ensure that it will find a clean state. The + * receive buffer should never be looked at again either way, but reset + * it too just to be sure. + */ + uds->uds_tail = 0; + uds->uds_len = 0; + uds->uds_last = 0; + + SIMPLEQ_INIT(&uds->uds_fds); +} + +/* + * Return the maximum usable part of the receive buffer, in bytes. The return + * value is used for the SO_SNDBUF and SO_RCVBUF socket options. + */ +size_t +uds_io_buflen(void) +{ + + /* + * TODO: it would be nicer if at least for SOCK_STREAM-type sockets, we + * could use the full receive buffer for data. This would require that + * we store up to one header in the socket object rather than in the + * receive buffer. + */ + return UDS_BUF - UDS_HDRLEN; +} + +/* + * Fetch 'len' bytes starting from absolute position 'pos' into the receive + * buffer of socket 'uds', and copy them into the buffer pointed to by 'ptr'. + * Return the absolute position of the first byte after the fetched data in the + * receive buffer. 
+ */ +static size_t +uds_fetch(struct udssock * uds, size_t off, void * ptr, size_t len) +{ + size_t left; + + assert(off < UDS_BUF); + + left = UDS_BUF - off; + if (len >= left) { + memcpy(ptr, &uds->uds_buf[off], left); + + if ((len -= left) > 0) + memcpy((char *)ptr + left, &uds->uds_buf[0], len); + + return len; + } else { + memcpy(ptr, &uds->uds_buf[off], len); + + return off + len; + } +} + +/* + * Store 'len' bytes from the buffer pointed to by 'ptr' into the receive + * buffer of socket 'uds', starting at absolute position 'pos' into the receive + * buffer. Return the absolute position of the first byte after the stored + * data in the receive buffer. + */ +static size_t +uds_store(struct udssock * uds, size_t off, const void * ptr, size_t len) +{ + size_t left; + + assert(off < UDS_BUF); + + left = UDS_BUF - off; + if (len >= left) { + memcpy(&uds->uds_buf[off], ptr, left); + + if ((len -= left) > 0) + memcpy(&uds->uds_buf[0], (const char *)ptr + left, + len); + + return len; + } else { + memcpy(&uds->uds_buf[off], ptr, len); + + return off + len; + } +} + +/* + * Fetch a segment header previously stored in the receive buffer of socket + * 'uds' at absolute position 'off'. Return the absolute position of the first + * byte after the header, as well as the entire segment length in 'seglen', the + * length of the data in the segment in 'datalen', and the segment flags in + * 'segflags'. 
+ */ +static size_t +uds_fetch_hdr(struct udssock * uds, size_t off, size_t * seglen, + size_t * datalen, unsigned int * segflags) +{ + unsigned char hdr[UDS_HDRLEN]; + + off = uds_fetch(uds, off, hdr, sizeof(hdr)); + + *seglen = ((size_t)hdr[0] << 8) | (size_t)hdr[1]; + *datalen = ((size_t)hdr[2] << 8) | (size_t)hdr[3]; + *segflags = hdr[4]; + + assert(*seglen >= UDS_HDRLEN); + assert(*seglen <= uds->uds_len); + assert(*datalen <= *seglen - UDS_HDRLEN); + assert(*segflags != 0 || *datalen == *seglen - UDS_HDRLEN); + assert(!(*segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH))); + + return off; +} + +/* + * Store a segment header in the receive buffer of socket 'uds' at absolute + * position 'off', with the segment length 'seglen', the segment data length + * 'datalen', and the segment flags 'segflags'. Return the absolute receive + * buffer position of the first data byte after the stored header. + */ +static size_t +uds_store_hdr(struct udssock * uds, size_t off, size_t seglen, size_t datalen, + unsigned int segflags) +{ + unsigned char hdr[UDS_HDRLEN]; + + assert(seglen <= USHRT_MAX); + assert(datalen <= seglen); + assert(segflags <= UCHAR_MAX); + assert(!(segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH))); + + hdr[0] = (seglen >> 8) & 0xff; + hdr[1] = seglen & 0xff; + hdr[2] = (datalen >> 8) & 0xff; + hdr[3] = datalen & 0xff; + hdr[4] = segflags; + + return uds_store(uds, off, hdr, sizeof(hdr)); +} + +/* + * Perform initial checks on a send request, before it may potentially be + * suspended. Return OK if this send request is valid, or a negative error + * code if it is not. + */ +int +uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused, + const struct sockaddr * addr, socklen_t addr_len __unused, + endpoint_t user_endpt __unused, int flags) +{ + struct udssock *uds = (struct udssock *)sock; + size_t pathlen; + + /* + * Reject calls with unknown flags. 
Besides the flags handled entirely + * by libsockevent (which are not part of 'flags' here), that is all of + * them. TODO: ensure that we should really reject all other flags + * rather than ignore them. + */ + if (flags != 0) + return EOPNOTSUPP; + + /* + * Perform very basic address and message size checks on the send call. + * For non-stream sockets, we must reject packets that may never fit in + * the receive buffer, or otherwise (at least for SOCK_SEQPACKET) the + * send call may end up being suspended indefinitely. Therefore, we + * assume the worst-case scenario, which is that a full set of + * credentials must be associated with the packet. As a result, we may + * reject some large packets that could actually just fit. Checking + * the peer's LOCAL_CREDS setting here is not safe: even if we know the + * peer already at all (for SOCK_DGRAM we do not), the send may still + * block and the option toggled before it unblocks. + */ + switch (uds_get_type(uds)) { + case SOCK_STREAM: + /* Nothing to check for this case. */ + break; + + case SOCK_SEQPACKET: + if (len > UDS_BUF - UDS_HDRLEN - 1 - UDS_MAXCREDLEN) + return EMSGSIZE; + + break; + + case SOCK_DGRAM: + if (!uds_has_link(uds) && addr == NULL) + return EDESTADDRREQ; + + /* + * The path is stored without null terminator, but with leading + * byte containing the path length--if there is a path at all. + */ + pathlen = (size_t)uds->uds_pathlen; + if (pathlen > 0) + pathlen++; + + if (len > UDS_BUF - UDS_HDRLEN - pathlen - 1 - UDS_MAXCREDLEN) + return EMSGSIZE; + + break; + + default: + assert(0); + } + + return OK; +} + +/* + * Determine whether the (real or pretend) send request should be processed + * now, suspended until later, or rejected based on the current socket state. + * Return OK if the send request should be processed now. Return SUSPEND if + * the send request should be retried later. Return an appropriate negative + * error code if the send request should fail. 
+ */ +static int +uds_send_test(struct udssock * uds, size_t len, socklen_t ctl_len, size_t min, + int partial) +{ + struct udssock *conn; + size_t avail, hdrlen, credlen; + + assert(!uds_is_shutdown(uds, SFL_SHUT_WR)); + + if (uds_get_type(uds) != SOCK_DGRAM) { + if (uds_is_connecting(uds)) + return SUSPEND; + if (!uds_is_connected(uds) && !uds_is_disconnected(uds)) + return ENOTCONN; + if (!uds_has_conn(uds)) + return EPIPE; + + conn = uds->uds_conn; + + if (uds_is_shutdown(conn, SFL_SHUT_RD)) + return EPIPE; + + /* + * For connection-type sockets, we now have to check if there + * is enough room in the receive buffer. For SOCK_STREAM + * sockets, we must check if at least 'min' bytes can be moved + * into the receive buffer, at least if that is a reasonable + * value for ever making any forward progress at all. For + * SOCK_SEQPACKET sockets, we must check if the entire packet + * of size 'len' can be stored in the receive buffer. In both + * cases, we must take into account any metadata to store along + * with the data. + * + * Unlike in uds_pre_send(), we can now check safely whether + * the peer is expecting credentials, but we still don't know + * the actual size of the credentials, so again we take the + * maximum possible size. The same applies to file descriptors + * transferred via control data: all we have the control length + * right now, which if non-zero we assume to mean there might + * be file descriptors. + * + * In both cases, the reason of overestimating is that actually + * getting accurate sizes, by obtaining credentials or copying + * in control data, is very costly. We want to do that only + * when we are sure we will not suspend the send call after + * all. It is no problem to overestimate how much space will + * be needed here, but not to underestimate: that could cause + * applications that use select(2) and non-blocking sockets to + * end up in a busy-wait loop. 
+ */ + if (!partial && (conn->uds_flags & UDSF_PASSCRED)) + credlen = 1 + UDS_MAXCREDLEN; + else + credlen = 0; + + avail = UDS_BUF - conn->uds_len; + + if (uds_get_type(uds) == SOCK_STREAM) { + /* + * Limit the low threshold to the maximum that can ever + * be sent at once. + */ + if (min > UDS_BUF - UDS_HDRLEN - credlen) + min = UDS_BUF - UDS_HDRLEN - credlen; + + /* + * Suspend the call only if not even the low threshold + * is met. Otherwise we may make (partial) progress. + */ + if (len > min) + len = min; + + /* + * If the receive buffer already has at least one + * segment, and there are certainly no file descriptors + * to transfer now, and we do not have to store + * credentials either, then this segment can be merged + * with the previous one. In that case, we need no + * space for a header. That is certainly the case if + * we are resuming an already partially completed send. + */ + hdrlen = (avail == UDS_BUF || ctl_len != 0 || + credlen > 0) ? UDS_HDRLEN : 0; + } else + hdrlen = UDS_HDRLEN; + + if (avail < hdrlen + credlen + len) + return SUSPEND; + } + + return OK; +} + +/* + * Get the destination peer for a send request. The send test has already been + * performed first. On success, return OK, with a pointer to the peer socket + * stored in 'peerp'. On failure, return an appropriate error code. + */ +static int +uds_send_peer(struct udssock * uds, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp) +{ + struct udssock *peer; + int r; + + if (uds_get_type(uds) == SOCK_DGRAM) { + if (!uds_has_link(uds)) { + /* This was already checked in uds_pre_send(). */ + assert(addr != NULL); + + /* + * Find the socket identified by the given address. + * If it exists at all, see if it is a proper match. + */ + if ((r = uds_lookup(uds, addr, addr_len, user_endpt, + &peer)) != OK) + return r; + + /* + * If the peer socket is connected to a target, it + * must be this socket. 
Unfortunately, POSIX does not + * specify an error code for this. We borrow Linux's. + */ + if (uds_has_link(peer) && peer->uds_link != uds) + return EPERM; + } else + peer = uds->uds_link; + + /* + * If the receiving end will never receive this packet, we + * might as well not send it, so drop it immeiately. Indicate + * as such to the caller, using NetBSD's chosen error code. + */ + if (uds_is_shutdown(peer, SFL_SHUT_RD)) + return ENOBUFS; + } else { + assert(uds_has_conn(uds)); + + peer = uds->uds_conn; + } + + *peerp = peer; + return OK; +} + +/* + * Generate a new segment for the current send request, or arrange things such + * that new data can be merged with a previous segment. As part of this, + * decide whether we can merge data at all. The segment will be merged if, and + * only if, all of the following requirements are met: + * + * 1) the socket is of type SOCK_STREAM; + * 2) there is a previous segment in the receive buffer; + * 3) there is no ancillary data for the current send request. + * + * Also copy in regular data (if any), retrieve the sender's credentials (if + * needed), and copy over the source path (if applicable). However, do not yet + * commit the segment (or the new part to be merged), because the send request + * may still fail for other reasons. + * + * On success, return the length of the new segment (or, when merging, the + * length to be added to the last segment), as well as a flag indicating + * whether we are merging into the last segment in 'mergep', the length of the + * (new) data in the segment in 'datalenp', and the new segment's flags in + * 'segflagsp' (always zero when merging). Note that a return value of zero + * implies that we are merging zero extra bytes into the last segment, which + * means that effectively nothing changes; in that case the send call will be + * cut short and return zero to the caller as well. On failure, return a + * negative error code. 
+ */
+static int
+uds_send_data(struct udssock * uds, struct udssock * peer,
+	const struct sockdriver_data * data, size_t len, size_t off,
+	endpoint_t user_endpt, unsigned int nfds, int * __restrict mergep,
+	size_t * __restrict datalenp, unsigned int * __restrict segflagsp)
+{
+	struct sockcred sockcred;
+	gid_t groups[NGROUPS_MAX];
+	iovec_t iov[2];
+	unsigned int iovcnt, segflags;
+	unsigned char lenbyte;
+	size_t credlen, pathlen, datalen, seglen;
+	size_t avail, pos, left;
+	int r, merge;
+
+	/*
+	 * At this point we should add the data to the peer's receive buffer.
+	 * In the case of SOCK_STREAM sockets, we should add as much of the
+	 * data as possible and suspend the call to send the rest later, if
+	 * applicable. In the case of SOCK_DGRAM sockets, we should drop the
+	 * packet if it does not fit in the buffer.
+	 *
+	 * Due to the checks in uds_send_test(), we know for sure that we no
+	 * longer have to suspend without making any progress at this point.
+	 */
+	segflags = (nfds > 0) ? UDS_HAS_FDS : 0;
+
+	/*
+	 * Obtain the credentials now. Doing so allows us to determine how
+	 * much space we actually need for them. Credentials are considered
+	 * only at the start of a send request (off == 0), not when resuming
+	 * a partially completed one. The stored size includes one extra
+	 * byte for the length prefix written further below.
+	 */
+	if (off == 0 && (peer->uds_flags & UDSF_PASSCRED)) {
+		memset(&sockcred, 0, sizeof(sockcred));
+
+		if ((r = getsockcred(user_endpt, &sockcred, groups,
+		    __arraycount(groups))) != OK)
+			return r;
+
+		credlen = 1 + SOCKCREDSIZE(sockcred.sc_ngroups);
+
+		segflags |= UDS_HAS_CRED;
+	} else
+		credlen = 0;
+
+	/*
+	 * For bound source datagram sockets, include the source path. As
+	 * with the credentials, one extra byte is for the length prefix.
+	 */
+	if (uds_get_type(uds) == SOCK_DGRAM && uds->uds_pathlen != 0) {
+		pathlen = (size_t)uds->uds_pathlen + 1;
+
+		segflags |= UDS_HAS_PATH;
+	} else
+		pathlen = 0;
+
+	avail = UDS_BUF - peer->uds_len;
+
+	if (uds_get_type(uds) == SOCK_STREAM) {
+		/*
+		 * Determine whether we can merge data into the previous
+		 * segment. This is a more refined version of the test in
+		 * uds_send_test(), as we now know whether there are actually
+		 * any FDs to transfer.
+		 */
+		merge = (peer->uds_len != 0 && nfds == 0 && credlen == 0);
+
+		/*
+		 * Determine how much we can send at once. A new segment
+		 * costs a header plus any credentials; merged data costs
+		 * nothing beyond the data itself.
+		 */
+		if (!merge) {
+			assert(avail > UDS_HDRLEN + credlen);
+			datalen = avail - UDS_HDRLEN - credlen;
+		} else
+			datalen = avail;
+
+		if (datalen > len)
+			datalen = len;
+
+		/* If we cannot make progress, we should have suspended.. */
+		assert(datalen != 0 || len == 0);
+	} else {
+		merge = FALSE;
+
+		datalen = len;
+	}
+	assert(datalen <= len);
+	assert(datalen <= UDS_BUF);
+
+	/*
+	 * Compute the total amount of space we need for the segment in the
+	 * receive buffer. Given that we have done will-it-fit tests in
+	 * uds_send_test() for SOCK_STREAM and SOCK_SEQPACKET, there is only
+	 * one case left where the result may not fit, and that is for
+	 * SOCK_DGRAM packets. In that case, we drop the packet. POSIX says
+	 * we should throw an error in that case, and that is also what NetBSD
+	 * does.
+	 */
+	if (!merge)
+		seglen = UDS_HDRLEN + credlen + pathlen + datalen;
+	else
+		seglen = datalen;
+
+	if (seglen > avail) {
+		assert(uds_get_type(uds) == SOCK_DGRAM);
+
+		/* Drop the packet, borrowing NetBSD's chosen error code. */
+		return ENOBUFS;
+	}
+
+	/*
+	 * Generate the full segment, but do not yet update the buffer head.
+	 * We may still run into an error (copying in file descriptors) or even
+	 * decide that nothing gets sent after all (if there are no data or
+	 * file descriptors). If we are merging the new data into the previous
+	 * segment, do not generate a header.
+	 */
+	pos = uds_get_head(peer);
+
+	/* Generate the header, if needed. */
+	if (!merge)
+		pos = uds_store_hdr(peer, pos, seglen, datalen, segflags);
+	else
+		assert(segflags == 0);
+
+	/*
+	 * Copy in and store the sender's credentials, if desired. They are
+	 * stored as a one-byte length (which does not count itself) followed
+	 * by the sockcred structure. If there are any groups, the structure
+	 * is stored up to its sc_groups field and then followed by the group
+	 * list that getsockcred() returned separately in 'groups'.
+	 */
+	if (credlen > 0) {
+		assert(credlen >= 1 + sizeof(sockcred));
+		assert(credlen <= UCHAR_MAX);
+
+		lenbyte = credlen - 1;
+		pos = uds_store(peer, pos, &lenbyte, 1);
+
+		if (sockcred.sc_ngroups > 0) {
+			pos = uds_store(peer, pos, &sockcred,
+			    offsetof(struct sockcred, sc_groups));
+			pos = uds_store(peer, pos, groups,
+			    sockcred.sc_ngroups * sizeof(gid_t));
+		} else
+			pos = uds_store(peer, pos, &sockcred,
+			    sizeof(sockcred));
+	}
+
+	/*
+	 * Store the sender's address if any. Datagram sockets only. The
+	 * path is stored as a one-byte length followed by the path bytes.
+	 */
+	if (pathlen > 0) {
+		assert(pathlen > 1);
+		assert(pathlen <= UCHAR_MAX);
+
+		lenbyte = uds->uds_pathlen;
+		pos = uds_store(peer, pos, &lenbyte, 1);
+		pos = uds_store(peer, pos, uds->uds_path, pathlen - 1);
+	}
+
+	/*
+	 * Lastly, copy in the actual data (if any) from the caller. If the
+	 * data would run past the end of the peer's buffer array, split the
+	 * copy in two, with the second part going to the start of the array.
+	 */
+	if (datalen > 0) {
+		iov[0].iov_addr = (vir_bytes)&peer->uds_buf[pos];
+		left = UDS_BUF - pos;
+
+		if (left < datalen) {
+			assert(left > 0);
+			iov[0].iov_size = left;
+			iov[1].iov_addr = (vir_bytes)&peer->uds_buf[0];
+			iov[1].iov_size = datalen - left;
+			iovcnt = 2;
+		} else {
+			iov[0].iov_size = datalen;
+			iovcnt = 1;
+		}
+
+		if ((r = sockdriver_vcopyin(data, off, iov, iovcnt)) != OK)
+			return r;
+	}
+
+	*mergep = merge;
+	*datalenp = datalen;
+	*segflagsp = segflags;
+	return seglen;
+}
+
+/*
+ * Copy in control data for the current send request, and extract any file
+ * descriptors to be transferred. Do not yet duplicate the file descriptors,
+ * but rather store a list in a temporary buffer: the send request may still
+ * fail in which case we want to avoid having to undo the duplication.
+ *
+ * On success, return the number of (zero or more) file descriptors extracted
+ * from the request and stored in the temporary buffer. On failure, return a
+ * negative error code.
+ */
+static int
+uds_send_ctl(const struct sockdriver_data * ctl, socklen_t ctl_len,
+	endpoint_t user_endpt)
+{
+	struct msghdr msghdr;
+	struct cmsghdr *cmsg;
+	socklen_t left;
+	unsigned int i, n, nfds;
+	int r;
+
+	/*
+	 * Copy in the control data. We can spend a lot of effort copying in
+	 * the data in small chunks, and change the receiving side to do the
+	 * same, but it is really not worth it: applications never send a whole
+	 * lot of file descriptors at once, and the buffer size is currently
+	 * such that the UDS service itself will exhaust its OPEN_MAX limit
+	 * anyway if they do.
+	 */
+	if (ctl_len > sizeof(uds_ctlbuf))
+		return ENOBUFS;
+
+	if ((r = sockdriver_copyin(ctl, 0, uds_ctlbuf, ctl_len)) != OK)
+		return r;
+
+	/* Clear the unused tail so that no stale bytes are ever parsed. */
+	if (ctl_len < sizeof(uds_ctlbuf))
+		memset(&uds_ctlbuf[ctl_len], 0, sizeof(uds_ctlbuf) - ctl_len);
+
+	/*
+	 * Look for any file descriptors, and store their remote file
+	 * descriptor numbers into a temporary array.
+	 */
+	memset(&msghdr, 0, sizeof(msghdr));
+	msghdr.msg_control = uds_ctlbuf;
+	msghdr.msg_controllen = ctl_len;
+
+	nfds = 0;
+
+	/*
+	 * The sender may provide file descriptors in multiple chunks.
+	 * Currently we do not preserve these chunk boundaries, instead
+	 * generating one single chunk with all file descriptors for the
+	 * segment upon receipt. If needed, we can fairly easily adapt this
+	 * later.
+	 */
+	for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
+	    cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
+		/*
+		 * Check for bogus lengths. There is no excuse for this;
+		 * either the caller does not know what they are doing or we
+		 * are looking at a hacking attempt. Fail the send request
+		 * right away: nothing has been committed yet, and the file
+		 * descriptor numbers gathered so far are merely sitting in
+		 * the temporary array.
+		 */
+		assert((socklen_t)((char *)cmsg - uds_ctlbuf) <= ctl_len);
+		left = ctl_len - (socklen_t)((char *)cmsg - uds_ctlbuf);
+		assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */
+
+		if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) {
+			printf("UDS: malformed control data from %u\n",
+			    user_endpt);
+			return EINVAL;
+		}
+
+		/* Anything other than SCM_RIGHTS chunks is ignored. */
+		if (cmsg->cmsg_level != SOL_SOCKET ||
+		    cmsg->cmsg_type != SCM_RIGHTS)
+			continue;
+
+		n = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+		for (i = 0; i < n; i++) {
+			/*
+			 * Copy the file descriptor to the temporary buffer,
+			 * whose size is based on the control data buffer, so
+			 * it is always large enough to contain all FDs.
+			 */
+			assert(nfds < __arraycount(uds_ctlfds));
+
+			memcpy(&uds_ctlfds[nfds],
+			    &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
+
+			nfds++;
+		}
+	}
+
+	return nfds;
+}
+
+/*
+ * Actually duplicate any file descriptors that we extracted from the sender's
+ * control data and stored in our temporary buffer. The caller must call this
+ * function only if there is at least one file descriptor to duplicate. On
+ * success, return OK, with all file descriptors stored in file descriptor
+ * objects that are appended to the socket's list of in-flight FD objects.
+ * Thus, on success, the send request may no longer fail. On failure, return
+ * a negative error code, with any partial duplication undone.
+ */
+static int
+uds_send_fds(struct udssock * peer, unsigned int nfds, endpoint_t user_endpt)
+{
+	SIMPLEQ_HEAD(, uds_fd) fds;
+	struct uds_fd *ufd;
+	unsigned int i;
+	int r;
+
+	/* Otherwise 'r' is never set and the local list stays empty. */
+	assert(nfds > 0);
+
+	SIMPLEQ_INIT(&fds);
+
+	for (i = 0; i < nfds; i++) {
+		if (SIMPLEQ_EMPTY(&uds_freefds)) {
+			/* UDS itself may already have OPEN_MAX FDs. */
+			r = ENFILE;
+			break;
+		}
+
+		/*
+		 * The caller may have given an invalid FD, or UDS itself may
+		 * unexpectedly have run out of available file descriptors etc.
+		 */
+		if ((r = copyfd(user_endpt, uds_ctlfds[i], COPYFD_FROM)) < 0)
+			break;
+
+		/* Collect the copies on a local list until all succeeded. */
+		ufd = SIMPLEQ_FIRST(&uds_freefds);
+		SIMPLEQ_REMOVE_HEAD(&uds_freefds, ufd_next);
+
+		ufd->ufd_fd = r;
+		ufd->ufd_count = 0;
+
+		SIMPLEQ_INSERT_TAIL(&fds, ufd, ufd_next);
+
+		dprintf(("UDS: copied in fd %d -> %d\n", uds_ctlfds[i], r));
+	}
+
+	/* Did we experience an error while copying in the file descriptors? */
+	if (r < 0) {
+		/* Revert the successful copyfd() calls made so far. */
+		SIMPLEQ_FOREACH(ufd, &fds, ufd_next) {
+			dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
+
+			closenb(ufd->ufd_fd);
+		}
+
+		SIMPLEQ_CONCAT(&uds_freefds, &fds);
+
+		return r;
+	}
+
+	/*
+	 * Success. Add the file descriptors to the peer's list of in-flight
+	 * file descriptors. Assign the number of file descriptors copied in
+	 * to the first file descriptor object, so that we know how many to
+	 * copy out (or discard) for this segment. The UDS_HAS_FDS flag of
+	 * the segment is not set here.
+	 */
+	ufd = SIMPLEQ_FIRST(&fds);
+	ufd->ufd_count = nfds;
+
+	SIMPLEQ_CONCAT(&peer->uds_fds, &fds);
+
+	return OK;
+}
+
+/*
+ * The current send request is successful or at least has made progress.
+ * Commit the new segment or, if we decided to merge the new data into the last
+ * segment, update the header of the last segment. Also wake up the receiving
+ * side, because there will now be new data to receive.
+ */
+static void
+uds_send_advance(struct udssock * uds, struct udssock * peer, size_t datalen,
+	int merge, size_t seglen, unsigned int segflags)
+{
+	size_t pos, prevseglen, prevdatalen;
+
+	/*
+	 * For non-datagram sockets, credentials are sent only once after
+	 * setting the LOCAL_CREDS option. After that, the option is unset.
+	 */
+	if ((segflags & UDS_HAS_CRED) && uds_get_type(uds) != SOCK_DGRAM)
+		peer->uds_flags &= ~UDSF_PASSCRED;
+
+	if (merge) {
+		assert(segflags == 0);
+
+		pos = uds_get_last(peer);
+
+		/* The merged segment keeps the flags it already had. */
+		(void)uds_fetch_hdr(peer, pos, &prevseglen, &prevdatalen,
+		    &segflags);
+
+		peer->uds_len += seglen;
+		assert(peer->uds_len <= UDS_BUF);
+
+		seglen += prevseglen;
+		datalen += prevdatalen;
+		assert(seglen <= UDS_BUF);
+
+		uds_store_hdr(peer, pos, seglen, datalen, segflags);
+	} else {
+		/* The new segment starts where the used part ended. */
+		peer->uds_last = peer->uds_len;
+
+		peer->uds_len += seglen;
+		assert(peer->uds_len <= UDS_BUF);
+	}
+
+	/* Now that there are new data, wake up the receiver side. */
+	sockevent_raise(&peer->uds_sock, SEV_RECV);
+}
+
+/*
+ * Process a send request. Return OK if the send request has successfully
+ * completed, SUSPEND if it should be tried again later, or a negative error
+ * code on failure. In all cases, the values of 'off' and 'ctl_off' must be
+ * updated if any progress has been made; if either is non-zero, libsockevent
+ * will return the partial progress rather than an error code.
+ */
+int
+uds_send(struct sock * sock, const struct sockdriver_data * data, size_t len,
+	size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
+	socklen_t * ctl_off, const struct sockaddr * addr, socklen_t addr_len,
+	endpoint_t user_endpt, int flags __unused, size_t min)
+{
+	struct udssock *uds = (struct udssock *)sock;
+	struct udssock *peer;
+	size_t seglen, datalen = 0 /*gcc*/;
+	unsigned int nfds, segflags = 0 /*gcc*/;
+	int r, partial, merge = 0 /*gcc*/;
+
+	dprintf(("UDS: send(%d,%zu,%zu,%u,%u,0x%x)\n",
+	    uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
+	    (ctl_off != NULL) ? *ctl_off : 0, flags));
+
+	partial = (off != NULL && *off > 0);
+
+	/*
+	 * First see whether we can process this send call at all right now.
+	 * Most importantly, for connected sockets, if the peer's receive
+	 * buffer is full, we may have to suspend the call until some space has
+	 * been freed up.
+	 */
+	if ((r = uds_send_test(uds, len, ctl_len, min, partial)) != OK)
+		return r;
+
+	/*
+	 * Then get the peer socket. For connected sockets, this is trivial.
+	 * For unconnected sockets, it may involve a lookup of the given
+	 * address.
+	 */
+	if ((r = uds_send_peer(uds, addr, addr_len, user_endpt, &peer)) != OK)
+		return r;
+
+	/*
+	 * We now know for sure that we will not suspend this call without
+	 * making any progress. However, the call may still fail. Copy in
+	 * control data first now, so that we know whether there are any file
+	 * descriptors to transfer. This aspect may determine whether or not
+	 * we can merge data with a previous segment. Do not actually copy in
+	 * the actual file descriptors yet, because that is much harder to undo
+	 * in case of a failure later on.
+	 */
+	if (ctl_len > 0) {
+		/* We process control data once, in full. */
+		assert(*ctl_off == 0);
+
+		if ((r = uds_send_ctl(ctl, ctl_len, user_endpt)) < 0)
+			return r;
+		nfds = (unsigned int)r;
+	} else
+		nfds = 0;
+
+	/*
+	 * Now generate a new segment, or (if possible) merge new data into the
+	 * last segment. Since the call may still fail, prepare the segment
+	 * but do not update the buffer head yet. Note that the segment
+	 * contains not just regular data (in fact it may contain no data at
+	 * all) but (also) certain ancillary data.
+	 */
+	if ((r = uds_send_data(uds, peer, data, len, *off, user_endpt, nfds,
+	    &merge, &datalen, &segflags)) <= 0)
+		return r;
+	seglen = (size_t)r;
+
+	/*
+	 * If we extracted any file descriptors from the control data earlier,
+	 * copy them over to ourselves now. The resulting in-flight file
+	 * descriptors are stored in a separate data structure. This is the
+	 * last point where the send call may actually fail.
+	 */
+	if (nfds > 0) {
+		if ((r = uds_send_fds(peer, nfds, user_endpt)) != OK)
+			return r;
+	}
+
+	/*
+	 * The transmission is now known to be (partially) successful. Commit
+	 * the new work by moving the receive buffer head.
+	 */
+	uds_send_advance(uds, peer, datalen, merge, seglen, segflags);
+
+	/*
+	 * Register the result. For stream-type sockets, the expected behavior
+	 * is that all data be sent, and so we may still have to suspend the
+	 * call after partial progress. Otherwise, we are now done. Either
+	 * way, we are done with the control data, so mark it as consumed.
+	 */
+	*off += datalen;
+	*ctl_off += ctl_len;
+	if (uds_get_type(uds) == SOCK_STREAM && datalen < len)
+		return SUSPEND;
+	else
+		return OK;
+}
+
+/*
+ * Test whether a send request would block. The given 'min' parameter contains
+ * the minimum number of bytes that should be possible to send without blocking
+ * (the low send watermark). Return SUSPEND if the send request would block,
+ * or any other error code if it would not.
+ */
+int
+uds_test_send(struct sock * sock, size_t min)
+{
+	struct udssock *uds = (struct udssock *)sock;
+
+	/* Pretend to send exactly 'min' bytes, without any control data. */
+	return uds_send_test(uds, min, 0, min, FALSE /*partial*/);
+}
+
+/*
+ * Perform initial checks on a receive request, before it may potentially be
+ * suspended. Return OK if this receive request is valid, or a negative error
+ * code if it is not.
+ */
+int
+uds_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
+	int flags)
+{
+
+	/*
+	 * Reject calls with unknown flags. TODO: ensure that we should really
+	 * reject all other flags rather than ignore them.
+	 */
+	if ((flags & ~(MSG_PEEK | MSG_WAITALL | MSG_CMSG_CLOEXEC)) != 0)
+		return EOPNOTSUPP;
+
+	return OK;
+}
+
+/*
+ * Determine whether the (real or pretend) receive request should be processed
+ * now, suspended until later, or rejected based on the current socket state.
+ * Return OK if the receive request should be processed now, along with a first
+ * indication whether the call may still be suspended later in 'may_block'.
+ * Return SUSPEND if the receive request should be retried later. Return an
+ * appropriate negative error code if the receive request should fail.
+ */
+static int
+uds_recv_test(struct udssock * uds, size_t len, size_t min, int partial,
+	int * may_block)
+{
+	size_t seglen, datalen;
+	unsigned int segflags;
+	int r;
+
+	/*
+	 * If there are any pending data, those should always be received
+	 * first. However, if there is nothing to receive, then whether we
+	 * should suspend the receive call or fail immediately depends on other
+	 * conditions. We first look at these other conditions. The outcome
+	 * is kept in 'r' and acted upon only further below.
+	 */
+	r = OK;
+
+	if (uds_get_type(uds) != SOCK_DGRAM) {
+		if (uds_is_connecting(uds))
+			r = SUSPEND;
+		else if (!uds_is_connected(uds) && !uds_is_disconnected(uds))
+			r = ENOTCONN;
+		else if (!uds_has_conn(uds) ||
+		    uds_is_shutdown(uds->uds_conn, SFL_SHUT_WR))
+			r = SOCKEVENT_EOF;
+	}
+
+	if (uds->uds_len == 0) {
+		/*
+		 * For stream-type sockets, we use the policy: if no regular
+		 * data is requested, then end the call without receiving
+		 * anything. For packet-type sockets, the request should block
+		 * until there is a packet to discard, though. Any condition
+		 * found above (suspend, error, end-of-file) takes precedence.
+		 */
+		if (r != OK || (uds_get_type(uds) == SOCK_STREAM && len == 0))
+			return r;
+
+		return SUSPEND;
+	}
+
+	/*
+	 * For stream-type sockets, we should still suspend the call if fewer
+	 * than 'min' bytes are available right now, and there is a possibility
+	 * that more data may arrive later. More may arrive later iff 'r' is
+	 * OK (i.e., no EOF or error will follow) and, in case we already
+	 * received some partial results, there is not already a next segment
+	 * with ancillary data (i.e., nonzero segment flags), or in any case
+	 * there isn't more than one segment in the buffer. Limit 'min' to the
+	 * maximum that can ever be received, though. Since that is difficult
+	 * in our case, we check whether the buffer is entirely full instead.
+	 *
+	 * In terms of the code below: the tail segment is the only segment
+	 * in the buffer iff its length equals the used buffer length.
+	 */
+	if (r == OK && uds_get_type(uds) == SOCK_STREAM && min > 0 &&
+	    uds->uds_len < UDS_BUF) {
+		assert(uds->uds_len >= UDS_HDRLEN);
+
+		(void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, &datalen,
+		    &segflags);
+
+		if (datalen < min && seglen == uds->uds_len &&
+		    (!partial || segflags == 0))
+			return SUSPEND;
+	}
+
+	/*
+	 * Also start the decision process as to whether we should suspend the
+	 * current call if MSG_WAITALL is given. Unfortunately there is no one
+	 * place where we can conveniently do all the required checks. The
+	 * 'may_block' pointer is optional and may be NULL.
+	 */
+	if (may_block != NULL)
+		*may_block = (r == OK && uds_get_type(uds) == SOCK_STREAM);
+	return OK;
+}
+
+/*
+ * Receive regular data, and possibly the source path, from the tail segment in
+ * the receive buffer. On success, return the positive non-zero length of the
+ * tail segment, with 'addr' and 'addr_len' modified to store the source
+ * address if applicable, the result flags in 'rflags' updated as appropriate,
+ * the tail segment's data length stored in 'datalen', the number of received
+ * regular data bytes stored in 'reslen', the segment flags stored in
+ * 'segflags', and the absolute receive buffer position of the credentials in
+ * the segment stored in 'credpos' if applicable. Since the receive call may
+ * still fail, this function must not yet update the tail or any other aspect
+ * of the receive buffer. Return zero if the current receive call was already
+ * partially successful (due to MSG_WAITALL) and can no longer make progress,
+ * and thus should be ended. Return a negative error code on failure.
+ */
+static int
+uds_recv_data(struct udssock * uds, const struct sockdriver_data * data,
+	size_t len, size_t off, struct sockaddr * addr, socklen_t * addr_len,
+	int * __restrict rflags, size_t * __restrict datalen,
+	size_t * __restrict reslen, unsigned int * __restrict segflags,
+	size_t * __restrict credpos)
+{
+	iovec_t iov[2];
+	unsigned char lenbyte;
+	unsigned int iovcnt;
+	size_t pos, seglen, left;
+	int r;
+
+	pos = uds_fetch_hdr(uds, uds->uds_tail, &seglen, datalen, segflags);
+
+	/*
+	 * If a partially completed receive now runs into a segment that cannot
+	 * be logically merged with the previous one (because it has at least
+	 * one segment flag set, meaning it has ancillary data), then we must
+	 * shortcut the receive now. The OK returned here is the "zero"
+	 * result described for this function.
+	 */
+	if (off != 0 && *segflags != 0)
+		return OK;
+
+	/*
+	 * As stated, for stream-type sockets, we choose to ignore zero-size
+	 * receive calls. This has the consequence that reading a zero-sized
+	 * segment (with ancillary data) requires a receive request for at
+	 * least one regular data byte. Such a receive call would then return
+	 * zero. The problem with handling zero-data receive requests is that
+	 * we need to know whether the current segment is terminated (i.e., no
+	 * more data can possibly be merged into it later), which is a test
+	 * that we rather not perform, not in the least because we do not know
+	 * whether there is an error pending on the socket.
+	 *
+	 * For datagrams, we currently allow a zero-size receive call to
+	 * discard the next datagram.
+	 *
+	 * TODO: compare this against policies on other platforms.
+	 */
+	if (len == 0 && uds_get_type(uds) == SOCK_STREAM)
+		return OK;
+
+	/*
+	 * We have to skip the credentials for now: these are copied out as
+	 * control data, and thus will (well, may) be looked at when dealing
+	 * with the control data. For the same reason, we do not even look at
+	 * UDS_HAS_FDS here. The recorded position is that of the length
+	 * byte in front of the credentials, not of the credentials proper.
+	 */
+	if (*segflags & UDS_HAS_CRED) {
+		*credpos = pos;
+
+		pos = uds_fetch(uds, pos, &lenbyte, 1);
+		pos = uds_advance(pos, (size_t)lenbyte);
+	}
+
+	/*
+	 * Copy out the source address, but only if the (datagram) socket is
+	 * not connected. TODO: even when it is connected, it may still
+	 * receive packets sent to it from other sockets *before* being
+	 * connected, and the receiver has no way of knowing that those packets
+	 * did not come from its new peer. Ideally, the older packets should
+	 * be dropped..
+	 *
+	 * NOTE(review): the path is handed to uds_make_addr() as a direct
+	 * pointer into uds_buf, which presumes that the stored path is
+	 * contiguous in the buffer array. Confirm that this is guaranteed.
+	 */
+	if (*segflags & UDS_HAS_PATH) {
+		pos = uds_fetch(uds, pos, &lenbyte, 1);
+
+		if (uds_get_type(uds) == SOCK_DGRAM && !uds_has_link(uds))
+			uds_make_addr((const char *)&uds->uds_buf[pos],
+			    (size_t)lenbyte, addr, addr_len);
+
+		pos = uds_advance(pos, (size_t)lenbyte);
+	}
+
+	/*
+	 * We can receive no more data than those that are present in the
+	 * segment, obviously. For stream-type sockets, any more data that
+	 * could have been received along with the current data would have been
+	 * merged in the current segment, so we need not search for any next
+	 * segments.
+	 *
+	 * For non-stream sockets, the caller may receive less than a whole
+	 * packet if it supplied a small buffer. In that case, the rest of the
+	 * packet will be discarded (but not here yet!) and the caller gets
+	 * the MSG_TRUNC flag in its result, if it was using recvmsg(2) anyway.
+	 */
+	if (len > *datalen)
+		len = *datalen;
+	else if (len < *datalen && uds_get_type(uds) != SOCK_STREAM)
+		*rflags |= MSG_TRUNC;
+
+	/*
+	 * Copy out the data to the caller. If the data run past the end of
+	 * the buffer array, copy them out in two parts, with the second part
+	 * taken from the start of the array.
+	 */
+	if (len > 0) {
+		iov[0].iov_addr = (vir_bytes)&uds->uds_buf[pos];
+		left = UDS_BUF - pos;
+
+		if (left < len) {
+			iov[0].iov_size = left;
+			iov[1].iov_addr = (vir_bytes)&uds->uds_buf[0];
+			iov[1].iov_size = len - left;
+			iovcnt = 2;
+		} else {
+			iov[0].iov_size = len;
+			iovcnt = 1;
+		}
+
+		if ((r = sockdriver_vcopyout(data, off, iov, iovcnt)) != OK)
+			return r;
+	}
+
+	*reslen = len;
+	assert(seglen > 0 && seglen <= INT_MAX);
+	return (int)seglen;
+}
+
+/*
+ * The current segment has associated file descriptors. If possible, copy out
+ * all file descriptors to the receiver, and generate and copy out a chunk of
+ * control data that contains their file descriptor numbers. If not all
+ * file descriptors fit in the receiver's buffer, or if any error occurs, no
+ * file descriptors are copied out. Return the number of control bytes
+ * produced on success, zero if the chunk does not fit in the 'ctl_len' bytes
+ * that are left, or a negative error code on failure.
+ */
+static int
+uds_recv_fds(struct udssock * uds, const struct sockdriver_data * ctl,
+	socklen_t ctl_len, socklen_t ctl_off, endpoint_t user_endpt, int flags)
+{
+	struct msghdr msghdr;
+	struct cmsghdr *cmsg;
+	struct uds_fd *ufd;
+	unsigned int i, nfds;
+	socklen_t chunklen, chunkspace;
+	int r, fd, what;
+
+	/*
+	 * See how many file descriptors should be part of this chunk. That
+	 * count is kept in the first file descriptor object of the group.
+	 */
+	assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
+	ufd = SIMPLEQ_FIRST(&uds->uds_fds);
+	nfds = ufd->ufd_count;
+	assert(nfds > 0);
+
+	/*
+	 * We produce and copy out potentially unaligned chunks, using
+	 * CMSG_LEN, but return the aligned size at the end, using CMSG_SPACE.
+	 * This may leave "gap" bytes unchanged in userland, but that should
+	 * not be a problem. By producing unaligned chunks, we eliminate a
+	 * potential boundary case where the unaligned chunk passed in (by the
+	 * sender) no longer fits in the same buffer after being aligned here.
+	 */
+	chunklen = CMSG_LEN(sizeof(int) * nfds);
+	chunkspace = CMSG_SPACE(sizeof(int) * nfds);
+	assert(chunklen <= sizeof(uds_ctlbuf));
+	if (chunklen > ctl_len)
+		return 0; /* chunk would not fit, so produce nothing instead */
+	if (chunkspace > ctl_len)
+		chunkspace = ctl_len;
+
+	memset(&msghdr, 0, sizeof(msghdr));
+	msghdr.msg_control = uds_ctlbuf;
+	msghdr.msg_controllen = sizeof(uds_ctlbuf);
+
+	memset(uds_ctlbuf, 0, chunklen);
+	cmsg = CMSG_FIRSTHDR(&msghdr);
+	cmsg->cmsg_len = chunklen;
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+
+	/*
+	 * Copy the group's local file descriptors to the target endpoint, and
+	 * store the resulting remote file descriptors in the chunk buffer.
+	 * The in-flight file descriptor objects themselves are left as is
+	 * here, since the receive call may still fail.
+	 */
+	r = OK;
+
+	for (i = 0; i < nfds; i++) {
+		assert(ufd != SIMPLEQ_END(&uds->uds_fds));
+		assert(i == 0 || ufd->ufd_count == 0);
+
+		what = COPYFD_TO;
+		if (flags & MSG_CMSG_CLOEXEC)
+			what |= COPYFD_CLOEXEC;
+
+		/* Failure may happen legitimately here (e.g., EMFILE). */
+		if ((r = copyfd(user_endpt, ufd->ufd_fd, what)) < 0)
+			break; /* we keep our progress so far in 'i' */
+
+		fd = r;
+
+		dprintf(("UDS: copied out fd %d -> %d\n", ufd->ufd_fd, fd));
+
+		memcpy(&((int *)CMSG_DATA(cmsg))[i], &fd, sizeof(int));
+
+		ufd = SIMPLEQ_NEXT(ufd, ufd_next);
+	}
+
+	/* If everything went well so far, copy out the produced chunk. */
+	if (r >= 0)
+		r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen);
+
+	/*
+	 * Handle errors. At this point, the 'i' variable contains the number
+	 * of file descriptors that have already been successfully copied out.
+	 */
+	if (r < 0) {
+		/* Revert the successful copyfd() calls made so far. */
+		while (i-- > 0) {
+			memcpy(&fd, &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
+
+			(void)copyfd(user_endpt, fd, COPYFD_CLOSE);
+		}
+
+		return r;
+	}
+
+	/*
+	 * Success. Return the aligned size of the produced chunk, if the
+	 * given length permits it. From here on, the receive call may no
+	 * longer fail, as that would result in lost file descriptors.
+	 */
+	return chunkspace;
+}
+
+/*
+ * Generate and copy out a chunk of control data with the sender's credentials.
+ * Return the aligned chunk size on success, zero if the chunk does not fit in
+ * the 'ctl_len' bytes that are left, or a negative error code on failure.
+ */
+static int
+uds_recv_cred(struct udssock * uds, const struct sockdriver_data * ctl,
+	socklen_t ctl_len, socklen_t ctl_off, size_t credpos)
+{
+	struct msghdr msghdr;
+	struct cmsghdr *cmsg;
+	socklen_t chunklen, chunkspace;
+	unsigned char lenbyte;
+	size_t credlen;
+	int r;
+
+	/*
+	 * Since the sender side already did the hard work of producing the
+	 * (variable-size) sockcred structure as it should be received, there
+	 * is relatively little work to be done here. The credentials are
+	 * preceded by a single byte that contains their length.
+	 */
+	credpos = uds_fetch(uds, credpos, &lenbyte, 1);
+	credlen = (size_t)lenbyte;
+
+	chunklen = CMSG_LEN(credlen);
+	chunkspace = CMSG_SPACE(credlen);
+	assert(chunklen <= sizeof(uds_ctlbuf));
+	if (chunklen > ctl_len)
+		return 0; /* chunk would not fit, so produce nothing instead */
+	if (chunkspace > ctl_len)
+		chunkspace = ctl_len;
+
+	memset(&msghdr, 0, sizeof(msghdr));
+	msghdr.msg_control = uds_ctlbuf;
+	msghdr.msg_controllen = sizeof(uds_ctlbuf);
+
+	memset(uds_ctlbuf, 0, chunklen);
+	cmsg = CMSG_FIRSTHDR(&msghdr);
+	cmsg->cmsg_len = chunklen;
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_CREDS;
+
+	uds_fetch(uds, credpos, CMSG_DATA(cmsg), credlen);
+
+	if ((r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen)) != OK)
+		return r;
+
+	return chunkspace;
+}
+
+/*
+ * Copy out control data for the ancillary data associated with the current
+ * segment, if any. Return OK on success, at which point the current receive
+ * call may no longer fail. 'rflags' may be updated with additional result
+ * flags. Return a negative error code on failure.
+ */
+static int
+uds_recv_ctl(struct udssock * uds, const struct sockdriver_data * ctl,
+	socklen_t ctl_len, socklen_t * ctl_off, endpoint_t user_endpt,
+	int flags, unsigned int segflags, size_t credpos, int * rflags)
+{
+	int chunk;
+
+	/*
+	 * Control data go out in a fixed order: first a single SCM_RIGHTS
+	 * chunk with all file descriptors of the segment (even if the sender
+	 * spread them over several SCM_RIGHTS chunks; we believe this causes
+	 * no application-level issues, and it can be changed later with some
+	 * effort if it does), then the credentials, if any.
+	 *
+	 * Each chunk is produced, copied out, and recovered from errors on
+	 * its own.  If producing or copying out the first chunk fails, the
+	 * whole recvmsg(2) call fails with an appropriate error.  If that
+	 * happens for a later chunk, the call still returns the chunks that
+	 * were generated before it (a "short control read", so to speak),
+	 * along with the MSG_CTRUNC flag.  This keeps things simple, and it
+	 * guarantees that we can always copy out at least as many file
+	 * descriptors as were copied in for this segment, even if there are
+	 * credentials too.  The price is a bit of extra overhead for calls
+	 * with more than one chunk, as each chunk is copied out separately.
+	 *
+	 * The generated SCM_RIGHTS chunk is never larger than the one that
+	 * was originally received, so it always fits in the temporary
+	 * "uds_ctlbuf" buffer in its entirety.  SCM_CREDS chunks should
+	 * always fit easily as well.
+	 *
+	 * MSG_CTRUNC is returned iff the user-given control buffer was not
+	 * filled entirely and not all control chunks were delivered.  Our
+	 * current implementation does not deliver partial chunks.  NetBSD
+	 * does, except for SCM_RIGHTS chunks.
+	 *
+	 * TODO: get rid of the redundancy in processing return values.
+	 */
+	if (segflags & UDS_HAS_FDS) {
+		chunk = uds_recv_fds(uds, ctl, ctl_len, *ctl_off, user_endpt,
+		    flags);
+
+		/*
+		 * A positive result is the number of control bytes that the
+		 * added chunk takes up.  A zero result means that the chunk
+		 * did not fit, and a negative result is an error; in both of
+		 * those cases the chunk was not added.
+		 */
+		if (chunk <= 0) {
+			/* An error on the very first chunk fails the call. */
+			if (chunk < 0 && *ctl_off == 0)
+				return chunk;
+
+			*rflags |= MSG_CTRUNC;
+		} else {
+			ctl_len -= chunk;
+			*ctl_off += chunk;
+		}
+	}
+
+	if (segflags & UDS_HAS_CRED) {
+		chunk = uds_recv_cred(uds, ctl, ctl_len, *ctl_off, credpos);
+
+		/* The result is processed as for file descriptors. */
+		if (chunk <= 0) {
+			if (chunk < 0 && *ctl_off == 0)
+				return chunk;
+
+			*rflags |= MSG_CTRUNC;
+		} else {
+			ctl_len -= chunk;
+			*ctl_off += chunk;
+		}
+	}
+
+	return OK;
+}
+
+/*
+ * The current receive request is successful or, in the case of MSG_WAITALL,
+ * has made progress. Advance the receive buffer tail, either by discarding
+ * the entire tail segment or by generating a new, smaller tail segment that
+ * contains only the regular data left to be received from the original tail
+ * segment. Also wake up the sending side for connection-oriented sockets if
+ * applicable, because there may now be room for more data to be sent. Update
+ * 'may_block' if we are now sure that the call may not block on MSG_WAITALL
+ * after all.
+ */
+static void
+uds_recv_advance(struct udssock * uds, size_t seglen, size_t datalen,
+	size_t reslen, unsigned int segflags, int * may_block)
+{
+	struct udssock *conn;
+	struct uds_fd *ufd;
+	size_t delta, nseglen, advance;
+	unsigned int nfds;
+
+	/* Note that 'reslen' may be legitimately zero. */
+	assert(reslen <= datalen);
+
+	if (uds_get_type(uds) != SOCK_STREAM && reslen < datalen)
+		reslen = datalen;
+
+	delta = datalen - reslen;
+
+	if (delta == 0) {
+		/*
+		 * Fully consume the tail segment. We advance the tail by the
+		 * full segment length, thus moving up to either the next
+		 * segment in the receive buffer, or an empty receive buffer.
+ */ + advance = seglen; + + uds->uds_tail = uds_advance(uds->uds_tail, advance); + } else { + /* + * Partially consume the tail segment. We put a new segment + * header right in front of the remaining data, which obviously + * always fits. Since any ancillary data was consumed along + * with the first data byte of the segment, the new segment has + * no ancillary data anymore (and thus a zero flags field). + */ + nseglen = UDS_HDRLEN + delta; + assert(nseglen < seglen); + + advance = seglen - nseglen; + + uds->uds_tail = uds_advance(uds->uds_tail, advance); + + uds_store_hdr(uds, uds->uds_tail, nseglen, delta, 0); + } + + /* + * For datagram-oriented sockets, we always consume at least a header. + * For stream-type sockets, we either consume a zero-data segment along + * with its ancillary data, or we consume at least one byte from a + * segment that does have regular data. In all other cases, the + * receive call has already been ended by now. Thus, we always advance + * the tail of the receive buffer here. + */ + assert(advance > 0); + + /* + * The receive buffer's used length (uds_len) and pointer to the + * previous segment header (uds_last) are offsets from the tail. Now + * that we have moved the tail, we need to adjust these accordingly. + * If the buffer is now empty, reset the tail to the buffer start so as + * to avoid splitting inter-process copies whenever possible. + */ + assert(uds->uds_len >= advance); + uds->uds_len -= advance; + + if (uds->uds_len == 0) + uds->uds_tail = 0; + + /* + * If uds_last is zero here, it was pointing to the segment we just + * (partially) consumed. By leaving it zero, it will still point to + * the new or next segment. + */ + if (uds->uds_last > 0) { + assert(uds->uds_len > 0); + assert(uds->uds_last >= advance); + uds->uds_last -= advance; + } + + /* + * If there were any file descriptors associated with this segment, + * close and free them now. 
+ */ + if (segflags & UDS_HAS_FDS) { + assert(!SIMPLEQ_EMPTY(&uds->uds_fds)); + ufd = SIMPLEQ_FIRST(&uds->uds_fds); + nfds = ufd->ufd_count; + assert(nfds > 0); + + while (nfds-- > 0) { + assert(!SIMPLEQ_EMPTY(&uds->uds_fds)); + ufd = SIMPLEQ_FIRST(&uds->uds_fds); + SIMPLEQ_REMOVE_HEAD(&uds->uds_fds, ufd_next); + + dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd)); + + closenb(ufd->ufd_fd); + + SIMPLEQ_INSERT_TAIL(&uds_freefds, ufd, ufd_next); + } + } + + /* + * If there is now any data left in the receive buffer, then there has + * been a reason that we haven't received it. For stream sockets, that + * reason is that the next segment has ancillary data. In any case, + * this means we should never block the current receive operation + * waiting for more data. Otherwise, we may block on MSG_WAITALL. + */ + if (uds->uds_len > 0) + *may_block = FALSE; + + /* + * If the (non-datagram) socket has a peer that is not shut down for + * writing, see if it can be woken up to send more data. Note that + * the event will never be processed immediately. + */ + if (uds_is_connected(uds)) { + assert(uds_get_type(uds) != SOCK_DGRAM); + + conn = uds->uds_conn; + + if (!uds_is_shutdown(conn, SFL_SHUT_WR)) + sockevent_raise(&conn->uds_sock, SEV_SEND); + } +} + +/* + * Process a receive request. Return OK if the receive request has completed + * successfully, SUSPEND if it should be tried again later, SOCKEVENT_EOF if an + * end-of-file condition is reached, or a negative error code on failure. In + * all cases, the values of 'off' and 'ctl_off' must be updated if any progress + * has been made; if either is non-zero, libsockevent will return the partial + * progress rather than an error code or EOF. 
+ */ +int +uds_recv(struct sock * sock, const struct sockdriver_data * data, size_t len, + size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len, + socklen_t * ctl_off, struct sockaddr * addr, socklen_t * addr_len, + endpoint_t user_endpt, int flags, size_t min, int * rflags) +{ + struct udssock *uds = (struct udssock *)sock; + size_t seglen, datalen, reslen = 0 /*gcc*/, credpos = 0 /*gcc*/; + unsigned int segflags; + int r, partial, may_block; + + dprintf(("UDS: recv(%d,%zu,%zu,%u,%u,0x%x)\n", + uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len, + (ctl_off != NULL) ? *ctl_off : 0, flags)); + + /* + * Start by testing whether anything can be received at all, or whether + * an error or EOF should be returned instead, or whether the receive + * call should be suspended until later otherwise. If no (regular or + * control) data can be received, or if this was a test for select, + * we bail out right after. + */ + partial = (off != NULL && *off > 0); + + if ((r = uds_recv_test(uds, len, min, partial, &may_block)) != OK) + return r; + + /* + * Copy out regular data, if any. Do this before copying out control + * data, because the latter is harder to undo on failure. This data + * copy function returns OK (0) if we are to return a result of + * zero bytes (which is *not* EOF) to the caller without doing anything + * else. The function returns a nonzero positive segment length if we + * should carry on with the receive call (as it happens, all its other + * returned values may in fact be zero). + */ + if ((r = uds_recv_data(uds, data, len, *off, addr, addr_len, rflags, + &datalen, &reslen, &segflags, &credpos)) <= 0) + return r; + seglen = (size_t)r; + + /* + * Copy out control data, if any: transfer and copy out records of file + * descriptors, and/or copy out sender credentials. This is the last + * part of the call that may fail.
+ */ + if ((r = uds_recv_ctl(uds, ctl, ctl_len, ctl_off, user_endpt, flags, + segflags, credpos, rflags)) != OK) + return r; + + /* + * Now that the call has succeeded, move the tail of the receive + * buffer, unless we were merely peeking. + */ + if (!(flags & MSG_PEEK)) + uds_recv_advance(uds, seglen, datalen, reslen, segflags, + &may_block); + else + may_block = FALSE; + + /* + * If the MSG_WAITALL flag was given, we may still have to suspend the + * call after partial success. In particular, the receive call may + * suspend after partial success if all of these conditions are met: + * + * 1) the socket is a stream-type socket; + * 2) MSG_WAITALL is set; + * 3) MSG_PEEK is not set; + * 4) MSG_DONTWAIT is not set (tested upon return); + * 5) the socket must not have a pending error (tested upon return); + * 6) the socket must not be shut down for reading (tested later); + * 7) the socket must still be connected to a peer (no EOF); + * 8) the peer must not have been shut down for writing (no EOF); + * 9) the next segment, if any, contains no ancillary data. + * + * Together, these points guarantee that the call could conceivably + * receive more after being resumed. Points 4 to 6 are covered by + * libsockevent, which will end the call even if we return SUSPEND + * here. Due to segment merging, we cover point 9 by checking that + * there is currently no next segment at all. Once a new segment + * arrives, the ancillary-data test is done then. + */ + *off += reslen; + if ((flags & MSG_WAITALL) && reslen < len && may_block) + return SUSPEND; + else + return OK; +} + +/* + * Test whether a receive request would block. The given 'min' parameter + * contains the minimum number of bytes that should be possible to receive + * without blocking (the low receive watermark). Return SUSPEND if the receive + * request would block.
Otherwise, return any other error code (including OK + * or SOCKEVENT_EOF), and if 'size' is not a NULL pointer, it should be filled + * with the number of bytes available for receipt right now (if not zero). + * Note that if 'size' is not NULL, 'min' will always be zero. + */ +int +uds_test_recv(struct sock * sock, size_t min, size_t * size) +{ + struct udssock *uds = (struct udssock *)sock; + size_t seglen; + unsigned int segflags; + int r; + + if ((r = uds_recv_test(uds, min, min, FALSE /*partial*/, + NULL /*may_block*/)) == SUSPEND) + return r; + + if (size != NULL && uds->uds_len > 0) + (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, size, + &segflags); + + return r; +} diff --git a/minix/net/uds/ioc_uds.c b/minix/net/uds/ioc_uds.c deleted file mode 100644 index 8271f4377..000000000 --- a/minix/net/uds/ioc_uds.c +++ /dev/null @@ -1,1114 +0,0 @@ -/* - * Unix Domain Sockets Implementation (PF_UNIX, PF_LOCAL) - * This code handles ioctl(2) commands to implement the socket API. - * Some helper functions are also present. - */ - -#include "uds.h" - -static int -perform_connection(devminor_t minorx, devminor_t minory, - struct sockaddr_un *addr) -{ - /* - * There are several places were a connection is established, the - * initiating call being one of accept(2), connect(2), socketpair(2). - */ - dprintf(("UDS: perform_connection(%d, %d)\n", minorx, minory)); - - /* - * Only connection-oriented types are acceptable and only equal - * types can connect to each other. - */ - if ((uds_fd_table[minorx].type != SOCK_SEQPACKET && - uds_fd_table[minorx].type != SOCK_STREAM) || - uds_fd_table[minorx].type != uds_fd_table[minory].type) - return EINVAL; - - /* Connect the pair of sockets. 
*/ - uds_fd_table[minorx].peer = minory; - uds_fd_table[minory].peer = minorx; - - /* Set the address of both sockets */ - memcpy(&uds_fd_table[minorx].addr, addr, sizeof(struct sockaddr_un)); - memcpy(&uds_fd_table[minory].addr, addr, sizeof(struct sockaddr_un)); - - return OK; -} - -static int -do_accept(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - devminor_t minorparent; /* minor number of parent (server) */ - devminor_t minorpeer; - int rc, i; - struct sockaddr_un addr; - - dprintf(("UDS: do_accept(%d)\n", minor)); - - /* - * Somewhat weird logic is used in this function, so here's an - * overview... The minor number is the server's client socket - * (the socket to be returned by accept()). The data waiting - * for us in the IO Grant is the address that the server is - * listening on. This function uses the address to find the - * server's descriptor. From there we can perform the - * connection or suspend and wait for a connect(). - */ - - /* This IOCTL must be called on a 'fresh' socket. */ - if (uds_fd_table[minor].type != -1) - return EINVAL; - - /* Get the server's address */ - if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr, - sizeof(struct sockaddr_un))) != OK) - return rc; - - /* Locate the server socket. */ - for (i = 0; i < NR_FDS; i++) { - if (uds_fd_table[i].stale == FALSE && - uds_fd_table[i].listening == TRUE && - uds_fd_table[i].addr.sun_family == AF_UNIX && - !strncmp(addr.sun_path, uds_fd_table[i].addr.sun_path, - sizeof(uds_fd_table[i].addr.sun_path))) - break; - } - - if (i == NR_FDS) - return EINVAL; - - minorparent = i; /* parent */ - - /* We are the parent's child. */ - uds_fd_table[minorparent].child = minor; - - /* - * The peer has the same type as the parent. we need to be that - * type too. - */ - uds_fd_table[minor].type = uds_fd_table[minorparent].type; - - /* Locate the peer to accept in the parent's backlog. 
*/ - minorpeer = -1; - for (i = 0; i < uds_fd_table[minorparent].backlog_size; i++) { - if (uds_fd_table[minorparent].backlog[i] != -1) { - minorpeer = uds_fd_table[minorparent].backlog[i]; - uds_fd_table[minorparent].backlog[i] = -1; - break; - } - } - - if (minorpeer == -1) { - dprintf(("UDS: do_accept(%d): suspend\n", minor)); - - /* - * There are no peers in the backlog, suspend and wait for one - * to show up. - */ - uds_fd_table[minor].suspended = UDS_SUSPENDED_ACCEPT; - - return EDONTREPLY; - } - - dprintf(("UDS: connecting %d to %d -- parent is %d\n", minor, - minorpeer, minorparent)); - - if ((rc = perform_connection(minor, minorpeer, &addr)) != OK) { - dprintf(("UDS: do_accept(%d): connection failed\n", minor)); - - return rc; - } - - uds_fd_table[minorparent].child = -1; - - /* If the peer is blocked on connect() or write(), revive the peer. */ - if (uds_fd_table[minorpeer].suspended == UDS_SUSPENDED_CONNECT || - uds_fd_table[minorpeer].suspended == UDS_SUSPENDED_WRITE) { - dprintf(("UDS: do_accept(%d): revive %d\n", minor, minorpeer)); - uds_unsuspend(minorpeer); - } - - /* See if we can satisfy an ongoing select. */ - if ((uds_fd_table[minorpeer].sel_ops & CDEV_OP_WR) && - uds_fd_table[minorpeer].size < UDS_BUF) { - /* A write on the peer is possible now. */ - chardriver_reply_select(uds_fd_table[minorpeer].sel_endpt, - minorpeer, CDEV_OP_WR); - uds_fd_table[minorpeer].sel_ops &= ~CDEV_OP_WR; - } - - return OK; -} - -static int -do_connect(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - int child, peer; - struct sockaddr_un addr; - int rc, i, j; - dev_t dev; - ino_t ino; - - dprintf(("UDS: do_connect(%d)\n", minor)); - - /* Only connection oriented sockets can connect. */ - if (uds_fd_table[minor].type != SOCK_STREAM && - uds_fd_table[minor].type != SOCK_SEQPACKET) - return EINVAL; - - /* The socket must not be connecting or connected already. 
*/ - peer = uds_fd_table[minor].peer; - if (peer != -1) { - if (uds_fd_table[peer].peer == -1) - return EALREADY; /* connecting */ - else - return EISCONN; /* connected */ - } - - if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr, - sizeof(struct sockaddr_un))) != OK) - return rc; - - if ((rc = socketpath(uds_fd_table[minor].owner, addr.sun_path, - sizeof(addr.sun_path), SPATH_CHECK, &dev, &ino)) != OK) - return rc; - - /* - * Look for a socket of the same type that is listening on the - * address we want to connect to. - */ - for (i = 0; i < NR_FDS; i++) { - if (uds_fd_table[minor].type != uds_fd_table[i].type) - continue; - if (uds_fd_table[i].listening == FALSE) - continue; - if (uds_fd_table[i].stale == TRUE) - continue; - if (uds_fd_table[i].addr.sun_family != AF_UNIX) - continue; - if (strncmp(addr.sun_path, uds_fd_table[i].addr.sun_path, - sizeof(uds_fd_table[i].addr.sun_path))) - continue; - - /* Found a matching socket. */ - break; - } - - if (i == NR_FDS) - return ECONNREFUSED; - - /* If the server is blocked on an accept, perform the connection. */ - if ((child = uds_fd_table[i].child) != -1) { - rc = perform_connection(minor, child, &addr); - - if (rc != OK) - return rc; - - uds_fd_table[i].child = -1; - - dprintf(("UDS: do_connect(%d): revive %d\n", minor, child)); - - /* Wake up the accepting party. */ - uds_unsuspend(child); - - return OK; - } - - dprintf(("UDS: adding %d to %d's backlog\n", minor, i)); - - /* Look for a free slot in the backlog. */ - rc = -1; - for (j = 0; j < uds_fd_table[i].backlog_size; j++) { - if (uds_fd_table[i].backlog[j] == -1) { - uds_fd_table[i].backlog[j] = minor; - - rc = 0; - break; - } - } - - if (rc == -1) - return ECONNREFUSED; /* backlog is full */ - - /* See if the server is blocked on select(). */ - if (uds_fd_table[i].sel_ops & CDEV_OP_RD) { - /* Satisfy a read-type select on the server. 
*/ - chardriver_reply_select(uds_fd_table[i].sel_endpt, i, - CDEV_OP_RD); - - uds_fd_table[i].sel_ops &= ~CDEV_OP_RD; - } - - /* We found our server. */ - uds_fd_table[minor].peer = i; - - memcpy(&uds_fd_table[minor].addr, &addr, sizeof(struct sockaddr_un)); - - dprintf(("UDS: do_connect(%d): suspend\n", minor)); - - /* Suspend until the server side accepts the connection. */ - uds_fd_table[minor].suspended = UDS_SUSPENDED_CONNECT; - - return EDONTREPLY; -} - -static int -do_listen(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - int rc; - int backlog_size; - - dprintf(("UDS: do_listen(%d)\n", minor)); - - /* Ensure the socket has a type and is bound. */ - if (uds_fd_table[minor].type == -1 || - uds_fd_table[minor].addr.sun_family != AF_UNIX) - return EINVAL; - - /* listen(2) supports only two socket types. */ - if (uds_fd_table[minor].type != SOCK_STREAM && - uds_fd_table[minor].type != SOCK_SEQPACKET) - return EOPNOTSUPP; - - /* - * The POSIX standard doesn't say what to do if listen() has - * already been called. Well, there isn't an errno. We silently - * let it happen, but if listen() has already been called, we - * don't allow the backlog to shrink. - */ - if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &backlog_size, - sizeof(backlog_size))) != OK) - return rc; - - if (uds_fd_table[minor].listening == FALSE) { - /* Set the backlog size to a reasonable value. */ - if (backlog_size <= 0 || backlog_size > UDS_SOMAXCONN) - backlog_size = UDS_SOMAXCONN; - - uds_fd_table[minor].backlog_size = backlog_size; - } else { - /* Allow the user to expand the backlog size. */ - if (backlog_size > uds_fd_table[minor].backlog_size && - backlog_size < UDS_SOMAXCONN) - uds_fd_table[minor].backlog_size = backlog_size; - - /* - * Don't let the user shrink the backlog_size, as we might - * have clients waiting in those slots. - */ - } - - /* This socket is now listening. 
*/ - uds_fd_table[minor].listening = TRUE; - - return OK; -} - -static int -do_socket(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - int rc, type; - - dprintf(("UDS: do_socket(%d)\n", minor)); - - /* The socket type can only be set once. */ - if (uds_fd_table[minor].type != -1) - return EINVAL; - - /* Get the requested type. */ - if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &type, - sizeof(type))) != OK) - return rc; - - /* Assign the type if it is valid only. */ - switch (type) { - case SOCK_STREAM: - case SOCK_DGRAM: - case SOCK_SEQPACKET: - uds_fd_table[minor].type = type; - return OK; - - default: - return EINVAL; - } -} - -static int -do_bind(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - struct sockaddr_un addr; - int rc, i; - dev_t dev; - ino_t ino; - - dprintf(("UDS: do_bind(%d)\n", minor)); - - /* If the type hasn't been set by do_socket() yet, OR an attempt - * to re-bind() a non-SOCK_DGRAM socket is made, fail the call. - */ - if ((uds_fd_table[minor].type == -1) || - (uds_fd_table[minor].addr.sun_family == AF_UNIX && - uds_fd_table[minor].type != SOCK_DGRAM)) - return EINVAL; - - if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr, - sizeof(struct sockaddr_un))) != OK) - return rc; - - /* Do some basic sanity checks on the address. */ - if (addr.sun_family != AF_UNIX) - return EAFNOSUPPORT; - - if (addr.sun_path[0] == '\0') - return ENOENT; - - /* Attempt to create the socket file. */ - if ((rc = socketpath(uds_fd_table[minor].owner, addr.sun_path, -#if NOT_YET - sizeof(addr.sun_path), SPATH_CREATE, &dev, &ino)) != OK) -#else - sizeof(addr.sun_path), SPATH_CHECK, &dev, &ino)) != OK) -#endif - return rc; - - /* - * It is possible that the socket path name was already in use as - * address by another socket. This means that the socket file was - * prematurely unlinked. 
In that case, mark the old socket as stale, - * so that its path name will not be matched and only the newly bound - * socket will be found in address-based searches. For now, we leave - * the old socket marked as stale for as long as it is bound to the - * same address. A more advanced implementation could establish an - * order between the sockets so that the most recently bound socket is - * found at any time, but it is doubtful whether that would be useful. - */ - for (i = 0; i < NR_FDS; i++) { - if (uds_fd_table[i].stale == FALSE && - uds_fd_table[i].addr.sun_family == AF_UNIX && - !strncmp(addr.sun_path, uds_fd_table[i].addr.sun_path, - sizeof(uds_fd_table[i].addr.sun_path))) { -#if NOT_YET - uds_fd_table[i].stale = TRUE; -#else - return EADDRINUSE; -#endif - } - } - - /* Looks good, perform the bind(). */ - uds_fd_table[minor].stale = FALSE; - memcpy(&uds_fd_table[minor].addr, &addr, sizeof(struct sockaddr_un)); - - return OK; -} - -static int -do_getsockname(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - dprintf(("UDS: do_getsockname(%d)\n", minor)); - - /* - * Unconditionally send the address we have assigned to this socket. - * The POSIX standard doesn't say what to do if the address hasn't been - * set. If the address isn't currently set, then the user will get - * NULL bytes. Note: libc depends on this behavior. - */ - return sys_safecopyto(endpt, grant, 0, - (vir_bytes) &uds_fd_table[minor].addr, sizeof(struct sockaddr_un)); -} - -static int -do_getpeername(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - int peer_minor; - - dprintf(("UDS: do_getpeername(%d)\n", minor)); - - /* Check that the socket is connected with a valid peer. */ - if (uds_fd_table[minor].peer != -1) { - peer_minor = uds_fd_table[minor].peer; - - /* Copy the address from the peer. 
*/ - return sys_safecopyto(endpt, grant, 0, - (vir_bytes) &uds_fd_table[peer_minor].addr, - sizeof(struct sockaddr_un)); - } else if (uds_fd_table[minor].err == ECONNRESET) { - uds_fd_table[minor].err = 0; - - return ECONNRESET; - } else - return ENOTCONN; -} - -static int -do_shutdown(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - int rc, how; - - dprintf(("UDS: do_shutdown(%d)\n", minor)); - - /* The socket must be connection oriented. */ - if (uds_fd_table[minor].type != SOCK_STREAM && - uds_fd_table[minor].type != SOCK_SEQPACKET) - return EINVAL; - - if (uds_fd_table[minor].peer == -1) { - /* shutdown(2) is only valid for connected sockets. */ - if (uds_fd_table[minor].err == ECONNRESET) - return ECONNRESET; - else - return ENOTCONN; - } - - /* Get the 'how' parameter from the caller. */ - if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &how, - sizeof(how))) != OK) - return rc; - - switch (how) { - case SHUT_RD: /* Take away read permission. */ - uds_fd_table[minor].mode &= ~UDS_R; - break; - - case SHUT_WR: /* Take away write permission. */ - uds_fd_table[minor].mode &= ~UDS_W; - break; - - case SHUT_RDWR: /* Shut down completely. */ - uds_fd_table[minor].mode = 0; - break; - - default: - return EINVAL; - } - - return OK; -} - -static int -do_socketpair(devminor_t minorx, endpoint_t endpt, cp_grant_id_t grant) -{ - int rc; - dev_t minorin; - devminor_t minory; - struct sockaddr_un addr; - - dprintf(("UDS: do_socketpair(%d)\n", minorx)); - - /* The ioctl argument is the minor number of the second socket. */ - if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &minorin, - sizeof(minorin))) != OK) - return rc; - - minory = minor(minorin); - - dprintf(("UDS: socketpair(%d, %d,)\n", minorx, minory)); - - /* Security check: both sockets must have the same owner endpoint. 
*/ - if (uds_fd_table[minorx].owner != uds_fd_table[minory].owner) - return EPERM; - - addr.sun_family = AF_UNIX; - addr.sun_path[0] = 'X'; - addr.sun_path[1] = '\0'; - - return perform_connection(minorx, minory, &addr); -} - -static int -do_getsockopt_sotype(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - dprintf(("UDS: do_getsockopt_sotype(%d)\n", minor)); - - /* If the type hasn't been set yet, we fail the call. */ - if (uds_fd_table[minor].type == -1) - return EINVAL; - - return sys_safecopyto(endpt, grant, 0, - (vir_bytes) &uds_fd_table[minor].type, sizeof(int)); -} - -static int -do_getsockopt_peercred(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - int peer_minor; - int rc; - struct uucred cred; - - dprintf(("UDS: do_getsockopt_peercred(%d)\n", minor)); - - if (uds_fd_table[minor].peer == -1) { - if (uds_fd_table[minor].err == ECONNRESET) { - uds_fd_table[minor].err = 0; - - return ECONNRESET; - } else - return ENOTCONN; - } - - peer_minor = uds_fd_table[minor].peer; - - /* - * Obtain the peer's credentials and copy them out. Ignore failures; - * in that case, the caller will simply get no credentials. 
- */ - memset(&cred, 0, sizeof(cred)); - cred.cr_uid = -1; - cred.cr_gid = -1; - (void)getepinfo(uds_fd_table[peer_minor].owner, &cred.cr_uid, - &cred.cr_gid); - - return sys_safecopyto(endpt, grant, 0, (vir_bytes) &cred, - sizeof(struct uucred)); -} - -static int -do_getsockopt_sndbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - size_t sndbuf = UDS_BUF; - - dprintf(("UDS: do_getsockopt_sndbuf(%d)\n", minor)); - - return sys_safecopyto(endpt, grant, 0, (vir_bytes) &sndbuf, - sizeof(sndbuf)); -} - -static int -do_setsockopt_sndbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - int rc; - size_t sndbuf; - - dprintf(("UDS: do_setsockopt_sndbuf(%d)\n", minor)); - - if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &sndbuf, - sizeof(sndbuf))) != OK) - return rc; - - /* The send buffer is limited to 32KB at the moment. */ - if (sndbuf > UDS_BUF) - return ENOSYS; - - /* FIXME: actually shrink the buffer. */ - return OK; -} - -static int -do_getsockopt_rcvbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - size_t rcvbuf = UDS_BUF; - - dprintf(("UDS: do_getsockopt_rcvbuf(%d)\n", minor)); - - return sys_safecopyto(endpt, grant, 0, (vir_bytes) &rcvbuf, - sizeof(rcvbuf)); -} - -static int -do_setsockopt_rcvbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - int rc; - size_t rcvbuf; - - dprintf(("UDS: do_setsockopt_rcvbuf(%d)\n", minor)); - - if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &rcvbuf, - sizeof(rcvbuf))) != OK) - return rc; - - /* The receive buffer is limited to 32KB at the moment. */ - if (rcvbuf > UDS_BUF) - return ENOSYS; - - /* FIXME: actually shrink the buffer. */ - return OK; -} - -static int -do_sendto(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - int rc; - struct sockaddr_un addr; - dev_t dev; - ino_t ino; - - dprintf(("UDS: do_sendto(%d)\n", minor)); - - /* This IOCTL is only for SOCK_DGRAM sockets. 
*/ - if (uds_fd_table[minor].type != SOCK_DGRAM) - return EINVAL; - - if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr, - sizeof(struct sockaddr_un))) != OK) - return rc; - - /* Do some basic sanity checks on the address. */ - if (addr.sun_family != AF_UNIX || addr.sun_path[0] == '\0') - return EINVAL; - - if ((rc = socketpath(uds_fd_table[minor].owner, addr.sun_path, - sizeof(addr.sun_path), SPATH_CHECK, &dev, &ino)) != OK) - return rc; - - memcpy(&uds_fd_table[minor].target, &addr, sizeof(struct sockaddr_un)); - - return OK; -} - -static int -do_recvfrom(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - dprintf(("UDS: do_recvfrom(%d)\n", minor)); - - return sys_safecopyto(endpt, grant, 0, - (vir_bytes) &uds_fd_table[minor].source, - sizeof(struct sockaddr_un)); -} - -static int -send_fds(devminor_t minor, struct msg_control *msg_ctrl, - struct ancillary *data) -{ - int i, rc, nfds, totalfds; - endpoint_t from_ep; - struct msghdr msghdr; - struct cmsghdr *cmsg = NULL; - - dprintf(("UDS: send_fds(%d)\n", minor)); - - from_ep = uds_fd_table[minor].owner; - - /* Obtain this socket's credentials. 
*/ - if ((rc = getepinfo(from_ep, &data->cred.uid, &data->cred.gid)) < 0) - return rc; - - dprintf(("UDS: minor=%d cred={%d,%d}\n", minor, - data->cred.uid, data->cred.gid)); - - totalfds = data->nfiledes; - - memset(&msghdr, '\0', sizeof(struct msghdr)); - msghdr.msg_control = msg_ctrl->msg_control; - msghdr.msg_controllen = msg_ctrl->msg_controllen; - - for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL; - cmsg = CMSG_NXTHDR(&msghdr, cmsg)) { - if (cmsg->cmsg_level != SOL_SOCKET || - cmsg->cmsg_type != SCM_RIGHTS) - continue; - - nfds = MIN((cmsg->cmsg_len-CMSG_LEN(0))/sizeof(int), OPEN_MAX); - - for (i = 0; i < nfds; i++) { - if (totalfds == OPEN_MAX) - return EOVERFLOW; - - data->fds[totalfds] = ((int *) CMSG_DATA(cmsg))[i]; - dprintf(("UDS: minor=%d fd[%d]=%d\n", minor, totalfds, - data->fds[totalfds])); - totalfds++; - } - } - - for (i = data->nfiledes; i < totalfds; i++) { - if ((rc = copyfd(from_ep, data->fds[i], COPYFD_FROM)) < 0) { - printf("UDS: copyfd(COPYFD_FROM) failed: %d\n", rc); - - /* Revert the successful copyfd() calls made so far. */ - for (i--; i >= data->nfiledes; i--) - close(data->fds[i]); - - return rc; - } - - dprintf(("UDS: send_fds(): %d -> %d\n", data->fds[i], rc)); - - data->fds[i] = rc; /* this is now the local FD */ - } - - data->nfiledes = totalfds; - - return OK; -} - -/* - * This function calls close() for all of the FDs in flight. This is used - * when a Unix Domain Socket is closed and there exists references to file - * descriptors that haven't been received with recvmsg(). 
- */ -int -uds_clear_fds(devminor_t minor, struct ancillary *data) -{ - int i; - - dprintf(("UDS: uds_clear_fds(%d)\n", minor)); - - for (i = 0; i < data->nfiledes; i++) { - dprintf(("UDS: uds_clear_fds() => %d\n", data->fds[i])); - - close(data->fds[i]); - - data->fds[i] = -1; - } - - data->nfiledes = 0; - - return OK; -} - -static int -recv_fds(devminor_t minor, struct ancillary *data, - struct msg_control *msg_ctrl) -{ - int rc, i, j, fds[OPEN_MAX]; - struct msghdr msghdr; - struct cmsghdr *cmsg; - endpoint_t to_ep; - - dprintf(("UDS: recv_fds(%d)\n", minor)); - - msghdr.msg_control = msg_ctrl->msg_control; - msghdr.msg_controllen = msg_ctrl->msg_controllen; - - cmsg = CMSG_FIRSTHDR(&msghdr); - cmsg->cmsg_len = CMSG_LEN(sizeof(int) * data->nfiledes); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - - to_ep = uds_fd_table[minor].owner; - - /* Copy to the target endpoint. */ - for (i = 0; i < data->nfiledes; i++) { - if ((rc = copyfd(to_ep, data->fds[i], COPYFD_TO)) < 0) { - printf("UDS: copyfd(COPYFD_TO) failed: %d\n", rc); - - /* Revert the successful copyfd() calls made so far. */ - for (i--; i >= 0; i--) - (void) copyfd(to_ep, fds[i], COPYFD_CLOSE); - - return rc; - } - - fds[i] = rc; /* this is now the remote FD */ - } - - /* Close the local copies only once the entire procedure succeeded. 
*/ - for (i = 0; i < data->nfiledes; i++) { - dprintf(("UDS: recv_fds(): %d -> %d\n", data->fds[i], fds[i])); - - ((int *)CMSG_DATA(cmsg))[i] = fds[i]; - - close(data->fds[i]); - - data->fds[i] = -1; - } - - data->nfiledes = 0; - - return OK; -} - -static int -recv_cred(devminor_t minor, struct ancillary *data, - struct msg_control *msg_ctrl) -{ - struct msghdr msghdr; - struct cmsghdr *cmsg; - struct uucred *cred; - - dprintf(("UDS: recv_cred(%d)\n", minor)); - - msghdr.msg_control = msg_ctrl->msg_control; - msghdr.msg_controllen = msg_ctrl->msg_controllen; - - cmsg = CMSG_FIRSTHDR(&msghdr); - if (cmsg->cmsg_len > 0) - cmsg = CMSG_NXTHDR(&msghdr, cmsg); - - cmsg->cmsg_len = CMSG_LEN(sizeof(struct uucred)); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_CREDS; - cred = (struct uucred *)CMSG_DATA(cmsg); - memset(cred, 0, sizeof(*cred)); - cred->cr_uid = data->cred.uid; - cred->cr_gid = data->cred.gid; - - return OK; -} - -static int -do_sendmsg(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - int peer, rc, i; - struct msg_control msg_ctrl; - - dprintf(("UDS: do_sendmsg(%d)\n", minor)); - - memset(&msg_ctrl, '\0', sizeof(struct msg_control)); - - if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &msg_ctrl, - sizeof(struct msg_control))) != OK) - return rc; - - /* Locate the peer. */ - peer = -1; - if (uds_fd_table[minor].type == SOCK_DGRAM) { - if (uds_fd_table[minor].target.sun_path[0] == '\0' || - uds_fd_table[minor].target.sun_family != AF_UNIX) - return EDESTADDRREQ; - - for (i = 0; i < NR_FDS; i++) { - /* - * Look for a SOCK_DGRAM socket that is bound on the - * target address. 
- */ - if (uds_fd_table[i].type == SOCK_DGRAM && - uds_fd_table[i].stale == FALSE && - uds_fd_table[i].addr.sun_family == AF_UNIX && - !strncmp(uds_fd_table[minor].target.sun_path, - uds_fd_table[i].addr.sun_path, - sizeof(uds_fd_table[i].addr.sun_path))) { - peer = i; - break; - } - } - - if (peer == -1) - return ENOENT; - } else { - peer = uds_fd_table[minor].peer; - if (peer == -1) - return ENOTCONN; - } - - dprintf(("UDS: sendmsg(%d) -- peer=%d\n", minor, peer)); - - /* - * Note: it's possible that there is already some file descriptors in - * ancillary_data if the peer didn't call recvmsg() yet. That's okay. - * The receiver will get the current file descriptors plus the new - * ones. - */ - return send_fds(minor, &msg_ctrl, &uds_fd_table[peer].ancillary_data); -} - -static int -do_recvmsg(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - int rc; - struct msg_control msg_ctrl; - socklen_t clen_avail = 0; - socklen_t clen_needed = 0; - socklen_t clen_desired = 0; - - dprintf(("UDS: do_recvmsg(%d)\n", minor)); - dprintf(("UDS: minor=%d credentials={uid:%d,gid:%d}\n", minor, - uds_fd_table[minor].ancillary_data.cred.uid, - uds_fd_table[minor].ancillary_data.cred.gid)); - - memset(&msg_ctrl, '\0', sizeof(struct msg_control)); - - /* - * Get the msg_control from the user. It will include the - * amount of space the user has allocated for control data. 
- */ - if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &msg_ctrl, - sizeof(struct msg_control))) != OK) - return rc; - - clen_avail = MIN(msg_ctrl.msg_controllen, MSG_CONTROL_MAX); - - if (uds_fd_table[minor].ancillary_data.nfiledes > 0) { - clen_needed = CMSG_SPACE(sizeof(int) * - uds_fd_table[minor].ancillary_data.nfiledes); - } - - /* if there is room we also include credentials */ - clen_desired = clen_needed + CMSG_SPACE(sizeof(struct uucred)); - - if (clen_needed > clen_avail) - return EOVERFLOW; - - if (uds_fd_table[minor].ancillary_data.nfiledes > 0) { - if ((rc = recv_fds(minor, &uds_fd_table[minor].ancillary_data, - &msg_ctrl)) != OK) - return rc; - } - - if (clen_desired <= clen_avail) { - rc = recv_cred(minor, &uds_fd_table[minor].ancillary_data, - &msg_ctrl); - if (rc != OK) - return rc; - msg_ctrl.msg_controllen = clen_desired; - } else - msg_ctrl.msg_controllen = clen_needed; - - /* Send the control data to the user. */ - return sys_safecopyto(endpt, grant, 0, (vir_bytes) &msg_ctrl, - sizeof(struct msg_control)); -} - -static int -do_fionread(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant) -{ - int rc; - - rc = uds_perform_read(minor, NONE, GRANT_INVALID, UDS_BUF, 1); - - /* What should we do on error? Just set to zero for now. */ - if (rc < 0) - rc = 0; - - return sys_safecopyto(endpt, grant, 0, (vir_bytes) &rc, sizeof(rc)); -} - -int -uds_do_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt, - cp_grant_id_t grant) -{ - int rc; - - switch (request) { - case NWIOSUDSCONN: - /* Connect to a listening socket -- connect(). */ - rc = do_connect(minor, endpt, grant); - - break; - - case NWIOSUDSACCEPT: - /* Accept an incoming connection -- accept(). */ - rc = do_accept(minor, endpt, grant); - - break; - - case NWIOSUDSBLOG: - /* - * Set the backlog_size and put the socket into the listening - * state -- listen(). 
- */ - rc = do_listen(minor, endpt, grant); - - break; - - case NWIOSUDSTYPE: - /* Set the SOCK_ type for this socket -- socket(). */ - rc = do_socket(minor, endpt, grant); - - break; - - case NWIOSUDSADDR: - /* Set the address for this socket -- bind(). */ - rc = do_bind(minor, endpt, grant); - - break; - - case NWIOGUDSADDR: - /* Get the address for this socket -- getsockname(). */ - rc = do_getsockname(minor, endpt, grant); - - break; - - case NWIOGUDSPADDR: - /* Get the address for the peer -- getpeername(). */ - rc = do_getpeername(minor, endpt, grant); - - break; - - case NWIOSUDSSHUT: - /* - * Shut down a socket for reading, writing, or both -- - * shutdown(). - */ - rc = do_shutdown(minor, endpt, grant); - - break; - - case NWIOSUDSPAIR: - /* Connect two sockets -- socketpair(). */ - rc = do_socketpair(minor, endpt, grant); - - break; - - case NWIOGUDSSOTYPE: - /* Get socket type -- getsockopt(SO_TYPE). */ - rc = do_getsockopt_sotype(minor, endpt, grant); - - break; - - case NWIOGUDSPEERCRED: - /* Get peer endpoint -- getsockopt(SO_PEERCRED). */ - rc = do_getsockopt_peercred(minor, endpt, grant); - - break; - - case NWIOSUDSTADDR: - /* Set target address -- sendto(). */ - rc = do_sendto(minor, endpt, grant); - - break; - - case NWIOGUDSFADDR: - /* Get from address -- recvfrom(). */ - rc = do_recvfrom(minor, endpt, grant); - - break; - - case NWIOGUDSSNDBUF: - /* Get the send buffer size -- getsockopt(SO_SNDBUF). */ - rc = do_getsockopt_sndbuf(minor, endpt, grant); - - break; - - case NWIOSUDSSNDBUF: - /* Set the send buffer size -- setsockopt(SO_SNDBUF). */ - rc = do_setsockopt_sndbuf(minor, endpt, grant); - - break; - - case NWIOGUDSRCVBUF: - /* Get the send buffer size -- getsockopt(SO_SNDBUF). */ - rc = do_getsockopt_rcvbuf(minor, endpt, grant); - - break; - - case NWIOSUDSRCVBUF: - /* Set the send buffer size -- setsockopt(SO_SNDBUF). 
*/ - rc = do_setsockopt_rcvbuf(minor, endpt, grant); - - break; - - case NWIOSUDSCTRL: - /* Set the control data -- sendmsg(). */ - rc = do_sendmsg(minor, endpt, grant); - - break; - - case NWIOGUDSCTRL: - /* Set the control data -- recvmsg(). */ - rc = do_recvmsg(minor, endpt, grant); - - break; - - case FIONREAD: - /* - * Get the number of bytes immediately available for reading. - */ - rc = do_fionread(minor, endpt, grant); - - break; - - default: - /* - * The IOCTL command is not valid for /dev/uds -- this happens - * a lot and is normal. A lot of libc functions determine the - * socket type with IOCTLs. Any unrecognized requests simply - * get an ENOTTY response. - */ - - rc = ENOTTY; - } - - return rc; -} diff --git a/minix/net/uds/stat.c b/minix/net/uds/stat.c new file mode 100644 index 000000000..2759f6318 --- /dev/null +++ b/minix/net/uds/stat.c @@ -0,0 +1,186 @@ +/* UNIX Domain Sockets - stat.c - network status */ + +#include "uds.h" +#include +#include + +/* + * Fill the given 'ki' structure with information about the socket 'uds'. + */ +static void +uds_get_info(struct kinfo_pcb * ki, const struct udssock * uds) +{ + struct udssock *peer; + socklen_t len; + int type; + + type = uds_get_type(uds); + peer = uds_get_peer(uds); + + ki->ki_pcbaddr = (uint64_t)(uintptr_t)uds; + ki->ki_ppcbaddr = (uint64_t)(uintptr_t)uds; + ki->ki_sockaddr = (uint64_t)(uintptr_t)&uds->uds_sock; + ki->ki_family = AF_UNIX; + ki->ki_type = type; + ki->ki_protocol = UDSPROTO_UDS; + ki->ki_pflags = 0; + if (uds->uds_flags & UDSF_CONNWAIT) + ki->ki_pflags |= UNP_CONNWAIT; + if (uds->uds_flags & UDSF_PASSCRED) + ki->ki_pflags |= UNP_WANTCRED; + if (type != SOCK_DGRAM && uds->uds_cred.unp_pid != -1) { + if (uds_is_listening(uds)) + ki->ki_pflags |= UNP_EIDSBIND; + else if (uds_is_connecting(uds) || uds_is_connected(uds)) + ki->ki_pflags |= UNP_EIDSVALID; + } + /* Not sure about NetBSD connection states. First attempt here. 
*/ + if (uds_is_connecting(uds)) + ki->ki_sostate = SS_ISCONNECTING; + else if (uds_is_connected(uds)) + ki->ki_sostate = SS_ISCONNECTED; + else if (uds_is_disconnected(uds)) + ki->ki_sostate = SS_ISDISCONNECTED; + ki->ki_rcvq = uds->uds_len; + /* We currently mirror the peer's receive queue size when connected. */ + if (uds_is_connected(uds)) + ki->ki_sndq = peer->uds_len; + /* The source is not set for bound connection-type sockets here. */ + if (type == SOCK_DGRAM || uds_is_listening(uds)) + uds_make_addr(uds->uds_path, (size_t)uds->uds_pathlen, + &ki->ki_src, &len); + if (peer != NULL) + uds_make_addr(peer->uds_path, (size_t)peer->uds_pathlen, + &ki->ki_dst, &len); + /* TODO: we should set ki_inode and ki_vnode, but to what? */ + ki->ki_conn = (uint64_t)(uintptr_t)peer; + if (!TAILQ_EMPTY(&uds->uds_queue)) + ki->ki_refs = + (uint64_t)(uintptr_t)TAILQ_FIRST(&uds->uds_queue); + if (uds_has_link(uds)) + ki->ki_nextref = + (uint64_t)(uintptr_t)TAILQ_NEXT(uds, uds_next); +} + +/* + * Remote MIB implementation of CTL_NET PF_LOCAL {SOCK_STREAM,SOCK_DGRAM, + * SOCK_SEQPACKET} 0. This function handles all queries on the + * "net.local.{stream,dgram,seqpacket}.pcblist" sysctl(7) nodes. + * + * The 0 for "pcblist" is a MINIXism: we use it to keep our arrays small. + * NetBSD numbers these nodes dynamically and so they have numbers above + * CREATE_BASE. That also means that no userland application can possibly + * hardcode their numbers, and must perform lookups by name. In turn, that + * means that we can safely change the 0 to another number if NetBSD ever + * introduces statically numbered nodes in these subtrees. + */ +static ssize_t +net_local_pcblist(struct rmib_call * call, struct rmib_node * node __unused, + struct rmib_oldp * oldp, struct rmib_newp * newp __unused) +{ + struct udssock *uds; + struct kinfo_pcb ki; + ssize_t off; + int r, type, size, max; + + if (call->call_namelen != 4) + return EINVAL; + + /* The first two added name fields are not used. 
*/ + + size = call->call_name[2]; + if (size < 0 || (size_t)size > sizeof(ki)) + return EINVAL; + if (size == 0) + size = sizeof(ki); + max = call->call_name[3]; + + type = call->call_oname[2]; + + off = 0; + + for (uds = uds_enum(NULL, type); uds != NULL; + uds = uds_enum(uds, type)) { + if (rmib_inrange(oldp, off)) { + memset(&ki, 0, sizeof(ki)); + + uds_get_info(&ki, uds); + + if ((r = rmib_copyout(oldp, off, &ki, size)) < 0) + return r; + } + + off += size; + if (max > 0 && --max == 0) + break; + } + + /* + * Margin to limit the possible effects of the inherent race condition + * between receiving just the data size and receiving the actual data. + */ + if (oldp == NULL) + off += PCB_SLOP * size; + + return off; +} + +/* The CTL_NET PF_LOCAL SOCK_STREAM subtree. */ +static struct rmib_node net_local_stream_table[] = { + [0] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist, + "pcblist", "SOCK_STREAM protocol control block list"), +}; + +/* The CTL_NET PF_LOCAL SOCK_DGRAM subtree. */ +static struct rmib_node net_local_dgram_table[] = { + [0] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist, + "pcblist", "SOCK_DGRAM protocol control block list"), +}; + +/* The CTL_NET PF_LOCAL SOCK_SEQPACKET subtree. */ +static struct rmib_node net_local_seqpacket_table[] = { + [0] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist, + "pcblist", "SOCK_SEQPACKET protocol control block list"), +}; + +/* The CTL_NET PF_LOCAL subtree. */ +static struct rmib_node net_local_table[] = { +/* 1*/ [SOCK_STREAM] = RMIB_NODE(RMIB_RO, net_local_stream_table, + "stream", "SOCK_STREAM settings"), +/* 2*/ [SOCK_DGRAM] = RMIB_NODE(RMIB_RO, net_local_dgram_table, + "dgram", "SOCK_DGRAM settings"), +/* 5*/ [SOCK_SEQPACKET] = RMIB_NODE(RMIB_RO, net_local_seqpacket_table, + "seqpacket", "SOCK_SEQPACKET settings"), +}; + +static struct rmib_node net_local_node = + RMIB_NODE(RMIB_RO, net_local_table, "local", "PF_LOCAL related settings"); + +/* + * Initialize the status module. 
+ */ +void +uds_stat_init(void) +{ + const int mib[] = { CTL_NET, PF_LOCAL }; + int r; + + /* + * Register our own "net.local" subtree with the MIB service. + * + * This call only returns local failures. Remote failures (in the MIB + * service) are silently ignored. So, we can safely panic on failure. + */ + if ((r = rmib_register(mib, __arraycount(mib), &net_local_node)) != OK) + panic("UDS: unable to register remote MIB tree: %d", r); +} + +/* + * Clean up the status module. + */ +void +uds_stat_cleanup(void) +{ + + rmib_deregister(&net_local_node); +} diff --git a/minix/net/uds/uds.8 b/minix/net/uds/uds.8 deleted file mode 100644 index 2484ea709..000000000 --- a/minix/net/uds/uds.8 +++ /dev/null @@ -1,15 +0,0 @@ -.TH UDS 8 -.SH NAME -uds \- unix domain sockets device -.SH DESCRIPTION -The \fIuds\fP device gives access to the unix domain socket services in -Minix. It is a virtual device similar to the \fItcp\fP and \fIudp\fP -Internet Protocol server devices. -.SH SEE ALSO -.BR socket(2), -.BR socketpair(2), -.BR dev(4), -.BR ip(4), -.BR unix(8) -.SH HISTORY -This device first appeared in Minix 3.1.8. diff --git a/minix/net/uds/uds.c b/minix/net/uds/uds.c index baca3c1ed..2052a2ac2 100644 --- a/minix/net/uds/uds.c +++ b/minix/net/uds/uds.c @@ -1,740 +1,1376 @@ -/* - * Unix Domain Sockets Implementation (PF_UNIX, PF_LOCAL) - * This code handles requests generated by operations on /dev/uds - * - * The interface to UNIX domain sockets is similar to the interface to network - * sockets. There is a character device (/dev/uds) and this server is a - * 'driver' for that device. 
- */ +/* UNIX Domain Sockets - uds.c - socket management */ #include "uds.h" -static ssize_t uds_perform_write(devminor_t, endpoint_t, cp_grant_id_t, size_t, - int); +static struct udssock uds_array[NR_UDSSOCK]; +static TAILQ_HEAD(uds_freelist, udssock) uds_freelist; +static unsigned int uds_in_use; +static int uds_running; -static int uds_open(devminor_t, int, endpoint_t); -static int uds_close(devminor_t); -static ssize_t uds_read(devminor_t, u64_t, endpoint_t, cp_grant_id_t, size_t, - int, cdev_id_t); -static ssize_t uds_write(devminor_t, u64_t, endpoint_t, cp_grant_id_t, size_t, - int, cdev_id_t); -static int uds_ioctl(devminor_t, unsigned long, endpoint_t, cp_grant_id_t, int, - endpoint_t, cdev_id_t); -static int uds_cancel(devminor_t, endpoint_t, cdev_id_t); -static int uds_select(devminor_t, unsigned int, endpoint_t); +static const struct sockevent_ops uds_ops; -static struct chardriver uds_tab = { - .cdr_open = uds_open, - .cdr_close = uds_close, - .cdr_read = uds_read, - .cdr_write = uds_write, - .cdr_ioctl = uds_ioctl, - .cdr_cancel = uds_cancel, - .cdr_select = uds_select -}; +static SLIST_HEAD(udshash, udssock) udshash[UDSHASH_SLOTS]; -/* File Descriptor Table */ -uds_fd_t uds_fd_table[NR_FDS]; - -static unsigned int uds_exit_left; - -static int -uds_open(devminor_t UNUSED(orig_minor), int access, - endpoint_t user_endpt) +/* + * Initialize file-to-socket hash table. + */ +static void +udshash_init(void) { - devminor_t minor; - char *buf; - int i; + unsigned int slot; - dprintf(("UDS: uds_open() from %d\n", user_endpt)); - - /* - * Find a slot in the descriptor table for the new descriptor. - * The index of the descriptor in the table will be returned. - * Subsequent calls to read/write/close/ioctl/etc will use this - * minor number. The minor number must be different from the - * the /dev/uds device's minor number (0). 
- */ - for (minor = 1; minor < NR_FDS; minor++) - if (uds_fd_table[minor].state == UDS_FREE) - break; - - if (minor == NR_FDS) - return ENFILE; - - /* - * Allocate memory for the ringer buffer. In order to save on memory - * in the common case, the buffer is allocated only when the socket is - * in use. We use mmap instead of malloc to allow the memory to be - * actually freed later. - */ - if ((buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE, - MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED) - return ENOMEM; - - /* - * Allocate the socket, and set its initial parameters. - */ - uds_fd_table[minor].state = UDS_INUSE; - uds_fd_table[minor].owner = user_endpt; - uds_fd_table[minor].sel_endpt = NONE; - uds_fd_table[minor].sel_ops = 0; - uds_fd_table[minor].buf = buf; - uds_fd_table[minor].pos = 0; - uds_fd_table[minor].size = 0; - uds_fd_table[minor].mode = UDS_R | UDS_W; - uds_fd_table[minor].type = -1; - - for (i = 0; i < UDS_SOMAXCONN; i++) - uds_fd_table[minor].backlog[i] = -1; - uds_fd_table[minor].backlog_size = UDS_SOMAXCONN; - - memset(&uds_fd_table[minor].ancillary_data, '\0', - sizeof(struct ancillary)); - for (i = 0; i < OPEN_MAX; i++) - uds_fd_table[minor].ancillary_data.fds[i] = -1; - - uds_fd_table[minor].stale = FALSE; - uds_fd_table[minor].listening = FALSE; - uds_fd_table[minor].peer = -1; - uds_fd_table[minor].child = -1; - - memset(&uds_fd_table[minor].addr, '\0', sizeof(struct sockaddr_un)); - memset(&uds_fd_table[minor].source, '\0', sizeof(struct sockaddr_un)); - memset(&uds_fd_table[minor].target, '\0', sizeof(struct sockaddr_un)); - - uds_fd_table[minor].suspended = UDS_NOT_SUSPENDED; - - return CDEV_CLONED | minor; + for (slot = 0; slot < __arraycount(udshash); slot++) + SLIST_INIT(&udshash[slot]); } -static void -uds_reset(devminor_t minor) +/* + * Return a hash table slot number for the given pair. + */ +static unsigned int +udshash_slot(dev_t dev, ino_t ino) { - /* Disconnect the socket from its peer. 
*/ - uds_fd_table[minor].peer = -1; - /* Set an error to pass to the caller. */ - uds_fd_table[minor].err = ECONNRESET; + assert(dev != NO_DEV); + assert(ino != 0); - /* If a process was blocked on I/O, revive it. */ - if (uds_fd_table[minor].suspended != UDS_NOT_SUSPENDED) - uds_unsuspend(minor); + /* + * Effectively combining two 64-bit numbers into a single 6-or-so-bit + * hash is not too easy. This hash function is probably among the + * worst options. Then again it is not all that critical as we are not + * expecting that many bound UDS sockets in the system anyway. + */ + return (unsigned int)(dev ^ ino) % UDSHASH_SLOTS; +} - /* All of the peer's calls will fail immediately now. */ - if (uds_fd_table[minor].sel_ops != 0) { - chardriver_reply_select(uds_fd_table[minor].sel_endpt, minor, - uds_fd_table[minor].sel_ops); - uds_fd_table[minor].sel_ops = 0; +/* + * Look for a socket that is bound to the given pair. Return a + * pointer to the socket if found, or NULL otherwise. + */ +static struct udssock * +udshash_get(dev_t dev, ino_t ino) +{ + struct udssock *uds; + unsigned int slot; + + slot = udshash_slot(dev, ino); + + SLIST_FOREACH(uds, &udshash[slot], uds_hash) { + if (uds->uds_dev == dev && uds->uds_ino == ino) + return uds; + } + + return NULL; +} + +/* + * Add a socket to the file-to-socket hash table. The socket must have its + * device and inode fields set, and must not be in the hash table already. + */ +static void +udshash_add(struct udssock * uds) +{ + unsigned int slot; + + slot = udshash_slot(uds->uds_dev, uds->uds_ino); + + SLIST_INSERT_HEAD(&udshash[slot], uds, uds_hash); +} + +/* + * Remove a socket from the file-to-socket hash table. The socket must be in + * the hash table. + */ +static void +udshash_del(struct udssock * uds) +{ + unsigned int slot; + + slot = udshash_slot(uds->uds_dev, uds->uds_ino); + + /* This macro is O(n). 
*/ + SLIST_REMOVE(&udshash[slot], uds, udssock, uds_hash); +} + +/* + * Return the socket identifier for the given UDS socket object. + */ +sockid_t +uds_get_id(struct udssock * uds) +{ + + return (sockid_t)(uds - uds_array); +} + +/* + * Given either NULL or a previously returned socket, return the next in-use + * UDS socket of the given socket type, or NULL if there are no more matches. + * The sockets are returned in random order, but each matching socket is + * returned exactly once (until any socket is allocated or freed). + */ +struct udssock * +uds_enum(struct udssock * prev, int type) +{ + sockid_t id; + + if (prev != NULL) + id = uds_get_id(prev) + 1; + else + id = 0; + + for (; id < NR_UDSSOCK; id++) + if ((uds_array[id].uds_flags & UDSF_IN_USE) && + uds_get_type(&uds_array[id]) == type) + return &uds_array[id]; + + return NULL; +} + +/* + * Invalidate credentials on the socket. + */ +static void +uds_clear_cred(struct udssock * uds) +{ + + uds->uds_cred.unp_pid = -1; + uds->uds_cred.unp_euid = -1; + uds->uds_cred.unp_egid = -1; +} + +/* + * Obtain the credentials (process, user, and group ID) of the given user + * endpoint and associate them with the socket for later retrieval. It is + * important to note that this information is obtained once at connect time, + * and never updated later. The party receiving the credentials must take this + * into account. + */ +static void +uds_get_cred(struct udssock * uds, endpoint_t user_endpt) +{ + int r; + + if ((uds->uds_cred.unp_pid = r = getepinfo(user_endpt, + &uds->uds_cred.unp_euid, &uds->uds_cred.unp_egid)) < 0) { + printf("UDS: failed obtaining credentials of %d (%d)\n", + user_endpt, r); + + uds_clear_cred(uds); } } +/* + * Allocate and initialize a UDS socket. On success, return OK with a pointer + * to the new socket in 'udsp'. On failure, return a negative error code.
+ */ static int -uds_close(devminor_t minor) +uds_alloc(struct udssock ** udsp) { - int i, peer; + struct udssock *uds; + int r; - dprintf(("UDS: uds_close(%d)\n", minor)); + /* Allocate, initialize, and return a UNIX domain socket object. */ + if (TAILQ_EMPTY(&uds_freelist)) + return ENOBUFS; - if (minor < 0 || minor >= NR_FDS) return ENXIO; + uds = TAILQ_FIRST(&uds_freelist); - if (uds_fd_table[minor].state != UDS_INUSE) + uds->uds_conn = NULL; /* not connected */ + uds->uds_link = NULL; /* not connecting or linked */ + uds->uds_queued = 0; + uds->uds_flags = UDSF_IN_USE; /* may be found through enumeration */ + uds->uds_pathlen = 0; /* not bound: no path */ + uds->uds_dev = NO_DEV; /* not hashed: no socket file device */ + uds->uds_ino = 0; /* not hashed: no socket file inode */ + uds_clear_cred(uds); /* no bind/connect-time credentials */ + TAILQ_INIT(&uds->uds_queue); /* an empty queue */ + + if ((r = uds_io_setup(uds)) != OK) + return r; + + TAILQ_REMOVE(&uds_freelist, uds, uds_next); + + assert(uds_in_use < NR_UDSSOCK); + uds_in_use++; + + *udsp = uds; + return OK; +} + +/* + * Free a previously allocated socket. + */ +static void +uds_free(struct sock * sock) +{ + struct udssock *uds = (struct udssock *)sock; + + uds_io_cleanup(uds); + + uds->uds_flags = 0; /* no longer in use */ + + TAILQ_INSERT_HEAD(&uds_freelist, uds, uds_next); + + assert(uds_in_use > 0); + if (--uds_in_use == 0 && uds_running == FALSE) + sef_cancel(); +} + +/* + * Create a new socket. + */ +static sockid_t +uds_socket(int domain, int type, int protocol, endpoint_t user_endpt __unused, + struct sock ** sockp, const struct sockevent_ops ** ops) +{ + struct udssock *uds; + int r; + + dprintf(("UDS: socket(%d,%d,%d)\n", domain, type, protocol)); + + if (domain != PF_UNIX) { + /* This means the service was configured incorrectly. */ + printf("UDS: got request for domain %d\n", domain); + + return EAFNOSUPPORT; + } + + /* We support the following three socket types. 
*/ + switch (type) { + case SOCK_STREAM: + case SOCK_SEQPACKET: + case SOCK_DGRAM: + break; + default: + return EPROTOTYPE; + } + + /* + * The PF_UNIX domain does not support particular protocols, so the + * given protocol must be zero (= anything that matches). + */ + if (protocol != UDSPROTO_UDS) + return EPROTONOSUPPORT; + + if ((r = uds_alloc(&uds)) != OK) + return r; + + dprintf(("UDS: socket returns %d\n", uds_get_id(uds))); + + *sockp = &uds->uds_sock; + *ops = &uds_ops; + return uds_get_id(uds); +} + +/* + * Connect a pair of sockets. + */ +static int +uds_pair(struct sock * sock1, struct sock * sock2, endpoint_t user_endpt) +{ + struct udssock *uds1 = (struct udssock *)sock1; + struct udssock *uds2 = (struct udssock *)sock2; + + dprintf(("UDS: pair(%d,%d)\n", uds_get_id(uds1), uds_get_id(uds2))); + + /* Only connection-oriented types are acceptable. */ + if (uds_get_type(uds1) == SOCK_DGRAM) + return EOPNOTSUPP; + + /* Connect the sockets. */ + uds1->uds_conn = uds2; + uds2->uds_conn = uds1; + uds1->uds_flags |= UDSF_CONNECTED; + uds2->uds_flags |= UDSF_CONNECTED; + + /* Obtain the (same) credentials for both sides of the connection. */ + uds_get_cred(uds1, user_endpt); + memcpy(&uds2->uds_cred, &uds1->uds_cred, sizeof(uds2->uds_cred)); + + return OK; +} + +/* + * Disconnect a UDS socket, notifying or freeing up the other end of the + * connection depending on whether the socket was linked, that is, on the + * accept queue of a listening socket. + */ +static void +uds_disconnect(struct udssock * uds, int was_linked) +{ + struct udssock *conn; + + assert(uds_is_connected(uds)); + assert(uds_has_conn(uds)); + + conn = uds->uds_conn; + + assert(uds_is_connected(conn)); + assert(uds_has_conn(conn)); + assert(!uds_has_link(conn)); + assert(conn->uds_conn == uds); + + /* Disconnect the sockets. 
*/ + uds->uds_conn = NULL; + conn->uds_conn = NULL; + + /* + * If the given socket is linked, then it is a connected socket for + * which the other end has been created but not yet accepted. In that + * case, the other end ('conn') will have to be freed up. Otherwise, + * it is a regular user-created socket and we must properly transition + * it into disconnected state. + */ + if (!was_linked) { + sockevent_raise(&conn->uds_sock, SEV_SEND | SEV_RECV); + + /* + * Clear the peer credentials so that they will not be mistaken + * for having been obtained at bind time. + */ + uds_clear_cred(conn); + } else + sockevent_raise(&conn->uds_sock, SEV_CLOSE); +} + +/* + * Add the socket 'link' to the queue of the socket 'uds'. This also implies + * that 'link's link socket is set to 'uds'. + */ +static void +uds_add_queue(struct udssock * uds, struct udssock * link) +{ + + dprintf(("UDS: add_queue(%d,%d)\n", + uds_get_id(uds), uds_get_id(link))); + + TAILQ_INSERT_TAIL(&uds->uds_queue, link, uds_next); + + uds->uds_queued++; + assert(uds->uds_queued != 0); + + link->uds_link = uds; +} + +/* + * Remove the socket 'link' from the queue of the socket 'uds'. This also + * resets 'link's link to NULL. + */ +static void +uds_del_queue(struct udssock * uds, struct udssock * link) +{ + + dprintf(("UDS: del_queue(%d,%d)\n", + uds_get_id(uds), uds_get_id(link))); + + assert(link->uds_link == uds); + + TAILQ_REMOVE(&uds->uds_queue, link, uds_next); + + assert(uds->uds_queued > 0); + uds->uds_queued--; + + link->uds_link = NULL; +} + +/* + * Remove all sockets from the queue of the socket 'uds', with the exception of + * 'except' if non-NULL. Raise an ECONNRESET error on all removed sockets that + * are not equal to 'uds'. + */ +static void +uds_clear_queue(struct udssock * uds, struct udssock * except) +{ + struct udssock *link, *tmp; + int found; + + dprintf(("UDS: clear_queue(%d,%d)\n", + uds_get_id(uds), (except != NULL) ?
uds_get_id(except) : -1)); + + found = 0; + + /* + * Abort all connecting sockets queued on this socket, except for the + * given exception, which may be NULL. + */ + TAILQ_FOREACH_SAFE(link, &uds->uds_queue, uds_next, tmp) { + if (link == except) { + found++; + + continue; + } + + dprintf(("UDS: clear_queue removes %d\n", uds_get_id(link))); + + assert(uds_get_type(link) == SOCK_DGRAM || + uds_is_connecting(link) || uds_is_connected(link)); + + uds_del_queue(uds, link); + + /* + * Generate an error only if the socket was not linked to + * itself (only datagram sockets can be linked to themselves). + * The error is not helpful for applications in that case. + */ + if (uds != link) + sockevent_set_error(&link->uds_sock, ECONNRESET); + + /* + * If this is a listening socket, disconnect the connecting or + * connected end. If a connected peer was already created for + * the queued socket, dispose of that peer. + * + * Clear credentials obtained when starting to connect (in + * which case the socket is always a connection-oriented + * socket), so that they will not be mistaken for credentials + * obtained at bind time. + */ + if (uds_get_type(link) != SOCK_DGRAM) { + if (uds_is_connected(link)) + uds_disconnect(link, TRUE /*was_linked*/); + else + uds_clear_cred(link); + } + } + + assert(uds->uds_queued == found); +} + +/* + * Check whether the socket address given in 'addr', with length 'addr_len', is + * a valid UNIX domain socket address (including a path to a socket file). On + * success, return the (non-zero) length of the socket file's path, minus the + * null terminator which may in fact not be present. The caller is responsible + * for copying and terminating the path as needed. A pointer to the path as + * stored in 'addr' is returned in 'pathp'. On failure, return an error code. 
+ */ +static int +uds_check_addr(const struct sockaddr * addr, socklen_t addr_len, + const char ** pathp) +{ + const char *p; + size_t len; + + /* + * We could cast to a sockaddr_un structure pointer first, but that + * would not provide any benefits here. Instead, we use sa_data as the + * generic equivalent of sun_path. + */ + if (addr_len < offsetof(struct sockaddr, sa_data)) return EINVAL; - peer = uds_fd_table[minor].peer; - if (peer != -1 && uds_fd_table[peer].peer == -1) { - /* Connecting socket: clear from server's backlog. */ - if (!uds_fd_table[peer].listening) - panic("connecting socket attached to non-server"); + if (addr->sa_family != AF_UNIX) + return EAFNOSUPPORT; - for (i = 0; i < uds_fd_table[peer].backlog_size; i++) { - if (uds_fd_table[peer].backlog[i] == minor) { - uds_fd_table[peer].backlog[i] = -1; - break; - } - } - } else if (peer != -1) { - /* Connected socket: disconnect it. */ - uds_reset(peer); - } else if (uds_fd_table[minor].listening) { - /* Listening socket: disconnect all sockets in the backlog. */ - for (i = 0; i < uds_fd_table[minor].backlog_size; i++) - if (uds_fd_table[minor].backlog[i] != -1) - uds_reset(uds_fd_table[minor].backlog[i]); + len = (size_t)addr_len - offsetof(struct sockaddr, sa_data); + if (len > 0 && (p = memchr(addr->sa_data, '\0', len)) != NULL) + len = (size_t)(p - addr->sa_data); + + /* The given path name must not be an empty string. */ + if (len == 0) + return ENOENT; + + /* This check should be redundant but better safe than sorry. */ + if (len >= UDS_PATH_MAX) + return EINVAL; + + *pathp = (const char *)addr->sa_data; + return len; +} + +/* + * Given the socket file path given as 'path' with length 'path_len' (not + * necessarily null terminated), store a socket address with the path in + * 'addr', and return the socket address length in 'addr_len'. 
The calling + * libraries (libsockdriver, libsockevent) and the static assert in uds.h + * guarantee that 'addr' is sufficiently large to store any address we generate + * here. The libraries may subsequently copy out only a part of it to the user + * process. This function always succeeds. + */ +void +uds_make_addr(const char * path, size_t len, struct sockaddr * addr, + socklen_t * addr_len) +{ + + /* + * Generate the address. The stored length (sa_len/sun_len) does not + * include a null terminator. The entire structure does include a null + * terminator, but only if the socket is bound. + */ + addr->sa_len = offsetof(struct sockaddr, sa_data) + len; + addr->sa_family = AF_UNIX; + if (len > 0) { + /* This call may (intentionally) overrun the sa_data size. */ + memcpy((char *)addr->sa_data, path, len); + ((char *)addr->sa_data)[len] = '\0'; + + /* The socket is bound, so include the null terminator. */ + len++; + assert(len <= UDS_PATH_MAX); } - if (uds_fd_table[minor].ancillary_data.nfiledes > 0) - uds_clear_fds(minor, &uds_fd_table[minor].ancillary_data); + /* Note that this length may be different from sa_len/sun_len now. */ + *addr_len = offsetof(struct sockaddr, sa_data) + len; +} - /* Release the memory for the ring buffer. */ - munmap(uds_fd_table[minor].buf, UDS_BUF); +/* + * Bind a socket to a local address. + */ +static int +uds_bind(struct sock * sock, const struct sockaddr * addr, socklen_t addr_len, + endpoint_t user_endpt) +{ + struct udssock *uds = (struct udssock *)sock; + struct udssock *uds2; + const char *path; + size_t len; + dev_t dev; + ino_t ino; + int r; - /* Set the socket back to its original UDS_FREE state. */ - memset(&uds_fd_table[minor], '\0', sizeof(uds_fd_t)); + dprintf(("UDS: bind(%d)\n", uds_get_id(uds))); - /* If terminating, and this was the last open socket, exit now. */ - if (uds_exit_left > 0) { - if (--uds_exit_left == 0) - chardriver_terminate(); + /* A socket may be bound at any time, but only once. 
*/ + if (uds_is_bound(uds)) + return EINVAL; + + /* Verify that the user gave us an acceptable address. */ + if ((r = uds_check_addr(addr, addr_len, &path)) < 0) + return r; + len = (size_t)r; + + /* Attempt to create the socket file on the file system. */ + r = socketpath(user_endpt, path, len, SPATH_CREATE, &dev, &ino); + if (r != OK) + return r; + assert(dev != NO_DEV && ino != 0); + + /* + * It is possible that a socket file of a previously bound socket was + * unlinked, and due to inode number reuse, a new socket file has now + * been created with the same (dev, ino) pair. In that case, we must + * unbind the old socket, because it must no longer be found. The old + * socket will still have a path (and behave as though it is bound) but + * no longer be found through hash lookups. + */ + if ((uds2 = udshash_get(dev, ino)) != NULL) { + udshash_del(uds2); + + uds2->uds_dev = NO_DEV; + uds2->uds_ino = 0; + } + + /* + * Obtain credentials for the socket, unless the socket is already + * connecting or connected, in which case we must not replace the + * credentials we obtained already. We later clear those credentials + * upon a connection failure or disconnect, so that if the socket is + * then put in listening mode, we know there are no bind-time + * credentials. Not ideal, but we really need two separate sets of + * credentials if we want to get this right, which is a waste of memory + * as no sane application writer would ever rely on credential passing + * after recycling a socket. + */ + if (uds_get_type(uds) != SOCK_DGRAM && !uds_is_connecting(uds) && + !uds_is_connected(uds)) + uds_get_cred(uds, user_endpt); + + /* Assign the address to the socket. */ + uds->uds_pathlen = len; + memcpy(&uds->uds_path, path, len); + uds->uds_dev = dev; + uds->uds_ino = ino; + + udshash_add(uds); + + return OK; +} + +/* + * Look up a UDS socket based on a user-given address. If a socket exists for + * the address, check if it is type-compatible with the given UDS socket.
+ * On success, return OK, with 'peerp' set to the socket that was found. On + * failure, return a negative error code. + */ +int +uds_lookup(struct udssock * uds, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp) +{ + struct udssock *peer; + const char *path; + size_t len; + dev_t dev; + ino_t ino; + int r; + + /* Verify that the user gave us an acceptable address. */ + if ((r = uds_check_addr(addr, addr_len, &path)) < 0) + return r; + len = (size_t)r; + + /* Attempt to look up the socket file on the file system. */ + r = socketpath(user_endpt, path, len, SPATH_CHECK, &dev, &ino); + if (r != OK) + return r; + assert(dev != NO_DEV && ino != 0); + + if ((peer = udshash_get(dev, ino)) == NULL) + return ECONNREFUSED; + if (uds_get_type(peer) != uds_get_type(uds)) + return EPROTOTYPE; + + *peerp = peer; + return OK; +} + +/* + * Given the listening socket 'uds', and the socket 'link' that is calling or + * has called connect(2) and is or will be linked to the listening socket's + * queue, create a new socket and connect it to 'link', putting both sockets in + * the connected state. The given link socket may be in unconnected, + * connecting, or disconnected state prior to the call. Return OK or an error + * code. The link state of the link socket remains unchanged in any case. + */ +static int +uds_attach(struct udssock * uds, struct udssock * link) +{ + struct udssock *conn; + int r; + + /* + * Allocate a new socket to use as peer socket for the connection that + * is about to be established. The new socket is not yet known by + * libsockevent. + */ + if ((r = uds_alloc(&conn)) != OK) + return r; + + /* + * Ask libsockevent to clone the sock object in the new UDS socket from + * the listening socket. This adds the sock object to libsockevent's + * data structures and ensures that we can safely use the socket + * despite the fact that it has not yet been accepted (and thus + * returned to libsockevent).
From this moment on, we must either + * return the socket's ID (but not a pointer to it!) from uds_accept() + * or raise SEV_CLOSE on it. + */ + sockevent_clone(&uds->uds_sock, &conn->uds_sock, uds_get_id(conn)); + + /* Connect the link socket to the new socket. */ + link->uds_conn = conn; + link->uds_flags |= UDSF_CONNECTED; + + /* + * Connect the new socket to the link socket as well. The child + * socket should also inherit pretty much all settings from the + * listening socket, including the bind path and the listening socket's + * bind-time credentials. + */ + conn->uds_conn = link; + conn->uds_flags = uds->uds_flags & (UDSF_PASSCRED | UDSF_CONNWAIT); + conn->uds_flags |= UDSF_CONNECTED; + conn->uds_pathlen = uds->uds_pathlen; + memcpy(conn->uds_path, uds->uds_path, (size_t)uds->uds_pathlen); + memcpy(&conn->uds_cred, &uds->uds_cred, sizeof(conn->uds_cred)); + + return OK; +} + +/* + * Connect a socket to a remote address. + */ +static int +uds_connect(struct sock * sock, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt) +{ + struct udssock *uds = (struct udssock *)sock; + struct udssock *link; + int r; + + dprintf(("UDS: connect(%d)\n", uds_get_id(uds))); + + /* For connection-oriented sockets, several state checks apply. */ + if (uds_get_type(uds) != SOCK_DGRAM) { + if (uds_is_listening(uds)) + return EOPNOTSUPP; + if (uds_is_connecting(uds)) + return EALREADY; + if (uds_is_connected(uds)) + return EISCONN; + /* Disconnected sockets may be reconnected, see below. */ + } else { + /* + * Connectionless sockets may be unconnected by providing an + * address with family AF_UNSPEC. Handle this case first here. + */ + if (addr_len >= offsetof(struct sockaddr, sa_data) && + addr->sa_family == AF_UNSPEC) { + /* + * Reset this socket's previous connection to another + * socket, if any. Unconnecting has no effect on other + * sockets connected to this socket, though. 
+ */ + if (uds_has_link(uds)) + uds_del_queue(uds->uds_link, uds); + + return OK; + } + } + + /* + * Find the socket identified by the given address. If it exists at + * all, see if it is a proper match. + */ + if ((r = uds_lookup(uds, addr, addr_len, user_endpt, &link)) != OK) + return r; + + /* + * Handle connectionless sockets first, in which case a connect links + * the socket to a send target and limits receipt to datagrams from + * that target. We actually point the socket to the peer socket, + * through uds_link. That also means that if the target socket + * disappears, we have to reset any sockets connected to it, in which + * case we return them to the unconnected state. In order to allow + * finding all sockets connected to a particular socket, we put all + * those sockets on their target's queue, hence why we use uds_link and + * not uds_conn. As mentioned before, we allow reconnecting without + * restrictions. + * TODO: see if reconnecting should clear a pending ECONNRESET. + * + * An important note: 'uds' and 'link' may actually be the same socket, + * if the caller chooses to connect a socket with itself! + */ + if (uds_get_type(uds) == SOCK_DGRAM) { + /* Reconnecting to the same socket has no effect. */ + if (uds_has_link(uds) && uds->uds_link == link) + return OK; + + /* + * If the intended target is linked to another socket, we + * refuse linking to it. Sending or receiving would never work + * anyway. Do allow a socket to link to itself after being + * linked to another socket. The error code is the same as in + * the sending code, borrowed from Linux. + */ + if (uds != link && uds_has_link(link) && link->uds_link != uds) + return EPERM; + + /* + * Reset this socket's previous link to another socket, if any. + */ + if (uds_has_link(uds)) + uds_del_queue(uds->uds_link, uds); + + /* + * Reset any links to this socket, except for the one by + * the intended target. Sending or receiving would no longer + * work anyway. 
If the socket was linked to itself, clear its + * self-link without generating an ECONNRESET. If the socket + * is relinking to itself, reestablish the link after first + * clearing it. + */ + uds_clear_queue(uds, (uds != link) ? link : NULL); + + uds_add_queue(link, uds); + + return OK; + } + + /* + * For connection-oriented sockets there is more to do. First, make + * sure that the peer is a listening socket, that it has not been shut + * down, and that its backlog is not already at the configured maximum. + */ + if (!uds_is_listening(link)) + return ECONNREFUSED; + + if (uds_is_shutdown(link, SFL_SHUT_RD | SFL_SHUT_WR)) + return ECONNREFUSED; + + if (link->uds_queued >= link->uds_backlog) + return ECONNREFUSED; + + /* + * The behavior of connect(2) now depends on whether LOCAL_CONNWAIT is + * set on either the connecting or the listening socket. If it is not, + * the socket will be connected to a new as-yet invisible socket, which + * will be the one returned from accept(2) later. If it was, the + * socket will be put in the connecting state. + */ + if (!((uds->uds_flags | link->uds_flags) & UDSF_CONNWAIT)) { + if ((r = uds_attach(link, uds)) != OK) + return r; + + assert(uds_is_connected(uds)); + } else { + /* + * Disconnected sockets now stop being connected. Any pending + * data can still be received, though. + */ + uds->uds_flags &= ~UDSF_CONNECTED; + + r = SUSPEND; + } + + /* Obtain credentials for the socket. */ + uds_get_cred(uds, user_endpt); + + /* Add the socket at the end of the listening socket's queue. */ + uds_add_queue(link, uds); + + assert(r != SUSPEND || uds_is_connecting(uds)); + + /* + * Let an accept call handle the rest, which will in turn resume this + * connect call. The sockevent library ensures that this works even if + * the call is non-blocking. + */ + sockevent_raise(&link->uds_sock, SEV_ACCEPT); + + return r; +} + +/* + * Put a socket in listening mode. 
+ */ +static int +uds_listen(struct sock * sock, int backlog) +{ + struct udssock *uds = (struct udssock *)sock; + + /* The maximum backlog value must not exceed its field size. */ + assert(SOMAXCONN <= USHRT_MAX); + + dprintf(("UDS: listen(%d)\n", uds_get_id(uds))); + + /* Only connection-oriented types may be put in listening mode. */ + if (uds_get_type(uds) == SOCK_DGRAM) + return EOPNOTSUPP; + + /* A connecting or connected socket may not listen. */ + if (uds_is_connecting(uds) || uds_is_connected(uds)) + return EINVAL; + + /* POSIX says that this is now the appropriate error code here. */ + if (!uds_is_bound(uds)) + return EDESTADDRREQ; + + /* + * The socket is now entering the listening state. If it was + * previously disconnected, clear the connection flag. + */ + uds->uds_flags &= ~UDSF_CONNECTED; + + /* + * We do not remove sockets from the backlog if it is now being dropped + * below the current number of queued sockets. We only refuse newly + * connecting sockets beyond the backlog size. + */ + uds->uds_backlog = backlog; + + return OK; +} + +/* + * Test whether an accept request would block. Return OK if a socket could be + * accepted, an appropriate error code if an accept call would fail instantly, + * or SUSPEND if the accept request would block waiting for a connection. + */ +static int +uds_test_accept(struct sock * sock) +{ + struct udssock *uds = (struct udssock *)sock; + + /* + * Ensure that the socket is in listening mode. If not, we must return + * the error code that is appropriate for this socket type. + */ + if (uds_get_type(uds) == SOCK_DGRAM) + return EOPNOTSUPP; + if (!uds_is_listening(uds)) + return EINVAL; + + /* + * If the socket has been shut down, new connections are no longer + * accepted and accept calls no longer block. This is not a POSIX + * requirement, but rather an application convenience feature. 
+ */ + if (uds->uds_queued == 0) { + if (uds_is_shutdown(uds, SFL_SHUT_RD | SFL_SHUT_WR)) + return ECONNABORTED; + + return SUSPEND; } return OK; } -static int -uds_select(devminor_t minor, unsigned int ops, endpoint_t endpt) +/* + * Accept a connection on a listening socket, creating a new socket. On + * success, return the new socket identifier, with the new socket stored in + * 'newsockp'. Otherwise, return an error code. + */ +static sockid_t +uds_accept(struct sock * sock, struct sockaddr * addr, socklen_t * addr_len, + endpoint_t user_endpt __unused, struct sock ** newsockp) { - unsigned int ready_ops; - int i, bytes, watch; + struct udssock *uds = (struct udssock *)sock; + struct udssock *link, *conn; + sockid_t r; - dprintf(("UDS: uds_select(%d)\n", minor)); + dprintf(("UDS: accept(%d)\n", uds_get_id(uds))); - if (minor < 0 || minor >= NR_FDS) return ENXIO; - - if (uds_fd_table[minor].state != UDS_INUSE) - return EINVAL; - - watch = (ops & CDEV_NOTIFY); - ops &= (CDEV_OP_RD | CDEV_OP_WR | CDEV_OP_ERR); - - ready_ops = 0; - - /* Check if there is data available to read. */ - if (ops & CDEV_OP_RD) { - bytes = uds_perform_read(minor, NONE, GRANT_INVALID, 1, 1); - if (bytes > 0) { - ready_ops |= CDEV_OP_RD; /* data available */ - } else if (uds_fd_table[minor].listening == TRUE) { - /* Check for pending connections. */ - for (i = 0; i < uds_fd_table[minor].backlog_size; i++) - { - if (uds_fd_table[minor].backlog[i] != -1) { - ready_ops |= CDEV_OP_RD; - break; - } - } - } else if (bytes != EDONTREPLY) { - ready_ops |= CDEV_OP_RD; /* error */ - } - } - - /* Check if we can write without blocking. */ - if (ops & CDEV_OP_WR) { - bytes = uds_perform_write(minor, NONE, GRANT_INVALID, 1, 1); - if (bytes != 0 && bytes != EDONTREPLY) - ready_ops |= CDEV_OP_WR; - } - - /* - * If not all requested ops were ready, and the caller requests to be - * notified about changes, we add the remaining ops to the saved set. 
- */ - ops &= ~ready_ops; - if (ops && watch) { - uds_fd_table[minor].sel_endpt = endpt; - uds_fd_table[minor].sel_ops |= ops; - } - - return ready_ops; -} - -ssize_t -uds_perform_read(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant, - size_t size, int pretend) -{ - size_t pos, subsize; - int r, peer; - - dprintf(("UDS: uds_perform_read(%d)\n", minor)); - - peer = uds_fd_table[minor].peer; - - /* Skip reads of zero bytes. */ - if (size == 0) - return 0; - - /* Check if the socket isn't shut down for reads. */ - if (!(uds_fd_table[minor].mode & UDS_R)) - return EPIPE; - - if (uds_fd_table[minor].size == 0) { - if (peer == -1) { - /* - * We're not connected. That's only a problem when this - * socket is connection oriented. - */ - if (uds_fd_table[minor].type == SOCK_STREAM || - uds_fd_table[minor].type == SOCK_SEQPACKET) { - if (uds_fd_table[minor].err == ECONNRESET) { - if (!pretend) - uds_fd_table[minor].err = 0; - return ECONNRESET; - } else - return ENOTCONN; - } - } - - /* Check if process is reading from a closed pipe. */ - if (peer != -1 && !(uds_fd_table[peer].mode & UDS_W) && - uds_fd_table[minor].size == 0) - return 0; - - if (pretend) - return EDONTREPLY; - - if (peer != -1 && - uds_fd_table[peer].suspended == UDS_SUSPENDED_WRITE) - panic("writer blocked on empty socket"); - - dprintf(("UDS: suspending read request on %d\n", minor)); - - /* Process is reading from an empty pipe. Suspend it. */ - return EDONTREPLY; - } - - /* How much can we get from the ring buffer? */ - if (size > uds_fd_table[minor].size) - size = uds_fd_table[minor].size; - - if (pretend) - return size; - - /* Get the data from the tail of the ring buffer. 
*/ - pos = uds_fd_table[minor].pos; - - subsize = UDS_BUF - pos; - if (subsize > size) - subsize = size; - - if ((r = sys_safecopyto(endpt, grant, 0, - (vir_bytes) &uds_fd_table[minor].buf[pos], subsize)) != OK) + if ((r = uds_test_accept(sock)) != OK) return r; - if (subsize < size) { - if ((r = sys_safecopyto(endpt, grant, subsize, - (vir_bytes) uds_fd_table[minor].buf, - size - subsize)) != OK) - return r; - } + /* + * Take the first connecting socket off the listening queue. + */ + assert(!TAILQ_EMPTY(&uds->uds_queue)); - /* Advance the buffer tail. */ - uds_fd_table[minor].pos = (pos + size) % UDS_BUF; - uds_fd_table[minor].size -= size; + link = TAILQ_FIRST(&uds->uds_queue); - /* Reset position if the buffer is empty (it may save a copy call). */ - if (uds_fd_table[minor].size == 0) - uds_fd_table[minor].pos = 0; + /* + * Depending on the LOCAL_CONNWAIT setting at the time of connect(2), + * the socket may be connecting or connected. In the latter case, its + * attached socket is the socket we will return now. Otherwise we have + * to attach a socket first. + */ + assert(uds_is_connecting(link) || uds_is_connected(link)); - /* See if we can wake up a blocked writer. */ - if (peer != -1 && uds_fd_table[peer].suspended == UDS_SUSPENDED_WRITE) - uds_unsuspend(peer); - - /* See if we can satisfy an ongoing select. */ - if (peer != -1 && (uds_fd_table[peer].sel_ops & CDEV_OP_WR) && - uds_fd_table[minor].size < UDS_BUF) { - /* A write on the peer is possible now. */ - chardriver_reply_select(uds_fd_table[peer].sel_endpt, peer, - CDEV_OP_WR); - uds_fd_table[peer].sel_ops &= ~CDEV_OP_WR; - } - - return size; /* number of bytes read */ -} - -static ssize_t -uds_perform_write(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant, - size_t size, int pretend) -{ - size_t subsize, pos; - int i, r, peer; - - dprintf(("UDS: uds_perform_write(%d)\n", minor)); - - /* Skip writes of zero bytes. 
*/ - if (size == 0) - return 0; - - /* Check if the socket isn't shut down for writes. */ - if (!(uds_fd_table[minor].mode & UDS_W)) - return EPIPE; - - /* Datagram messages must fit in the buffer entirely. */ - if (size > UDS_BUF && uds_fd_table[minor].type != SOCK_STREAM) - return EMSGSIZE; - - if (uds_fd_table[minor].type == SOCK_STREAM || - uds_fd_table[minor].type == SOCK_SEQPACKET) { + if (uds_is_connecting(link)) { /* - * If we're writing to a connection-oriented socket, then it - * needs a peer to write to. For disconnected sockets, writing - * is an error; for connecting sockets, writes should suspend. + * Attach a new socket. If this fails, return the error but + * leave the connecting socket on the listening queue. */ - peer = uds_fd_table[minor].peer; - - if (peer == -1) { - if (uds_fd_table[minor].err == ECONNRESET) { - if (!pretend) - uds_fd_table[minor].err = 0; - return ECONNRESET; - } else - return ENOTCONN; - } else if (uds_fd_table[peer].peer == -1) /* connecting */ - return EDONTREPLY; - } else /* uds_fd_table[minor].type == SOCK_DGRAM */ { - peer = -1; - - /* Locate the "peer" we want to write to. */ - for (i = 0; i < NR_FDS; i++) { - /* - * Look for a SOCK_DGRAM socket that is bound on - * the target address. - */ - if (uds_fd_table[i].type == SOCK_DGRAM && - uds_fd_table[i].stale == FALSE && - uds_fd_table[i].addr.sun_family == AF_UNIX && - !strncmp(uds_fd_table[minor].target.sun_path, - uds_fd_table[i].addr.sun_path, - sizeof(uds_fd_table[i].addr.sun_path))) { - peer = i; - break; - } - } - - if (peer == -1) - return ENOENT; - } - - /* Check if we write to a closed pipe. */ - if (!(uds_fd_table[peer].mode & UDS_R)) - return EPIPE; - - /* - * We have to preserve the boundary for DGRAM. If there's already a - * packet waiting, discard it silently and pretend it was written. 
- */ - if (uds_fd_table[minor].type == SOCK_DGRAM && - uds_fd_table[peer].size > 0) - return size; - - /* - * Check if the ring buffer is already full, and if the SEQPACKET - * message wouldn't write to an empty buffer. - */ - if (uds_fd_table[peer].size == UDS_BUF || - (uds_fd_table[minor].type == SOCK_SEQPACKET && - uds_fd_table[peer].size > 0)) { - if (pretend) - return EDONTREPLY; - - if (uds_fd_table[peer].suspended == UDS_SUSPENDED_READ) - panic("reader blocked on full socket"); - - dprintf(("UDS: suspending write request on %d\n", minor)); - - /* Process is reading from an empty pipe. Suspend it. */ - return EDONTREPLY; - } - - /* How much can we add to the ring buffer? */ - if (size > UDS_BUF - uds_fd_table[peer].size) - size = UDS_BUF - uds_fd_table[peer].size; - - if (pretend) - return size; - - /* Put the data at the head of the ring buffer. */ - pos = (uds_fd_table[peer].pos + uds_fd_table[peer].size) % UDS_BUF; - - subsize = UDS_BUF - pos; - if (subsize > size) - subsize = size; - - if ((r = sys_safecopyfrom(endpt, grant, 0, - (vir_bytes) &uds_fd_table[peer].buf[pos], subsize)) != OK) - return r; - - if (subsize < size) { - if ((r = sys_safecopyfrom(endpt, grant, subsize, - (vir_bytes) uds_fd_table[peer].buf, size - subsize)) != OK) + if ((r = uds_attach(uds, link)) != OK) return r; - } - /* Advance the buffer head. */ - uds_fd_table[peer].size += size; + assert(uds_is_connected(link)); - /* Fill in the source address to be returned by recvfrom, recvmsg. */ - if (uds_fd_table[minor].type == SOCK_DGRAM) - memcpy(&uds_fd_table[peer].source, &uds_fd_table[minor].addr, - sizeof(struct sockaddr_un)); - - /* See if we can wake up a blocked reader. */ - if (uds_fd_table[peer].suspended == UDS_SUSPENDED_READ) - uds_unsuspend(peer); - - /* See if we can satisfy an ongoing select. */ - if ((uds_fd_table[peer].sel_ops & CDEV_OP_RD) && - uds_fd_table[peer].size > 0) { - /* A read on the peer is possible now. 
*/ - chardriver_reply_select(uds_fd_table[peer].sel_endpt, peer, - CDEV_OP_RD); - uds_fd_table[peer].sel_ops &= ~CDEV_OP_RD; - } - - return size; /* number of bytes written */ -} - -static ssize_t -uds_read(devminor_t minor, u64_t position, endpoint_t endpt, - cp_grant_id_t grant, size_t size, int flags, cdev_id_t id) -{ - ssize_t rc; - - dprintf(("UDS: uds_read(%d)\n", minor)); - - if (minor < 0 || minor >= NR_FDS) return ENXIO; - - if (uds_fd_table[minor].state != UDS_INUSE) - return EINVAL; - - rc = uds_perform_read(minor, endpt, grant, size, 0); - - /* If the call couldn't complete, suspend the caller. */ - if (rc == EDONTREPLY) { - uds_fd_table[minor].suspended = UDS_SUSPENDED_READ; - uds_fd_table[minor].susp_endpt = endpt; - uds_fd_table[minor].susp_grant = grant; - uds_fd_table[minor].susp_size = size; - uds_fd_table[minor].susp_id = id; - - /* If the call wasn't supposed to block, cancel immediately. */ - if (flags & CDEV_NONBLOCK) { - uds_cancel(minor, endpt, id); - - rc = EAGAIN; - } - } - - return rc; -} - -static ssize_t -uds_write(devminor_t minor, u64_t position, endpoint_t endpt, - cp_grant_id_t grant, size_t size, int flags, cdev_id_t id) -{ - ssize_t rc; - - dprintf(("UDS: uds_write(%d)\n", minor)); - - if (minor < 0 || minor >= NR_FDS) return ENXIO; - - if (uds_fd_table[minor].state != UDS_INUSE) - return EINVAL; - - rc = uds_perform_write(minor, endpt, grant, size, 0); - - /* If the call couldn't complete, suspend the caller. */ - if (rc == EDONTREPLY) { - uds_fd_table[minor].suspended = UDS_SUSPENDED_WRITE; - uds_fd_table[minor].susp_endpt = endpt; - uds_fd_table[minor].susp_grant = grant; - uds_fd_table[minor].susp_size = size; - uds_fd_table[minor].susp_id = id; - - /* If the call wasn't supposed to block, cancel immediately. 
*/ - if (flags & CDEV_NONBLOCK) { - uds_cancel(minor, endpt, id); - - rc = EAGAIN; - } - } - - return rc; -} - -static int -uds_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt, - cp_grant_id_t grant, int flags, endpoint_t user_endpt, cdev_id_t id) -{ - int rc, s; - - dprintf(("UDS: uds_ioctl(%d, %lu)\n", minor, request)); - - if (minor < 0 || minor >= NR_FDS) return ENXIO; - - if (uds_fd_table[minor].state != UDS_INUSE) - return EINVAL; - - /* Update the owner endpoint. */ - uds_fd_table[minor].owner = user_endpt; - - /* Let the UDS ioctl subsystem handle the actual request. */ - rc = uds_do_ioctl(minor, request, endpt, grant); - - /* If the call couldn't complete, suspend the caller. */ - if (rc == EDONTREPLY) { - /* The suspension type is already set by the IOCTL handler. */ - if ((s = uds_fd_table[minor].suspended) == UDS_NOT_SUSPENDED) - panic("IOCTL did not actually suspend?"); - uds_fd_table[minor].susp_endpt = endpt; - uds_fd_table[minor].susp_grant = grant; - uds_fd_table[minor].susp_size = 0; /* irrelevant */ - uds_fd_table[minor].susp_id = id; - - /* If the call wasn't supposed to block, cancel immediately. */ - if (flags & CDEV_NONBLOCK) { - uds_cancel(minor, endpt, id); - if (s == UDS_SUSPENDED_CONNECT) - rc = EINPROGRESS; - else - rc = EAGAIN; - } - } - - return rc; -} - -void -uds_unsuspend(devminor_t minor) -{ - int r; - uds_fd_t *fdp; - - fdp = &uds_fd_table[minor]; - - switch (fdp->suspended) { - case UDS_SUSPENDED_READ: - r = uds_perform_read(minor, fdp->susp_endpt, fdp->susp_grant, - fdp->susp_size, 0); - - if (r == EDONTREPLY) - return; - - break; - - case UDS_SUSPENDED_WRITE: - r = uds_perform_write(minor, fdp->susp_endpt, fdp->susp_grant, - fdp->susp_size, 0); - - if (r == EDONTREPLY) - return; - - break; - - case UDS_SUSPENDED_CONNECT: - case UDS_SUSPENDED_ACCEPT: /* - * In both cases, the caller already set up the connection. - * The only thing to do here is unblock. 
+ * Wake up blocked (connect, send, select) calls on the peer + * socket. */ - r = fdp->err; - fdp->err = 0; - - break; - - default: - panic("unknown suspension type %d", fdp->suspended); + sockevent_raise(&link->uds_sock, SEV_CONNECT); } - chardriver_reply_task(fdp->susp_endpt, fdp->susp_id, r); + uds_del_queue(uds, link); - fdp->suspended = UDS_NOT_SUSPENDED; -} + /* Return the peer socket's address to the caller. */ + uds_make_addr(link->uds_path, link->uds_pathlen, addr, addr_len); -static int -uds_cancel(devminor_t minor, endpoint_t endpt, cdev_id_t id) -{ - uds_fd_t *fdp; - int i; + conn = link->uds_conn; - dprintf(("UDS: uds_cancel(%d)\n", minor)); - - if (minor < 0 || minor >= NR_FDS) return EDONTREPLY; - - fdp = &uds_fd_table[minor]; - - if (fdp->state != UDS_INUSE) { - printf("UDS: cancel request for closed minor %d\n", minor); - return EDONTREPLY; - } - - /* Make sure the cancel request is for a request we're hanging on. */ - if (fdp->suspended == UDS_NOT_SUSPENDED || fdp->susp_endpt != endpt || - fdp->susp_id != id) - return EDONTREPLY; /* this happens. */ + dprintf(("UDS: accept returns %d\n", uds_get_id(conn))); /* - * The system call was cancelled, so the socket is not suspended - * anymore. + * We already cloned the sock object, so return its ID but not a + * pointer to it. That tells libsockevent not to reinitialize it. */ - switch (fdp->suspended) { - case UDS_SUSPENDED_ACCEPT: - /* A partial accept() only sets the server's child. */ - for (i = 0; i < NR_FDS; i++) - if (uds_fd_table[i].child == minor) - uds_fd_table[i].child = -1; - - break; - - case UDS_SUSPENDED_CONNECT: - /* Connect requests should continue asynchronously. */ - break; - - case UDS_SUSPENDED_READ: - case UDS_SUSPENDED_WRITE: - /* Nothing more to do. 
*/ - break; - - default: - panic("unknown suspension type %d", fdp->suspended); - } - - fdp->suspended = UDS_NOT_SUSPENDED; - - return EINTR; /* reply to the original request */ + *newsockp = NULL; + return uds_get_id(conn); } /* - * Initialize the server. + * Set socket options. */ static int -uds_init(int UNUSED(type), sef_init_info_t *UNUSED(info)) +uds_setsockopt(struct sock * sock, int level, int name, + const struct sockdriver_data * data, socklen_t len) { - /* Setting everything to NULL implicitly sets the state to UDS_FREE. */ - memset(uds_fd_table, '\0', sizeof(uds_fd_t) * NR_FDS); + struct udssock *uds = (struct udssock *)sock; + int r, val; - uds_exit_left = 0; + dprintf(("UDS: setsockopt(%d,%d,%d)\n", uds_get_id(uds), level, name)); - /* Announce we are up! */ - chardriver_announce(); + switch (level) { + case SOL_SOCKET: + switch (name) { + case SO_SNDBUF: + case SO_RCVBUF: + /* + * The send buffer size may not be changed because the + * buffer is the same as the other side's receive + * buffer, and what the other side is may vary from + * send call to send call. Changing the receive buffer + * size would disallow us from even accurately guessing + * the send buffer size in getsockopt calls. Therefore + * both are hardcoded and cannot actually be changed. + * In order to support applications that want at least + * a certain minimum, we do accept requests to shrink + * either buffer, but we ignore the given size. 
+ */ + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; - return(OK); + if (val <= 0 || (size_t)val > uds_io_buflen()) + return EINVAL; + + return OK; /* ignore new value */ + } + + break; + + case UDSPROTO_UDS: + switch (name) { + case LOCAL_CREDS: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val) + uds->uds_flags |= UDSF_PASSCRED; + else + uds->uds_flags &= ~UDSF_PASSCRED; + + /* + * In incredibly rare cases, disabling this flag may + * allow blocked sends to be resumed, because suddenly + * no room for the credentials is needed in the receive + * buffer anymore. + */ + if (!val) + sockevent_raise(&uds->uds_sock, SEV_SEND); + + return OK; + + case LOCAL_CONNWAIT: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val) + uds->uds_flags |= UDSF_CONNWAIT; + else + uds->uds_flags &= ~UDSF_CONNWAIT; + + /* + * Changing the setting does not affect sockets that + * are currently pending to be accepted. Therefore, + * uds_accept() may have to deal with either case on a + * socket-by-socket basis. + */ + return OK; + + case LOCAL_PEEREID: + /* This option may be retrieved but not set. */ + return ENOPROTOOPT; + } + + break; + } + + return ENOPROTOOPT; } +/* + * Retrieve socket options. + */ +static int +uds_getsockopt(struct sock * sock, int level, int name, + const struct sockdriver_data * data, socklen_t * len) +{ + struct udssock *uds = (struct udssock *)sock; + int val; + + dprintf(("UDS: getsockopt(%d,%d,%d)\n", uds_get_id(uds), level, name)); + + switch (level) { + case SOL_SOCKET: + switch (name) { + case SO_SNDBUF: + case SO_RCVBUF: + /* See uds_setsockopt() for why this is static. 
*/ + val = (int)uds_io_buflen(); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + } + + break; + + case UDSPROTO_UDS: + switch (name) { + case LOCAL_CREDS: + val = !!(uds->uds_flags & UDSF_PASSCRED); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case LOCAL_CONNWAIT: + val = !!(uds->uds_flags & UDSF_CONNWAIT); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case LOCAL_PEEREID: + /* getpeereid(3) documents these error codes. */ + if (uds_get_type(uds) == SOCK_DGRAM) + return EINVAL; + if (!uds_is_connected(uds)) + return ENOTCONN; + + /* + * This is a custom MINIX3 error, indicating that there + * are no credentials to return. This could be due to + * a failure to obtain them (which *should* not happen) + * but also if the socket was bound while connected, + * disconnected, and then reused as listening socket. + */ + if (uds->uds_conn->uds_cred.unp_pid == -1) + return EINVAL; + + return sockdriver_copyout_opt(data, + &uds->uds_conn->uds_cred, + sizeof(uds->uds_conn->uds_cred), len); + } + + break; + } + + return ENOPROTOOPT; +} + +/* + * Retrieve a socket's local address. + */ +static int +uds_getsockname(struct sock * sock, struct sockaddr * addr, + socklen_t * addr_len) +{ + struct udssock *uds = (struct udssock *)sock; + + dprintf(("UDS: getsockname(%d)\n", uds_get_id(uds))); + + uds_make_addr(uds->uds_path, uds->uds_pathlen, addr, addr_len); + + return OK; +} + +/* + * Retrieve a socket's remote address. + */ +static int +uds_getpeername(struct sock * sock, struct sockaddr * addr, + socklen_t * addr_len) +{ + struct udssock *uds = (struct udssock *)sock; + struct udssock *peer; + + dprintf(("UDS: getpeername(%d)\n", uds_get_id(uds))); + + /* + * For disconnected sockets, we no longer have a peer socket and thus + * also no peer address. Too bad, but NetBSD does the same. 
+ * + * For connecting sockets we could in fact return a peer address, but + * POSIX says (and other platforms agree) that we should deny the call. + */ + peer = uds_get_peer(uds); + + if (peer == NULL || uds_is_connecting(uds)) + return ENOTCONN; + + uds_make_addr(peer->uds_path, peer->uds_pathlen, addr, addr_len); + + return OK; +} + +/* + * Shut down socket send and receive operations. Note that 'flags' is a + * bitwise mask with libsockevent's SFL_SHUT_{RD,WR} flags rather than the set + * of SHUT_{RD,WR,RDWR} values from userland. + */ +static int +uds_shutdown(struct sock * sock, unsigned int flags) +{ + struct udssock *uds = (struct udssock *)sock; + struct udssock *conn; + unsigned int mask; + + dprintf(("UDS: shutdown(%d,0x%x)\n", uds_get_id(uds), flags)); + + /* + * If we are shutting down the socket for reading, we can already close + * any in-flight file descriptors associated with this socket. + */ + if (flags & SFL_SHUT_RD) + uds_io_reset(uds); + + /* + * A shutdown on this side of a connection may have an effect on + * ongoing operations on the other side. Fire appropriate events. + */ + if (uds_is_connected(uds)) { + assert(uds_get_type(uds) != SOCK_DGRAM); + + conn = uds->uds_conn; + + mask = 0; + if (flags & SFL_SHUT_RD) + mask |= SEV_SEND; + if (flags & SFL_SHUT_WR) + mask |= SEV_RECV; + + sockevent_raise(&conn->uds_sock, mask); + } + + return OK; +} + +/* + * Close a socket. + * + * The 'force' flag is unused because we need never wait for data to be sent, + * since we keep all in-flight data on the receiver side. + */ +static int +uds_close(struct sock * sock, int force __unused) +{ + struct udssock *uds = (struct udssock *)sock; + + dprintf(("UDS: close(%d)\n", uds_get_id(uds))); + + if (uds_get_type(uds) == SOCK_DGRAM) { + /* If this socket is linked to a target, disconnect it. */ + if (uds_has_link(uds)) + uds_del_queue(uds->uds_link, uds); + + /* Reset all sockets linked to this socket as a target. 
*/ + uds_clear_queue(uds, NULL); + } else if (uds_is_listening(uds)) { + /* + * Abort all connecting sockets queued on this socket, and + * break all connections for connected sockets queued on this + * socket, freeing their peers. + */ + uds_clear_queue(uds, NULL); + } else if (uds_has_link(uds)) { + /* + * This socket is connecting or connected while the other side + * has not been accepted yet. Remove the socket from the + * listening socket's queue, and if it was connected, get rid + * of its peer socket altogether. + */ + assert(uds_is_listening(uds->uds_link)); + + uds_del_queue(uds->uds_link, uds); + + if (uds_is_connected(uds)) + uds_disconnect(uds, TRUE /*was_linked*/); + } else if (uds_is_connected(uds)) { + /* + * Decouple the peer socket from this socket, and possibly wake + * up any pending operations on it. The socket remains marked + * as connected, but will now be disconnected. + */ + uds_disconnect(uds, FALSE /*was_linked*/); + } + + if (uds_is_hashed(uds)) + udshash_del(uds); + + return OK; +} + +static const struct sockevent_ops uds_ops = { + .sop_pair = uds_pair, + .sop_bind = uds_bind, + .sop_connect = uds_connect, + .sop_listen = uds_listen, + .sop_accept = uds_accept, + .sop_test_accept = uds_test_accept, + .sop_pre_send = uds_pre_send, + .sop_send = uds_send, + .sop_test_send = uds_test_send, + .sop_pre_recv = uds_pre_recv, + .sop_recv = uds_recv, + .sop_test_recv = uds_test_recv, + .sop_setsockopt = uds_setsockopt, + .sop_getsockopt = uds_getsockopt, + .sop_getsockname = uds_getsockname, + .sop_getpeername = uds_getpeername, + .sop_shutdown = uds_shutdown, + .sop_close = uds_close, + .sop_free = uds_free +}; + +/* + * Initialize the service. + */ +static int +uds_init(int type __unused, sef_init_info_t * info __unused) +{ + unsigned int i; + + /* Initialize the list of free sockets. 
*/ + TAILQ_INIT(&uds_freelist); + + for (i = 0; i < __arraycount(uds_array); i++) { + uds_array[i].uds_flags = 0; + + TAILQ_INSERT_TAIL(&uds_freelist, &uds_array[i], uds_next); + } + + /* Initialize the file-to-socket hash table. */ + udshash_init(); + + /* Initialize the input/output module. */ + uds_io_init(); + + /* Initialize the status module. */ + uds_stat_init(); + + /* Initialize the sockevent library. */ + sockevent_init(uds_socket); + + uds_in_use = 0; + uds_running = TRUE; + + return OK; +} + +/* + * Clean up before shutdown. + */ +static void +uds_cleanup(void) +{ + + /* Tell the status module to clean up. */ + uds_stat_cleanup(); +} + +/* + * The service has received a signal. + */ static void uds_signal(int signo) { - int i; - /* Only check for termination signal, ignore anything else. */ - if (signo != SIGTERM) return; + /* Only check for the termination signal. Ignore anything else. */ + if (signo != SIGTERM) + return; - /* Only exit once all sockets have been closed. */ - uds_exit_left = 0; - for (i = 0; i < NR_FDS; i++) - if (uds_fd_table[i].state == UDS_INUSE) - uds_exit_left++; + /* Exit only once all sockets have been closed. */ + uds_running = FALSE; - if (uds_exit_left == 0) - chardriver_terminate(); + if (uds_in_use == 0) + sef_cancel(); } +/* + * Perform initialization using the System Event Framework (SEF). + */ static void uds_startup(void) { - /* Register init callbacks. */ + + /* Register initialization callbacks. */ sef_setcb_init_fresh(uds_init); - /* Register signal callbacks. */ + /* Register signal callback. */ sef_setcb_signal_handler(uds_signal); /* Let SEF perform startup. */ @@ -742,14 +1378,40 @@ uds_startup(void) } /* - * The UNIX domain sockets driver. + * The UNIX Domain Sockets driver. */ int main(void) { + message m; + int r, ipc_status; + + /* Initialize the service. */ uds_startup(); - chardriver_task(&uds_tab); + /* Loop receiving and processing messages until instructed to stop. 
*/ + while (uds_running || uds_in_use > 0) { + if ((r = sef_receive_status(ANY, &m, &ipc_status)) != OK) { + if (r == EINTR) + continue; /* sef_cancel() was called */ - return(OK); + panic("UDS: sef_receive_status failed: %d", r); + } + + /* + * Messages from the MIB service are (ultimately) for the + * status module. Everything else is assumed to be a socket + * request and passed to libsockevent, which will ignore + * anything it does not recognize. + */ + if (m.m_source == MIB_PROC_NR) + rmib_process(&m, ipc_status); + else + sockevent_process(&m, ipc_status); + } + + /* Clean up before graceful shutdown. */ + uds_cleanup(); + + return EXIT_SUCCESS; } diff --git a/minix/net/uds/uds.conf b/minix/net/uds/uds.conf new file mode 100644 index 000000000..481f8919d --- /dev/null +++ b/minix/net/uds/uds.conf @@ -0,0 +1,9 @@ +service uds +{ + domain LOCAL; + system KILL; # for SIGPIPE + uid 0; # for socketpath(2) and copyfd(2) + ipc + SYSTEM vfs rs vm mib + ; +}; diff --git a/minix/net/uds/uds.h b/minix/net/uds/uds.h index 741b4bd47..4ccbaf425 100644 --- a/minix/net/uds/uds.h +++ b/minix/net/uds/uds.h @@ -1,23 +1,48 @@ -#ifndef __UDS_UDS_H -#define __UDS_UDS_H +#ifndef MINIX_NET_UDS_UDS_H +#define MINIX_NET_UDS_UDS_H #include -#include -#undef send -#include -#include -#include +#include +#include #include -#include -/* Maximum number of UNIX domain sockets. */ -#define NR_FDS 256 +/* + * Maximum number of UNIX domain sockets. The control structures for all of + * these are allocated statically, although each socket's receive buffer is + * allocated only when the socket is in use. If this constant is increased + * beyond 65535, a few field sizes need to be changed. + */ +#define NR_UDSSOCK 256 -/* Connection backlog size for incoming connections. */ -#define UDS_SOMAXCONN 64 +/* Number of slots in the <dev,ino>-to-udssock hash table. */ +#define UDSHASH_SLOTS 64 -/* Maximum UDS socket buffer size. 
*/ -#define UDS_BUF PIPE_BUF +/* UDS has no protocols, so we accept only an "any protocol" value. */ +#define UDSPROTO_UDS 0 + +/* + * The size of each socket's receive buffer. This size is currently a global + * setting which cannot be changed per socket at run time, and it would be + * rather tricky to change that. In order not to waste resources, this size + * should be a multiple of the page size. Due to the fact that data and + * metadata (such as lengths, source addresses and sender credentials) are + * intermixed in the same buffer, the actual amount of data that can be in + * transit at once is typically less than this value. If this constant is + * increased beyond 65535, several fields and field sizes need to be changed. + */ +#define UDS_BUF 32768 + +/* Maximum size of control data that can be sent or received at once. */ +#define UDS_CTL_MAX 4096 + +/* + * We allow longer path names than the size of struct sockaddr_un's sun_path + * field. The actual limit is determined by the maximum value of the sun_len + * field, which is 255 and includes the first two fields of the structure (one + * byte each) but not the null terminator of the path. Thus, the maximum + * length of the path minus null terminator is 253; with terminator it is 254. + */ +#define UDS_PATH_MAX (UINT8_MAX - sizeof(uint8_t) - sizeof(sa_family_t) + 1) /* Output debugging information? */ #define DEBUG 0 @@ -29,191 +54,201 @@ #endif /* - * A light version of the "uucred" credentials structure. We basically do not - * support passing around groups lists, and by not using struct uucred as - * storage, we save memory for those groups lists as well. Note that the - * original Linux uucred structure has a 'cr_pid' field as well, but this is - * unsupported in NetBSD's version of the structure (and rightly so). + * We declare this structure only for the static assert right below it. We + * have no need for the structure otherwise, as we use "struct sockaddr" + * directly instead. 
*/ -struct luucred { - uid_t uid; - gid_t gid; +struct sockaddr_unx { + uint8_t sunx_len; + sa_family_t sunx_family; + char sunx_path[UDS_PATH_MAX]; }; - -/* ancillary data to be sent */ -struct ancillary { - int fds[OPEN_MAX]; - int nfiledes; - struct luucred cred; -}; - -#define UDS_R 0x1 -#define UDS_W 0x2 +STATIC_SOCKADDR_MAX_ASSERT(sockaddr_unx); /* - * Internal State Information for a socket descriptor. + * In-flight file descriptor object. Each in-use object is part of a socket's + * file descriptor queue, and the file descriptor is for a file open by this + * service. For each set of in-flight file descriptors associated with a + * particular segment, the first object's count field contains the number of + * file descriptors in that set. For all other objects in that set, the count + * field is zero. TODO: the count should be stored in the segment itself. */ struct uds_fd { - -/* Flags */ - - enum UDS_STATE { - /* This file descriptor is UDS_FREE and can be allocated. */ - UDS_FREE = 0, - - /* OR it is UDS_INUSE and can't be allocated. */ - UDS_INUSE = 1 - - /* state is set to UDS_INUSE in uds_open(). state is Set to - * UDS_FREE in uds_init() and uds_close(). state should be - * checked prior to all operations. - */ - } state; - -/* Owner Info */ - - /* Socket Owner */ - endpoint_t owner; - -/* Pipe Housekeeping */ - - char *buf; /* ring buffer */ - size_t pos; /* tail position into ring buffer */ - size_t size; /* size of used part of ring buffer */ - - /* control read/write, set by uds_open() and shutdown(2). - * Can be set to UDS_R|UDS_W, UDS_R, UDS_W, or 0 - * for read and write, read only, write only, or neither. - * default is UDS_R|UDS_W. - */ - int mode; - -/* Socket Info */ - - /* socket type - SOCK_STREAM, SOCK_DGRAM, or SOCK_SEQPACKET - * Set by uds_ioctl(NWIOSUDSTYPE). It defaults to -1 in - * uds_open(). Any action on a socket with type -1 besides - * uds_ioctl(NWIOSUDSTYPE) and uds_close() will result in - * an error. 
- */ - int type; - - /* queue of pending connections for server sockets. - * connect(2) inserts and accept(2) removes from the queue - */ - int backlog[UDS_SOMAXCONN]; - - /* requested connection backlog size. Set by listen(2) - * Bounds (0 <= backlog_size <= UDS_SOMAXCONN) - * Defaults to UDS_SOMAXCONN which is defined above. - */ - unsigned char backlog_size; - - /* index of peer in uds_fd_table for connected sockets. - * -1 is used to mean no peer. Assumptions: peer != -1 means - * connected. - */ - int peer; - - /* index of child (client sd returned by accept(2)) - * -1 is used to mean no child. - */ - int child; - - /* address -- the address the socket is bound to. - * Assumptions: addr.sun_family == AF_UNIX means its bound. - */ - struct sockaddr_un addr; - - /* target -- where DGRAMs are sent to on the next uds_write(). */ - struct sockaddr_un target; - - /* source -- address where DGRAMs are from. used to fill in the - * from address in recvfrom(2) and recvmsg(2). - */ - struct sockaddr_un source; - - /* Flag (TRUE or FALSE) - address overridden by newer socket. - * Default to FALSE. Set to TRUE by do_bind() on another socket with - * the same path but its on-disk socket file removed in the meantime. - */ - int stale; - - /* Flag (TRUE or FALSE) - listening for incoming connections. - * Default to FALSE. Set to TRUE by do_listen(). - */ - int listening; - - /* stores file pointers and credentials being sent between - * processes with sendmsg(2) and recvmsg(2). - */ - struct ancillary ancillary_data; - - /* Holds an errno. This is set when a connected socket is - * closed and we need to pass ECONNRESET on to a suspended - * peer. - */ - int err; - -/* Suspend/Revive Housekeeping */ - - /* SUSPEND State Flags */ - enum UDS_SUSPENDED { - - /* Socket isn't blocked. */ - UDS_NOT_SUSPENDED = 0, - - /* Socket is blocked on read(2) waiting for data to read. */ - UDS_SUSPENDED_READ = 1, - - /* Socket is blocked on write(2) for space to write data. 
*/ - UDS_SUSPENDED_WRITE = 2, - - /* Socket is blocked on connect(2) waiting for the server. */ - UDS_SUSPENDED_CONNECT = 4, - - /* Socket is blocked on accept(2) waiting for clients. */ - UDS_SUSPENDED_ACCEPT = 8 - } suspended; - - /* source endpoint, saved for later use by suspended procs */ - endpoint_t susp_endpt; - - /* i/o grant, saved for later use by suspended procs */ - cp_grant_id_t susp_grant; - - /* size of request, saved for later use by suspended procs */ - size_t susp_size; - - /* request ID, saved for later use by suspended procs */ - cdev_id_t susp_id; - -/* select() */ - - /* when a select is in progress, we notify this endpoint - * of new data. - */ - endpoint_t sel_endpt; - - /* Options (CDEV_OP_RD,WR,ERR) that are requested. */ - unsigned int sel_ops; + SIMPLEQ_ENTRY(uds_fd) ufd_next; /* next FD object for this socket */ + int ufd_fd; /* local file descriptor number */ + unsigned int ufd_count; /* number of FDs for this segment */ }; -typedef struct uds_fd uds_fd_t; +/* + * Connection-type sockets (SOCK_STREAM, SOCK_SEQPACKET) are always in one of + * the following five states, each with unique characteristics: + * + * - Unconnected: this socket is not in any of the other states, usually + * because it either has just been created, or because it has failed a + * connection attempt. This socket has no connected peer and does not have + * the SO_ACCEPTCONN socket option set. + * - Listening: this socket is in listening mode. It has a queue with sockets + * that are connecting or connected to it but have not yet been accepted on + * it. This socket has no connected peer. It has the SO_ACCEPTCONN socket + * option set. + * - Connecting: this socket is on a listening socket's queue. While in this + * state, the socket has the listening socket as its linked peer, and it has + * no connected peer. + * - Connected: this socket is connected to another socket, which is its + * connected peer socket. It has the UDSF_CONNECTED flag set. 
A socket may + * be connected and still be involved with a listening socket; see below. + * - Disconnected: this socket was connected to another socket, but that other + * socket has been closed. As a result, this socket has no peer. It does + * have the UDSF_CONNECTED flag set. + * + * The UDS service supports two different types of connect behavior, depending + * on what the LOCAL_CONNWAIT option is set to on either the connecting or the + * listening socket. If LOCAL_CONNWAIT is not set on either (the default), the + * connecting socket (let's call it "A") enters the connected state + * right away, even if the connection is not immediately accepted through + * accept(2). In that case, a new limbo socket "B" is allocated as its + * connection peer. Limbo socket B is also in connected state, and either + * returned from accept(2) later, or freed when socket A leaves the connected + * state. Socket A can leave the connected state either by being closed or + * when the listening socket is closed. If LOCAL_CONNWAIT is set, socket A + * stays in the connecting state until it is accepted through accept(2). + * Importantly, in both cases, it is socket A, and (in the first case) *not* + * socket B, that is on the queue of the listening socket! + * + * Connected peers (uds_conn) are always symmetric: if one socket is connected + * to another socket, that other socket is connected to it. Any socket that is + * on the queue of another socket is said to be "linked" to that other socket + * (uds_link). This is an asymmetric, one-to-many relationship: many sockets + * may be linked to one other socket, which keeps all those sockets on its + * queue. From the above story it should now be clear that for connection-type + * sockets, only listening sockets may have sockets on their queue, and while + * connecting sockets are always on a listening socket's queue, connected + * sockets may or may not be. Sockets in other states never are. 
+ * + * UNIX domain sockets are generally reusable. This means that the listening + * state is the only final state; all other socket states allow the socket to + * enter another state, although not necessarily every other state. For + * example, a disconnected socket may be reconnected to another target; if that + * connection fails, the socket will enter the unconnected state. As a result, + * a socket in any state (even the listening state) may still have incoming + * data pending from a previous connection. However, EOF is currently produced + * only for disconnected sockets. To be sure: connecting and connected sockets + * must first enter the unconnected or disconnected state, respectively, before + * possibly being reconnected. + * + * For connectionless (i.e., SOCK_DGRAM) sockets, there are no separate states. + * However, a connectionless socket may have been connected to another socket. + * We maintain these links not with uds_conn but with uds_link, because such + * connections are not symmetric, and there is an interest in keeping track of + * which datagram sockets are connected to a particular socket (namely, to + * break the connection on close without doing an exhaustive search). For that + * reason, when a datagram socket connects to another socket, it is linked to + * that other socket, and the other socket has this socket on its queue. As a + * strange corner case, a connectionless socket may be connected to itself, in + * which case it is its own linked peer and it is also on its own queue. For + * datagram sockets, uds_conn is always NULL and UDSF_CONNECTED is never set. + * + * For the purposes of sending and receiving, we generally refer to the + * communication partner of a socket as its "peer". As should now be clear, + * for connection-type sockets, the socket's peer is identified with uds_conn; + * for connectionless sockets, the socket's peer is identified with uds_link. 
+ */ +struct udssock { + struct sock uds_sock; /* sock object */ + struct udssock *uds_conn; /* connected socket, or NULL if none */ + struct udssock *uds_link; /* linked socket, or NULL if none */ + unsigned char *uds_buf; /* receive buffer (memory-mapped) */ + unsigned short uds_tail; /* tail of data in receive buffer */ + unsigned short uds_len; /* length of data in receive buffer */ + unsigned short uds_last; /* offset to last header in buffer */ + unsigned short uds_queued; /* current nr of sockets on queue */ + unsigned short uds_backlog; /* maximum nr of connecting sockets */ + unsigned char uds_flags; /* UDS-specific flags (UDSF_) */ + unsigned char uds_pathlen; /* socket file path length (w/o nul) */ + char uds_path[UDS_PATH_MAX - 1];/* socket file path (not terminated) */ + dev_t uds_dev; /* socket file device number */ + ino_t uds_ino; /* socket file inode number */ + struct unpcbid uds_cred; /* bind/connect-time credentials */ + SLIST_ENTRY(udssock) uds_hash; /* next in hash chain */ + TAILQ_ENTRY(udssock) uds_next; /* next in free list or queue */ + SIMPLEQ_HEAD(, uds_fd) uds_fds; /* in-flight file descriptors */ + TAILQ_HEAD(, udssock) uds_queue;/* queue of linked sockets */ +}; -/* File Descriptor Table -- Defined in uds.c */ -EXTERN uds_fd_t uds_fd_table[NR_FDS]; +#define UDSF_IN_USE 0x01 /* in use (for enumeration only) */ +#define UDSF_CONNECTED 0x02 /* connected or disconnected */ +#define UDSF_CONNWAIT 0x04 /* leave connecting until accept */ +#define UDSF_PASSCRED 0x08 /* pass credentials when receiving */ + +/* Macros. */ +#define uds_get_type(uds) sockevent_get_type(&(uds)->uds_sock) + +/* + * A socket that can be found through hash table lookups always has a non-empty + * path as well as a valid <dev,ino> pair identifying the socket file that is, + * or once was, identified by that path. However, a socket that is bound, even + * though it will still have an associated path, is not necessarily hashed. 
+ * The reason for the difference is <dev,ino> pair reuse. 
This case is + * elaborated on in uds_bind(). + */ +#define uds_is_bound(uds) ((uds)->uds_pathlen != 0) +#define uds_is_hashed(uds) ((uds)->uds_dev != NO_DEV) + +/* + * These macros may be used on all socket types. However, the uds_is_connected + * macro returns TRUE only for connection-oriented sockets. To see if a + * datagram socket is connected to a target, use uds_has_link instead. + */ +#define uds_has_conn(uds) ((uds)->uds_conn != NULL) +#define uds_has_link(uds) ((uds)->uds_link != NULL) +#define uds_get_peer(uds) \ + ((uds_get_type(uds) != SOCK_DGRAM) ? (uds)->uds_conn : (uds)->uds_link) +#define uds_is_listening(uds) sockevent_is_listening(&(uds)->uds_sock) +#define uds_is_connecting(uds) \ + (uds_has_link(uds) && !((uds)->uds_flags & UDSF_CONNECTED) && \ + uds_get_type(uds) != SOCK_DGRAM) +#define uds_is_connected(uds) \ + (((uds)->uds_flags & UDSF_CONNECTED) && uds_has_conn(uds)) +#define uds_is_disconnected(uds) \ + (((uds)->uds_flags & UDSF_CONNECTED) && !uds_has_conn(uds)) + +#define uds_is_shutdown(uds, mask) \ + sockevent_is_shutdown(&(uds)->uds_sock, (mask)) /* Function prototypes. 
*/ -/* ioc_uds.c */ -int uds_clear_fds(devminor_t minor, struct ancillary *data); -int uds_do_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt, - cp_grant_id_t grant); - /* uds.c */ -ssize_t uds_perform_read(devminor_t minor, endpoint_t endpt, - cp_grant_id_t grant, size_t size, int pretend); -void uds_unsuspend(devminor_t minor); +sockid_t uds_get_id(struct udssock * uds); +struct udssock *uds_enum(struct udssock * prev, int type); +void uds_make_addr(const char * path, size_t len, struct sockaddr * addr, + socklen_t * addr_len); +int uds_lookup(struct udssock * uds, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp); -#endif /* !__UDS_UDS_H */ +/* io.c */ +void uds_io_init(void); +int uds_io_setup(struct udssock * uds); +void uds_io_cleanup(struct udssock * uds); +void uds_io_reset(struct udssock * uds); +size_t uds_io_buflen(void); +int uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len, + const struct sockaddr * addr, socklen_t addr_len, + endpoint_t user_endpt, int flags); +int uds_send(struct sock * sock, const struct sockdriver_data * data, + size_t len, size_t * off, const struct sockdriver_data * ctl, + socklen_t ctl_len, socklen_t * ctl_off, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt, int flags, size_t min); +int uds_test_send(struct sock * sock, size_t min); +int uds_pre_recv(struct sock * sock, endpoint_t user_endpt, int flags); +int uds_recv(struct sock * sock, const struct sockdriver_data * data, + size_t len, size_t * off, const struct sockdriver_data * ctl, + socklen_t ctl_len, socklen_t * ctl_off, struct sockaddr * addr, + socklen_t * addr_len, endpoint_t user_endpt, int flags, size_t min, + int * rflags); +int uds_test_recv(struct sock * sock, size_t min, size_t * size); + +/* stat.c */ +void uds_stat_init(void); +void uds_stat_cleanup(void); + +#endif /* !MINIX_NET_UDS_UDS_H */ diff --git a/minix/net/uds/unix.8 b/minix/net/uds/unix.8 index 
131753e49..ffde8edc9 100644 --- a/minix/net/uds/unix.8 +++ b/minix/net/uds/unix.8 @@ -10,6 +10,7 @@ unix \- Unix Domain Sockets (PF_UNIX) / Local Sockets (PF_LOCAL) .in +5 .ti -5 int socket(int \fIdomain\fP, int \fItype\fP, int \fIprotocol\fP); +.br .ti -5 int socketpair(int \fIdomain\fP, int \fItype\fP, int \fIprotocol\fP, int \fIsv[2]\fP); .br @@ -18,9 +19,8 @@ int socketpair(int \fIdomain\fP, int \fItype\fP, int \fIprotocol\fP, int \fIsv[2 Local sockets, more commonly known as Unix Domain Sockets, provide a means of interprocess communication using the socket API. .SH SEE ALSO -.BR socket(2), -.BR socketpair(2), -.BR getpeereid(2), -.BR uds(8) +.BR socket(2) , +.BR socketpair(2) , +.BR getpeereid(3) .SH HISTORY -This Unix Domain Sockets first appeared in Minix 3.1.8. +This Unix Domain Sockets first appeared in MINIX 3.1.8. diff --git a/minix/servers/vfs/filedes.c b/minix/servers/vfs/filedes.c index 7ddb51b42..2051221e6 100644 --- a/minix/servers/vfs/filedes.c +++ b/minix/servers/vfs/filedes.c @@ -525,7 +525,8 @@ int do_copyfd(void) { /* Copy a file descriptor between processes, or close a remote file descriptor. * This call is used as back-call by device drivers (UDS, VND), and is expected - * to be used in response to an IOCTL to such device drivers. + * to be used in response to either an IOCTL to VND or a SEND or RECV socket + * request to UDS. */ struct fproc *rfp; struct filp *rfilp; @@ -548,9 +549,9 @@ int do_copyfd(void) rfp = &fproc[slot]; /* FIXME: we should now check that the user process is indeed blocked on an - * IOCTL call, so that we can safely mess with its file descriptors. We - * currently do not have the necessary state to verify this, so we assume - * that the call is always used in the right way. + * IOCTL or socket call, so that we can safely mess with its file + * descriptors. We currently do not have the necessary state to verify this, + * so we assume that the call is always used in the right way. 
*/ /* Depending on the operation, get the file descriptor from the caller or the @@ -566,7 +567,7 @@ int do_copyfd(void) * passes in the file descriptor to the device node on which it is performing * the IOCTL. We do not allow manipulation of such device nodes. In * practice, this only applies to block-special files (and thus VND), because - * character-special files (as used by UDS) are unlocked during the IOCTL. + * socket files (as used by UDS) are unlocked during the socket operation. */ if (rfilp->filp_ioctl_fp == rfp) return(EBADF); diff --git a/minix/servers/vfs/open.c b/minix/servers/vfs/open.c index fc5b0c227..014122fd2 100644 --- a/minix/servers/vfs/open.c +++ b/minix/servers/vfs/open.c @@ -535,9 +535,9 @@ int do_mknod(void) resolve.l_vnode_lock = VNODE_WRITE; /* Only the super_user may make nodes other than fifos. */ - if (!super_user && (!S_ISFIFO(mode_bits) && !S_ISSOCK(mode_bits))) { + if (!super_user && !S_ISFIFO(mode_bits)) return(EPERM); - } + bits = (mode_bits & S_IFMT) | (mode_bits & ACCESSPERMS & fp->fp_umask); /* Open directory that's going to hold the new node. */ diff --git a/minix/servers/vfs/path.c b/minix/servers/vfs/path.c index 5f0191a4b..5360d2c1f 100644 --- a/minix/servers/vfs/path.c +++ b/minix/servers/vfs/path.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include "vmnt.h" #include "vnode.h" @@ -819,7 +818,6 @@ int do_socketpath(void) struct fproc *rfp; char path[PATH_MAX]; struct lookup resolve, resolve2; - struct sockaddr_un sun; mode_t bits; /* This should be replaced by an ACL check. */ @@ -831,24 +829,16 @@ int do_socketpath(void) what = job_m_in.m_lsys_vfs_socketpath.what; if (isokendpt(ep, &slot) != OK) return(EINVAL); - if (pathlen < sizeof(sun.sun_path) || pathlen >= PATH_MAX) return(EINVAL); + rfp = &fproc[slot]; - rfp = &(fproc[slot]); + /* Copy in the path name, which must not be empty. It is typically not null + * terminated. 
+ */ + if (pathlen < 1 || pathlen >= sizeof(path)) return(EINVAL); r = sys_safecopyfrom(who_e, io_gr, (vir_bytes)0, (vir_bytes)path, pathlen); if (r != OK) return(r); path[pathlen] = '\0'; - /* If requested, turn path into canonical path to the socket file */ - if (what & SPATH_CANONIZE) { - if ((r = canonical_path(path, rfp)) != OK) return(r); - if (strlen(path) >= pathlen) return(ENAMETOOLONG); - - /* copy path back to the caller */ - r = sys_safecopyto(who_e, (cp_grant_id_t)io_gr, (vir_bytes)0, - (vir_bytes)path, pathlen); - if (r != OK) return(r); - } - /* Now perform the requested action. For the SPATH_CHECK action, a socket * file is expected to exist already, and we should check whether the given * user process has access to it. For the SPATH_CREATE action, no file is @@ -859,7 +849,7 @@ int do_socketpath(void) * Since the above canonicalization releases all locks once done, we need to * recheck absolutely everything now. TODO: do not release locks in between. */ - switch (what & ~SPATH_CANONIZE) { + switch (what) { case SPATH_CHECK: lookup_init(&resolve, path, PATH_NOFLAGS, &vmp, &vp); resolve.l_vmnt_lock = VMNT_READ; diff --git a/minix/tests/common-socket.c b/minix/tests/common-socket.c index 1517513b4..e84ac351c 100644 --- a/minix/tests/common-socket.c +++ b/minix/tests/common-socket.c @@ -50,16 +50,19 @@ static char *get_timestamp(void) void test_fail_fl(char *msg, char *file, int line) { char *timestamp; + int e; + e = errno; timestamp = get_timestamp(); if (errct == 0) fprintf(stderr, "\n"); + errno = e; fprintf(stderr, "[ERROR][%s] (%s Line %d) %s [pid=%d:errno=%d:%s]\n", - timestamp, file, line, msg, getpid(), - errno, strerror(errno)); + timestamp, file, line, msg, getpid(), errno, strerror(errno)); fflush(stderr); if (timestamp != NULL) { free(timestamp); timestamp = NULL; } + errno = e; e(7); } @@ -317,7 +320,7 @@ void test_shutdown(const struct socket_test_info *info) SOCKET(sd, info->domain, info->type, 0); errno = 0; rc = shutdown(sd, how[i]); 
- if (!(rc == -1 && errno == ENOTCONN) && + if (rc != 0 && !(rc == -1 && errno == ENOTCONN) && !info->bug_shutdown_not_conn && !info->bug_shutdown) { test_fail("shutdown() should have failed"); @@ -328,10 +331,10 @@ void test_shutdown(const struct socket_test_info *info) SOCKET(sd, info->domain, info->type, 0); errno = 0; rc = shutdown(sd, -1); - if (!(rc == -1 && errno == ENOTCONN) && + if (!(rc == -1 && errno == EINVAL) && !info->bug_shutdown_not_conn && !info->bug_shutdown) { - test_fail("shutdown(sd, -1) should have failed with ENOTCONN"); + test_fail("shutdown(sd, -1) should have failed with EINVAL"); } CLOSE(sd); @@ -431,8 +434,6 @@ void test_sockopts(const struct socket_test_info *info) CLOSE(sd); } - - SOCKET(sd, info->domain, info->type, 0); debug("Test setsockopt() works"); @@ -901,9 +902,6 @@ static void test_xfer_client(const struct socket_test_info *info) test_fail("[client] getpeername() should have worked"); } - /* we need to use the full path "/usr/src/test/DIR_56/test.sock" - * because that is what is returned by getpeername(). 
- */ info->callback_check_sockaddr((struct sockaddr *) &peer_addr, peer_addr_len, "getpeername", 1); @@ -1299,8 +1297,8 @@ static void test_abort_client(const struct socket_test_info *info, if (!info->ignore_write_conn_reset) { test_fail("write should have failed\n"); } - } else if (errno != ECONNRESET) { - test_fail("errno should've been ECONNRESET\n"); + } else if (errno != EPIPE && errno != ECONNRESET) { + test_fail("errno should've been EPIPE/ECONNRESET\n"); } } @@ -1353,7 +1351,7 @@ static void test_abort_server(const struct socket_test_info *info, if (abort_type == 1) { memset(buf, '\0', BUFSIZE); rc = read(client_sd, buf, BUFSIZE); - if (rc != -1 && (rc != 0 || !info->ignore_read_conn_reset)) { + if (rc != 0 && rc != -1) { test_fail("read should've failed or returned zero\n"); } if (rc != 0 && errno != ECONNRESET) { @@ -1518,9 +1516,6 @@ void test_msg_dgram(const struct socket_test_info *info) test_fail("recvmsg"); } - /* we need to use the full path "/usr/src/test/DIR_56/testb.sock" - * because that is what is returned by recvmsg(). - */ info->callback_check_sockaddr((struct sockaddr *) &addr, msg2.msg_namelen, "recvmsg", 2); @@ -1603,6 +1598,9 @@ test_nonblock(const struct socket_test_info *info) if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1) test_fail("bind() should have worked"); + if (info->callback_set_listen_opt != NULL) + info->callback_set_listen_opt(server_sd); + if (listen(server_sd, 8) == -1) test_fail("listen() should have worked"); @@ -1813,6 +1811,9 @@ test_intr(const struct socket_test_info *info) if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1) test_fail("bind() should have worked"); + if (info->callback_set_listen_opt != NULL) + info->callback_set_listen_opt(server_sd); + if (listen(server_sd, 8) == -1) test_fail("listen() should have worked"); @@ -1844,6 +1845,9 @@ test_intr(const struct socket_test_info *info) errct = 0; close(client_sd); + /* Ensure that the parent is blocked on the send(). 
*/ + sleep(1); + check_select(server_sd, 1 /*read*/, 1 /*write*/, 0 /*block*/); len = sizeof(addr); @@ -1932,6 +1936,9 @@ test_connect_close(const struct socket_test_info *info) if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1) test_fail("bind() should have worked"); + if (info->callback_set_listen_opt != NULL) + info->callback_set_listen_opt(server_sd); + if (listen(server_sd, 8) == -1) test_fail("listen() should have worked"); @@ -1989,6 +1996,9 @@ test_listen_close(const struct socket_test_info *info) if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1) test_fail("bind() should have worked"); + if (info->callback_set_listen_opt != NULL) + info->callback_set_listen_opt(server_sd); + if (listen(server_sd, 8) == -1) test_fail("listen() should have worked"); @@ -2009,7 +2019,6 @@ test_listen_close(const struct socket_test_info *info) byte = 0; if (write(client_sd, &byte, 1) != -1 || errno != ENOTCONN) - /* Yes, you fucked up the fix for the FIXME below. */ test_fail("write() should have yielded ENOTCONN"); if (connect(client_sd, info->clientaddr, info->clientaddrlen) != -1) { @@ -2021,14 +2030,16 @@ test_listen_close(const struct socket_test_info *info) } /* - * FIXME: currently UDS cannot distinguish between sockets that have - * not yet been connected, and sockets that have been disconnected. - * Thus, we get the same error for both: ENOTCONN instead of EPIPE. + * The error we get on the next write() depends on whether the socket + * may be reused after a failed connect: for TCP/IP, it may not, so we + * get EPIPE; for UDS, it may be reused, so we get ENOTCONN. 
*/ -#if 0 - if (write(client_sd, &byte, 1) != -1 || errno != EPIPE) - test_fail("write() should have yielded EPIPE"); -#endif + if (!info->bug_connect_after_close) { + if (write(client_sd, &byte, 1) != -1 || + (errno != EPIPE && errno != ENOTCONN)) + test_fail("write() should have yielded " + "EPIPE/ENOTCONN"); + } close(client_sd); @@ -2059,6 +2070,9 @@ test_listen_close_nb(const struct socket_test_info *info) if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1) test_fail("bind() should have worked"); + if (info->callback_set_listen_opt != NULL) + info->callback_set_listen_opt(server_sd); + if (listen(server_sd, 8) == -1) test_fail("listen() should have worked"); @@ -2097,16 +2111,6 @@ test_listen_close_nb(const struct socket_test_info *info) test_fail("write() should have yielded ECONNRESET"); } - /* - * FIXME: currently UDS cannot distinguish between sockets that have - * not yet been connected, and sockets that have been disconnected. - * Thus, we get the same error for both: ENOTCONN instead of EPIPE. 
- */ -#if 0 - if (write(client_sd, &byte, 1) != -1 || errno != EPIPE) - test_fail("write() should have yielded EPIPE"); -#endif - check_select_cond(client_sd, 1 /*read*/, 1 /*write*/, 0 /*block*/, !info->ignore_select_delay); diff --git a/minix/tests/common-socket.h b/minix/tests/common-socket.h index f03015be5..03ba77e6c 100644 --- a/minix/tests/common-socket.h +++ b/minix/tests/common-socket.h @@ -88,7 +88,6 @@ struct socket_test_info { int ignore_accept_delay; /* success from accept after aborted connect */ int ignore_connect_delay; /* nb connect not instant */ int ignore_connect_unaccepted; /* connect succeeds without accept */ - int ignore_read_conn_reset; /* read does not guarantee ECONNRESET */ int ignore_select_delay; /* select delay reflecting other side nb op */ int ignore_send_waiting; /* can send while waiting for nb recv */ int ignore_write_conn_reset; /* write does not guarantee ECONNRESET */ @@ -98,6 +97,7 @@ struct socket_test_info { void (* callback_cleanup)(void); void (* callback_xfer_peercred)(int sd); /* can be NULL */ void (* callback_xfer_prepclient)(void); /* can be NULL */ + void (* callback_set_listen_opt)(int sd); /* can be NULL */ }; void test_abort_client_server(const struct socket_test_info *info, diff --git a/minix/tests/test56.c b/minix/tests/test56.c index 583468dd5..d7a211b00 100644 --- a/minix/tests/test56.c +++ b/minix/tests/test56.c @@ -78,20 +78,6 @@ int max_error = 4; /* socket types supported */ static int types[3] = {SOCK_STREAM, SOCK_SEQPACKET, SOCK_DGRAM}; -static char sock_fullpath[PATH_MAX + 1]; - -/* Convert name to the full path of the socket. Assumes name is in cwd. 
*/ -static char *fullpath(const char *name) -{ - char cwd[PATH_MAX + 1]; - - if (realpath(".", cwd) == NULL) - test_fail("Couldn't retrieve current working dir"); - - snprintf(sock_fullpath, PATH_MAX, "%s/%s", cwd, name); - - return(sock_fullpath); -} static void test_header(void) { @@ -187,16 +173,16 @@ static void test_socketpair(void) static void test_ucred(void) { - struct uucred credentials; + struct unpcbid credentials; socklen_t ucred_length; uid_t euid = geteuid(); gid_t egid = getegid(); int sv[2]; int rc; - debug("Test credentials passing"); + debug("Test peer credentials"); - ucred_length = sizeof(struct uucred); + ucred_length = sizeof(credentials); rc = socketpair(PF_UNIX, SOCK_STREAM, 0, sv); if (rc == -1) { @@ -204,22 +190,24 @@ static void test_ucred(void) } memset(&credentials, '\0', ucred_length); - rc = getsockopt(sv[0], SOL_SOCKET, SO_PEERCRED, &credentials, + rc = getsockopt(sv[0], 0, LOCAL_PEEREID, &credentials, &ucred_length); if (rc == -1) { - test_fail("getsockopt(SO_PEERCRED) failed"); - } else if (credentials.cr_ngroups != 0 || - credentials.cr_uid != geteuid() || - credentials.cr_gid != getegid()) { - /* printf("%d=%d %d=%d %d=%d",credentials.cr_ngroups, 0, - credentials.cr_uid, geteuid(), credentials.cr_gid, getegid()); */ + test_fail("getsockopt(LOCAL_PEEREID) failed"); + } else if (credentials.unp_pid != getpid() || + credentials.unp_euid != geteuid() || + credentials.unp_egid != getegid()) { + printf("%d=%d %d=%d %d=%d",credentials.unp_pid, getpid(), + credentials.unp_euid, geteuid(), + credentials.unp_egid, getegid()); test_fail("Credential passing gave us the wrong cred"); } rc = getpeereid(sv[0], &euid, &egid); if (rc == -1) { test_fail("getpeereid(sv[0], &euid, &egid) failed"); - } else if (credentials.cr_uid != euid || credentials.cr_gid != egid) { + } else if (credentials.unp_euid != euid || + credentials.unp_egid != egid) { test_fail("getpeereid() didn't give the correct euid/egid"); } @@ -245,7 +233,7 @@ static void 
callback_check_sockaddr(const struct sockaddr *sockaddr, if (!(sockaddr_un->sun_family == AF_UNIX && strncmp(sockaddr_un->sun_path, - fullpath(path), + path, sizeof(sockaddr_un->sun_path) - 1) == 0)) { snprintf(buf, sizeof(buf), "%s() didn't return the right addr", @@ -293,7 +281,6 @@ static void test_bind_unix(void) UNLINK(TEST_SYM_A); UNLINK(TEST_SYM_B); - SYMLINK(TEST_SYM_A, TEST_SYM_B); SYMLINK(TEST_SYM_B, TEST_SYM_A); SOCKET(sd, PF_UNIX, SOCK_STREAM, 0); @@ -301,6 +288,19 @@ static void test_bind_unix(void) strncpy(addr.sun_path, TEST_SYM_A, sizeof(addr.sun_path) - 1); errno = 0; rc = bind(sd, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)); + if (!((rc == -1) && (errno == EADDRINUSE))) { + test_fail("bind() should have failed with EADDRINUSE"); + } + CLOSE(sd); + + SYMLINK(TEST_SYM_A, TEST_SYM_B); + + SOCKET(sd, PF_UNIX, SOCK_STREAM, 0); + + strncpy(addr.sun_path, TEST_SYM_A, sizeof(addr.sun_path) - 1); + strlcat(addr.sun_path, "/x", sizeof(addr.sun_path)); + errno = 0; + rc = bind(sd, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)); if (!((rc == -1) && (errno == ELOOP))) { test_fail("bind() should have failed with ELOOP"); } @@ -337,28 +337,49 @@ static void callback_xfer_prepclient(void) { } static void callback_xfer_peercred(int sd) { - struct uucred credentials; + struct unpcbid credentials; int rc; socklen_t ucred_length; - ucred_length = sizeof(struct uucred); + ucred_length = sizeof(credentials); - debug("Test passing the client credentials to the server"); + debug("Test obtaining the peer credentials"); memset(&credentials, '\0', ucred_length); - rc = getsockopt(sd, SOL_SOCKET, SO_PEERCRED, &credentials, - &ucred_length); + rc = getsockopt(sd, 0, LOCAL_PEEREID, &credentials, &ucred_length); if (rc == -1) { test_fail("[client] getsockopt() failed"); - } else if (credentials.cr_uid != geteuid() || - credentials.cr_gid != getegid()) { - printf("%d=%d=%d %d=%d=%d\n", credentials.cr_uid, getuid(), - geteuid(), credentials.cr_gid, getgid(), 
getegid()); + } else if (credentials.unp_euid != geteuid() || + credentials.unp_egid != getegid()) { + printf("%d=* %d=%d %d=%d", credentials.unp_pid, + credentials.unp_euid, geteuid(), + credentials.unp_egid, getegid()); test_fail("[client] Credential passing gave us a bad UID/GID"); } } +static void +callback_set_listen_opt(int sd) +{ + int val; + + /* + * Several of the tests assume that a new connection to a server will + * not be established (i.e., go from "connecting" to "connected" state) + * until the server actually accepts the connection with an accept(2) + * call. With the new UDS implementation, this is no longer true: to + * match the behavior of other systems, UDS now preemptively connects + * the socket in anticipation of the accept(2) call. We can change + * back to the old behavior by setting LOCAL_CONNWAIT however, and + * since the test effectively tests a larger set of socket transitions + * that way, that is what we do for these tests. + */ + val = 1; + if (setsockopt(sd, 0, LOCAL_CONNWAIT, &val, sizeof(val)) != 0) + test_fail("setsockopt(LOCAL_CONNWAIT)"); +} + static void test_vectorio(int type) { int sv[2]; @@ -563,7 +584,11 @@ static void test_scm_credentials(void) int rc; int src; int dst; - struct uucred cred; + int one; + union { + struct sockcred cred; + char buf[SOCKCREDSIZE(NGROUPS_MAX)]; + } cred; struct cmsghdr *cmsg = NULL; struct sockaddr_un addr; struct iovec iov[3]; @@ -573,7 +598,7 @@ static void test_scm_credentials(void) char buf2[BUFSIZE]; char buf3[BUFSIZE]; char ctrl[BUFSIZE]; - socklen_t addrlen = sizeof(struct sockaddr_un); + socklen_t len, addrlen = sizeof(struct sockaddr_un); debug("test_scm_credentials"); @@ -615,6 +640,16 @@ static void test_scm_credentials(void) test_fail("bind"); } + debug("request credential passing"); + + one = 1; + rc = setsockopt(dst, 0, LOCAL_CREDS, &one, sizeof(one)); + if (rc == -1) { + test_fail("setsockopt(LOCAL_CREDS)"); + } + + debug("sending msg1"); + memset(&buf1, '\0', BUFSIZE); 
memset(&buf2, '\0', BUFSIZE); memset(&buf3, '\0', BUFSIZE); @@ -640,8 +675,6 @@ static void test_scm_credentials(void) msg1.msg_controllen = 0; msg1.msg_flags = 0; - debug("sending msg1"); - rc = sendmsg(src, &msg1, 0); if (rc == -1) { test_fail("sendmsg"); @@ -684,27 +717,50 @@ static void test_scm_credentials(void) * because that is what is returned by recvmsg(). */ if (addr.sun_family != AF_UNIX || strcmp(addr.sun_path, - fullpath(TEST_SUN_PATHB))) { + TEST_SUN_PATHB)) { test_fail("recvmsg"); } debug("looking for credentials"); - memset(&cred, '\0', sizeof(struct uucred)); + len = 0; + + memset(&cred, 'x', sizeof(cred)); for (cmsg = CMSG_FIRSTHDR(&msg2); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg2, cmsg)) { if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDS) { + /* Great, this alignment business! But then at least + * give me a macro to compute the actual data length.. + */ + len = cmsg->cmsg_len - (socklen_t) + ((char *)CMSG_DATA(cmsg) - (char *)cmsg); - memcpy(&cred, CMSG_DATA(cmsg), sizeof(struct uucred)); + if (len < sizeof(struct sockcred)) + test_fail("credentials too small"); + else if (len > sizeof(cred)) + test_fail("credentials too large"); + memcpy(cred.buf, CMSG_DATA(cmsg), len); break; } } - if (cred.cr_ngroups != 0 || cred.cr_uid != geteuid() || - cred.cr_gid != getegid()) { + if (len == 0) + test_fail("no credentials found"); + if (len != SOCKCREDSIZE(cred.cred.sc_ngroups)) + test_fail("wrong credentials size"); + + /* + * TODO: check supplementary groups. This whole test is pretty much + * pointless since we're running with very standard credentials anyway. 
+ */ + if (cred.cred.sc_uid != getuid() || + cred.cred.sc_euid != geteuid() || + cred.cred.sc_gid != getgid() || + cred.cred.sc_egid != getegid() || + cred.cred.sc_ngroups < 0 || cred.cred.sc_ngroups > NGROUPS_MAX) { test_fail("did no receive the proper credentials"); } @@ -1384,22 +1440,18 @@ static void test_fchmod(void) * Test various aspects related to the socket files on the file system. * This subtest is woefully incomplete and currently only attempts to test * aspects that have recently been affected by code changes. In the future, - * there should be tests for path canonicalization and the entire range of file - * system path and access related error codes (TODO). + * there should be tests for the entire range of file system path and access + * related error codes (TODO). */ static void test_file(void) { - struct sockaddr_un addr; -#if NOT_YET - struct sockaddr_un saddr, saddr2; + struct sockaddr_un addr, saddr, saddr2; char buf[1]; socklen_t len; struct stat st; mode_t omask; - int, csd, fd; -#endif - int sd, sd2; + int sd, sd2, csd, fd; /* * If the provided socket path exists on the file system, the bind(2) @@ -1426,7 +1478,6 @@ test_file(void) CLOSE(sd); -#if NOT_YET if (bind(sd2, (struct sockaddr *)&addr, sizeof(addr)) != -1) test_fail("Binding socket unexpectedly succeeded"); if (errno != EADDRINUSE) @@ -1497,29 +1548,8 @@ test_file(void) if (memcmp(&saddr, &saddr2, sizeof(saddr))) test_fail("Unexpected old socket address"); - /* - * Currently, our implementation "hides" the old socket even if the new - * socket is closed, but since this is not standard behavior and may be - * changed later, we do not test for it. However, in any case, - * rebinding the hidden socket should make it "visible" again. 
- */ - strlcpy(saddr2.sun_path, TEST_SUN_PATHB, sizeof(saddr2.sun_path)); - if (bind(sd, (struct sockaddr *)&saddr2, sizeof(saddr2)) != 0) - test_fail("Can't rebind socket"); - - memset(buf, 'Z', sizeof(buf)); - if (sendto(csd, buf, sizeof(buf), 0, (struct sockaddr *)&saddr2, - sizeof(saddr2)) != sizeof(buf)) - test_fail("Can't send to socket"); - if (recvfrom(sd, buf, sizeof(buf), 0, NULL, 0) != sizeof(buf)) - test_fail("Can't receive from socket"); - if (buf[0] != 'Z') - test_fail("Transmission failure"); - if (unlink(TEST_SUN_PATH) != 0) test_fail("Can't unlink socket"); - if (unlink(TEST_SUN_PATHB) != 0) - test_fail("Can't unlink other socket"); CLOSE(sd); CLOSE(sd2); @@ -1580,7 +1610,6 @@ test_file(void) UNLINK(TEST_SUN_PATH); umask(omask); -#endif /* * Only socket(2), socketpair(2), and accept(2) may be used to obtain @@ -1631,8 +1660,8 @@ int main(int argc, char *argv[]) .clientaddrsym = (struct sockaddr *) &clientaddrsym, .clientaddrsymlen = sizeof(clientaddrsym), .domain = PF_UNIX, - .expected_rcvbuf = PIPE_BUF, - .expected_sndbuf = PIPE_BUF, + .expected_rcvbuf = 32768 - 5, /* no constants: */ + .expected_sndbuf = 32768 - 5, /* UDS internals */ .serveraddr = (struct sockaddr *) &clientaddr, .serveraddrlen = sizeof(clientaddr), .serveraddr2 = (struct sockaddr *) &clientaddr2, @@ -1644,12 +1673,16 @@ int main(int argc, char *argv[]) .callback_cleanup = callback_cleanup, .callback_xfer_prepclient = callback_xfer_prepclient, .callback_xfer_peercred = callback_xfer_peercred, + .callback_set_listen_opt = callback_set_listen_opt, }; debug("entering main()"); start(56); + /* This test was written before UDS started supporting SIGPIPE. 
*/ + signal(SIGPIPE, SIG_IGN); + test_socket(&info); test_bind(&info); test_bind_unix(); diff --git a/minix/tests/test80.c b/minix/tests/test80.c index 0975387aa..f6bc69ad9 100644 --- a/minix/tests/test80.c +++ b/minix/tests/test80.c @@ -96,7 +96,6 @@ int main(int argc, char *argv[]) .ignore_accept_delay = 1, .ignore_connect_unaccepted = 1, .ignore_connect_delay = 1, - .ignore_read_conn_reset = 1, .ignore_select_delay = 1, .ignore_send_waiting = 1, .ignore_write_conn_reset = 1, diff --git a/minix/tests/test81.c b/minix/tests/test81.c index 1a3188cd6..f23c174d3 100644 --- a/minix/tests/test81.c +++ b/minix/tests/test81.c @@ -99,7 +99,6 @@ int main(int argc, char *argv[]) .ignore_accept_delay = 1, .ignore_connect_unaccepted = 1, .ignore_connect_delay = 1, - .ignore_read_conn_reset = 1, .ignore_select_delay = 1, .ignore_send_waiting = 1, .ignore_write_conn_reset = 1, diff --git a/minix/usr.bin/trace/ioctl/net.c b/minix/usr.bin/trace/ioctl/net.c index 8d7591c80..1842d5e6e 100644 --- a/minix/usr.bin/trace/ioctl/net.c +++ b/minix/usr.bin/trace/ioctl/net.c @@ -189,6 +189,27 @@ static const struct flags udpopt_flags[] = { FLAG(NWUO_DI_IPOPT), }; +static void +put_struct_uucred(struct trace_proc * proc, const char * name, int flags, + vir_bytes addr) +{ + struct uucred cred; + + if (!put_open_struct(proc, name, flags, addr, &cred, sizeof(cred))) + return; + + put_value(proc, "cr_uid", "%u", cred.cr_uid); + if (verbose > 0) { + put_value(proc, "cr_gid", "%u", cred.cr_gid); + if (verbose > 1) + put_value(proc, "cr_ngroups", "%d", cred.cr_ngroups); + put_groups(proc, "cr_groups", PF_LOCADDR, + (vir_bytes)&cred.cr_groups, cred.cr_ngroups); + } + + put_close_struct(proc, verbose > 0); +} + static void put_msg_control(struct trace_proc * proc, struct msg_control * ptr) { diff --git a/minix/usr.bin/trace/proto.h b/minix/usr.bin/trace/proto.h index 11591edc1..e27637b2c 100644 --- a/minix/usr.bin/trace/proto.h +++ b/minix/usr.bin/trace/proto.h @@ -115,8 +115,6 @@ void put_dev(struct 
trace_proc *proc, const char *name, dev_t dev); void put_in_addr(struct trace_proc *proc, const char *name, struct in_addr in); void put_socket_type(struct trace_proc *proc, const char *name, int type); void put_socket_family(struct trace_proc *proc, const char *name, int family); -void put_struct_uucred(struct trace_proc *proc, const char *name, int flags, - vir_bytes addr); void put_cmsg_type(struct trace_proc *proc, const char *name, int type); void put_shutdown_how(struct trace_proc *proc, const char *name, int how); diff --git a/minix/usr.bin/trace/service/vfs.c b/minix/usr.bin/trace/service/vfs.c index 93c0055fa..8163f4dd7 100644 --- a/minix/usr.bin/trace/service/vfs.c +++ b/minix/usr.bin/trace/service/vfs.c @@ -1802,25 +1802,32 @@ put_struct_iovec(struct trace_proc * proc, const char * name, int flags, put_close(proc, "]"); } -void -put_struct_uucred(struct trace_proc * proc, const char * name, int flags, - vir_bytes addr) +static void +put_struct_sockcred(struct trace_proc * proc, const char * name, int flags, + vir_bytes addr, size_t left) { - struct uucred cred; + struct sockcred sc; - if (!put_open_struct(proc, name, flags, addr, &cred, sizeof(cred))) + if (!put_open_struct(proc, name, flags, addr, &sc, sizeof(sc))) return; - put_value(proc, "cr_uid", "%u", cred.cr_uid); + put_value(proc, "sc_uid", "%u", sc.sc_uid); + if (verbose > 0) + put_value(proc, "sc_euid", "%u", sc.sc_euid); + put_value(proc, "sc_gid", "%u", sc.sc_gid); if (verbose > 0) { - put_value(proc, "cr_gid", "%u", cred.cr_gid); + put_value(proc, "sc_egid", "%u", sc.sc_egid); if (verbose > 1) - put_value(proc, "cr_ngroups", "%d", cred.cr_ngroups); - put_groups(proc, "cr_groups", PF_LOCADDR, - (vir_bytes)&cred.cr_groups, cred.cr_ngroups); + put_value(proc, "sc_ngroups", "%d", sc.sc_ngroups); + if (left >= sizeof(sc.sc_groups[0]) * (sc.sc_ngroups - 1)) { + put_groups(proc, "sc_groups", flags, + addr + offsetof(struct sockcred, sc_groups), + sc.sc_ngroups); + } else + put_field(proc, 
"sc_groups", ".."); } - put_close_struct(proc, verbose > 0); + put_close_struct(proc, verbose > 1); } static void @@ -1907,7 +1914,7 @@ put_cmsg(struct trace_proc * proc, const char * name, vir_bytes addr, size_t len) { struct cmsghdr cmsg; - char buf[CMSG_SPACE(sizeof(struct uucred))]; + char buf[CMSG_SPACE(sizeof(struct sockcred))]; size_t off, chunk, datalen; if (valuesonly > 1 || addr == 0 || len < CMSG_LEN(0)) { @@ -1960,10 +1967,11 @@ put_cmsg(struct trace_proc * proc, const char * name, vir_bytes addr, addr + off + chunk, datalen); } else if (cmsg.cmsg_level == SOL_SOCKET && cmsg.cmsg_type == SCM_CREDS && - datalen >= sizeof(struct uucred) && + datalen >= sizeof(struct sockcred) && chunk >= CMSG_LEN(datalen)) { - put_struct_uucred(proc, "cmsg_data", PF_LOCADDR, - (vir_bytes)&buf[CMSG_LEN(0)]); + put_struct_sockcred(proc, "cmsg_data", PF_LOCADDR, + (vir_bytes)&buf[CMSG_LEN(0)], + datalen - sizeof(struct sockcred)); } else if (datalen > 0) put_field(proc, "cmsg_data", ".."); @@ -2129,8 +2137,6 @@ put_sockopt_name(struct trace_proc * proc, const char * name, int level, TEXT(SO_REUSEPORT); TEXT(SO_NOSIGPIPE); TEXT(SO_TIMESTAMP); - TEXT(SO_PASSCRED); - TEXT(SO_PEERCRED); TEXT(SO_SNDBUF); TEXT(SO_RCVBUF); TEXT(SO_SNDLOWAT); @@ -2157,7 +2163,6 @@ put_sockopt_data(struct trace_proc * proc, const char * name, int flags, const char *text; int i; struct linger l; - struct uucred cr; struct timeval tv; void *ptr; size_t size; @@ -2183,7 +2188,6 @@ put_sockopt_data(struct trace_proc * proc, const char * name, int flags, case SO_REUSEPORT: case SO_NOSIGPIPE: case SO_TIMESTAMP: - case SO_PASSCRED: case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: @@ -2199,10 +2203,6 @@ put_sockopt_data(struct trace_proc * proc, const char * name, int flags, ptr = &l; size = sizeof(l); break; - case SO_PEERCRED: - ptr = &cr; - size = sizeof(cr); - break; case SO_SNDTIMEO: case SO_RCVTIMEO: ptr = &tv; @@ -2229,9 +2229,6 @@ put_sockopt_data(struct trace_proc * proc, const char * name, int 
flags, put_value(proc, "l_linger", "%d", l.l_linger); put_close(proc, "}"); break; - case SO_PEERCRED: - put_struct_uucred(proc, name, PF_LOCADDR, (vir_bytes)&cr); - break; case SO_ERROR: put_open(proc, name, 0, "{", ", "); if (!valuesonly && (text = get_error_name(i)) != NULL) diff --git a/sys/sys/socket.h b/sys/sys/socket.h index 219b90baf..d7340af64 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -133,12 +133,6 @@ typedef _BSD_SSIZE_T_ ssize_t; #define SO_ACCEPTFILTER 0x1000 /* there is an accept filter */ #define SO_TIMESTAMP 0x2000 /* timestamp received dgram traffic */ -#if defined(__minix) && defined(_MINIX_SYSTEM) -/* Minixism which should go, so hide it from userland. */ -#define SO_PASSCRED 0x100000 -#define SO_PEERCRED 0x200000 -#endif /* defined(__minix) */ - /* * Additional options, not kept in so_options. */