diff --git a/distrib/sets/lists/minix-base/mi b/distrib/sets/lists/minix-base/mi
index e2ee8526a..8463f888f 100644
--- a/distrib/sets/lists/minix-base/mi
+++ b/distrib/sets/lists/minix-base/mi
@@ -179,6 +179,7 @@
 ./etc/system.conf.d/ipc                                 minix-base
 ./etc/system.conf.d/lwip                                minix-base
 ./etc/system.conf.d/random                              minix-base
+./etc/system.conf.d/uds                                 minix-base
 ./etc/system.conf.d/usb_hub                             minix-base
 ./etc/system.conf.d/usb_storage                         minix-base
 ./etc/termcap                                           minix-base
diff --git a/distrib/sets/lists/minix-man/mi b/distrib/sets/lists/minix-man/mi
index 6ce70fc75..895353269 100644
--- a/distrib/sets/lists/minix-man/mi
+++ b/distrib/sets/lists/minix-man/mi
@@ -477,7 +477,7 @@
 ./usr/man/man2/getgid.2                                 minix-man
 ./usr/man/man2/getitimer.2                              minix-man
 ./usr/man/man2/getnucred.2                              minix-man       obsolete
-./usr/man/man2/getpeereid.2                             minix-man
+./usr/man/man2/getpeereid.2                             minix-man       obsolete
 ./usr/man/man2/getpeername.2                            minix-man
 ./usr/man/man2/getpid.2                                 minix-man
 ./usr/man/man2/getpriority.2                            minix-man
@@ -3463,7 +3463,7 @@
 ./usr/man/man8/syslogd.8                                minix-man
 ./usr/man/man8/tcpd.8                                   minix-man
 ./usr/man/man8/traceroute.8                             minix-man
-./usr/man/man8/uds.8                                    minix-man
+./usr/man/man8/uds.8                                    minix-man       obsolete
 ./usr/man/man8/unix.8                                   minix-man
 ./usr/man/man8/unlink.8                                 minix-man
 ./usr/man/man8/unstr.8                                  minix-man
diff --git a/etc/system.conf b/etc/system.conf
index d89a6d3df..38c43f24f 100644
--- a/etc/system.conf
+++ b/etc/system.conf
@@ -494,14 +494,6 @@ service vnd
 	uid	0;	# only for copyfd(2)
 };
 
-service uds
-{
-	ipc
-		SYSTEM vfs rs vm
-	;
-	uid	0;	# only for checkperms(2) and copyfd(2)
-};
-
 service pty
 {
 	system
diff --git a/etc/usr/rc b/etc/usr/rc
index 69dfb1607..1d5bb3e84 100644
--- a/etc/usr/rc
+++ b/etc/usr/rc
@@ -201,7 +201,7 @@ start)
     # pty needs to know the "tty" group ID
     up pty -dev /dev/ptmx -args "gid=`stat -f '%g' /dev/ptmx`"
 
-    up uds -dev /dev/uds
+    up uds
 
     up -n ipc
 
diff --git a/external/bsd/tmux/dist/client.c b/external/bsd/tmux/dist/client.c
index d790ea8e8..ce88e5448 100644
--- a/external/bsd/tmux/dist/client.c
+++ b/external/bsd/tmux/dist/client.c
@@ -107,11 +107,7 @@ client_connect(char *path, int start_server)
 	}
 
 retry:
-#ifndef __minix
 	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1)
-#else
-	if ((fd = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1)
-#endif /* !defined(__minix) */
 		fatal("socket failed");
 
 	if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == -1) {
diff --git a/external/bsd/tmux/dist/server.c b/external/bsd/tmux/dist/server.c
index 5682afe66..33b8576c2 100644
--- a/external/bsd/tmux/dist/server.c
+++ b/external/bsd/tmux/dist/server.c
@@ -84,11 +84,7 @@ server_create_socket(void)
 	}
 	unlink(sa.sun_path);
 
-#ifndef __minix
 	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1)
-#else
-	if ((fd = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1)
-#endif /* !defined(__minix) */
 		fatal("socket failed");
 
 	mask = umask(S_IXUSR|S_IXGRP|S_IRWXO);
@@ -114,11 +110,7 @@ server_start(int lockfd, char *lockfile)
 	char		*cause;
 
 	/* The first client is special and gets a socketpair; create it. */
-#ifndef __minix
 	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, pair) != 0)
-#else
-	if (socketpair(AF_UNIX, SOCK_SEQPACKET, PF_UNSPEC, pair) != 0)
-#endif /* !defined(__minix) */
 		fatal("socketpair failed");
 
 	switch (fork()) {
diff --git a/lib/libc/gen/syslog.c b/lib/libc/gen/syslog.c
index f07434811..5ece67ddd 100644
--- a/lib/libc/gen/syslog.c
+++ b/lib/libc/gen/syslog.c
@@ -59,10 +59,6 @@ __RCSID("$NetBSD: syslog.c,v 1.54 2014/09/18 13:58:20 christos Exp $");
 #include "reentrant.h"
 #include "extern.h"
 
-#if defined(__minix)
-#include <sys/ioctl.h>
-#endif /* defined(__minix) */
-
 #ifdef __weak_alias
 __weak_alias(closelog,_closelog)
 __weak_alias(openlog,_openlog)
@@ -452,11 +448,7 @@ vsyslogp_r(int pri, struct syslog_data *data, const char *msgid,
 	 * to give syslogd a chance to empty its socket buffer.
 	 */
 	for (tries = 0; tries < MAXTRIES; tries++) {
-#if defined(__minix)
-		if (write(data->log_file, tbuf, cnt) != -1)
-#else
 		if (send(data->log_file, tbuf, cnt, 0) != -1)
-#endif /* defined(__minix) */
 			break;
 		if (errno != ENOBUFS) {
 			disconnectlog_r(data);
@@ -513,9 +505,7 @@ connectlog_r(struct syslog_data *data)
 	/* AF_UNIX address of local logger */
 	static const struct sockaddr_un sun = {
 		.sun_family = AF_LOCAL,
-#if !defined(__minix)
 		.sun_len = sizeof(sun),
-#endif /* !defined(__minix) */
 		.sun_path = _PATH_LOG,
 	};
 
@@ -526,14 +516,9 @@ connectlog_r(struct syslog_data *data)
 		data->log_connected = 0;
 	}
 	if (!data->log_connected) {
-#if defined(__minix)
-		if(ioctl(data->log_file, NWIOSUDSTADDR, __UNCONST(&sun)) < 0)
-
-#else
 		if (connect(data->log_file,
 		    (const struct sockaddr *)(const void *)&sun,
 		    (socklen_t)sizeof(sun)) == -1)
-#endif /* defined(__minix) */
 		{
 			(void)close(data->log_file);
 			data->log_file = -1;
diff --git a/lib/libc/net/Makefile.inc b/lib/libc/net/Makefile.inc
index bf65717a1..fa29213c5 100644
--- a/lib/libc/net/Makefile.inc
+++ b/lib/libc/net/Makefile.inc
@@ -2,13 +2,6 @@
 #	@(#)Makefile.inc	8.2 (Berkeley) 9/5/93
 
 # net sources
-.if defined(__MINIX)
-.PATH: ${NETBSDSRCDIR}/minix/lib/libc/net
-
-CPPFLAGS.getpeereid.c+= -D_MINIX_SYSTEM=1
-CPPFLAGS.getsockopt.c+= -D_MINIX_SYSTEM=1
-CPPFLAGS.setsockopt.c+= -D_MINIX_SYSTEM=1
-.endif
 .PATH: ${ARCHDIR}/net ${.CURDIR}/net
 
 SRCS+=	base64.c ethers.c gethnamaddr.c getifaddrs.c \
diff --git a/minix/commands/DESCRIBE/DESCRIBE.sh b/minix/commands/DESCRIBE/DESCRIBE.sh
index e826052ab..09f56ecf5 100644
--- a/minix/commands/DESCRIBE/DESCRIBE.sh
+++ b/minix/commands/DESCRIBE/DESCRIBE.sh
@@ -192,9 +192,6 @@ do
     17,0)
 	des="hello" dev=hello
 	;;
-    18,0)
-	des="UNIX domain socket" dev=uds
-	;;
     5[6-9],0|6[0-3],0)
 	drive=`expr $major - 56`
 	des="vnode disk $drive" dev=vnd$drive
diff --git a/minix/commands/MAKEDEV/MAKEDEV.sh b/minix/commands/MAKEDEV/MAKEDEV.sh
index 78d647dcb..8c8a98b12 100755
--- a/minix/commands/MAKEDEV/MAKEDEV.sh
+++ b/minix/commands/MAKEDEV/MAKEDEV.sh
@@ -49,7 +49,6 @@ STD_DEVICES="
 	ttypa ttypb ttypc ttypd ttype ttypf
 	ttyq0 ttyq1 ttyq2 ttyq3 ttyq4 ttyq5 ttyq6 ttyq7 ttyq8 ttyq9
 	ttyqa ttyqb ttyqc ttyqd ttyqe ttyqf
-	uds
 	vnd0 vnd0p0 vnd0p0s0 vnd1 vnd1p0 vnd1p0s0
 	vnd2 vnd3 vnd4 vnd5 vnd6 vnd7
 "
@@ -134,7 +133,6 @@ Where key is one of the following:
   klog                    # Make /dev/klog
   ptmx                    # Make /dev/ptmx
   random                  # Make /dev/random, /dev/urandom
-  uds                     # Make /dev/uds
   filter                  # Make /dev/filter
   fbd                     # Make /dev/fbd
   hello                   # Make /dev/hello
@@ -438,10 +436,6 @@ do
 
 		makedev ${dev} c 4 ${minor} ${uname} tty ${permissions}
 		;;
-	uds)
-		# Unix domain sockets device
-		makedev ${dev} c 18 0 ${uname} ${gname} 666
-		;;
 	vnd[0-7])
 		# Whole vnode disk devices.
 		makedev ${dev} b ${major} 0 ${uname} ${gname} ${permissions}
diff --git a/minix/include/minix/dmap.h b/minix/include/minix/dmap.h
index 8c6560d3b..c02731747 100644
--- a/minix/include/minix/dmap.h
+++ b/minix/include/minix/dmap.h
@@ -36,8 +36,8 @@
 #define LOG_MAJOR		  15	/* 15 = /dev/klog   (log driver)      */
 #define RANDOM_MAJOR		  16	/* 16 = /dev/random (random driver)   */
 #define HELLO_MAJOR		  17	/* 17 = /dev/hello  (hello driver)    */
-#define UDS_MAJOR		  18	/* 18 = /dev/uds    (pfs)             */
-#define FB_MAJOR		  19	/* 18 = /dev/fb0    (fb driver)       */
+					/* 18 = (unused)                      */
+#define FB_MAJOR		  19	/* 19 = /dev/fb0    (fb driver)       */
 #define I2C0_MAJOR		  20	/* 20 = /dev/i2c-1  (i2c-dev)         */
 #define I2C1_MAJOR		  21	/* 21 = /dev/i2c-2  (i2c-dev)         */
 #define I2C2_MAJOR		  22	/* 22 = /dev/i2c-3  (i2c-dev)         */
diff --git a/minix/include/minix/syslib.h b/minix/include/minix/syslib.h
index 98f58cbd5..1856c41da 100644
--- a/minix/include/minix/syslib.h
+++ b/minix/include/minix/syslib.h
@@ -273,11 +273,10 @@ uid_t getnuid(endpoint_t proc_ep);
 gid_t getngid(endpoint_t proc_ep);
 int getsockcred(endpoint_t proc_ep, struct sockcred * sockcred, gid_t * groups,
 	int ngroups);
-int socketpath(endpoint_t endpt, char *path, size_t size, int what, dev_t *dev,
-	ino_t *ino);
+int socketpath(endpoint_t endpt, const char *path, size_t size, int what,
+	dev_t *dev, ino_t *ino);
 #define SPATH_CHECK	0	/* check user permissions on socket path */
 #define SPATH_CREATE	1	/* create socket file at given path */
-#define SPATH_CANONIZE	0x8000	/* copy back canonized path (legacy support) */
 int copyfd(endpoint_t endpt, int fd, int what);
 #define COPYFD_FROM	0	/* copy file descriptor from remote process */
 #define COPYFD_TO	1	/* copy file descriptor to remote process */
diff --git a/minix/lib/libc/net/getpeereid.c b/minix/lib/libc/net/getpeereid.c
deleted file mode 100644
index 7638c12ba..000000000
--- a/minix/lib/libc/net/getpeereid.c
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <errno.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/socket.h>
-#include <sys/ucred.h>
-
-/*
- * get the effective user ID and effective group ID of a peer
- * connected through a Unix domain socket.
- */
-int getpeereid(int sd, uid_t *euid, gid_t *egid) {
-	int rc;
-	struct uucred cred;
-	socklen_t ucred_length;
-
-	/* Initialize Data Structures */
-	ucred_length = sizeof(struct uucred);
-	memset(&cred, '\0', ucred_length);
-
-	/* Validate Input Parameters */
-	if (euid == NULL || egid == NULL) {
-		errno = EFAULT;
-		return -1;
-	} /* getsockopt will handle validating 'sd' */
-
-	/* Get the credentials of the peer at the other end of 'sd' */
-	rc = getsockopt(sd, SOL_SOCKET, SO_PEERCRED, &cred, &ucred_length);
-	if (rc == 0) {
-		/* Success - return the results */
-		*euid = cred.cr_uid;
-		*egid = cred.cr_gid;
-		return 0;
-	} else {
-		/* Failure - getsockopt takes care of setting errno */
-		return -1;
-	}
-}
diff --git a/minix/lib/libc/sys/getsockopt.c b/minix/lib/libc/sys/getsockopt.c
index 7edea530e..7c000fb05 100644
--- a/minix/lib/libc/sys/getsockopt.c
+++ b/minix/lib/libc/sys/getsockopt.c
@@ -244,6 +244,7 @@ static int _uds_getsockopt(int sock, int level, int option_name,
 		return 0;
 	}
 
+#ifdef SO_PEERCRED
 	if (level == SOL_SOCKET && option_name == SO_PEERCRED)
 	{
 		struct uucred cred;
@@ -257,6 +258,7 @@ static int _uds_getsockopt(int sock, int level, int option_name,
 							option_len);
 		return 0;
 	}
+#endif
 
 
 	if (level == SOL_SOCKET && option_name == SO_REUSEADDR)
@@ -269,12 +271,14 @@ static int _uds_getsockopt(int sock, int level, int option_name,
 		return 0;
 	}
 
+#ifdef SO_PASSCRED
 	if (level == SOL_SOCKET && option_name == SO_PASSCRED)
 	{
 		i = 1;	/* option is always 'on' */
 		getsockopt_copy(&i, sizeof(i), option_value, option_len);
 		return 0;
 	}
+#endif
 
 #if DEBUG
 	fprintf(stderr, "_uds_getsocketopt: level %d, name %d\n",
diff --git a/minix/lib/libc/sys/setsockopt.c b/minix/lib/libc/sys/setsockopt.c
index 04c0d311d..cbe29ed03 100644
--- a/minix/lib/libc/sys/setsockopt.c
+++ b/minix/lib/libc/sys/setsockopt.c
@@ -267,6 +267,7 @@ static int _uds_setsockopt(int sock, int level, int option_name,
 		return 0;
 	}
 
+#ifdef SO_PASSCRED
 	if (level == SOL_SOCKET && option_name == SO_PASSCRED)
 	{
 		if (option_len != sizeof(i))
@@ -283,6 +284,7 @@ static int _uds_setsockopt(int sock, int level, int option_name,
 		}
 		return 0;
 	}
+#endif
 
 #if DEBUG
 	fprintf(stderr, "_uds_setsocketopt: level %d, name %d\n",
diff --git a/minix/lib/libsys/socketpath.c b/minix/lib/libsys/socketpath.c
index 2473a7815..e4634bfb0 100644
--- a/minix/lib/libsys/socketpath.c
+++ b/minix/lib/libsys/socketpath.c
@@ -5,22 +5,22 @@
 #include <minix/safecopies.h>
 
 int
-socketpath(endpoint_t endpt, char * path, size_t size, int what, dev_t * dev,
-	ino_t * ino)
+socketpath(endpoint_t endpt, const char * path, size_t size, int what,
+	dev_t * dev, ino_t * ino)
 {
 	cp_grant_id_t grant;
 	message m;
 	int r;
 
 	if ((grant = cpf_grant_direct(VFS_PROC_NR, (vir_bytes)path, size,
-	    CPF_READ | CPF_WRITE)) == GRANT_INVALID)
+	    CPF_READ)) == GRANT_INVALID)
 		return ENOMEM;
 
 	memset(&m, 0, sizeof(m));
 	m.m_lsys_vfs_socketpath.endpt = endpt;
 	m.m_lsys_vfs_socketpath.grant = grant;
 	m.m_lsys_vfs_socketpath.count = size;
-	m.m_lsys_vfs_socketpath.what = what | SPATH_CANONIZE;
+	m.m_lsys_vfs_socketpath.what = what;
 
 	r = _taskcall(VFS_PROC_NR, VFS_SOCKETPATH, &m);
 
diff --git a/minix/man/man2/Makefile b/minix/man/man2/Makefile
index 3db88c945..1342b77ec 100644
--- a/minix/man/man2/Makefile
+++ b/minix/man/man2/Makefile
@@ -1,6 +1,6 @@
 MAN=	accept.2 access.2 bind.2 brk.2 chdir.2 chmod.2 chown.2 \
 	chroot.2 close.2 connect.2 creat.2 dup.2 execve.2 exit.2 fcntl.2 \
-	fork.2 getgid.2 getitimer.2 getpeereid.2 \
+	fork.2 getgid.2 getitimer.2 \
 	getpeername.2 getpid.2 getpriority.2 getsockname.2 getsockopt.2 \
 	gettimeofday.2 getuid.2 intro.2 ioctl.2 kill.2 link.2 listen.2 \
 	lseek.2 mkdir.2 mknod.2 mount.2 open.2 ptrace.2 \
diff --git a/minix/man/man2/getpeereid.2 b/minix/man/man2/getpeereid.2
deleted file mode 100644
index 2c0a15f07..000000000
--- a/minix/man/man2/getpeereid.2
+++ /dev/null
@@ -1,42 +0,0 @@
-.TH GETPEEREID 2
-.SH NAME
-getpeereid \- get the effective user ID and effective group ID of a peer
-connected through a Unix domain socket.
-.SH SYNOPSIS
-.ft B
-#include <sys/socket.h>
-
-.in +5
-.ti -5
-int getpeereid(int \fIsd\fP, uid_t *\fIeuid\fP, gid_t *\fIegid\fP);
-.br
-.ft P
-.SH DESCRIPTION
-getpeereid() is often used to authenticate clients connecting to a 
-server through a Unix domain socket. The server can call this function 
-with a socket descriptor \fIsd\fP and this function will fill\-in 
-\fIeuid\fP and \fIegid\fP with the effective user ID and the effective 
-group ID of the client process.
-.SH RETURN VALUES
-On success, this function returns 0, \fIeuid\fP is set to the effective 
-user ID of the peer connected through Unix domain socket \fIsd\fP, and 
-\fIegid\fP is set to the effective group ID of the peer connected 
-through Unix domain socket \fIsd\fP. On error, -1 is returned and 
-\fIerrno\fP is set.
-.SH ERRORS
-.TP 15
-[EBADF]
-The argument \fIsd\fP is not a descriptor.
-.TP 15
-[ENOTSOCK]
-The argument \fIsd\fP is a descriptor, but not a socket descriptor.
-.TP 15
-[EFAULT]
-The address pointed to by \fIeuid\fP and/or \fIegid\fP is not in a 
-valid part of the process address space.
-.SH SEE ALSO
-.BR socket(2),
-.BR socketpair(2),
-.BR unix(8)
-.SH HISTORY
-This function first appeared in Minix 3.1.8.
diff --git a/minix/net/uds/Makefile b/minix/net/uds/Makefile
index 8ae35c943..a85c4483e 100644
--- a/minix/net/uds/Makefile
+++ b/minix/net/uds/Makefile
@@ -1,9 +1,15 @@
 # Makefile for the UNIX Domain Sockets driver (UDS)
 PROG=	uds
-SRCS=	uds.c ioc_uds.c
-MAN=	uds.8 unix.8
+SRCS=	uds.c io.c stat.c
+MAN=	unix.8
 
-DPADD+=	${LIBCHARDRIVER} ${LIBSYS}
-LDADD+=	-lchardriver -lsys
+FILES=${PROG}.conf
+FILESNAME=${PROG}
+FILESDIR= /etc/system.conf.d
+
+DPADD+=	${LIBSOCKEVENT} ${LIBSOCKDRIVER} ${LIBSYS} ${LIBTIMERS}
+LDADD+=	-lsockevent -lsockdriver -lsys -ltimers
+
+WARNS?=	5
 
 .include <minix.service.mk>
diff --git a/minix/net/uds/io.c b/minix/net/uds/io.c
new file mode 100644
index 000000000..1b8de37b8
--- /dev/null
+++ b/minix/net/uds/io.c
@@ -0,0 +1,1795 @@
+/* UNIX Domain Sockets - io.c - sending and receiving */
+
+#include "uds.h"
+#include <sys/mman.h>
+
+/*
+ * Our UDS sockets do not have a send buffer.  They only have a receive buffer.
+ * This receive buffer, when not empty, is split up in segments.  Each segment
+ * may contain regular data, ancillary data, both, or (for SOCK_SEQPACKET and
+ * SOCK_DGRAM) neither.  There are two types of ancillary data: in-flight file
+ * descriptors and sender credentials.  In addition, for SOCK_DGRAM sockets,
+ * the segment may contain the sender's socket path (if the sender's socket is
+ * bound).  Each segment has a header, containing the full segment size,
+ * the size of the actual data in the segment (if any), and a flags field that
+ * states which ancillary data are associated with the segment (if any).  For
+ * SOCK_STREAM type sockets, new data may be merged into a previous segment,
+ * but only if it has no ancillary data.  For the other two socket types, each
+ * packet has its own header.  The resulting behavior should be in line with
+ * the POSIX "Socket Receive Queue" specification.
+ *
+ * More specifically, each segment consists of the following parts:
+ * - always a five-byte header, containing a two-byte segment length (including
+ *   the header, so always non-zero), a two-byte regular data length (zero or
+ *   more), and a one-byte flags field which is a bitwise combination of
+ *   UDS_HAS_{FDS,CRED,PATH} flags;
+ * - next, if UDS_HAS_CRED is set in the segment header: a variable-size
+ *   sockcred structure, preceded by one byte containing the structure length
+ *   (excluding that byte, so from sizeof(struct sockcred) to UDS_MAXCREDLEN);
+ * - next, if UDS_HAS_PATH is set in the segment header: the sender's socket
+ *   path, stored without null terminator after a one-byte path length;
+ * - next, if the data length is non-zero, the actual regular data.
+ * If the segment is not the last in the receive buffer, it is followed by the
+ * next segment immediately afterward.  There is no alignment.
+ *
+ * It is the sender's responsibility to merge new data into the last segment
+ * whenever possible, so that the receiver side never needs to consider more
+ * than one segment at once.  In order to allow such merging, each receive
+ * buffer has not only a tail and in-use length (pointing to the head when
+ * combined) but also an offset from the tail to the last header, if any.  Note
+ * that the receiver may over time still look at multiple segments for a single
+ * request: this happens when a MSG_WAITALL request empties the buffer and then
+ * blocks - the next piece of arriving data can then obviously not be merged.
+ *
+ * If a segment has the UDS_HAS_FDS flag set, then one or more in-flight file
+ * descriptors are associated with the segment.  These are stored in a separate
+ * data structure, mainly to simplify cleaning up when the socket is shut down
+ * for reading or closed.  That structure also contains the number of file
+ * descriptors associated with the current segment, so this is not stored in
+ * the segment itself.  As mentioned later, this may be changed in the future.
+ *
+ * On the sender side, there is a trade-off between fully utilizing the receive
+ * buffer, and not repeatedly performing expensive actions for the same call:
+ * it may be costly to determine exactly how many in-flight file descriptors
+ * there will be (if any) and/or how much space is needed to store credentials.
+ * We currently use the policy that we rather block/reject a send request that
+ * may (just) have fit in the remaining part of the receive buffer, than obtain
+ * the same information multiple times or keep state between callbacks.  In
+ * practice this is not expected to make a difference, especially since
+ * transfer of ancillary data should be rare anyway.
+ */
+/*
+ * The current layout of the segment header is as follows.
+ *
+ * The first byte contains the upper eight bits of the total segment length.
+ * The second byte contains the lower eight bits of the total segment length.
+ * The third byte contains the upper eight bits of the data length.
+ * The fourth byte contains the lower eight bits of the data length.
+ * The fifth byte is a bitmask for ancillary data associated with the segment.
+ */
+#define UDS_HDRLEN	5
+
+#define UDS_HAS_FDS	0x01	/* segment has in-flight file descriptors */
+#define UDS_HAS_CRED	0x02	/* segment has sender credentials */
+#define UDS_HAS_PATH	0x04	/* segment has source socket path */
+
+#define UDS_MAXCREDLEN	SOCKCREDSIZE(NGROUPS_MAX)
+
+#define uds_get_head(uds) 	\
+	((size_t)((uds)->uds_tail + (uds)->uds_len) % UDS_BUF)
+#define uds_get_last(uds)	\
+	((size_t)((uds)->uds_tail + (uds)->uds_last) % UDS_BUF)
+#define uds_advance(pos,add) (((pos) + (add)) % UDS_BUF)
+
+/*
+ * All in-flight file descriptors are (co-)owned by the UDS driver itself, as
+ * local open file descriptors.  Like any other process, the UDS driver can not
+ * have more than OPEN_MAX open file descriptors at any time.  Thus, this is
+ * also the inherent maximum number of in-flight file descriptors.  Therefore,
+ * we maintain a single pool of in-flight FD structures, and we associate these
+ * structures with sockets as needed.
+ */
+static struct uds_fd uds_fds[OPEN_MAX];
+static SIMPLEQ_HEAD(uds_freefds, uds_fd) uds_freefds;
+
+static char uds_ctlbuf[UDS_CTL_MAX];
+static int uds_ctlfds[UDS_CTL_MAX / sizeof(int)];
+
+/*
+ * Initialize the input/output part of the UDS service.
+ */
+void
+uds_io_init(void)
+{
+	unsigned int slot;
+
+	SIMPLEQ_INIT(&uds_freefds);
+
+	for (slot = 0; slot < __arraycount(uds_fds); slot++)
+		SIMPLEQ_INSERT_TAIL(&uds_freefds, &uds_fds[slot], ufd_next);
+}
+
+/*
+ * Set up all input/output state for the given socket, which has just been
+ * allocated.  As part of this, allocate memory for the receive buffer of the
+ * socket.  Return OK or a negative error code.
+ */
+int
+uds_io_setup(struct udssock * uds)
+{
+
+	/* TODO: decide if we should preallocate the memory. */
+	if ((uds->uds_buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE,
+	    MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED)
+		return ENOMEM;
+
+	uds->uds_tail = 0;
+	uds->uds_len = 0;
+	uds->uds_last = 0;
+
+	SIMPLEQ_INIT(&uds->uds_fds);
+
+	return OK;
+}
+
+/*
+ * Clean up the input/output state for the given socket, which is about to be
+ * freed.  As part of this, deallocate memory for the receive buffer and close
+ * any file descriptors still in flight on the socket.
+ */
+void
+uds_io_cleanup(struct udssock * uds)
+{
+
+	/* Close any in-flight file descriptors. */
+	uds_io_reset(uds);
+
+	/* Free the receive buffer memory. */
+	if (munmap(uds->uds_buf, UDS_BUF) != 0)
+		panic("UDS: munmap failed: %d", errno);
+}
+
+/*
+ * The socket is being closed or shut down for reading.  If there are still any
+ * in-flight file descriptors, they will never be received anymore, so close
+ * them now.
+ */
+void
+uds_io_reset(struct udssock * uds)
+{
+	struct uds_fd *ufd;
+
+	/*
+	 * The UDS service may have the last and only reference to any of these
+	 * file descriptors here.  For that reason, we currently disallow
+	 * transfer of UDS file descriptors, because the close(2) here could
+	 * block on a socket close operation back to us, leading to a deadlock.
+	 * Also, we use a non-blocking variant of close(2), to prevent that we
+	 * end up hanging on sockets with SO_LINGER turned on.
+	 */
+	SIMPLEQ_FOREACH(ufd, &uds->uds_fds, ufd_next) {
+		dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
+
+		closenb(ufd->ufd_fd);
+	}
+
+	SIMPLEQ_CONCAT(&uds_freefds, &uds->uds_fds);
+
+	/*
+	 * If this reset happens as part of a shutdown, it might be done
+	 * again on close, so ensure that it will find a clean state.  The
+	 * receive buffer should never be looked at again either way, but reset
+	 * it too just to be sure.
+	 */
+	uds->uds_tail = 0;
+	uds->uds_len = 0;
+	uds->uds_last = 0;
+
+	SIMPLEQ_INIT(&uds->uds_fds);
+}
+
+/*
+ * Return the maximum usable part of the receive buffer, in bytes.  The return
+ * value is used for the SO_SNDBUF and SO_RCVBUF socket options.
+ */
+size_t
+uds_io_buflen(void)
+{
+
+	/*
+	 * TODO: it would be nicer if at least for SOCK_STREAM-type sockets, we
+	 * could use the full receive buffer for data.  This would require that
+	 * we store up to one header in the socket object rather than in the
+	 * receive buffer.
+	 */
+	return UDS_BUF - UDS_HDRLEN;
+}
+
+/*
+ * Fetch 'len' bytes starting from absolute position 'off' into the receive
+ * buffer of socket 'uds', and copy them into the buffer pointed to by 'ptr'.
+ * Return the absolute position of the first byte after the fetched data in the
+ * receive buffer.
+ */
+static size_t
+uds_fetch(struct udssock * uds, size_t off, void * ptr, size_t len)
+{
+	size_t left;
+
+	assert(off < UDS_BUF);
+
+	left = UDS_BUF - off;
+	if (len >= left) {
+		memcpy(ptr, &uds->uds_buf[off], left);
+
+		if ((len -= left) > 0)
+			memcpy((char *)ptr + left, &uds->uds_buf[0], len);
+
+		return len;
+	} else {
+		memcpy(ptr, &uds->uds_buf[off], len);
+
+		return off + len;
+	}
+}
+
+/*
+ * Store 'len' bytes from the buffer pointed to by 'ptr' into the receive
+ * buffer of socket 'uds', starting at absolute position 'off' into the receive
+ * buffer.  Return the absolute position of the first byte after the stored
+ * data in the receive buffer.
+ */
+static size_t
+uds_store(struct udssock * uds, size_t off, const void * ptr, size_t len)
+{
+	size_t left;
+
+	assert(off < UDS_BUF);
+
+	left = UDS_BUF - off;
+	if (len >= left) {
+		memcpy(&uds->uds_buf[off], ptr, left);
+
+		if ((len -= left) > 0)
+			memcpy(&uds->uds_buf[0], (const char *)ptr + left,
+			    len);
+
+		return len;
+	} else {
+		memcpy(&uds->uds_buf[off], ptr, len);
+
+		return off + len;
+	}
+}
+
+/*
+ * Fetch a segment header previously stored in the receive buffer of socket
+ * 'uds' at absolute position 'off'.  Return the absolute position of the first
+ * byte after the header, as well as the entire segment length in 'seglen', the
+ * length of the data in the segment in 'datalen', and the segment flags in
+ * 'segflags'.
+ */
+static size_t
+uds_fetch_hdr(struct udssock * uds, size_t off, size_t * seglen,
+	size_t * datalen, unsigned int * segflags)
+{
+	unsigned char hdr[UDS_HDRLEN];
+
+	off = uds_fetch(uds, off, hdr, sizeof(hdr));
+
+	*seglen = ((size_t)hdr[0] << 8) | (size_t)hdr[1];
+	*datalen = ((size_t)hdr[2] << 8) | (size_t)hdr[3];
+	*segflags = hdr[4];
+
+	assert(*seglen >= UDS_HDRLEN);
+	assert(*seglen <= uds->uds_len);
+	assert(*datalen <= *seglen - UDS_HDRLEN);
+	assert(*segflags != 0 || *datalen == *seglen - UDS_HDRLEN);
+	assert(!(*segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH)));
+
+	return off;
+}
+
+/*
+ * Store a segment header in the receive buffer of socket 'uds' at absolute
+ * position 'off', with the segment length 'seglen', the segment data length
+ * 'datalen', and the segment flags 'segflags'.  Return the absolute receive
+ * buffer position of the first data byte after the stored header.
+ */
+static size_t
+uds_store_hdr(struct udssock * uds, size_t off, size_t seglen, size_t datalen,
+	unsigned int segflags)
+{
+	unsigned char hdr[UDS_HDRLEN];
+
+	assert(seglen <= USHRT_MAX);
+	assert(datalen <= seglen);
+	assert(segflags <= UCHAR_MAX);
+	assert(!(segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH)));
+
+	hdr[0] = (seglen >> 8) & 0xff;
+	hdr[1] = seglen & 0xff;
+	hdr[2] = (datalen >> 8) & 0xff;
+	hdr[3] = datalen & 0xff;
+	hdr[4] = segflags;
+
+	return uds_store(uds, off, hdr, sizeof(hdr));
+}
+
+/*
+ * Perform initial checks on a send request, before it may potentially be
+ * suspended.  Return OK if this send request is valid, or a negative error
+ * code if it is not.
+ */
+int
+uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused,
+	const struct sockaddr * addr, socklen_t addr_len __unused,
+	endpoint_t user_endpt __unused, int flags)
+{
+	struct udssock *uds = (struct udssock *)sock;
+	size_t pathlen;
+
+	/*
+	 * Reject calls with unknown flags.  Besides the flags handled entirely
+	 * by libsockevent (which are not part of 'flags' here), that is all of
+	 * them.  TODO: ensure that we should really reject all other flags
+	 * rather than ignore them.
+	 */
+	if (flags != 0)
+		return EOPNOTSUPP;
+
+	/*
+	 * Perform very basic address and message size checks on the send call.
+	 * For non-stream sockets, we must reject packets that may never fit in
+	 * the receive buffer, or otherwise (at least for SOCK_SEQPACKET) the
+	 * send call may end up being suspended indefinitely.  Therefore, we
+	 * assume the worst-case scenario, which is that a full set of
+	 * credentials must be associated with the packet.  As a result, we may
+	 * reject some large packets that could actually just fit.  Checking
+	 * the peer's LOCAL_CREDS setting here is not safe: even if we know the
+	 * peer already at all (for SOCK_DGRAM we do not), the send may still
+	 * block and the option toggled before it unblocks.
+	 */
+	switch (uds_get_type(uds)) {
+	case SOCK_STREAM:
+		/* Nothing to check for this case. */
+		break;
+
+	case SOCK_SEQPACKET:
+		if (len > UDS_BUF - UDS_HDRLEN - 1 - UDS_MAXCREDLEN)
+			return EMSGSIZE;
+
+		break;
+
+	case SOCK_DGRAM:
+		if (!uds_has_link(uds) && addr == NULL)
+			return EDESTADDRREQ;
+
+		/*
+		 * The path is stored without null terminator, but with leading
+		 * byte containing the path length--if there is a path at all.
+		 */
+		pathlen = (size_t)uds->uds_pathlen;
+		if (pathlen > 0)
+			pathlen++;
+
+		if (len > UDS_BUF - UDS_HDRLEN - pathlen - 1 - UDS_MAXCREDLEN)
+			return EMSGSIZE;
+
+		break;
+
+	default:
+		assert(0);
+	}
+
+	return OK;
+}
+
+/*
+ * Determine whether the (real or pretend) send request should be processed
+ * now, suspended until later, or rejected based on the current socket state.
+ * Return OK if the send request should be processed now.  Return SUSPEND if
+ * the send request should be retried later.  Return an appropriate negative
+ * error code if the send request should fail.
+ */
+static int
+uds_send_test(struct udssock * uds, size_t len, socklen_t ctl_len, size_t min,
+	int partial)
+{
+	struct udssock *conn;
+	size_t avail, hdrlen, credlen;
+
+	assert(!uds_is_shutdown(uds, SFL_SHUT_WR));
+
+	if (uds_get_type(uds) != SOCK_DGRAM) {
+		if (uds_is_connecting(uds))
+			return SUSPEND;
+		if (!uds_is_connected(uds) && !uds_is_disconnected(uds))
+			return ENOTCONN;
+		if (!uds_has_conn(uds))
+			return EPIPE;
+
+		conn = uds->uds_conn;
+
+		if (uds_is_shutdown(conn, SFL_SHUT_RD))
+			return EPIPE;
+
+		/*
+		 * For connection-type sockets, we now have to check if there
+		 * is enough room in the receive buffer.  For SOCK_STREAM
+		 * sockets, we must check if at least 'min' bytes can be moved
+		 * into the receive buffer, at least if that is a reasonable
+		 * value for ever making any forward progress at all.  For
+		 * SOCK_SEQPACKET sockets, we must check if the entire packet
+		 * of size 'len' can be stored in the receive buffer.  In both
+		 * cases, we must take into account any metadata to store along
+		 * with the data.
+		 *
+		 * Unlike in uds_pre_send(), we can now check safely whether
+		 * the peer is expecting credentials, but we still don't know
+		 * the actual size of the credentials, so again we take the
+		 * maximum possible size.  The same applies to file descriptors
+		 * transferred via control data: all we have is the control
+		 * length right now, which if non-zero we assume to mean there
+		 * might be file descriptors.
+		 *
+		 * In both cases, the reason of overestimating is that actually
+		 * getting accurate sizes, by obtaining credentials or copying
+		 * in control data, is very costly.  We want to do that only
+		 * when we are sure we will not suspend the send call after
+		 * all.  It is no problem to overestimate how much space will
+		 * be needed here, but not to underestimate: that could cause
+		 * applications that use select(2) and non-blocking sockets to
+		 * end up in a busy-wait loop.
+		 */
+		if (!partial && (conn->uds_flags & UDSF_PASSCRED))
+			credlen = 1 + UDS_MAXCREDLEN;
+		else
+			credlen = 0;
+
+		avail = UDS_BUF - conn->uds_len;
+
+		if (uds_get_type(uds) == SOCK_STREAM) {
+			/*
+			 * Limit the low threshold to the maximum that can ever
+			 * be sent at once.
+			 */
+			if (min > UDS_BUF - UDS_HDRLEN - credlen)
+				min = UDS_BUF - UDS_HDRLEN - credlen;
+
+			/*
+			 * Suspend the call only if not even the low threshold
+			 * is met.  Otherwise we may make (partial) progress.
+			 */
+			if (len > min)
+				len = min;
+
+			/*
+			 * If the receive buffer already has at least one
+			 * segment, and there are certainly no file descriptors
+			 * to transfer now, and we do not have to store
+			 * credentials either, then this segment can be merged
+			 * with the previous one.  In that case, we need no
+			 * space for a header.  That is certainly the case if
+			 * we are resuming an already partially completed send.
+			 */
+			hdrlen = (avail == UDS_BUF || ctl_len != 0 ||
+			    credlen > 0) ? UDS_HDRLEN : 0;
+		} else
+			hdrlen = UDS_HDRLEN;
+
+		if (avail < hdrlen + credlen + len)
+			return SUSPEND;
+	}
+
+	return OK;
+}
+
+/*
+ * Get the destination peer for a send request.  The send test has already been
+ * performed first.  On success, return OK, with a pointer to the peer socket
+ * stored in 'peerp'.  On failure, return an appropriate error code.
+ */
+static int
+uds_send_peer(struct udssock * uds, const struct sockaddr * addr,
+	socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp)
+{
+	struct udssock *peer;
+	int r;
+
+	if (uds_get_type(uds) == SOCK_DGRAM) {
+		if (!uds_has_link(uds)) {
+			/* This was already checked in uds_pre_check(). */
+			assert(addr != NULL);
+
+			/*
+			 * Find the socket identified by the given address.
+			 * If it exists at all, see if it is a proper match.
+			 */
+			if ((r = uds_lookup(uds, addr, addr_len, user_endpt,
+			    &peer)) != OK)
+				return r;
+
+			/*
+			 * If the peer socket is connected to a target, it
+			 * must be this socket.  Unfortunately, POSIX does not
+			 * specify an error code for this.  We borrow Linux's.
+			 */
+			if (uds_has_link(peer) && peer->uds_link != uds)
+				return EPERM;
+		} else
+			peer = uds->uds_link;
+
+		/*
+		 * If the receiving end will never receive this packet, we
+		 * might as well not send it, so drop it immediately.  Indicate
+		 * as such to the caller, using NetBSD's chosen error code.
+		 */
+		if (uds_is_shutdown(peer, SFL_SHUT_RD))
+			return ENOBUFS;
+	} else {
+		assert(uds_has_conn(uds));
+
+		peer = uds->uds_conn;
+	}
+
+	*peerp = peer;
+	return OK;
+}
+
+/*
+ * Generate a new segment for the current send request, or arrange things such
+ * that new data can be merged with a previous segment.  As part of this,
+ * decide whether we can merge data at all.  The segment will be merged if, and
+ * only if, all of the following requirements are met:
+ *
+ *   1) the socket is of type SOCK_STREAM;
+ *   2) there is a previous segment in the receive buffer;
+ *   3) there is no ancillary data for the current send request.
+ *
+ * Also copy in regular data (if any), retrieve the sender's credentials (if
+ * needed), and copy over the source path (if applicable).  However, do not yet
+ * commit the segment (or the new part to be merged), because the send request
+ * may still fail for other reasons.
+ *
+ * On success, return the length of the new segment (or, when merging, the
+ * length to be added to the last segment), as well as a flag indicating
+ * whether we are merging into the last segment in 'mergep', the length of the
+ * (new) data in the segment in 'datalenp', and the new segment's flags in
+ * 'segflagsp' (always zero when merging).  Note that a return value of zero
+ * implies that we are merging zero extra bytes into the last segment, which
+ * means that effectively nothing changes; in that case the send call will be
+ * cut short and return zero to the caller as well.  On failure, return a
+ * negative error code.
+ */
+static int
+uds_send_data(struct udssock * uds, struct udssock * peer,
+	const struct sockdriver_data * data, size_t len, size_t off,
+	endpoint_t user_endpt, unsigned int nfds, int * __restrict mergep,
+	size_t * __restrict datalenp, unsigned int * __restrict segflagsp)
+{
+	struct sockcred sockcred;
+	gid_t groups[NGROUPS_MAX];
+	iovec_t iov[2];
+	unsigned int iovcnt, segflags;
+	unsigned char lenbyte;
+	size_t credlen, pathlen, datalen, seglen;
+	size_t avail, pos, left;
+	int r, merge;
+
+	/*
+	 * At this point we should add the data to the peer's receive buffer.
+	 * In the case of SOCK_STREAM sockets, we should add as much of the
+	 * data as possible and suspend the call to send the rest later, if
+	 * applicable.  In the case of SOCK_DGRAM sockets, we should drop the
+	 * packet if it does not fit in the buffer.
+	 *
+	 * Due to the checks in uds_send_test(), we know for sure that we no
+	 * longer have to suspend without making any progress at this point.
+	 */
+	segflags = (nfds > 0) ? UDS_HAS_FDS : 0;
+
+	/*
+	 * Obtain the credentials now.  Doing so allows us to determine how
+	 * much space we actually need for them.
+	 */
+	if (off == 0 && (peer->uds_flags & UDSF_PASSCRED)) {
+		memset(&sockcred, 0, sizeof(sockcred));
+
+		if ((r = getsockcred(user_endpt, &sockcred, groups,
+		    __arraycount(groups))) != OK)
+			return r;
+
+		credlen = 1 + SOCKCREDSIZE(sockcred.sc_ngroups);
+
+		segflags |= UDS_HAS_CRED;
+	} else
+		credlen = 0;
+
+	/* For bound source datagram sockets, include the source path. */
+	if (uds_get_type(uds) == SOCK_DGRAM && uds->uds_pathlen != 0) {
+		pathlen = (size_t)uds->uds_pathlen + 1;
+
+		segflags |= UDS_HAS_PATH;
+	} else
+		pathlen = 0;
+
+	avail = UDS_BUF - peer->uds_len;
+
+	if (uds_get_type(uds) == SOCK_STREAM) {
+		/*
+		 * Determine whether we can merge data into the previous
+		 * segment.  This is a more refined version of the test in
+		 * uds_send_test(), as we now know whether there are actually
+		 * any FDs to transfer.
+		 */
+		merge = (peer->uds_len != 0 && nfds == 0 && credlen == 0);
+
+		/* Determine how much we can send at once. */
+		if (!merge) {
+			assert(avail > UDS_HDRLEN + credlen);
+			datalen = avail - UDS_HDRLEN - credlen;
+		} else
+			datalen = avail;
+
+		if (datalen > len)
+			datalen = len;
+
+		/* If we cannot make progress, we should have suspended.. */
+		assert(datalen != 0 || len == 0);
+	} else {
+		merge = FALSE;
+
+		datalen = len;
+	}
+	assert(datalen <= len);
+	assert(datalen <= UDS_BUF);
+
+	/*
+	 * Compute the total amount of space we need for the segment in the
+	 * receive buffer.  Given that we have done will-it-fit tests in
+	 * uds_send_test() for SOCK_STREAM and SOCK_SEQPACKET, only one case is
+	 * left where the result may not fit, and that is for SOCK_DGRAM
+	 * packets.  In that case, we drop the packet.  POSIX says we should
+	 * throw an error in that case, and that is also what NetBSD does.
+	 */
+	if (!merge)
+		seglen = UDS_HDRLEN + credlen + pathlen + datalen;
+	else
+		seglen = datalen;
+
+	if (seglen > avail) {
+		assert(uds_get_type(uds) == SOCK_DGRAM);
+
+		/* Drop the packet, borrowing NetBSD's chosen error code. */
+		return ENOBUFS;
+	}
+
+	/*
+	 * Generate the full segment, but do not yet update the buffer head.
+	 * We may still run into an error (copying in file descriptors) or even
+	 * decide that nothing gets sent after all (if there are no data or
+	 * file descriptors).  If we are merging the new data into the previous
+	 * segment, do not generate a header.
+	 */
+	pos = uds_get_head(peer);
+
+	/* Generate the header, if needed. */
+	if (!merge)
+		pos = uds_store_hdr(peer, pos, seglen, datalen, segflags);
+	else
+		assert(segflags == 0);
+
+	/* Copy in and store the sender's credentials, if desired. */
+	if (credlen > 0) {
+		assert(credlen >= 1 + sizeof(sockcred));
+		assert(credlen <= UCHAR_MAX);
+
+		lenbyte = credlen - 1;
+		pos = uds_store(peer, pos, &lenbyte, 1);
+
+		if (sockcred.sc_ngroups > 0) {
+			pos = uds_store(peer, pos, &sockcred,
+			    offsetof(struct sockcred, sc_groups));
+			pos = uds_store(peer, pos, groups,
+			    sockcred.sc_ngroups * sizeof(gid_t));
+		} else
+			pos = uds_store(peer, pos, &sockcred,
+			    sizeof(sockcred));
+	}
+
+	/* Store the sender's address if any.  Datagram sockets only. */
+	if (pathlen > 0) {
+		assert(pathlen > 1);
+		assert(pathlen <= UCHAR_MAX);
+
+		lenbyte = uds->uds_pathlen;
+		pos = uds_store(peer, pos, &lenbyte, 1);
+		pos = uds_store(peer, pos, uds->uds_path, pathlen - 1);
+	}
+
+	/* Lastly, copy in the actual data (if any) from the caller. */
+	if (datalen > 0) {
+		iov[0].iov_addr = (vir_bytes)&peer->uds_buf[pos];
+		left = UDS_BUF - pos;
+
+		if (left < datalen) {
+			assert(left > 0);
+			iov[0].iov_size = left;
+			iov[1].iov_addr = (vir_bytes)&peer->uds_buf[0];
+			iov[1].iov_size = datalen - left;
+			iovcnt = 2;
+		} else {
+			iov[0].iov_size = datalen;
+			iovcnt = 1;
+		}
+
+		if ((r = sockdriver_vcopyin(data, off, iov, iovcnt)) != OK)
+			return r;
+	}
+
+	*mergep = merge;
+	*datalenp = datalen;
+	*segflagsp = segflags;
+	return seglen;
+}
+
+/*
+ * Copy in control data for the current send request, and extract any file
+ * descriptors to be transferred.  Do not yet duplicate the file descriptors,
+ * but rather store a list in a temporary buffer: the send request may still
+ * fail in which case we want to avoid having to undo the duplication.
+ *
+ * On success, return the number of (zero or more) file descriptors extracted
+ * from the request and stored in the temporary buffer.  On failure, including
+ * malformed control data (EINVAL), return a negative error code.
+ */
+static int
+uds_send_ctl(const struct sockdriver_data * ctl, socklen_t ctl_len,
+	endpoint_t user_endpt)
+{
+	struct msghdr msghdr;
+	struct cmsghdr *cmsg;
+	socklen_t left;
+	unsigned int i, n, nfds;
+	int r;
+
+	/*
+	 * Copy in the control data.  We can spend a lot of effort copying in
+	 * the data in small chunks, and change the receiving side to do the
+	 * same, but it is really not worth it: applications never send a whole
+	 * lot of file descriptors at once, and the buffer size is currently
+	 * such that the UDS service itself will exhaust its OPEN_MAX limit
+	 * anyway if they do.
+	 */
+	if (ctl_len > sizeof(uds_ctlbuf))
+		return ENOBUFS;
+
+	if ((r = sockdriver_copyin(ctl, 0, uds_ctlbuf, ctl_len)) != OK)
+		return r;
+
+	if (ctl_len < sizeof(uds_ctlbuf))
+		memset(&uds_ctlbuf[ctl_len], 0, sizeof(uds_ctlbuf) - ctl_len);
+
+	/*
+	 * Look for any file descriptors, and store their remote file
+	 * descriptor numbers into a temporary array.
+	 */
+	memset(&msghdr, 0, sizeof(msghdr));
+	msghdr.msg_control = uds_ctlbuf;
+	msghdr.msg_controllen = ctl_len;
+
+	nfds = 0;
+	r = OK;
+
+	/*
+	 * The sender may provide file descriptors in multiple chunks.
+	 * Currently we do not preserve these chunk boundaries, instead
+	 * generating one single chunk with all file descriptors for the
+	 * segment upon receipt.  If needed, we can fairly easily adapt this
+	 * later.
+	 */
+	for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
+	    cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
+		/*
+		 * Check for bogus lengths.  There is no excuse for this;
+		 * either the caller does not know what they are doing or we
+		 * are looking at a hacking attempt.
+		 */
+		assert((socklen_t)((char *)cmsg - uds_ctlbuf) <= ctl_len);
+		left = ctl_len - (socklen_t)((char *)cmsg - uds_ctlbuf);
+		assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */
+
+		if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) {
+			printf("UDS: malformed control data from %u\n",
+			    user_endpt);
+			r = EINVAL;
+			break;
+		}
+
+		if (cmsg->cmsg_level != SOL_SOCKET ||
+		    cmsg->cmsg_type != SCM_RIGHTS)
+			continue;
+
+		n = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+		for (i = 0; i < n; i++) {
+			/*
+			 * Copy the file descriptor to the temporary buffer,
+			 * whose size is based on the control data buffer, so
+			 * it is always large enough to contain all FDs.
+			 */
+			assert(nfds < __arraycount(uds_ctlfds));
+
+			memcpy(&uds_ctlfds[nfds],
+			    &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
+
+			nfds++;
+		}
+	}
+
+	return (r != OK) ? r : (int)nfds;
+}
+
+/*
+ * Actually duplicate any file descriptors that we extracted from the sender's
+ * control data and stored in our temporary buffer.  On success, return OK,
+ * with all file descriptors stored in file descriptor objects that are
+ * appended to the socket's list of in-flight FD objects.  Thus, on success,
+ * the send request may no longer fail.  On failure, return a negative error
+ * code, with any partial duplication undone.
+ */
+static int
+uds_send_fds(struct udssock * peer, unsigned int nfds, endpoint_t user_endpt)
+{
+	SIMPLEQ_HEAD(, uds_fd) fds;
+	struct uds_fd *ufd;
+	unsigned int i;
+	int r;
+
+	SIMPLEQ_INIT(&fds);
+
+	for (i = 0; i < nfds; i++) {
+		if (SIMPLEQ_EMPTY(&uds_freefds)) {
+			/* UDS itself may already have OPEN_MAX FDs. */
+			r = ENFILE;
+			break;
+		}
+
+		/*
+		 * The caller may have given an invalid FD, or UDS itself may
+		 * unexpectedly have run out of available file descriptors etc.
+		 */
+		if ((r = copyfd(user_endpt, uds_ctlfds[i], COPYFD_FROM)) < 0)
+			break;
+
+		ufd = SIMPLEQ_FIRST(&uds_freefds);
+		SIMPLEQ_REMOVE_HEAD(&uds_freefds, ufd_next);
+
+		ufd->ufd_fd = r;
+		ufd->ufd_count = 0;
+
+		SIMPLEQ_INSERT_TAIL(&fds, ufd, ufd_next);
+
+		dprintf(("UDS: copied in fd %d -> %d\n", uds_ctlfds[i], r));
+	}
+
+	/* Did we experience an error while copying in the file descriptors? */
+	if (r < 0) {
+		/* Revert the successful copyfd() calls made so far. */
+		SIMPLEQ_FOREACH(ufd, &fds, ufd_next) {
+			dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
+
+			closenb(ufd->ufd_fd);
+		}
+
+		SIMPLEQ_CONCAT(&uds_freefds, &fds);
+
+		return r;
+	}
+
+	/*
+	 * Success.  Add the new file descriptor objects to the peer's list of
+	 * in-flight file descriptors.  Assign the number of file descriptors
+	 * copied in to the first file descriptor object, so that we know how
+	 * many to copy out (or discard) for this segment.  NOTE(review): this
+	 * dereferences the first object, so it appears to require nfds > 0.
+	 */
+	ufd = SIMPLEQ_FIRST(&fds);
+	ufd->ufd_count = nfds;
+
+	SIMPLEQ_CONCAT(&peer->uds_fds, &fds);
+
+	return OK;
+}
+
+/*
+ * The current send request is successful or at least has made progress.
+ * Commit the new segment of 'seglen' bytes or, if 'merge' is set, add 'seglen'
+ * and 'datalen' to the lengths in the last segment's header.  Also wake up the
+ * receiving side, because there will now be new data to receive.
+ */
+static void
+uds_send_advance(struct udssock * uds, struct udssock * peer, size_t datalen,
+	int merge, size_t seglen, unsigned int segflags)
+{
+	size_t pos, prevseglen, prevdatalen;
+
+	/*
+	 * For non-datagram sockets, credentials are sent only once after
+	 * setting the LOCAL_CREDS option.  After that, the option is unset.
+	 */
+	if ((segflags & UDS_HAS_CRED) && uds_get_type(uds) != SOCK_DGRAM)
+		peer->uds_flags &= ~UDSF_PASSCRED;
+
+	if (merge) {
+		assert(segflags == 0);
+
+		pos = uds_get_last(peer);
+
+		(void)uds_fetch_hdr(peer, pos, &prevseglen, &prevdatalen,
+		    &segflags);
+
+		peer->uds_len += seglen;
+		assert(peer->uds_len <= UDS_BUF);
+
+		seglen += prevseglen;
+		datalen += prevdatalen;
+		assert(seglen <= UDS_BUF);
+
+		uds_store_hdr(peer, pos, seglen, datalen, segflags);
+	} else {
+		peer->uds_last = peer->uds_len;
+
+		peer->uds_len += seglen;
+		assert(peer->uds_len <= UDS_BUF);
+	}
+
+	/* Now that there are new data, wake up the receiver side. */
+	sockevent_raise(&peer->uds_sock, SEV_RECV);
+}
+
+/*
+ * Process a send request.  Return OK if the send request has successfully
+ * completed, SUSPEND if it should be tried again later, or a negative error
+ * code on failure.  In all cases, the values of 'off' and 'ctl_off' must be
+ * updated if any progress has been made; if either is non-zero, libsockevent
+ * will return the partial progress rather than an error code.
+ */
+int
+uds_send(struct sock * sock, const struct sockdriver_data * data, size_t len,
+	size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
+	socklen_t * ctl_off, const struct sockaddr * addr, socklen_t addr_len,
+	endpoint_t user_endpt, int flags __unused, size_t min)
+{
+	struct udssock *uds = (struct udssock *)sock;
+	struct udssock *peer;
+	size_t seglen, datalen = 0 /*gcc*/;
+	unsigned int nfds, segflags = 0 /*gcc*/;
+	int r, partial, merge = 0 /*gcc*/;
+
+	dprintf(("UDS: send(%d,%zu,%zu,%u,%u,0x%x)\n",
+	    uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
+	    (ctl_off != NULL) ? *ctl_off : 0, flags));
+
+	partial = (off != NULL && *off > 0);
+
+	/*
+	 * First see whether we can process this send call at all right now.
+	 * Most importantly, for connected sockets, if the peer's receive
+	 * buffer is full, we may have to suspend the call until some space has
+	 * been freed up.
+	 */
+	if ((r = uds_send_test(uds, len, ctl_len, min, partial)) != OK)
+		return r;
+
+	/*
+	 * Then get the peer socket.  For connected sockets, this is trivial.
+	 * For unconnected sockets, it may involve a lookup of the given
+	 * address.
+	 */
+	if ((r = uds_send_peer(uds, addr, addr_len, user_endpt, &peer)) != OK)
+		return r;
+
+	/*
+	 * We now know for sure that we will not suspend this call without
+	 * making any progress.  However, the call may still fail.  Copy in
+	 * control data first now, so that we know whether there are any file
+	 * descriptors to transfer.  This aspect may determine whether or not
+	 * we can merge data with a previous segment.  Do not duplicate the
+	 * actual file descriptors yet, because that is much harder to undo
+	 * in case of a failure later on.
+	 */
+	if (ctl_len > 0) {
+		/* We process control data once, in full. */
+		assert(*ctl_off == 0);
+
+		if ((r = uds_send_ctl(ctl, ctl_len, user_endpt)) < 0)
+			return r;
+		nfds = (unsigned int)r;
+	} else
+		nfds = 0;
+
+	/*
+	 * Now generate a new segment, or (if possible) merge new data into the
+	 * last segment.  Since the call may still fail, prepare the segment
+	 * but do not update the buffer head yet.  Note that the segment
+	 * contains not just regular data (in fact it may contain no data at
+	 * all) but (also) certain ancillary data.
+	 */
+	if ((r = uds_send_data(uds, peer, data, len, *off, user_endpt, nfds,
+	    &merge, &datalen, &segflags)) <= 0)
+		return r;
+	seglen = (size_t)r;
+
+	/*
+	 * If we extracted any file descriptors from the control data earlier,
+	 * copy them over to ourselves now.  The resulting in-flight file
+	 * descriptors are stored in a separate data structure.  This is the
+	 * last point where the send call may actually fail.
+	 */
+	if (nfds > 0) {
+		if ((r = uds_send_fds(peer, nfds, user_endpt)) != OK)
+			return r;
+	}
+
+	/*
+	 * The transmission is now known to be (partially) successful.  Commit
+	 * the new work by moving the receive buffer head.
+	 */
+	uds_send_advance(uds, peer, datalen, merge, seglen, segflags);
+
+	/*
+	 * Register the result.  For stream-type sockets, the expected behavior
+	 * is that all data be sent, and so we may still have to suspend the
+	 * call after partial progress.  Otherwise, we are now done.  Either
+	 * way, we are done with the control data, so mark it as consumed.
+	 */
+	*off += datalen;
+	*ctl_off += ctl_len;
+	if (uds_get_type(uds) == SOCK_STREAM && datalen < len)
+		return SUSPEND;
+	else
+		return OK;
+}
+
+/*
+ * Test whether a send request would block.  The given 'min' parameter contains
+ * the minimum number of bytes that should be possible to send without blocking
+ * (the low send watermark).  Return SUSPEND if the send request would block,
+ * or any other result code (OK or an error code) if it would not.
+ */
+int
+uds_test_send(struct sock * sock, size_t min)
+{
+	struct udssock *uds = (struct udssock *)sock;
+
+	return uds_send_test(uds, min, 0, min, FALSE /*partial*/);
+}
+
+/*
+ * Perform initial checks on a receive request, before it may potentially be
+ * suspended.  Return OK if this receive request is valid, or a negative error
+ * code if it is not.
+ */
+int
+uds_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
+	int flags)
+{
+
+	/*
+	 * Reject calls with any flags other than the ones listed below.  TODO:
+	 * ensure that we should really reject them rather than ignore them.
+	 */
+	if ((flags & ~(MSG_PEEK | MSG_WAITALL | MSG_CMSG_CLOEXEC)) != 0)
+		return EOPNOTSUPP;
+
+	return OK;
+}
+
+/*
+ * Determine whether the (real or pretend) receive request should be processed
+ * now, suspended until later, or rejected based on the current socket state.
+ * Return OK if the receive request should be processed now, along with a first
+ * indication whether the call may still be suspended later in 'may_block'.
+ * Return SUSPEND if the receive request should be retried later.  Return an
+ * appropriate negative error code if the receive request should fail.
+ */
+static int
+uds_recv_test(struct udssock * uds, size_t len, size_t min, int partial,
+	int * may_block)
+{
+	size_t seglen, datalen;
+	unsigned int segflags;
+	int r;
+
+	/*
+	 * If there are any pending data, those should always be received
+	 * first.  However, if there is nothing to receive, then whether we
+	 * should suspend the receive call or fail immediately depends on other
+	 * conditions.  We first look at these other conditions.
+	 */
+	r = OK;
+
+	if (uds_get_type(uds) != SOCK_DGRAM) {
+		if (uds_is_connecting(uds))
+			r = SUSPEND;
+		else if (!uds_is_connected(uds) && !uds_is_disconnected(uds))
+			r = ENOTCONN;
+		else if (!uds_has_conn(uds) ||
+		    uds_is_shutdown(uds->uds_conn, SFL_SHUT_WR))
+			r = SOCKEVENT_EOF;
+	}
+
+	if (uds->uds_len == 0) {
+		/*
+		 * For stream-type sockets, we use the policy: if no regular
+		 * data is requested, then end the call without receiving
+		 * anything.  For packet-type sockets, the request should block
+		 * until there is a packet to discard, though.
+		 */
+		if (r != OK || (uds_get_type(uds) == SOCK_STREAM && len == 0))
+			return r;
+
+		return SUSPEND;
+	}
+
+	/*
+	 * For stream-type sockets, we should still suspend the call if fewer
+	 * than 'min' bytes are available right now, and there is a possibility
+	 * that more data may arrive later.  More may arrive later iff 'r' is
+	 * OK (i.e., no EOF or error will follow) and, in case we already
+	 * received some partial results, there is not already a next segment
+	 * with ancillary data (i.e., nonzero segment flags), or in any case
+	 * there isn't more than one segment in the buffer.  Limit 'min' to the
+	 * maximum that can ever be received, though.  Since that is difficult
+	 * in our case, we check whether the buffer is entirely full instead.
+	 */
+	if (r == OK && uds_get_type(uds) == SOCK_STREAM && min > 0 &&
+	    uds->uds_len < UDS_BUF) {
+		assert(uds->uds_len >= UDS_HDRLEN);
+
+		(void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, &datalen,
+		    &segflags);
+
+		if (datalen < min && seglen == uds->uds_len &&
+		    (!partial || segflags == 0))
+			return SUSPEND;
+	}
+
+	/*
+	 * Also start the decision process as to whether we should suspend the
+	 * current call if MSG_WAITALL is given.  Unfortunately there is no one
+	 * place where we can conveniently do all the required checks.
+	 */
+	if (may_block != NULL)
+		*may_block = (r == OK && uds_get_type(uds) == SOCK_STREAM);
+	return OK;
+}
+
+/*
+ * Receive regular data, and possibly the source path, from the tail segment in
+ * the receive buffer.  On success, return the positive non-zero length of the
+ * tail segment, with 'addr' and 'addr_len' modified to store the source
+ * address if applicable, the result flags in 'rflags' updated as appropriate,
+ * the tail segment's data length stored in 'datalen', the number of received
+ * regular data bytes stored in 'reslen', the segment flags stored in
+ * 'segflags', and the absolute receive buffer position of the credentials in
+ * the segment stored in 'credpos' if applicable.  Since the receive call may
+ * still fail, this function must not yet update the tail or any other aspect
+ * of the receive buffer.  Return zero if the current receive call was already
+ * partially successful (due to MSG_WAITALL) and can no longer make progress,
+ * and thus should be ended.  Return a negative error code on failure.
+ */
+static int
+uds_recv_data(struct udssock * uds, const struct sockdriver_data * data,
+	size_t len, size_t off, struct sockaddr * addr, socklen_t * addr_len,
+	int * __restrict rflags, size_t * __restrict datalen,
+	size_t * __restrict reslen, unsigned int * __restrict segflags,
+	size_t * __restrict credpos)
+{
+	iovec_t iov[2];
+	unsigned char lenbyte;
+	unsigned int iovcnt;
+	size_t pos, seglen, left;
+	int r;
+
+	pos = uds_fetch_hdr(uds, uds->uds_tail, &seglen, datalen, segflags);
+
+	/*
+	 * If a partially completed receive now runs into a segment that cannot
+	 * be logically merged with the previous one (because it has at least
+	 * one segment flag set, meaning it has ancillary data), then we must
+	 * shortcut the receive now.
+	 */
+	if (off != 0 && *segflags != 0)
+		return OK;
+
+	/*
+	 * As stated, for stream-type sockets, we choose to ignore zero-size
+	 * receive calls.  This has the consequence that reading a zero-sized
+	 * segment (with ancillary data) requires a receive request for at
+	 * least one regular data byte.  Such a receive call would then return
+	 * zero.  The problem with handling zero-data receive requests is that
+	 * we need to know whether the current segment is terminated (i.e., no
+	 * more data can possibly be merged into it later), which is a test
+	 * that we rather not perform, not in the least because we do not know
+	 * whether there is an error pending on the socket.
+	 *
+	 * For datagrams, we currently allow a zero-size receive call to
+	 * discard the next datagram.
+	 *
+	 * TODO: compare this against policies on other platforms.
+	 */
+	if (len == 0 && uds_get_type(uds) == SOCK_STREAM)
+		return OK;
+
+	/*
+	 * We have to skip the credentials for now: these are copied out as
+	 * control data, and thus will (well, may) be looked at when dealing
+	 * with the control data.  For the same reason, we do not even look at
+	 * UDS_HAS_FDS here.
+	 */
+	if (*segflags & UDS_HAS_CRED) {
+		*credpos = pos;
+
+		pos = uds_fetch(uds, pos, &lenbyte, 1);
+		pos = uds_advance(pos, (size_t)lenbyte);
+	}
+
+	/*
+	 * Copy out the source address, but only if the (datagram) socket is
+	 * not connected.  TODO: even when it is connected, it may still
+	 * receive packets sent to it from other sockets *before* being
+	 * connected, and the receiver has no way of knowing that those packets
+	 * did not come from its new peer.  Ideally, the older packets should
+	 * be dropped..
+	 */
+	if (*segflags & UDS_HAS_PATH) {
+		pos = uds_fetch(uds, pos, &lenbyte, 1);
+
+		if (uds_get_type(uds) == SOCK_DGRAM && !uds_has_link(uds))
+			uds_make_addr((const char *)&uds->uds_buf[pos],
+			    (size_t)lenbyte, addr, addr_len);
+
+		pos = uds_advance(pos, (size_t)lenbyte);
+	}
+
+	/*
+	 * We can receive no more data than those that are present in the
+	 * segment, obviously.  For stream-type sockets, any more data that
+	 * could have been received along with the current data would have been
+	 * merged in the current segment, so we need not search for any next
+	 * segments.
+	 *
+	 * For non-stream sockets, the caller may receive less than a whole
+	 * packet if it supplied a small buffer.  In that case, the rest of the
+	 * packet will be discarded (but not here yet!) and the caller gets
+	 * the MSG_TRUNC flag in its result, if it was using recvmsg(2) anyway.
+	 */
+	if (len > *datalen)
+		len = *datalen;
+	else if (len < *datalen && uds_get_type(uds) != SOCK_STREAM)
+		*rflags |= MSG_TRUNC;
+
+	/* Copy out the data to the caller. */
+	if (len > 0) {
+		iov[0].iov_addr = (vir_bytes)&uds->uds_buf[pos];
+		left = UDS_BUF - pos;
+
+		if (left < len) {
+			iov[0].iov_size = left;
+			iov[1].iov_addr = (vir_bytes)&uds->uds_buf[0];
+			iov[1].iov_size = len - left;
+			iovcnt = 2;
+		} else {
+			iov[0].iov_size = len;
+			iovcnt = 1;
+		}
+
+		if ((r = sockdriver_vcopyout(data, off, iov, iovcnt)) != OK)
+			return r;
+	}
+
+	*reslen = len;
+	assert(seglen > 0 && seglen <= INT_MAX);
+	return (int)seglen;
+}
+
+/*
+ * The current segment has associated file descriptors.  If possible, copy out
+ * all file descriptors to the receiver, along with a chunk of control data
+ * that contains their file descriptor numbers, and return the chunk's aligned
+ * size.  If not all file descriptors fit in the receiver's buffer, copy out
+ * nothing and return zero.  On any error, undo and return a negative error.
+ */
+static int
+uds_recv_fds(struct udssock * uds, const struct sockdriver_data * ctl,
+	socklen_t ctl_len, socklen_t ctl_off, endpoint_t user_endpt, int flags)
+{
+	struct msghdr msghdr;
+	struct cmsghdr *cmsg;
+	struct uds_fd *ufd;
+	unsigned int i, nfds;
+	socklen_t chunklen, chunkspace;
+	int r, fd, what;
+
+	/* See how many file descriptors should be part of this chunk. */
+	assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
+	ufd = SIMPLEQ_FIRST(&uds->uds_fds);
+	nfds = ufd->ufd_count;
+	assert(nfds > 0);
+
+	/*
+	 * We produce and copy out potentially unaligned chunks, using
+	 * CMSG_LEN, but return the aligned size at the end, using CMSG_SPACE.
+	 * This may leave "gap" bytes unchanged in userland, but that should
+	 * not be a problem.  By producing unaligned chunks, we eliminate a
+	 * potential boundary case where the unaligned chunk passed in (by the
+	 * sender) no longer fits in the same buffer after being aligned here.
+	 */
+	chunklen = CMSG_LEN(sizeof(int) * nfds);
+	chunkspace = CMSG_SPACE(sizeof(int) * nfds);
+	assert(chunklen <= sizeof(uds_ctlbuf));
+	if (chunklen > ctl_len)
+		return 0; /* chunk would not fit, so produce nothing instead */
+	if (chunkspace > ctl_len)
+		chunkspace = ctl_len;
+
+	memset(&msghdr, 0, sizeof(msghdr));
+	msghdr.msg_control = uds_ctlbuf;
+	msghdr.msg_controllen = sizeof(uds_ctlbuf);
+
+	memset(uds_ctlbuf, 0, chunklen);
+	cmsg = CMSG_FIRSTHDR(&msghdr);
+	cmsg->cmsg_len = chunklen;
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+
+	/*
+	 * Copy the group's local file descriptors to the target endpoint, and
+	 * store the resulting remote file descriptors in the chunk buffer.
+	 */
+	r = OK;
+
+	for (i = 0; i < nfds; i++) {
+		assert(ufd != SIMPLEQ_END(&uds->uds_fds));
+		assert(i == 0 || ufd->ufd_count == 0);
+
+		what = COPYFD_TO;
+		if (flags & MSG_CMSG_CLOEXEC)
+			what |= COPYFD_CLOEXEC;
+
+		/* Failure may happen legitimately here (e.g., EMFILE). */
+		if ((r = copyfd(user_endpt, ufd->ufd_fd, what)) < 0)
+			break; /* we keep our progress so far in 'i' */
+
+		fd = r;
+
+		dprintf(("UDS: copied out fd %d -> %d\n", ufd->ufd_fd, fd));
+
+		memcpy(&((int *)CMSG_DATA(cmsg))[i], &fd, sizeof(int));
+
+		ufd = SIMPLEQ_NEXT(ufd, ufd_next);
+	}
+
+	/* If everything went well so far, copy out the produced chunk. */
+	if (r >= 0)
+		r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen);
+
+	/*
+	 * Handle errors.  At this point, the 'i' variable contains the number
+	 * of file descriptors that have already been successfully copied out.
+	 */
+	if (r < 0) {
+		/* Revert the successful copyfd() calls made so far. */
+		while (i-- > 0) {
+			memcpy(&fd, &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
+
+			(void)copyfd(user_endpt, fd, COPYFD_CLOSE);
+		}
+
+		return r;
+	}
+
+	/*
+	 * Success.  Return the aligned size of the produced chunk, if the
+	 * given length permits it.  From here on, the receive call may no
+	 * longer fail, as that would result in lost file descriptors.
+	 */
+	return chunkspace;
+}
+
+/*
+ * Generate and copy out a chunk of control data with the sender's credentials.
+ * Return the aligned chunk size on success, zero if the chunk does not fit in
+ * the receiver's control buffer, or a negative error code on failure.
+ */
+static int
+uds_recv_cred(struct udssock * uds, const struct sockdriver_data * ctl,
+	socklen_t ctl_len, socklen_t ctl_off, size_t credpos)
+{
+	struct msghdr msghdr;
+	struct cmsghdr *cmsg;
+	socklen_t chunklen, chunkspace;
+	unsigned char lenbyte;
+	size_t credlen;
+	int r;
+
+	/*
+	 * Since the sender side already did the hard work of producing the
+	 * (variable-size) sockcred structure as it should be received, there
+	 * is relatively little work to be done here.
+	 */
+	credpos = uds_fetch(uds, credpos, &lenbyte, 1);
+	credlen = (size_t)lenbyte;
+
+	chunklen = CMSG_LEN(credlen);
+	chunkspace = CMSG_SPACE(credlen);
+	assert(chunklen <= sizeof(uds_ctlbuf));
+	if (chunklen > ctl_len)
+		return 0; /* chunk would not fit, so produce nothing instead */
+	if (chunkspace > ctl_len)
+		chunkspace = ctl_len;
+
+	memset(&msghdr, 0, sizeof(msghdr));
+	msghdr.msg_control = uds_ctlbuf;
+	msghdr.msg_controllen = sizeof(uds_ctlbuf);
+
+	memset(uds_ctlbuf, 0, chunklen);
+	cmsg = CMSG_FIRSTHDR(&msghdr);
+	cmsg->cmsg_len = chunklen;
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_CREDS;
+
+	uds_fetch(uds, credpos, CMSG_DATA(cmsg), credlen);
+
+	if ((r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen)) != OK)
+		return r;
+
+	return chunkspace;
+}
+
+/*
+ * Copy out control data for the ancillary data associated with the current
+ * segment, if any, advancing 'ctl_off' past each chunk copied out.  Return OK
+ * on success, at which point the current receive call may no longer fail;
+ * 'rflags' may gain MSG_CTRUNC.  Return a negative error code on failure.
+ */
+static int
+uds_recv_ctl(struct udssock * uds, const struct sockdriver_data * ctl,
+	socklen_t ctl_len, socklen_t * ctl_off, endpoint_t user_endpt,
+	int flags, unsigned int segflags, size_t credpos, int * rflags)
+{
+	int r;
+
+	/*
+	 * We first copy out all file descriptors, if any.  We put them in one
+	 * SCM_RIGHTS chunk, even if the sender put them in separate SCM_RIGHTS
+	 * chunks.  We believe that this should not cause application-level
+	 * issues, but if it does, we can change that later with some effort.
+	 * We then copy out credentials, if any.
+	 *
+	 * We copy out each control chunk independently of the others, and also
+	 * perform error recovery on a per-chunk basis.  This implies the
+	 * following.  If producing or copying out the first chunk fails, the
+	 * entire recvmsg(2) call will fail with an appropriate error.  If
+	 * producing or copying out any subsequent chunk fails, the recvmsg(2)
+	 * call will still return the previously generated chunks (a "short
+	 * control read" if you will) as well as the MSG_CTRUNC flag.  This
+	 * approach is simple and clean, and it guarantees that we can always
+	 * copy out at least as many file descriptors as we copied in for this
+	 * segment, even if credentials are present as well.  However, the
+	 * approach does cause slightly more overhead when there are multiple
+	 * chunks per call, as those are copied out separately.
+	 *
+	 * Since the generated SCM_RIGHTS chunk is never larger than the
+	 * originally received SCM_RIGHTS chunk, the temporary "uds_ctlbuf"
+	 * buffer is always large enough to contain the chunk in its entirety.
+	 * SCM_CREDS chunks should always fit easily as well.
+	 *
+	 * The MSG_CTRUNC flag will be returned iff not the entire user-given
+	 * control buffer was filled and not all control chunks were delivered.
+	 * Our current implementation does not deliver partial chunks.  NetBSD
+	 * does, except for SCM_RIGHTS chunks.
+	 *
+	 * TODO: get rid of the redundancy in processing return values.
+	 */
+	if (segflags & UDS_HAS_FDS) {
+		r = uds_recv_fds(uds, ctl, ctl_len, *ctl_off, user_endpt,
+		    flags);
+
+		/*
+		 * At this point, 'r' contains one of the following:
+		 *
+		 *   r > 0	a chunk of 'r' bytes was added successfully.
+		 *   r == 0	not enough space left; the chunk was not added.
+		 *   r < 0	an error occurred; the chunk was not added.
+		 */
+		if (r < 0 && *ctl_off == 0)
+			return r;
+
+		if (r > 0) {
+			ctl_len -= r;
+			*ctl_off += r;
+		} else
+			*rflags |= MSG_CTRUNC;
+	}
+
+	if (segflags & UDS_HAS_CRED) {
+		r = uds_recv_cred(uds, ctl, ctl_len, *ctl_off, credpos);
+
+		/* As above. */
+		if (r < 0 && *ctl_off == 0)
+			return r;
+
+		if (r > 0) {
+			ctl_len -= r;
+			*ctl_off += r;
+		} else
+			*rflags |= MSG_CTRUNC;
+	}
+
+	return OK;
+}
+
+/*
+ * The current receive request is successful or, in the case of MSG_WAITALL,
+ * has made progress.  Advance the receive buffer tail, either by discarding
+ * the entire tail segment or by generating a new, smaller tail segment that
+ * contains only the regular data left to be received from the original tail
+ * segment.  Also wake up the sending side for connection-oriented sockets if
+ * applicable, because there may now be room for more data to be sent.  Update
+ * 'may_block' if we are now sure that the call may not block on MSG_WAITALL
+ * after all.
+ */
+static void
+uds_recv_advance(struct udssock * uds, size_t seglen, size_t datalen,
+	size_t reslen, unsigned int segflags, int * may_block)
+{
+	struct udssock *conn;
+	struct uds_fd *ufd;
+	size_t delta, nseglen, advance;
+	unsigned int nfds;
+
+	/* Note that 'reslen' may be legitimately zero. */
+	assert(reslen <= datalen);
+
+	if (uds_get_type(uds) != SOCK_STREAM && reslen < datalen)
+		reslen = datalen;	/* non-stream: consume whole packet */
+
+	delta = datalen - reslen;
+
+	if (delta == 0) {
+		/*
+		 * Fully consume the tail segment.  We advance the tail by the
+		 * full segment length, thus moving up to either the next
+		 * segment in the receive buffer, or an empty receive buffer.
+		 */
+		advance = seglen;
+
+		uds->uds_tail = uds_advance(uds->uds_tail, advance);
+	} else {
+		/*
+		 * Partially consume the tail segment.  We put a new segment
+		 * header right in front of the remaining data, which obviously
+		 * always fits.  Since any ancillary data was consumed along
+		 * with the first data byte of the segment, the new segment has
+		 * no ancillary data anymore (and thus a zero flags field).
+		 */
+		nseglen = UDS_HDRLEN + delta;
+		assert(nseglen < seglen);
+
+		advance = seglen - nseglen;
+
+		uds->uds_tail = uds_advance(uds->uds_tail, advance);
+
+		uds_store_hdr(uds, uds->uds_tail, nseglen, delta, 0);
+	}
+
+	/*
+	 * For datagram-oriented sockets, we always consume at least a header.
+	 * For stream-type sockets, we either consume a zero-data segment along
+	 * with its ancillary data, or we consume at least one byte from a
+	 * segment that does have regular data.  In all other cases, the
+	 * receive call has already been ended by now.  Thus, we always advance
+	 * the tail of the receive buffer here.
+	 */
+	assert(advance > 0);
+
+	/*
+	 * The receive buffer's used length (uds_len) and pointer to the
+	 * previous segment header (uds_last) are offsets from the tail.  Now
+	 * that we have moved the tail, we need to adjust these accordingly.
+	 * If the buffer is now empty, reset the tail to the buffer start so as
+	 * to avoid splitting inter-process copies whenever possible.
+	 */
+	assert(uds->uds_len >= advance);
+	uds->uds_len -= advance;
+
+	if (uds->uds_len == 0)
+		uds->uds_tail = 0;
+
+	/*
+	 * If uds_last is zero here, it was pointing to the segment we just
+	 * (partially) consumed.  By leaving it zero, it will still point to
+	 * the new or next segment.
+	 */
+	if (uds->uds_last > 0) {
+		assert(uds->uds_len > 0);
+		assert(uds->uds_last >= advance);
+		uds->uds_last -= advance;
+	}
+
+	/*
+	 * If there were any file descriptors associated with this segment,
+	 * close and free them now.
+	 */
+	if (segflags & UDS_HAS_FDS) {
+		assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
+		ufd = SIMPLEQ_FIRST(&uds->uds_fds);
+		nfds = ufd->ufd_count;
+		assert(nfds > 0);
+
+		while (nfds-- > 0) {
+			assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
+			ufd = SIMPLEQ_FIRST(&uds->uds_fds);
+			SIMPLEQ_REMOVE_HEAD(&uds->uds_fds, ufd_next);
+
+			dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
+
+			closenb(ufd->ufd_fd);
+
+			SIMPLEQ_INSERT_TAIL(&uds_freefds, ufd, ufd_next);
+		}
+	}
+
+	/*
+	 * If there is now any data left in the receive buffer, then there has
+	 * been a reason that we haven't received it.  For stream sockets, that
+	 * reason is that the next segment has ancillary data.  In any case,
+	 * this means we should never block the current receive operation
+	 * waiting for more data.  Otherwise, we may block on MSG_WAITALL.
+	 */
+	if (uds->uds_len > 0)
+		*may_block = FALSE;
+
+	/*
+	 * If the (non-datagram) socket has a peer that is not shut down for
+	 * writing, see if it can be woken up to send more data.  Note that
+	 * the event will never be processed immediately.
+	 */
+	if (uds_is_connected(uds)) {
+		assert(uds_get_type(uds) != SOCK_DGRAM);
+
+		conn = uds->uds_conn;
+
+		if (!uds_is_shutdown(conn, SFL_SHUT_WR))
+			sockevent_raise(&conn->uds_sock, SEV_SEND);
+	}
+}
+
+/*
+ * Process a receive request.  Return OK if the receive request has completed
+ * successfully, SUSPEND if it should be tried again later, SOCKEVENT_EOF if an
+ * end-of-file condition is reached, or a negative error code on failure.  In
+ * all cases, the values of 'off' and 'ctl_off' must be updated if any progress
+ * has been made; if either is non-zero, libsockevent will return the partial
+ * progress rather than an error code or EOF.
+ */
+int
+uds_recv(struct sock * sock, const struct sockdriver_data * data, size_t len,
+	size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
+	socklen_t * ctl_off, struct sockaddr * addr, socklen_t * addr_len,
+	endpoint_t user_endpt, int flags, size_t min, int * rflags)
+{
+	struct udssock *uds = (struct udssock *)sock;
+	size_t seglen, datalen, reslen = 0 /*gcc*/, credpos = 0 /*gcc*/;
+	unsigned int segflags;
+	int r, partial, may_block;
+
+	dprintf(("UDS: recv(%d,%zu,%zu,%u,%u,0x%x)\n",
+	    uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
+	    (ctl_off != NULL) ? *ctl_off : 0, flags));
+
+	/*
+	 * Start by testing whether anything can be received at all, or whether
+	 * an error or EOF should be returned instead, or whether the receive
+	 * call should be suspended until later otherwise.  If no (regular or
+	 * control) data can be received, or if this was a test for select,
+	 * we bail out right after.
+	 */
+	partial = (off != NULL && *off > 0);
+
+	if ((r = uds_recv_test(uds, len, min, partial, &may_block)) != OK)
+		return r;
+
+	/*
+	 * Copy out regular data, if any.  Do this before copying out control
+	 * data, because the latter is harder to undo on failure.  This data
+	 * copy function returns OK (0) if we are to return a result of
+	 * zero bytes (which is *not* EOF) to the caller without doing anything
+	 * else.  The function returns a nonzero positive segment length if we
+	 * should carry on with the receive call (as it happens, all its other
+	 * returned values may in fact be zero).
+	 */
+	if ((r = uds_recv_data(uds, data, len, *off, addr, addr_len, rflags,
+	    &datalen, &reslen, &segflags, &credpos)) <= 0)
+		return r;
+	seglen = (size_t)r;
+
+	/*
+	 * Copy out control data, if any: transfer and copy out records of file
+	 * descriptors, and/or copy out sender credentials.  This is the last
+	 * part of the call that may fail.
+	 */
+	if ((r = uds_recv_ctl(uds, ctl, ctl_len, ctl_off, user_endpt, flags,
+	    segflags, credpos, rflags)) != OK)
+		return r;
+
+	/*
+	 * Now that the call has succeeded, move the tail of the receive
+	 * buffer, unless we were merely peeking.
+	 */
+	if (!(flags & MSG_PEEK))
+		uds_recv_advance(uds, seglen, datalen, reslen, segflags,
+		    &may_block);
+	else
+		may_block = FALSE;
+
+	/*
+	 * If the MSG_WAITALL flag was given, we may still have to suspend the
+	 * call after partial success.  In particular, the receive call may
+	 * suspend after partial success if all of these conditions are met:
+	 *
+	 *   1) the socket is a stream-type socket;
+	 *   2) MSG_WAITALL is set;
+	 *   3) MSG_PEEK is not set;
+	 *   4) MSG_DONTWAIT is not set (tested upon return);
+	 *   5) the socket must not have a pending error (tested upon return);
+	 *   6) the socket must not be shut down for reading (tested later);
+	 *   7) the socket must still be connected to a peer (no EOF);
+	 *   8) the peer must not have been shut down for writing (no EOF);
+	 *   9) the next segment, if any, contains no ancillary data.
+	 *
+	 * Together, these points guarantee that the call could conceivably
+	 * receive more after being resumed.  Points 4 to 6 are covered by
+	 * libsockevent, which will end the call even if we return SUSPEND
+	 * here.  Due to segment merging, we cover point 9 by checking that
+	 * there is currently no next segment at all.  Once a new segment
+	 * arrives, the ancillary-data test is done then.
+	 */
+	*off += reslen;
+	if ((flags & MSG_WAITALL) && reslen < len && may_block)
+		return SUSPEND;
+	else
+		return OK;
+}
+
+/*
+ * Test whether a receive request would block.  The given 'min' parameter
+ * contains the minimum number of bytes that should be possible to receive
+ * without blocking (the low receive watermark).  Return SUSPEND if the
+ * receive request would block.  Otherwise, return any other error code
+ * (including OK or SOCKEVENT_EOF), and if 'size' is not a NULL pointer, it
+ * should be filled with the number of bytes available for receipt right now
+ * (if not zero).  Note that if 'size' is not NULL, 'min' will always be zero.
+ */
+int
+uds_test_recv(struct sock * sock, size_t min, size_t * size)
+{
+	struct udssock *uds = (struct udssock *)sock;
+	size_t seglen;
+	unsigned int segflags;
+	int r;
+
+	if ((r = uds_recv_test(uds, min, min, FALSE /*partial*/,
+	    NULL /*may_block*/)) == SUSPEND)
+		return r;
+
+	if (size != NULL && uds->uds_len > 0)
+		(void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, size,
+		    &segflags);
+
+	return r;
+}
diff --git a/minix/net/uds/ioc_uds.c b/minix/net/uds/ioc_uds.c
deleted file mode 100644
index 8271f4377..000000000
--- a/minix/net/uds/ioc_uds.c
+++ /dev/null
@@ -1,1114 +0,0 @@
-/*
- * Unix Domain Sockets Implementation (PF_UNIX, PF_LOCAL)
- * This code handles ioctl(2) commands to implement the socket API.
- * Some helper functions are also present.
- */
-
-#include "uds.h"
-
-static int
-perform_connection(devminor_t minorx, devminor_t minory,
-	struct sockaddr_un *addr)
-{
-	/*
-	 * There are several places were a connection is established, the
-	 * initiating call being one of accept(2), connect(2), socketpair(2).
-	 */
-	dprintf(("UDS: perform_connection(%d, %d)\n", minorx, minory));
-
-	/*
-	 * Only connection-oriented types are acceptable and only equal
-	 * types can connect to each other.
-	 */
-	if ((uds_fd_table[minorx].type != SOCK_SEQPACKET &&
-	    uds_fd_table[minorx].type != SOCK_STREAM) ||
-	    uds_fd_table[minorx].type != uds_fd_table[minory].type)
-		return EINVAL;
-
-	/* Connect the pair of sockets. */
-	uds_fd_table[minorx].peer = minory;
-	uds_fd_table[minory].peer = minorx;
-
-	/* Set the address of both sockets */
-	memcpy(&uds_fd_table[minorx].addr, addr, sizeof(struct sockaddr_un));
-	memcpy(&uds_fd_table[minory].addr, addr, sizeof(struct sockaddr_un));
-
-	return OK;
-}
-
-static int
-do_accept(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	devminor_t minorparent; /* minor number of parent (server) */
-	devminor_t minorpeer;
-	int rc, i;
-	struct sockaddr_un addr;
-
-	dprintf(("UDS: do_accept(%d)\n", minor));
-
-	/*
-	 * Somewhat weird logic is used in this function, so here's an
-	 * overview... The minor number is the server's client socket
-	 * (the socket to be returned by accept()). The data waiting
-	 * for us in the IO Grant is the address that the server is
-	 * listening on. This function uses the address to find the
-	 * server's descriptor. From there we can perform the
-	 * connection or suspend and wait for a connect().
-	 */
-
-	/* This IOCTL must be called on a 'fresh' socket. */
-	if (uds_fd_table[minor].type != -1)
-		return EINVAL;
-
-	/* Get the server's address */
-	if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr,
-	    sizeof(struct sockaddr_un))) != OK)
-		return rc;
-
-	/* Locate the server socket. */
-	for (i = 0; i < NR_FDS; i++) {
-		if (uds_fd_table[i].stale == FALSE &&
-		    uds_fd_table[i].listening == TRUE &&
-		    uds_fd_table[i].addr.sun_family == AF_UNIX &&
-		    !strncmp(addr.sun_path, uds_fd_table[i].addr.sun_path,
-		    sizeof(uds_fd_table[i].addr.sun_path)))
-			break;
-	}
-
-	if (i == NR_FDS)
-		return EINVAL;
-
-	minorparent = i; /* parent */
-
-	/* We are the parent's child. */
-	uds_fd_table[minorparent].child = minor;
-
-	/*
-	 * The peer has the same type as the parent. we need to be that
-	 * type too.
-	 */
-	uds_fd_table[minor].type = uds_fd_table[minorparent].type;
-
-	/* Locate the peer to accept in the parent's backlog. */
-	minorpeer = -1;
-	for (i = 0; i < uds_fd_table[minorparent].backlog_size; i++) {
-		if (uds_fd_table[minorparent].backlog[i] != -1) {
-			minorpeer = uds_fd_table[minorparent].backlog[i];
-			uds_fd_table[minorparent].backlog[i] = -1;
-			break;
-		}
-	}
-
-	if (minorpeer == -1) {
-		dprintf(("UDS: do_accept(%d): suspend\n", minor));
-
-		/*
-		 * There are no peers in the backlog, suspend and wait for one
-		 * to show up.
-		 */
-		uds_fd_table[minor].suspended = UDS_SUSPENDED_ACCEPT;
-
-		return EDONTREPLY;
-	}
-
-	dprintf(("UDS: connecting %d to %d -- parent is %d\n", minor,
-	    minorpeer, minorparent));
-
-	if ((rc = perform_connection(minor, minorpeer, &addr)) != OK) {
-		dprintf(("UDS: do_accept(%d): connection failed\n", minor));
-
-		return rc;
-	}
-
-	uds_fd_table[minorparent].child = -1;
-
-	/* If the peer is blocked on connect() or write(), revive the peer. */
-	if (uds_fd_table[minorpeer].suspended == UDS_SUSPENDED_CONNECT ||
-	    uds_fd_table[minorpeer].suspended == UDS_SUSPENDED_WRITE) {
-		dprintf(("UDS: do_accept(%d): revive %d\n", minor, minorpeer));
-		uds_unsuspend(minorpeer);
-	}
-
-	/* See if we can satisfy an ongoing select. */
-	if ((uds_fd_table[minorpeer].sel_ops & CDEV_OP_WR) &&
-	    uds_fd_table[minorpeer].size < UDS_BUF) {
-		/* A write on the peer is possible now. */
-		chardriver_reply_select(uds_fd_table[minorpeer].sel_endpt,
-		    minorpeer, CDEV_OP_WR);
-		uds_fd_table[minorpeer].sel_ops &= ~CDEV_OP_WR;
-	}
-
-	return OK;
-}
-
-static int
-do_connect(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int child, peer;
-	struct sockaddr_un addr;
-	int rc, i, j;
-	dev_t dev;
-	ino_t ino;
-
-	dprintf(("UDS: do_connect(%d)\n", minor));
-
-	/* Only connection oriented sockets can connect. */
-	if (uds_fd_table[minor].type != SOCK_STREAM &&
-	    uds_fd_table[minor].type != SOCK_SEQPACKET)
-		return EINVAL;
-
-	/* The socket must not be connecting or connected already. */
-	peer = uds_fd_table[minor].peer;
-	if (peer != -1) {
-		if (uds_fd_table[peer].peer == -1)
-			return EALREADY;	/* connecting */
-		else
-			return EISCONN;		/* connected */
-	}
-
-	if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr,
-	    sizeof(struct sockaddr_un))) != OK)
-		return rc;
-
-	if ((rc = socketpath(uds_fd_table[minor].owner, addr.sun_path,
-	    sizeof(addr.sun_path), SPATH_CHECK, &dev, &ino)) != OK)
-		return rc;
-
-	/*
-	 * Look for a socket of the same type that is listening on the
-	 * address we want to connect to.
-	 */
-	for (i = 0; i < NR_FDS; i++) {
-		if (uds_fd_table[minor].type != uds_fd_table[i].type)
-			continue;
-		if (uds_fd_table[i].listening == FALSE)
-			continue;
-		if (uds_fd_table[i].stale == TRUE)
-			continue;
-		if (uds_fd_table[i].addr.sun_family != AF_UNIX)
-			continue;
-		if (strncmp(addr.sun_path, uds_fd_table[i].addr.sun_path,
-		    sizeof(uds_fd_table[i].addr.sun_path)))
-			continue;
-
-		/* Found a matching socket. */
-		break;
-	}
-
-	if (i == NR_FDS)
-		return ECONNREFUSED;
-
-	/* If the server is blocked on an accept, perform the connection. */
-	if ((child = uds_fd_table[i].child) != -1) {
-		rc = perform_connection(minor, child, &addr);
-
-		if (rc != OK)
-			return rc;
-
-		uds_fd_table[i].child = -1;
-
-		dprintf(("UDS: do_connect(%d): revive %d\n", minor, child));
-
-		/* Wake up the accepting party. */
-		uds_unsuspend(child);
-
-		return OK;
-	}
-
-	dprintf(("UDS: adding %d to %d's backlog\n", minor, i));
-
-	/* Look for a free slot in the backlog. */
-	rc = -1;
-	for (j = 0; j < uds_fd_table[i].backlog_size; j++) {
-		if (uds_fd_table[i].backlog[j] == -1) {
-			uds_fd_table[i].backlog[j] = minor;
-
-			rc = 0;
-			break;
-		}
-	}
-
-	if (rc == -1)
-		return ECONNREFUSED;	/* backlog is full */
-
-	/* See if the server is blocked on select(). */
-	if (uds_fd_table[i].sel_ops & CDEV_OP_RD) {
-		/* Satisfy a read-type select on the server. */
-		chardriver_reply_select(uds_fd_table[i].sel_endpt, i,
-		    CDEV_OP_RD);
-
-		uds_fd_table[i].sel_ops &= ~CDEV_OP_RD;
-	}
-
-	/* We found our server. */
-	uds_fd_table[minor].peer = i;
-
-	memcpy(&uds_fd_table[minor].addr, &addr, sizeof(struct sockaddr_un));
-
-	dprintf(("UDS: do_connect(%d): suspend\n", minor));
-
-	/* Suspend until the server side accepts the connection. */
-	uds_fd_table[minor].suspended = UDS_SUSPENDED_CONNECT;
-
-	return EDONTREPLY;
-}
-
-static int
-do_listen(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int rc;
-	int backlog_size;
-
-	dprintf(("UDS: do_listen(%d)\n", minor));
-
-	/* Ensure the socket has a type and is bound. */
-	if (uds_fd_table[minor].type == -1 ||
-	    uds_fd_table[minor].addr.sun_family != AF_UNIX)
-		return EINVAL;
-
-	/* listen(2) supports only two socket types. */
-	if (uds_fd_table[minor].type != SOCK_STREAM &&
-	    uds_fd_table[minor].type != SOCK_SEQPACKET)
-		return EOPNOTSUPP;
-
-	/*
-	 * The POSIX standard doesn't say what to do if listen() has
-	 * already been called.  Well, there isn't an errno.  We silently
-	 * let it happen, but if listen() has already been called, we
-	 * don't allow the backlog to shrink.
-	 */
-	if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &backlog_size,
-	    sizeof(backlog_size))) != OK)
-		return rc;
-
-	if (uds_fd_table[minor].listening == FALSE) {
-		/* Set the backlog size to a reasonable value. */
-		if (backlog_size <= 0 || backlog_size > UDS_SOMAXCONN)
-			backlog_size = UDS_SOMAXCONN;
-
-		uds_fd_table[minor].backlog_size = backlog_size;
-	} else {
-		/* Allow the user to expand the backlog size. */
-		if (backlog_size > uds_fd_table[minor].backlog_size &&
-		    backlog_size < UDS_SOMAXCONN)
-			uds_fd_table[minor].backlog_size = backlog_size;
-
-		/*
-		 * Don't let the user shrink the backlog_size, as we might
-		 * have clients waiting in those slots.
-		 */
-	}
-
-	/* This socket is now listening. */
-	uds_fd_table[minor].listening = TRUE;
-
-	return OK;
-}
-
-static int
-do_socket(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int rc, type;
-
-	dprintf(("UDS: do_socket(%d)\n", minor));
-
-	/* The socket type can only be set once. */
-	if (uds_fd_table[minor].type != -1)
-		return EINVAL;
-
-	/* Get the requested type. */
-	if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &type,
-	    sizeof(type))) != OK)
-		return rc;
-
-	/* Assign the type if it is valid only. */
-	switch (type) {
-	case SOCK_STREAM:
-	case SOCK_DGRAM:
-	case SOCK_SEQPACKET:
-		uds_fd_table[minor].type = type;
-		return OK;
-
-	default:
-		return EINVAL;
-	}
-}
-
-static int
-do_bind(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	struct sockaddr_un addr;
-	int rc, i;
-	dev_t dev;
-	ino_t ino;
-
-	dprintf(("UDS: do_bind(%d)\n", minor));
-
-	/* If the type hasn't been set by do_socket() yet, OR an attempt
-	 * to re-bind() a non-SOCK_DGRAM socket is made, fail the call.
-	 */
-	if ((uds_fd_table[minor].type == -1) ||
-	    (uds_fd_table[minor].addr.sun_family == AF_UNIX &&
-	    uds_fd_table[minor].type != SOCK_DGRAM))
-		return EINVAL;
-
-	if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr,
-	    sizeof(struct sockaddr_un))) != OK)
-		return rc;
-
-	/* Do some basic sanity checks on the address. */
-	if (addr.sun_family != AF_UNIX)
-		return EAFNOSUPPORT;
-
-	if (addr.sun_path[0] == '\0')
-		return ENOENT;
-
-	/* Attempt to create the socket file. */
-	if ((rc = socketpath(uds_fd_table[minor].owner, addr.sun_path,
-#if NOT_YET
-	    sizeof(addr.sun_path), SPATH_CREATE, &dev, &ino)) != OK)
-#else
-	    sizeof(addr.sun_path), SPATH_CHECK, &dev, &ino)) != OK)
-#endif
-		return rc;
-
-	/*
-	 * It is possible that the socket path name was already in use as
-	 * address by another socket.  This means that the socket file was
-	 * prematurely unlinked.  In that case, mark the old socket as stale,
-	 * so that its path name will not be matched and only the newly bound
-	 * socket will be found in address-based searches.  For now, we leave
-	 * the old socket marked as stale for as long as it is bound to the
-	 * same address.  A more advanced implementation could establish an
-	 * order between the sockets so that the most recently bound socket is
-	 * found at any time, but it is doubtful whether that would be useful.
-	 */
-	for (i = 0; i < NR_FDS; i++) {
-		if (uds_fd_table[i].stale == FALSE &&
-		    uds_fd_table[i].addr.sun_family == AF_UNIX &&
-		    !strncmp(addr.sun_path, uds_fd_table[i].addr.sun_path,
-		    sizeof(uds_fd_table[i].addr.sun_path))) {
-#if NOT_YET
-			uds_fd_table[i].stale = TRUE;
-#else
-			return EADDRINUSE;
-#endif
-		}
-	}
-
-	/* Looks good, perform the bind(). */
-	uds_fd_table[minor].stale = FALSE;
-	memcpy(&uds_fd_table[minor].addr, &addr, sizeof(struct sockaddr_un));
-
-	return OK;
-}
-
-static int
-do_getsockname(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	dprintf(("UDS: do_getsockname(%d)\n", minor));
-
-	/*
-	 * Unconditionally send the address we have assigned to this socket.
-	 * The POSIX standard doesn't say what to do if the address hasn't been
-	 * set.  If the address isn't currently set, then the user will get
-	 * NULL bytes.  Note: libc depends on this behavior.
-	 */
-	return sys_safecopyto(endpt, grant, 0,
-	    (vir_bytes) &uds_fd_table[minor].addr, sizeof(struct sockaddr_un));
-}
-
-static int
-do_getpeername(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int peer_minor;
-
-	dprintf(("UDS: do_getpeername(%d)\n", minor));
-
-	/* Check that the socket is connected with a valid peer. */
-	if (uds_fd_table[minor].peer != -1) {
-		peer_minor = uds_fd_table[minor].peer;
-
-		/* Copy the address from the peer. */
-		return sys_safecopyto(endpt, grant, 0,
-		    (vir_bytes) &uds_fd_table[peer_minor].addr,
-		    sizeof(struct sockaddr_un));
-	} else if (uds_fd_table[minor].err == ECONNRESET) {
-		uds_fd_table[minor].err = 0;
-
-		return ECONNRESET;
-	} else
-		return ENOTCONN;
-}
-
-static int
-do_shutdown(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int rc, how;
-
-	dprintf(("UDS: do_shutdown(%d)\n", minor));
-
-	/* The socket must be connection oriented. */
-	if (uds_fd_table[minor].type != SOCK_STREAM &&
-	    uds_fd_table[minor].type != SOCK_SEQPACKET)
-		return EINVAL;
-
-	if (uds_fd_table[minor].peer == -1) {
-		/* shutdown(2) is only valid for connected sockets. */
-		if (uds_fd_table[minor].err == ECONNRESET)
-			return ECONNRESET;
-		else
-			return ENOTCONN;
-	}
-
-	/* Get the 'how' parameter from the caller. */
-	if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &how,
-	    sizeof(how))) != OK)
-		return rc;
-
-	switch (how) {
-	case SHUT_RD:		/* Take away read permission. */
-		uds_fd_table[minor].mode &= ~UDS_R;
-		break;
-
-	case SHUT_WR:		/* Take away write permission. */
-		uds_fd_table[minor].mode &= ~UDS_W;
-		break;
-
-	case SHUT_RDWR:		/* Shut down completely. */
-		uds_fd_table[minor].mode = 0;
-		break;
-
-	default:
-		return EINVAL;
-	}
-
-	return OK;
-}
-
-static int
-do_socketpair(devminor_t minorx, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int rc;
-	dev_t minorin;
-	devminor_t minory;
-	struct sockaddr_un addr;
-
-	dprintf(("UDS: do_socketpair(%d)\n", minorx));
-
-	/* The ioctl argument is the minor number of the second socket. */
-	if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &minorin,
-	    sizeof(minorin))) != OK)
-		return rc;
-
-	minory = minor(minorin);
-
-	dprintf(("UDS: socketpair(%d, %d,)\n", minorx, minory));
-
-	/* Security check: both sockets must have the same owner endpoint. */
-	if (uds_fd_table[minorx].owner != uds_fd_table[minory].owner)
-		return EPERM;
-
-	addr.sun_family = AF_UNIX;
-	addr.sun_path[0] = 'X';
-	addr.sun_path[1] = '\0';
-
-	return perform_connection(minorx, minory, &addr);
-}
-
-static int
-do_getsockopt_sotype(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	dprintf(("UDS: do_getsockopt_sotype(%d)\n", minor));
-
-	/* If the type hasn't been set yet, we fail the call. */
-	if (uds_fd_table[minor].type == -1)
-		return EINVAL;
-
-	return sys_safecopyto(endpt, grant, 0,
-	    (vir_bytes) &uds_fd_table[minor].type, sizeof(int));
-}
-
-static int
-do_getsockopt_peercred(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int peer_minor;
-	int rc;
-	struct uucred cred;
-
-	dprintf(("UDS: do_getsockopt_peercred(%d)\n", minor));
-
-	if (uds_fd_table[minor].peer == -1) {
-		if (uds_fd_table[minor].err == ECONNRESET) {
-			uds_fd_table[minor].err = 0;
-
-			return ECONNRESET;
-		} else
-			return ENOTCONN;
-	}
-
-	peer_minor = uds_fd_table[minor].peer;
-
-	/*
-	 * Obtain the peer's credentials and copy them out.  Ignore failures;
-	 * in that case, the caller will simply get no credentials.
-	 */
-	memset(&cred, 0, sizeof(cred));
-	cred.cr_uid = -1;
-	cred.cr_gid = -1;
-	(void)getepinfo(uds_fd_table[peer_minor].owner, &cred.cr_uid,
-	    &cred.cr_gid);
-
-	return sys_safecopyto(endpt, grant, 0, (vir_bytes) &cred,
-	    sizeof(struct uucred));
-}
-
-static int
-do_getsockopt_sndbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	size_t sndbuf = UDS_BUF;
-
-	dprintf(("UDS: do_getsockopt_sndbuf(%d)\n", minor));
-
-	return sys_safecopyto(endpt, grant, 0, (vir_bytes) &sndbuf,
-	    sizeof(sndbuf));
-}
-
-static int
-do_setsockopt_sndbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int rc;
-	size_t sndbuf;
-
-	dprintf(("UDS: do_setsockopt_sndbuf(%d)\n", minor));
-
-	if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &sndbuf,
-	    sizeof(sndbuf))) != OK)
-		return rc;
-
-	/* The send buffer is limited to 32KB at the moment. */
-	if (sndbuf > UDS_BUF)
-		return ENOSYS;
-
-	/* FIXME: actually shrink the buffer. */
-	return OK;
-}
-
-static int
-do_getsockopt_rcvbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	size_t rcvbuf = UDS_BUF;
-
-	dprintf(("UDS: do_getsockopt_rcvbuf(%d)\n", minor));
-
-	return sys_safecopyto(endpt, grant, 0, (vir_bytes) &rcvbuf,
-	    sizeof(rcvbuf));
-}
-
-static int
-do_setsockopt_rcvbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int rc;
-	size_t rcvbuf;
-
-	dprintf(("UDS: do_setsockopt_rcvbuf(%d)\n", minor));
-
-	if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &rcvbuf,
-	    sizeof(rcvbuf))) != OK)
-		return rc;
-
-	/* The receive buffer is limited to 32KB at the moment. */
-	if (rcvbuf > UDS_BUF)
-		return ENOSYS;
-
-	/* FIXME: actually shrink the buffer. */
-	return OK;
-}
-
-static int
-do_sendto(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int rc;
-	struct sockaddr_un addr;
-	dev_t dev;
-	ino_t ino;
-
-	dprintf(("UDS: do_sendto(%d)\n", minor));
-
-	/* This IOCTL is only for SOCK_DGRAM sockets. */
-	if (uds_fd_table[minor].type != SOCK_DGRAM)
-		return EINVAL;
-
-	if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr,
-	    sizeof(struct sockaddr_un))) != OK)
-		return rc;
-
-	/* Do some basic sanity checks on the address. */
-	if (addr.sun_family != AF_UNIX || addr.sun_path[0] == '\0')
-		return EINVAL;
-
-	if ((rc = socketpath(uds_fd_table[minor].owner, addr.sun_path,
-	    sizeof(addr.sun_path), SPATH_CHECK, &dev, &ino)) != OK)
-		return rc;
-
-	memcpy(&uds_fd_table[minor].target, &addr, sizeof(struct sockaddr_un));
-
-	return OK;
-}
-
-static int
-do_recvfrom(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	dprintf(("UDS: do_recvfrom(%d)\n", minor));
-
-	return sys_safecopyto(endpt, grant, 0,
-	    (vir_bytes) &uds_fd_table[minor].source,
-	    sizeof(struct sockaddr_un));
-}
-
-static int
-send_fds(devminor_t minor, struct msg_control *msg_ctrl,
-	struct ancillary *data)
-{
-	int i, rc, nfds, totalfds;
-	endpoint_t from_ep;
-	struct msghdr msghdr;
-	struct cmsghdr *cmsg = NULL;
-
-	dprintf(("UDS: send_fds(%d)\n", minor));
-
-	from_ep = uds_fd_table[minor].owner;
-
-	/* Obtain this socket's credentials. */
-	if ((rc = getepinfo(from_ep, &data->cred.uid, &data->cred.gid)) < 0)
-		return rc;
-
-	dprintf(("UDS: minor=%d cred={%d,%d}\n", minor,
-	    data->cred.uid, data->cred.gid));
-
-	totalfds = data->nfiledes;
-
-	memset(&msghdr, '\0', sizeof(struct msghdr));
-	msghdr.msg_control = msg_ctrl->msg_control;
-	msghdr.msg_controllen = msg_ctrl->msg_controllen;
-
-	for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
-	    cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
-		if (cmsg->cmsg_level != SOL_SOCKET ||
-		    cmsg->cmsg_type != SCM_RIGHTS)
-			continue;
-
-		nfds = MIN((cmsg->cmsg_len-CMSG_LEN(0))/sizeof(int), OPEN_MAX);
-
-		for (i = 0; i < nfds; i++) {
-			if (totalfds == OPEN_MAX)
-				return EOVERFLOW;
-
-			data->fds[totalfds] = ((int *) CMSG_DATA(cmsg))[i];
-			dprintf(("UDS: minor=%d fd[%d]=%d\n", minor, totalfds,
-			    data->fds[totalfds]));
-			totalfds++;
-		}
-	}
-
-	for (i = data->nfiledes; i < totalfds; i++) {
-		if ((rc = copyfd(from_ep, data->fds[i], COPYFD_FROM)) < 0) {
-			printf("UDS: copyfd(COPYFD_FROM) failed: %d\n", rc);
-
-			/* Revert the successful copyfd() calls made so far. */
-			for (i--; i >= data->nfiledes; i--)
-				close(data->fds[i]);
-
-			return rc;
-		}
-
-		dprintf(("UDS: send_fds(): %d -> %d\n", data->fds[i], rc));
-
-		data->fds[i] = rc;	/* this is now the local FD */
-	}
-
-	data->nfiledes = totalfds;
-
-	return OK;
-}
-
-/*
- * This function calls close() for all of the FDs in flight.  This is used
- * when a Unix Domain Socket is closed and there exists references to file
- * descriptors that haven't been received with recvmsg().
- */
-int
-uds_clear_fds(devminor_t minor, struct ancillary *data)
-{
-	int i;
-
-	dprintf(("UDS: uds_clear_fds(%d)\n", minor));
-
-	for (i = 0; i < data->nfiledes; i++) {
-		dprintf(("UDS: uds_clear_fds() => %d\n", data->fds[i]));
-
-		close(data->fds[i]);
-
-		data->fds[i] = -1;
-	}
-
-	data->nfiledes = 0;
-
-	return OK;
-}
-
-static int
-recv_fds(devminor_t minor, struct ancillary *data,
-	struct msg_control *msg_ctrl)
-{
-	int rc, i, j, fds[OPEN_MAX];
-	struct msghdr msghdr;
-	struct cmsghdr *cmsg;
-	endpoint_t to_ep;
-
-	dprintf(("UDS: recv_fds(%d)\n", minor));
-
-	msghdr.msg_control = msg_ctrl->msg_control;
-	msghdr.msg_controllen = msg_ctrl->msg_controllen;
-
-	cmsg = CMSG_FIRSTHDR(&msghdr);
-	cmsg->cmsg_len = CMSG_LEN(sizeof(int) * data->nfiledes);
-	cmsg->cmsg_level = SOL_SOCKET;
-	cmsg->cmsg_type = SCM_RIGHTS;
-
-	to_ep = uds_fd_table[minor].owner;
-
-	/* Copy to the target endpoint. */
-	for (i = 0; i < data->nfiledes; i++) {
-		if ((rc = copyfd(to_ep, data->fds[i], COPYFD_TO)) < 0) {
-			printf("UDS: copyfd(COPYFD_TO) failed: %d\n", rc);
-
-			/* Revert the successful copyfd() calls made so far. */
-			for (i--; i >= 0; i--)
-				(void) copyfd(to_ep, fds[i], COPYFD_CLOSE);
-
-			return rc;
-		}
-
-		fds[i] = rc;		/* this is now the remote FD */
-	}
-
-	/* Close the local copies only once the entire procedure succeeded. */
-	for (i = 0; i < data->nfiledes; i++) {
-		dprintf(("UDS: recv_fds(): %d -> %d\n", data->fds[i], fds[i]));
-
-		((int *)CMSG_DATA(cmsg))[i] = fds[i];
-
-		close(data->fds[i]);
-
-		data->fds[i] = -1;
-	}
-
-	data->nfiledes = 0;
-
-	return OK;
-}
-
-static int
-recv_cred(devminor_t minor, struct ancillary *data,
-	struct msg_control *msg_ctrl)
-{
-	struct msghdr msghdr;
-	struct cmsghdr *cmsg;
-	struct uucred *cred;
-
-	dprintf(("UDS: recv_cred(%d)\n", minor));
-
-	msghdr.msg_control = msg_ctrl->msg_control;
-	msghdr.msg_controllen = msg_ctrl->msg_controllen;
-
-	cmsg = CMSG_FIRSTHDR(&msghdr);
-	if (cmsg->cmsg_len > 0)
-		cmsg = CMSG_NXTHDR(&msghdr, cmsg);
-
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct uucred));
-	cmsg->cmsg_level = SOL_SOCKET;
-	cmsg->cmsg_type = SCM_CREDS;
-	cred = (struct uucred *)CMSG_DATA(cmsg);
-	memset(cred, 0, sizeof(*cred));
-	cred->cr_uid = data->cred.uid;
-	cred->cr_gid = data->cred.gid;
-
-	return OK;
-}
-
-static int
-do_sendmsg(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int peer, rc, i;
-	struct msg_control msg_ctrl;
-
-	dprintf(("UDS: do_sendmsg(%d)\n", minor));
-
-	memset(&msg_ctrl, '\0', sizeof(struct msg_control));
-
-	if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &msg_ctrl,
-	    sizeof(struct msg_control))) != OK)
-		return rc;
-
-	/* Locate the peer. */
-	peer = -1;
-	if (uds_fd_table[minor].type == SOCK_DGRAM) {
-		if (uds_fd_table[minor].target.sun_path[0] == '\0' ||
-		    uds_fd_table[minor].target.sun_family != AF_UNIX)
-			return EDESTADDRREQ;
-
-		for (i = 0; i < NR_FDS; i++) {
-			/*
-			 * Look for a SOCK_DGRAM socket that is bound on the
-			 * target address.
-			 */
-			if (uds_fd_table[i].type == SOCK_DGRAM &&
-			    uds_fd_table[i].stale == FALSE &&
-			    uds_fd_table[i].addr.sun_family == AF_UNIX &&
-			    !strncmp(uds_fd_table[minor].target.sun_path,
-			    uds_fd_table[i].addr.sun_path,
-			    sizeof(uds_fd_table[i].addr.sun_path))) {
-				peer = i;
-				break;
-			}
-		}
-
-		if (peer == -1)
-			return ENOENT;
-	} else {
-		peer = uds_fd_table[minor].peer;
-		if (peer == -1)
-			return ENOTCONN;
-	}
-
-	dprintf(("UDS: sendmsg(%d) -- peer=%d\n", minor, peer));
-
-	/*
-	 * Note: it's possible that there is already some file descriptors in
-	 * ancillary_data if the peer didn't call recvmsg() yet.  That's okay.
-	 * The receiver will get the current file descriptors plus the new
-	 * ones.
-	 */
-	return send_fds(minor, &msg_ctrl, &uds_fd_table[peer].ancillary_data);
-}
-
-static int
-do_recvmsg(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int rc;
-	struct msg_control msg_ctrl;
-	socklen_t clen_avail = 0;
-	socklen_t clen_needed = 0;
-	socklen_t clen_desired = 0;
-
-	dprintf(("UDS: do_recvmsg(%d)\n", minor));
-	dprintf(("UDS: minor=%d credentials={uid:%d,gid:%d}\n", minor,
-	    uds_fd_table[minor].ancillary_data.cred.uid,
-	    uds_fd_table[minor].ancillary_data.cred.gid));
-
-	memset(&msg_ctrl, '\0', sizeof(struct msg_control));
-
-	/*
-	 * Get the msg_control from the user.  It will include the
-	 * amount of space the user has allocated for control data.
-	 */
-	if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &msg_ctrl,
-	    sizeof(struct msg_control))) != OK)
-		return rc;
-
-	clen_avail = MIN(msg_ctrl.msg_controllen, MSG_CONTROL_MAX);
-
-	if (uds_fd_table[minor].ancillary_data.nfiledes > 0) {
-		clen_needed = CMSG_SPACE(sizeof(int) *
-		    uds_fd_table[minor].ancillary_data.nfiledes);
-	}
-
-	/* if there is room we also include credentials */
-	clen_desired = clen_needed + CMSG_SPACE(sizeof(struct uucred));
-
-	if (clen_needed > clen_avail)
-		return EOVERFLOW;
-
-	if (uds_fd_table[minor].ancillary_data.nfiledes > 0) {
-		if ((rc = recv_fds(minor, &uds_fd_table[minor].ancillary_data,
-		    &msg_ctrl)) != OK)
-			return rc;
-	}
-
-	if (clen_desired <= clen_avail) {
-		rc = recv_cred(minor, &uds_fd_table[minor].ancillary_data,
-		    &msg_ctrl);
-		if (rc != OK)
-			return rc;
-		msg_ctrl.msg_controllen = clen_desired;
-	} else
-		msg_ctrl.msg_controllen = clen_needed;
-
-	/* Send the control data to the user. */
-	return sys_safecopyto(endpt, grant, 0, (vir_bytes) &msg_ctrl,
-	    sizeof(struct msg_control));
-}
-
-static int
-do_fionread(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-	int rc;
-
-	rc = uds_perform_read(minor, NONE, GRANT_INVALID, UDS_BUF, 1);
-
-	/* What should we do on error?  Just set to zero for now. */
-	if (rc < 0)
-		rc = 0;
-
-	return sys_safecopyto(endpt, grant, 0, (vir_bytes) &rc, sizeof(rc));
-}
-
-int
-uds_do_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
-	cp_grant_id_t grant)
-{
-	int rc;
-
-	switch (request) {
-	case NWIOSUDSCONN:
-		/* Connect to a listening socket -- connect(). */
-		rc = do_connect(minor, endpt, grant);
-
-		break;
-
-	case NWIOSUDSACCEPT:
-		/* Accept an incoming connection -- accept(). */
-		rc = do_accept(minor, endpt, grant);
-
-		break;
-
-	case NWIOSUDSBLOG:
-		/*
-		 * Set the backlog_size and put the socket into the listening
-		 * state -- listen().
-		 */
-		rc = do_listen(minor, endpt, grant);
-
-		break;
-
-	case NWIOSUDSTYPE:
-		/* Set the SOCK_ type for this socket -- socket(). */
-		rc = do_socket(minor, endpt, grant);
-
-		break;
-
-	case NWIOSUDSADDR:
-		/* Set the address for this socket -- bind(). */
-		rc = do_bind(minor, endpt, grant);
-
-		break;
-
-	case NWIOGUDSADDR:
-		/* Get the address for this socket -- getsockname(). */
-		rc = do_getsockname(minor, endpt, grant);
-
-		break;
-
-	case NWIOGUDSPADDR:
-		/* Get the address for the peer -- getpeername(). */
-		rc = do_getpeername(minor, endpt, grant);
-
-		break;
-
-	case NWIOSUDSSHUT:
-		/*
-		 * Shut down a socket for reading, writing, or both --
-		 * shutdown().
-		 */
-		rc = do_shutdown(minor, endpt, grant);
-
-		break;
-
-	case NWIOSUDSPAIR:
-		/* Connect two sockets -- socketpair(). */
-		rc = do_socketpair(minor, endpt, grant);
-
-		break;
-
-	case NWIOGUDSSOTYPE:
-		/* Get socket type -- getsockopt(SO_TYPE). */
-		rc = do_getsockopt_sotype(minor, endpt, grant);
-
-		break;
-
-	case NWIOGUDSPEERCRED:
-		/* Get peer endpoint -- getsockopt(SO_PEERCRED). */
-		rc = do_getsockopt_peercred(minor, endpt, grant);
-
-		break;
-
-	case NWIOSUDSTADDR:
-		/* Set target address -- sendto(). */
-		rc = do_sendto(minor, endpt, grant);
-
-		break;
-
-	case NWIOGUDSFADDR:
-		/* Get from address -- recvfrom(). */
-		rc = do_recvfrom(minor, endpt, grant);
-
-		break;
-
-	case NWIOGUDSSNDBUF:
-		/* Get the send buffer size -- getsockopt(SO_SNDBUF). */
-		rc = do_getsockopt_sndbuf(minor, endpt, grant);
-
-		break;
-
-	case NWIOSUDSSNDBUF:
-		/* Set the send buffer size -- setsockopt(SO_SNDBUF). */
-		rc = do_setsockopt_sndbuf(minor, endpt, grant);
-
-		break;
-
-	case NWIOGUDSRCVBUF:
-		/* Get the send buffer size -- getsockopt(SO_SNDBUF). */
-		rc = do_getsockopt_rcvbuf(minor, endpt, grant);
-
-		break;
-
-	case NWIOSUDSRCVBUF:
-		/* Set the send buffer size -- setsockopt(SO_SNDBUF). */
-		rc = do_setsockopt_rcvbuf(minor, endpt, grant);
-
-		break;
-
-	case NWIOSUDSCTRL:
-		/* Set the control data -- sendmsg(). */
-		rc = do_sendmsg(minor, endpt, grant);
-
-		break;
-
-	case NWIOGUDSCTRL:
-		/* Set the control data -- recvmsg(). */
-		rc = do_recvmsg(minor, endpt, grant);
-
-		break;
-
-	case FIONREAD:
-		/*
-		 * Get the number of bytes immediately available for reading.
-		 */
-		rc = do_fionread(minor, endpt, grant);
-
-		break;
-
-	default:
-		/*
-		 * The IOCTL command is not valid for /dev/uds -- this happens
-		 * a lot and is normal.  A lot of libc functions determine the
-		 * socket type with IOCTLs.  Any unrecognized requests simply
-		 * get an ENOTTY response.
-		 */
-
-		rc = ENOTTY;
-	}
-
-	return rc;
-}
diff --git a/minix/net/uds/stat.c b/minix/net/uds/stat.c
new file mode 100644
index 000000000..2759f6318
--- /dev/null
+++ b/minix/net/uds/stat.c
@@ -0,0 +1,186 @@
+/* UNIX Domain Sockets - stat.c - network status */
+
+#include "uds.h"
+#include <sys/socketvar.h>
+#include <sys/unpcb.h>
+
+/*
+ * Fill the given 'ki' structure with information about the socket 'uds'.
+ */
+static void
+uds_get_info(struct kinfo_pcb * ki, const struct udssock * uds)
+{
+	struct udssock *peer;
+	socklen_t len;
+	int type;
+
+	type = uds_get_type(uds);
+	peer = uds_get_peer(uds);
+
+	ki->ki_pcbaddr = (uint64_t)(uintptr_t)uds;
+	ki->ki_ppcbaddr = (uint64_t)(uintptr_t)uds;
+	ki->ki_sockaddr = (uint64_t)(uintptr_t)&uds->uds_sock;
+	ki->ki_family = AF_UNIX;
+	ki->ki_type = type;
+	ki->ki_protocol = UDSPROTO_UDS;
+	ki->ki_pflags = 0;
+	if (uds->uds_flags & UDSF_CONNWAIT)
+		ki->ki_pflags |= UNP_CONNWAIT;
+	if (uds->uds_flags & UDSF_PASSCRED)
+		ki->ki_pflags |= UNP_WANTCRED;
+	if (type != SOCK_DGRAM && uds->uds_cred.unp_pid != -1) {
+		if (uds_is_listening(uds))
+			ki->ki_pflags |= UNP_EIDSBIND;
+		else if (uds_is_connecting(uds) || uds_is_connected(uds))
+			ki->ki_pflags |= UNP_EIDSVALID;
+	}
+	/* Not sure about NetBSD connection states.  First attempt here. */
+	if (uds_is_connecting(uds))
+		ki->ki_sostate = SS_ISCONNECTING;
+	else if (uds_is_connected(uds))
+		ki->ki_sostate = SS_ISCONNECTED;
+	else if (uds_is_disconnected(uds))
+		ki->ki_sostate = SS_ISDISCONNECTED;
+	ki->ki_rcvq = uds->uds_len;
+	/* We currently mirror the peer's receive queue size when connected. */
+	if (uds_is_connected(uds))
+		ki->ki_sndq = peer->uds_len;
+	/* The source is not set for bound connection-type sockets here. */
+	if (type == SOCK_DGRAM || uds_is_listening(uds))
+		uds_make_addr(uds->uds_path, (size_t)uds->uds_pathlen,
+		    &ki->ki_src, &len);
+	if (peer != NULL)
+		uds_make_addr(peer->uds_path, (size_t)peer->uds_pathlen,
+		    &ki->ki_dst, &len);
+	/* TODO: we should set ki_inode and ki_vnode, but to what? */
+	ki->ki_conn = (uint64_t)(uintptr_t)peer;
+	if (!TAILQ_EMPTY(&uds->uds_queue))
+		ki->ki_refs =
+		    (uint64_t)(uintptr_t)TAILQ_FIRST(&uds->uds_queue);
+	if (uds_has_link(uds))
+		ki->ki_nextref =
+		    (uint64_t)(uintptr_t)TAILQ_NEXT(uds, uds_next);
+}
+
+/*
+ * Remote MIB implementation of CTL_NET PF_LOCAL {SOCK_STREAM,SOCK_DGRAM,
+ * SOCK_SEQPACKET} 0.  This function handles all queries on the
+ * "net.local.{stream,dgram,seqpacket}.pcblist" sysctl(7) nodes.
+ *
+ * The 0 for "pcblist" is a MINIXism: we use it to keep our arrays small.
+ * NetBSD numbers these nodes dynamically and so they have numbers above
+ * CREATE_BASE.  That also means that no userland application can possibly
+ * hardcode their numbers, and must perform lookups by name.  In turn, that
+ * means that we can safely change the 0 to another number if NetBSD ever
+ * introduces statically numbered nodes in these subtrees.
+ */
+static ssize_t
+net_local_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
+	struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
+{
+	struct udssock *uds;
+	struct kinfo_pcb ki;
+	ssize_t off;
+	int r, type, size, max;
+
+	if (call->call_namelen != 4)
+		return EINVAL;
+
+	/* The first two added name fields are not used. */
+
+	size = call->call_name[2];
+	if (size < 0 || (size_t)size > sizeof(ki))
+		return EINVAL;
+	if (size == 0)
+		size = sizeof(ki);
+	max = call->call_name[3];
+
+	type = call->call_oname[2];
+
+	off = 0;
+
+	for (uds = uds_enum(NULL, type); uds != NULL;
+	    uds = uds_enum(uds, type)) {
+		if (rmib_inrange(oldp, off)) {
+			memset(&ki, 0, sizeof(ki));
+
+			uds_get_info(&ki, uds);
+
+			if ((r = rmib_copyout(oldp, off, &ki, size)) < 0)
+				return r;
+		}
+
+		off += size;
+		if (max > 0 && --max == 0)
+			break;
+	}
+
+	/*
+	 * Margin to limit the possible effects of the inherent race condition
+	 * between receiving just the data size and receiving the actual data.
+	 */
+	if (oldp == NULL)
+		off += PCB_SLOP * size;
+
+	return off;
+}
+
+/* The CTL_NET PF_LOCAL SOCK_STREAM subtree. */
+static struct rmib_node net_local_stream_table[] = {
+	[0]	= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist,
+		    "pcblist", "SOCK_STREAM protocol control block list"),
+};
+
+/* The CTL_NET PF_LOCAL SOCK_DGRAM subtree. */
+static struct rmib_node net_local_dgram_table[] = {
+	[0]	= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist,
+		    "pcblist", "SOCK_DGRAM protocol control block list"),
+};
+
+/* The CTL_NET PF_LOCAL SOCK_SEQPACKET subtree. */
+static struct rmib_node net_local_seqpacket_table[] = {
+	[0]	= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist,
+		    "pcblist", "SOCK_SEQPACKET protocol control block list"),
+};
+
+/* The CTL_NET PF_LOCAL subtree. */
+static struct rmib_node net_local_table[] = {
+/* 1*/	[SOCK_STREAM]		= RMIB_NODE(RMIB_RO, net_local_stream_table,
+				    "stream", "SOCK_STREAM settings"),
+/* 2*/	[SOCK_DGRAM]		= RMIB_NODE(RMIB_RO, net_local_dgram_table,
+				    "dgram", "SOCK_DGRAM settings"),
+/* 5*/	[SOCK_SEQPACKET]	= RMIB_NODE(RMIB_RO, net_local_seqpacket_table,
+				    "seqpacket", "SOCK_SEQPACKET settings"),
+};
+
+static struct rmib_node net_local_node =
+    RMIB_NODE(RMIB_RO, net_local_table, "local", "PF_LOCAL related settings");
+
+/*
+ * Initialize the status module.
+ */
+void
+uds_stat_init(void)
+{
+	const int mib[] = { CTL_NET, PF_LOCAL };
+	int r;
+
+	/*
+	 * Register our own "net.local" subtree with the MIB service.
+	 *
+	 * This call only returns local failures.  Remote failures (in the MIB
+	 * service) are silently ignored.  So, we can safely panic on failure.
+	 */
+	if ((r = rmib_register(mib, __arraycount(mib), &net_local_node)) != OK)
+		panic("UDS: unable to register remote MIB tree: %d", r);
+}
+
+/*
+ * Clean up the status module.
+ */
+void
+uds_stat_cleanup(void)
+{
+
+	rmib_deregister(&net_local_node);
+}
diff --git a/minix/net/uds/uds.8 b/minix/net/uds/uds.8
deleted file mode 100644
index 2484ea709..000000000
--- a/minix/net/uds/uds.8
+++ /dev/null
@@ -1,15 +0,0 @@
-.TH UDS 8
-.SH NAME
-uds \- unix domain sockets device
-.SH DESCRIPTION
-The \fIuds\fP device gives access to the unix domain socket services in 
-Minix. It is a virtual device similar to the \fItcp\fP and \fIudp\fP 
-Internet Protocol server devices.
-.SH SEE ALSO
-.BR socket(2),
-.BR socketpair(2),
-.BR dev(4),
-.BR ip(4),
-.BR unix(8)
-.SH HISTORY
-This device first appeared in Minix 3.1.8.
diff --git a/minix/net/uds/uds.c b/minix/net/uds/uds.c
index baca3c1ed..2052a2ac2 100644
--- a/minix/net/uds/uds.c
+++ b/minix/net/uds/uds.c
@@ -1,740 +1,1376 @@
-/*
- * Unix Domain Sockets Implementation (PF_UNIX, PF_LOCAL)
- * This code handles requests generated by operations on /dev/uds
- *
- * The interface to UNIX domain sockets is similar to the interface to network
- * sockets. There is a character device (/dev/uds) and this server is a
- * 'driver' for that device.
- */
+/* UNIX Domain Sockets - uds.c - socket management */
 
 #include "uds.h"
 
-static ssize_t uds_perform_write(devminor_t, endpoint_t, cp_grant_id_t, size_t,
-	int);
+static struct udssock uds_array[NR_UDSSOCK];
+static TAILQ_HEAD(uds_freelist, udssock) uds_freelist;
+static unsigned int uds_in_use;
+static int uds_running;
 
-static int uds_open(devminor_t, int, endpoint_t);
-static int uds_close(devminor_t);
-static ssize_t uds_read(devminor_t, u64_t, endpoint_t, cp_grant_id_t, size_t,
-	int, cdev_id_t);
-static ssize_t uds_write(devminor_t, u64_t, endpoint_t, cp_grant_id_t, size_t,
-	int, cdev_id_t);
-static int uds_ioctl(devminor_t, unsigned long, endpoint_t, cp_grant_id_t, int,
-	endpoint_t, cdev_id_t);
-static int uds_cancel(devminor_t, endpoint_t, cdev_id_t);
-static int uds_select(devminor_t, unsigned int, endpoint_t);
+static const struct sockevent_ops uds_ops;
 
-static struct chardriver uds_tab = {
-	.cdr_open	= uds_open,
-	.cdr_close	= uds_close,
-	.cdr_read	= uds_read,
-	.cdr_write	= uds_write,
-	.cdr_ioctl	= uds_ioctl,
-	.cdr_cancel	= uds_cancel,
-	.cdr_select	= uds_select
-};
+static SLIST_HEAD(udshash, udssock) udshash[UDSHASH_SLOTS];
 
-/* File Descriptor Table */
-uds_fd_t uds_fd_table[NR_FDS];
-
-static unsigned int uds_exit_left;
-
-static int
-uds_open(devminor_t UNUSED(orig_minor), int access,
-	endpoint_t user_endpt)
+/*
+ * Initialize file-to-socket hash table.
+ */
+static void
+udshash_init(void)
 {
-	devminor_t minor;
-	char *buf;
-	int i;
+	unsigned int slot;
 
-	dprintf(("UDS: uds_open() from %d\n", user_endpt));
-
-	/*
-	 * Find a slot in the descriptor table for the new descriptor.
-	 * The index of the descriptor in the table will be returned.
-	 * Subsequent calls to read/write/close/ioctl/etc will use this
-	 * minor number.  The minor number must be different from the
-	 * the /dev/uds device's minor number (0).
-	 */
-	for (minor = 1; minor < NR_FDS; minor++)
-		if (uds_fd_table[minor].state == UDS_FREE)
-			break;
-
-	if (minor == NR_FDS)
-		return ENFILE;
-
-	/*
-	 * Allocate memory for the ringer buffer.  In order to save on memory
-	 * in the common case, the buffer is allocated only when the socket is
-	 * in use.  We use mmap instead of malloc to allow the memory to be
-	 * actually freed later.
-	 */
-	if ((buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE,
-	    MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED)
-		return ENOMEM;
-
-	/*
-	 * Allocate the socket, and set its initial parameters.
-	 */
-	uds_fd_table[minor].state = UDS_INUSE;
-	uds_fd_table[minor].owner = user_endpt;
-	uds_fd_table[minor].sel_endpt = NONE;
-	uds_fd_table[minor].sel_ops = 0;
-	uds_fd_table[minor].buf = buf;
-	uds_fd_table[minor].pos = 0;
-	uds_fd_table[minor].size = 0;
-	uds_fd_table[minor].mode = UDS_R | UDS_W;
-	uds_fd_table[minor].type = -1;
-
-	for (i = 0; i < UDS_SOMAXCONN; i++)
-		uds_fd_table[minor].backlog[i] = -1;
-	uds_fd_table[minor].backlog_size = UDS_SOMAXCONN;
-
-	memset(&uds_fd_table[minor].ancillary_data, '\0',
-	    sizeof(struct ancillary));
-	for (i = 0; i < OPEN_MAX; i++)
-		uds_fd_table[minor].ancillary_data.fds[i] = -1;
-
-	uds_fd_table[minor].stale = FALSE;
-	uds_fd_table[minor].listening = FALSE;
-	uds_fd_table[minor].peer = -1;
-	uds_fd_table[minor].child = -1;
-
-	memset(&uds_fd_table[minor].addr, '\0', sizeof(struct sockaddr_un));
-	memset(&uds_fd_table[minor].source, '\0', sizeof(struct sockaddr_un));
-	memset(&uds_fd_table[minor].target, '\0', sizeof(struct sockaddr_un));
-
-	uds_fd_table[minor].suspended = UDS_NOT_SUSPENDED;
-
-	return CDEV_CLONED | minor;
+	for (slot = 0; slot < __arraycount(udshash); slot++)
+		SLIST_INIT(&udshash[slot]);
 }
 
-static void
-uds_reset(devminor_t minor)
+/*
+ * Return a hash table slot number for the given <dev,ino> pair.
+ */
+static unsigned int
+udshash_slot(dev_t dev, ino_t ino)
 {
-	/* Disconnect the socket from its peer. */
-	uds_fd_table[minor].peer = -1;
 
-	/* Set an error to pass to the caller. */
-	uds_fd_table[minor].err = ECONNRESET;
+	assert(dev != NO_DEV);
+	assert(ino != 0);
 
-	/* If a process was blocked on I/O, revive it. */
-	if (uds_fd_table[minor].suspended != UDS_NOT_SUSPENDED)
-		uds_unsuspend(minor);
+	/*
+	 * Effectively combining two 64-bit numbers into a single 6-or-so-bit
+	 * hash is not too easy.  This hash function is probably among the
+	 * worst options.  Then again it is not all that critical as we are not
+	 * expecting that many bound UDS sockets in the system anyway.
+	 */
+	return (unsigned int)(dev ^ ino) % UDSHASH_SLOTS;
+}
 
-	/* All of the peer's calls will fail immediately now. */
-	if (uds_fd_table[minor].sel_ops != 0) {
-		chardriver_reply_select(uds_fd_table[minor].sel_endpt, minor,
-		    uds_fd_table[minor].sel_ops);
-		uds_fd_table[minor].sel_ops = 0;
+/*
+ * Look for a socket that is bound to the given <dev,ino> pair.  Return a
+ * pointer to the socket if found, or NULL otherwise.
+ */
+static struct udssock *
+udshash_get(dev_t dev, ino_t ino)
+{
+	struct udssock *uds;
+	unsigned int slot;
+
+	slot = udshash_slot(dev, ino);
+
+	SLIST_FOREACH(uds, &udshash[slot], uds_hash) {
+		if (uds->uds_dev == dev && uds->uds_ino == ino)
+			return uds;
+	}
+
+	return NULL;
+}
+
+/*
+ * Add a socket to the file-to-socket hash table.  The socket must have its
+ * device and inode fields set, and must not be in the hash table already.
+ */
+static void
+udshash_add(struct udssock * uds)
+{
+	unsigned int slot;
+
+	slot = udshash_slot(uds->uds_dev, uds->uds_ino);
+
+	SLIST_INSERT_HEAD(&udshash[slot], uds, uds_hash);
+}
+
+/*
+ * Remove a socket from the file-to-socket hash table.  The socket must be in
+ * the hash table.
+ */
+static void
+udshash_del(struct udssock * uds)
+{
+	unsigned int slot;
+
+	slot = udshash_slot(uds->uds_dev, uds->uds_ino);
+
+	/* This macro is O(n). */
+	SLIST_REMOVE(&udshash[slot], uds, udssock, uds_hash);
+}
+
+/*
+ * Return the socket identifier for the given UDS socket object.
+ */
+sockid_t
+uds_get_id(struct udssock * uds)
+{
+
+	return (sockid_t)(uds - uds_array);
+}
+
+/*
+ * Given either NULL or a previously returned socket, return the next in-use
+ * UDS socket of the given socket type, or NULL if there are no more matches.
+ * The sockets are returned in arbitrary order, but each matching socket is
+ * returned exactly once (until any socket is allocated or freed).
+ */
+struct udssock *
+uds_enum(struct udssock * prev, int type)
+{
+	sockid_t id;
+
+	if (prev != NULL)
+		id = uds_get_id(prev) + 1;
+	else
+		id = 0;
+
+	for (; id < NR_UDSSOCK; id++)
+		if ((uds_array[id].uds_flags & UDSF_IN_USE) &&
+		    uds_get_type(&uds_array[id]) == type)
+			return &uds_array[id];
+
+	return NULL;
+}
+
+/*
+ * Invalidate credentials on the socket.
+ */
+static void
+uds_clear_cred(struct udssock * uds)
+{
+
+	uds->uds_cred.unp_pid = -1;
+	uds->uds_cred.unp_euid = -1;
+	uds->uds_cred.unp_egid = -1;
+}
+
+/*
+ * Obtain the credentials (process, user, and group ID) of the given user
+ * endpoint and associate them with the socket for later retrieval.  It is
+ * important to note that this information is obtained once at connect time,
+ * and never updated later.  The party receiving the credentials must take this
+ * into account.
+ */
+static void
+uds_get_cred(struct udssock * uds, endpoint_t user_endpt)
+{
+	int r;
+
+	if ((uds->uds_cred.unp_pid = r = getepinfo(user_endpt,
+	    &uds->uds_cred.unp_euid, &uds->uds_cred.unp_egid)) < 0) {
+		printf("UDS: failed obtaining credentials of %d (%d)\n",
+		    user_endpt, r);
+
+		uds_clear_cred(uds);
 	}
 }
 
+/*
+ * Allocate and initialize a UDS socket.  On success, return OK with a pointer
+ * to the new socket in 'udsp'.  On failure, return a negative error code.
+ */
 static int
-uds_close(devminor_t minor)
+uds_alloc(struct udssock ** udsp)
 {
-	int i, peer;
+	struct udssock *uds;
+	int r;
 
-	dprintf(("UDS: uds_close(%d)\n", minor));
+	/* Allocate, initialize, and return a UNIX domain socket object. */
+	if (TAILQ_EMPTY(&uds_freelist))
+		return ENOBUFS;
 
-	if (minor < 0 || minor >= NR_FDS) return ENXIO;
+	uds = TAILQ_FIRST(&uds_freelist);
 
-	if (uds_fd_table[minor].state != UDS_INUSE)
+	uds->uds_conn = NULL;		/* not connected */
+	uds->uds_link = NULL;		/* not connecting or linked */
+	uds->uds_queued = 0;
+	uds->uds_flags = UDSF_IN_USE;	/* may be found through enumeration */
+	uds->uds_pathlen = 0;		/* not bound: no path */
+	uds->uds_dev = NO_DEV;		/* not hashed: no socket file device */
+	uds->uds_ino = 0;		/* not hashed: no socket file inode */
+	uds_clear_cred(uds);		/* no bind/connect-time credentials */
+	TAILQ_INIT(&uds->uds_queue);	/* an empty queue */
+
+	if ((r = uds_io_setup(uds)) != OK)
+		return r;
+
+	TAILQ_REMOVE(&uds_freelist, uds, uds_next);
+
+	assert(uds_in_use < NR_UDSSOCK);
+	uds_in_use++;
+
+	*udsp = uds;
+	return OK;
+}
+
+/*
+ * Free a previously allocated socket.
+ */
+static void
+uds_free(struct sock * sock)
+{
+	struct udssock *uds = (struct udssock *)sock;
+
+	uds_io_cleanup(uds);
+
+	uds->uds_flags = 0;		/* no longer in use */
+
+	TAILQ_INSERT_HEAD(&uds_freelist, uds, uds_next);
+
+	assert(uds_in_use > 0);
+	if (--uds_in_use == 0 && uds_running == FALSE)
+		sef_cancel();
+}
+
+/*
+ * Create a new socket.
+ */
+static sockid_t
+uds_socket(int domain, int type, int protocol, endpoint_t user_endpt __unused,
+	struct sock ** sockp, const struct sockevent_ops ** ops)
+{
+	struct udssock *uds;
+	int r;
+
+	dprintf(("UDS: socket(%d,%d,%d)\n", domain, type, protocol));
+
+	if (domain != PF_UNIX) {
+		/* This means the service was configured incorrectly. */
+		printf("UDS: got request for domain %d\n", domain);
+
+		return EAFNOSUPPORT;
+	}
+
+	/* We support the following three socket types. */
+	switch (type) {
+	case SOCK_STREAM:
+	case SOCK_SEQPACKET:
+	case SOCK_DGRAM:
+		break;
+	default:
+		return EPROTOTYPE;
+	}
+
+	/*
+	 * The PF_UNIX domain does not support particular protocols, so the
+	 * given protocol must be zero (= anything that matches).
+	 */
+	if (protocol != UDSPROTO_UDS)
+		return EPROTONOSUPPORT;
+
+	if ((r = uds_alloc(&uds)) != OK)
+		return r;
+
+	dprintf(("UDS: socket returns %d\n", uds_get_id(uds)));
+
+	*sockp = &uds->uds_sock;
+	*ops = &uds_ops;
+	return uds_get_id(uds);
+}
+
+/*
+ * Connect a pair of sockets.
+ */
+static int
+uds_pair(struct sock * sock1, struct sock * sock2, endpoint_t user_endpt)
+{
+	struct udssock *uds1 = (struct udssock *)sock1;
+	struct udssock *uds2 = (struct udssock *)sock2;
+
+	dprintf(("UDS: pair(%d,%d)\n", uds_get_id(uds1), uds_get_id(uds2)));
+
+	/* Only connection-oriented types are acceptable. */
+	if (uds_get_type(uds1) == SOCK_DGRAM)
+		return EOPNOTSUPP;
+
+	/* Connect the sockets. */
+	uds1->uds_conn = uds2;
+	uds2->uds_conn = uds1;
+	uds1->uds_flags |= UDSF_CONNECTED;
+	uds2->uds_flags |= UDSF_CONNECTED;
+
+	/* Obtain the (same) credentials for both sides of the connection. */
+	uds_get_cred(uds1, user_endpt);
+	memcpy(&uds2->uds_cred, &uds1->uds_cred, sizeof(uds2->uds_cred));
+
+	return OK;
+}
+
+/*
+ * Disconnect a UDS socket, notifying or freeing up the other end of the
+ * connection depending on whether the socket was linked, that is, on the
+ * accept queue of a listening socket.
+ */
+static void
+uds_disconnect(struct udssock * uds, int was_linked)
+{
+	struct udssock *conn;
+
+	assert(uds_is_connected(uds));
+	assert(uds_has_conn(uds));
+
+	conn = uds->uds_conn;
+
+	assert(uds_is_connected(conn));
+	assert(uds_has_conn(conn));
+	assert(!uds_has_link(conn));
+	assert(conn->uds_conn == uds);
+
+	/* Disconnect the sockets. */
+	uds->uds_conn = NULL;
+	conn->uds_conn = NULL;
+
+	/*
+	 * If the given socket is linked, then it is a connected socket for
+	 * which the other end has been created but not yet accepted.  In that
+	 * case, the other end ('conn') will have to be freed up.  Otherwise,
+	 * it is a regular user-created socket and we must properly transition
+	 * it into disconnected state.
+	 */
+	if (!was_linked) {
+		sockevent_raise(&conn->uds_sock, SEV_SEND | SEV_RECV);
+
+		/*
+		 * Clear the peer credentials so that they will not be mistaken
+		 * for having been obtained at bind time.
+		 */
+		uds_clear_cred(conn);
+	} else
+		sockevent_raise(&conn->uds_sock, SEV_CLOSE);
+}
+
+/*
+ * Add the socket 'link' to the queue of the socket 'uds'.  This also implies
+ * that 'link's link socket is set to 'uds'.
+ */
+static void
+uds_add_queue(struct udssock * uds, struct udssock * link)
+{
+
+	dprintf(("UDS: add_queue(%d,%d)\n",
+	    uds_get_id(uds), uds_get_id(link)));
+
+	TAILQ_INSERT_TAIL(&uds->uds_queue, link, uds_next);
+
+	uds->uds_queued++;
+	assert(uds->uds_queued != 0);
+
+	link->uds_link = uds;
+}
+
+/*
+ * Remove the socket 'link' from the queue of the socket 'uds'.  This also
+ * resets 'link's link to NULL.
+ */
+static void
+uds_del_queue(struct udssock * uds, struct udssock * link)
+{
+
+	dprintf(("UDS: del_queue(%d,%d)\n",
+	    uds_get_id(uds), uds_get_id(link)));
+
+	assert(link->uds_link == uds);
+
+	TAILQ_REMOVE(&uds->uds_queue, link, uds_next);
+
+	assert(uds->uds_queued > 0);
+	uds->uds_queued--;
+
+	link->uds_link = NULL;
+}
+
+/*
+ * Remove all sockets from the queue of the socket 'uds', with the exception of
+ * 'except' if non-NULL.  Raise an ECONNRESET error on all removed sockets that
+ * are not equal to 'uds'.
+ */
+static void
+uds_clear_queue(struct udssock * uds, struct udssock * except)
+{
+	struct udssock *link, *tmp;
+	int found;
+
+	dprintf(("UDS: clear_queue(%d,%d)\n",
+	    uds_get_id(uds), (except != NULL) ? uds_get_id(except) : -1));
+
+	found = 0;
+
+	/*
+	 * Abort all connecting sockets queued on this socket, except for the
+	 * given exception, which may be NULL.
+	 */
+	TAILQ_FOREACH_SAFE(link, &uds->uds_queue, uds_next, tmp) {
+		if (link == except) {
+			found++;
+
+			continue;
+		}
+
+		dprintf(("UDS: clear_queue removes %d\n", uds_get_id(link)));
+
+		assert(uds_get_type(link) == SOCK_DGRAM ||
+		    uds_is_connecting(link) || uds_is_connected(link));
+
+		uds_del_queue(uds, link);
+
+		/*
+		 * Generate an error only if the socket was not linked to
+		 * itself (only datagram sockets can be linked to themselves).
+		 * The error is not helpful for applications in that case.
+		 */
+		if (uds != link)
+			sockevent_set_error(&link->uds_sock, ECONNRESET);
+
+		/*
+		 * If this is a listening socket, disconnect the connecting or
+		 * connected end.  If a connected peer was already created for
+		 * the queued socket, dispose of that peer.
+		 *
+		 * Clear credentials obtained when starting to connect (in
+		 * which case the socket is always a connection-oriented
+		 * socket), so that they will not be mistaken for credentials
+		 * obtained at bind time.
+		 */
+		if (uds_get_type(link) != SOCK_DGRAM) {
+			if (uds_is_connected(link))
+				uds_disconnect(link, TRUE /*was_linked*/);
+			else
+				uds_clear_cred(link);
+		}
+	}
+
+	assert(uds->uds_queued == found);
+}
+
+/*
+ * Check whether the socket address given in 'addr', with length 'addr_len', is
+ * a valid UNIX domain socket address (including a path to a socket file).  On
+ * success, return the (non-zero) length of the socket file's path, minus the
+ * null terminator which may in fact not be present.  The caller is responsible
+ * for copying and terminating the path as needed.  A pointer to the path as
+ * stored in 'addr' is returned in 'pathp'.  On failure, return an error code.
+ */
+static int
+uds_check_addr(const struct sockaddr * addr, socklen_t addr_len,
+	const char ** pathp)
+{
+	const char *p;
+	size_t len;
+
+	/*
+	 * We could cast to a sockaddr_un structure pointer first, but that
+	 * would not provide any benefits here.  Instead, we use sa_data as the
+	 * generic equivalent of sun_path.
+	 */
+	if (addr_len < offsetof(struct sockaddr, sa_data))
 		return EINVAL;
 
-	peer = uds_fd_table[minor].peer;
-	if (peer != -1 && uds_fd_table[peer].peer == -1) {
-		/* Connecting socket: clear from server's backlog. */
-		if (!uds_fd_table[peer].listening)
-			panic("connecting socket attached to non-server");
+	if (addr->sa_family != AF_UNIX)
+		return EAFNOSUPPORT;
 
-		for (i = 0; i < uds_fd_table[peer].backlog_size; i++) {
-			if (uds_fd_table[peer].backlog[i] == minor) {
-				uds_fd_table[peer].backlog[i] = -1;
-				break;
-			}
-		}
-	} else if (peer != -1) {
-		/* Connected socket: disconnect it. */
-		uds_reset(peer);
-	} else if (uds_fd_table[minor].listening) {
-		/* Listening socket: disconnect all sockets in the backlog. */
-		for (i = 0; i < uds_fd_table[minor].backlog_size; i++)
-			if (uds_fd_table[minor].backlog[i] != -1)
-				uds_reset(uds_fd_table[minor].backlog[i]);
+	len = (size_t)addr_len - offsetof(struct sockaddr, sa_data);
+	if (len > 0 && (p = memchr(addr->sa_data, '\0', len)) != NULL)
+		len = (size_t)(p - addr->sa_data);
+
+	/* The given path name must not be an empty string. */
+	if (len == 0)
+		return ENOENT;
+
+	/* This check should be redundant but better safe than sorry. */
+	if (len >= UDS_PATH_MAX)
+		return EINVAL;
+
+	*pathp = (const char *)addr->sa_data;
+	return len;
+}
+
+/*
+ * Given the socket file path given as 'path' with length 'len' (not
+ * necessarily null terminated), store a socket address with the path in
+ * 'addr', and return the socket address length in 'addr_len'.  The calling
+ * libraries (libsockdriver, libsockevent) and the static assert in uds.h
+ * guarantee that 'addr' is sufficiently large to store any address we generate
+ * here.  The libraries may subsequently copy out only a part of it to the user
+ * process.  This function always succeeds.
+ */
+void
+uds_make_addr(const char * path, size_t len, struct sockaddr * addr,
+	socklen_t * addr_len)
+{
+
+	/*
+	 * Generate the address.  The stored length (sa_len/sun_len) does not
+	 * include a null terminator.  The entire structure does include a null
+	 * terminator, but only if the socket is bound.
+	 */
+	addr->sa_len = offsetof(struct sockaddr, sa_data) + len;
+	addr->sa_family = AF_UNIX;
+	if (len > 0) {
+		/* This call may (intentionally) overrun the sa_data size. */
+		memcpy((char *)addr->sa_data, path, len);
+		((char *)addr->sa_data)[len] = '\0';
+
+		/* The socket is bound, so include the null terminator. */
+		len++;
+		assert(len <= UDS_PATH_MAX);
 	}
 
-	if (uds_fd_table[minor].ancillary_data.nfiledes > 0)
-		uds_clear_fds(minor, &uds_fd_table[minor].ancillary_data);
+	/* Note that this length may be different from sa_len/sun_len now. */
+	*addr_len = offsetof(struct sockaddr, sa_data) + len;
+}
 
-	/* Release the memory for the ring buffer. */
-	munmap(uds_fd_table[minor].buf, UDS_BUF);
+/*
+ * Bind a socket to a local address.
+ */
+static int
+uds_bind(struct sock * sock, const struct sockaddr * addr, socklen_t addr_len,
+	endpoint_t user_endpt)
+{
+	struct udssock *uds = (struct udssock *)sock;
+	struct udssock *uds2;
+	const char *path;
+	size_t len;
+	dev_t dev;
+	ino_t ino;
+	int r;
 
-	/* Set the socket back to its original UDS_FREE state. */
-	memset(&uds_fd_table[minor], '\0', sizeof(uds_fd_t));
+	dprintf(("UDS: bind(%d)\n", uds_get_id(uds)));
 
-	/* If terminating, and this was the last open socket, exit now. */
-	if (uds_exit_left > 0) {
-		if (--uds_exit_left == 0)
-			chardriver_terminate();
+	/* A socket may be bound at any time, but only once. */
+	if (uds_is_bound(uds))
+		return EINVAL;
+
+	/* Verify that the user gave us an acceptable address. */
+	if ((r = uds_check_addr(addr, addr_len, &path)) < 0)
+		return r;
+	len = (size_t)r;
+
+	/* Attempt to create the socket file on the file system. */
+	r = socketpath(user_endpt, path, len, SPATH_CREATE, &dev, &ino);
+	if (r != OK)
+		return r;
+	assert(dev != NO_DEV && ino != 0);
+
+	/*
+	 * It is possible that a socket file of a previously bound socket was
+	 * unlinked, and due to inode number reuse, a new socket file has now
+	 * been created with the same <dev,ino> pair.  In that case, we must
+	 * unbind the old socket, because it must no longer be found.  The old
+	 * socket will still have a path (and behave as though it is bound) but
+	 * no longer be found through hash lookups.
+	 */
+	if ((uds2 = udshash_get(dev, ino)) != NULL) {
+		udshash_del(uds2);
+
+		uds2->uds_dev = NO_DEV;
+		uds2->uds_ino = 0;
+	}
+
+	/*
+	 * Obtain credentials for the socket, unless the socket is already
+	 * connecting or connected, in which case we must not replace the
+	 * credentials we obtained already.  We later clear those credentials
+	 * upon a connection failure or disconnect, so that if the socket is
+	 * then put in listening mode, we know there are no bind-time
+	 * credentials.  Not ideal, but we really need two separate sets of
+	 * credentials if we want to get this right, which is a waste of memory
+	 * as no sane application writer would ever rely on credential passing
+	 * after recycling a socket.
+	 */
+	if (uds_get_type(uds) != SOCK_DGRAM && !uds_is_connecting(uds) &&
+	    !uds_is_connected(uds))
+		uds_get_cred(uds, user_endpt);
+
+	/* Assign the address to the socket. */
+	uds->uds_pathlen = len;
+	memcpy(&uds->uds_path, path, len);
+	uds->uds_dev = dev;
+	uds->uds_ino = ino;
+
+	udshash_add(uds);
+
+	return OK;
+}
+
+/*
+ * Look up a UDS socket based on a user-given address.  If a socket exists for
+ * the address, check if it is type-compatible with the given UDS socket.
+ * On success, return OK, with 'peerp' set to the socket that was found.  On
+ * failure, return a negative error code.
+ */
+int
+uds_lookup(struct udssock * uds, const struct sockaddr * addr,
+	socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp)
+{
+	struct udssock *peer;
+	const char *path;
+	size_t len;
+	dev_t dev;
+	ino_t ino;
+	int r;
+
+	/* Verify that the user gave us an acceptable address. */
+	if ((r = uds_check_addr(addr, addr_len, &path)) < 0)
+		return r;
+	len = (size_t)r;
+
+	/* Attempt to look up the socket file on the file system. */
+	r = socketpath(user_endpt, path, len, SPATH_CHECK, &dev, &ino);
+	if (r != OK)
+		return r;
+	assert(dev != NO_DEV && ino != 0);
+
+	if ((peer = udshash_get(dev, ino)) == NULL)
+		return ECONNREFUSED;
+	if (uds_get_type(peer) != uds_get_type(uds))
+		return EPROTOTYPE;
+
+	*peerp = peer;
+	return OK;
+}
+
+/*
+ * Given the listening socket 'uds', and the socket 'link' that is calling or
+ * has called connect(2) and is or will be linked to the listening socket's
+ * queue, create a new socket and connect it to 'link', putting both sockets in
+ * the connected state.  The given link socket may be in unconnected,
+ * connecting, or disconnected state prior to the call.  Return OK or an error
+ * code.  The link state of the link socket remains unchanged in any case.
+ */
+static int
+uds_attach(struct udssock * uds, struct udssock * link)
+{
+	struct udssock *conn;
+	int r;
+
+	/*
+	 * Allocate a new socket to use as peer socket for the connection that
+	 * is about to be established.  The new socket is not yet known by
+	 * libsockevent.
+	 */
+	if ((r = uds_alloc(&conn)) != OK)
+		return r;
+
+	/*
+	 * Ask libsockevent to clone the sock object in the new UDS socket from
+	 * the listening socket.  This adds the sock object to libsockevent's
+	 * data structures and ensures that we can safely use the socket
+	 * despite the fact that it has not yet been accepted (and thus
+	 * returned to libsockevent).  From this moment on, we must either
+	 * return the socket's ID (but not a pointer to it!) from uds_accept()
+	 * or raise SEV_CLOSE on it.
+	 */
+	sockevent_clone(&uds->uds_sock, &conn->uds_sock, uds_get_id(conn));
+
+	/* Connect the link socket to the new socket. */
+	link->uds_conn = conn;
+	link->uds_flags |= UDSF_CONNECTED;
+
+	/*
+	 * Connect the new socket to the link socket as well.  The child
+	 * socket should also inherit pretty much all settings from the
+	 * listening socket, including the bind path and the listening socket's
+	 * bind-time credentials.
+	 */
+	conn->uds_conn = link;
+	conn->uds_flags = uds->uds_flags & (UDSF_PASSCRED | UDSF_CONNWAIT);
+	conn->uds_flags |= UDSF_CONNECTED;
+	conn->uds_pathlen = uds->uds_pathlen;
+	memcpy(conn->uds_path, uds->uds_path, (size_t)uds->uds_pathlen);
+	memcpy(&conn->uds_cred, &uds->uds_cred, sizeof(conn->uds_cred));
+
+	return OK;
+}
+
+/*
+ * Connect a socket to a remote address.
+ */
+static int
+uds_connect(struct sock * sock, const struct sockaddr * addr,
+	socklen_t addr_len, endpoint_t user_endpt)
+{
+	struct udssock *uds = (struct udssock *)sock;
+	struct udssock *link;
+	int r;
+
+	dprintf(("UDS: connect(%d)\n", uds_get_id(uds)));
+
+	/* For connection-oriented sockets, several state checks apply. */
+	if (uds_get_type(uds) != SOCK_DGRAM) {
+		if (uds_is_listening(uds))
+			return EOPNOTSUPP;
+		if (uds_is_connecting(uds))
+			return EALREADY;
+		if (uds_is_connected(uds))
+			return EISCONN;
+		/* Disconnected sockets may be reconnected, see below. */
+	} else {
+		/*
+		 * Connectionless sockets may be unconnected by providing an
+		 * address with family AF_UNSPEC.  Handle this case first here.
+		 */
+		if (addr_len >= offsetof(struct sockaddr, sa_data) &&
+		    addr->sa_family == AF_UNSPEC) {
+			/*
+			 * Reset this socket's previous connection to another
+			 * socket, if any.  Unconnecting has no effect on other
+			 * sockets connected to this socket, though.
+			 */
+			if (uds_has_link(uds))
+				uds_del_queue(uds->uds_link, uds);
+
+			return OK;
+		}
+	}
+
+	/*
+	 * Find the socket identified by the given address.  If it exists at
+	 * all, see if it is a proper match.
+	 */
+	if ((r = uds_lookup(uds, addr, addr_len, user_endpt, &link)) != OK)
+		return r;
+
+	/*
+	 * Handle connectionless sockets first, in which case a connect links
+	 * the socket to a send target and limits receipt to datagrams from
+	 * that target.  We actually point the socket to the peer socket,
+	 * through uds_link.  That also means that if the target socket
+	 * disappears, we have to reset any sockets connected to it, in which
+	 * case we return them to the unconnected state.  In order to allow
+	 * finding all sockets connected to a particular socket, we put all
+	 * those sockets on their target's queue, hence why we use uds_link and
+	 * not uds_conn.  As mentioned before, we allow reconnecting without
+	 * restrictions.
+	 * TODO: see if reconnecting should clear a pending ECONNRESET.
+	 *
+	 * An important note: 'uds' and 'link' may actually be the same socket,
+	 * if the caller chooses to connect a socket with itself!
+	 */
+	if (uds_get_type(uds) == SOCK_DGRAM) {
+		/* Reconnecting to the same socket has no effect. */
+		if (uds_has_link(uds) && uds->uds_link == link)
+			return OK;
+
+		/*
+		 * If the intended target is linked to another socket, we
+		 * refuse linking to it.  Sending or receiving would never work
+		 * anyway.  Do allow a socket to link to itself after being
+		 * linked to another socket.  The error code is the same as in
+		 * the sending code, borrowed from Linux.
+		 */
+		if (uds != link && uds_has_link(link) && link->uds_link != uds)
+			return EPERM;
+
+		/*
+		 * Reset this socket's previous link to another socket, if any.
+		 */
+		if (uds_has_link(uds))
+			uds_del_queue(uds->uds_link, uds);
+
+		/*
+		 * Reset any links to this socket, except for the one by
+		 * the intended target.  Sending or receiving would no longer
+		 * work anyway.  If the socket was linked to itself, clear its
+		 * self-link without generating an ECONNRESET.  If the socket
+		 * is relinking to itself, reestablish the link after first
+		 * clearing it.
+		 */
+		uds_clear_queue(uds, (uds != link) ? link : NULL);
+
+		uds_add_queue(link, uds);
+
+		return OK;
+	}
+
+	/*
+	 * For connection-oriented sockets there is more to do.  First, make
+	 * sure that the peer is a listening socket, that it has not been shut
+	 * down, and that its backlog is not already at the configured maximum.
+	 */
+	if (!uds_is_listening(link))
+		return ECONNREFUSED;
+
+	if (uds_is_shutdown(link, SFL_SHUT_RD | SFL_SHUT_WR))
+		return ECONNREFUSED;
+
+	if (link->uds_queued >= link->uds_backlog)
+		return ECONNREFUSED;
+
+	/*
+	 * The behavior of connect(2) now depends on whether LOCAL_CONNWAIT is
+	 * set on either the connecting or the listening socket.  If it is not,
+	 * the socket will be connected to a new as-yet invisible socket, which
+	 * will be the one returned from accept(2) later.  If it was, the
+	 * socket will be put in the connecting state.
+	 */
+	if (!((uds->uds_flags | link->uds_flags) & UDSF_CONNWAIT)) {
+		if ((r = uds_attach(link, uds)) != OK)
+			return r;
+
+		assert(uds_is_connected(uds));
+	} else {
+		/*
+		 * Disconnected sockets now stop being connected.  Any pending
+		 * data can still be received, though.
+		 */
+		uds->uds_flags &= ~UDSF_CONNECTED;
+
+		r = SUSPEND;
+	}
+
+	/* Obtain credentials for the socket. */
+	uds_get_cred(uds, user_endpt);
+
+	/* Add the socket at the end of the listening socket's queue. */
+	uds_add_queue(link, uds);
+
+	assert(r != SUSPEND || uds_is_connecting(uds));
+
+	/*
+	 * Let an accept call handle the rest, which will in turn resume this
+	 * connect call.  The sockevent library ensures that this works even if
+	 * the call is non-blocking.
+	 */
+	sockevent_raise(&link->uds_sock, SEV_ACCEPT);
+
+	return r;
+}
+
+/*
+ * Put a socket in listening mode.
+ */
+static int
+uds_listen(struct sock * sock, int backlog)
+{
+	struct udssock *uds = (struct udssock *)sock;
+
+	/* The maximum backlog value must not exceed its field size. */
+	assert(SOMAXCONN <= USHRT_MAX);
+
+	dprintf(("UDS: listen(%d)\n", uds_get_id(uds)));
+
+	/* Only connection-oriented types may be put in listening mode. */
+	if (uds_get_type(uds) == SOCK_DGRAM)
+		return EOPNOTSUPP;
+
+	/* A connecting or connected socket may not listen. */
+	if (uds_is_connecting(uds) || uds_is_connected(uds))
+		return EINVAL;
+
+	/* POSIX says that this is now the appropriate error code here. */
+	if (!uds_is_bound(uds))
+		return EDESTADDRREQ;
+
+	/*
+	 * The socket is now entering the listening state.  If it was
+	 * previously disconnected, clear the connection flag.
+	 */
+	uds->uds_flags &= ~UDSF_CONNECTED;
+
+	/*
+	 * We do not remove sockets from the backlog if it is now being dropped
+	 * below the current number of queued sockets.  We only refuse newly
+	 * connecting sockets beyond the backlog size.
+	 */
+	uds->uds_backlog = backlog;
+
+	return OK;
+}
+
+/*
+ * Test whether an accept request would block.  Return OK if a socket could be
+ * accepted, an appropriate error code if an accept call would fail instantly,
+ * or SUSPEND if the accept request would block waiting for a connection.
+ */
+static int
+uds_test_accept(struct sock * sock)
+{
+	struct udssock *uds = (struct udssock *)sock;
+
+	/*
+	 * Ensure that the socket is in listening mode.  If not, we must return
+	 * the error code that is appropriate for this socket type.
+	 */
+	if (uds_get_type(uds) == SOCK_DGRAM)
+		return EOPNOTSUPP;
+	if (!uds_is_listening(uds))
+		return EINVAL;
+
+	/*
+	 * If the socket has been shut down, new connections are no longer
+	 * accepted and accept calls no longer block.  This is not a POSIX
+	 * requirement, but rather an application convenience feature.
+	 */
+	if (uds->uds_queued == 0) {
+		if (uds_is_shutdown(uds, SFL_SHUT_RD | SFL_SHUT_WR))
+			return ECONNABORTED;
+
+		return SUSPEND;
 	}
 
 	return OK;
 }
 
-static int
-uds_select(devminor_t minor, unsigned int ops, endpoint_t endpt)
+/*
+ * Accept a connection on a listening socket, yielding a new socket.  On
+ * success, return the new socket's identifier, with 'newsockp' set to NULL as
+ * the socket was already cloned.  Otherwise, return an error code.
+ */
+static sockid_t
+uds_accept(struct sock * sock, struct sockaddr * addr, socklen_t * addr_len,
+	endpoint_t user_endpt __unused, struct sock ** newsockp)
 {
-	unsigned int ready_ops;
-	int i, bytes, watch;
+	struct udssock *uds = (struct udssock *)sock;
+	struct udssock *link, *conn;
+	sockid_t r;
 
-	dprintf(("UDS: uds_select(%d)\n", minor));
+	dprintf(("UDS: accept(%d)\n", uds_get_id(uds)));
 
-	if (minor < 0 || minor >= NR_FDS) return ENXIO;
-
-	if (uds_fd_table[minor].state != UDS_INUSE)
-		return EINVAL;
-
-	watch = (ops & CDEV_NOTIFY);
-	ops &= (CDEV_OP_RD | CDEV_OP_WR | CDEV_OP_ERR);
-
-	ready_ops = 0;
-
-	/* Check if there is data available to read. */
-	if (ops & CDEV_OP_RD) {
-		bytes = uds_perform_read(minor, NONE, GRANT_INVALID, 1, 1);
-		if (bytes > 0) {
-			ready_ops |= CDEV_OP_RD;	/* data available */
-		} else if (uds_fd_table[minor].listening == TRUE) {
-			/* Check for pending connections. */
-			for (i = 0; i < uds_fd_table[minor].backlog_size; i++)
-			{
-				if (uds_fd_table[minor].backlog[i] != -1) {
-					ready_ops |= CDEV_OP_RD;
-					break;
-				}
-			}
-		} else if (bytes != EDONTREPLY) {
-			ready_ops |= CDEV_OP_RD;	/* error */
-		}
-	}
-
-	/* Check if we can write without blocking. */
-	if (ops & CDEV_OP_WR) {
-		bytes = uds_perform_write(minor, NONE, GRANT_INVALID, 1, 1);
-		if (bytes != 0 && bytes != EDONTREPLY)
-			ready_ops |= CDEV_OP_WR;
-	}
-
-	/*
-	 * If not all requested ops were ready, and the caller requests to be
-	 * notified about changes, we add the remaining ops to the saved set.
-	 */
-	ops &= ~ready_ops;
-	if (ops && watch) {
-		uds_fd_table[minor].sel_endpt = endpt;
-		uds_fd_table[minor].sel_ops |= ops;
-	}
-
-	return ready_ops;
-}
-
-ssize_t
-uds_perform_read(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant,
-	size_t size, int pretend)
-{
-	size_t pos, subsize;
-	int r, peer;
-
-	dprintf(("UDS: uds_perform_read(%d)\n", minor));
-
-	peer = uds_fd_table[minor].peer;
-
-	/* Skip reads of zero bytes. */
-	if (size == 0)
-		return 0;
-
-	/* Check if the socket isn't shut down for reads. */
-	if (!(uds_fd_table[minor].mode & UDS_R))
-		return EPIPE;
-
-	if (uds_fd_table[minor].size == 0) {
-		if (peer == -1) {
-			/*
-			 * We're not connected. That's only a problem when this
-			 * socket is connection oriented.
-			 */
-			if (uds_fd_table[minor].type == SOCK_STREAM ||
-			    uds_fd_table[minor].type == SOCK_SEQPACKET) {
-				if (uds_fd_table[minor].err == ECONNRESET) {
-					if (!pretend)
-						uds_fd_table[minor].err = 0;
-					return ECONNRESET;
-				} else
-					return ENOTCONN;
-			}
-		}
-
-		/* Check if process is reading from a closed pipe. */
-		if (peer != -1 && !(uds_fd_table[peer].mode & UDS_W) &&
-		    uds_fd_table[minor].size == 0)
-			return 0;
-
-		if (pretend)
-			return EDONTREPLY;
-
-		if (peer != -1 &&
-			uds_fd_table[peer].suspended == UDS_SUSPENDED_WRITE)
-			panic("writer blocked on empty socket");
-
-		dprintf(("UDS: suspending read request on %d\n", minor));
-
-		/* Process is reading from an empty pipe.  Suspend it. */
-		return EDONTREPLY;
-	}
-
-	/* How much can we get from the ring buffer? */
-	if (size > uds_fd_table[minor].size)
-		size = uds_fd_table[minor].size;
-
-	if (pretend)
-		return size;
-
-	/* Get the data from the tail of the ring buffer. */
-	pos = uds_fd_table[minor].pos;
-
-	subsize = UDS_BUF - pos;
-	if (subsize > size)
-		subsize = size;
-
-	if ((r = sys_safecopyto(endpt, grant, 0,
-	    (vir_bytes) &uds_fd_table[minor].buf[pos], subsize)) != OK)
+	if ((r = uds_test_accept(sock)) != OK)
 		return r;
 
-	if (subsize < size) {
-		if ((r = sys_safecopyto(endpt, grant, subsize,
-		    (vir_bytes) uds_fd_table[minor].buf,
-		    size - subsize)) != OK)
-			return r;
-	}
+	/*
+	 * Take the first connecting socket off the listening queue.
+	 */
+	assert(!TAILQ_EMPTY(&uds->uds_queue));
 
-	/* Advance the buffer tail. */
-	uds_fd_table[minor].pos = (pos + size) % UDS_BUF;
-	uds_fd_table[minor].size -= size;
+	link = TAILQ_FIRST(&uds->uds_queue);
 
-	/* Reset position if the buffer is empty (it may save a copy call). */
-	if (uds_fd_table[minor].size == 0)
-		uds_fd_table[minor].pos = 0;
+	/*
+	 * Depending on the LOCAL_CONNWAIT setting at the time of connect(2),
+	 * the socket may be connecting or connected.  In the latter case, its
+	 * attached socket is the socket we will return now.  Otherwise we have
+	 * to attach a socket first.
+	 */
+	assert(uds_is_connecting(link) || uds_is_connected(link));
 
-	/* See if we can wake up a blocked writer. */
-	if (peer != -1 && uds_fd_table[peer].suspended == UDS_SUSPENDED_WRITE)
-		uds_unsuspend(peer);
-
-	/* See if we can satisfy an ongoing select. */
-	if (peer != -1 && (uds_fd_table[peer].sel_ops & CDEV_OP_WR) &&
-	    uds_fd_table[minor].size < UDS_BUF) {
-		/* A write on the peer is possible now. */
-		chardriver_reply_select(uds_fd_table[peer].sel_endpt, peer,
-		    CDEV_OP_WR);
-		uds_fd_table[peer].sel_ops &= ~CDEV_OP_WR;
-	}
-
-	return size; /* number of bytes read */
-}
-
-static ssize_t
-uds_perform_write(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant,
-	size_t size, int pretend)
-{
-	size_t subsize, pos;
-	int i, r, peer;
-
-	dprintf(("UDS: uds_perform_write(%d)\n", minor));
-
-	/* Skip writes of zero bytes. */
-	if (size == 0)
-		return 0;
-
-	/* Check if the socket isn't shut down for writes. */
-	if (!(uds_fd_table[minor].mode & UDS_W))
-		return EPIPE;
-
-	/* Datagram messages must fit in the buffer entirely. */
-	if (size > UDS_BUF && uds_fd_table[minor].type != SOCK_STREAM)
-		return EMSGSIZE;
-
-	if (uds_fd_table[minor].type == SOCK_STREAM ||
-	    uds_fd_table[minor].type == SOCK_SEQPACKET) {
+	if (uds_is_connecting(link)) {
 		/*
-		 * If we're writing to a connection-oriented socket, then it
-		 * needs a peer to write to.  For disconnected sockets, writing
-		 * is an error; for connecting sockets, writes should suspend.
+		 * Attach a new socket.  If this fails, return the error but
+		 * leave the connecting socket on the listening queue.
 		 */
-		peer = uds_fd_table[minor].peer;
-
-		if (peer == -1) {
-			if (uds_fd_table[minor].err == ECONNRESET) {
-				if (!pretend)
-					uds_fd_table[minor].err = 0;
-				return ECONNRESET;
-			} else
-				return ENOTCONN;
-		} else if (uds_fd_table[peer].peer == -1) /* connecting */
-			return EDONTREPLY;
-	} else /* uds_fd_table[minor].type == SOCK_DGRAM */ {
-		peer = -1;
-
-		/* Locate the "peer" we want to write to. */
-		for (i = 0; i < NR_FDS; i++) {
-			/*
-			 * Look for a SOCK_DGRAM socket that is bound on
-			 * the target address.
-			 */
-			if (uds_fd_table[i].type == SOCK_DGRAM &&
-			    uds_fd_table[i].stale == FALSE &&
-			    uds_fd_table[i].addr.sun_family == AF_UNIX &&
-			    !strncmp(uds_fd_table[minor].target.sun_path,
-			    uds_fd_table[i].addr.sun_path,
-			    sizeof(uds_fd_table[i].addr.sun_path))) {
-				peer = i;
-				break;
-			}
-		}
-
-		if (peer == -1)
-			return ENOENT;
-	}
-
-	/* Check if we write to a closed pipe. */
-	if (!(uds_fd_table[peer].mode & UDS_R))
-		return EPIPE;
-
-	/*
-	 * We have to preserve the boundary for DGRAM.  If there's already a
-	 * packet waiting, discard it silently and pretend it was written.
-	 */
-	if (uds_fd_table[minor].type == SOCK_DGRAM &&
-	    uds_fd_table[peer].size > 0)
-		return size;
-
-	/*
-	 * Check if the ring buffer is already full, and if the SEQPACKET
-	 * message wouldn't write to an empty buffer.
-	 */
-	if (uds_fd_table[peer].size == UDS_BUF ||
-	    (uds_fd_table[minor].type == SOCK_SEQPACKET &&
-	    uds_fd_table[peer].size > 0)) {
-		if (pretend)
-			return EDONTREPLY;
-
-		if (uds_fd_table[peer].suspended == UDS_SUSPENDED_READ)
-			panic("reader blocked on full socket");
-
-		dprintf(("UDS: suspending write request on %d\n", minor));
-
-		/* Process is reading from an empty pipe.  Suspend it. */
-		return EDONTREPLY;
-	}
-
-	/* How much can we add to the ring buffer? */
-	if (size > UDS_BUF - uds_fd_table[peer].size)
-		size = UDS_BUF - uds_fd_table[peer].size;
-
-	if (pretend)
-		return size;
-
-	/* Put the data at the head of the ring buffer. */
-	pos = (uds_fd_table[peer].pos + uds_fd_table[peer].size) % UDS_BUF;
-
-	subsize = UDS_BUF - pos;
-	if (subsize > size)
-		subsize = size;
-
-	if ((r = sys_safecopyfrom(endpt, grant, 0,
-	    (vir_bytes) &uds_fd_table[peer].buf[pos], subsize)) != OK)
-		return r;
-
-	if (subsize < size) {
-		if ((r = sys_safecopyfrom(endpt, grant, subsize,
-		    (vir_bytes) uds_fd_table[peer].buf, size - subsize)) != OK)
+		if ((r = uds_attach(uds, link)) != OK)
 			return r;
-	}
 
-	/* Advance the buffer head. */
-	uds_fd_table[peer].size += size;
+		assert(uds_is_connected(link));
 
-	/* Fill in the source address to be returned by recvfrom, recvmsg. */
-	if (uds_fd_table[minor].type == SOCK_DGRAM)
-		memcpy(&uds_fd_table[peer].source, &uds_fd_table[minor].addr,
-		    sizeof(struct sockaddr_un));
-
-	/* See if we can wake up a blocked reader. */
-	if (uds_fd_table[peer].suspended == UDS_SUSPENDED_READ)
-		uds_unsuspend(peer);
-
-	/* See if we can satisfy an ongoing select. */
-	if ((uds_fd_table[peer].sel_ops & CDEV_OP_RD) &&
-	    uds_fd_table[peer].size > 0) {
-		/* A read on the peer is possible now. */
-		chardriver_reply_select(uds_fd_table[peer].sel_endpt, peer,
-		    CDEV_OP_RD);
-		uds_fd_table[peer].sel_ops &= ~CDEV_OP_RD;
-	}
-
-	return size; /* number of bytes written */
-}
-
-static ssize_t
-uds_read(devminor_t minor, u64_t position, endpoint_t endpt,
-	cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
-{
-	ssize_t rc;
-
-	dprintf(("UDS: uds_read(%d)\n", minor));
-
-	if (minor < 0 || minor >= NR_FDS) return ENXIO;
-
-	if (uds_fd_table[minor].state != UDS_INUSE)
-		return EINVAL;
-
-	rc = uds_perform_read(minor, endpt, grant, size, 0);
-
-	/* If the call couldn't complete, suspend the caller. */
-	if (rc == EDONTREPLY) {
-		uds_fd_table[minor].suspended = UDS_SUSPENDED_READ;
-		uds_fd_table[minor].susp_endpt = endpt;
-		uds_fd_table[minor].susp_grant = grant;
-		uds_fd_table[minor].susp_size = size;
-		uds_fd_table[minor].susp_id = id;
-
-		/* If the call wasn't supposed to block, cancel immediately. */
-		if (flags & CDEV_NONBLOCK) {
-			uds_cancel(minor, endpt, id);
-
-			rc = EAGAIN;
-		}
-	}
-
-	return rc;
-}
-
-static ssize_t
-uds_write(devminor_t minor, u64_t position, endpoint_t endpt,
-	cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
-{
-	ssize_t rc;
-
-	dprintf(("UDS: uds_write(%d)\n", minor));
-
-	if (minor < 0 || minor >= NR_FDS) return ENXIO;
-
-	if (uds_fd_table[minor].state != UDS_INUSE)
-		return EINVAL;
-
-	rc = uds_perform_write(minor, endpt, grant, size, 0);
-
-	/* If the call couldn't complete, suspend the caller. */
-	if (rc == EDONTREPLY) {
-		uds_fd_table[minor].suspended = UDS_SUSPENDED_WRITE;
-		uds_fd_table[minor].susp_endpt = endpt;
-		uds_fd_table[minor].susp_grant = grant;
-		uds_fd_table[minor].susp_size = size;
-		uds_fd_table[minor].susp_id = id;
-
-		/* If the call wasn't supposed to block, cancel immediately. */
-		if (flags & CDEV_NONBLOCK) {
-			uds_cancel(minor, endpt, id);
-
-			rc = EAGAIN;
-		}
-	}
-
-	return rc;
-}
-
-static int
-uds_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
-	cp_grant_id_t grant, int flags, endpoint_t user_endpt, cdev_id_t id)
-{
-	int rc, s;
-
-	dprintf(("UDS: uds_ioctl(%d, %lu)\n", minor, request));
-
-	if (minor < 0 || minor >= NR_FDS) return ENXIO;
-
-	if (uds_fd_table[minor].state != UDS_INUSE)
-		return EINVAL;
-
-	/* Update the owner endpoint. */
-	uds_fd_table[minor].owner = user_endpt;
-
-	/* Let the UDS ioctl subsystem handle the actual request. */
-	rc = uds_do_ioctl(minor, request, endpt, grant);
-
-	/* If the call couldn't complete, suspend the caller. */
-	if (rc == EDONTREPLY) {
-		/* The suspension type is already set by the IOCTL handler. */
-		if ((s = uds_fd_table[minor].suspended) == UDS_NOT_SUSPENDED)
-			panic("IOCTL did not actually suspend?");
-		uds_fd_table[minor].susp_endpt = endpt;
-		uds_fd_table[minor].susp_grant = grant;
-		uds_fd_table[minor].susp_size = 0; /* irrelevant */
-		uds_fd_table[minor].susp_id = id;
-
-		/* If the call wasn't supposed to block, cancel immediately. */
-		if (flags & CDEV_NONBLOCK) {
-			uds_cancel(minor, endpt, id);
-			if (s == UDS_SUSPENDED_CONNECT)
-				rc = EINPROGRESS;
-			else
-				rc = EAGAIN;
-		}
-	}
-
-	return rc;
-}
-
-void
-uds_unsuspend(devminor_t minor)
-{
-	int r;
-	uds_fd_t *fdp;
-
-	fdp = &uds_fd_table[minor];
-
-	switch (fdp->suspended) {
-	case UDS_SUSPENDED_READ:
-		r = uds_perform_read(minor, fdp->susp_endpt, fdp->susp_grant,
-		    fdp->susp_size, 0);
-
-		if (r == EDONTREPLY)
-			return;
-
-		break;
-
-	case UDS_SUSPENDED_WRITE:
-		r = uds_perform_write(minor, fdp->susp_endpt, fdp->susp_grant,
-		    fdp->susp_size, 0);
-
-		if (r == EDONTREPLY)
-			return;
-
-		break;
-
-	case UDS_SUSPENDED_CONNECT:
-	case UDS_SUSPENDED_ACCEPT:
 		/*
-		 * In both cases, the caller already set up the connection.
-		 * The only thing to do here is unblock.
+		 * Wake up blocked (connect, send, select) calls on the peer
+		 * socket.
 		 */
-		r = fdp->err;
-		fdp->err = 0;
-
-		break;
-
-	default:
-		panic("unknown suspension type %d", fdp->suspended);
+		sockevent_raise(&link->uds_sock, SEV_CONNECT);
 	}
 
-	chardriver_reply_task(fdp->susp_endpt, fdp->susp_id, r);
+	uds_del_queue(uds, link);
 
-	fdp->suspended = UDS_NOT_SUSPENDED;
-}
+	/* Return the peer socket's address to the caller. */
+	uds_make_addr(link->uds_path, link->uds_pathlen, addr, addr_len);
 
-static int
-uds_cancel(devminor_t minor, endpoint_t endpt, cdev_id_t id)
-{
-	uds_fd_t *fdp;
-	int i;
+	conn = link->uds_conn;
 
-	dprintf(("UDS: uds_cancel(%d)\n", minor));
-
-	if (minor < 0 || minor >= NR_FDS) return EDONTREPLY;
-
-	fdp = &uds_fd_table[minor];
-
-	if (fdp->state != UDS_INUSE) {
-		printf("UDS: cancel request for closed minor %d\n", minor);
-		return EDONTREPLY;
-	}
-
-	/* Make sure the cancel request is for a request we're hanging on. */
-	if (fdp->suspended == UDS_NOT_SUSPENDED || fdp->susp_endpt != endpt ||
-	    fdp->susp_id != id)
-		return EDONTREPLY;	/* this happens. */
+	dprintf(("UDS: accept returns %d\n", uds_get_id(conn)));
 
 	/*
-	 * The system call was cancelled, so the socket is not suspended
-	 * anymore.
+	 * We already cloned the sock object, so return its ID but not a
+	 * pointer to it.  That tells libsockevent not to reinitialize it.
 	 */
-	switch (fdp->suspended) {
-	case UDS_SUSPENDED_ACCEPT:
-		/* A partial accept() only sets the server's child. */
-		for (i = 0; i < NR_FDS; i++)
-			if (uds_fd_table[i].child == minor)
-				uds_fd_table[i].child = -1;
-
-		break;
-
-	case UDS_SUSPENDED_CONNECT:
-		/* Connect requests should continue asynchronously. */
-		break;
-
-	case UDS_SUSPENDED_READ:
-	case UDS_SUSPENDED_WRITE:
-		/* Nothing more to do. */
-		break;
-
-	default:
-		panic("unknown suspension type %d", fdp->suspended);
-	}
-
-	fdp->suspended = UDS_NOT_SUSPENDED;
-
-	return EINTR;	/* reply to the original request */
+	*newsockp = NULL;
+	return uds_get_id(conn);
 }
 
 /*
- * Initialize the server.
+ * Set socket options.
  */
 static int
-uds_init(int UNUSED(type), sef_init_info_t *UNUSED(info))
+uds_setsockopt(struct sock * sock, int level, int name,
+	const struct sockdriver_data * data, socklen_t len)
 {
-	/* Setting everything to NULL implicitly sets the state to UDS_FREE. */
-	memset(uds_fd_table, '\0', sizeof(uds_fd_t) * NR_FDS);
+	struct udssock *uds = (struct udssock *)sock;
+	int r, val;
 
-	uds_exit_left = 0;
+	dprintf(("UDS: setsockopt(%d,%d,%d)\n", uds_get_id(uds), level, name));
 
-	/* Announce we are up! */
-	chardriver_announce();
+	switch (level) {
+	case SOL_SOCKET:
+		switch (name) {
+		case SO_SNDBUF:
+		case SO_RCVBUF:
+			/*
+			 * The send buffer size may not be changed because the
+			 * buffer is the same as the other side's receive
+			 * buffer, and what the other side is may vary from
+			 * send call to send call.  Changing the receive buffer
+			 * size would disallow us from even accurately guessing
+			 * the send buffer size in getsockopt calls.  Therefore
+			 * both are hardcoded and cannot actually be changed.
+			 * In order to support applications that want at least
+			 * a certain minimum, we do accept requests to shrink
+			 * either buffer, but we ignore the given size.
+			 */
+			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
+			    len)) != OK)
+				return r;
 
-	return(OK);
+			if (val <= 0 || (size_t)val > uds_io_buflen())
+				return EINVAL;
+
+			return OK; /* ignore new value */
+		}
+
+		break;
+
+	case UDSPROTO_UDS:
+		switch (name) {
+		case LOCAL_CREDS:
+			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
+			    len)) != OK)
+				return r;
+
+			if (val)
+				uds->uds_flags |= UDSF_PASSCRED;
+			else
+				uds->uds_flags &= ~UDSF_PASSCRED;
+
+			/*
+			 * In incredibly rare cases, disabling this flag may
+			 * allow blocked sends to be resumed, because suddenly
+			 * no room for the credentials is needed in the receive
+			 * buffer anymore.
+			 */
+			if (!val)
+				sockevent_raise(&uds->uds_sock, SEV_SEND);
+
+			return OK;
+
+		case LOCAL_CONNWAIT:
+			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
+			    len)) != OK)
+				return r;
+
+			if (val)
+				uds->uds_flags |= UDSF_CONNWAIT;
+			else
+				uds->uds_flags &= ~UDSF_CONNWAIT;
+
+			/*
+			 * Changing the setting does not affect sockets that
+			 * are currently pending to be accepted.  Therefore,
+			 * uds_accept() may have to deal with either case on a
+			 * socket-by-socket basis.
+			 */
+			return OK;
+
+		case LOCAL_PEEREID:
+			/* This option may be retrieved but not set. */
+			return ENOPROTOOPT;
+		}
+
+		break;
+	}
+
+	return ENOPROTOOPT;
 }
 
+/*
+ * Retrieve socket options.
+ */
+static int
+uds_getsockopt(struct sock * sock, int level, int name,
+	const struct sockdriver_data * data, socklen_t * len)
+{
+	struct udssock *uds = (struct udssock *)sock;
+	int val;
+
+	dprintf(("UDS: getsockopt(%d,%d,%d)\n", uds_get_id(uds), level, name));
+
+	switch (level) {
+	case SOL_SOCKET:
+		switch (name) {
+		case SO_SNDBUF:
+		case SO_RCVBUF:
+			/* See uds_setsockopt() for why this is static. */
+			val = (int)uds_io_buflen();
+
+			return sockdriver_copyout_opt(data, &val, sizeof(val),
+			    len);
+		}
+
+		break;
+
+	case UDSPROTO_UDS:
+		switch (name) {
+		case LOCAL_CREDS:
+			val = !!(uds->uds_flags & UDSF_PASSCRED);
+
+			return sockdriver_copyout_opt(data, &val, sizeof(val),
+			    len);
+
+		case LOCAL_CONNWAIT:
+			val = !!(uds->uds_flags & UDSF_CONNWAIT);
+
+			return sockdriver_copyout_opt(data, &val, sizeof(val),
+			    len);
+
+		case LOCAL_PEEREID:
+			/* getpeereid(3) documents these error codes. */
+			if (uds_get_type(uds) == SOCK_DGRAM)
+				return EINVAL;
+			if (!uds_is_connected(uds))
+				return ENOTCONN;
+
+			/*
+			 * This is a custom MINIX3 error, indicating that there
+			 * are no credentials to return.  This could be due to
+			 * a failure to obtain them (which *should* not happen)
+			 * but also if the socket was bound while connected,
+			 * disconnected, and then reused as a listening socket.
+			 */
+			if (uds->uds_conn->uds_cred.unp_pid == -1)
+				return EINVAL;
+
+			return sockdriver_copyout_opt(data,
+			    &uds->uds_conn->uds_cred,
+			    sizeof(uds->uds_conn->uds_cred), len);
+		}
+
+		break;
+	}
+
+	return ENOPROTOOPT;
+}
+
+/*
+ * Retrieve a socket's local address.
+ */
+static int
+uds_getsockname(struct sock * sock, struct sockaddr * addr,
+	socklen_t * addr_len)
+{
+	struct udssock *uds = (struct udssock *)sock;
+
+	dprintf(("UDS: getsockname(%d)\n", uds_get_id(uds)));
+
+	uds_make_addr(uds->uds_path, uds->uds_pathlen, addr, addr_len);
+
+	return OK;
+}
+
+/*
+ * Retrieve a socket's remote address.
+ */
+static int
+uds_getpeername(struct sock * sock, struct sockaddr * addr,
+	socklen_t * addr_len)
+{
+	struct udssock *uds = (struct udssock *)sock;
+	struct udssock *peer;
+
+	dprintf(("UDS: getpeername(%d)\n", uds_get_id(uds)));
+
+	/*
+	 * For disconnected sockets, we no longer have a peer socket and thus
+	 * also no peer address.  Too bad, but NetBSD does the same.
+	 *
+	 * For connecting sockets we could in fact return a peer address, but
+	 * POSIX says (and other platforms agree) that we should deny the call.
+	 */
+	peer = uds_get_peer(uds);
+
+	if (peer == NULL || uds_is_connecting(uds))
+		return ENOTCONN;
+
+	uds_make_addr(peer->uds_path, peer->uds_pathlen, addr, addr_len);
+
+	return OK;
+}
+
+/*
+ * Shut down socket send and receive operations.  Note that 'flags' is a
+ * bitwise mask with libsockevent's SFL_SHUT_{RD,WR} flags rather than the set
+ * of SHUT_{RD,WR,RDWR} values from userland.
+ */
+static int
+uds_shutdown(struct sock * sock, unsigned int flags)
+{
+	struct udssock *uds = (struct udssock *)sock;
+	struct udssock *conn;
+	unsigned int mask;
+
+	dprintf(("UDS: shutdown(%d,0x%x)\n", uds_get_id(uds), flags));
+
+	/*
+	 * If we are shutting down the socket for reading, we can already close
+	 * any in-flight file descriptors associated with this socket.
+	 */
+	if (flags & SFL_SHUT_RD)
+		uds_io_reset(uds);
+
+	/*
+	 * A shutdown on this side of a connection may have an effect on
+	 * ongoing operations on the other side.  Fire appropriate events.
+	 */
+	if (uds_is_connected(uds)) {
+		assert(uds_get_type(uds) != SOCK_DGRAM);
+
+		conn = uds->uds_conn;
+
+		mask = 0;
+		if (flags & SFL_SHUT_RD)
+			mask |= SEV_SEND;
+		if (flags & SFL_SHUT_WR)
+			mask |= SEV_RECV;
+
+		sockevent_raise(&conn->uds_sock, mask);
+	}
+
+	return OK;
+}
+
+/*
+ * Close a socket.
+ *
+ * The 'force' flag is unused because we need never wait for data to be sent,
+ * since we keep all in-flight data on the receiver side.
+ */
+static int
+uds_close(struct sock * sock, int force __unused)
+{
+	struct udssock *uds = (struct udssock *)sock;
+
+	dprintf(("UDS: close(%d)\n", uds_get_id(uds)));
+
+	if (uds_get_type(uds) == SOCK_DGRAM) {
+		/* If this socket is linked to a target, disconnect it. */
+		if (uds_has_link(uds))
+			uds_del_queue(uds->uds_link, uds);
+
+		/* Reset all sockets linked to this socket as a target. */
+		uds_clear_queue(uds, NULL);
+	} else if (uds_is_listening(uds)) {
+		/*
+		 * Abort all connecting sockets queued on this socket, and
+		 * break all connections for connected sockets queued on this
+		 * socket, freeing their peers.
+		 */
+		uds_clear_queue(uds, NULL);
+	} else if (uds_has_link(uds)) {
+		/*
+		 * This socket is connecting or connected while the other side
+		 * has not been accepted yet.  Remove the socket from the
+		 * listening socket's queue, and if it was connected, get rid
+		 * of its peer socket altogether.
+		 */
+		assert(uds_is_listening(uds->uds_link));
+
+		uds_del_queue(uds->uds_link, uds);
+
+		if (uds_is_connected(uds))
+			uds_disconnect(uds, TRUE /*was_linked*/);
+	} else if (uds_is_connected(uds)) {
+		/*
+		 * Decouple the peer socket from this socket, and possibly wake
+		 * up any pending operations on it.  The peer socket keeps its
+		 * UDSF_CONNECTED flag, but is now in the disconnected state.
+		 */
+		uds_disconnect(uds, FALSE /*was_linked*/);
+	}
+
+	if (uds_is_hashed(uds))
+		udshash_del(uds);
+
+	return OK;
+}
+
+static const struct sockevent_ops uds_ops = {
+	.sop_pair		= uds_pair,
+	.sop_bind		= uds_bind,
+	.sop_connect		= uds_connect,
+	.sop_listen		= uds_listen,
+	.sop_accept		= uds_accept,
+	.sop_test_accept	= uds_test_accept,
+	.sop_pre_send		= uds_pre_send,
+	.sop_send		= uds_send,
+	.sop_test_send		= uds_test_send,
+	.sop_pre_recv		= uds_pre_recv,
+	.sop_recv		= uds_recv,
+	.sop_test_recv		= uds_test_recv,
+	.sop_setsockopt		= uds_setsockopt,
+	.sop_getsockopt		= uds_getsockopt,
+	.sop_getsockname	= uds_getsockname,
+	.sop_getpeername	= uds_getpeername,
+	.sop_shutdown		= uds_shutdown,
+	.sop_close		= uds_close,
+	.sop_free		= uds_free
+};
+
+/*
+ * Initialize the service.
+ */
+static int
+uds_init(int type __unused, sef_init_info_t * info __unused)
+{
+	unsigned int i;
+
+	/* Initialize the list of free sockets. */
+	TAILQ_INIT(&uds_freelist);
+
+	for (i = 0; i < __arraycount(uds_array); i++) {
+		uds_array[i].uds_flags = 0;
+
+		TAILQ_INSERT_TAIL(&uds_freelist, &uds_array[i], uds_next);
+	}
+
+	/* Initialize the file-to-socket hash table. */
+	udshash_init();
+
+	/* Initialize the input/output module. */
+	uds_io_init();
+
+	/* Initialize the status module. */
+	uds_stat_init();
+
+	/* Initialize the sockevent library. */
+	sockevent_init(uds_socket);
+
+	uds_in_use = 0;
+	uds_running = TRUE;
+
+	return OK;
+}
+
+/*
+ * Clean up before shutdown.
+ */
+static void
+uds_cleanup(void)
+{
+
+	/* Tell the status module to clean up. */
+	uds_stat_cleanup();
+}
+
+/*
+ * The service has received a signal.
+ */
 static void
 uds_signal(int signo)
 {
-	int i;
 
-	/* Only check for termination signal, ignore anything else. */
-	if (signo != SIGTERM) return;
+	/* Only check for the termination signal.  Ignore anything else. */
+	if (signo != SIGTERM)
+		return;
 
-	/* Only exit once all sockets have been closed. */
-	uds_exit_left = 0;
-	for (i = 0; i < NR_FDS; i++)
-		if (uds_fd_table[i].state == UDS_INUSE)
-			uds_exit_left++;
+	/* Exit only once all sockets have been closed. */
+	uds_running = FALSE;
 
-	if (uds_exit_left == 0)
-		chardriver_terminate();
+	if (uds_in_use == 0)
+		sef_cancel();
 }
 
+/*
+ * Perform initialization using the System Event Framework (SEF).
+ */
 static void
 uds_startup(void)
 {
-	/* Register init callbacks. */
+
+	/* Register initialization callbacks. */
 	sef_setcb_init_fresh(uds_init);
 
-	/* Register signal callbacks. */
+	/* Register signal callback. */
 	sef_setcb_signal_handler(uds_signal);
 
 	/* Let SEF perform startup. */
@@ -742,14 +1378,40 @@ uds_startup(void)
 }
 
 /*
- * The UNIX domain sockets driver.
+ * The UNIX Domain Sockets driver.
  */
 int
 main(void)
 {
+	message m;
+	int r, ipc_status;
+
+	/* Initialize the service. */
 	uds_startup();
 
-	chardriver_task(&uds_tab);
+	/* Loop receiving and processing messages until instructed to stop. */
+	while (uds_running || uds_in_use > 0) {
+		if ((r = sef_receive_status(ANY, &m, &ipc_status)) != OK) {
+			if (r == EINTR)
+				continue;	/* sef_cancel() was called */
 
-	return(OK);
+			panic("UDS: sef_receive_status failed: %d", r);
+		}
+
+		/*
+		 * Messages from the MIB service are (ultimately) for the
+		 * status module.  Everything else is assumed to be a socket
+		 * request and passed to libsockevent, which will ignore
+		 * anything it does not recognize.
+		 */
+		if (m.m_source == MIB_PROC_NR)
+			rmib_process(&m, ipc_status);
+		else
+			sockevent_process(&m, ipc_status);
+	}
+
+	/* Clean up before graceful shutdown. */
+	uds_cleanup();
+
+	return EXIT_SUCCESS;
 }
diff --git a/minix/net/uds/uds.conf b/minix/net/uds/uds.conf
new file mode 100644
index 000000000..481f8919d
--- /dev/null
+++ b/minix/net/uds/uds.conf
@@ -0,0 +1,9 @@
+service uds
+{
+	domain	LOCAL;
+	system	KILL;	# for SIGPIPE
+	uid	0;	# for socketpath(2) and copyfd(2)
+	ipc
+		SYSTEM vfs rs vm mib
+	;
+};
diff --git a/minix/net/uds/uds.h b/minix/net/uds/uds.h
index 741b4bd47..4ccbaf425 100644
--- a/minix/net/uds/uds.h
+++ b/minix/net/uds/uds.h
@@ -1,23 +1,48 @@
-#ifndef __UDS_UDS_H
-#define __UDS_UDS_H
+#ifndef MINIX_NET_UDS_UDS_H
+#define MINIX_NET_UDS_UDS_H
 
 #include <minix/drivers.h>
-#include <minix/chardriver.h>
-#undef send
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <sys/ucred.h>
+#include <minix/sockevent.h>
+#include <minix/rmib.h>
 #include <sys/un.h>
-#include <sys/mman.h>
 
-/* Maximum number of UNIX domain sockets. */
-#define NR_FDS		256
+/*
+ * Maximum number of UNIX domain sockets.  The control structures for all of
+ * these are allocated statically, although each socket's receive buffer is
+ * allocated only when the socket is in use.  If this constant is increased
+ * beyond 65535, a few field sizes need to be changed.
+ */
+#define NR_UDSSOCK	256
 
-/* Connection backlog size for incoming connections. */
-#define UDS_SOMAXCONN	64
+/* Number of slots in the <dev,ino>-to-udssock hash table. */
+#define UDSHASH_SLOTS	64
 
-/* Maximum UDS socket buffer size. */
-#define UDS_BUF		PIPE_BUF
+/* UDS has no protocols, so we accept only an "any protocol" value. */
+#define UDSPROTO_UDS	0
+
+/*
+ * The size of each socket's receive buffer.  This size is currently a global
+ * setting which cannot be changed per socket at run time, and it would be
+ * rather tricky to change that.  In order not to waste resources, this size
+ * should be a multiple of the page size.  Due to the fact that data and
+ * metadata (such as lengths, source addresses and sender credentials) are
+ * intermixed in the same buffer, the actual amount of data that can be in
+ * transit at once is typically less than this value.  If this constant is
+ * increased beyond 65535, several fields and field sizes need to be changed.
+ */
+#define UDS_BUF		32768
+
+/* Maximum size of control data that can be sent or received at once. */
+#define UDS_CTL_MAX	4096
+
+/*
+ * We allow longer path names than the size of struct sockaddr_un's sun_path
+ * field.  The actual limit is determined by the maximum value of the sun_len
+ * field, which is 255 and includes the first two fields of the structure (one
+ * byte each) but not the null terminator of the path.  Thus, the maximum
+ * length of the path minus null terminator is 253; with terminator it is 254.
+ */
+#define UDS_PATH_MAX	(UINT8_MAX - sizeof(uint8_t) - sizeof(sa_family_t) + 1)
 
 /* Output debugging information? */
 #define DEBUG		0
@@ -29,191 +54,201 @@
 #endif
 
 /*
- * A light version of the "uucred" credentials structure.  We basically do not
- * support passing around groups lists, and by not using struct uucred as
- * storage, we save memory for those groups lists as well.  Note that the
- * original Linux uucred structure has a 'cr_pid' field as well, but this is
- * unsupported in NetBSD's version of the structure (and rightly so).
+ * We declare this structure only for the static assert right below it.  We
+ * have no need for the structure otherwise, as we use "struct sockaddr"
+ * directly instead.
  */
-struct luucred {
-	uid_t uid;
-	gid_t gid;
+struct sockaddr_unx {
+	uint8_t sunx_len;
+	sa_family_t sunx_family;
+	char sunx_path[UDS_PATH_MAX];
 };
-
-/* ancillary data to be sent */
-struct ancillary {
-	int fds[OPEN_MAX];
-	int nfiledes;
-	struct luucred cred;
-};
-
-#define UDS_R	0x1
-#define UDS_W	0x2
+STATIC_SOCKADDR_MAX_ASSERT(sockaddr_unx);
 
 /*
- * Internal State Information for a socket descriptor.
+ * In-flight file descriptor object.  Each in-use object is part of a socket's
+ * file descriptor queue, and the file descriptor is for a file open by this
+ * service.  For each set of in-flight file descriptors associated with a
+ * particular segment, the first object's count field contains the number of
+ * file descriptors in that set.  For all other objects in that set, the count
+ * field is zero.  TODO: the count should be stored in the segment itself.
  */
 struct uds_fd {
-
-/* Flags */
-
-	enum UDS_STATE {
-		/* This file descriptor is UDS_FREE and can be allocated. */
-		UDS_FREE  = 0,
-
-		/* OR it is UDS_INUSE and can't be allocated. */
-		UDS_INUSE = 1
-
-	/* state is set to UDS_INUSE in uds_open(). state is Set to
-	 * UDS_FREE in uds_init() and uds_close(). state should be
-	 * checked prior to all operations.
-	 */
-	} state;
-
-/* Owner Info */
-
-	/* Socket Owner */
-	endpoint_t owner;
-
-/* Pipe Housekeeping */
-
-	char *buf;			/* ring buffer */
-	size_t pos;			/* tail position into ring buffer */
-	size_t size;			/* size of used part of ring buffer */
-
-	/* control read/write, set by uds_open() and shutdown(2).
-	 * Can be set to UDS_R|UDS_W, UDS_R, UDS_W, or 0
-	 * for read and write, read only, write only, or neither.
-	 * default is UDS_R|UDS_W.
-	 */
-	int mode;
-
-/* Socket Info */
-
-	/* socket type - SOCK_STREAM, SOCK_DGRAM, or SOCK_SEQPACKET
-	 * Set by uds_ioctl(NWIOSUDSTYPE). It defaults to -1 in
-	 * uds_open(). Any action on a socket with type -1 besides
-	 * uds_ioctl(NWIOSUDSTYPE) and uds_close() will result in
-	 * an error.
-	 */
-	int type;
-
-	/* queue of pending connections for server sockets.
-	 * connect(2) inserts and accept(2) removes from the queue
-	 */
-	int backlog[UDS_SOMAXCONN];
-
-	/* requested connection backlog size. Set by listen(2)
-	 * Bounds (0 <= backlog_size <= UDS_SOMAXCONN)
-	 * Defaults to UDS_SOMAXCONN which is defined above.
-	 */
-	unsigned char backlog_size;
-
-	/* index of peer in uds_fd_table for connected sockets.
-	 * -1 is used to mean no peer. Assumptions: peer != -1 means
-	 * connected.
-	 */
-	int peer;
-
-	/* index of child (client sd returned by accept(2))
-	 * -1 is used to mean no child.
-	 */
-	int child;
-
-	/* address -- the address the socket is bound to.
-	 * Assumptions: addr.sun_family == AF_UNIX means its bound.
-	 */
-	struct sockaddr_un addr;
-
-	/* target -- where DGRAMs are sent to on the next uds_write(). */
-	struct sockaddr_un target;
-
-	/* source -- address where DGRAMs are from. used to fill in the
-	 * from address in recvfrom(2) and recvmsg(2).
-	 */
-	struct sockaddr_un source;
-
-	/* Flag (TRUE or FALSE) - address overridden by newer socket.
-	 * Default to FALSE.  Set to TRUE by do_bind() on another socket with
-	 * the same path but its on-disk socket file removed in the meantime.
-	 */
-	int stale;
-
-	/* Flag (TRUE or FALSE) - listening for incoming connections.
-	 * Default to FALSE.  Set to TRUE by do_listen().
-	 */
-	int listening;
-
-	/* stores file pointers and credentials being sent between
-	 * processes with sendmsg(2) and recvmsg(2).
-	 */
-	struct ancillary ancillary_data;
-
-	/* Holds an errno. This is set when a connected socket is
-	 * closed and we need to pass ECONNRESET on to a suspended
-	 * peer.
-	 */
-	int err;
-
-/* Suspend/Revive Housekeeping */
-
-	/* SUSPEND State Flags */
-	enum UDS_SUSPENDED {
-
-		/* Socket isn't blocked. */
-		UDS_NOT_SUSPENDED     = 0,
-
-		/* Socket is blocked on read(2) waiting for data to read. */
-		UDS_SUSPENDED_READ    = 1,
-
-		/* Socket is blocked on write(2) for space to write data. */
-		UDS_SUSPENDED_WRITE   = 2,
-
-		/* Socket is blocked on connect(2) waiting for the server. */
-		UDS_SUSPENDED_CONNECT = 4,
-
-		/* Socket is blocked on accept(2) waiting for clients. */
-		UDS_SUSPENDED_ACCEPT  = 8
-	} suspended;
-
-	/* source endpoint, saved for later use by suspended procs */
-	endpoint_t susp_endpt;
-
-	/* i/o grant, saved for later use by suspended procs */
-	cp_grant_id_t susp_grant;
-
-	/* size of request, saved for later use by suspended procs */
-	size_t susp_size;
-
-	/* request ID, saved for later use by suspended procs */
-	cdev_id_t susp_id;
-
-/* select() */
-
-	/* when a select is in progress, we notify this endpoint
-	 * of new data.
-	 */
-	endpoint_t sel_endpt;
-
-	/* Options (CDEV_OP_RD,WR,ERR) that are requested. */
-	unsigned int sel_ops;
+	SIMPLEQ_ENTRY(uds_fd) ufd_next;	/* next FD object for this socket */
+	int ufd_fd;			/* local file descriptor number */
+	unsigned int ufd_count;		/* number of FDs for this segment */
 };
 
-typedef struct uds_fd uds_fd_t;
+/*
+ * Connection-type sockets (SOCK_STREAM, SOCK_SEQPACKET) are always in one of
+ * the following five states, each with unique characteristics:
+ *
+ * - Unconnected: this socket is not in any of the other states, usually
+ *   because it either has just been created, or because it has failed a
+ *   connection attempt.  This socket has no connected peer and does not have
+ *   the SO_ACCEPTCONN socket option set.
+ * - Listening: this socket is in listening mode.  It has a queue with sockets
+ *   that are connecting or connected to it but have not yet been accepted on
+ *   it.  This socket has no connected peer.  It has the SO_ACCEPTCONN socket
+ *   option set.
+ * - Connecting: this socket is on a listening socket's queue.  While in this
+ *   state, the socket has the listening socket as its linked peer, and it has
+ *   no connected peer.
+ * - Connected: this socket is connected to another socket, which is its
+ *   connected peer socket.  It has the UDSF_CONNECTED flag set.  A socket may
+ *   be connected and still be involved with a listening socket; see below.
+ * - Disconnected: this socket was connected to another socket, but that other
+ *   socket has been closed.  As a result, this socket has no peer.  It does
+ *   have the UDSF_CONNECTED flag set.
+ *
+ * The UDS service supports two different types of connect behavior, depending
+ * on what the LOCAL_CONNWAIT option is set to on either the connecting or the
+ * listening socket.  If LOCAL_CONNWAIT is not set on either (the default), the
+ * connecting socket (let's call it "A") enters the connected state
+ * right away, even if the connection is not immediately accepted through
+ * accept(2).  In that case, a new limbo socket "B" is allocated as its
+ * connection peer.  Limbo socket B is also in connected state, and either
+ * returned from accept(2) later, or freed when socket A leaves the connected
+ * state.  Socket A can leave the connected state either by being closed or
+ * when the listening socket is closed.  If LOCAL_CONNWAIT is set, socket A
+ * stays in the connecting state until it is accepted through accept(2).
+ * Importantly, in both cases, it is socket A, and (in the first case) *not*
+ * socket B, that is on the queue of the listening socket!
+ *
+ * Connected peers (uds_conn) are always symmetric: if one socket is connected
+ * to another socket, that other socket is connected to it.  Any socket that is
+ * on the queue of another socket, is said to be "linked" to that other socket
+ * (uds_link). This is an asymmetric, one-to-many relationship: many sockets
+ * may be linked to one other socket, which keeps all those sockets on its
+ * queue. From the above story it should now be clear that for connection-type
+ * sockets, only listening sockets may have sockets on their queue, and while
+ * connecting sockets are always on a listening socket's queue, connected
+ * sockets may or may not be.  Sockets in other states never are.
+ *
+ * UNIX domain sockets are generally reusable.  This means that the listening
+ * state is the only final state; all other socket states allow the socket to
+ * enter another state, although not necessarily every other state.  For
+ * example, a disconnected socket may be reconnected to another target; if that
+ * connection fails, the socket will enter the unconnected state.  As a result,
+ * a socket in any state (even the listening state) may still have incoming
+ * data pending from a previous connection.  However, EOF is currently produced
+ * only for disconnected sockets.  To be sure: connecting and connected sockets
+ * must first enter the unconnected or disconnected state, respectively, before
+ * possibly being reconnected.
+ *
+ * For connectionless (i.e., SOCK_DGRAM) sockets, there are no separate states.
+ * However, a connectionless socket may have been connected to another socket.
+ * We maintain these links not with uds_conn but with uds_link, because such
+ * connections are not symmetric, and there is an interest in keeping track of
+ * which datagram sockets are connected to a particular socket (namely, to
+ * break the connection on close without doing an exhaustive search).  For that
+ * reason, when a datagram socket connects to another socket, it is linked to
+ * that other socket, and the other socket has this socket on its queue.  As a
+ * strange corner case, a connectionless socket may be connected to itself, in
+ * which case it is its own linked peer and it is also on its own queue.  For
+ * datagram sockets, uds_conn is always NULL and UDSF_CONNECTED is never set.
+ *
+ * For the purposes of sending and receiving, we generally refer to the
+ * communication partner of a socket as its "peer".  As should now be clear,
+ * for connection-type sockets, the socket's peer is identified with uds_conn;
+ * for connectionless sockets, the socket's peer is identified with uds_link.
+ */
+struct udssock {
+	struct sock uds_sock;		/* sock object */
+	struct udssock *uds_conn;	/* connected socket, or NULL if none */
+	struct udssock *uds_link;	/* linked socket, or NULL if none */
+	unsigned char *uds_buf;		/* receive buffer (memory-mapped) */
+	unsigned short uds_tail;	/* tail of data in receive buffer */
+	unsigned short uds_len;		/* length of data in receive buffer */
+	unsigned short uds_last;	/* offset to last header in buffer */
+	unsigned short uds_queued;	/* current nr of sockets on queue */
+	unsigned short uds_backlog;	/* maximum nr of connecting sockets */
+	unsigned char uds_flags;	/* UDS-specific flags (UDSF_) */
+	unsigned char uds_pathlen;	/* socket file path length (w/o nul) */
+	char uds_path[UDS_PATH_MAX - 1];/* socket file path (not terminated) */
+	dev_t uds_dev;			/* socket file device number */
+	ino_t uds_ino;			/* socket file inode number */
+	struct unpcbid uds_cred;	/* bind/connect-time credentials */
+	SLIST_ENTRY(udssock) uds_hash;	/* next in hash chain */
+	TAILQ_ENTRY(udssock) uds_next;	/* next in free list or queue */
+	SIMPLEQ_HEAD(, uds_fd) uds_fds;	/* in-flight file descriptors */
+	TAILQ_HEAD(, udssock) uds_queue;/* queue of linked sockets */
+};
 
-/* File Descriptor Table -- Defined in uds.c */
-EXTERN uds_fd_t uds_fd_table[NR_FDS];
+#define UDSF_IN_USE		0x01	/* in use (for enumeration only) */
+#define UDSF_CONNECTED		0x02	/* connected or disconnected */
+#define UDSF_CONNWAIT		0x04	/* leave connecting until accept */
+#define UDSF_PASSCRED		0x08	/* pass credentials when receiving */
+
+/* Macros. */
+#define uds_get_type(uds)	sockevent_get_type(&(uds)->uds_sock)
+
+/*
+ * A socket that can be found through hash table lookups always has a non-empty
+ * path as well as a valid <dev,ino> pair identifying the socket file that is,
+ * or once was, identified by that path.  However, a socket that is bound, even
+ * though it will still have an associated path, is not necessarily hashed.
+ * The reason for the difference is <dev,ino> pair reuse.  This case is
+ * elaborated on in uds_bind().
+ */
+#define uds_is_bound(uds)	((uds)->uds_pathlen != 0)
+#define uds_is_hashed(uds)	((uds)->uds_dev != NO_DEV)
+
+/*
+ * These macros may be used on all socket types.  However, the uds_is_connected
+ * macro returns TRUE only for connection-oriented sockets.  To see if a
+ * datagram socket is connected to a target, use uds_has_link instead.
+ */
+#define uds_has_conn(uds)	((uds)->uds_conn != NULL)
+#define uds_has_link(uds)	((uds)->uds_link != NULL)
+#define uds_get_peer(uds)	\
+	((uds_get_type(uds) != SOCK_DGRAM) ? (uds)->uds_conn : (uds)->uds_link)
+#define uds_is_listening(uds)	sockevent_is_listening(&(uds)->uds_sock)
+#define uds_is_connecting(uds)						\
+	(uds_has_link(uds) && !((uds)->uds_flags & UDSF_CONNECTED) &&	\
+	uds_get_type(uds) != SOCK_DGRAM)
+#define uds_is_connected(uds)	\
+	(((uds)->uds_flags & UDSF_CONNECTED) && uds_has_conn(uds))
+#define uds_is_disconnected(uds)	\
+	(((uds)->uds_flags & UDSF_CONNECTED) && !uds_has_conn(uds))
+
+#define uds_is_shutdown(uds, mask)	\
+	sockevent_is_shutdown(&(uds)->uds_sock, (mask))
 
 /* Function prototypes. */
 
-/* ioc_uds.c */
-int uds_clear_fds(devminor_t minor, struct ancillary *data);
-int uds_do_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
-	cp_grant_id_t grant);
-
 /* uds.c */
-ssize_t uds_perform_read(devminor_t minor, endpoint_t endpt,
-	cp_grant_id_t grant, size_t size, int pretend);
-void uds_unsuspend(devminor_t minor);
+sockid_t uds_get_id(struct udssock * uds);
+struct udssock *uds_enum(struct udssock * prev, int type);
+void uds_make_addr(const char * path, size_t len, struct sockaddr * addr,
+	socklen_t * addr_len);
+int uds_lookup(struct udssock * uds, const struct sockaddr * addr,
+	socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp);
 
-#endif /* !__UDS_UDS_H */
+/* io.c */
+void uds_io_init(void);
+int uds_io_setup(struct udssock * uds);
+void uds_io_cleanup(struct udssock * uds);
+void uds_io_reset(struct udssock * uds);
+size_t uds_io_buflen(void);
+int uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len,
+	const struct sockaddr * addr, socklen_t addr_len,
+	endpoint_t user_endpt, int flags);
+int uds_send(struct sock * sock, const struct sockdriver_data * data,
+	size_t len, size_t * off, const struct sockdriver_data * ctl,
+	socklen_t ctl_len, socklen_t * ctl_off, const struct sockaddr * addr,
+	socklen_t addr_len, endpoint_t user_endpt, int flags, size_t min);
+int uds_test_send(struct sock * sock, size_t min);
+int uds_pre_recv(struct sock * sock, endpoint_t user_endpt, int flags);
+int uds_recv(struct sock * sock, const struct sockdriver_data * data,
+	size_t len, size_t * off, const struct sockdriver_data * ctl,
+	socklen_t ctl_len, socklen_t * ctl_off, struct sockaddr * addr,
+	socklen_t * addr_len, endpoint_t user_endpt, int flags, size_t min,
+	int * rflags);
+int uds_test_recv(struct sock * sock, size_t min, size_t * size);
+
+/* stat.c */
+void uds_stat_init(void);
+void uds_stat_cleanup(void);
+
+#endif /* !MINIX_NET_UDS_UDS_H */
diff --git a/minix/net/uds/unix.8 b/minix/net/uds/unix.8
index 131753e49..ffde8edc9 100644
--- a/minix/net/uds/unix.8
+++ b/minix/net/uds/unix.8
@@ -10,6 +10,7 @@ unix \- Unix Domain Sockets (PF_UNIX) / Local Sockets (PF_LOCAL)
 .in +5
 .ti -5
 int socket(int \fIdomain\fP, int \fItype\fP, int \fIprotocol\fP);
+.br
 .ti -5
 int socketpair(int \fIdomain\fP, int \fItype\fP, int \fIprotocol\fP, int \fIsv[2]\fP);
 .br
@@ -18,9 +19,8 @@ int socketpair(int \fIdomain\fP, int \fItype\fP, int \fIprotocol\fP, int \fIsv[2
 Local sockets, more commonly known as Unix Domain Sockets, provide a 
 means of interprocess communication using the socket API.
 .SH SEE ALSO
-.BR socket(2),
-.BR socketpair(2),
-.BR getpeereid(2),
-.BR uds(8)
+.BR socket(2) ,
+.BR socketpair(2) ,
+.BR getpeereid(3)
 .SH HISTORY
-This Unix Domain Sockets first appeared in Minix 3.1.8.
+Unix Domain Sockets first appeared in MINIX 3.1.8.
diff --git a/minix/servers/vfs/filedes.c b/minix/servers/vfs/filedes.c
index 7ddb51b42..2051221e6 100644
--- a/minix/servers/vfs/filedes.c
+++ b/minix/servers/vfs/filedes.c
@@ -525,7 +525,8 @@ int do_copyfd(void)
 {
 /* Copy a file descriptor between processes, or close a remote file descriptor.
  * This call is used as back-call by device drivers (UDS, VND), and is expected
- * to be used in response to an IOCTL to such device drivers.
+ * to be used in response to either an IOCTL to VND or a SEND or RECV socket
+ * request to UDS.
  */
   struct fproc *rfp;
   struct filp *rfilp;
@@ -548,9 +549,9 @@ int do_copyfd(void)
   rfp = &fproc[slot];
 
   /* FIXME: we should now check that the user process is indeed blocked on an
-   * IOCTL call, so that we can safely mess with its file descriptors.  We
-   * currently do not have the necessary state to verify this, so we assume
-   * that the call is always used in the right way.
+   * IOCTL or socket call, so that we can safely mess with its file
+   * descriptors.  We currently do not have the necessary state to verify this,
+   * so we assume that the call is always used in the right way.
    */
 
   /* Depending on the operation, get the file descriptor from the caller or the
@@ -566,7 +567,7 @@ int do_copyfd(void)
    * passes in the file descriptor to the device node on which it is performing
    * the IOCTL.  We do not allow manipulation of such device nodes.  In
    * practice, this only applies to block-special files (and thus VND), because
-   * character-special files (as used by UDS) are unlocked during the IOCTL.
+   * socket files (as used by UDS) are unlocked during the socket operation.
    */
   if (rfilp->filp_ioctl_fp == rfp)
 	return(EBADF);
diff --git a/minix/servers/vfs/open.c b/minix/servers/vfs/open.c
index fc5b0c227..014122fd2 100644
--- a/minix/servers/vfs/open.c
+++ b/minix/servers/vfs/open.c
@@ -535,9 +535,9 @@ int do_mknod(void)
   resolve.l_vnode_lock = VNODE_WRITE;
 
   /* Only the super_user may make nodes other than fifos. */
-  if (!super_user && (!S_ISFIFO(mode_bits) && !S_ISSOCK(mode_bits))) {
+  if (!super_user && !S_ISFIFO(mode_bits))
 	return(EPERM);
-  }
+
   bits = (mode_bits & S_IFMT) | (mode_bits & ACCESSPERMS & fp->fp_umask);
 
   /* Open directory that's going to hold the new node. */
diff --git a/minix/servers/vfs/path.c b/minix/servers/vfs/path.c
index 5f0191a4b..5360d2c1f 100644
--- a/minix/servers/vfs/path.c
+++ b/minix/servers/vfs/path.c
@@ -15,7 +15,6 @@
 #include <minix/vfsif.h>
 #include <sys/param.h>
 #include <sys/stat.h>
-#include <sys/un.h>
 #include <sys/dirent.h>
 #include "vmnt.h"
 #include "vnode.h"
@@ -819,7 +818,6 @@ int do_socketpath(void)
   struct fproc *rfp;
   char path[PATH_MAX];
   struct lookup resolve, resolve2;
-  struct sockaddr_un sun;
   mode_t bits;
 
   /* This should be replaced by an ACL check. */
@@ -831,24 +829,16 @@ int do_socketpath(void)
   what = job_m_in.m_lsys_vfs_socketpath.what;
 
   if (isokendpt(ep, &slot) != OK) return(EINVAL);
-  if (pathlen < sizeof(sun.sun_path) || pathlen >= PATH_MAX) return(EINVAL);
+  rfp = &fproc[slot];
 
-  rfp = &(fproc[slot]);
+  /* Copy in the path name, which must not be empty.  It is typically not null
+   * terminated.
+   */
+  if (pathlen < 1 || pathlen >= sizeof(path)) return(EINVAL);
   r = sys_safecopyfrom(who_e, io_gr, (vir_bytes)0, (vir_bytes)path, pathlen);
   if (r != OK) return(r);
   path[pathlen] = '\0';
 
-  /* If requested, turn path into canonical path to the socket file */
-  if (what & SPATH_CANONIZE) {
-	if ((r = canonical_path(path, rfp)) != OK) return(r);
-	if (strlen(path) >= pathlen) return(ENAMETOOLONG);
-
-	/* copy path back to the caller */
-	r = sys_safecopyto(who_e, (cp_grant_id_t)io_gr, (vir_bytes)0,
-	    (vir_bytes)path, pathlen);
-	if (r != OK) return(r);
-  }
-
   /* Now perform the requested action.  For the SPATH_CHECK action, a socket
    * file is expected to exist already, and we should check whether the given
    * user process has access to it.  For the SPATH_CREATE action, no file is
@@ -859,7 +849,7 @@ int do_socketpath(void)
    * Since the above canonicalization releases all locks once done, we need to
    * recheck absolutely everything now.  TODO: do not release locks in between.
    */
-  switch (what & ~SPATH_CANONIZE) {
+  switch (what) {
   case SPATH_CHECK:
 	lookup_init(&resolve, path, PATH_NOFLAGS, &vmp, &vp);
 	resolve.l_vmnt_lock = VMNT_READ;
diff --git a/minix/tests/common-socket.c b/minix/tests/common-socket.c
index 1517513b4..e84ac351c 100644
--- a/minix/tests/common-socket.c
+++ b/minix/tests/common-socket.c
@@ -50,16 +50,19 @@ static char *get_timestamp(void)
 void test_fail_fl(char *msg, char *file, int line)
 {
 	char *timestamp;
+	int e;
+	e = errno;
 	timestamp = get_timestamp();
 	if (errct == 0) fprintf(stderr, "\n");
+	errno = e;
 	fprintf(stderr, "[ERROR][%s] (%s Line %d) %s [pid=%d:errno=%d:%s]\n",
-			timestamp, file, line, msg, getpid(),
-					errno, strerror(errno));
+	    timestamp, file, line, msg, getpid(), errno, strerror(errno));
 	fflush(stderr);
 	if (timestamp != NULL) {
 		free(timestamp);
 		timestamp = NULL;
 	}
+	errno = e;
 	e(7);
 }
 
@@ -317,7 +320,7 @@ void test_shutdown(const struct socket_test_info *info)
 		SOCKET(sd, info->domain, info->type, 0);
 		errno = 0;
 		rc = shutdown(sd, how[i]);
-		if (!(rc == -1 && errno == ENOTCONN) &&
+		if (rc != 0 && !(rc == -1 && errno == ENOTCONN) &&
 			!info->bug_shutdown_not_conn &&
 			!info->bug_shutdown) {
 			test_fail("shutdown() should have failed");
@@ -328,10 +331,10 @@ void test_shutdown(const struct socket_test_info *info)
 	SOCKET(sd, info->domain, info->type, 0);
 	errno = 0;
 	rc = shutdown(sd, -1);
-	if (!(rc == -1 && errno == ENOTCONN) &&
+	if (!(rc == -1 && errno == EINVAL) &&
 		!info->bug_shutdown_not_conn &&
 		!info->bug_shutdown) {
-		test_fail("shutdown(sd, -1) should have failed with ENOTCONN");
+		test_fail("shutdown(sd, -1) should have failed with EINVAL");
 	}
 	CLOSE(sd);
 
@@ -431,8 +434,6 @@ void test_sockopts(const struct socket_test_info *info)
 		CLOSE(sd);
 	}
 
-
-
 	SOCKET(sd, info->domain, info->type, 0);
 
 	debug("Test setsockopt() works");
@@ -901,9 +902,6 @@ static void test_xfer_client(const struct socket_test_info *info)
 		test_fail("[client] getpeername() should have worked");
 	}
 
-	/* we need to use the full path "/usr/src/test/DIR_56/test.sock"
-	 * because that is what is returned by getpeername().
-	 */
 
 	info->callback_check_sockaddr((struct sockaddr *) &peer_addr,
 		peer_addr_len, "getpeername", 1);
@@ -1299,8 +1297,8 @@ static void test_abort_client(const struct socket_test_info *info,
 			if (!info->ignore_write_conn_reset) {
 				test_fail("write should have failed\n");
 			}
-		} else if (errno != ECONNRESET) {
-			test_fail("errno should've been ECONNRESET\n");
+		} else if (errno != EPIPE && errno != ECONNRESET) {
+			test_fail("errno should've been EPIPE/ECONNRESET\n");
 		}
 	}
 
@@ -1353,7 +1351,7 @@ static void test_abort_server(const struct socket_test_info *info,
 	if (abort_type == 1) {
 		memset(buf, '\0', BUFSIZE);
 		rc = read(client_sd, buf, BUFSIZE);
-		if (rc != -1 && (rc != 0 || !info->ignore_read_conn_reset)) {
+		if (rc != 0 && rc != -1) {
 			test_fail("read should've failed or returned zero\n");
 		}
 		if (rc != 0 && errno != ECONNRESET) {
@@ -1518,9 +1516,6 @@ void test_msg_dgram(const struct socket_test_info *info)
 		test_fail("recvmsg");
 	}
 
-	/* we need to use the full path "/usr/src/test/DIR_56/testb.sock"
-	 * because that is what is returned by recvmsg().
-	 */
 	info->callback_check_sockaddr((struct sockaddr *) &addr,
 		msg2.msg_namelen, "recvmsg", 2);
 
@@ -1603,6 +1598,9 @@ test_nonblock(const struct socket_test_info *info)
 	if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
 		test_fail("bind() should have worked");
 
+	if (info->callback_set_listen_opt != NULL)
+		info->callback_set_listen_opt(server_sd);
+
 	if (listen(server_sd, 8) == -1)
 		test_fail("listen() should have worked");
 
@@ -1813,6 +1811,9 @@ test_intr(const struct socket_test_info *info)
 	if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
 		test_fail("bind() should have worked");
 
+	if (info->callback_set_listen_opt != NULL)
+		info->callback_set_listen_opt(server_sd);
+
 	if (listen(server_sd, 8) == -1)
 		test_fail("listen() should have worked");
 
@@ -1844,6 +1845,9 @@ test_intr(const struct socket_test_info *info)
 		errct = 0;
 		close(client_sd);
 
+		/* Ensure that the parent is blocked on the send(). */
+		sleep(1);
+
 		check_select(server_sd, 1 /*read*/, 1 /*write*/, 0 /*block*/);
 
 		len = sizeof(addr);
@@ -1932,6 +1936,9 @@ test_connect_close(const struct socket_test_info *info)
 	if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
 		test_fail("bind() should have worked");
 
+	if (info->callback_set_listen_opt != NULL)
+		info->callback_set_listen_opt(server_sd);
+
 	if (listen(server_sd, 8) == -1)
 		test_fail("listen() should have worked");
 
@@ -1989,6 +1996,9 @@ test_listen_close(const struct socket_test_info *info)
 	if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
 		test_fail("bind() should have worked");
 
+	if (info->callback_set_listen_opt != NULL)
+		info->callback_set_listen_opt(server_sd);
+
 	if (listen(server_sd, 8) == -1)
 		test_fail("listen() should have worked");
 
@@ -2009,7 +2019,6 @@ test_listen_close(const struct socket_test_info *info)
 
 	byte = 0;
 	if (write(client_sd, &byte, 1) != -1 || errno != ENOTCONN)
-		/* Yes, you fucked up the fix for the FIXME below. */
 		test_fail("write() should have yielded ENOTCONN");
 
 	if (connect(client_sd, info->clientaddr, info->clientaddrlen) != -1) {
@@ -2021,14 +2030,16 @@ test_listen_close(const struct socket_test_info *info)
 	}
 
 	/*
-	 * FIXME: currently UDS cannot distinguish between sockets that have
-	 * not yet been connected, and sockets that have been disconnected.
-	 * Thus, we get the same error for both: ENOTCONN instead of EPIPE.
+	 * The error we get on the next write() depends on whether the socket
+	 * may be reused after a failed connect: for TCP/IP, it may not, so we
+	 * get EPIPE; for UDS, it may be reused, so we get ENOTCONN.
 	 */
-#if 0
-	if (write(client_sd, &byte, 1) != -1 || errno != EPIPE)
-		test_fail("write() should have yielded EPIPE");
-#endif
+	if (!info->bug_connect_after_close) {
+		if (write(client_sd, &byte, 1) != -1 ||
+		    (errno != EPIPE && errno != ENOTCONN))
+			test_fail("write() should have yielded "
+			    "EPIPE/ENOTCONN");
+	}
 
 	close(client_sd);
 
@@ -2059,6 +2070,9 @@ test_listen_close_nb(const struct socket_test_info *info)
 	if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
 		test_fail("bind() should have worked");
 
+	if (info->callback_set_listen_opt != NULL)
+		info->callback_set_listen_opt(server_sd);
+
 	if (listen(server_sd, 8) == -1)
 		test_fail("listen() should have worked");
 
@@ -2097,16 +2111,6 @@ test_listen_close_nb(const struct socket_test_info *info)
 		test_fail("write() should have yielded ECONNRESET");
 	}
 
-	/*
-	 * FIXME: currently UDS cannot distinguish between sockets that have
-	 * not yet been connected, and sockets that have been disconnected.
-	 * Thus, we get the same error for both: ENOTCONN instead of EPIPE.
-	 */
-#if 0
-	if (write(client_sd, &byte, 1) != -1 || errno != EPIPE)
-		test_fail("write() should have yielded EPIPE");
-#endif
-
 	check_select_cond(client_sd, 1 /*read*/, 1 /*write*/, 0 /*block*/,
 		!info->ignore_select_delay);
 
diff --git a/minix/tests/common-socket.h b/minix/tests/common-socket.h
index f03015be5..03ba77e6c 100644
--- a/minix/tests/common-socket.h
+++ b/minix/tests/common-socket.h
@@ -88,7 +88,6 @@ struct socket_test_info {
 	int ignore_accept_delay; /* success from accept after aborted connect */
 	int ignore_connect_delay; /* nb connect not instant */
 	int ignore_connect_unaccepted; /* connect succeeds without accept */
-	int ignore_read_conn_reset; /* read does not guarantee ECONNRESET */
 	int ignore_select_delay; /* select delay reflecting other side nb op */
 	int ignore_send_waiting; /* can send while waiting for nb recv */
 	int ignore_write_conn_reset; /* write does not guarantee ECONNRESET */
@@ -98,6 +97,7 @@ struct socket_test_info {
 	void (* callback_cleanup)(void);
 	void (* callback_xfer_peercred)(int sd); /* can be NULL */
 	void (* callback_xfer_prepclient)(void); /* can be NULL */
+	void (* callback_set_listen_opt)(int sd); /* can be NULL */
 };
 
 void test_abort_client_server(const struct socket_test_info *info,
diff --git a/minix/tests/test56.c b/minix/tests/test56.c
index 583468dd5..d7a211b00 100644
--- a/minix/tests/test56.c
+++ b/minix/tests/test56.c
@@ -78,20 +78,6 @@ int max_error = 4;
 
 /* socket types supported */
 static int types[3] = {SOCK_STREAM, SOCK_SEQPACKET, SOCK_DGRAM};
-static char sock_fullpath[PATH_MAX + 1];
-
-/* Convert name to the full path of the socket. Assumes name is in cwd. */
-static char *fullpath(const char *name)
-{
-	char cwd[PATH_MAX + 1];
-
-	if (realpath(".", cwd) == NULL)
-		test_fail("Couldn't retrieve current working dir");
-
-	snprintf(sock_fullpath, PATH_MAX, "%s/%s", cwd, name);
-
-	return(sock_fullpath);
-}
 
 static void test_header(void)
 {
@@ -187,16 +173,16 @@ static void test_socketpair(void)
 
 static void test_ucred(void)
 {
-	struct uucred credentials;
+	struct unpcbid credentials;
 	socklen_t ucred_length;
 	uid_t euid = geteuid();
 	gid_t egid = getegid();
 	int sv[2];
 	int rc;
 
-	debug("Test credentials passing");
+	debug("Test peer credentials");
 
-	ucred_length = sizeof(struct uucred);
+	ucred_length = sizeof(credentials);
 
 	rc = socketpair(PF_UNIX, SOCK_STREAM, 0, sv);
 	if (rc == -1) {
@@ -204,22 +190,24 @@ static void test_ucred(void)
 	}
 
 	memset(&credentials, '\0', ucred_length);
-	rc = getsockopt(sv[0], SOL_SOCKET, SO_PEERCRED, &credentials, 
+	rc = getsockopt(sv[0], 0, LOCAL_PEEREID, &credentials,
 							&ucred_length);
 	if (rc == -1) {
-		test_fail("getsockopt(SO_PEERCRED) failed");
-	} else if (credentials.cr_ngroups != 0 ||
-			credentials.cr_uid != geteuid() ||
-			credentials.cr_gid != getegid()) {
-		/* printf("%d=%d %d=%d %d=%d",credentials.cr_ngroups, 0,
-		 credentials.cr_uid, geteuid(), credentials.cr_gid, getegid()); */
+		test_fail("getsockopt(LOCAL_PEEREID) failed");
+	} else if (credentials.unp_pid != getpid() ||
+			credentials.unp_euid != geteuid() ||
+			credentials.unp_egid != getegid()) {
+		printf("%d=%d %d=%d %d=%d",credentials.unp_pid, getpid(),
+		    credentials.unp_euid, geteuid(),
+		    credentials.unp_egid, getegid());
 		test_fail("Credential passing gave us the wrong cred");
 	}
 
 	rc = getpeereid(sv[0], &euid, &egid);
 	if (rc == -1) {
 		test_fail("getpeereid(sv[0], &euid, &egid) failed");
-	} else if (credentials.cr_uid != euid || credentials.cr_gid != egid) {
+	} else if (credentials.unp_euid != euid ||
+	    credentials.unp_egid != egid) {
 		test_fail("getpeereid() didn't give the correct euid/egid");
 	}
 
@@ -245,7 +233,7 @@ static void callback_check_sockaddr(const struct sockaddr *sockaddr,
 
 	if (!(sockaddr_un->sun_family == AF_UNIX &&
 			strncmp(sockaddr_un->sun_path,
-			fullpath(path),
+			path,
 			sizeof(sockaddr_un->sun_path) - 1) == 0)) {
 
 		snprintf(buf, sizeof(buf), "%s() didn't return the right addr",
@@ -293,7 +281,6 @@ static void test_bind_unix(void)
 	UNLINK(TEST_SYM_A);
 	UNLINK(TEST_SYM_B);
 
-	SYMLINK(TEST_SYM_A, TEST_SYM_B);
 	SYMLINK(TEST_SYM_B, TEST_SYM_A);
 
 	SOCKET(sd, PF_UNIX, SOCK_STREAM, 0);
@@ -301,6 +288,19 @@ static void test_bind_unix(void)
 	strncpy(addr.sun_path, TEST_SYM_A, sizeof(addr.sun_path) - 1);
 	errno = 0;
 	rc = bind(sd, (struct sockaddr *) &addr, sizeof(struct sockaddr_un));
+	if (!((rc == -1) && (errno == EADDRINUSE))) {
+		test_fail("bind() should have failed with EADDRINUSE");
+	}
+	CLOSE(sd);
+
+	SYMLINK(TEST_SYM_A, TEST_SYM_B);
+
+	SOCKET(sd, PF_UNIX, SOCK_STREAM, 0);
+
+	strncpy(addr.sun_path, TEST_SYM_A, sizeof(addr.sun_path) - 1);
+	strlcat(addr.sun_path, "/x", sizeof(addr.sun_path));
+	errno = 0;
+	rc = bind(sd, (struct sockaddr *) &addr, sizeof(struct sockaddr_un));
 	if (!((rc == -1) && (errno == ELOOP))) {
 		test_fail("bind() should have failed with ELOOP");
 	}
@@ -337,28 +337,49 @@ static void callback_xfer_prepclient(void) {
 }
 
 static void callback_xfer_peercred(int sd) {
-	struct uucred credentials;
+	struct unpcbid credentials;
 	int rc;
 	socklen_t ucred_length;
 
-	ucred_length = sizeof(struct uucred);
+	ucred_length = sizeof(credentials);
 
-	debug("Test passing the client credentials to the server");
+	debug("Test obtaining the peer credentials");
 
 	memset(&credentials, '\0', ucred_length);
-	rc = getsockopt(sd, SOL_SOCKET, SO_PEERCRED, &credentials,
-							&ucred_length);
+	rc = getsockopt(sd, 0, LOCAL_PEEREID, &credentials, &ucred_length);
 
 	if (rc == -1) {
 		test_fail("[client] getsockopt() failed");
-	}  else if (credentials.cr_uid != geteuid() ||
-					credentials.cr_gid != getegid()) {
-		printf("%d=%d=%d %d=%d=%d\n", credentials.cr_uid, getuid(),
-			geteuid(), credentials.cr_gid, getgid(), getegid());
+	} else if (credentials.unp_euid != geteuid() ||
+	    credentials.unp_egid != getegid()) {
+		printf("%d=* %d=%d %d=%d", credentials.unp_pid,
+		    credentials.unp_euid, geteuid(),
+		    credentials.unp_egid, getegid());
 		test_fail("[client] Credential passing gave us a bad UID/GID");
 	}
 }
 
+static void
+callback_set_listen_opt(int sd)
+{
+	int val;
+
+	/*
+	 * Several of the tests assume that a new connection to a server will
+	 * not be established (i.e., go from "connecting" to "connected" state)
+	 * until the server actually accepts the connection with an accept(2)
+	 * call.  With the new UDS implementation, this is no longer true: to
+	 * match the behavior of other systems, UDS now preemptively connects
+	 * the socket in anticipation of the accept(2) call.  We can change
+	 * back to the old behavior by setting LOCAL_CONNWAIT however, and
+	 * since the test effectively tests a larger set of socket transitions
+	 * that way, that is what we do for these tests.
+	 */
+	val = 1;
+	if (setsockopt(sd, 0, LOCAL_CONNWAIT, &val, sizeof(val)) != 0)
+		test_fail("setsockopt(LOCAL_CONNWAIT)");
+}
+
 static void test_vectorio(int type)
 {
 	int sv[2];
@@ -563,7 +584,11 @@ static void test_scm_credentials(void)
 	int rc;
 	int src;
 	int dst;
-	struct uucred cred;
+	int one;
+	union {
+		struct sockcred cred;
+		char buf[SOCKCREDSIZE(NGROUPS_MAX)];
+	} cred;
 	struct cmsghdr *cmsg = NULL;
 	struct sockaddr_un addr;
 	struct iovec iov[3];
@@ -573,7 +598,7 @@ static void test_scm_credentials(void)
 	char buf2[BUFSIZE];
 	char buf3[BUFSIZE];
 	char ctrl[BUFSIZE];
-	socklen_t addrlen = sizeof(struct sockaddr_un);
+	socklen_t len, addrlen = sizeof(struct sockaddr_un);
 
 	debug("test_scm_credentials");
 
@@ -615,6 +640,16 @@ static void test_scm_credentials(void)
 		test_fail("bind");
 	}
 
+	debug("request credential passing");
+
+	one = 1;
+	rc = setsockopt(dst, 0, LOCAL_CREDS, &one, sizeof(one));
+	if (rc == -1) {
+		test_fail("setsockopt(LOCAL_CREDS)");
+	}
+
+	debug("sending msg1");
+
 	memset(&buf1, '\0', BUFSIZE);
 	memset(&buf2, '\0', BUFSIZE);
 	memset(&buf3, '\0', BUFSIZE);
@@ -640,8 +675,6 @@ static void test_scm_credentials(void)
 	msg1.msg_controllen = 0;
 	msg1.msg_flags = 0;
 
-	debug("sending msg1");
-
 	rc = sendmsg(src, &msg1, 0);
 	if (rc == -1) {
 		test_fail("sendmsg");
@@ -684,27 +717,50 @@ static void test_scm_credentials(void)
 	 * because that is what is returned by recvmsg().
 	 */
 	if (addr.sun_family != AF_UNIX || strcmp(addr.sun_path,
-					fullpath(TEST_SUN_PATHB))) {
+					TEST_SUN_PATHB)) {
 		test_fail("recvmsg");
 	}
 
 	debug("looking for credentials");
 
-	memset(&cred, '\0', sizeof(struct uucred));
+	len = 0;
+
+	memset(&cred, 'x', sizeof(cred));
 	for (cmsg = CMSG_FIRSTHDR(&msg2); cmsg != NULL;
 					cmsg = CMSG_NXTHDR(&msg2, cmsg)) {
 
 		if (cmsg->cmsg_level == SOL_SOCKET &&
 				cmsg->cmsg_type == SCM_CREDS) {
+			/* CMSG_DATA() may skip alignment padding, and no
+			 * macro gives the data length, so compute it here.
+			 */
+			len = cmsg->cmsg_len - (socklen_t)
+			    ((char *)CMSG_DATA(cmsg) - (char *)cmsg);
 
-			memcpy(&cred, CMSG_DATA(cmsg), sizeof(struct uucred));
+			if (len < sizeof(struct sockcred))
+				test_fail("credentials too small");
+			else if (len > sizeof(cred))
+				test_fail("credentials too large");
+			memcpy(cred.buf, CMSG_DATA(cmsg), len);
 			break;
 		}
 	}
 
-	if (cred.cr_ngroups != 0 || cred.cr_uid != geteuid() ||
-						cred.cr_gid != getegid()) {
+	if (len == 0)
+		test_fail("no credentials found");
 
+	if (len != SOCKCREDSIZE(cred.cred.sc_ngroups))
+		test_fail("wrong credentials size");
+
+	/*
+	 * TODO: check supplementary groups.  This whole test is pretty much
+	 * pointless since we're running with very standard credentials anyway.
+	 */
+	if (cred.cred.sc_uid != getuid() ||
+	    cred.cred.sc_euid != geteuid() ||
+	    cred.cred.sc_gid != getgid() ||
+	    cred.cred.sc_egid != getegid() ||
+	    cred.cred.sc_ngroups < 0 || cred.cred.sc_ngroups > NGROUPS_MAX) {
 		test_fail("did no receive the proper credentials");
 	}
 
@@ -1384,22 +1440,18 @@ static void test_fchmod(void)
  * Test various aspects related to the socket files on the file system.
  * This subtest is woefully incomplete and currently only attempts to test
  * aspects that have recently been affected by code changes.  In the future,
- * there should be tests for path canonicalization and the entire range of file
- * system path and access related error codes (TODO).
+ * there should be tests for the entire range of file system path and access
+ * related error codes (TODO).
  */
 static void
 test_file(void)
 {
-	struct sockaddr_un addr;
-#if NOT_YET
-	struct sockaddr_un saddr, saddr2;
+	struct sockaddr_un addr, saddr, saddr2;
 	char buf[1];
 	socklen_t len;
 	struct stat st;
 	mode_t omask;
-	int, csd, fd;
-#endif
-	int sd, sd2;
+	int sd, sd2, csd, fd;
 
 	/*
 	 * If the provided socket path exists on the file system, the bind(2)
@@ -1426,7 +1478,6 @@ test_file(void)
 
 	CLOSE(sd);
 
-#if NOT_YET
 	if (bind(sd2, (struct sockaddr *)&addr, sizeof(addr)) != -1)
 		test_fail("Binding socket unexpectedly succeeded");
 	if (errno != EADDRINUSE)
@@ -1497,29 +1548,8 @@ test_file(void)
 	if (memcmp(&saddr, &saddr2, sizeof(saddr)))
 		test_fail("Unexpected old socket address");
 
-	/*
-	 * Currently, our implementation "hides" the old socket even if the new
-	 * socket is closed, but since this is not standard behavior and may be
-	 * changed later, we do not test for it.  However, in any case,
-	 * rebinding the hidden socket should make it "visible" again.
-	 */
-	strlcpy(saddr2.sun_path, TEST_SUN_PATHB, sizeof(saddr2.sun_path));
-	if (bind(sd, (struct sockaddr *)&saddr2, sizeof(saddr2)) != 0)
-		test_fail("Can't rebind socket");
-
-	memset(buf, 'Z', sizeof(buf));
-	if (sendto(csd, buf, sizeof(buf), 0, (struct sockaddr *)&saddr2,
-	    sizeof(saddr2)) != sizeof(buf))
-		test_fail("Can't send to socket");
-	if (recvfrom(sd, buf, sizeof(buf), 0, NULL, 0) != sizeof(buf))
-		test_fail("Can't receive from socket");
-	if (buf[0] != 'Z')
-		test_fail("Transmission failure");
-
 	if (unlink(TEST_SUN_PATH) != 0)
 		test_fail("Can't unlink socket");
-	if (unlink(TEST_SUN_PATHB) != 0)
-		test_fail("Can't unlink other socket");
 
 	CLOSE(sd);
 	CLOSE(sd2);
@@ -1580,7 +1610,6 @@ test_file(void)
 	UNLINK(TEST_SUN_PATH);
 
 	umask(omask);
-#endif
 
 	/*
 	 * Only socket(2), socketpair(2), and accept(2) may be used to obtain
@@ -1631,8 +1660,8 @@ int main(int argc, char *argv[])
 		.clientaddrsym            = (struct sockaddr *) &clientaddrsym,
 		.clientaddrsymlen         = sizeof(clientaddrsym),
 		.domain                   = PF_UNIX,
-		.expected_rcvbuf          = PIPE_BUF,
-		.expected_sndbuf          = PIPE_BUF,
+		.expected_rcvbuf          = 32768 - 5, /* no constants: */
+		.expected_sndbuf          = 32768 - 5, /* UDS internals */
 		.serveraddr               = (struct sockaddr *) &clientaddr,
 		.serveraddrlen            = sizeof(clientaddr),
 		.serveraddr2              = (struct sockaddr *) &clientaddr2,
@@ -1644,12 +1673,16 @@ int main(int argc, char *argv[])
 		.callback_cleanup         = callback_cleanup,
 		.callback_xfer_prepclient = callback_xfer_prepclient,
 		.callback_xfer_peercred   = callback_xfer_peercred,
+		.callback_set_listen_opt  = callback_set_listen_opt,
 	};
 
 	debug("entering main()");
 
 	start(56);
 
+	/* This test was written before UDS started supporting SIGPIPE. */
+	signal(SIGPIPE, SIG_IGN);
+
 	test_socket(&info);
 	test_bind(&info);
 	test_bind_unix();
diff --git a/minix/tests/test80.c b/minix/tests/test80.c
index 0975387aa..f6bc69ad9 100644
--- a/minix/tests/test80.c
+++ b/minix/tests/test80.c
@@ -96,7 +96,6 @@ int main(int argc, char *argv[])
 		.ignore_accept_delay       = 1,
 		.ignore_connect_unaccepted = 1,
 		.ignore_connect_delay      = 1,
-		.ignore_read_conn_reset    = 1,
 		.ignore_select_delay       = 1,
 		.ignore_send_waiting       = 1,
 		.ignore_write_conn_reset   = 1,
diff --git a/minix/tests/test81.c b/minix/tests/test81.c
index 1a3188cd6..f23c174d3 100644
--- a/minix/tests/test81.c
+++ b/minix/tests/test81.c
@@ -99,7 +99,6 @@ int main(int argc, char *argv[])
 		.ignore_accept_delay       = 1,
 		.ignore_connect_unaccepted = 1,
 		.ignore_connect_delay      = 1,
-		.ignore_read_conn_reset    = 1,
 		.ignore_select_delay       = 1,
 		.ignore_send_waiting       = 1,
 		.ignore_write_conn_reset   = 1,
diff --git a/minix/usr.bin/trace/ioctl/net.c b/minix/usr.bin/trace/ioctl/net.c
index 8d7591c80..1842d5e6e 100644
--- a/minix/usr.bin/trace/ioctl/net.c
+++ b/minix/usr.bin/trace/ioctl/net.c
@@ -189,6 +189,27 @@ static const struct flags udpopt_flags[] = {
 	FLAG(NWUO_DI_IPOPT),
 };
 
+static void
+put_struct_uucred(struct trace_proc * proc, const char * name, int flags,
+	vir_bytes addr)
+{
+	struct uucred cred;
+
+	if (!put_open_struct(proc, name, flags, addr, &cred, sizeof(cred)))
+		return;
+
+	put_value(proc, "cr_uid", "%u", cred.cr_uid);
+	if (verbose > 0) {
+		put_value(proc, "cr_gid", "%u", cred.cr_gid);
+		if (verbose > 1)
+			put_value(proc, "cr_ngroups", "%d", cred.cr_ngroups);
+		put_groups(proc, "cr_groups", PF_LOCADDR,
+		    (vir_bytes)&cred.cr_groups, cred.cr_ngroups);
+	}
+
+	put_close_struct(proc, verbose > 0);
+}
+
 static void
 put_msg_control(struct trace_proc * proc, struct msg_control * ptr)
 {
diff --git a/minix/usr.bin/trace/proto.h b/minix/usr.bin/trace/proto.h
index 11591edc1..e27637b2c 100644
--- a/minix/usr.bin/trace/proto.h
+++ b/minix/usr.bin/trace/proto.h
@@ -115,8 +115,6 @@ void put_dev(struct trace_proc *proc, const char *name, dev_t dev);
 void put_in_addr(struct trace_proc *proc, const char *name, struct in_addr in);
 void put_socket_type(struct trace_proc *proc, const char *name, int type);
 void put_socket_family(struct trace_proc *proc, const char *name, int family);
-void put_struct_uucred(struct trace_proc *proc, const char *name, int flags,
-	vir_bytes addr);
 void put_cmsg_type(struct trace_proc *proc, const char *name, int type);
 void put_shutdown_how(struct trace_proc *proc, const char *name, int how);
 
diff --git a/minix/usr.bin/trace/service/vfs.c b/minix/usr.bin/trace/service/vfs.c
index 93c0055fa..8163f4dd7 100644
--- a/minix/usr.bin/trace/service/vfs.c
+++ b/minix/usr.bin/trace/service/vfs.c
@@ -1802,25 +1802,32 @@ put_struct_iovec(struct trace_proc * proc, const char * name, int flags,
 	put_close(proc, "]");
 }
 
-void
-put_struct_uucred(struct trace_proc * proc, const char * name, int flags,
-	vir_bytes addr)
+static void
+put_struct_sockcred(struct trace_proc * proc, const char * name, int flags,
+	vir_bytes addr, size_t left)
 {
-	struct uucred cred;
+	struct sockcred sc;
 
-	if (!put_open_struct(proc, name, flags, addr, &cred, sizeof(cred)))
+	if (!put_open_struct(proc, name, flags, addr, &sc, sizeof(sc)))
 		return;
 
-	put_value(proc, "cr_uid", "%u", cred.cr_uid);
+	put_value(proc, "sc_uid", "%u", sc.sc_uid);
+	if (verbose > 0)
+		put_value(proc, "sc_euid", "%u", sc.sc_euid);
+	put_value(proc, "sc_gid", "%u", sc.sc_gid);
 	if (verbose > 0) {
-		put_value(proc, "cr_gid", "%u", cred.cr_gid);
+		put_value(proc, "sc_egid", "%u", sc.sc_egid);
 		if (verbose > 1)
-			put_value(proc, "cr_ngroups", "%d", cred.cr_ngroups);
-		put_groups(proc, "cr_groups", PF_LOCADDR,
-		    (vir_bytes)&cred.cr_groups, cred.cr_ngroups);
+			put_value(proc, "sc_ngroups", "%d", sc.sc_ngroups);
+		if (left >= sizeof(sc.sc_groups[0]) * (sc.sc_ngroups - 1)) {
+			put_groups(proc, "sc_groups", flags,
+			    addr + offsetof(struct sockcred, sc_groups),
+			    sc.sc_ngroups);
+		} else
+			put_field(proc, "sc_groups", "..");
 	}
 
-	put_close_struct(proc, verbose > 0);
+	put_close_struct(proc, verbose > 1);
 }
 
 static void
@@ -1907,7 +1914,7 @@ put_cmsg(struct trace_proc * proc, const char * name, vir_bytes addr,
 	size_t len)
 {
 	struct cmsghdr cmsg;
-	char buf[CMSG_SPACE(sizeof(struct uucred))];
+	char buf[CMSG_SPACE(sizeof(struct sockcred))];
 	size_t off, chunk, datalen;
 
 	if (valuesonly > 1 || addr == 0 || len < CMSG_LEN(0)) {
@@ -1960,10 +1967,11 @@ put_cmsg(struct trace_proc * proc, const char * name, vir_bytes addr,
 			    addr + off + chunk, datalen);
 		} else if (cmsg.cmsg_level == SOL_SOCKET &&
 		    cmsg.cmsg_type == SCM_CREDS &&
-		    datalen >= sizeof(struct uucred) &&
+		    datalen >= sizeof(struct sockcred) &&
 		    chunk >= CMSG_LEN(datalen)) {
-			put_struct_uucred(proc, "cmsg_data", PF_LOCADDR,
-			    (vir_bytes)&buf[CMSG_LEN(0)]);
+			put_struct_sockcred(proc, "cmsg_data", PF_LOCADDR,
+			    (vir_bytes)&buf[CMSG_LEN(0)],
+			    datalen - sizeof(struct sockcred));
 		} else if (datalen > 0)
 			put_field(proc, "cmsg_data", "..");
 
@@ -2129,8 +2137,6 @@ put_sockopt_name(struct trace_proc * proc, const char * name, int level,
 		TEXT(SO_REUSEPORT);
 		TEXT(SO_NOSIGPIPE);
 		TEXT(SO_TIMESTAMP);
-		TEXT(SO_PASSCRED);
-		TEXT(SO_PEERCRED);
 		TEXT(SO_SNDBUF);
 		TEXT(SO_RCVBUF);
 		TEXT(SO_SNDLOWAT);
@@ -2157,7 +2163,6 @@ put_sockopt_data(struct trace_proc * proc, const char * name, int flags,
 	const char *text;
 	int i;
 	struct linger l;
-	struct uucred cr;
 	struct timeval tv;
 	void *ptr;
 	size_t size;
@@ -2183,7 +2188,6 @@ put_sockopt_data(struct trace_proc * proc, const char * name, int flags,
 	case SO_REUSEPORT:
 	case SO_NOSIGPIPE:
 	case SO_TIMESTAMP:
-	case SO_PASSCRED:
 	case SO_SNDBUF:
 	case SO_RCVBUF:
 	case SO_SNDLOWAT:
@@ -2199,10 +2203,6 @@ put_sockopt_data(struct trace_proc * proc, const char * name, int flags,
 		ptr = &l;
 		size = sizeof(l);
 		break;
-	case SO_PEERCRED:
-		ptr = &cr;
-		size = sizeof(cr);
-		break;
 	case SO_SNDTIMEO:
 	case SO_RCVTIMEO:
 		ptr = &tv;
@@ -2229,9 +2229,6 @@ put_sockopt_data(struct trace_proc * proc, const char * name, int flags,
 		put_value(proc, "l_linger", "%d", l.l_linger);
 		put_close(proc, "}");
 		break;
-	case SO_PEERCRED:
-		put_struct_uucred(proc, name, PF_LOCADDR, (vir_bytes)&cr);
-		break;
 	case SO_ERROR:
 		put_open(proc, name, 0, "{", ", ");
 		if (!valuesonly && (text = get_error_name(i)) != NULL)
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index 219b90baf..d7340af64 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -133,12 +133,6 @@ typedef	_BSD_SSIZE_T_	ssize_t;
 #define	SO_ACCEPTFILTER	0x1000		/* there is an accept filter */
 #define	SO_TIMESTAMP	0x2000		/* timestamp received dgram traffic */
 
-#if defined(__minix) && defined(_MINIX_SYSTEM)
-/* Minixism which should go, so hide it from userland. */
-#define SO_PASSCRED    0x100000
-#define SO_PEERCRED    0x200000
-#endif /* defined(__minix) */
-
 /*
  * Additional options, not kept in so_options.
  */