/* Socket event dispatching library - by D.C. van Moolenbroek */
#include <minix/drivers.h>
#include <minix/sockdriver.h>
#include <minix/sockevent.h>
#include <sys/ioctl.h>
#include "sockevent_proc.h"
#define US 1000000UL /* microseconds per second */
#define SOCKHASH_SLOTS 256 /* # slots in ID-to-sock hash table */
static SLIST_HEAD(, sock) sockhash[SOCKHASH_SLOTS];
static SLIST_HEAD(, sock) socktimer;
static minix_timer_t sockevent_timer;
static SIMPLEQ_HEAD(, sock) sockevent_pending;
static sockevent_socket_cb_t sockevent_socket_cb = NULL;
static int sockevent_working;
static void socktimer_del(struct sock * sock);
static void sockevent_cancel_send(struct sock * sock,
struct sockevent_proc * spr, int err);
static void sockevent_cancel_recv(struct sock * sock,
struct sockevent_proc * spr, int err);
/*
* Initialize the hash table of sock objects.
*/
static void
sockhash_init(void)
{
unsigned int slot;
for (slot = 0; slot < __arraycount(sockhash); slot++)
SLIST_INIT(&sockhash[slot]);
}
/*
* Given a socket identifier, return a hash table slot number.
*/
static unsigned int
sockhash_slot(sockid_t id)
{
/*
* The idea of the shift is that a socket driver may offer multiple
* classes of sockets, and put the class in the higher bits. The shift
* aims to prevent all classes' first sockets from ending up in the same
* hash slot.
*/
return (id + (id >> 16)) % SOCKHASH_SLOTS;
}
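/*
 * Illustrative example (the ID layout is an assumption, not mandated by this
 * library): if a driver encodes the socket class in bits 16 and up, then
 * without the shift, the first socket of every class would land in slot 0.
 * With the shift:
 *
 *     sockhash_slot(0x00000000) == (0x00000000 + 0x0000) % 256 == 0
 *     sockhash_slot(0x00010000) == (0x00010000 + 0x0001) % 256 == 1
 *     sockhash_slot(0x00020000) == (0x00020000 + 0x0002) % 256 == 2
 */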
/*
* Obtain a sock object from the hash table using its unique identifier.
* Return a pointer to the object if found, or NULL otherwise.
*/
static struct sock *
sockhash_get(sockid_t id)
{
struct sock *sock;
unsigned int slot;
slot = sockhash_slot(id);
SLIST_FOREACH(sock, &sockhash[slot], sock_hash) {
if (sock->sock_id == id)
return sock;
}
return NULL;
}
/*
* Add a sock object to the hash table. The sock object must have a valid ID
* in its 'sock_id' field, and must not be in the hash table already.
*/
static void
sockhash_add(struct sock * sock)
{
unsigned int slot;
slot = sockhash_slot(sock->sock_id);
SLIST_INSERT_HEAD(&sockhash[slot], sock, sock_hash);
}
/*
* Remove a sock object from the hash table. The sock object must be in the
* hash table.
*/
static void
sockhash_del(struct sock * sock)
{
unsigned int slot;
slot = sockhash_slot(sock->sock_id);
/* This macro is O(n). */
SLIST_REMOVE(&sockhash[slot], sock, sock, sock_hash);
}
/*
* Reset a socket object to a proper initial state, with a particular socket
* identifier, a SOCK_ type, and a socket operations table. The socket is
* added to the ID-to-object hash table. This function always succeeds.
*/
static void
sockevent_reset(struct sock * sock, sockid_t id, int domain, int type,
const struct sockevent_ops * ops)
{
assert(sock != NULL);
memset(sock, 0, sizeof(*sock));
sock->sock_id = id;
sock->sock_domain = domain;
sock->sock_type = type;
sock->sock_slowat = 1;
sock->sock_rlowat = 1;
sock->sock_ops = ops;
sock->sock_proc = NULL;
sock->sock_select.ss_endpt = NONE;
sockhash_add(sock);
}
/*
* Initialize a new socket that will serve as an accepted socket on the given
* listening socket 'sock'. The new socket is given as 'newsock', and its new
* socket identifier is given as 'newid'. This function always succeeds.
*/
void
sockevent_clone(struct sock * sock, struct sock * newsock, sockid_t newid)
{
sockevent_reset(newsock, newid, (int)sock->sock_domain,
sock->sock_type, sock->sock_ops);
/* These are the settings that are currently inherited. */
newsock->sock_opt = sock->sock_opt & ~SO_ACCEPTCONN;
newsock->sock_linger = sock->sock_linger;
newsock->sock_stimeo = sock->sock_stimeo;
newsock->sock_rtimeo = sock->sock_rtimeo;
newsock->sock_slowat = sock->sock_slowat;
newsock->sock_rlowat = sock->sock_rlowat;
newsock->sock_flags |= SFL_CLONED;
}
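/*
 * Usage sketch (hypothetical driver code; my_conn_alloc() and my_conn_id()
 * are invented names): a driver that allocates the sock object for an
 * incoming connection before the accept call completes can hash it in early
 * with sockevent_clone(), and later just return its ID from its sop_accept
 * callback, leaving the 'newsock' output pointer NULL:
 *
 *     struct my_conn *conn = my_conn_alloc();
 *
 *     sockevent_clone(listener, &conn->mc_sock, my_conn_id(conn));
 *     ...
 *     // later, from within the sop_accept callback:
 *     return my_conn_id(conn);    // the 'newsock' output stays NULL
 */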
/*
* A new socket has just been accepted. The corresponding listening socket is
* given as 'sock'. The new socket has ID 'newid', and if it had not already
* been added to the hash table through sockevent_clone() before, 'newsock' is
* a non-NULL pointer which identifies the socket object to clone into.
*/
static void
sockevent_accepted(struct sock * sock, struct sock * newsock, sockid_t newid)
{
if (newsock == NULL) {
if ((newsock = sockhash_get(newid)) == NULL)
panic("libsockdriver: socket driver returned unknown "
"ID %d from accept callback", newid);
} else
sockevent_clone(sock, newsock, newid);
assert(newsock->sock_flags & SFL_CLONED);
newsock->sock_flags &= ~SFL_CLONED;
}
/*
* Allocate a sock object, by asking the socket driver for one. On success,
* return OK, with a pointer to the new object stored in 'sockp'. This new
* object has all its fields set to initial values, in part based on the given
* parameters. On failure, return an error code. Failure has two typical
* causes: either the given domain, type, protocol combination is not supported,
* or the socket driver is out of sockets (globally or for this combination).
*/
static int
sockevent_alloc(int domain, int type, int protocol, endpoint_t user_endpt,
struct sock ** sockp)
{
struct sock *sock;
const struct sockevent_ops *ops;
sockid_t r;
/*
* Verify that the given domain is sane. Unlike the type and protocol,
* the domain is already verified by VFS, so we do not limit ourselves
* here. The result is that we can store the domain in just a byte.
*/
if (domain < 0 || domain > UINT8_MAX)
return EAFNOSUPPORT;
/* Make sure that the library has actually been initialized. */
if (sockevent_socket_cb == NULL)
panic("libsockevent: not initialized");
sock = NULL;
ops = NULL;
/*
* Ask the socket driver to create a socket for the given combination
* of domain, type, and protocol. If so, let it return a new sock
* object, a unique socket identifier for that object, and an
* operations table for it.
*/
if ((r = sockevent_socket_cb(domain, type, protocol, user_endpt, &sock,
&ops)) < 0)
return r;
assert(sock != NULL);
assert(ops != NULL);
sockevent_reset(sock, r, domain, type, ops);
*sockp = sock;
return OK;
}
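/*
 * Sketch of a driver-side creation callback as invoked above (the callback is
 * registered at initialization time; the my_* names are invented, and the
 * signature is inferred from the call site):
 *
 *     static sockid_t
 *     my_socket(int domain, int type, int protocol, endpoint_t user_endpt,
 *         struct sock ** sockp, const struct sockevent_ops ** opsp)
 *     {
 *         struct my_sock *msock;
 *
 *         if (type != SOCK_DGRAM || protocol != 0)
 *             return EPROTONOSUPPORT;
 *         if ((msock = my_sock_alloc()) == NULL)
 *             return ENOBUFS;
 *         *sockp = &msock->ms_sock;
 *         *opsp = &my_sockevent_ops;
 *         return my_sock_id(msock);
 *     }
 *
 * Note that in MINIX system code, error constants such as ENOBUFS are
 * negative, which is why the '< 0' test above suffices.
 */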
/*
* Free a previously allocated sock object.
*/
static void
sockevent_free(struct sock * sock)
{
const struct sockevent_ops *ops;
assert(sock->sock_proc == NULL);
socktimer_del(sock);
sockhash_del(sock);
/*
* Invalidate the operations table on the socket, before freeing the
* socket. This allows us to detect cases where sockevent functions
* are called on sockets that have already been freed.
*/
ops = sock->sock_ops;
sock->sock_ops = NULL;
assert(ops != NULL);
assert(ops->sop_free != NULL);
ops->sop_free(sock);
}
/*
* Create a new socket.
*/
static sockid_t
sockevent_socket(int domain, int type, int protocol, endpoint_t user_endpt)
{
struct sock *sock;
int r;
if ((r = sockevent_alloc(domain, type, protocol, user_endpt,
&sock)) != OK)
return r;
return sock->sock_id;
}
/*
* Create a pair of connected sockets.
*/
static int
sockevent_socketpair(int domain, int type, int protocol, endpoint_t user_endpt,
sockid_t id[2])
{
struct sock *sock1, *sock2;
int r;
if ((r = sockevent_alloc(domain, type, protocol, user_endpt,
&sock1)) != OK)
return r;
/* Creating socket pairs is not always supported. */
if (sock1->sock_ops->sop_pair == NULL) {
sockevent_free(sock1);
return EOPNOTSUPP;
}
if ((r = sockevent_alloc(domain, type, protocol, user_endpt,
&sock2)) != OK) {
sockevent_free(sock1);
return r;
}
assert(sock1->sock_ops == sock2->sock_ops);
r = sock1->sock_ops->sop_pair(sock1, sock2, user_endpt);
if (r != OK) {
sockevent_free(sock2);
sockevent_free(sock1);
return r;
}
id[0] = sock1->sock_id;
id[1] = sock2->sock_id;
return OK;
}
/*
* A send request returned EPIPE. If desired, send a SIGPIPE signal to the
* user process that issued the request.
*/
static void
sockevent_sigpipe(struct sock * sock, endpoint_t user_endpt, int flags)
{
/*
* POSIX says that pipe signals should be generated for SOCK_STREAM
* sockets. Linux does just this; NetBSD raises signals for all socket
* types.
*/
if (sock->sock_type != SOCK_STREAM)
return;
/*
* Why would there be fewer than four ways to do the same thing?
* O_NOSIGPIPE, MSG_NOSIGNAL, SO_NOSIGPIPE, and of course blocking
* SIGPIPE. VFS already sets MSG_NOSIGNAL for calls on sockets with
* O_NOSIGPIPE. The fact that SO_NOSIGPIPE is a thing, is also the
* reason why we cannot let VFS handle signal generation altogether.
*/
if (flags & MSG_NOSIGNAL)
return;
if (sock->sock_opt & SO_NOSIGPIPE)
return;
/*
* Send a SIGPIPE signal to the user process. Unfortunately we cannot
* guarantee that the SIGPIPE reaches the user process before the send
* call returns. Usually, the scheduling priorities of system services
* are such that the signal is likely to arrive first anyway, but if
* timely arrival of the signal is required, a more fundamental change
* to the system would be needed.
*/
sys_kill(user_endpt, SIGPIPE);
}
/*
* Suspend a request without data, that is, a bind, connect, accept, or close
* request.
*/
static void
sockevent_suspend(struct sock * sock, unsigned int event,
const struct sockdriver_call * __restrict call, endpoint_t user_endpt)
{
struct sockevent_proc *spr, **sprp;
/* There is one slot for each process, so this should never fail. */
if ((spr = sockevent_proc_alloc()) == NULL)
panic("libsockevent: too many suspended processes");
spr->spr_next = NULL;
spr->spr_event = event;
spr->spr_timer = FALSE;
spr->spr_call = *call;
spr->spr_endpt = user_endpt;
/*
* Add the request to the tail of the queue. This operation is O(n),
* but the number of suspended requests per socket is expected to be
* low at all times.
*/
for (sprp = &sock->sock_proc; *sprp != NULL;
sprp = &(*sprp)->spr_next);
*sprp = spr;
}
/*
* Suspend a request with data, that is, a send or receive request.
*/
static void
sockevent_suspend_data(struct sock * sock, unsigned int event, int timer,
const struct sockdriver_call * __restrict call, endpoint_t user_endpt,
const struct sockdriver_data * __restrict data, size_t len, size_t off,
const struct sockdriver_data * __restrict ctl, socklen_t ctl_len,
socklen_t ctl_off, int flags, int rflags, clock_t time)
{
struct sockevent_proc *spr, **sprp;
/* There is one slot for each process, so this should never fail. */
if ((spr = sockevent_proc_alloc()) == NULL)
panic("libsockevent: too many suspended processes");
spr->spr_next = NULL;
spr->spr_event = event;
spr->spr_timer = timer;
spr->spr_call = *call;
spr->spr_endpt = user_endpt;
sockdriver_pack_data(&spr->spr_data, call, data, len);
spr->spr_datalen = len;
spr->spr_dataoff = off;
sockdriver_pack_data(&spr->spr_ctl, call, ctl, ctl_len);
spr->spr_ctllen = ctl_len;
spr->spr_ctloff = ctl_off;
spr->spr_flags = flags;
spr->spr_rflags = rflags;
spr->spr_time = time;
/*
* Add the request to the tail of the queue. This operation is O(n),
* but the number of suspended requests per socket is expected to be
* low at all times.
*/
for (sprp = &sock->sock_proc; *sprp != NULL;
sprp = &(*sprp)->spr_next);
*sprp = spr;
}
/*
* Return TRUE if there are any suspended requests on the given socket's queue
* that match any of the events in the given event mask, or FALSE otherwise.
*/
static int
sockevent_has_suspended(struct sock * sock, unsigned int mask)
{
struct sockevent_proc *spr;
for (spr = sock->sock_proc; spr != NULL; spr = spr->spr_next)
if (spr->spr_event & mask)
return TRUE;
return FALSE;
}
/*
* Check whether the given call is on the given socket's queue of suspended
* requests. If so, remove it from the queue and return a pointer to the
* suspension data structure. The caller is then responsible for freeing that
* data structure using sockevent_proc_free(). If the call was not found, the
* function returns NULL.
*/
static struct sockevent_proc *
sockevent_unsuspend(struct sock * sock, const struct sockdriver_call * call)
{
struct sockevent_proc *spr, **sprp;
/* Find the suspended request being canceled. */
for (sprp = &sock->sock_proc; (spr = *sprp) != NULL;
sprp = &spr->spr_next) {
if (spr->spr_call.sc_endpt == call->sc_endpt &&
spr->spr_call.sc_req == call->sc_req) {
/* Found; remove and return it. */
*sprp = spr->spr_next;
return spr;
}
}
return NULL;
}
/*
* Attempt to resume the given suspended request for the given socket object.
* Return TRUE if the suspended request has been fully resumed and can be
* removed from the queue of suspended requests, or FALSE if it has not been
* fully resumed and should stay on the queue. In the latter case, no
* resumption will be attempted for other suspended requests of the same type.
*/
static int
sockevent_resume(struct sock * sock, struct sockevent_proc * spr)
{
struct sock *newsock;
struct sockdriver_data data, ctl;
char addr[SOCKADDR_MAX];
socklen_t addr_len;
size_t len, min;
sockid_t r;
switch (spr->spr_event) {
case SEV_CONNECT:
/*
* If the connect call was suspended for the purpose of
* intercepting resumption, simply remove it from the queue.
*/
if (spr->spr_call.sc_endpt == NONE)
return TRUE;
/* FALLTHROUGH */
case SEV_BIND:
if ((r = sock->sock_err) != OK)
sock->sock_err = OK;
sockdriver_reply_generic(&spr->spr_call, r);
return TRUE;
case SEV_ACCEPT:
/*
* An accept call can only have been suspended on a socket that was in
* listening mode.
*/
assert(sock->sock_opt & SO_ACCEPTCONN);
addr_len = 0;
newsock = NULL;
/*
* This call is suspended, which implies that the call table
* pointer has already been tested to be non-NULL.
*/
if ((r = sock->sock_ops->sop_accept(sock,
(struct sockaddr *)&addr, &addr_len, spr->spr_endpt,
&newsock)) == SUSPEND)
return FALSE;
if (r >= 0) {
assert(addr_len <= sizeof(addr));
sockevent_accepted(sock, newsock, r);
}
sockdriver_reply_accept(&spr->spr_call, r,
(struct sockaddr *)&addr, addr_len);
return TRUE;
case SEV_SEND:
if (sock->sock_err != OK || (sock->sock_flags & SFL_SHUT_WR)) {
if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
r = (int)spr->spr_dataoff;
else if ((r = sock->sock_err) != OK)
sock->sock_err = OK;
else
r = EPIPE;
} else {
sockdriver_unpack_data(&data, &spr->spr_call,
&spr->spr_data, spr->spr_datalen);
sockdriver_unpack_data(&ctl, &spr->spr_call,
&spr->spr_ctl, spr->spr_ctllen);
len = spr->spr_datalen - spr->spr_dataoff;
min = sock->sock_slowat;
if (min > len)
min = len;
/*
* As mentioned elsewhere, we do not save the address
* upon suspension so we cannot supply it anymore here.
*/
r = sock->sock_ops->sop_send(sock, &data, len,
&spr->spr_dataoff, &ctl,
spr->spr_ctllen - spr->spr_ctloff,
&spr->spr_ctloff, NULL, 0, spr->spr_endpt,
spr->spr_flags, min);
assert(r <= 0);
if (r == SUSPEND)
return FALSE;
/*
* If an error occurred but some data were already
* sent, return the progress rather than the error.
* Note that if the socket driver detects an
* asynchronous error during the send, it itself must
* perform this check and call sockevent_set_error() as
* needed, to make sure the error does not get lost.
*/
if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
r = spr->spr_dataoff;
}
if (r == EPIPE)
sockevent_sigpipe(sock, spr->spr_endpt,
spr->spr_flags);
sockdriver_reply_generic(&spr->spr_call, r);
return TRUE;
case SEV_RECV:
addr_len = 0;
if (sock->sock_flags & SFL_SHUT_RD)
r = SOCKEVENT_EOF;
else {
len = spr->spr_datalen - spr->spr_dataoff;
if (sock->sock_err == OK) {
min = sock->sock_rlowat;
if (min > len)
min = len;
} else
min = 0;
sockdriver_unpack_data(&data, &spr->spr_call,
&spr->spr_data, spr->spr_datalen);
sockdriver_unpack_data(&ctl, &spr->spr_call,
&spr->spr_ctl, spr->spr_ctllen);
r = sock->sock_ops->sop_recv(sock, &data, len,
&spr->spr_dataoff, &ctl,
spr->spr_ctllen - spr->spr_ctloff,
&spr->spr_ctloff, (struct sockaddr *)&addr,
&addr_len, spr->spr_endpt, spr->spr_flags, min,
&spr->spr_rflags);
/*
* If the call remains suspended but a socket error is
* pending, return the pending socket error instead.
*/
if (r == SUSPEND) {
if (sock->sock_err == OK)
return FALSE;
r = SOCKEVENT_EOF;
}
assert(addr_len <= sizeof(addr));
}
/*
* If the receive call reported success, or if some data were
* already received, return the (partial) result. Otherwise,
* return a pending error if any, or otherwise a regular error
* or 0 for EOF.
*/
if (r == OK || spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
r = (int)spr->spr_dataoff;
else if (sock->sock_err != OK) {
r = sock->sock_err;
sock->sock_err = OK;
} else if (r == SOCKEVENT_EOF)
r = 0; /* EOF */
sockdriver_reply_recv(&spr->spr_call, r, spr->spr_ctloff,
(struct sockaddr *)&addr, addr_len, spr->spr_rflags);
return TRUE;
case SEV_CLOSE:
sockdriver_reply_generic(&spr->spr_call, OK);
return TRUE;
default:
panic("libsockevent: process suspended on unknown event 0x%x",
spr->spr_event);
}
}
/*
* Return TRUE if the given socket is ready for reading for a select call, or
* FALSE otherwise.
*/
static int
sockevent_test_readable(struct sock * sock)
{
int r;
/*
* The meaning of "ready-to-read" depends on whether the socket is a
* listening socket or not. For the former, it is a test on whether
* there are any new sockets to accept. However, shutdown flags take
* precedence in both cases.
*/
if (sock->sock_flags & SFL_SHUT_RD)
return TRUE;
if (sock->sock_err != OK)
return TRUE;
/*
* Depending on whether this is a listening-mode socket, test whether
* either accepts or receives would block.
*/
if (sock->sock_opt & SO_ACCEPTCONN) {
if (sock->sock_ops->sop_test_accept == NULL)
return TRUE;
r = sock->sock_ops->sop_test_accept(sock);
} else {
if (sock->sock_ops->sop_test_recv == NULL)
return TRUE;
r = sock->sock_ops->sop_test_recv(sock, sock->sock_rlowat,
NULL);
}
return (r != SUSPEND);
}
/*
* Return TRUE if the given socket is ready for writing for a select call, or
* FALSE otherwise.
*/
static int
sockevent_test_writable(struct sock * sock)
{
int r;
if (sock->sock_err != OK)
return TRUE;
if (sock->sock_flags & SFL_SHUT_WR)
return TRUE;
if (sock->sock_ops->sop_test_send == NULL)
return TRUE;
/*
* Test whether sends would block. The low send watermark is relevant
* for stream-type sockets only.
*/
r = sock->sock_ops->sop_test_send(sock, sock->sock_slowat);
return (r != SUSPEND);
}
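/*
 * Sketch of a matching driver-side sop_test_send callback (hypothetical;
 * assumes the driver embeds 'struct sock' as the first member of its own
 * socket structure, and my_sendbuf_space() is an invented helper): return
 * SUSPEND if fewer than 'min' bytes can be sent without blocking, OK
 * otherwise.
 *
 *     static int
 *     my_test_send(struct sock * sock, size_t min)
 *     {
 *         struct my_sock *msock = (struct my_sock *)sock;
 *
 *         if (my_sendbuf_space(msock) < min)
 *             return SUSPEND;
 *         return OK;
 *     }
 */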
/*
* Test whether any of the given select operations are ready on the given
* socket. Return the subset of ready operations; zero if none.
*/
static unsigned int
sockevent_test_select(struct sock * sock, unsigned int ops)
{
unsigned int ready_ops;
assert(!(ops & ~(SDEV_OP_RD | SDEV_OP_WR | SDEV_OP_ERR)));
/*
* We do not support the "bind in progress" case here. If a blocking
* bind call is in progress, the file descriptor should not be ready
* for either reading or writing. Currently, socket drivers will have
* to cover this case themselves. Otherwise we would have to check the
* queue of suspended calls, or create a custom flag for this.
*/
ready_ops = 0;
if ((ops & SDEV_OP_RD) && sockevent_test_readable(sock))
ready_ops |= SDEV_OP_RD;
if ((ops & SDEV_OP_WR) && sockevent_test_writable(sock))
ready_ops |= SDEV_OP_WR;
/* TODO: OOB receive support. */
return ready_ops;
}
/*
* Fire the given mask of events on the given socket object now.
*/
static void
sockevent_fire(struct sock * sock, unsigned int mask)
{
struct sockevent_proc *spr, **sprp;
unsigned int r, flag, ops;
/*
* A completed connection attempt (successful or not) also always
* implies that the socket becomes writable. For convenience we
* enforce this rule here, because it is easy to forget. Note that in
* any case, a suspended connect request should be the first in the
* list, so we do not risk returning 0 from a connect call as a result
* of sock_err getting eaten by another resumed call.
*/
if (mask & SEV_CONNECT)
mask |= SEV_SEND;
/*
* First try resuming regular system calls.
*/
for (sprp = &sock->sock_proc; (spr = *sprp) != NULL; ) {
flag = spr->spr_event;
if ((mask & flag) && sockevent_resume(sock, spr)) {
*sprp = spr->spr_next;
sockevent_proc_free(spr);
} else {
mask &= ~flag;
sprp = &spr->spr_next;
}
}
/*
* Then see if we can satisfy pending select queries.
*/
if ((mask & (SEV_ACCEPT | SEV_SEND | SEV_RECV)) &&
sock->sock_select.ss_endpt != NONE) {
assert(sock->sock_selops != 0);
/*
* Only retest select operations that, based on the given event
* mask, could possibly be satisfied now.
*/
ops = sock->sock_selops;
if (!(mask & (SEV_ACCEPT | SEV_RECV)))
ops &= ~SDEV_OP_RD;
if (!(mask & SEV_SEND))
ops &= ~SDEV_OP_WR;
if (!(0)) /* TODO: OOB receive support */
ops &= ~SDEV_OP_ERR;
/* Are there any operations to test? */
if (ops != 0) {
/* Test those operations. */
r = sockevent_test_select(sock, ops);
/* Were any satisfied? */
if (r != 0) {
/* Let the caller know. */
sockdriver_reply_select(&sock->sock_select,
sock->sock_id, r);
sock->sock_selops &= ~r;
/* Are there any saved operations left now? */
if (sock->sock_selops == 0)
sock->sock_select.ss_endpt = NONE;
}
}
}
/*
* Finally, a SEV_CLOSE event unconditionally frees the sock object.
* This event should be fired only for sockets that are either not yet,
* or not anymore, in use by userland.
*/
if (mask & SEV_CLOSE) {
assert(sock->sock_flags & (SFL_CLONED | SFL_CLOSING));
sockevent_free(sock);
}
}
/*
* Process all pending events. Events must still be blocked, so that if
* handling one event generates a new event, that event is handled from here
* rather than immediately.
*/
static void
sockevent_pump(void)
{
struct sock *sock;
unsigned int mask;
assert(sockevent_working);
while (!SIMPLEQ_EMPTY(&sockevent_pending)) {
sock = SIMPLEQ_FIRST(&sockevent_pending);
SIMPLEQ_REMOVE_HEAD(&sockevent_pending, sock_next);
mask = sock->sock_events;
assert(mask != 0);
sock->sock_events = 0;
sockevent_fire(sock, mask);
/*
* At this point, the sock object may already have been readded
* to the event list, or even be deallocated altogether.
*/
}
}
/*
* Return TRUE if any events are pending on any sockets, or FALSE otherwise.
*/
static int
sockevent_has_events(void)
{
return (!SIMPLEQ_EMPTY(&sockevent_pending));
}
/*
* Raise the given bitwise-OR'ed set of events on the given socket object.
* Depending on the context of the call, the events may or may not be
* processed immediately.
*/
void
sockevent_raise(struct sock * sock, unsigned int mask)
{
assert(sock->sock_ops != NULL);
/*
* Handle SEV_CLOSE first. This event must not be deferred, so as to
* let socket drivers recycle sock objects as they are needed. For
* example, a user-closed TCP socket may stay open to transmit the
* remainder of its send buffer, until the TCP driver runs out of
* sockets, in which case the connection is aborted. The driver would
* then raise SEV_CLOSE on the sock object so as to clean it up, and
* immediately reuse it afterward. If the close event were to be
* deferred, this immediate reuse would not be possible.
*
* The sop_free() callback routine may not raise new events, and thus,
* the state of 'sockevent_working' need not be checked or set here.
*/
if (mask & SEV_CLOSE) {
assert(mask == SEV_CLOSE);
sockevent_fire(sock, mask);
return;
}
/*
* If we are currently processing a socket message, store the event for
* later. If not, this call is not coming from inside libsockevent,
* and we must handle the event immediately.
*/
if (sockevent_working) {
assert(mask != 0);
assert(mask <= UCHAR_MAX); /* sock_events field size check */
if (sock->sock_events == 0)
SIMPLEQ_INSERT_TAIL(&sockevent_pending, sock,
sock_next);
sock->sock_events |= mask;
} else {
sockevent_working = TRUE;
sockevent_fire(sock, mask);
if (sockevent_has_events())
sockevent_pump();
sockevent_working = FALSE;
}
}
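/*
 * Typical use from a socket driver (sketch; my_data_arrived() and the queue
 * operation are hypothetical): when data arrive asynchronously, raise
 * SEV_RECV so that suspended receive calls and pending select queries on the
 * socket are retried.
 *
 *     static void
 *     my_data_arrived(struct my_sock * msock)
 *     {
 *         ... append the data to the socket's receive queue ...
 *
 *         sockevent_raise(&msock->ms_sock, SEV_RECV);
 *     }
 */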
/*
* Set a pending error on the socket object, and wake up any suspended
* operations that are affected by this.
*/
void
sockevent_set_error(struct sock * sock, int err)
{
assert(err < 0);
assert(sock->sock_ops != NULL);
/* If an error was set already, it will be overridden. */
sock->sock_err = err;
sockevent_raise(sock, SEV_BIND | SEV_CONNECT | SEV_SEND | SEV_RECV);
}
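/*
 * Example (hypothetical driver code): a driver that learns asynchronously
 * that a connection was reset would call
 *
 *     sockevent_set_error(&msock->ms_sock, ECONNRESET);
 *
 * which stores the (negative, in MINIX system code) error and immediately
 * wakes up any suspended calls that should now fail with it.
 */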
/*
* Initialize timer-related data structures.
*/
static void
socktimer_init(void)
{
SLIST_INIT(&socktimer);
init_timer(&sockevent_timer);
}
/*
* Check whether the given socket object has any suspended requests that have
* now expired. If so, cancel them. Also, if the socket object has any
* suspended requests with a timeout that has not yet expired, return the
* earliest (relative) timeout of all of them, or TMR_NEVER if no such requests
* are present.
*/
static clock_t
sockevent_expire(struct sock * sock, clock_t now)
{
struct sockevent_proc *spr, **sprp;
clock_t lowest, left;
int r;
/*
* First handle the case that the socket is closed. In this case,
* there may be a linger timer, although the socket may also simply
* still be on the timer list because of a request that had not yet timed
* out when the socket was closed.
*/
if (sock->sock_flags & SFL_CLOSING) {
/* Was there a linger timer and has it expired? */
if ((sock->sock_opt & SO_LINGER) &&
tmr_is_first(sock->sock_linger, now)) {
assert(sock->sock_ops->sop_close != NULL);
/*
* Whatever happens next, we must now resume the
* pending close operation, if it was not canceled
* earlier. As before, we return OK rather than the
* standardized EWOULDBLOCK, to ensure that the user
* process knows the file descriptor has been closed.
*/
if ((spr = sock->sock_proc) != NULL) {
assert(spr->spr_event == SEV_CLOSE);
assert(spr->spr_next == NULL);
sock->sock_proc = NULL;
sockdriver_reply_generic(&spr->spr_call, OK);
sockevent_proc_free(spr);
}
/*
* Tell the socket driver that closing the socket is
* now a bit more desired than the last time we asked.
*/
r = sock->sock_ops->sop_close(sock, TRUE /*force*/);
assert(r == OK || r == SUSPEND);
/*
* The linger timer fires once. After that, the socket
* driver is free to decide that it still will not
* close the socket. If it does, do not fire the
* linger timer again.
*/
if (r == SUSPEND)
sock->sock_opt &= ~SO_LINGER;
else
sockevent_free(sock);
}
return TMR_NEVER;
}
/*
* Then see if any send and/or receive requests have expired. Also see
* if there are any send and/or receive requests left that have not yet
* expired but do have a timeout, so that we can return the lowest of
* those timeouts.
*/
lowest = TMR_NEVER;
for (sprp = &sock->sock_proc; (spr = *sprp) != NULL; ) {
/* Skip requests without a timeout. */
if (spr->spr_timer == 0) {
sprp = &spr->spr_next;
continue;
}
assert(spr->spr_event == SEV_SEND ||
spr->spr_event == SEV_RECV);
/*
* If the request has expired, cancel it and remove it from the
* list. Otherwise, see if the request has the lowest number
* of ticks until its timeout so far.
*/
if (tmr_is_first(spr->spr_time, now)) {
*sprp = spr->spr_next;
if (spr->spr_event == SEV_SEND)
sockevent_cancel_send(sock, spr, EWOULDBLOCK);
else
sockevent_cancel_recv(sock, spr, EWOULDBLOCK);
sockevent_proc_free(spr);
} else {
left = spr->spr_time - now;
if (lowest == TMR_NEVER || lowest > left)
lowest = left;
sprp = &spr->spr_next;
}
}
return lowest;
}
/*
* The socket event alarm went off. Go through the set of socket objects with
* timers, and see if any of their requests have now expired. Set a new alarm
* as necessary.
*/
static void
socktimer_expire(int arg __unused)
{
SLIST_HEAD(, sock) oldtimer;
struct sock *sock, *tsock;
clock_t now, lowest, left;
int working;
/*
* This function may or may not be called from a context where we are
* already deferring events, so we have to cover both cases here.
*/
if ((working = sockevent_working) == FALSE)
sockevent_working = TRUE;
/* Start a new list. */
memcpy(&oldtimer, &socktimer, sizeof(oldtimer));
SLIST_INIT(&socktimer);
now = getticks();
lowest = TMR_NEVER;
/*
* Go through all sockets that have or had a request with a timeout,
* canceling any expired requests and building a new list of sockets
* that still have requests with timeouts as we go.
*/
SLIST_FOREACH_SAFE(sock, &oldtimer, sock_timer, tsock) {
assert(sock->sock_flags & SFL_TIMER);
sock->sock_flags &= ~SFL_TIMER;
left = sockevent_expire(sock, now);
/*
* The sock object may already have been deallocated now.
* If 'left' is TMR_NEVER, do not touch 'sock' anymore.
*/
if (left != TMR_NEVER) {
if (lowest == TMR_NEVER || lowest > left)
lowest = left;
SLIST_INSERT_HEAD(&socktimer, sock, sock_timer);
sock->sock_flags |= SFL_TIMER;
}
}
/* If there is a new lowest timeout at all, set a new timer. */
if (lowest != TMR_NEVER)
set_timer(&sockevent_timer, lowest, socktimer_expire, 0);
if (!working) {
/* If any new events were raised, process them now. */
if (sockevent_has_events())
sockevent_pump();
sockevent_working = FALSE;
}
}
/*
* Set a timer for the given (relative) number of clock ticks, adding the
* associated socket object to the set of socket objects with timers, if it was
* not already in that set. Set a new alarm if necessary, and return the
* absolute timeout for the timer. Since the timers list is maintained lazily,
* the caller need not take the object off the set if the call was canceled
* later; see also socktimer_del().
*/
static clock_t
socktimer_add(struct sock * sock, clock_t ticks)
{
clock_t now;
/*
* Relative time comparisons require that any two times are no more
* than half the comparison space (clock_t, unsigned long) apart.
*/
assert(ticks <= TMRDIFF_MAX);
/* If the socket was not already on the timers list, put it on. */
if (!(sock->sock_flags & SFL_TIMER)) {
SLIST_INSERT_HEAD(&socktimer, sock, sock_timer);
sock->sock_flags |= SFL_TIMER;
}
/*
* (Re)set the timer if either it was not running at all or this new
* timeout will occur sooner than the currently scheduled alarm. Note
* that setting a timer that was already set is allowed.
*/
now = getticks();
if (!tmr_is_set(&sockevent_timer) ||
tmr_is_first(now + ticks, tmr_exp_time(&sockevent_timer)))
set_timer(&sockevent_timer, ticks, socktimer_expire, 0);
/* Return the absolute timeout. */
return now + ticks;
}
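/*
 * Worked example (illustrative values): with sys_hz() == 100 and
 * getticks() == 5000, a two-second timeout gives socktimer_add(sock, 200)
 * == 5200. That absolute time is later compared against 'now' using
 * tmr_is_first(), which is wraparound-safe as long as the difference stays
 * within TMRDIFF_MAX.
 */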
/*
* Remove a socket object from the set of socket objects with timers. Since
* the timer list is maintained lazily, this needs to be done only right before
* the socket object is freed.
*/
static void
socktimer_del(struct sock * sock)
{
if (sock->sock_flags & SFL_TIMER) {
/* This macro is O(n). */
SLIST_REMOVE(&socktimer, sock, sock, sock_timer);
sock->sock_flags &= ~SFL_TIMER;
}
}
/*
* Bind a socket to a local address.
*/
static int
sockevent_bind(sockid_t id, const struct sockaddr * __restrict addr,
socklen_t addr_len, endpoint_t user_endpt,
const struct sockdriver_call * __restrict call)
{
struct sock *sock;
int r;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
if (sock->sock_ops->sop_bind == NULL)
return EOPNOTSUPP;
/* Binding a socket in listening mode is never supported. */
if (sock->sock_opt & SO_ACCEPTCONN)
return EINVAL;
r = sock->sock_ops->sop_bind(sock, addr, addr_len, user_endpt);
if (r == SUSPEND) {
if (call == NULL)
return EINPROGRESS;
sockevent_suspend(sock, SEV_BIND, call, user_endpt);
}
return r;
}
/*
* Connect a socket to a remote address.
*/
static int
sockevent_connect(sockid_t id, const struct sockaddr * __restrict addr,
socklen_t addr_len, endpoint_t user_endpt,
const struct sockdriver_call * call)
{
struct sockdriver_call fakecall;
struct sockevent_proc *spr;
struct sock *sock;
int r;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
if (sock->sock_ops->sop_connect == NULL)
return EOPNOTSUPP;
/* Connecting a socket in listening mode is never supported. */
if (sock->sock_opt & SO_ACCEPTCONN)
return EOPNOTSUPP;
/*
* The upcoming connect call may fire an accept event for which the
* handler may in turn fire a connect event on this socket. Since we
* delay event processing until after processing calls, this would
* create the problem that even if the connection is accepted right
* away, non-blocking connect requests would return EINPROGRESS. For
* UDS, this is undesirable behavior. To remedy this, we use a hack:
* we temporarily suspend the connect even if non-blocking, then
* process events, and then cancel the connect request again. If the
* connection was accepted immediately, the cancellation will have no
* effect, since the request has already been replied to. In order not
* to violate libsockdriver rules with this hack, we fabricate a fake
* 'call' object.
*/
r = sock->sock_ops->sop_connect(sock, addr, addr_len, user_endpt);
if (r == SUSPEND) {
if (call != NULL || sockevent_has_events()) {
if (call == NULL) {
fakecall.sc_endpt = NONE;
call = &fakecall;
}
assert(!sockevent_has_suspended(sock,
SEV_SEND | SEV_RECV));
sockevent_suspend(sock, SEV_CONNECT, call, user_endpt);
if (call == &fakecall) {
/* Process any pending events first now. */
sockevent_pump();
/*
* If the connect request has not been resumed
* yet now, we must remove it from the queue
* again, and return EINPROGRESS ourselves.
* Otherwise, return OK or a pending error.
*/
spr = sockevent_unsuspend(sock, call);
if (spr != NULL) {
sockevent_proc_free(spr);
r = EINPROGRESS;
} else if ((r = sock->sock_err) != OK)
sock->sock_err = OK;
}
} else
r = EINPROGRESS;
}
if (r == OK) {
/*
* A completed connection attempt also always implies that the
* socket becomes writable. For convenience we enforce this
* rule here, because it is easy to forget.
*/
sockevent_raise(sock, SEV_SEND);
}
return r;
}
/*
* Put a socket in listening mode.
*/
static int
sockevent_listen(sockid_t id, int backlog)
{
struct sock *sock;
int r;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
if (sock->sock_ops->sop_listen == NULL)
return EOPNOTSUPP;
/*
* Perform a general adjustment on the backlog value, applying the
* customary BSD "fudge factor" of 1.5x. Keep the value within bounds
* though. POSIX imposes that a negative backlog value is equal to a
* backlog value of zero. A backlog value of zero, in turn, may mean
* anything; we take it to be one. POSIX also imposes that all socket
* drivers accept up to at least SOMAXCONN connections on the queue.
*/
if (backlog < 0)
backlog = 0;
if (backlog < SOMAXCONN)
backlog += 1 + ((unsigned int)backlog >> 1);
if (backlog > SOMAXCONN)
backlog = SOMAXCONN;
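/*
 * Worked example of the adjustment above: a backlog of 0 becomes 1, 2
 * becomes 4, and 16 becomes 25; any adjusted value that exceeds SOMAXCONN
 * is clamped to SOMAXCONN.
 */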
r = sock->sock_ops->sop_listen(sock, backlog);
/*
* On success, the socket is now in listening mode. As part of that,
* a select(2) ready-to-read condition now indicates that a connection
* may be accepted on the socket, rather than that data may be read.
* Since libsockevent is responsible for this distinction, we keep
* track of the listening mode at this level. Conveniently, there is a
* socket option for this, which we support out of the box as a result.
*/
if (r == OK) {
sock->sock_opt |= SO_ACCEPTCONN;
/*
* For the extremely unlikely case that right after the socket
* is put into listening mode, it has a connection ready to
* accept, we retest blocked ready-to-read select queries now.
*/
sockevent_raise(sock, SEV_ACCEPT);
}
return r;
}
/*
* Accept a connection on a listening socket, creating a new socket.
*/
static sockid_t
sockevent_accept(sockid_t id, struct sockaddr * __restrict addr,
socklen_t * __restrict addr_len, endpoint_t user_endpt,
const struct sockdriver_call * __restrict call)
{
struct sock *sock, *newsock;
sockid_t r;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
if (sock->sock_ops->sop_accept == NULL)
return EOPNOTSUPP;
/*
* Attempt to accept a connection. The socket driver is responsible
* for allocating a sock object (and identifier) on success. It may
* already have done so before, in which case it should leave 'newsock'
* set to NULL; otherwise, the returned sock object is cloned from
* the listening socket. The socket driver is also responsible for
* failing the call if the socket is not in listening mode, because it
* must specify the error to return: EOPNOTSUPP or EINVAL.
*/
newsock = NULL;
if ((r = sock->sock_ops->sop_accept(sock, addr, addr_len, user_endpt,
&newsock)) == SUSPEND) {
assert(sock->sock_opt & SO_ACCEPTCONN);
if (call == NULL)
return EWOULDBLOCK;
sockevent_suspend(sock, SEV_ACCEPT, call, user_endpt);
return SUSPEND;
}
if (r >= 0)
sockevent_accepted(sock, newsock, r);
return r;
}
/*
* Send regular and/or control data.
*/
static int
sockevent_send(sockid_t id, const struct sockdriver_data * __restrict data,
size_t len, const struct sockdriver_data * __restrict ctl_data,
socklen_t ctl_len, const struct sockaddr * __restrict addr,
socklen_t addr_len, endpoint_t user_endpt, int flags,
const struct sockdriver_call * __restrict call)
{
struct sock *sock;
clock_t time;
size_t min, off;
socklen_t ctl_off;
int r, timer;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
/*
* The order of the following checks is not necessarily fixed, and may
* be changed later. As far as applicable, they should match the order
* of the checks during call resumption, though.
*/
if ((r = sock->sock_err) != OK) {
sock->sock_err = OK;
return r;
}
if (sock->sock_flags & SFL_SHUT_WR) {
sockevent_sigpipe(sock, user_endpt, flags);
return EPIPE;
}
/*
* Translate the sticky SO_DONTROUTE option to a per-request
* MSG_DONTROUTE flag. This achieves two purposes: socket drivers have
* to check only one flag, and socket drivers that do not support the
* flag will fail send requests in a consistent way.
*/
if (sock->sock_opt & SO_DONTROUTE)
flags |= MSG_DONTROUTE;
/*
* Check if this is a valid send request as far as the socket driver is
* concerned. We do this separately from sop_send for the reason that
* this send request may immediately be queued behind other pending
* send requests (without a call to sop_send), which means even invalid
* requests would be queued and not return failure until much later.
*/
if (sock->sock_ops->sop_pre_send != NULL &&
(r = sock->sock_ops->sop_pre_send(sock, len, ctl_len, addr,
addr_len, user_endpt,
flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL))) != OK)
return r;
if (sock->sock_ops->sop_send == NULL)
return EOPNOTSUPP;
off = 0;
ctl_off = 0;
/*
* Sending out-of-band data is treated differently from regular data:
*
* - sop_send is called immediately, even if a partial non-OOB send
* operation is currently suspended (TODO: it may have to be aborted
* in order to maintain atomicity guarantees - that should be easy);
* - sop_send must not return SUSPEND; instead, if it cannot process
* the OOB data immediately, it must return an appropriate error;
* - the send low watermark is ignored.
*
* Given that none of the current socket drivers support OOB data at
* all, more sophisticated approaches would have no added value now.
*/
if (flags & MSG_OOB) {
r = sock->sock_ops->sop_send(sock, data, len, &off, ctl_data,
ctl_len, &ctl_off, addr, addr_len, user_endpt, flags, 0);
if (r == SUSPEND)
panic("libsockevent: MSG_OOB send calls may not be "
"suspended");
return (r == OK) ? (int)off : r;
}
/*
* Only call the actual sop_send function now if no other send calls
* are suspended already.
*
* Call sop_send with 'min' set to the minimum of the request size and
* the socket's send low water mark, but only if the call is non-
* blocking. For stream-oriented sockets, this should have the effect
* that non-blocking calls fail with EWOULDBLOCK if not at least that
* much can be sent immediately. For consistency, we choose to apply
* the same threshold to blocking calls. For datagram-oriented
* sockets, the minimum is not a factor to be considered.
*/
if (!sockevent_has_suspended(sock, SEV_SEND)) {
min = sock->sock_slowat;
if (min > len)
min = len;
r = sock->sock_ops->sop_send(sock, data, len, &off, ctl_data,
ctl_len, &ctl_off, addr, addr_len, user_endpt, flags, min);
} else
r = SUSPEND;
if (r == SUSPEND) {
/*
* We do not store the target's address on suspension, because
* that would add significantly to the per-process suspension
* state. As a result, we disallow socket drivers from
* suspending send calls with addresses, because we would no
* longer have the address for proper call resumption.
* However, we do not know here whether the socket is in
* connection-oriented mode; if it is, the address is to be
* ignored altogether. Therefore, there is no test on 'addr'
* here. Resumed calls will get a NULL address pointer, and
* the socket driver is expected to do the right thing.
*/
/*
* For non-blocking socket calls, return an error only if we
* were not able to send anything at all. If only control data
* were sent, the return value is therefore zero.
*/
if (call != NULL) {
if (sock->sock_stimeo != 0) {
timer = TRUE;
time = socktimer_add(sock, sock->sock_stimeo);
} else {
timer = FALSE;
time = 0;
}
sockevent_suspend_data(sock, SEV_SEND, timer, call,
user_endpt, data, len, off, ctl_data, ctl_len,
ctl_off, flags, 0, time);
} else
r = (off > 0 || ctl_off > 0) ? OK : EWOULDBLOCK;
} else if (r == EPIPE)
sockevent_sigpipe(sock, user_endpt, flags);
return (r == OK) ? (int)off : r;
}
/*
* The inner part of the receive request handler. An error returned from here
* may be overridden by an error pending on the socket, although data returned
* from here trumps such pending errors.
*/
static int
sockevent_recv_inner(struct sock * sock,
const struct sockdriver_data * __restrict data,
size_t len, size_t * __restrict off,
const struct sockdriver_data * __restrict ctl_data,
socklen_t ctl_len, socklen_t * __restrict ctl_off,
struct sockaddr * __restrict addr,
socklen_t * __restrict addr_len, endpoint_t user_endpt,
int * __restrict flags, const struct sockdriver_call * __restrict call)
{
clock_t time;
size_t min;
int r, oob, inflags, timer;
/*
* Check if this is a valid receive request as far as the socket driver
* is concerned. We do this separately from sop_recv for the reason
* that this receive request may immediately be queued behind other
* pending receive requests (without a call to sop_recv), which means
* even invalid requests would be queued and not return failure until
* much later.
*/
inflags = *flags;
*flags = 0;
if (sock->sock_ops->sop_pre_recv != NULL &&
(r = sock->sock_ops->sop_pre_recv(sock, user_endpt,
inflags & ~(MSG_DONTWAIT | MSG_NOSIGNAL))) != OK)
return r;
/*
* The order of the following checks is not necessarily fixed, and may
* be changed later. As far as applicable, they should match the order
* of the checks during call resumption, though.
*/
if (sock->sock_flags & SFL_SHUT_RD)
return SOCKEVENT_EOF;
if (sock->sock_ops->sop_recv == NULL)
return EOPNOTSUPP;
/*
* Receiving out-of-band data is treated differently from regular data:
*
* - sop_recv is called immediately, even if a partial non-OOB receive
* operation is currently suspended (TODO: it may have to be aborted
* in order to maintain atomicity guarantees - that should be easy);
* - sop_recv must not return SUSPEND; instead, if it cannot return any
* of the OOB data immediately, it must return an appropriate error;
* - the receive low watermark is ignored.
*
* Given that none of the current socket drivers support OOB data at
* all, more sophisticated approaches would have no added value now.
*/
oob = (inflags & MSG_OOB);
if (oob && (sock->sock_opt & SO_OOBINLINE))
return EINVAL;
/*
* Only call the actual sop_recv function now if no other receive
* calls are suspended already.
*
* Call sop_recv with 'min' set to the minimum of the request size and
* the socket's receive low water mark, unless there is a pending
* error. As a result, blocking calls will block, and non-blocking
* calls will yield EWOULDBLOCK, if less than that much can be received,
* unless another condition (EOF or that pending error) prevents more
* from being received anyway. For datagram-oriented sockets, the
* minimum is not a factor to be considered.
*/
if (oob || !sockevent_has_suspended(sock, SEV_RECV)) {
if (!oob && sock->sock_err == OK) {
min = sock->sock_rlowat;
if (min > len)
min = len;
} else
min = 0; /* receive even no-data segments */
r = sock->sock_ops->sop_recv(sock, data, len, off, ctl_data,
ctl_len, ctl_off, addr, addr_len, user_endpt, inflags, min,
flags);
} else
r = SUSPEND;
assert(r <= 0 || r == SOCKEVENT_EOF);
if (r == SUSPEND) {
if (oob)
panic("libsockevent: MSG_OOB receive calls may not be "
"suspended");
/*
* For non-blocking socket calls, return EWOULDBLOCK only if we
* did not receive anything at all. If only control data were
* received, the return value is therefore zero. Suspension
* implies that there is nothing to read. For the purpose of
* the calling wrapper function, never suspend a call when
* there is a pending error.
*/
if (call != NULL && sock->sock_err == OK) {
if (sock->sock_rtimeo != 0) {
timer = TRUE;
time = socktimer_add(sock, sock->sock_rtimeo);
} else {
timer = FALSE;
time = 0;
}
sockevent_suspend_data(sock, SEV_RECV, timer, call,
user_endpt, data, len, *off, ctl_data,
ctl_len, *ctl_off, inflags, *flags, time);
} else
r = EWOULDBLOCK;
}
return r;
}
/*
* Receive regular and/or control data.
*/
static int
sockevent_recv(sockid_t id, const struct sockdriver_data * __restrict data,
size_t len, const struct sockdriver_data * __restrict ctl_data,
socklen_t * __restrict ctl_len, struct sockaddr * __restrict addr,
socklen_t * __restrict addr_len, endpoint_t user_endpt,
int * __restrict flags, const struct sockdriver_call * __restrict call)
{
struct sock *sock;
size_t off;
socklen_t ctl_inlen;
int r;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
/*
* This function is a wrapper around the actual receive functionality.
* The reason for this is that receiving data should take precedence
* over a pending socket error, while a pending socket error should
* take precedence over both regular errors as well as EOF. In other
* words: if there is a pending error, we must try to receive anything
* at all; if receiving does not work, we must fail the call with the
* pending error. However, until we call the receive callback, we have
* no way of telling whether any data can be received. So we must try
* that before we can decide whether to return a pending error.
*/
off = 0;
ctl_inlen = *ctl_len;
*ctl_len = 0;
/*
* Attempt to perform the actual receive call.
*/
r = sockevent_recv_inner(sock, data, len, &off, ctl_data, ctl_inlen,
ctl_len, addr, addr_len, user_endpt, flags, call);
/*
* If the receive request succeeded, or it failed but yielded a partial
* result, then return the (partial) result. Otherwise, if an error is
* pending, return that error. Otherwise, return either a regular
* error or 0 for EOF.
*/
if (r == OK || (r != SUSPEND && (off > 0 || *ctl_len > 0)))
r = (int)off;
else if (sock->sock_err != OK) {
assert(r != SUSPEND);
r = sock->sock_err;
sock->sock_err = OK;
} else if (r == SOCKEVENT_EOF)
r = 0;
return r;
}
/*
* Process an I/O control call.
*/
static int
sockevent_ioctl(sockid_t id, unsigned long request,
const struct sockdriver_data * __restrict data, endpoint_t user_endpt,
const struct sockdriver_call * __restrict call __unused)
{
struct sock *sock;
size_t size;
int r, val;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
/* We handle a very small subset of generic IOCTLs here. */
switch (request) {
case FIONREAD:
size = 0;
if (!(sock->sock_flags & SFL_SHUT_RD) &&
sock->sock_ops->sop_test_recv != NULL)
(void)sock->sock_ops->sop_test_recv(sock, 0, &size);
val = (int)size;
return sockdriver_copyout(data, 0, &val, sizeof(val));
}
if (sock->sock_ops->sop_ioctl == NULL)
return ENOTTY;
r = sock->sock_ops->sop_ioctl(sock, request, data, user_endpt);
/*
* Suspending IOCTL requests is not currently supported by this
* library, even though the VFS protocol and libsockdriver do support
* it. The reason is that IOCTLs do not match our process suspension
* model: they can be neither queued nor repeated. For now, it seems
* that this feature is not needed by the socket drivers either. Thus,
* even though there are possible solutions, we defer implementing them
* until we know what exactly is needed.
*/
if (r == SUSPEND)
panic("libsockevent: socket driver suspended IOCTL 0x%lx",
request);
return r;
}
/*
* Set socket options.
*/
static int
sockevent_setsockopt(sockid_t id, int level, int name,
const struct sockdriver_data * data, socklen_t len)
{
struct sock *sock;
struct linger linger;
struct timeval tv;
clock_t secs, ticks;
int r, val;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
if (level == SOL_SOCKET) {
/*
* Handle a subset of the socket-level options here. For most
* of them, this means that the socket driver itself need not
* handle changing or returning the options, but still needs to
* implement the correct behavior based on them where needed.
* A few of them are handled exclusively in this library:
* SO_ACCEPTCONN, SO_NOSIGPIPE, SO_ERROR, SO_TYPE, SO_LINGER,
* SO_SNDLOWAT, SO_RCVLOWAT, SO_SNDTIMEO, and SO_RCVTIMEO.
* The SO_USELOOPBACK option is explicitly absent, as it is
* valid for routing sockets only and is set by default there.
*/
switch (name) {
case SO_DEBUG:
case SO_REUSEADDR:
case SO_KEEPALIVE:
case SO_DONTROUTE:
case SO_BROADCAST:
case SO_OOBINLINE:
case SO_REUSEPORT:
case SO_NOSIGPIPE:
case SO_TIMESTAMP:
/*
* Simple on-off options. Changing them does not
* involve the socket driver.
*/
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val)
sock->sock_opt |= (unsigned int)name;
else
sock->sock_opt &= ~(unsigned int)name;
/*
* In principle these on-off options are maintained in
* this library, but some socket drivers may need to
* apply the options elsewhere, so we notify them that
* something has changed. Using the sop_setsockopt
* callback would be inconvenient for this for two
* reasons: multiple value copy-ins and default errors.
*/
if (sock->sock_ops->sop_setsockmask != NULL)
sock->sock_ops->sop_setsockmask(sock,
sock->sock_opt);
/*
* The inlining of OOB data may make new data available
* through regular receive calls. Thus, see if we can
* wake up any suspended receive calls now.
*/
if (name == SO_OOBINLINE && val)
sockevent_raise(sock, SEV_RECV);
return OK;
case SO_LINGER:
/* The only on-off option with an associated value. */
if ((r = sockdriver_copyin_opt(data, &linger,
sizeof(linger), len)) != OK)
return r;
if (linger.l_onoff) {
if (linger.l_linger < 0)
return EINVAL;
/* EDOM is the closest applicable error. */
secs = (clock_t)linger.l_linger;
if (secs >= TMRDIFF_MAX / sys_hz())
return EDOM;
sock->sock_opt |= SO_LINGER;
sock->sock_linger = secs * sys_hz();
} else {
sock->sock_opt &= ~SO_LINGER;
sock->sock_linger = 0;
}
return OK;
case SO_SNDLOWAT:
case SO_RCVLOWAT:
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val <= 0)
return EINVAL;
/*
* Setting these values may allow suspended operations
* (send, recv, select) to be resumed, so recheck.
*/
if (name == SO_SNDLOWAT) {
sock->sock_slowat = (size_t)val;
sockevent_raise(sock, SEV_SEND);
} else {
sock->sock_rlowat = (size_t)val;
sockevent_raise(sock, SEV_RECV);
}
return OK;
case SO_SNDTIMEO:
case SO_RCVTIMEO:
if ((r = sockdriver_copyin_opt(data, &tv, sizeof(tv),
len)) != OK)
return r;
if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
(unsigned long)tv.tv_usec >= US)
return EINVAL;
if (tv.tv_sec >= TMRDIFF_MAX / sys_hz())
return EDOM;
ticks = tv.tv_sec * sys_hz() +
(tv.tv_usec * sys_hz() + US - 1) / US;
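/*
 * Example: with sys_hz() == 100, tv = { 2, 500000 } yields
 * 2 * 100 + (500000 * 100 + 999999) / 1000000 == 250 ticks;
 * the microsecond part is rounded up to whole ticks.
 */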
if (name == SO_SNDTIMEO)
sock->sock_stimeo = ticks;
else
sock->sock_rtimeo = ticks;
/*
* The timeouts for any calls already in progress for
* this socket are left as is.
*/
return OK;
case SO_ACCEPTCONN:
case SO_ERROR:
case SO_TYPE:
/* These options may be retrieved but not set. */
return ENOPROTOOPT;
default:
/*
* The remaining options either cannot be handled in a
* generic way, or are not recognized altogether. Pass
* them to the socket driver, which should handle what
* it knows and reject the rest.
*/
break;
}
}
if (sock->sock_ops->sop_setsockopt == NULL)
return ENOPROTOOPT;
/*
* The socket driver must return ENOPROTOOPT for all options it does
* not recognize.
*/
return sock->sock_ops->sop_setsockopt(sock, level, name, data, len);
}
/*
* Retrieve socket options.
*/
static int
sockevent_getsockopt(sockid_t id, int level, int name,
const struct sockdriver_data * __restrict data,
socklen_t * __restrict len)
{
struct sock *sock;
struct linger linger;
struct timeval tv;
clock_t ticks;
int val;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
if (level == SOL_SOCKET) {
/*
* As with setting, handle a subset of the socket-level options
* here. The rest is to be taken care of by the socket driver.
*/
switch (name) {
case SO_DEBUG:
case SO_ACCEPTCONN:
case SO_REUSEADDR:
case SO_KEEPALIVE:
case SO_DONTROUTE:
case SO_BROADCAST:
case SO_OOBINLINE:
case SO_REUSEPORT:
case SO_NOSIGPIPE:
case SO_TIMESTAMP:
val = !!(sock->sock_opt & (unsigned int)name);
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case SO_LINGER:
linger.l_onoff = !!(sock->sock_opt & SO_LINGER);
linger.l_linger = sock->sock_linger / sys_hz();
return sockdriver_copyout_opt(data, &linger,
sizeof(linger), len);
case SO_ERROR:
if ((val = -sock->sock_err) != OK)
sock->sock_err = OK;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case SO_TYPE:
val = sock->sock_type;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case SO_SNDLOWAT:
val = (int)sock->sock_slowat;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case SO_RCVLOWAT:
val = (int)sock->sock_rlowat;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case SO_SNDTIMEO:
case SO_RCVTIMEO:
if (name == SO_SNDTIMEO)
ticks = sock->sock_stimeo;
else
ticks = sock->sock_rtimeo;
tv.tv_sec = ticks / sys_hz();
tv.tv_usec = (ticks % sys_hz()) * US / sys_hz();
return sockdriver_copyout_opt(data, &tv, sizeof(tv),
len);
default:
break;
}
}
if (sock->sock_ops->sop_getsockopt == NULL)
return ENOPROTOOPT;
/*
* The socket driver must return ENOPROTOOPT for all options it does
* not recognize.
*/
return sock->sock_ops->sop_getsockopt(sock, level, name, data, len);
}
/*
* Retrieve a socket's local address.
*/
static int
sockevent_getsockname(sockid_t id, struct sockaddr * __restrict addr,
socklen_t * __restrict addr_len)
{
struct sock *sock;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
if (sock->sock_ops->sop_getsockname == NULL)
return EOPNOTSUPP;
return sock->sock_ops->sop_getsockname(sock, addr, addr_len);
}
/*
* Retrieve a socket's remote address.
*/
static int
sockevent_getpeername(sockid_t id, struct sockaddr * __restrict addr,
socklen_t * __restrict addr_len)
{
struct sock *sock;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
/* Listening-mode sockets cannot possibly have a peer address. */
if (sock->sock_opt & SO_ACCEPTCONN)
return ENOTCONN;
if (sock->sock_ops->sop_getpeername == NULL)
return EOPNOTSUPP;
return sock->sock_ops->sop_getpeername(sock, addr, addr_len);
}
/*
* Mark the socket object as shut down for sending and/or receiving. The flags
* parameter may be a bitwise-OR'ed combination of SFL_SHUT_RD and SFL_SHUT_WR.
* This function will wake up any suspended requests affected by this change,
* but it will not invoke the sop_shutdown() callback function on the socket.
* The function may in fact be called from sop_shutdown() before completion to
* mark the socket as shut down as reflected by sockevent_is_shutdown().
*/
void
sockevent_set_shutdown(struct sock * sock, unsigned int flags)
{
unsigned int mask;
assert(sock->sock_ops != NULL);
assert(!(flags & ~(SFL_SHUT_RD | SFL_SHUT_WR)));
/* Look at the newly set flags only. */
flags &= ~(unsigned int)sock->sock_flags;
if (flags != 0) {
sock->sock_flags |= flags;
/*
* Wake up any blocked calls that are affected by the shutdown.
* Shutting down listening sockets causes ongoing accept calls
* to be rechecked.
*/
mask = 0;
if (flags & SFL_SHUT_RD)
mask |= SEV_RECV;
if (flags & SFL_SHUT_WR)
mask |= SEV_SEND;
if (sock->sock_opt & SO_ACCEPTCONN)
mask |= SEV_ACCEPT;
assert(mask != 0);
sockevent_raise(sock, mask);
}
}
/*
* Shut down socket send and receive operations.
*/
static int
sockevent_shutdown(sockid_t id, int how)
{
struct sock *sock;
unsigned int flags;
int r;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
/* Convert the request to a set of flags. */
flags = 0;
if (how == SHUT_RD || how == SHUT_RDWR)
flags |= SFL_SHUT_RD;
if (how == SHUT_WR || how == SHUT_RDWR)
flags |= SFL_SHUT_WR;
if (sock->sock_ops->sop_shutdown != NULL)
r = sock->sock_ops->sop_shutdown(sock, flags);
else
r = OK;
/* On success, update our internal state as well. */
if (r == OK)
sockevent_set_shutdown(sock, flags);
return r;
}
/*
* Close a socket.
*/
static int
sockevent_close(sockid_t id, const struct sockdriver_call * call)
{
struct sock *sock;
int r, force;
if ((sock = sockhash_get(id)) == NULL)
return EINVAL;
assert(sock->sock_proc == NULL);
sock->sock_select.ss_endpt = NONE;
/*
* There are several scenarios when it comes to closing sockets. First
* of all, we never actually force the socket driver to close a socket.
* The driver may always suspend the close call and take as long as it
* wants. After a suspension, it signals its completion of the close
* through the SEV_CLOSE socket event.
*
* With that said, we offer two levels of urgency regarding the close
* request: regular and forced. The former allows for a graceful
* close; the latter urges the socket driver to close the socket as
* soon as possible. A socket that has been requested to be closed
* gracefully can, as long as it is still open (i.e., no SEV_CLOSE was
* fired yet), later be requested to be closed forcefully. This is how
* SO_LINGER with a nonzero timeout is implemented. If SO_LINGER is
* set with a zero timeout, the socket is force-closed immediately.
* Finally, if SO_LINGER is not set, the socket will be closed normally
* and never be forced--akin to SO_LINGER with an infinite timeout.
*
* The return value of the caller's close(2) may only ever be either
* OK or EINPROGRESS, to ensure that the caller knows that the file
* descriptor is freed up, as per Austin Group Defect #529. In fact,
* EINPROGRESS is to be returned only on signal interruption (i.e.,
* cancel). For that reason, this function only ever returns OK.
*/
force = ((sock->sock_opt & SO_LINGER) && sock->sock_linger == 0);
if (sock->sock_ops->sop_close != NULL)
r = sock->sock_ops->sop_close(sock, force);
else
		r = OK;

	assert(r == OK || r == SUSPEND);

	if (r == SUSPEND) {
		sock->sock_flags |= SFL_CLOSING;

		/*
* If we were requested to force-close the socket immediately,
* but the socket driver needs more time anyway, then tell the
* caller that the socket was closed right away.
*/
if (force)
			return OK;

		/*
* If we are to force-close the socket only after a specific
* linger timeout, set the timer for that now, even if the call
* is non-blocking. This also means that we cannot associate
* the linger timeout with the close call. Instead, we convert
* the sock_linger value from a (relative) duration to an
* (absolute) timeout time, and use the SFL_CLOSING flag (along
* with SFL_TIMER) to tell the difference. Since the socket is
* otherwise unreachable from userland at this point, the
* conversion is never visible in any way.
*
* The socket may already be in the timers list, so we must
* always check the SO_LINGER flag before checking sock_linger.
*
* If SO_LINGER is not set, we must never suspend the call.
*/
if (sock->sock_opt & SO_LINGER) {
sock->sock_linger =
socktimer_add(sock, sock->sock_linger);
		} else
			call = NULL;

		/*
* A non-blocking close is completed asynchronously. The
* caller is not told about this with EWOULDBLOCK as usual, for
* the reasons mentioned above.
*/
if (call != NULL)
sockevent_suspend(sock, SEV_CLOSE, call, NONE);
else
r = OK;
} else if (r == OK)
		sockevent_free(sock);

	return r;
}
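
/*
 * To make the close policies above concrete, here is how they look from a
 * user program (a hypothetical application-side sketch, not part of this
 * library):
 *
 *	struct linger l;
 *
 *	l.l_onoff = 1;
 *	l.l_linger = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(fd);	(socket is force-closed immediately)
 *
 *	l.l_onoff = 1;
 *	l.l_linger = 10;
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(fd);	(graceful close, forced after ten seconds)
 *
 * Without SO_LINGER set (the default), close(fd) initiates a graceful close
 * that is never forced. In all cases, the file descriptor itself is gone by
 * the time close(2) returns.
 */
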
/*
* Cancel a suspended send request.
*/
static void
sockevent_cancel_send(struct sock * sock, struct sockevent_proc * spr, int err)
{
	int r;

	/*
* If any regular or control data were sent, return the number of data
* bytes sent--possibly zero. Otherwise return the given error code.
*/
if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
r = (int)spr->spr_dataoff;
else
		r = err;

	sockdriver_reply_generic(&spr->spr_call, r);

	/*
* In extremely rare circumstances, one send may be queued behind
* another send even though the former can actually be sent on the
* socket right away. For this reason, we retry sending when canceling
* a send. We need to do this only when the first send in the queue
* was canceled, but multiple blocked sends on a single socket should
* be rare anyway.
*/
sockevent_raise(sock, SEV_SEND);
}
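
/*
 * A worked example of the partial-progress rule above: if a send(2) of 1024
 * bytes was suspended after the driver had already taken in 512 data bytes,
 * canceling the call replies with 512 rather than with the error code, so
 * that the caller knows how much was sent. Only a send that made no
 * progress at all fails with the error code (typically EINTR).
 */
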
/*
* Cancel a suspended receive request.
*/
static void
sockevent_cancel_recv(struct sock * sock, struct sockevent_proc * spr, int err)
{
	int r;

	/*
* If any regular or control data were received, return the number of
* data bytes received--possibly zero. Otherwise return the given
* error code.
*/
if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
r = (int)spr->spr_dataoff;
else
		r = err;

	/*
* Also return any flags set for the data received so far, e.g.
* MSG_CTRUNC. Do not return an address: receive calls on unconnected
* sockets must never block after receiving some data--instead, they
* are supposed to return MSG_TRUNC if not all data were copied out.
*/
sockdriver_reply_recv(&spr->spr_call, r, spr->spr_ctloff, NULL, 0,
	    spr->spr_rflags);

	/*
* The same story as for sends (see above) applies to receives,
* although this case should be even more rare in practice.
*/
sockevent_raise(sock, SEV_RECV);
}

/*
* Cancel a previous request that may currently be suspended. The cancel
* operation itself does not have a reply. Instead, if the given request was
* found to be suspended, that request must be aborted and an appropriate reply
* must be sent for the request. If no matching request was found, no reply
* must be sent at all.
*/
static void
sockevent_cancel(sockid_t id, const struct sockdriver_call * call)
{
struct sockevent_proc *spr;
	struct sock *sock;

	/*
	 * Due to asynchronous close(2) operations, not even the sock object
	 * may be found. In this (entirely legitimate) case, do not send any
	 * reply.
	 */
	if ((sock = sockhash_get(id)) == NULL)
		return;

	/*
	 * The request may already have completed by the time we receive the
	 * cancel request, in which case we cannot find it. In this (entirely
	 * legitimate) case, do not send any reply.
	 */
	if ((spr = sockevent_unsuspend(sock, call)) == NULL)
		return;

	/*
* We found the operation. Cancel it according to its call type.
* Then, once fully done with it, free the suspension data structure.
*
* Note that we have to use the call structure from the suspension data
* structure rather than the given 'call' pointer: only the former
* includes all the information necessary to resume the request!
*/
switch (spr->spr_event) {
case SEV_BIND:
case SEV_CONNECT:
assert(spr->spr_call.sc_endpt != NONE);
sockdriver_reply_generic(&spr->spr_call, EINTR);
break;
case SEV_ACCEPT:
sockdriver_reply_accept(&spr->spr_call, EINTR, NULL, 0);
break;
case SEV_SEND:
sockevent_cancel_send(sock, spr, EINTR);
break;
case SEV_RECV:
sockevent_cancel_recv(sock, spr, EINTR);
break;
case SEV_CLOSE:
/*
* Return EINPROGRESS rather than EINTR, so that the user
* process can tell from the close(2) result that the file
* descriptor has in fact been closed.
*/
sockdriver_reply_generic(&spr->spr_call, EINPROGRESS);
/*
* Do not free the sock object here: the socket driver will
* complete the close in the background, and fire SEV_CLOSE
* once it is done. Only then is the sock object freed.
*/
break;
default:
panic("libsockevent: process suspended on unknown event 0x%x",
spr->spr_event);
	}

	sockevent_proc_free(spr);
}
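
/*
 * A concrete (hypothetical) cancellation sequence, to tie the above
 * together: a user process blocks in recv(2), which suspends here on
 * SEV_RECV. The process then catches a signal, and VFS sends a cancel
 * request carrying the same call identification. If some bytes had already
 * been copied out for the receive, sockevent_cancel_recv() replies with
 * that byte count; otherwise it replies with EINTR and the recv(2) call
 * fails.
 */
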
/*
* Process a select request.
*/
static int
sockevent_select(sockid_t id, unsigned int ops,
const struct sockdriver_select * sel)
{
	struct sock *sock;
	unsigned int r, notify;

	if ((sock = sockhash_get(id)) == NULL)
		return EINVAL;

	notify = (ops & SDEV_NOTIFY);
	ops &= (SDEV_OP_RD | SDEV_OP_WR | SDEV_OP_ERR);

	/*
* See if any of the requested select operations can be satisfied
* immediately.
*/
	r = sockevent_test_select(sock, ops);

	/*
	 * If select operations were pending, the new results must not
	 * indicate that any of those were satisfied: the socket driver is
	 * supposed to update its state proactively, so discovering only now
	 * that a pending operation has been satisfied would imply an
	 * internal logic error.
	 */
	assert(!(sock->sock_selops & r));

	/*
* If any select operations are not satisfied immediately, and we are
* asked to notify the caller when they are satisfied later, save them
* for later retesting.
*/
	ops &= ~r;

	if (notify && ops != 0) {
/*
* For now, we support only one caller when it comes to select
* queries: VFS. If we want to support a networked file system
* (or so) directly calling select as well, this library will
* have to be extended accordingly (should not be too hard).
*/
if (sock->sock_select.ss_endpt != NONE) {
if (sock->sock_select.ss_endpt != sel->ss_endpt) {
printf("libsockevent: no support for multiple "
"select callers yet\n");
return EIO;
			}

			/*
* If a select query was already pending for this
* caller, we must simply merge in the new operations.
*/
sock->sock_selops |= ops;
} else {
assert(sel->ss_endpt != NONE);
sock->sock_select = *sel;
sock->sock_selops = ops;
}
	}

	return r;
}
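
/*
 * A sketch of the resulting interaction, assuming VFS is the select caller:
 * VFS queries (SDEV_OP_RD | SDEV_OP_WR | SDEV_NOTIFY) on a socket that is
 * currently writable but not readable. The call immediately returns
 * SDEV_OP_WR, and SDEV_OP_RD is saved in sock_selops. Later, when incoming
 * data raises SEV_RECV, the library retests the pending operations and
 * sends a deferred select reply for SDEV_OP_RD to the endpoint saved in
 * sock_select.
 */
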
/*
* An alarm has triggered. Expire any timers. Socket drivers that do not pass
* clock notification messages to libsockevent must call expire_timers(3)
* themselves instead.
*/
static void
sockevent_alarm(clock_t now)
{
expire_timers(now);
}

static const struct sockdriver sockevent_tab = {
.sdr_socket = sockevent_socket,
.sdr_socketpair = sockevent_socketpair,
.sdr_bind = sockevent_bind,
.sdr_connect = sockevent_connect,
.sdr_listen = sockevent_listen,
.sdr_accept = sockevent_accept,
.sdr_send = sockevent_send,
.sdr_recv = sockevent_recv,
.sdr_ioctl = sockevent_ioctl,
.sdr_setsockopt = sockevent_setsockopt,
.sdr_getsockopt = sockevent_getsockopt,
.sdr_getsockname = sockevent_getsockname,
.sdr_getpeername = sockevent_getpeername,
.sdr_shutdown = sockevent_shutdown,
.sdr_close = sockevent_close,
.sdr_cancel = sockevent_cancel,
.sdr_select = sockevent_select,
.sdr_alarm = sockevent_alarm
};

/*
* Initialize the socket event library.
*/
void
sockevent_init(sockevent_socket_cb_t socket_cb)
{
	sockhash_init();
	socktimer_init();
	sockevent_proc_init();

	SIMPLEQ_INIT(&sockevent_pending);

	assert(socket_cb != NULL);
	sockevent_socket_cb = socket_cb;

	/* Announce we are up. */
	sockdriver_announce();

	sockevent_working = FALSE;
}

/*
* Process a socket driver request message.
*/
void
sockevent_process(const message * m_ptr, int ipc_status)
{
	/* Block events until after we have processed the request. */
	assert(!sockevent_working);
	sockevent_working = TRUE;

	/* Actually process the request. */
	sockdriver_process(&sockevent_tab, m_ptr, ipc_status);

	/*
* If any events were fired while processing the request, they will
* have been queued for later. Go through them now.
*/
if (sockevent_has_events())
		sockevent_pump();

	sockevent_working = FALSE;
}
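
/*
 * For completeness, a minimal sketch of a socket driver main loop built on
 * this library. The callback name and the error handling are made up;
 * sef_receive_status(3) is the standard way for a MINIX service to receive
 * request messages along with their IPC status.
 *
 *	int
 *	main(void)
 *	{
 *		message m;
 *		int r, ipc_status;
 *
 *		sockevent_init(mydriver_socket);
 *
 *		for (;;) {
 *			r = sef_receive_status(ANY, &m, &ipc_status);
 *			if (r != OK)
 *				panic("sef_receive_status: %d", r);
 *
 *			sockevent_process(&m, ipc_status);
 *		}
 *	}
 *
 * Clock notifications arriving in this loop reach sockevent_alarm() via
 * sockdriver_process(); a driver that dispatches messages itself must call
 * expire_timers(3) for them instead, as noted above.
 */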