minix/net/lwip/tcpsock.c
David van Moolenbroek ef8d499e2d Add lwip: a new lwIP-based TCP/IP service
This commit adds a new TCP/IP service to MINIX 3.  At its core, the
service uses the lwIP TCP/IP stack for maintenance reasons.  The
service aims to be compatible with NetBSD userland, including its
low-level network management utilities.  It also aims to support
modern features such as IPv6.  In summary, the new LWIP service has
support for the following main features:

- TCP, UDP, RAW sockets with mostly standard BSD API semantics;
- IPv6 support: host mode (complete) and router mode (partial);
- most of the standard BSD API socket options (SO_);
- all of the standard BSD API message flags (MSG_);
- the most used protocol-specific socket and control options;
- a default loopback interface and the ability to create one more;
- configuration-free ethernet interfaces and driver tracking;
- queuing and multiple concurrent requests to each ethernet driver;
- standard ioctl(2)-based BSD interface management;
- radix tree backed, destination-based routing;
- routing sockets for standard BSD route reporting and management;
- multicast traffic and multicast group membership tracking;
- Berkeley Packet Filter (BPF) devices;
- standard and custom sysctl(7) nodes for many internals;
- a slab allocation based, hybrid static/dynamic memory pool model.

Many of its modules come with fairly elaborate comments that cover
many aspects of what is going on.  The service is primarily a socket
driver built on top of the libsockdriver library, but for BPF devices
it is at the same time also a character driver.

Change-Id: Ib0c02736234b21143915e5fcc0fda8fe408f046f
2017-04-30 13:16:03 +00:00


/* LWIP service - tcpsock.c - TCP sockets */
/*
* This module implements support for TCP sockets based on lwIP's core TCP PCB
* module, which largely but not fully matches what we want to achieve. As a
* result, this module is rather complicated.
*
* Each socket has a send queue and a receive queue. Both use lwIP's own
* (pbuf) buffers, which largely come out of the main 512-byte buffer pool.
* The buffers on the send queue are allocated and freed by us--the latter only
* once they are no longer in use by lwIP as well. A bit counterintuitively,
* we deliberately use a smaller lwIP per-PCB TCP send buffer limit
* (TCP_SND_BUF) in the lwIP configuration (lwipopts.h) in order to more
* easily trigger conditions where we cannot enqueue data (or the final FIN)
* right away. This way, we get to test the internal logic of this module a
* lot more easily. The small lwIP send queue size should not have any impact
* on performance, as our own per-socket send queues can be much larger, and we
* enqueue more of their contents on the lwIP PCB as soon as we can in all
* cases.
*
* The receive queue consists of whatever buffers were given to us by lwIP, but
* since those may be many buffers with small amounts of data each, we perform
* fairly aggressive merging of consecutive buffers. The intended result is
* that we waste no more than 50% of memory within the receive queue. Merging
* requires memory copies, which makes it expensive, but since we do not
* configure lwIP with enough buffers to make running out of them a non-issue,
* this trade-off is necessary. Practical experience and measurements will
* have to show whether and how the current merge policy may be improved.
*
* As can be expected, the connection close semantics are by far the most
* complicated part of this module. We attempt to get rid of the lwIP PCB as
* soon as we can, letting lwIP take care of the TIME_WAIT state for example.
* However, there are various conditions that have to be met before we can
* forget about the PCB here--most importantly, that none of our sent data
* blocks are still referenced by lwIP because they have not yet been sent or
* acknowledged. We can only free the data blocks once lwIP is done with them.
*
* We do consider the TCP state of lwIP's PCB, in order to avoid duplicating
* full state tracking here. However, we do not look at a socket's TCP state
* while in a lwIP-generated event for that socket, because the state may not
* necessarily reflect the (correct or new) TCP state of the connection, nor
* may the PCB be available--this is the case for error events. For these
* reasons we use a few internal TCPF_ flags to perform partial state tracking.
*
* More generally, we tend to access lwIP PCB fields directly only when lwIP's
* own BSD API implementation does that too and there is no better alternative.
* One example of this is the check to see if our FIN was acknowledged, for
* SO_LINGER support. In terms of maintenance, our hope is that if lwIP's API
* changes later, we can change our code to imitate whatever lwIP's BSD API
* implementation does at that point.
*/
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
/*
* Unfortunately, NetBSD and lwIP have different definitions of a few relevant
* preprocessor variables. Make sure we do not attempt to use the NetBSD one
* where it matters. We do need one of the NetBSD definitions though.
*/
static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY;
#undef TF_NODELAY
#undef TCP_MSS
#include "lwip.h"
#include "tcpisn.h"
#include "lwip/tcp.h"
#include "lwip/priv/tcp_priv.h" /* for tcp_pcb_lists */
/*
* The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP configuration.
*/
/*
* We fully control the send buffer, so we can let its size be set to whatever
* we want. The receive buffer is different: if it is smaller than the window
* size, we may have to refuse data that lwIP hands us, at which point more
* incoming data will cause lwIP to abort the TCP connection--even aside from
* performance issues. Therefore, we must make sure the receive buffer is
* larger than the TCP window at all times.
*/
#define TCP_SNDBUF_MIN 1 /* minimum TCP send buffer size */
#define TCP_SNDBUF_DEF 32768 /* default TCP send buffer size */
#define TCP_SNDBUF_MAX 131072 /* maximum TCP send buffer size */
#define TCP_RCVBUF_MIN TCP_WND /* minimum TCP receive buffer size */
#define TCP_RCVBUF_DEF MAX(TCP_WND, 32768) /* default TCP recv buffer size */
#define TCP_RCVBUF_MAX MAX(TCP_WND, 131072) /* maximum TCP recv buffer size */
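/*
 * The MAX() guards above encode the invariant from the preceding comment.
 * A compile-time check along the following lines would make it explicit;
 * this is an illustrative sketch only, not part of the original build.
 */
#if 0
typedef char tcp_rcvbuf_covers_window[(TCP_RCVBUF_MIN >= TCP_WND &&
	TCP_RCVBUF_DEF >= TCP_WND && TCP_RCVBUF_MAX >= TCP_WND) ? 1 : -1];
#endif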
/*
* The total number of buffers that may be in use for TCP socket send queues. The
* goal is to allow at least some progress to be made on receiving from TCP
* sockets and on differently-typed sockets, at least as long as the LWIP
* service can manage to allocate the memory it wants. For the case that it
* does not, we can only reactively kill off TCP sockets and/or free enqueued
* ethernet packets, neither of which is currently implemented (TODO).
*/
#define TCP_MAX_SENDBUFS (mempool_max_buffers() * 3 / 4)
/* Polling intervals, in 500-millisecond units. */
#define TCP_POLL_REG_INTERVAL 10 /* interval for reattempting sends */
#define TCP_POLL_CLOSE_INTERVAL 1 /* interval while closing connection */
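/*
 * For reference: these intervals are expressed in ticks of lwIP's coarse TCP
 * timer, which fires every 500 milliseconds. TCP_POLL_REG_INTERVAL thus
 * yields 10 * 0.5s = 5s between poll events on regular sockets, and
 * TCP_POLL_CLOSE_INTERVAL yields 1 * 0.5s = 0.5s while closing (see
 * tcpsock_event_poll() below).
 */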
static struct tcpsock {
struct ipsock tcp_ipsock; /* IP socket, MUST be first */
struct tcp_pcb *tcp_pcb; /* lwIP TCP control block */
union pxfer_tcp_queue { /* free/accept queue */
TAILQ_ENTRY(tcpsock) tq_next; /* next in queue */
TAILQ_HEAD(, tcpsock) tq_head; /* head of queue */
} tcp_queue;
struct tcpsock *tcp_listener; /* listener if on accept q. */
struct { /* send queue */
struct pbuf *ts_head; /* first pbuf w/unacked data */
struct pbuf *ts_unsent; /* first pbuf w/unsent data */
struct pbuf *ts_tail; /* most recently added data */
size_t ts_len; /* total sent + unsent */
unsigned short ts_head_off; /* offset into head pbuf */
unsigned short ts_unsent_off; /* offset into unsent pbuf */
} tcp_snd;
struct { /* receive queue */
struct pbuf *tr_head; /* first pbuf w/unrecvd data */
struct pbuf **tr_pre_tailp; /* ptr-ptr to newest pbuf */
size_t tr_len; /* bytes on receive queue */
unsigned short tr_head_off; /* offset into head pbuf */
unsigned short tr_unacked; /* current window reduction */
} tcp_rcv;
} tcp_array[NR_TCPSOCK];
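/*
 * A worked example of the send queue bookkeeping, assuming (hypothetically)
 * two 512-byte buffers on the queue: with tcp_snd.ts_head_off = 100, bytes
 * 0..99 of the head buffer have been acknowledged and are logically gone;
 * with tcp_snd.ts_unsent pointing to the second buffer and ts_unsent_off =
 * 200, bytes 0..199 of that buffer have been handed to lwIP and everything
 * after them has not; and ts_len = (512 - 100) + 512 = 924 bytes in total
 * are still unacknowledged or unsent. ts_tail also points to the second
 * buffer, as it was the most recently added one.
 */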
static TAILQ_HEAD(, tcpsock) tcp_freelist; /* list of free TCP sockets */
static const struct sockevent_ops tcpsock_ops;
static unsigned int tcpsock_sendbufs; /* # send buffers in use */
static unsigned int tcpsock_recvbufs; /* # receive buffers in use */
/* A bunch of macros that are just for convenience. */
#define tcpsock_get_id(tcp) (SOCKID_TCP | (sockid_t)((tcp) - tcp_array))
#define tcpsock_get_ipsock(tcp) (&(tcp)->tcp_ipsock)
#define tcpsock_get_sock(tcp) (ipsock_get_sock(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_sndbuf(tcp) (ipsock_get_sndbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_rcvbuf(tcp) (ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_ipv6(tcp) (ipsock_is_ipv6(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_shutdown(tcp,fl) \
(sockevent_is_shutdown(tcpsock_get_sock(tcp), fl))
#define tcpsock_is_listening(tcp) \
(sockevent_is_listening(tcpsock_get_sock(tcp)))
#define tcpsock_get_flags(tcp) (ipsock_get_flags(tcpsock_get_ipsock(tcp)))
#define tcpsock_set_flag(tcp,fl) \
(ipsock_set_flag(tcpsock_get_ipsock(tcp), fl))
#define tcpsock_clear_flag(tcp,fl) \
(ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl))
static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *,
struct rmib_oldp *, struct rmib_newp *);
/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */
/* TODO: add many more and make some of them writable.. */
static struct rmib_node net_inet_tcp_table[] = {
/* 2*/ [TCPCTL_SENDSPACE] = RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF,
"sendspace",
"Default TCP send buffer size"),
/* 3*/ [TCPCTL_RECVSPACE] = RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF,
"recvspace",
"Default TCP receive buffer size"),
/*29*/ [TCPCTL_LOOPBACKCKSUM] = RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
loopif_cksum, "do_loopback_cksum",
"Perform TCP checksum on loopback"),
/*+0*/ [TCPCTL_MAXID] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0,
tcpsock_pcblist, "pcblist",
"TCP protocol control block list"),
/*+1*/ [TCPCTL_MAXID + 1] = RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE |
CTLFLAG_HIDDEN | CTLTYPE_STRING,
TCPISN_SECRET_HEX_LENGTH, tcpisn_secret,
"isn_secret",
"TCP ISN secret (MINIX 3 specific)")
};
static struct rmib_node net_inet_tcp_node =
RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings");
static struct rmib_node net_inet6_tcp6_node =
RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings");
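/*
 * As a purely hypothetical userland view, the read-only defaults registered
 * above would surface through sysctl(7) along these lines (values shown for
 * a build where TCP_WND <= 32768):
 *
 *	$ sysctl net.inet.tcp.sendspace net.inet.tcp.recvspace
 *	net.inet.tcp.sendspace = 32768
 *	net.inet.tcp.recvspace = 32768
 */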
/*
* Initialize the TCP sockets module.
*/
void
tcpsock_init(void)
{
unsigned int slot;
/* Initialize the list of free TCP sockets. */
TAILQ_INIT(&tcp_freelist);
for (slot = 0; slot < __arraycount(tcp_array); slot++)
TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot],
tcp_queue.tq_next);
/* Initialize other variables. */
tcpsock_sendbufs = 0;
/* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */
mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node);
mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node);
}
/*
* Initialize the state of a TCP socket's send queue.
*/
static void
tcpsock_reset_send(struct tcpsock * tcp)
{
tcp->tcp_snd.ts_tail = NULL;
tcp->tcp_snd.ts_unsent = NULL;
tcp->tcp_snd.ts_head = NULL;
tcp->tcp_snd.ts_len = 0;
tcp->tcp_snd.ts_unsent_off = 0;
tcp->tcp_snd.ts_head_off = 0;
}
/*
* Initialize the state of a TCP socket's receive queue.
*/
static void
tcpsock_reset_recv(struct tcpsock * tcp)
{
tcp->tcp_rcv.tr_pre_tailp = NULL;
tcp->tcp_rcv.tr_head = NULL;
tcp->tcp_rcv.tr_len = 0;
tcp->tcp_rcv.tr_head_off = 0;
tcp->tcp_rcv.tr_unacked = 0;
}
/*
* Create a TCP socket.
*/
sockid_t
tcpsock_socket(int domain, int protocol, struct sock ** sockp,
const struct sockevent_ops ** ops)
{
struct tcpsock *tcp;
uint8_t ip_type;
switch (protocol) {
case 0:
case IPPROTO_TCP:
break;
default:
return EPROTONOSUPPORT;
}
if (TAILQ_EMPTY(&tcp_freelist))
return ENOBUFS;
tcp = TAILQ_FIRST(&tcp_freelist);
/*
* Initialize the structure. Do not memset it to zero, as it is still
* part of the linked free list. Initialization may still fail. When
* adding new fields, make sure to change tcpsock_clone() accordingly.
*/
ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain,
TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp);
if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL)
return ENOBUFS;
tcp_arg(tcp->tcp_pcb, tcp);
tcp->tcp_listener = NULL;
tcpsock_reset_send(tcp);
tcpsock_reset_recv(tcp);
TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);
*ops = &tcpsock_ops;
return tcpsock_get_id(tcp);
}
/*
* Create a TCP socket for the TCP PCB 'pcb' which identifies a new connection
* incoming on listening socket 'listener'. The new socket is essentially a
* "clone" of the listening TCP socket, in that it should inherit any settings
* from the listening socket. The socket has not yet been accepted by userland
* so add it to the queue of connections pending for the listening socket. On
* success, return OK. On failure, return a negative error code.
*/
static int
tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb)
{
struct tcpsock *tcp;
if (TAILQ_EMPTY(&tcp_freelist))
return ENOBUFS;
tcp = TAILQ_FIRST(&tcp_freelist);
/*
* Initialize the structure. Do not memset it to zero, as it is still
* part of the linked free list. Initialization may still fail. Most
* settings should be inherited from the listening socket here, rather
* than being initialized to their default state.
*/
ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp),
tcpsock_get_id(tcp));
tcp->tcp_pcb = pcb;
tcp_arg(pcb, tcp);
tcpsock_reset_send(tcp);
tcpsock_reset_recv(tcp);
/*
* Remove the new socket from the free list, and add it to the queue of
* the listening socket--in this order, because the same next pointer
* is used for both.
*/
TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);
TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp,
tcp_queue.tq_next);
tcp->tcp_listener = listener;
return OK;
}
/*
* Allocate a buffer from the pool, using the standard pool size. The returned
* buffer is a single element--never a chain.
*/
static struct pbuf *
tcpsock_alloc_buf(void)
{
struct pbuf *pbuf;
pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM);
assert(pbuf == NULL || pbuf->len == pbuf->tot_len);
return pbuf;
}
/*
* Free the given buffer. Ensure that pbuf_free() will not attempt to free the
* next buffer(s) in the chain as well. This may be called for pbufs other
* than those allocated with tcpsock_alloc_buf().
*/
static void
tcpsock_free_buf(struct pbuf * pbuf)
{
/*
* Resetting the length is currently not necessary, but better safe
* than sorry..
*/
pbuf->len = pbuf->tot_len;
pbuf->next = NULL;
pbuf_free(pbuf);
}
/*
* Clear the send queue of a TCP socket. The caller must ensure that lwIP will
* no longer access any of the data on the send queue.
*/
static void
tcpsock_clear_send(struct tcpsock * tcp)
{
struct pbuf *phead;
assert(tcp->tcp_pcb == NULL);
while ((phead = tcp->tcp_snd.ts_head) != NULL) {
tcp->tcp_snd.ts_head = phead->next;
assert(tcpsock_sendbufs > 0);
tcpsock_sendbufs--;
tcpsock_free_buf(phead);
}
tcpsock_reset_send(tcp);
}
/*
* Clear the receive queue of a TCP socket. If 'ack_data' is set, also
* acknowledge the previous contents of the receive queue to lwIP.
*/
static size_t
tcpsock_clear_recv(struct tcpsock * tcp, int ack_data)
{
struct pbuf *phead;
size_t rlen;
rlen = tcp->tcp_rcv.tr_len;
while ((phead = tcp->tcp_rcv.tr_head) != NULL) {
tcp->tcp_rcv.tr_head = phead->next;
assert(tcpsock_recvbufs > 0);
tcpsock_recvbufs--;
tcpsock_free_buf(phead);
}
/*
* From now on, we will basically be discarding incoming data as fast
* as possible, to keep the full window open at all times.
*/
if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0)
tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked);
tcpsock_reset_recv(tcp);
return rlen;
}
/*
* The TCP socket's PCB has been detached from the socket, typically because
* the connection was aborted, either by us or by lwIP. Either way, any TCP
* connection is gone. Clear the socket's send queue, remove the socket from
* a listening socket's queue, and if the socket itself is ready and allowed to
* be freed, free it now. The socket is ready to be freed if it was either on
* a listening queue or being closed already. The socket is allowed to be
* freed only if 'may_free' is TRUE. If the socket is not freed, its receive
* queue is left as is, as it may still have data to be received by userland.
*/
static int
tcpsock_cleanup(struct tcpsock * tcp, int may_free)
{
int destroy;
assert(tcp->tcp_pcb == NULL);
/*
* Free any data on the send queue. This is safe to do right now,
* because the PCB has been aborted (or was already gone). We must be
* very careful about clearing the send queue in all other situations.
*/
tcpsock_clear_send(tcp);
/*
* If this was a socket pending acceptance, remove it from the
* corresponding listener socket's queue, and free it. Otherwise, free
* the socket only if it suspended a graceful close operation.
*/
if (tcp->tcp_listener != NULL) {
TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp,
tcp_queue.tq_next);
tcp->tcp_listener = NULL;
/*
* The listener socket's backlog count should be adjusted by
* lwIP whenever the PCB is freed up, so we need not (and must
* not) attempt to do that here.
*/
destroy = TRUE;
} else
destroy = sockevent_is_closing(tcpsock_get_sock(tcp));
/*
* Do not free the socket if 'may_free' is FALSE. That flag may be set
* if we are currently in the second tcpsock_close() call on the
* socket, in which case sockevent_is_closing() is TRUE but we must
* still not free the socket now: doing so would derail libsockevent.
*/
if (destroy && may_free) {
(void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/);
sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);
}
return destroy;
}
/*
* Abort the lwIP PCB for the given socket, using tcp_abort(). If the PCB is
* connected, this will cause the connection to be reset. The PCB, which must
* have still been present before the call, will be gone after the call.
*/
static void
tcpsock_pcb_abort(struct tcpsock * tcp)
{
assert(tcp->tcp_pcb != NULL);
assert(!tcpsock_is_listening(tcp));
tcp_recv(tcp->tcp_pcb, NULL);
tcp_sent(tcp->tcp_pcb, NULL);
tcp_err(tcp->tcp_pcb, NULL);
tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);
tcp_arg(tcp->tcp_pcb, NULL);
tcp_abort(tcp->tcp_pcb);
tcp->tcp_pcb = NULL;
}
/*
* Close the lwIP PCB for the given socket, using tcp_close(). If the PCB is
* connected, its graceful close will be finished by lwIP in the background.
* The PCB, which must have still been present before the call, will be gone
* after the call.
*/
static void
tcpsock_pcb_close(struct tcpsock * tcp)
{
err_t err;
assert(tcp->tcp_pcb != NULL);
assert(tcp->tcp_snd.ts_len == 0);
if (!tcpsock_is_listening(tcp)) {
tcp_recv(tcp->tcp_pcb, NULL);
tcp_sent(tcp->tcp_pcb, NULL);
tcp_err(tcp->tcp_pcb, NULL);
tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);
}
tcp_arg(tcp->tcp_pcb, NULL);
if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK)
panic("unexpected TCP close failure: %d", err);
tcp->tcp_pcb = NULL;
}
/*
* Return TRUE if all conditions are met for closing the TCP socket's PCB, or
* FALSE if they are not. Upon calling this function, the socket's PCB must
* still be around.
*/
static int
tcpsock_may_close(struct tcpsock * tcp)
{
assert(tcp->tcp_pcb != NULL);
/*
* Regular closing of the PCB requires three conditions to be met:
*
* 1. all our data has been transmitted AND acknowledged, so that we do
* not risk corruption in case there are still unsent or unack'ed
* data buffers that may otherwise be recycled too soon;
* 2. we have sent our FIN to the peer; and,
* 3. we have received a FIN from the peer.
*/
return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) ==
(TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0);
}
/*
* The given socket is ready to be closed as per the tcpsock_may_close() rules.
* This implies that its send queue is already empty. Gracefully close the
* PCB. In addition, if the socket is being closed gracefully, meaning we
* suspended an earlier tcpsock_close() call (and as such already emptied the
* receive queue as well), then tell libsockevent that the close is finished,
* freeing the socket. Return TRUE if the socket has indeed been freed this
* way, or FALSE if the socket is still around.
*/
static int
tcpsock_finish_close(struct tcpsock * tcp)
{
assert(tcp->tcp_snd.ts_len == 0);
assert(tcp->tcp_listener == NULL);
/*
* If we get here, we have already shut down the sending side of the
* PCB. Technically, we are interested only in shutting down the
* receiving side of the PCB here, so that lwIP may decide to recycle
* the socket later etcetera. We call tcp_close() because we do not
* want to rely on tcp_shutdown(RX) doing the exact same thing.
* However, we do rely on the fact that the PCB is not immediately
* destroyed by the tcp_close() call: otherwise we may have to return
* ERR_ABRT if this function is called from a lwIP-generated event.
*/
tcpsock_pcb_close(tcp);
/*
* If we suspended an earlier tcpsock_close() call, we have to tell
* libsockevent that the close operation is now complete.
*/
if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
assert(tcp->tcp_rcv.tr_len == 0);
sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);
return TRUE;
} else
return FALSE;
}
/*
* Attempt to start or resume enqueuing data and/or a FIN to send on the given
* TCP socket. Return TRUE if anything at all could be newly enqueued on the
* lwIP PCB, even if less than desired. In that case, the caller should try to
* send whatever was enqueued, and if applicable, check if the socket may now
* be closed (due to the FIN being enqueued). In particular, in any situation
* where the socket may be in the process of being closed, the caller must use
* tcpsock_may_close() if TRUE is returned. Return FALSE if nothing new could
* be enqueued, in which case no send attempt needs to be made either.
*/
static int
tcpsock_pcb_enqueue(struct tcpsock * tcp)
{
struct pbuf *punsent;
size_t space, chunk;
unsigned int flags;
err_t err;
int enqueued;
assert(tcp->tcp_pcb != NULL);
if (tcpsock_get_flags(tcp) & TCPF_FULL)
return FALSE;
/*
* Attempt to enqueue more unsent data, if any, on the PCB's send
* queue.
*/
enqueued = FALSE;
while (tcp->tcp_snd.ts_unsent != NULL) {
if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0)
break;
/*
* We may maintain a non-NULL unsent pointer even when there is
* nothing more to send right now, because the tail buffer may
* be filled up further later on.
*/
punsent = tcp->tcp_snd.ts_unsent;
assert(punsent->len >= tcp->tcp_snd.ts_unsent_off);
chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off;
if (chunk == 0)
break;
if (chunk > space)
chunk = space;
/* Try to enqueue more data for sending. */
if (chunk < punsent->len || punsent->next != NULL)
flags = TCP_WRITE_FLAG_MORE;
else
flags = 0;
err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload +
tcp->tcp_snd.ts_unsent_off, chunk, flags);
/*
* Since tcp_write() enqueues data only, it should only return
* out-of-memory errors; no fatal ones. In any case, stop.
*/
if (err != ERR_OK) {
assert(err == ERR_MEM);
break;
}
/* We have successfully enqueued data. */
enqueued = TRUE;
tcp->tcp_snd.ts_unsent_off += chunk;
if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) {
assert(tcp->tcp_snd.ts_unsent_off < punsent->len ||
punsent->next == NULL);
break;
}
tcp->tcp_snd.ts_unsent = punsent->next;
tcp->tcp_snd.ts_unsent_off = 0;
}
/*
* If all pending data has been enqueued for sending, and we should
* shut down the sending end of the socket, try that now.
*/
if ((tcp->tcp_snd.ts_unsent == NULL ||
tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) &&
tcpsock_is_shutdown(tcp, SFL_SHUT_WR) &&
!(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) {
err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/);
if (err == ERR_OK) {
/*
* We have successfully enqueued a FIN. The caller is
* now responsible for checking whether the PCB and
* possibly even the socket object can now be freed.
*/
tcpsock_set_flag(tcp, TCPF_SENT_FIN);
enqueued = TRUE;
} else {
assert(err == ERR_MEM);
/*
* FIXME: the resolution for lwIP bug #47485 has taken
* away even more control over the closing process from
* us, making tracking sockets especially for SO_LINGER
* even harder. For now, we simply effectively undo
* the patch by clearing TF_CLOSEPEND if tcp_shutdown()
* returns ERR_MEM. This will not be sustainable in
* the long term, though.
*/
tcp->tcp_pcb->flags &= ~TF_CLOSEPEND;
tcpsock_set_flag(tcp, TCPF_FULL);
}
}
return enqueued;
}
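/*
 * For reference, the contract above results in caller code along the
 * following simplified lines; compare tcpsock_event_sent(),
 * tcpsock_event_poll(), and tcpsock_send() below:
 *
 *	if (tcpsock_pcb_enqueue(tcp)) {
 *		if (tcpsock_may_close(tcp))
 *			(void)tcpsock_finish_close(tcp);
 *		else
 *			(void)tcpsock_pcb_send(tcp, TRUE);
 *	}
 *
 * Both branches may free the socket object as a side effect, so a caller
 * that cannot rule this out must not touch the socket afterwards.
 */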
/*
* Request lwIP to start sending any enqueued data and/or FIN on the TCP
* socket's lwIP PCB. On success, return OK. On failure, return a negative
* error code, after cleaning up the socket, freeing the PCB. If the socket
* was already being closed, also free the socket object in that case; the
* caller must then not touch the socket object anymore upon return. If the
* socket object is not freed, and if 'raise_error' is TRUE, raise the error
* on the socket object.
*/
static int
tcpsock_pcb_send(struct tcpsock * tcp, int raise_error)
{
err_t err;
int r;
assert(tcp->tcp_pcb != NULL);
/*
* If we have enqueued something, ask lwIP to send TCP packets now.
* This may result in a fatal error, in which case we clean up the
* socket and return the error to the caller. Since cleaning up the
* socket may free the socket object, and the caller cannot tell
* whether that will happen or has happened, also possibly raise the
* error on the socket object if it is not gone. As such, callers that
* set 'raise_error' to FALSE must know for sure that the socket was
* not being closed, for example because the caller is processing a
* (send) call from userland.
*/
err = tcp_output(tcp->tcp_pcb);
if (err != ERR_OK && err != ERR_MEM) {
tcpsock_pcb_abort(tcp);
r = util_convert_err(err);
if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
if (raise_error)
sockevent_set_error(tcpsock_get_sock(tcp), r);
}
/* Otherwise, do not touch the socket object anymore! */
return r;
} else
return OK;
}
/*
* Callback from lwIP. The given number of data bytes have been acknowledged
* as received by the remote end. Dequeue and free data from the TCP socket's
* send queue as appropriate.
*/
static err_t
tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len)
{
struct tcpsock *tcp = (struct tcpsock *)arg;
struct pbuf *phead;
size_t left;
assert(tcp != NULL);
assert(pcb == tcp->tcp_pcb);
assert(len > 0);
assert(tcp->tcp_snd.ts_len >= len);
assert(tcp->tcp_snd.ts_head != NULL);
left = len;
/*
* First see if we can free up whole buffers. Check against the head
* buffer's 'len' rather than 'tot_len', or we may end up leaving an
* empty buffer on the chain.
*/
while ((phead = tcp->tcp_snd.ts_head) != NULL &&
left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) {
left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off;
tcp->tcp_snd.ts_head = phead->next;
tcp->tcp_snd.ts_head_off = 0;
if (phead == tcp->tcp_snd.ts_unsent) {
assert(tcp->tcp_snd.ts_unsent_off == phead->len);
tcp->tcp_snd.ts_unsent = phead->next;
tcp->tcp_snd.ts_unsent_off = 0;
}
assert(tcpsock_sendbufs > 0);
tcpsock_sendbufs--;
tcpsock_free_buf(phead);
}
/*
* The remainder of the given length, if any, covers only part of
* the current head buffer.
*/
if (left > 0) {
assert(tcp->tcp_snd.ts_head != NULL);
assert((size_t)tcp->tcp_snd.ts_head->len -
tcp->tcp_snd.ts_head_off > left);
tcp->tcp_snd.ts_head_off += left;
}
tcp->tcp_snd.ts_len -= (size_t)len;
if (tcp->tcp_snd.ts_head == NULL) {
assert(tcp->tcp_snd.ts_len == 0);
assert(tcp->tcp_snd.ts_unsent == NULL);
tcp->tcp_snd.ts_tail = NULL;
} else
assert(tcp->tcp_snd.ts_len > 0);
/*
* If we emptied the send queue, and we already managed to send a FIN
* earlier, we may now have met all requirements to close the socket's
* PCB. Otherwise, we may also be able to send more now, so try to
* resume sending. Since we are invoked from the "sent" event,
* tcp_output() will not actually process anything, and so we do not
* call it either. If we did, we would have to deal with errors here.
*/
if (tcpsock_may_close(tcp)) {
if (tcpsock_finish_close(tcp))
return ERR_OK;
} else {
tcpsock_clear_flag(tcp, TCPF_FULL);
/*
* If we now manage to enqueue a FIN, we may be ready to close
* the PCB after all.
*/
if (tcpsock_pcb_enqueue(tcp)) {
if (tcpsock_may_close(tcp) &&
tcpsock_finish_close(tcp))
return ERR_OK;
}
}
/* The user may also be able to send more now. */
sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);
return ERR_OK;
}
/*
* Check whether any (additional) data previously received on a TCP socket
* should be acknowledged, possibly allowing the remote end to send additional
* data as a result.
*/
static void
tcpsock_ack_recv(struct tcpsock * tcp)
{
size_t rcvbuf, left, delta, ack;
assert(tcp->tcp_pcb != NULL);
/*
* We must make sure that at all times, we can still add an entire
* window's worth of data to the receive queue. If the amount of free
* space drops below that threshold, we stop acknowledging received
* data. The user may change the receive buffer size at all times; we
* update the window size lazily as appropriate.
*/
rcvbuf = tcpsock_get_rcvbuf(tcp);
if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) {
/*
* The number of bytes that lwIP can still give us at any time
* is represented as 'left'. The number of bytes that we still
* allow to be stored in the receive queue is represented as
* 'delta'. We must make sure that 'left' does not ever exceed
* 'delta' while acknowledging as many bytes as possible under
* that rule.
*/
left = TCP_WND - tcp->tcp_rcv.tr_unacked;
delta = rcvbuf - tcp->tcp_rcv.tr_len;
if (left < delta) {
ack = delta - left;
if (ack > tcp->tcp_rcv.tr_unacked)
ack = tcp->tcp_rcv.tr_unacked;
tcp_recved(tcp->tcp_pcb, ack);
tcp->tcp_rcv.tr_unacked -= ack;
assert(tcp->tcp_rcv.tr_len + TCP_WND -
tcp->tcp_rcv.tr_unacked <= rcvbuf);
}
}
}
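/*
 * A worked example of the accounting above, assuming (hypothetically) that
 * TCP_WND is 8192: with tr_unacked = 4096, lwIP may still hand us left =
 * 8192 - 4096 = 4096 bytes. With rcvbuf = 16384 and tr_len = 10000, we may
 * still store delta = 16384 - 10000 = 6384 bytes. Since left < delta, we
 * acknowledge ack = 6384 - 4096 = 2288 bytes, after which tr_unacked = 1808
 * and the asserted invariant holds with equality:
 * 10000 + 8192 - 1808 = 16384 = rcvbuf.
 */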
/*
* Attempt to merge two consecutive underfilled buffers in the receive queue of
* a TCP socket, freeing up one of the two buffers as a result. The first
* (oldest) buffer is 'ptail', and the pointer to this buffer is stored at
* 'pnext'. The second (new) buffer is 'pbuf', which is already attached to
* the first buffer. The second buffer may be followed by additional buffers
* with even more new data. Return TRUE if buffers have been merged, in which
* case the pointer at 'pnext' may have changed, and no assumptions should be
* made about whether 'ptail' and 'pbuf' still exist in any form. Return FALSE
* if no merging was necessary or if no new buffer could be allocated.
*/
static int
tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail, struct pbuf * pbuf)
{
struct pbuf *pnew;
assert(*pnext == ptail);
assert(ptail->next == pbuf);
/*
* Unfortunately, we cannot figure out what kind of pbuf we were given
* by the lower layers, so we cannot merge two buffers without first
* allocating a third. Once we have done that, though, we can easily
* merge more into that new buffer. For now we use the following
* policies:
*
* 1. if two consecutive lwIP-provided buffers are both used less than
* half the size of a full buffer, try to allocate a new buffer and
* copy both lwIP-provided buffers into that new buffer, freeing up
* the pair afterwards;
* 2. if the tail buffer on the chain is allocated by us and not yet
* full, and the next buffer's contents can be added to the tail
* buffer in their entirety, do just that.
*
* Obviously there is a trade-off between the performance overhead of
* copying and the resource overhead of keeping less-than-full buffers
* on the receive queue, but this policy should both keep actual memory
* usage to no more than twice the receive queue length and prevent
* excessive copying. The policy deliberately performs more aggressive
* merging into a buffer that we allocated ourselves.
*/
if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 &&
pbuf->len <= MEMPOOL_BUFSIZE / 2) {
/*
* Case #1.
*/
assert(ptail->tot_len == ptail->len);
assert(pbuf->tot_len == pbuf->len);
pnew = tcpsock_alloc_buf();
if (pnew == NULL)
return FALSE;
memcpy(pnew->payload, ptail->payload, ptail->len);
memcpy((char *)pnew->payload + ptail->len, pbuf->payload,
pbuf->len);
pnew->len = ptail->len + pbuf->len;
assert(pnew->len <= pnew->tot_len);
pnew->next = pbuf->next;
/* For now, we need not inherit any flags from either pbuf. */
*pnext = pnew;
/* One allocated, two about to be deallocated. */
assert(tcpsock_recvbufs > 0);
tcpsock_recvbufs--;
tcpsock_free_buf(ptail);
tcpsock_free_buf(pbuf);
return TRUE;
} else if (ptail->tot_len - ptail->len >= pbuf->len) {
/*
* Case #2.
*/
memcpy((char *)ptail->payload + ptail->len, pbuf->payload,
pbuf->len);
ptail->len += pbuf->len;
ptail->next = pbuf->next;
assert(tcpsock_recvbufs > 0);
tcpsock_recvbufs--;
tcpsock_free_buf(pbuf);
return TRUE;
} else
return FALSE;
}
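/*
 * Two illustrative runs of the merge policy above, with MEMPOOL_BUFSIZE =
 * 512: if ptail and pbuf are lwIP-provided buffers holding 100 and 120
 * bytes respectively, case #1 applies, as both are under 256 bytes: a fresh
 * pool buffer is allocated, both payloads are copied into it (len = 220,
 * tot_len = 512), and the two originals are freed. If a 200-byte buffer
 * arrives next, case #2 applies to the merged buffer, because 512 - 220 =
 * 292 >= 200: the payload is copied straight in (len = 420) and the new
 * buffer is freed, leaving a single buffer on the queue.
 */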
/*
* Callback from lwIP. New data or flags have been received on a TCP socket.
*/
static err_t
tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused,
struct pbuf * pbuf, err_t err)
{
struct tcpsock *tcp = (struct tcpsock *)arg;
struct pbuf *ptail, **pprevp;
size_t len;
assert(tcp != NULL);
assert(pcb == tcp->tcp_pcb);
/*
* lwIP should never provide anything other than ERR_OK in 'err', and
* it is not clear what we should do if it would. If lwIP ever changes
* in this regard, we will likely have to change this code accordingly.
*/
if (err != ERR_OK)
panic("TCP receive event with error: %d", err);
/* If the given buffer is NULL, we have received a FIN. */
if (pbuf == NULL) {
tcpsock_set_flag(tcp, TCPF_RCVD_FIN);
/* Userland may now receive EOF. */
if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD))
sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);
/*
* If we were in the process of closing the socket, and we
* receive a FIN before our FIN got acknowledged, we close the
* socket anyway, as described in tcpsock_close(). However, if
* there is still unacknowledged outgoing data or we did not
* even manage to send our FIN yet, hold off closing the socket
* for now.
*/
if (tcpsock_may_close(tcp))
(void)tcpsock_finish_close(tcp);
return ERR_OK;
}
/*
* If the socket is being closed, receiving new data should cause a
* reset.
*/
if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
tcpsock_pcb_abort(tcp);
(void)tcpsock_cleanup(tcp, TRUE /*may_free*/);
/* Do not touch the socket object anymore! */
pbuf_free(pbuf);
return ERR_ABRT;
}
/*
* If the socket has already been shut down for reading, discard the
* incoming data and do nothing else.
*/
if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) {
tcp_recved(tcp->tcp_pcb, pbuf->tot_len);
pbuf_free(pbuf);
return ERR_OK;
}
/*
* We deliberately ignore the PBUF_FLAG_PUSH flag. This flag would
* enable the receive functionality to delay delivering "un-pushed"
* data to applications. The implementation of this scheme could track
* the amount of data up to and including the last-pushed segment using
* a "tr_push_len" field or so. Deciding when to deliver "un-pushed"
* data after all is a bit trickier, though. As far as I can tell, the
* BSDs do not implement anything like that. Windows does, and this
* results in interaction problems with even more lightweight TCP/IP
* stacks that do not send the TCP PSH flag. Currently, there is no
* obvious benefit for us to support delaying data delivery like that.
* In addition, testing its implementation reliably would be difficult.
*/
len = (size_t)pbuf->tot_len;
/*
* Count the number of buffers that are now owned by us. The new total
* of buffers owned by us must not exceed the size of the memory pool.
* Any more would indicate an accounting error. Note that
* tcpsock_recvbufs is currently used for debugging only!
*/
tcpsock_recvbufs += pbuf_clen(pbuf);
assert(tcpsock_recvbufs < mempool_cur_buffers());
/*
* The pre-tail pointer points to whatever is pointing to the tail
* buffer. The latter pointer may be the 'tr_head' field in our
* tcpsock structure, or the 'next' field in the penultimate buffer,
* or NULL if there are currently no buffers on the receive queue.
*/
if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) {
ptail = *pprevp;
assert(ptail != NULL);
assert(ptail->next == NULL);
assert(tcp->tcp_rcv.tr_head != NULL);
ptail->next = pbuf;
pbuf->tot_len = pbuf->len; /* to help freeing on merges */
if (tcpsock_try_merge(pprevp, ptail, pbuf)) {
ptail = *pprevp;
pbuf = ptail->next;
}
if (pbuf != NULL)
pprevp = &ptail->next;
} else {
assert(tcp->tcp_rcv.tr_head == NULL);
assert(tcp->tcp_rcv.tr_head_off == 0);
tcp->tcp_rcv.tr_head = pbuf;
pprevp = &tcp->tcp_rcv.tr_head;
}
/*
* Chop up the chain into individual buffers. This is necessary as we
* overload 'tot_len' to mean "space available in the buffer", as we
* want for buffers allocated by us as part of buffer merges. Also get
* a pointer to the pointer to the new penultimate tail buffer. Due to
* merging, the chain may already be empty by now, though.
*/
if (pbuf != NULL) {
for (; pbuf->next != NULL; pbuf = pbuf->next) {
pbuf->tot_len = pbuf->len;
pprevp = &pbuf->next;
}
assert(pbuf->len == pbuf->tot_len);
}
assert(*pprevp != NULL);
assert((*pprevp)->next == NULL);
tcp->tcp_rcv.tr_pre_tailp = pprevp;
tcp->tcp_rcv.tr_len += len;
tcp->tcp_rcv.tr_unacked += len;
assert(tcp->tcp_rcv.tr_unacked <= TCP_WND);
/*
* Note that tr_len may now exceed the receive buffer size in the
* highly exceptional case that the user is adjusting the latter after
* the socket had already received data.
*/
/* See if we can immediately acknowledge some or all of the data. */
tcpsock_ack_recv(tcp);
/* Also wake up any receivers now. */
sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);
return ERR_OK;
}
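/*
 * To illustrate the tr_pre_tailp invariant maintained above: with three
 * buffers A -> B -> C on the receive queue, tr_head points to A and
 * tr_pre_tailp is &B->next, so that *tr_pre_tailp is the tail C. With a
 * single buffer A, tr_pre_tailp is &tr_head; with an empty queue, it is
 * NULL. tcpsock_recv() relies on exactly this invariant when it dequeues
 * buffers from the head of the queue.
 */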
/*
* Callback from lwIP. The PCB corresponding to the socket identified by 'arg'
* has been closed by lwIP, with the reason specified in 'err': either the
* connection has been aborted locally (ERR_ABRT), it has been reset by the
* remote end (ERR_RST), or it is closed due to state transitions (ERR_CLSD).
*/
static void
tcpsock_event_err(void * arg, err_t err)
{
struct tcpsock *tcp = (struct tcpsock *)arg;
int r;
assert(tcp != NULL);
assert(tcp->tcp_pcb != NULL);
assert(err != ERR_OK);
/* The original PCB is now gone, or will be shortly. */
tcp->tcp_pcb = NULL;
/*
* Clean up the socket. As a result it may be freed, in which case we
* must not touch it anymore. No need to return ERR_ABRT from here, as
* the PCB has been aborted already.
*/
if (tcpsock_cleanup(tcp, TRUE /*may_free*/))
return;
if (err == ERR_CLSD) {
/*
* We may get here if the socket is shut down for writing and
* we already received a FIN from the remote side, thus putting
* the socket in LAST_ACK state, and we receive that last
* acknowledgment. There is nothing more we need to do.
*
* We will never get here in the other case that ERR_CLSD is
* raised, which is when the socket is reset because of
* unacknowledged data while closing: we handle the
* reset-on-ACK case ourselves in tcpsock_close(), and the
* socket is in closing state after that.
*/
assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
} else {
/*
* Anything else should be an error directly from lwIP;
* currently either ERR_ABRT or ERR_RST. Convert it to a
* regular error and set it on the socket. Doing so will also
* raise the appropriate events.
*/
/*
* Unfortunately, lwIP is not throwing accurate errors even
* when it can. We convert some errors to reflect more
* accurately the most likely cause.
*
* TODO: fix lwIP in this regard..
*/
r = util_convert_err(err);
if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) {
switch (err) {
case ERR_ABRT: r = ETIMEDOUT; break;
case ERR_RST: r = ECONNREFUSED; break;
}
}
sockevent_set_error(tcpsock_get_sock(tcp), r);
}
}
/*
* Callback from lwIP. Perform regular checks on a TCP socket. This function
* is called once per five seconds on connected sockets, and twice per second on
* closing sockets.
*/
static err_t
tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused)
{
struct tcpsock *tcp = (struct tcpsock *)arg;
err_t err;
int r;
assert(tcp != NULL);
assert(pcb == tcp->tcp_pcb);
/*
* If we ended up running out of buffers earlier, try resuming any send
* requests now, both for enqueuing TCP data with lwIP and for user
* requests.
*/
if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) {
tcpsock_clear_flag(tcp, TCPF_FULL);
tcpsock_clear_flag(tcp, TCPF_OOM);
/* See if we can enqueue more data with lwIP. */
if (tcpsock_pcb_enqueue(tcp)) {
/* In some cases, we can now close the PCB. */
if (tcpsock_may_close(tcp)) {
(void)tcpsock_finish_close(tcp);
/*
* The PCB is definitely gone here, and the
* entire socket object may be gone now too.
* Do not touch either anymore!
*/
return ERR_OK;
}
/*
* If actually sending the data fails, the PCB will be
* gone, and the socket object may be gone as well. Do
* not touch either anymore in that case!
*/
if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK)
return ERR_ABRT;
}
/*
* If we ran out of buffers earlier, it may be possible to take
* in more data from a user process now, even if we did not
* manage to enqueue any more pending data with lwIP.
*/
sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);
assert(tcp->tcp_pcb != NULL);
} else if (tcp->tcp_snd.ts_unsent != NULL &&
tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) {
/*
* If the send buffer is full, we will no longer call
* tcp_output(), which means we may also miss out on fatal
* errors that would otherwise kill the connection (e.g., no
* route). As a result, the connection may erroneously
* continue to exist for a long time. To avoid this, we call
* tcp_output() every once in a while when there is still
* unsent data.
*/
err = tcp_output(tcp->tcp_pcb);
if (err != ERR_OK && err != ERR_MEM) {
tcpsock_pcb_abort(tcp);
if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
r = util_convert_err(err);
sockevent_set_error(tcpsock_get_sock(tcp), r);
}
/* Otherwise do not touch the socket object anymore! */
return ERR_ABRT;
}
}
/*
* If we are closing the socket, and we sent a FIN, see if the FIN got
* acknowledged. If so, finish closing the socket. Unfortunately, we
* can perform this check by polling only. TODO: change lwIP..
*/
if (sockevent_is_closing(tcpsock_get_sock(tcp)) &&
(tcpsock_get_flags(tcp) & TCPF_SENT_FIN) &&
tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) {
assert(tcp->tcp_snd.ts_len == 0);
tcpsock_finish_close(tcp);
}
return ERR_OK;
}
/*
* Bind a TCP socket to a local address.
*/
static int
tcpsock_bind(struct sock * sock, const struct sockaddr * addr,
socklen_t addr_len, endpoint_t user_endpt)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
ip_addr_t ipaddr;
uint16_t port;
err_t err;
int r;
if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)
return EINVAL;
if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port,
FALSE /*allow_mcast*/, &ipaddr, &port)) != OK)
return r;
err = tcp_bind(tcp->tcp_pcb, &ipaddr, port);
return util_convert_err(err);
}
/*
* Callback from lwIP. A new connection 'pcb' has arrived on the listening
* socket identified by 'arg'. Note that 'pcb' may be NULL in the case that
* lwIP could not accept the connection itself.
*/
static err_t
tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err)
{
struct tcpsock *tcp = (struct tcpsock *)arg;
assert(tcp != NULL);
assert(tcpsock_is_listening(tcp));
/*
* If the given PCB is NULL, then lwIP ran out of memory allocating a
* PCB for the new connection. There is nothing we can do with that
* information. Also check 'err' just to make sure.
*/
if (pcb == NULL || err != ERR_OK)
return ERR_OK;
/*
* The TCP socket is the listening socket, but the PCB is for the
* incoming connection.
*/
if (tcpsock_clone(tcp, pcb) != OK) {
/*
* We could not allocate the resources necessary to accept the
* connection. Abort it immediately.
*/
tcp_abort(pcb);
return ERR_ABRT;
}
/*
* The connection has not yet been accepted, and thus should still be
* considered on the listen queue.
*/
tcp_backlog_delayed(pcb);
/* Set the callback functions. */
tcp_recv(pcb, tcpsock_event_recv);
tcp_sent(pcb, tcpsock_event_sent);
tcp_err(pcb, tcpsock_event_err);
tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);
sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT);
return ERR_OK;
}
/*
* Put a TCP socket in listening mode.
*/
static int
tcpsock_listen(struct sock * sock, int backlog)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
struct tcp_pcb *pcb;
err_t err;
/* The maximum backlog value must not exceed its field size. */
assert(SOMAXCONN <= UINT8_MAX);
/*
* Allow only CLOSED sockets to enter listening mode. If the socket
* was already in listening mode, allow its backlog value to be
* updated, even if it was shut down already (making this a no-op).
*/
if (!tcpsock_is_listening(tcp) &&
(tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED))
return EINVAL;
/*
* If the socket was not already in listening mode, put it in that mode
* now. That involves switching PCBs as lwIP attempts to save memory
* by replacing the original PCB with a smaller one. If the socket was
* already in listening mode, simply update its backlog value--this has
* no effect on the sockets already in the backlog.
*/
if (!tcpsock_is_listening(tcp)) {
assert(tcp->tcp_pcb != NULL);
/*
* If the socket has not been bound to a port yet, do that
* first. This does mean that the listen call may fail with
* side effects, but that is acceptable in this case.
*/
if (tcp->tcp_pcb->local_port == 0) {
err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip,
0 /*port*/);
if (err != ERR_OK)
return util_convert_err(err);
}
/*
* Clear the argument on the PCB that is about to be replaced,
* because if we do not, once the PCB is reused (which does not
* clear the argument), we might get weird events. Do this
* before the tcp_listen() call, because we should no longer
* access the old PCB afterwards (even if we can).
*/
tcp_arg(tcp->tcp_pcb, NULL);
pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog,
&err);
if (pcb == NULL) {
tcp_arg(tcp->tcp_pcb, tcp); /* oops, undo. */
return util_convert_err(err);
}
tcp_arg(pcb, tcp);
tcp->tcp_pcb = pcb;
tcp_accept(pcb, tcpsock_event_accept);
/* Initialize the queue head for sockets pending acceptance. */
TAILQ_INIT(&tcp->tcp_queue.tq_head);
} else if (tcp->tcp_pcb != NULL)
tcp_backlog_set(tcp->tcp_pcb, backlog);
return OK;
}
/*
* Callback from lwIP. A socket connection attempt has succeeded. Note that
* failed connection attempts will trigger the tcpsock_event_err() callback instead.
*/
static err_t
tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err)
{
struct tcpsock *tcp = (struct tcpsock *)arg;
assert(tcp != NULL);
assert(pcb == tcp->tcp_pcb);
assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING);
/*
* If lwIP ever changes so that this callback is called for connect
* failures as well, then we need to change the code here accordingly.
*/
if (err != ERR_OK)
panic("TCP connected event with error: %d", err);
tcpsock_clear_flag(tcp, TCPF_CONNECTING);
sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND);
return ERR_OK;
}
/*
* Connect a TCP socket to a remote address.
*/
static int
tcpsock_connect(struct sock * sock, const struct sockaddr * addr,
socklen_t addr_len, endpoint_t user_endpt)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
ip_addr_t dst_addr;
uint16_t dst_port;
err_t err;
int r;
/*
* Listening sockets may not have a PCB, so we use higher-level flags
* to throw the correct error code for those instead.
*/
if (tcpsock_is_listening(tcp))
return EOPNOTSUPP;
/*
* If there is no longer any PCB, we obviously cannot perform the
* connection, but POSIX is not clear on which error to return. We
* copy NetBSD's.
*/
if (tcp->tcp_pcb == NULL)
return EINVAL;
/*
* The only state from which a connection can be initiated is CLOSED.
* Some of the other states require distinct error codes, though.
*/
switch (tcp->tcp_pcb->state) {
case CLOSED:
break;
case SYN_SENT:
return EALREADY;
case LISTEN:
assert(0); /* we just checked.. */
default:
return EISCONN;
}
/*
* Get the destination address, and attempt to start connecting. If
* the socket was not bound before, or it was bound to a port only,
* then lwIP will select a source address for us. We cannot do this
* ourselves even if we wanted to: it is impossible to re-bind a TCP
* PCB in the case it was previously bound to a port only.
*/
if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
&tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK)
return r;
err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port,
tcpsock_event_connected);
/*
* Note that various tcp_connect() error cases will leave the PCB with
* a newly set local and remote IP address anyway. We should be
* careful not to rely on the addresses being as they were before.
*/
if (err != ERR_OK)
return util_convert_err(err);
/* Set the other callback functions. */
tcp_recv(tcp->tcp_pcb, tcpsock_event_recv);
tcp_sent(tcp->tcp_pcb, tcpsock_event_sent);
tcp_err(tcp->tcp_pcb, tcpsock_event_err);
tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);
/*
* Set a flag so that we can correct lwIP's error codes in case the
* connection fails.
*/
tcpsock_set_flag(tcp, TCPF_CONNECTING);
return SUSPEND;
}
/*
* Test whether any new connections are pending on a listening TCP socket.
*/
static int
tcpsock_test_accept(struct sock * sock)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
/* Is this socket in listening mode at all? */
if (!tcpsock_is_listening(tcp))
return EINVAL;
/* Are there any connections to accept right now? */
if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head))
return OK;
/* If the socket has been shut down, we return ECONNABORTED. */
if (tcp->tcp_pcb == NULL)
return ECONNABORTED;
/* Otherwise, wait for a new connection first. */
return SUSPEND;
}
/*
* Accept a connection on a listening TCP socket, creating a new TCP socket.
*/
static sockid_t
tcpsock_accept(struct sock * sock, struct sockaddr * addr,
socklen_t * addr_len, endpoint_t user_endpt __unused,
struct sock ** newsockp)
{
struct tcpsock *listener = (struct tcpsock *)sock;
struct tcpsock *tcp;
int r;
if ((r = tcpsock_test_accept(sock)) != OK)
return r;
/* Below, we must not assume that the listener has a PCB. */
tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head);
assert(tcp->tcp_listener == listener);
assert(tcp->tcp_pcb != NULL);
TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next);
tcp->tcp_listener = NULL;
tcp_backlog_accepted(tcp->tcp_pcb);
ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
&tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);
/*
* Set 'newsockp' to NULL so that libsockevent knows we already cloned
* the socket, and it must not be reinitialized anymore.
*/
*newsockp = NULL;
return tcpsock_get_id(tcp);
}
/*
* Perform preliminary checks on a send request.
*/
static int
tcpsock_pre_send(struct sock * sock, size_t len __unused,
socklen_t ctl_len __unused, const struct sockaddr * addr __unused,
socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags)
{
/*
* Reject calls with unknown flags. Since libsockevent strips out the
* flags it handles itself here, we only have to test for the ones we
* cannot handle. Currently, there are no send flags that we support.
*/
if (flags != 0)
return EOPNOTSUPP;
return OK;
}
/*
* Test whether the given number of data bytes can be sent on a TCP socket.
*/
static int
tcpsock_test_send(struct sock * sock, size_t min)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
size_t sndbuf;
if (tcp->tcp_pcb == NULL)
return EPIPE;
switch (tcp->tcp_pcb->state) {
case CLOSED: /* new */
case LISTEN: /* listening */
return ENOTCONN;
case SYN_SENT: /* connecting */
case SYN_RCVD: /* simultaneous open, maybe someday? */
return SUSPEND;
case ESTABLISHED: /* connected */
case CLOSE_WAIT: /* closed remotely */
break;
default: /* shut down locally */
assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
return EPIPE;
}
sndbuf = tcpsock_get_sndbuf(tcp);
if (min > sndbuf)
min = sndbuf;
if (tcp->tcp_snd.ts_len + min > sndbuf)
return SUSPEND;
else
return OK;
}
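/*
 * A worked example of the watermark logic above: with sndbuf = 32768 and
 * ts_len = 30000 bytes already enqueued, a caller with a low watermark of
 * min = 4096 is suspended (30000 + 4096 > 32768), whereas min = 2048 may
 * proceed (30000 + 2048 <= 32768). Since 'min' is first clamped to sndbuf,
 * even an oversized watermark can be satisfied once the queue drains.
 */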
/*
* Send data on a TCP socket.
*/
static int
tcpsock_send(struct sock * sock, const struct sockdriver_data * data,
size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
const struct sockaddr * addr __unused, socklen_t addr_len __unused,
endpoint_t user_endpt __unused, int flags __unused, size_t min)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
struct pbuf *ptail, *pfirst, *pnext, *plast;
size_t off, tail_off, chunk, left, sndbuf;
int r;
if ((r = tcpsock_test_send(sock, min)) != OK)
return r;
if (len == 0)
return OK; /* nothing to do */
sndbuf = tcpsock_get_sndbuf(tcp);
if (min > sndbuf)
min = sndbuf;
assert(min > 0);
assert(sndbuf > tcp->tcp_snd.ts_len);
left = sndbuf - tcp->tcp_snd.ts_len;
if (left > len)
left = len;
/*
* First see if we can fit any more data in the current tail buffer.
* If so, we set 'ptail' to point to it and 'tail_off' to the previous
* length of the tail buffer, while optimistically extending it to
* include the new data. If not, we set them to NULL/0.
*/
if ((ptail = tcp->tcp_snd.ts_tail) != NULL &&
ptail->len < ptail->tot_len) {
assert(ptail->len > 0);
tail_off = (size_t)ptail->len;
/*
* Optimistically extend the tail buffer to include whatever
* fits in it. This is needed for util_copy_data().
*/
assert(ptail->tot_len > ptail->len);
off = (size_t)ptail->tot_len - (size_t)ptail->len;
if (off > left)
off = left;
ptail->len += off;
} else {
ptail = NULL;
tail_off = 0;
off = 0;
}
/*
* Then, if there is more to send, allocate new buffers as needed. If
* we run out of memory, work with whatever we did manage to grab.
*/
pfirst = NULL;
plast = NULL;
while (off < left) {
if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS ||
(pnext = tcpsock_alloc_buf()) == NULL) {
/*
* Chances are that we will end up suspending this send
* request because of being out of buffers. We try to
* resume such requests from the polling function.
*/
tcpsock_set_flag(tcp, TCPF_OOM);
break;
}
tcpsock_sendbufs++;
if (pfirst == NULL)
pfirst = pnext;
else
plast->next = pnext;
plast = pnext;
chunk = (size_t)pnext->tot_len;
if (chunk > left - off)
chunk = left - off;
pnext->len = chunk;
off += chunk;
}
/*
* Copy in the data and continue, unless we did not manage to find
* enough space to even meet the low send watermark, in which case we
* undo any allocation and suspend the call until later.
*/
if (off >= min) {
/*
* Optimistically attach the new buffers to the tail, also for
* util_copy_data(). We undo all this if the copy fails.
*/
if (ptail != NULL) {
ptail->next = pfirst;
pnext = ptail;
} else
pnext = pfirst;
assert(pnext != NULL);
r = util_copy_data(data, off, *offp, pnext, tail_off,
TRUE /*copy_in*/);
} else
r = SUSPEND;
if (r != OK) {
/* Undo the modifications made so far. */
while (pfirst != NULL) {
pnext = pfirst->next;
assert(tcpsock_sendbufs > 0);
tcpsock_sendbufs--;
tcpsock_free_buf(pfirst);
pfirst = pnext;
}
if (ptail != NULL) {
ptail->next = NULL;
ptail->len = tail_off;
}
return r;
}
/* Attach the new buffers, if any, to the buffer tail. */
if (pfirst != NULL) {
if ((ptail = tcp->tcp_snd.ts_tail) != NULL) {
assert(ptail->len == ptail->tot_len);
/*
* Due to our earlier optimistic modifications, this
* may or may not be redundant.
*/
ptail->next = pfirst;
}
assert(plast != NULL);
tcp->tcp_snd.ts_tail = plast;
if (tcp->tcp_snd.ts_head == NULL) {
tcp->tcp_snd.ts_head = pfirst;
assert(tcp->tcp_snd.ts_head_off == 0);
}
if (tcp->tcp_snd.ts_unsent == NULL) {
tcp->tcp_snd.ts_unsent = pfirst;
assert(tcp->tcp_snd.ts_unsent_off == 0);
}
}
tcp->tcp_snd.ts_len += off;
/*
* See if we can send any of the data we just enqueued. The socket is
* still open as we are still processing a call from userland on it;
* this saves us from having to deal with the cases that the following
* calls end up freeing the socket object.
*/
if (tcpsock_pcb_enqueue(tcp) &&
(r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) {
/*
* That did not go well. Return the error immediately if we
* had not made any progress earlier. Otherwise, return our
* partial progress and leave the error to be picked up later.
*/
if (*offp > 0) {
sockevent_set_error(tcpsock_get_sock(tcp), r);
return OK;
} else
return r;
}
*offp += off;
return (off < len) ? SUSPEND : OK;
}
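/*
 * To illustrate the optimistic bookkeeping above with hypothetical numbers:
 * if the current tail buffer has tot_len = 512 and len = 200, then tail_off
 * is set to 200 and the tail's len is temporarily raised by up to 312, so
 * that util_copy_data() sees the free space as part of the buffer. If the
 * copy fails, len is reset to tail_off and all freshly allocated buffers
 * are freed, leaving the queue exactly as it was before the call.
 */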
/*
* Perform preliminary checks on a receive request.
*/
static int
tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
int flags)
{
/*
* Reject calls with unknown flags. Since libsockevent strips out the
* flags it handles itself here, we only have to test for the ones we
* cannot handle.
*/
if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0)
return EOPNOTSUPP;
return OK;
}
/*
* Return TRUE if receive calls may wait for more data to come in on the
* connection, or FALSE if we already know that that is not going to happen.
*/
static int
tcpsock_may_wait(struct tcpsock * tcp)
{
return (tcp->tcp_pcb != NULL &&
!(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN));
}
/*
* Test whether data can be received on a TCP socket, and if so, how many bytes
* of data.
*/
static int
tcpsock_test_recv(struct sock * sock, size_t min, size_t * size)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
int may_wait;
	/* If there is no connection and never was one, refuse the call. */
if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED ||
tcp->tcp_pcb->state == LISTEN))
return ENOTCONN;
/*
* If we are certain that no more data will come in later, ignore the
* low receive watermark. Otherwise, bound it to the size of the
* receive buffer, or receive calls may block forever.
*/
if (!(may_wait = tcpsock_may_wait(tcp)))
min = 1;
else if (min > tcpsock_get_rcvbuf(tcp))
min = tcpsock_get_rcvbuf(tcp);
if (tcp->tcp_rcv.tr_len >= min) {
if (size != NULL)
*size = tcp->tcp_rcv.tr_len;
return OK;
}
return (may_wait) ? SUSPEND : SOCKEVENT_EOF;
}
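/*
 * Worked example for the clamping above (sizes assumed): with the low
 * receive watermark set to 64 KB, presumably through SO_RCVLOWAT, on a
 * socket whose receive buffer holds at most 32 KB, 'min' is lowered to
 * 32768, because tr_len could otherwise never reach the watermark and a
 * blocking receive call would suspend forever.
 */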
/*
* Receive data on a TCP socket.
*/
static int
tcpsock_recv(struct sock * sock, const struct sockdriver_data * data,
size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
struct sockaddr * addr __unused, socklen_t * addr_len __unused,
endpoint_t user_endpt __unused, int flags, size_t min,
int * rflags __unused)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
struct pbuf *ptail;
size_t off, left;
int r;
/* See if we can receive at all, and if so, how much at most. */
if ((r = tcpsock_test_recv(sock, min, NULL)) != OK)
return r;
if (len == 0)
return OK; /* nothing to do */
off = tcp->tcp_rcv.tr_len;
if (off > len)
off = len;
assert(tcp->tcp_rcv.tr_head != NULL);
assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len);
/* Copy out the data to the caller. */
if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head,
tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK)
return r;
/* Unless peeking, remove the data from the receive queue. */
if (!(flags & MSG_PEEK)) {
left = off;
/* Dequeue and free as many entire buffers as possible. */
while ((ptail = tcp->tcp_rcv.tr_head) != NULL &&
left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) {
left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off;
tcp->tcp_rcv.tr_head = ptail->next;
tcp->tcp_rcv.tr_head_off = 0;
if (tcp->tcp_rcv.tr_head == NULL)
tcp->tcp_rcv.tr_pre_tailp = NULL;
else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next)
tcp->tcp_rcv.tr_pre_tailp =
&tcp->tcp_rcv.tr_head;
assert(tcpsock_recvbufs > 0);
tcpsock_recvbufs--;
tcpsock_free_buf(ptail);
}
/*
* If only part of the (new) head buffer is consumed, adjust
* the saved offset into that buffer.
*/
if (left > 0) {
assert(tcp->tcp_rcv.tr_head != NULL);
assert((size_t)tcp->tcp_rcv.tr_head->len -
tcp->tcp_rcv.tr_head_off > left);
tcp->tcp_rcv.tr_head_off += left;
}
tcp->tcp_rcv.tr_len -= off;
if (tcp->tcp_rcv.tr_head != NULL) {
assert(tcp->tcp_rcv.tr_pre_tailp != NULL);
assert(tcp->tcp_rcv.tr_len > 0);
} else {
assert(tcp->tcp_rcv.tr_pre_tailp == NULL);
assert(tcp->tcp_rcv.tr_len == 0);
}
/*
* The receive buffer has shrunk, so there may now be space to
* receive more data.
*/
if (tcp->tcp_pcb != NULL)
tcpsock_ack_recv(tcp);
} else
flags &= ~MSG_WAITALL; /* for the check below */
/* Advance the current copy position, and see if we are done. */
*offp += off;
if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp))
return SUSPEND;
else
return OK;
}
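/*
 * Illustration only (not compiled; 'fd' is an assumed connected socket):
 * the MSG_WAITALL handling above keeps the call suspended until the full
 * amount has arrived, unless an error or EOF ends the wait early.
 *
 *	char buf[1024];
 *	ssize_t r = recv(fd, buf, sizeof(buf), MSG_WAITALL);
 *	// r == 1024 here, unless EOF or an error cut the wait short
 */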
/*
* Update the set of flag-type socket options on a TCP socket.
*/
static void
tcpsock_setsockmask(struct sock * sock, unsigned int mask)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
if (tcp->tcp_pcb == NULL)
return;
if (mask & SO_REUSEADDR)
ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR);
else
ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR);
if (mask & SO_KEEPALIVE)
ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE);
else
ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE);
}
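/*
 * Sketch of the userland side of the mask above (not compiled; 'fd' is an
 * assumed TCP socket).  libsockevent presumably maintains the resulting
 * flag set and passes it to the function above as 'mask'.
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 */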
/*
* Prepare a helper structure for IP-level option processing.
*/
static void
tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts)
{
ipopts->local_ip = &tcp->tcp_pcb->local_ip;
ipopts->remote_ip = &tcp->tcp_pcb->remote_ip;
ipopts->tos = &tcp->tcp_pcb->tos;
ipopts->ttl = &tcp->tcp_pcb->ttl;
ipopts->sndmin = TCP_SNDBUF_MIN;
ipopts->sndmax = TCP_SNDBUF_MAX;
ipopts->rcvmin = TCP_RCVBUF_MIN;
ipopts->rcvmax = TCP_RCVBUF_MAX;
}
/*
* Set socket options on a TCP socket.
*/
static int
tcpsock_setsockopt(struct sock * sock, int level, int name,
const struct sockdriver_data * data, socklen_t len)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
struct ipopts ipopts;
uint32_t uval;
int r, val;
if (tcp->tcp_pcb == NULL)
return ECONNRESET;
/* Handle TCP-level options. */
switch (level) {
case IPPROTO_IPV6:
switch (name) {
case IPV6_RECVTCLASS:
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
/*
* This option is not supported for TCP sockets; it
* would not even make sense. However, named(8)
* insists on trying to set it anyway. We accept the
			 * request but ignore the value; getsockopt(2)
			 * will not even return what was set here.
*/
return OK;
case IPV6_FAITH:
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
/*
* This option is not supported at all, but to save
* ourselves from having to remember the current state
* for getsockopt(2), we also refuse to enable it.
*/
if (val != 0)
return EINVAL;
return OK;
}
break;
case IPPROTO_TCP:
switch (name) {
case TCP_NODELAY:
/*
* lwIP's listening TCP PCBs do not have this field.
* If this ever becomes an issue, we can create our own
* shadow flag and do the inheritance ourselves.
*/
if (tcp->tcp_pcb->state == LISTEN)
return EINVAL;
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val)
tcp_nagle_disable(tcp->tcp_pcb);
else
tcp_nagle_enable(tcp->tcp_pcb);
return OK;
case TCP_KEEPIDLE:
case TCP_KEEPINTVL:
/*
* lwIP's listening TCP PCBs do not have these fields.
*/
if (tcp->tcp_pcb->state == LISTEN)
return EINVAL;
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val == 0)
return EINVAL;
/*
			 * The given value is in seconds, but lwIP
			 * stores the value in milliseconds in a uint32_t
			 * field, so we have to clamp large values to
			 * whatever fits in that field anyway.
*/
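			/*
			 * Worked example (values assumed): setting
			 * TCP_KEEPIDLE to 7200 stores keep_idle as
			 * 7200000 ms, while any value above
			 * UINT32_MAX / 1000 (about 49.7 days) saturates
			 * to UINT32_MAX ms.
			 */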
if (val < 0 || (uint32_t)val > UINT32_MAX / 1000)
uval = UINT32_MAX;
else
uval = (uint32_t)val * 1000;
if (name == TCP_KEEPIDLE)
tcp->tcp_pcb->keep_idle = uval;
else
tcp->tcp_pcb->keep_intvl = uval;
return OK;
case TCP_KEEPCNT:
/* lwIP's listening TCP PCBs do not have this field. */
if (tcp->tcp_pcb->state == LISTEN)
return EINVAL;
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val == 0)
return EINVAL;
tcp->tcp_pcb->keep_cnt = (uint32_t)val;
return OK;
}
return EOPNOTSUPP;
}
/* Handle all other options at the IP level. */
tcpsock_get_ipopts(tcp, &ipopts);
return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data,
len, &ipopts);
}
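/*
 * Illustration only (not compiled; 'fd' is an assumed connected TCP
 * socket): the TCP_NODELAY case above is what a latency-sensitive
 * application triggers with a standard setsockopt(2) call.
 *
 *	int on = 1;
 *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
 *	// Nagle's algorithm is now disabled on the underlying PCB
 */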
/*
* Retrieve socket options on a TCP socket.
*/
static int
tcpsock_getsockopt(struct sock * sock, int level, int name,
const struct sockdriver_data * data, socklen_t * len)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
struct ipopts ipopts;
int val;
if (tcp->tcp_pcb == NULL)
return ECONNRESET;
/* Handle TCP-level options. */
switch (level) {
case IPPROTO_IPV6:
switch (name) {
case IPV6_RECVTCLASS:
case IPV6_FAITH:
val = 0;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
}
break;
case IPPROTO_TCP:
switch (name) {
case TCP_NODELAY:
/* lwIP's listening TCP PCBs do not have this field. */
if (tcp->tcp_pcb->state == LISTEN)
return EINVAL;
val = tcp_nagle_disabled(tcp->tcp_pcb);
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case TCP_MAXSEG:
/* lwIP's listening TCP PCBs do not have this field. */
if (tcp->tcp_pcb->state == LISTEN)
return EINVAL;
/* This option is read-only at this time. */
val = tcp->tcp_pcb->mss;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case TCP_KEEPIDLE:
/* lwIP's listening TCP PCBs do not have this field. */
if (tcp->tcp_pcb->state == LISTEN)
return EINVAL;
val = (int)(tcp->tcp_pcb->keep_idle / 1000);
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case TCP_KEEPINTVL:
/* lwIP's listening TCP PCBs do not have this field. */
if (tcp->tcp_pcb->state == LISTEN)
return EINVAL;
val = (int)(tcp->tcp_pcb->keep_intvl / 1000);
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case TCP_KEEPCNT:
/* lwIP's listening TCP PCBs do not have this field. */
if (tcp->tcp_pcb->state == LISTEN)
return EINVAL;
val = (int)tcp->tcp_pcb->keep_cnt;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
}
return EOPNOTSUPP;
}
/* Handle all other options at the IP level. */
tcpsock_get_ipopts(tcp, &ipopts);
return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data,
len, &ipopts);
}
/*
* Retrieve the local socket address of a TCP socket.
*/
static int
tcpsock_getsockname(struct sock * sock, struct sockaddr * addr,
socklen_t * addr_len)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
if (tcp->tcp_pcb == NULL)
return EINVAL;
ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
&tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port);
return OK;
}
/*
* Retrieve the remote socket address of a TCP socket.
*/
static int
tcpsock_getpeername(struct sock * sock, struct sockaddr * addr,
socklen_t * addr_len)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED ||
tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT)
return ENOTCONN;
ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
&tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);
return OK;
}
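/*
 * Sketch (not compiled; 'fd' is an assumed connected socket) of the
 * userland counterpart of the two lookups above:
 *
 *	struct sockaddr_storage ss;
 *	socklen_t len = sizeof(ss);
 *	getsockname(fd, (struct sockaddr *)&ss, &len);	// local address
 *	len = sizeof(ss);
 *	getpeername(fd, (struct sockaddr *)&ss, &len);	// remote address
 */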
/*
* Perform a TCP half-close on a TCP socket. This operation may not complete
* immediately due to memory conditions, in which case it will be completed at
* a later time.
*/
static void
tcpsock_send_fin(struct tcpsock * tcp)
{
sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR);
/*
* Attempt to send the FIN. If a fatal error occurs as a result, raise
* it as an asynchronous error, because this function's callers cannot
* do much with it. That happens to match the way these functions are
* used elsewhere. In any case, as a result, the PCB may be closed.
* However, we are never called from a situation where the socket is
* being closed here, so the socket object will not be freed either.
*/
if (tcpsock_pcb_enqueue(tcp)) {
assert(!sockevent_is_closing(tcpsock_get_sock(tcp)));
if (tcpsock_may_close(tcp))
tcpsock_finish_close(tcp);
else
(void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/);
}
}
/*
* Shut down a TCP socket for reading and/or writing.
*/
static int
tcpsock_shutdown(struct sock * sock, unsigned int mask)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
/*
* If the PCB is gone, we want to allow shutdowns for reading but not
* writing: shutting down for writing affects the PCB, shutting down
* for reading does not. Also, if the PCB is in CLOSED state, we would
* not know how to deal with subsequent operations after a shutdown for
* writing, so forbid such calls altogether.
*/
if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) &&
(mask & SFL_SHUT_WR))
return ENOTCONN;
/*
* Handle listening sockets as a special case. Shutting down a
* listening socket frees its PCB. Sockets pending on the accept queue
* may still be accepted, but after that, accept(2) will start
	 * returning ECONNABORTED.  This feature supposedly allows
	 * multi-process server applications to shut down gracefully.
*/
if (tcpsock_is_listening(tcp)) {
if (tcp->tcp_pcb != NULL)
tcpsock_pcb_close(tcp);
return OK;
}
/*
	 * We control shutdown-for-reading locally, and intentionally do not
	 * tell
* lwIP about it: if we do that and also shut down for writing, the PCB
* may disappear (now or eventually), which is not what we want.
* Instead, we only tell lwIP to shut down for reading once we actually
* want to get rid of the PCB, using tcp_close(). In the meantime, if
* the socket is shut down for reading by the user, we simply discard
* received data as fast as we can--one out of a number of possible
* design choices there, and (reportedly) the one used by the BSDs.
*/
if (mask & SFL_SHUT_RD)
(void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/);
/*
	 * Shutting down a connecting socket for writing simply closes its
	 * PCB.  Closing a PCB in SYN_SENT state just deallocates it, so this
	 * cannot fail.  On the other hand, for connected sockets we want to
	 * send a FIN, which may fail due to memory shortage, in which case we
	 * have to try again later.
*/
if (mask & SFL_SHUT_WR) {
if (tcp->tcp_pcb->state == SYN_SENT)
tcpsock_pcb_close(tcp);
else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
tcpsock_send_fin(tcp);
}
return OK;
}
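/*
 * Illustration only (not compiled; 'fd', 'buf', and 'n' are assumed): the
 * usual userland half-close pattern served by the function above.
 *
 *	shutdown(fd, SHUT_WR);		// send FIN; reading stays possible
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		;			// drain until the peer closes too
 */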
/*
* Close a TCP socket. Complete the operation immediately if possible, or
* otherwise initiate the closing process and complete it later, notifying
* libsockevent about that as well. Depending on linger settings, this
* function may be called twice on the same socket: the first time with the
* 'force' flag cleared, and the second time with the 'force' flag set.
*/
static int
tcpsock_close(struct sock * sock, int force)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
struct tcpsock *queued;
size_t rlen;
assert(tcp->tcp_listener == NULL);
/*
	 * If this is a listening socket, abort and clean up any and all
* connections on its listener queue. Note that the listening socket
* may or may not have a PCB at this point.
*/
if (tcpsock_is_listening(tcp)) {
while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) {
queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head);
tcpsock_pcb_abort(queued);
(void)tcpsock_cleanup(queued, TRUE /*may_free*/);
}
}
/*
* Clear the receive queue, and make sure that we no longer add new
* data to it. The latter is relevant only for the case that we end up
* returning SUSPEND below. Remember whether there were bytes left,
* because we should reset the connection if there were.
*/
rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/);
sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD);
/*
* If the socket is connected, perform a graceful shutdown, unless 1)
	 * we are asked to force-close the socket, or 2) the local side has
	 * not consumed all data, as per RFC 1122 Sec. 4.2.2.13.  Normally lwIP
* would take care of the second point, but we may have data in our
* receive buffer of which lwIP is not aware.
*
* Implementing proper linger support is somewhat difficult with lwIP.
* In particular, we cannot reliably wait for our FIN to be ACK'ed by
* the other side in all cases:
*
	 * - the lwIP TCP transition from state CLOSING to TIME_WAIT does not
* trigger any event and once in the TIME_WAIT state, the poll event
* no longer triggers either;
* - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to
* TIME_WAIT will trigger a receive event, but it is not clear
* whether we can reliably check that our FIN was ACK'ed from there.
*
* That means we have to compromise. Instead of the proper approach,
	 * we complete our side of the close operation (restated as a code
	 * sketch right after this comment) whenever:
*
	 * 1. all of our data was acknowledged, AND,
* 2. our FIN was sent, AND,
* 3a. our FIN was acknowledged, OR,
* 3b. we received a FIN from the other side.
*
	 * With the addition of rule 3b, we do not run into the above
* reliability problems, but we may return from SO_LINGER-blocked close
* calls too early and thus give callers a false impression of success.
* TODO: if lwIP ever gets improved on this point, the code in this
* module should be rewritten to make use of the improvements.
*
* The set of rules is basically the same as for closing the PCB early
* as per tcpsock_may_close(), except with the check for our FIN being
* acknowledged. Unfortunately only the FIN_WAIT_2, TIME_WAIT, and
* (reentered) CLOSED TCP states guarantee that there are no
* unacknowledged data segments anymore, so we may have to wait for
* reaching any one of these before we can actually finish closing the
* socket with tcp_close().
*
* In addition, lwIP does not tell us when our FIN gets acknowledged,
* so we have to use polling and direct access to lwIP's PCB fields
* instead, just like lwIP's BSD API does. There is no other way.
* Also, we may not even be able to send the FIN right away, in which
* case we must defer that until later.
*/
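	/*
	 * The rules above, restated as a sketch with hypothetical names
	 * (compare tcpsock_may_close() and the state checks below):
	 *
	 *	done = data_acked &&			// rule 1
	 *	    fin_sent &&				// rule 2
	 *	    (fin_acked || fin_received);	// rules 3a and 3b
	 */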
if (tcp->tcp_pcb != NULL) {
switch (tcp->tcp_pcb->state) {
case CLOSE_WAIT:
case CLOSING:
case LAST_ACK:
assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
/* FALLTHROUGH */
case SYN_RCVD:
case ESTABLISHED:
case FIN_WAIT_1:
/* First check if we should abort the connection. */
if (force || rlen > 0)
break;
/*
* If we have not sent a FIN yet, try sending it now;
* if all other conditions are met for closing the
* socket, successful FIN transmission will complete
* the close. Otherwise, perform the close check
* explicitly.
*/
if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
tcpsock_send_fin(tcp);
else if (tcpsock_may_close(tcp))
tcpsock_pcb_close(tcp);
/*
* If at this point the PCB is gone, we managed to
* close the connection immediately, and the socket has
* already been cleaned up by now. This may occur if
* there is no unacknowledged data and we already
* received a FIN earlier on.
*/
if (tcp->tcp_pcb == NULL)
return OK;
/*
* Complete the close operation at a later time.
* Adjust the polling interval, so that we can detect
* completion of the close as quickly as possible.
*/
tcp_poll(tcp->tcp_pcb, tcpsock_event_poll,
TCP_POLL_CLOSE_INTERVAL);
return SUSPEND;
default:
/*
* The connection is either not yet established, or
* already in a state where we can close it right now.
*/
tcpsock_pcb_close(tcp);
}
}
/*
	 * Abort the connection if the PCB is still around, and clean up the
* socket. We cannot let tcpsock_cleanup() free the socket object yet,
* because we are still in the callback from libsockevent, and the
* latter cannot handle the socket object being freed from here.
*/
if (tcp->tcp_pcb != NULL)
tcpsock_pcb_abort(tcp);
(void)tcpsock_cleanup(tcp, FALSE /*may_free*/);
return OK;
}
/*
* Free up a closed TCP socket.
*/
static void
tcpsock_free(struct sock * sock)
{
struct tcpsock *tcp = (struct tcpsock *)sock;
assert(tcp->tcp_pcb == NULL);
assert(tcp->tcp_snd.ts_len == 0);
assert(tcp->tcp_snd.ts_head == NULL);
assert(tcp->tcp_rcv.tr_len == 0);
assert(tcp->tcp_rcv.tr_head == NULL);
TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next);
}
/* This table maps TCP states from lwIP numbers to NetBSD numbers. */
static const struct {
int tsm_tstate;
int tsm_sostate;
} tcpsock_statemap[] = {
[CLOSED] = { TCPS_CLOSED, SS_ISDISCONNECTED },
[LISTEN] = { TCPS_LISTEN, 0 },
[SYN_SENT] = { TCPS_SYN_SENT, SS_ISCONNECTING },
[SYN_RCVD] = { TCPS_SYN_RECEIVED, SS_ISCONNECTING },
[ESTABLISHED] = { TCPS_ESTABLISHED, SS_ISCONNECTED },
[FIN_WAIT_1] = { TCPS_FIN_WAIT_1, SS_ISDISCONNECTING },
[FIN_WAIT_2] = { TCPS_FIN_WAIT_2, SS_ISDISCONNECTING },
[CLOSE_WAIT] = { TCPS_CLOSE_WAIT, SS_ISCONNECTED },
[CLOSING] = { TCPS_CLOSING, SS_ISDISCONNECTING },
[LAST_ACK] = { TCPS_LAST_ACK, SS_ISDISCONNECTING },
[TIME_WAIT] = { TCPS_TIME_WAIT, SS_ISDISCONNECTED },
};
/*
* Fill the given kinfo_pcb sysctl(7) structure with information about the TCP
* PCB identified by the given pointer.
*/
static void
tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr)
{
const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr;
struct tcpsock *tcp;
/*
	 * Not all TCP PCBs have an associated tcpsock structure.  We are
	 * careful to clear the callback argument for PCBs on any of the TCP
	 * lists, so that we can use that argument to determine whether there
	 * is an associated tcpsock structure, although with one
* exception: PCBs for incoming connections that have not yet been
* fully established (i.e., in SYN_RCVD state). These will have the
* callback argument of the listening socket (which itself may already
* have been deallocated at this point) but should not be considered as
* associated with the listening socket's tcpsock structure.
*/
if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) {
tcp = (struct tcpsock *)pcb->callback_arg;
assert(tcp >= tcp_array &&
tcp < &tcp_array[__arraycount(tcp_array)]);
/* TODO: change this so that sockstat(1) may work one day. */
ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp);
} else {
/* No tcpsock. Could also be in TIME_WAIT state etc. */
tcp = NULL;
ki->ki_sostate = SS_NOFDREF;
}
ki->ki_type = SOCK_STREAM;
if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) {
ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate;
/* TODO: this needs work, but does anything rely on it? */
ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate;
}
/* Careful with the LISTEN state here (see below). */
ipsock_get_info(ki, &pcb->local_ip, pcb->local_port,
&pcb->remote_ip, (pcb->state != LISTEN) ? pcb->remote_port : 0);
/*
* The PCBs for listening sockets are actually smaller. Thus, for
* listening sockets, do not attempt to access any of the fields beyond
* those provided in the smaller structure.
*/
if (pcb->state == LISTEN) {
assert(tcp != NULL);
ki->ki_refs =
(uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head);
} else {
if (tcp_nagle_disabled(pcb))
ki->ki_tflags |= NETBSD_TF_NODELAY;
if (tcp != NULL) {
ki->ki_rcvq = tcp->tcp_rcv.tr_len;
ki->ki_sndq = tcp->tcp_snd.ts_len;
if (tcp->tcp_listener != NULL)
ki->ki_nextref = (uint64_t)(uintptr_t)
TAILQ_NEXT(tcp, tcp_queue.tq_next);
}
}
}
/*
* Given either NULL or a previously returned TCP PCB pointer, return the first
* or next TCP PCB pointer, or NULL if there are no more. The current
 * implementation supports only one iteration at a time.
*/
static const void *
tcpsock_enum(const void * last)
{
static struct {
unsigned int i;
const struct tcp_pcb *pcb;
} iter;
if (last != NULL && (iter.pcb = iter.pcb->next) != NULL)
return (const void *)iter.pcb;
for (iter.i = (last != NULL) ? iter.i + 1 : 0;
iter.i < __arraycount(tcp_pcb_lists); iter.i++) {
if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL)
return (const void *)iter.pcb;
}
return NULL;
}
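/*
 * Usage sketch (not compiled): the walk that util_pcblist() is expected to
 * perform over this iterator.
 *
 *	const void *p;
 *	for (p = tcpsock_enum(NULL); p != NULL; p = tcpsock_enum(p))
 *		;	// report one PCB, e.g. through tcpsock_get_info()
 *
 * Because the iterator state is static, only one such walk may be in
 * progress at any time.
 */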
/*
* Obtain the list of TCP protocol control blocks, for sysctl(7).
*/
static ssize_t
tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{
return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info);
}
static const struct sockevent_ops tcpsock_ops = {
.sop_bind = tcpsock_bind,
.sop_listen = tcpsock_listen,
.sop_connect = tcpsock_connect,
.sop_accept = tcpsock_accept,
.sop_test_accept = tcpsock_test_accept,
.sop_pre_send = tcpsock_pre_send,
.sop_send = tcpsock_send,
.sop_test_send = tcpsock_test_send,
.sop_pre_recv = tcpsock_pre_recv,
.sop_recv = tcpsock_recv,
.sop_test_recv = tcpsock_test_recv,
.sop_ioctl = ifconf_ioctl,
.sop_setsockmask = tcpsock_setsockmask,
.sop_setsockopt = tcpsock_setsockopt,
.sop_getsockopt = tcpsock_getsockopt,
.sop_getsockname = tcpsock_getsockname,
.sop_getpeername = tcpsock_getpeername,
.sop_shutdown = tcpsock_shutdown,
.sop_close = tcpsock_close,
.sop_free = tcpsock_free
};