
This commit adds a new TCP/IP service to MINIX 3. At its core, the service
uses the lwIP TCP/IP stack, for maintenance reasons. The service aims to be
compatible with NetBSD userland, including its low-level network management
utilities. It also aims to support modern features such as IPv6.

In summary, the new LWIP service has support for the following main features:

- TCP, UDP, RAW sockets with mostly standard BSD API semantics;
- IPv6 support: host mode (complete) and router mode (partial);
- most of the standard BSD API socket options (SO_);
- all of the standard BSD API message flags (MSG_);
- the most used protocol-specific socket and control options;
- a default loopback interface and the ability to create one more;
- configuration-free ethernet interfaces and driver tracking;
- queuing and multiple concurrent requests to each ethernet driver;
- standard ioctl(2)-based BSD interface management;
- radix tree backed, destination-based routing;
- routing sockets for standard BSD route reporting and management;
- multicast traffic and multicast group membership tracking;
- Berkeley Packet Filter (BPF) devices;
- standard and custom sysctl(7) nodes for many internals;
- a slab allocation based, hybrid static/dynamic memory pool model.

Many of its modules come with fairly elaborate comments that cover many
aspects of what is going on. The service is primarily a socket driver built
on top of the libsockdriver library, but for BPF devices it is at the same
time also a character driver.

Change-Id: Ib0c02736234b21143915e5fcc0fda8fe408f046f
/* LWIP service - tcpsock.c - TCP sockets */
/*
 * This module implements support for TCP sockets based on lwIP's core TCP PCB
 * module, which is largely but not fully cooperative with exactly what we want
 * to achieve, with the result that this module is rather complicated.
 *
 * Each socket has a send queue and a receive queue.  Both use lwIP's own
 * (pbuf) buffers, which largely come out of the main 512-byte buffer pool.
 * The buffers on the send queue are allocated and freed by us--the latter only
 * once they are no longer in use by lwIP as well.  A bit counterintuitively,
 * we deliberately use a smaller lwIP per-PCB TCP send buffer limit
 * (TCP_SND_BUF) in the lwIP send configuration (lwipopts.h) in order to more
 * easily trigger conditions where we cannot enqueue data (or the final FIN)
 * right away.  This way, we get to test the internal logic of this module a
 * lot more easily.  The small lwIP send queue size should not have any impact
 * on performance, as our own per-socket send queues can be much larger and we
 * enqueue more of that on the lwIP PCB as soon as we can in all cases.
 *
 * The receive queue consists of whatever buffers were given to us by lwIP, but
 * since those may be many buffers with small amounts of data each, we perform
 * fairly aggressive merging of consecutive buffers.  The intended result is
 * that we waste no more than 50% of memory within the receive queue.  Merging
 * requires memory copies, which makes it expensive, but we do not configure
 * lwIP with enough buffers to make running out of buffers a non-issue, so this
 * trade-off is necessary.  Practical experience and measurements of the merge
 * policy will have to show whether and how the current policy may be improved.
 *
 * As can be expected, the connection close semantics are by far the most
 * complicated part of this module.  We attempt to get rid of the lwIP PCB as
 * soon as we can, letting lwIP take care of the TIME_WAIT state for example.
 * However, there are various conditions that have to be met before we can
 * forget about the PCB here--most importantly, that none of our sent data
 * blocks are still referenced by lwIP because they have not yet been sent or
 * acknowledged.  We can only free the data blocks once lwIP is done with them.
 *
 * We do consider the TCP state of lwIP's PCB, in order to avoid duplicating
 * full state tracking here.  However, we do not look at a socket's TCP state
 * while in a lwIP-generated event for that socket, because the state may not
 * necessarily reflect the (correct or new) TCP state of the connection, nor
 * may the PCB be available--this is the case for error events.  For these
 * reasons we use a few internal TCPF_ flags to perform partial state tracking.
 *
 * More generally, we tend to access lwIP PCB fields directly only when lwIP's
 * own BSD API implementation does that too and there is no better alternative.
 * One example of this is the check to see if our FIN was acknowledged, for
 * SO_LINGER support.  In terms of maintenance, our hope is that if lwIP's API
 * changes later, we can change our code to imitate whatever lwIP's BSD API
 * implementation does at that point.
 */

#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

/*
 * Unfortunately, NetBSD and lwIP have different definitions of a few relevant
 * preprocessor variables.  Make sure we do not attempt to use the NetBSD one
 * where it matters.  We do need one of the NetBSD definitions though.
 */
static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY;
#undef TF_NODELAY
#undef TCP_MSS

#include "lwip.h"
#include "tcpisn.h"

#include "lwip/tcp.h"
#include "lwip/priv/tcp_priv.h"	/* for tcp_pcb_lists */

/*
 * The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP configuration.
 */

/*
 * We fully control the send buffer, so we can let its size be set to whatever
 * we want.  The receive buffer is different: if it is smaller than the window
 * size, we may have to refuse data that lwIP hands us, at which point more
 * incoming data will cause lwIP to abort the TCP connection--even aside from
 * performance issues.  Therefore, we must make sure the receive buffer is
 * larger than the TCP window at all times.
 */
#define TCP_SNDBUF_MIN	1		/* minimum TCP send buffer size */
#define TCP_SNDBUF_DEF	32768		/* default TCP send buffer size */
#define TCP_SNDBUF_MAX	131072		/* maximum TCP send buffer size */
#define TCP_RCVBUF_MIN	TCP_WND		/* minimum TCP receive buffer size */
#define TCP_RCVBUF_DEF	MAX(TCP_WND, 32768)	/* default TCP recv buffer size */
#define TCP_RCVBUF_MAX	MAX(TCP_WND, 131072)	/* maximum TCP recv buffer size */

/*
 * The total number of buffers that may be in use for TCP socket send queues.
 * The goal is to allow at least some progress to be made on receiving from TCP
 * sockets and on differently-typed sockets, at least as long as the LWIP
 * service can manage to allocate the memory it wants.  For the case that it
 * does not, we can only reactively kill off TCP sockets and/or free enqueued
 * ethernet packets, neither of which is currently implemented (TODO).
 */
#define TCP_MAX_SENDBUFS	(mempool_max_buffers() * 3 / 4)

/* Polling intervals, in 500-millisecond units. */
#define TCP_POLL_REG_INTERVAL	10	/* interval for reattempting sends */
#define TCP_POLL_CLOSE_INTERVAL	1	/* interval while closing connection */
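/*
 * (For reference: in the 500-millisecond units used above, the regular
 * interval amounts to one poll per five seconds, and the close interval to
 * two polls per second, matching the tcpsock_event_poll() description below.)
 */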

static struct tcpsock {
	struct ipsock tcp_ipsock;		/* IP socket, MUST be first */
	struct tcp_pcb *tcp_pcb;		/* lwIP TCP control block */
	union pxfer_tcp_queue {			/* free/accept queue */
		TAILQ_ENTRY(tcpsock) tq_next;	/* next in queue */
		TAILQ_HEAD(, tcpsock) tq_head;	/* head of queue */
	} tcp_queue;
	struct tcpsock *tcp_listener;		/* listener if on accept q. */
	struct {				/* send queue */
		struct pbuf *ts_head;		/* first pbuf w/unacked data */
		struct pbuf *ts_unsent;		/* first pbuf w/unsent data */
		struct pbuf *ts_tail;		/* most recently added data */
		size_t ts_len;			/* total sent + unsent */
		unsigned short ts_head_off;	/* offset into head pbuf */
		unsigned short ts_unsent_off;	/* offset into unsent pbuf */
	} tcp_snd;
	struct {				/* receive queue */
		struct pbuf *tr_head;		/* first pbuf w/unrecvd data */
		struct pbuf **tr_pre_tailp;	/* ptr-ptr to newest pbuf */
		size_t tr_len;			/* bytes on receive queue */
		unsigned short tr_head_off;	/* offset into head pbuf */
		unsigned short tr_unacked;	/* current window reduction */
	} tcp_rcv;
} tcp_array[NR_TCPSOCK];

static TAILQ_HEAD(, tcpsock) tcp_freelist;	/* list of free TCP sockets */

static const struct sockevent_ops tcpsock_ops;

static unsigned int tcpsock_sendbufs;		/* # send buffers in use */
static unsigned int tcpsock_recvbufs;		/* # receive buffers in use */

/* A bunch of macros that are just for convenience. */
#define tcpsock_get_id(tcp)	(SOCKID_TCP | (sockid_t)((tcp) - tcp_array))
#define tcpsock_get_ipsock(tcp)	(&(tcp)->tcp_ipsock)
#define tcpsock_get_sock(tcp)	(ipsock_get_sock(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_sndbuf(tcp)	(ipsock_get_sndbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_rcvbuf(tcp)	(ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_ipv6(tcp)	(ipsock_is_ipv6(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_shutdown(tcp,fl)	\
	(sockevent_is_shutdown(tcpsock_get_sock(tcp), fl))
#define tcpsock_is_listening(tcp)	\
	(sockevent_is_listening(tcpsock_get_sock(tcp)))
#define tcpsock_get_flags(tcp)	(ipsock_get_flags(tcpsock_get_ipsock(tcp)))
#define tcpsock_set_flag(tcp,fl)	\
	(ipsock_set_flag(tcpsock_get_ipsock(tcp), fl))
#define tcpsock_clear_flag(tcp,fl) \
	(ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl))

static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *,
    struct rmib_oldp *, struct rmib_newp *);

/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */
/* TODO: add many more and make some of them writable.. */
static struct rmib_node net_inet_tcp_table[] = {
/* 2*/	[TCPCTL_SENDSPACE]	= RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF,
				    "sendspace",
				    "Default TCP send buffer size"),
/* 3*/	[TCPCTL_RECVSPACE]	= RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF,
				    "recvspace",
				    "Default TCP receive buffer size"),
/*29*/	[TCPCTL_LOOPBACKCKSUM]	= RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
				    loopif_cksum, "do_loopback_cksum",
				    "Perform TCP checksum on loopback"),
/*+0*/	[TCPCTL_MAXID]		= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0,
				    tcpsock_pcblist, "pcblist",
				    "TCP protocol control block list"),
/*+1*/	[TCPCTL_MAXID + 1]	= RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE |
				    CTLFLAG_HIDDEN | CTLTYPE_STRING,
				    TCPISN_SECRET_HEX_LENGTH, tcpisn_secret,
				    "isn_secret",
				    "TCP ISN secret (MINIX 3 specific)")
};

static struct rmib_node net_inet_tcp_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings");
static struct rmib_node net_inet6_tcp6_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings");

/*
 * Initialize the TCP sockets module.
 */
void
tcpsock_init(void)
{
	unsigned int slot;

	/* Initialize the list of free TCP sockets. */
	TAILQ_INIT(&tcp_freelist);

	for (slot = 0; slot < __arraycount(tcp_array); slot++)
		TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot],
		    tcp_queue.tq_next);

	/* Initialize other variables. */
	tcpsock_sendbufs = 0;
	tcpsock_recvbufs = 0;

	/* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */
	mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node);
	mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node);
}

/*
 * Initialize the state of a TCP socket's send queue.
 */
static void
tcpsock_reset_send(struct tcpsock * tcp)
{

	tcp->tcp_snd.ts_tail = NULL;
	tcp->tcp_snd.ts_unsent = NULL;
	tcp->tcp_snd.ts_head = NULL;
	tcp->tcp_snd.ts_len = 0;
	tcp->tcp_snd.ts_unsent_off = 0;
	tcp->tcp_snd.ts_head_off = 0;
}

/*
 * Initialize the state of a TCP socket's receive queue.
 */
static void
tcpsock_reset_recv(struct tcpsock * tcp)
{

	tcp->tcp_rcv.tr_pre_tailp = NULL;
	tcp->tcp_rcv.tr_head = NULL;
	tcp->tcp_rcv.tr_len = 0;
	tcp->tcp_rcv.tr_head_off = 0;
	tcp->tcp_rcv.tr_unacked = 0;
}

/*
 * Create a TCP socket.
 */
sockid_t
tcpsock_socket(int domain, int protocol, struct sock ** sockp,
    const struct sockevent_ops ** ops)
{
	struct tcpsock *tcp;
	uint8_t ip_type;

	switch (protocol) {
	case 0:
	case IPPROTO_TCP:
		break;

	default:
		return EPROTONOSUPPORT;
	}

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure.  Do not memset it to zero, as it is still
	 * part of the linked free list.  Initialization may still fail.  When
	 * adding new fields, make sure to change tcpsock_clone() accordingly.
	 */

	ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain,
	    TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp);

	if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL)
		return ENOBUFS;
	tcp_arg(tcp->tcp_pcb, tcp);

	tcp->tcp_listener = NULL;

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	*ops = &tcpsock_ops;
	return tcpsock_get_id(tcp);
}

/*
 * Create a TCP socket for the TCP PCB 'pcb' which identifies a new connection
 * incoming on listening socket 'listener'.  The new socket is essentially a
 * "clone" of the listening TCP socket, in that it should inherit any settings
 * from the listening socket.  The socket has not yet been accepted by
 * userland, so add it to the queue of connections pending for the listening
 * socket.  On success, return OK.  On failure, return a negative error code.
 */
static int
tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb)
{
	struct tcpsock *tcp;

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure.  Do not memset it to zero, as it is still
	 * part of the linked free list.  Initialization may still fail.  Most
	 * settings should be inherited from the listening socket here, rather
	 * than being initialized to their default state.
	 */

	ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp),
	    tcpsock_get_id(tcp));

	tcp->tcp_pcb = pcb;
	tcp_arg(pcb, tcp);

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	/*
	 * Remove the new socket from the free list, and add it to the queue of
	 * the listening socket--in this order, because the same next pointer
	 * is used for both.
	 */
	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp,
	    tcp_queue.tq_next);
	tcp->tcp_listener = listener;

	return OK;
}

/*
 * Allocate a buffer from the pool, using the standard pool size.  The returned
 * buffer is a single element--never a chain.
 */
static struct pbuf *
tcpsock_alloc_buf(void)
{
	struct pbuf *pbuf;

	pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM);

	assert(pbuf == NULL || pbuf->len == pbuf->tot_len);

	return pbuf;
}

/*
 * Free the given buffer.  Ensure that pbuf_free() will not attempt to free the
 * next buffer(s) in the chain as well.  This may be called for pbufs other
 * than those allocated with tcpsock_alloc_buf().
 */
static void
tcpsock_free_buf(struct pbuf * pbuf)
{

	/*
	 * Resetting the length is currently not necessary, but better safe
	 * than sorry..
	 */
	pbuf->len = pbuf->tot_len;
	pbuf->next = NULL;

	pbuf_free(pbuf);
}

/*
 * Clear the send queue of a TCP socket.  The caller must ensure that lwIP will
 * no longer access any of the data on the send queue.
 */
static void
tcpsock_clear_send(struct tcpsock * tcp)
{
	struct pbuf *phead;

	assert(tcp->tcp_pcb == NULL);

	while ((phead = tcp->tcp_snd.ts_head) != NULL) {
		tcp->tcp_snd.ts_head = phead->next;

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	tcpsock_reset_send(tcp);
}

/*
 * Clear the receive queue of a TCP socket.  If 'ack_data' is set, also
 * acknowledge the previous contents of the receive queue to lwIP.
 */
static size_t
tcpsock_clear_recv(struct tcpsock * tcp, int ack_data)
{
	struct pbuf *phead;
	size_t rlen;

	rlen = tcp->tcp_rcv.tr_len;

	while ((phead = tcp->tcp_rcv.tr_head) != NULL) {
		tcp->tcp_rcv.tr_head = phead->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * From now on, we will basically be discarding incoming data as fast
	 * as possible, to keep the full window open at all times.
	 */
	if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0)
		tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked);

	tcpsock_reset_recv(tcp);

	return rlen;
}

/*
 * The TCP socket's PCB has been detached from the socket, typically because
 * the connection was aborted, either by us or by lwIP.  Either way, any TCP
 * connection is gone.  Clear the socket's send queue, remove the socket from
 * a listening socket's queue, and if the socket itself is ready and allowed to
 * be freed, free it now.  The socket is ready to be freed if it was either on
 * a listening queue or being closed already.  The socket is allowed to be
 * freed only if 'may_free' is TRUE.  If the socket is not freed, its receive
 * queue is left as is, as it may still have data to be received by userland.
 */
static int
tcpsock_cleanup(struct tcpsock * tcp, int may_free)
{
	int destroy;

	assert(tcp->tcp_pcb == NULL);

	/*
	 * Free any data on the send queue.  This is safe to do right now,
	 * because the PCB has been aborted (or was already gone).  We must be
	 * very careful about clearing the send queue in all other situations.
	 */
	tcpsock_clear_send(tcp);

	/*
	 * If this was a socket pending acceptance, remove it from the
	 * corresponding listener socket's queue, and free it.  Otherwise, free
	 * the socket only if it suspended a graceful close operation.
	 */
	if (tcp->tcp_listener != NULL) {
		TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp,
		    tcp_queue.tq_next);
		tcp->tcp_listener = NULL;

		/*
		 * The listener socket's backlog count should be adjusted by
		 * lwIP whenever the PCB is freed up, so we need (and must) not
		 * attempt to do that here.
		 */

		destroy = TRUE;
	} else
		destroy = sockevent_is_closing(tcpsock_get_sock(tcp));

	/*
	 * Do not free the socket if 'may_free' is FALSE.  That flag may be set
	 * if we are currently in the second tcpsock_close() call on the
	 * socket, in which case sockevent_is_closing() is TRUE but we must
	 * still not free the socket now: doing so would derail libsockevent.
	 */
	if (destroy && may_free) {
		(void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);
	}

	return destroy;
}

/*
 * Abort the lwIP PCB for the given socket, using tcp_abort().  If the PCB is
 * connected, this will cause the connection to be reset.  The PCB, which must
 * have still been present before the call, will be gone after the call.
 */
static void
tcpsock_pcb_abort(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);
	assert(!tcpsock_is_listening(tcp));

	tcp_recv(tcp->tcp_pcb, NULL);
	tcp_sent(tcp->tcp_pcb, NULL);
	tcp_err(tcp->tcp_pcb, NULL);
	tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);

	tcp_arg(tcp->tcp_pcb, NULL);

	tcp_abort(tcp->tcp_pcb);

	tcp->tcp_pcb = NULL;
}

/*
 * Close the lwIP PCB for the given socket, using tcp_close().  If the PCB is
 * connected, its graceful close will be finished by lwIP in the background.
 * The PCB, which must have still been present before the call, will be gone
 * after the call.
 */
static void
tcpsock_pcb_close(struct tcpsock * tcp)
{
	err_t err;

	assert(tcp->tcp_pcb != NULL);
	assert(tcp->tcp_snd.ts_len == 0);

	if (!tcpsock_is_listening(tcp)) {
		tcp_recv(tcp->tcp_pcb, NULL);
		tcp_sent(tcp->tcp_pcb, NULL);
		tcp_err(tcp->tcp_pcb, NULL);
		tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);
	}

	tcp_arg(tcp->tcp_pcb, NULL);

	if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK)
		panic("unexpected TCP close failure: %d", err);

	tcp->tcp_pcb = NULL;
}

/*
 * Return TRUE if all conditions are met for closing the TCP socket's PCB, or
 * FALSE if they are not.  Upon calling this function, the socket's PCB must
 * still be around.
 */
static int
tcpsock_may_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);

	/*
	 * Regular closing of the PCB requires three conditions to be met:
	 *
	 * 1. all our data has been transmitted AND acknowledged, so that we do
	 *    not risk corruption in case there are still unsent or unack'ed
	 *    data buffers that may otherwise be recycled too soon;
	 * 2. we have sent our FIN to the peer; and,
	 * 3. we have received a FIN from the peer.
	 */
	return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) ==
	    (TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0);
}

/*
 * The given socket is ready to be closed as per the tcpsock_may_close() rules.
 * This implies that its send queue is already empty.  Gracefully close the
 * PCB.  In addition, if the socket is being closed gracefully, meaning we
 * suspended an earlier tcpsock_close() call (and as such already emptied the
 * receive queue as well), then tell libsockevent that the close is finished,
 * freeing the socket.  Return TRUE if the socket has indeed been freed this
 * way, or FALSE if the socket is still around.
 */
static int
tcpsock_finish_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_listener == NULL);

	/*
	 * If we get here, we have already shut down the sending side of the
	 * PCB.  Technically, we are interested only in shutting down the
	 * receiving side of the PCB here, so that lwIP may decide to recycle
	 * the socket later etcetera.  We call tcp_close() because we do not
	 * want to rely on tcp_shutdown(RX) doing the exact same thing.
	 * However, we do rely on the fact that the PCB is not immediately
	 * destroyed by the tcp_close() call: otherwise we may have to return
	 * ERR_ABRT if this function is called from a lwIP-generated event.
	 */
	tcpsock_pcb_close(tcp);

	/*
	 * If we suspended an earlier tcpsock_close() call, we have to tell
	 * libsockevent that the close operation is now complete.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		assert(tcp->tcp_rcv.tr_len == 0);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);

		return TRUE;
	} else
		return FALSE;
}

/*
 * Attempt to start or resume enqueuing data and/or a FIN to send on the given
 * TCP socket.  Return TRUE if anything at all could be newly enqueued on the
 * lwIP PCB, even if less than desired.  In that case, the caller should try to
 * send whatever was enqueued, and if applicable, check if the socket may now
 * be closed (due to the FIN being enqueued).  In particular, in any situation
 * where the socket may be in the process of being closed, the caller must use
 * tcpsock_may_close() if TRUE is returned.  Return FALSE if nothing new could
 * be enqueued, in which case no send attempt needs to be made either.
 */
static int
tcpsock_pcb_enqueue(struct tcpsock * tcp)
{
	struct pbuf *punsent;
	size_t space, chunk;
	unsigned int flags;
	err_t err;
	int enqueued;

	assert(tcp->tcp_pcb != NULL);

	if (tcpsock_get_flags(tcp) & TCPF_FULL)
		return FALSE;

	/*
	 * Attempt to enqueue more unsent data, if any, on the PCB's send
	 * queue.
	 */
	enqueued = FALSE;

	while (tcp->tcp_snd.ts_unsent != NULL) {
		if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0)
			break;

		/*
		 * We may maintain a non-NULL unsent pointer even when there is
		 * nothing more to send right now, because the tail buffer may
		 * be filled up further later on.
		 */
		punsent = tcp->tcp_snd.ts_unsent;

		assert(punsent->len >= tcp->tcp_snd.ts_unsent_off);

		chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off;
		if (chunk == 0)
			break;

		if (chunk > space)
			chunk = space;

		/* Try to enqueue more data for sending. */
		if (chunk < punsent->len || punsent->next != NULL)
			flags = TCP_WRITE_FLAG_MORE;
		else
			flags = 0;
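		/*
		 * (In lwIP, TCP_WRITE_FLAG_MORE indicates that more data will
		 * follow, so that the PSH flag is not set on the last segment
		 * of this write.)
		 */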

		err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload +
		    tcp->tcp_snd.ts_unsent_off, chunk, flags);

		/*
		 * Since tcp_write() enqueues data only, it should only return
		 * out-of-memory errors; no fatal ones.  In any case, stop.
		 */
		if (err != ERR_OK) {
			assert(err == ERR_MEM);

			break;
		}

		/* We have successfully enqueued data. */
		enqueued = TRUE;

		tcp->tcp_snd.ts_unsent_off += chunk;

		if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) {
			assert(tcp->tcp_snd.ts_unsent_off < punsent->len ||
			    punsent->next == NULL);

			break;
		}

		tcp->tcp_snd.ts_unsent = punsent->next;
		tcp->tcp_snd.ts_unsent_off = 0;
	}

	/*
	 * If all pending data has been enqueued for sending, and we should
	 * shut down the sending end of the socket, try that now.
	 */
	if ((tcp->tcp_snd.ts_unsent == NULL ||
	    tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) &&
	    tcpsock_is_shutdown(tcp, SFL_SHUT_WR) &&
	    !(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) {
		err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/);

		if (err == ERR_OK) {
			/*
			 * We have successfully enqueued a FIN.  The caller is
			 * now responsible for checking whether the PCB and
			 * possibly even the socket object can now be freed.
			 */
			tcpsock_set_flag(tcp, TCPF_SENT_FIN);

			enqueued = TRUE;
		} else {
			assert(err == ERR_MEM);

			/*
			 * FIXME: the resolution for lwIP bug #47485 has taken
			 * away even more control over the closing process from
			 * us, making tracking sockets especially for SO_LINGER
			 * even harder.  For now, we simply effectively undo
			 * the patch by clearing TF_CLOSEPEND if tcp_shutdown()
			 * returns ERR_MEM.  This will not be sustainable in
			 * the long term, though.
			 */
			tcp->tcp_pcb->flags &= ~TF_CLOSEPEND;

			tcpsock_set_flag(tcp, TCPF_FULL);
		}
	}

	return enqueued;
}

/*
 * Request lwIP to start sending any enqueued data and/or FIN on the TCP
 * socket's lwIP PCB.  On success, return OK.  On failure, return a negative
 * error code, after cleaning up the socket, freeing the PCB.  If the socket
 * was already being closed, also free the socket object in that case; the
 * caller must then not touch the socket object anymore upon return.  If the
 * socket object is not freed, and if 'raise_error' is TRUE, raise the error
 * on the socket object.
 */
static int
tcpsock_pcb_send(struct tcpsock * tcp, int raise_error)
{
	err_t err;
	int r;

	assert(tcp->tcp_pcb != NULL);

	/*
	 * If we have enqueued something, ask lwIP to send TCP packets now.
	 * This may result in a fatal error, in which case we clean up the
	 * socket and return the error to the caller.  Since cleaning up the
	 * socket may free the socket object, and the caller cannot tell
	 * whether that will happen or has happened, also possibly raise the
	 * error on the socket object if it is not gone.  As such, callers that
	 * set 'raise_error' to FALSE must know for sure that the socket was
	 * not being closed, for example because the caller is processing a
	 * (send) call from userland.
	 */
	err = tcp_output(tcp->tcp_pcb);

	if (err != ERR_OK && err != ERR_MEM) {
		tcpsock_pcb_abort(tcp);

		r = util_convert_err(err);

		if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
			if (raise_error)
				sockevent_set_error(tcpsock_get_sock(tcp), r);
		}
		/* Otherwise, do not touch the socket object anymore! */

		return r;
	} else
		return OK;
}

/*
 * Callback from lwIP.  The given number of data bytes have been acknowledged
 * as received by the remote end.  Dequeue and free data from the TCP socket's
 * send queue as appropriate.
 */
static err_t
tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *phead;
	size_t left;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);
	assert(len > 0);

	assert(tcp->tcp_snd.ts_len >= len);
	assert(tcp->tcp_snd.ts_head != NULL);

	left = len;

	/*
	 * First see if we can free up whole buffers.  Check against the head
	 * buffer's 'len' rather than 'tot_len', or we may end up leaving an
	 * empty buffer on the chain.
	 */
	while ((phead = tcp->tcp_snd.ts_head) != NULL &&
	    left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) {
		left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off;

		tcp->tcp_snd.ts_head = phead->next;
		tcp->tcp_snd.ts_head_off = 0;

		if (phead == tcp->tcp_snd.ts_unsent) {
			assert(tcp->tcp_snd.ts_unsent_off == phead->len);

			tcp->tcp_snd.ts_unsent = phead->next;
			tcp->tcp_snd.ts_unsent_off = 0;
		}

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * The rest of the given length is for less than the current head
	 * buffer.
	 */
	if (left > 0) {
		assert(tcp->tcp_snd.ts_head != NULL);
		assert((size_t)tcp->tcp_snd.ts_head->len -
		    tcp->tcp_snd.ts_head_off > left);

		tcp->tcp_snd.ts_head_off += left;
	}

	tcp->tcp_snd.ts_len -= (size_t)len;

	if (tcp->tcp_snd.ts_head == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);
		assert(tcp->tcp_snd.ts_unsent == NULL);
		tcp->tcp_snd.ts_tail = NULL;
	} else
		assert(tcp->tcp_snd.ts_len > 0);

	/*
	 * If we emptied the send queue, and we already managed to send a FIN
	 * earlier, we may now have met all requirements to close the socket's
	 * PCB.  Otherwise, we may also be able to send more now, so try to
	 * resume sending.  Since we are invoked from the "sent" event,
	 * tcp_output() will not actually process anything, and so we do not
	 * call it either.  If we did, we would have to deal with errors here.
	 */
	if (tcpsock_may_close(tcp)) {
		if (tcpsock_finish_close(tcp))
			return ERR_OK;
	} else {
		tcpsock_clear_flag(tcp, TCPF_FULL);

		/*
		 * If we now manage to enqueue a FIN, we may be ready to close
		 * the PCB after all.
		 */
		if (tcpsock_pcb_enqueue(tcp)) {
			if (tcpsock_may_close(tcp) &&
			    tcpsock_finish_close(tcp))
				return ERR_OK;
		}
	}

	/* The user may also be able to send more now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

	return ERR_OK;
}

/*
 * Check whether any (additional) data previously received on a TCP socket
 * should be acknowledged, possibly allowing the remote end to send additional
 * data as a result.
 */
static void
tcpsock_ack_recv(struct tcpsock * tcp)
{
	size_t rcvbuf, left, delta, ack;

	assert(tcp->tcp_pcb != NULL);

	/*
	 * We must make sure that at all times, we can still add an entire
	 * window's worth of data to the receive queue.  If the amount of free
	 * space drops below that threshold, we stop acknowledging received
	 * data.  The user may change the receive buffer size at all times; we
	 * update the window size lazily as appropriate.
	 */
	rcvbuf = tcpsock_get_rcvbuf(tcp);

	if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) {
		/*
		 * The number of bytes that lwIP can still give us at any time
		 * is represented as 'left'.  The number of bytes that we still
		 * allow to be stored in the receive queue is represented as
		 * 'delta'.  We must make sure that 'left' does not ever exceed
		 * 'delta' while acknowledging as many bytes as possible under
		 * that rule.
		 */
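		/*
		 * (A worked example, with hypothetical numbers: say TCP_WND is
		 * 8192 and tr_unacked is 4096, so lwIP may still hand us up to
		 * left = 4096 bytes.  With rcvbuf = 16384 and tr_len = 4096,
		 * we may still store delta = 12288 bytes.  Since left < delta,
		 * we can acknowledge delta - left = 8192 bytes, capped to the
		 * 4096 bytes that are actually unacknowledged.)
		 */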
		left = TCP_WND - tcp->tcp_rcv.tr_unacked;
		delta = rcvbuf - tcp->tcp_rcv.tr_len;

		if (left < delta) {
			ack = delta - left;

			if (ack > tcp->tcp_rcv.tr_unacked)
				ack = tcp->tcp_rcv.tr_unacked;

			tcp_recved(tcp->tcp_pcb, ack);

			tcp->tcp_rcv.tr_unacked -= ack;

			assert(tcp->tcp_rcv.tr_len + TCP_WND -
			    tcp->tcp_rcv.tr_unacked <= rcvbuf);
		}
	}
}

/*
 * Attempt to merge two consecutive underfilled buffers in the receive queue of
 * a TCP socket, freeing up one of the two buffers as a result.  The first
 * (oldest) buffer is 'ptail', and the pointer to this buffer is stored at
 * 'pnext'.  The second (new) buffer is 'pbuf', which is already attached to
 * the first buffer.  The second buffer may be followed by additional buffers
 * with even more new data.  Return TRUE if buffers have been merged, in which
 * case the pointer at 'pnext' may have changed, and no assumptions should be
 * made about whether 'ptail' and 'pbuf' still exist in any form.  Return FALSE
 * if no merging was necessary or if no new buffer could be allocated.
 */
static int
tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail, struct pbuf * pbuf)
{
	struct pbuf *pnew;

	assert(*pnext == ptail);
	assert(ptail->next == pbuf);

	/*
	 * Unfortunately, we cannot figure out what kind of pbuf we were given
	 * by the lower layers, so we cannot merge two buffers without first
	 * allocating a third.  Once we have done that, though, we can easily
	 * merge more into that new buffer.  For now we use the following
	 * policies:
	 *
	 * 1. if two consecutive lwIP-provided buffers are both used less than
	 *    half the size of a full buffer, try to allocate a new buffer and
	 *    copy both lwIP-provided buffers into that new buffer, freeing up
	 *    the pair afterwards;
	 * 2. if the tail buffer on the chain is allocated by us and not yet
	 *    full, and the next buffer's contents can be added to the tail
	 *    buffer in their entirety, do just that.
	 *
	 * Obviously there is a trade-off between the performance overhead of
	 * copying and the resource overhead of keeping less-than-full buffers
	 * on the receive queue, but this policy should both keep actual memory
	 * usage to no more than twice the receive queue length and prevent
	 * excessive copying.  The policy deliberately performs more aggressive
	 * merging into a buffer that we allocated ourselves.
	 */
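	/*
	 * (For illustration, assuming the 512-byte pool buffers mentioned at
	 * the top of this file: two buffers holding 200 bytes each fall under
	 * case #1 below, and are copied into one new 512-byte buffer holding
	 * 400 bytes, freeing up one buffer net.  If that merged buffer is the
	 * tail and the next buffer holds at most 112 bytes, case #2 applies
	 * and the data is appended to the merged buffer directly.)
	 */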
	if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 &&
	    pbuf->len <= MEMPOOL_BUFSIZE / 2) {
		/*
		 * Case #1.
		 */
		assert(ptail->tot_len == ptail->len);
		assert(pbuf->tot_len == pbuf->len);

		pnew = tcpsock_alloc_buf();
		if (pnew == NULL)
			return FALSE;

		memcpy(pnew->payload, ptail->payload, ptail->len);
		memcpy((char *)pnew->payload + ptail->len, pbuf->payload,
		    pbuf->len);
		pnew->len = ptail->len + pbuf->len;
		assert(pnew->len <= pnew->tot_len);

		pnew->next = pbuf->next;
		/* For now, we need not inherit any flags from either pbuf. */

		*pnext = pnew;

		/* One allocated, two about to be deallocated. */
		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(ptail);
		tcpsock_free_buf(pbuf);

		return TRUE;
	} else if (ptail->tot_len - ptail->len >= pbuf->len) {
		/*
		 * Case #2.
		 */
		memcpy((char *)ptail->payload + ptail->len, pbuf->payload,
		    pbuf->len);

		ptail->len += pbuf->len;

		ptail->next = pbuf->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(pbuf);

		return TRUE;
	} else
		return FALSE;
}

/*
 * Callback from lwIP.  New data or flags have been received on a TCP socket.
 */
static err_t
tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused,
    struct pbuf * pbuf, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *ptail, **pprevp;
	size_t len;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * lwIP should never provide anything other than ERR_OK in 'err', and
	 * it is not clear what we should do if it would.  If lwIP ever changes
	 * in this regard, we will likely have to change this code accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP receive event with error: %d", err);

	/* If the given buffer is NULL, we have received a FIN. */
	if (pbuf == NULL) {
		tcpsock_set_flag(tcp, TCPF_RCVD_FIN);

		/* Userland may now receive EOF. */
		if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD))
			sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

		/*
		 * If we were in the process of closing the socket, and we
		 * receive a FIN before our FIN got acknowledged, we close the
		 * socket anyway, as described in tcpsock_close().  However, if
		 * there is still unacknowledged outgoing data or we did not
		 * even manage to send our FIN yet, hold off closing the socket
		 * for now.
		 */
		if (tcpsock_may_close(tcp))
			(void)tcpsock_finish_close(tcp);

		return ERR_OK;
	}

	/*
	 * If the socket is being closed, receiving new data should cause a
	 * reset.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		tcpsock_pcb_abort(tcp);

		(void)tcpsock_cleanup(tcp, TRUE /*may_free*/);
		/* Do not touch the socket object anymore! */

		pbuf_free(pbuf);

		return ERR_ABRT;
	}

	/*
	 * If the socket has already been shut down for reading, discard the
	 * incoming data and do nothing else.
	 */
	if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) {
		tcp_recved(tcp->tcp_pcb, pbuf->tot_len);

		pbuf_free(pbuf);

		return ERR_OK;
	}

	/*
	 * We deliberately ignore the PBUF_FLAG_PUSH flag.  This flag would
	 * enable the receive functionality to delay delivering "un-pushed"
	 * data to applications.  The implementation of this scheme could track
	 * the amount of data up to and including the last-pushed segment using
	 * a "tr_push_len" field or so.  Deciding when to deliver "un-pushed"
	 * data after all is a bit trickier though.  As far as I can tell, the
	 * BSDs do not implement anything like that.  Windows does, and this
	 * results in interaction problems with even more lightweight TCP/IP
	 * stacks that do not send the TCP PSH flag.  Currently, there is no
	 * obvious benefit for us to support delaying data delivery like that.
	 * In addition, testing its implementation reliably would be difficult.
	 */

	len = (size_t)pbuf->tot_len;

	/*
	 * Count the number of buffers that are now owned by us.  The new total
	 * of buffers owned by us must not exceed the size of the memory pool.
	 * Any more would indicate an accounting error.  Note that
	 * tcpsock_recvbufs is currently used for debugging only!
	 */
	tcpsock_recvbufs += pbuf_clen(pbuf);
	assert(tcpsock_recvbufs < mempool_cur_buffers());

	/*
	 * The pre-tail pointer points to whatever is pointing to the tail
	 * buffer.  The latter pointer may be the 'tr_head' field in our
	 * tcpsock structure, or the 'next' field in the penultimate buffer,
	 * or NULL if there are currently no buffers on the receive queue.
	 */
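	/*
	 * (Illustration: with pbufs A->B->C queued, tr_head points to A and
	 * tr_pre_tailp points to B's 'next' field, which in turn points to
	 * tail C.  With a single pbuf A queued, tr_pre_tailp points to the
	 * 'tr_head' field itself.)
	 */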
	if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) {
		ptail = *pprevp;

		assert(ptail != NULL);
		assert(ptail->next == NULL);
		assert(tcp->tcp_rcv.tr_head != NULL);

		ptail->next = pbuf;
		pbuf->tot_len = pbuf->len;	/* to help freeing on merges */

		if (tcpsock_try_merge(pprevp, ptail, pbuf)) {
			ptail = *pprevp;
			pbuf = ptail->next;
		}

		if (pbuf != NULL)
			pprevp = &ptail->next;
	} else {
		assert(tcp->tcp_rcv.tr_head == NULL);
		assert(tcp->tcp_rcv.tr_head_off == 0);

		tcp->tcp_rcv.tr_head = pbuf;

		pprevp = &tcp->tcp_rcv.tr_head;
	}

	/*
	 * Chop up the chain into individual buffers.  This is necessary as we
	 * overload 'tot_len' to mean "space available in the buffer", as we
	 * want for buffers allocated by us as part of buffer merges.  Also get
	 * a pointer to the pointer to the new penultimate tail buffer.  Due to
	 * merging, the chain may already be empty by now, though.
	 */
	if (pbuf != NULL) {
		for (; pbuf->next != NULL; pbuf = pbuf->next) {
			pbuf->tot_len = pbuf->len;

			pprevp = &pbuf->next;
		}
		assert(pbuf->len == pbuf->tot_len);
	}

	assert(*pprevp != NULL);
	assert((*pprevp)->next == NULL);
	tcp->tcp_rcv.tr_pre_tailp = pprevp;

	tcp->tcp_rcv.tr_len += len;
	tcp->tcp_rcv.tr_unacked += len;

	assert(tcp->tcp_rcv.tr_unacked <= TCP_WND);

	/*
	 * Note that tr_len may now exceed the receive buffer size in the
	 * highly exceptional case that the user is adjusting the latter after
	 * the socket had already received data.
	 */

	/* See if we can immediately acknowledge some or all of the data. */
	tcpsock_ack_recv(tcp);

	/* Also wake up any receivers now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

	return ERR_OK;
}

/*
 * Callback from lwIP.  The PCB corresponding to the socket identified by 'arg'
 * has been closed by lwIP, with the reason specified in 'err': either the
 * connection has been aborted locally (ERR_ABRT), it has been reset by the
 * remote end (ERR_RST), or it is closed due to state transitions (ERR_CLSD).
 */
static void
tcpsock_event_err(void * arg, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	int r;

	assert(tcp != NULL);
	assert(tcp->tcp_pcb != NULL);
	assert(err != ERR_OK);

	/* The original PCB is now gone, or will be shortly. */
	tcp->tcp_pcb = NULL;

	/*
	 * Clean up the socket.  As a result it may be freed, in which case we
	 * must not touch it anymore.  No need to return ERR_ABRT from here, as
	 * the PCB has been aborted already.
	 */
	if (tcpsock_cleanup(tcp, TRUE /*may_free*/))
		return;

	if (err == ERR_CLSD) {
		/*
		 * We may get here if the socket is shut down for writing and
		 * we already received a FIN from the remote side, thus putting
		 * the socket in LAST_ACK state, and we receive that last
		 * acknowledgment.  There is nothing more we need to do.
		 *
		 * We will never get here in the other case that ERR_CLSD is
		 * raised, which is when the socket is reset because of
		 * unacknowledged data while closing: we handle the
		 * reset-on-ACK case ourselves in tcpsock_close(), and the
		 * socket is in closing state after that.
		 */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
	} else {
		/*
		 * Anything else should be an error directly from lwIP;
		 * currently either ERR_ABRT or ERR_RST.  Convert it to a
		 * regular error and set it on the socket.  Doing so will also
		 * raise the appropriate events.
		 */
		/*
		 * Unfortunately, lwIP is not throwing accurate errors even
		 * when it can.  We convert some errors to reflect more
		 * accurately the most likely cause.
		 *
		 * TODO: fix lwIP in this regard..
		 */
		r = util_convert_err(err);

		if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) {
			switch (err) {
			case ERR_ABRT: r = ETIMEDOUT;	 break;
			case ERR_RST:  r = ECONNREFUSED; break;
			}
		}

		sockevent_set_error(tcpsock_get_sock(tcp), r);
	}
}

/*
 * Callback from lwIP.  Perform regular checks on a TCP socket.  This function
 * is called once per five seconds on connected sockets, and twice per second
 * on closing sockets.
 */
static err_t
tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	err_t err;
	int r;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * If we ended up running out of buffers earlier, try resuming any send
	 * requests now, both for enqueuing TCP data with lwIP and for user
	 * requests.
	 */
	if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) {
		tcpsock_clear_flag(tcp, TCPF_FULL);
		tcpsock_clear_flag(tcp, TCPF_OOM);

		/* See if we can enqueue more data with lwIP. */
		if (tcpsock_pcb_enqueue(tcp)) {
			/* In some cases, we can now close the PCB. */
			if (tcpsock_may_close(tcp)) {
				(void)tcpsock_finish_close(tcp);
				/*
				 * The PCB is definitely gone here, and the
				 * entire socket object may be gone now too.
				 * Do not touch either anymore!
				 */

				return ERR_OK;
			}

			/*
			 * If actually sending the data fails, the PCB will be
			 * gone, and the socket object may be gone as well.  Do
			 * not touch either anymore in that case!
			 */
			if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK)
				return ERR_ABRT;
		}

		/*
		 * If we ran out of buffers earlier, it may be possible to take
		 * in more data from a user process now, even if we did not
		 * manage to enqueue any more pending data with lwIP.
		 */
		sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

		assert(tcp->tcp_pcb != NULL);
	} else if (tcp->tcp_snd.ts_unsent != NULL &&
	    tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) {
		/*
		 * If the send buffer is full, we will no longer call
		 * tcp_output(), which means we may also miss out on fatal
		 * errors that would otherwise kill the connection (e.g., no
		 * route).  As a result, the connection may erroneously
		 * continue to exist for a long time.  To avoid this, we call
		 * tcp_output() every once in a while when there are still
		 * unsent data.
		 */
		err = tcp_output(tcp->tcp_pcb);

		if (err != ERR_OK && err != ERR_MEM) {
			tcpsock_pcb_abort(tcp);

			if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
				r = util_convert_err(err);

				sockevent_set_error(tcpsock_get_sock(tcp), r);
			}
			/* Otherwise do not touch the socket object anymore! */

			return ERR_ABRT;
		}
	}

	/*
	 * If we are closing the socket, and we sent a FIN, see if the FIN got
	 * acknowledged.  If so, finish closing the socket.  Unfortunately, we
	 * can perform this check by polling only.  TODO: change lwIP..
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp)) &&
	    (tcpsock_get_flags(tcp) & TCPF_SENT_FIN) &&
	    tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);

		tcpsock_finish_close(tcp);
	}

	return ERR_OK;
}

/*
 * Bind a TCP socket to a local address.
 */
static int
tcpsock_bind(struct sock * sock, const struct sockaddr * addr,
    socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t ipaddr;
	uint16_t port;
	err_t err;
	int r;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)
		return EINVAL;

	if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port,
	    FALSE /*allow_mcast*/, &ipaddr, &port)) != OK)
		return r;

	err = tcp_bind(tcp->tcp_pcb, &ipaddr, port);

	return util_convert_err(err);
}

/*
 * Callback from lwIP.  A new connection 'pcb' has arrived on the listening
 * socket identified by 'arg'.  Note that 'pcb' may be NULL in the case that
 * lwIP could not accept the connection itself.
 */
static err_t
tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(tcpsock_is_listening(tcp));

	/*
	 * If the given PCB is NULL, then lwIP ran out of memory allocating a
	 * PCB for the new connection.  There is nothing we can do with that
	 * information.  Also check 'err' just to make sure.
	 */
	if (pcb == NULL || err != OK)
		return ERR_OK;

	/*
	 * The TCP socket is the listening socket, but the PCB is for the
	 * incoming connection.
	 */
	if (tcpsock_clone(tcp, pcb) != OK) {
		/*
		 * We could not allocate the resources necessary to accept the
		 * connection.  Abort it immediately.
		 */
		tcp_abort(pcb);

		return ERR_ABRT;
	}

	/*
	 * The connection has not yet been accepted, and thus should still be
	 * considered on the listen queue.
	 */
	tcp_backlog_delayed(pcb);

	/* Set the callback functions. */
	tcp_recv(pcb, tcpsock_event_recv);
	tcp_sent(pcb, tcpsock_event_sent);
	tcp_err(pcb, tcpsock_event_err);
	tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT);

	return ERR_OK;
}

/*
 * Put a TCP socket in listening mode.
 */
static int
tcpsock_listen(struct sock * sock, int backlog)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcp_pcb *pcb;
	err_t err;

	/* The maximum backlog value must not exceed its field size. */
	assert(SOMAXCONN <= UINT8_MAX);

	/*
	 * Allow only CLOSED sockets to enter listening mode.  If the socket
	 * was already in listening mode, allow its backlog value to be
	 * updated, even if it was shut down already (making this a no-op).
	 */
	if (!tcpsock_is_listening(tcp) &&
	    (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED))
		return EINVAL;

	/*
	 * If the socket was not already in listening mode, put it in that mode
	 * now.  That involves switching PCBs as lwIP attempts to save memory
	 * by replacing the original PCB with a smaller one.  If the socket was
	 * already in listening mode, simply update its backlog value--this has
	 * no effect on the sockets already in the backlog.
	 */
	if (!tcpsock_is_listening(tcp)) {
		assert(tcp->tcp_pcb != NULL);

		/*
		 * If the socket has not been bound to a port yet, do that
		 * first.  This does mean that the listen call may fail with
		 * side effects, but that is acceptable in this case.
		 */
		if (tcp->tcp_pcb->local_port == 0) {
			err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip,
			    0 /*port*/);

			if (err != ERR_OK)
				return util_convert_err(err);
		}

		/*
		 * Clear the argument on the PCB that is about to be replaced,
		 * because if we do not, once the PCB is reused (which does not
		 * clear the argument), we might get weird events.  Do this
		 * before the tcp_listen() call, because we should no longer
		 * access the old PCB afterwards (even if we can).
		 */
		tcp_arg(tcp->tcp_pcb, NULL);

		pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog,
		    &err);

		if (pcb == NULL) {
			tcp_arg(tcp->tcp_pcb, tcp);	/* oops, undo. */

			return util_convert_err(err);
		}

		tcp_arg(pcb, tcp);
		tcp->tcp_pcb = pcb;

		tcp_accept(pcb, tcpsock_event_accept);

		/* Initialize the queue head for sockets pending acceptance. */
		TAILQ_INIT(&tcp->tcp_queue.tq_head);
	} else if (tcp->tcp_pcb != NULL)
		tcp_backlog_set(tcp->tcp_pcb, backlog);

	return OK;
}

/*
 * Callback from lwIP.  A socket connection attempt has succeeded.  Note that
 * failed connection attempts will trigger the tcpsock_event_err() callback
 * instead.
 */
|
|
static err_t
|
|
tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err)
|
|
{
|
|
struct tcpsock *tcp = (struct tcpsock *)arg;
|
|
|
|
assert(tcp != NULL);
|
|
assert(pcb == tcp->tcp_pcb);
|
|
assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING);
|
|
|
|
/*
|
|
* If lwIP ever changes so that this callback is called for connect
|
|
* failures as well, then we need to change the code here accordingly.
|
|
*/
|
|
if (err != ERR_OK)
|
|
panic("TCP connected event with error: %d", err);
|
|
|
|
tcpsock_clear_flag(tcp, TCPF_CONNECTING);
|
|
|
|
sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND);
|
|
|
|
return ERR_OK;
|
|
}
|
|
|
|
/*
|
|
* Connect a TCP socket to a remote address.
|
|
*/
|
|
static int
|
|
tcpsock_connect(struct sock * sock, const struct sockaddr * addr,
|
|
socklen_t addr_len, endpoint_t user_endpt)
|
|
{
|
|
struct tcpsock *tcp = (struct tcpsock *)sock;
|
|
ip_addr_t dst_addr;
|
|
uint16_t dst_port;
|
|
err_t err;
|
|
int r;
|
|
|
|
/*
|
|
* Listening sockets may not have a PCB, so we use higher-level flags
|
|
* to throw the correct error code for those instead.
|
|
*/
|
|
if (tcpsock_is_listening(tcp))
|
|
return EOPNOTSUPP;
|
|
|
|
/*
|
|
* If there is no longer any PCB, we obviously cannot perform the
|
|
* connection, but POSIX is not clear on which error to return. We
|
|
* copy NetBSD's.
|
|
*/
|
|
if (tcp->tcp_pcb == NULL)
|
|
return EINVAL;
|
|
|
|
/*
|
|
* The only state from which a connection can be initiated, is CLOSED.
|
|
* Some of the other states require distinct error codes, though.
|
|
*/
|
|
switch (tcp->tcp_pcb->state) {
|
|
case CLOSED:
|
|
break;
|
|
case SYN_SENT:
|
|
return EALREADY;
|
|
case LISTEN:
|
|
assert(0); /* we just checked.. */
|
|
default:
|
|
return EISCONN;
|
|
}
|
|
|
|
/*
|
|
* Get the destination address, and attempt to start connecting. If
|
|
* the socket was not bound before, or it was bound to a port only,
|
|
* then lwIP will select a source address for us. We cannot do this
|
|
* ourselves even if we wanted to: it is impossible to re-bind a TCP
|
|
* PCB in the case it was previously bound to a port only.
|
|
*/
|
|
if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
|
|
&tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK)
|
|
return r;
|
|
|
|
err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port,
|
|
tcpsock_event_connected);
|
|
|
|
/*
|
|
* Note that various tcp_connect() error cases will leave the PCB with
|
|
* a newly set local and remote IP address anyway. We should be
|
|
* careful not to rely on the addresses being as they were before.
|
|
*/
|
|
if (err != ERR_OK)
|
|
return util_convert_err(err);
|
|
|
|
/* Set the other callback functions. */
|
|
tcp_recv(tcp->tcp_pcb, tcpsock_event_recv);
|
|
tcp_sent(tcp->tcp_pcb, tcpsock_event_sent);
|
|
tcp_err(tcp->tcp_pcb, tcpsock_event_err);
|
|
tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);
|
|
|
|
/*
|
|
* Set a flag so that we can correct lwIP's error codes in case the
|
|
* connection fails.
|
|
*/
|
|
tcpsock_set_flag(tcp, TCPF_CONNECTING);
|
|
|
|
return SUSPEND;
|
|
}
|
|
|
|
/*
|
|
* Test whether any new connections are pending on a listening TCP socket.
|
|
*/
|
|
static int
|
|
tcpsock_test_accept(struct sock * sock)
|
|
{
|
|
struct tcpsock *tcp = (struct tcpsock *)sock;
|
|
|
|
/* Is this socket in listening mode at all? */
|
|
if (!tcpsock_is_listening(tcp))
|
|
return EINVAL;
|
|
|
|
/* Are there any connections to accept right now? */
|
|
if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head))
|
|
return OK;
|
|
|
|
/* If the socket has been shut down, we return ECONNABORTED. */
|
|
if (tcp->tcp_pcb == NULL)
|
|
return ECONNABORTED;
|
|
|
|
/* Otherwise, wait for a new connection first. */
|
|
return SUSPEND;
|
|
}
|
|
|
|

/*
 * Accept a connection on a listening TCP socket, creating a new TCP socket.
 */
static sockid_t
tcpsock_accept(struct sock * sock, struct sockaddr * addr,
    socklen_t * addr_len, endpoint_t user_endpt __unused,
    struct sock ** newsockp)
{
	struct tcpsock *listener = (struct tcpsock *)sock;
	struct tcpsock *tcp;
	int r;

	if ((r = tcpsock_test_accept(sock)) != OK)
		return r;
	/* Below, we must not assume that the listener has a PCB. */

	tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head);
	assert(tcp->tcp_listener == listener);
	assert(tcp->tcp_pcb != NULL);

	TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next);
	tcp->tcp_listener = NULL;

	tcp_backlog_accepted(tcp->tcp_pcb);

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);

	/*
	 * Set 'newsockp' to NULL so that libsockevent knows we already cloned
	 * the socket, and it must not be reinitialized anymore.
	 */
	*newsockp = NULL;
	return tcpsock_get_id(tcp);
}

/*
 * Perform preliminary checks on a send request.
 */
static int
tcpsock_pre_send(struct sock * sock, size_t len __unused,
    socklen_t ctl_len __unused, const struct sockaddr * addr __unused,
    socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags)
{

	/*
	 * Reject calls with unknown flags. Since libsockevent strips out the
	 * flags it handles itself here, we only have to test for ones we can
	 * not handle. Currently, there are no send flags that we support.
	 */
	if (flags != 0)
		return EOPNOTSUPP;

	return OK;
}

/*
 * Test whether the given number of data bytes can be sent on a TCP socket.
 */
static int
tcpsock_test_send(struct sock * sock, size_t min)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	size_t sndbuf;

	if (tcp->tcp_pcb == NULL)
		return EPIPE;

	switch (tcp->tcp_pcb->state) {
	case CLOSED:		/* new */
	case LISTEN:		/* listening */
		return ENOTCONN;
	case SYN_SENT:		/* connecting */
	case SYN_RCVD:		/* simultaneous open, maybe someday? */
		return SUSPEND;
	case ESTABLISHED:	/* connected */
	case CLOSE_WAIT:	/* closed remotely */
		break;
	default:		/* shut down locally */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		return EPIPE;
	}

	sndbuf = tcpsock_get_sndbuf(tcp);
	if (min > sndbuf)
		min = sndbuf;

	if (tcp->tcp_snd.ts_len + min > sndbuf)
		return SUSPEND;
	else
		return OK;
}
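
/*
 * Worked example for the checks above (the numbers are hypothetical): with a
 * send buffer (sndbuf) of 32768 bytes, a low send watermark (min) of 4096
 * bytes, and 30000 bytes already enqueued (ts_len), the final test
 * "ts_len + min > sndbuf" evaluates as 30000 + 4096 > 32768, so the send
 * call suspends until at least 'min' bytes of buffer space have freed up.
 */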

/*
 * Send data on a TCP socket.
 */
static int
tcpsock_send(struct sock * sock, const struct sockdriver_data * data,
    size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
    socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
    const struct sockaddr * addr __unused, socklen_t addr_len __unused,
    endpoint_t user_endpt __unused, int flags __unused, size_t min)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct pbuf *ptail, *pfirst, *pnext, *plast;
	size_t off, tail_off, chunk, left, sndbuf;
	int r;

	if ((r = tcpsock_test_send(sock, min)) != OK)
		return r;

	if (len == 0)
		return OK; /* nothing to do */

	sndbuf = tcpsock_get_sndbuf(tcp);
	if (min > sndbuf)
		min = sndbuf;
	assert(min > 0);

	assert(sndbuf > tcp->tcp_snd.ts_len);
	left = sndbuf - tcp->tcp_snd.ts_len;
	if (left > len)
		left = len;

	/*
	 * First see if we can fit any more data in the current tail buffer.
	 * If so, we set 'ptail' to point to it and 'tail_off' to the previous
	 * length of the tail buffer, while optimistically extending it to
	 * include the new data. If not, we set them to NULL/0.
	 */
	if ((ptail = tcp->tcp_snd.ts_tail) != NULL &&
	    ptail->len < ptail->tot_len) {
		assert(ptail->len > 0);
		tail_off = (size_t)ptail->len;

		/*
		 * Optimistically extend the tail buffer to include whatever
		 * fits in it. This is needed for util_copy_data().
		 */
		assert(ptail->tot_len > ptail->len);
		off = (size_t)ptail->tot_len - (size_t)ptail->len;
		if (off > left)
			off = left;
		ptail->len += off;
	} else {
		ptail = NULL;
		tail_off = 0;
		off = 0;
	}

	/*
	 * Then, if there is more to send, allocate new buffers as needed. If
	 * we run out of memory, work with whatever we did manage to grab.
	 */
	pfirst = NULL;
	plast = NULL;
	while (off < left) {
		if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS ||
		    (pnext = tcpsock_alloc_buf()) == NULL) {
			/*
			 * Chances are that we will end up suspending this send
			 * request because of being out of buffers. We try to
			 * resume such requests from the polling function.
			 */
			tcpsock_set_flag(tcp, TCPF_OOM);

			break;
		}

		tcpsock_sendbufs++;

		if (pfirst == NULL)
			pfirst = pnext;
		else
			plast->next = pnext;
		plast = pnext;

		chunk = (size_t)pnext->tot_len;
		if (chunk > left - off)
			chunk = left - off;
		pnext->len = chunk;
		off += chunk;
	}

	/*
	 * Copy in the data and continue, unless we did not manage to find
	 * enough space to even meet the low send watermark, in which case we
	 * undo any allocation and suspend the call until later.
	 */
	if (off >= min) {
		/*
		 * Optimistically attach the new buffers to the tail, also for
		 * util_copy_data(). We undo all this if the copy fails.
		 */
		if (ptail != NULL) {
			ptail->next = pfirst;

			pnext = ptail;
		} else
			pnext = pfirst;

		assert(pnext != NULL);

		r = util_copy_data(data, off, *offp, pnext, tail_off,
		    TRUE /*copy_in*/);
	} else
		r = SUSPEND;

	if (r != OK) {
		/* Undo the modifications made so far. */
		while (pfirst != NULL) {
			pnext = pfirst->next;

			assert(tcpsock_sendbufs > 0);
			tcpsock_sendbufs--;

			tcpsock_free_buf(pfirst);

			pfirst = pnext;
		}

		if (ptail != NULL) {
			ptail->next = NULL;

			ptail->len = tail_off;
		}

		return r;
	}

	/* Attach the new buffers, if any, to the buffer tail. */
	if (pfirst != NULL) {
		if ((ptail = tcp->tcp_snd.ts_tail) != NULL) {
			assert(ptail->len == ptail->tot_len);

			/*
			 * Due to our earlier optimistic modifications, this
			 * may or may not be redundant.
			 */
			ptail->next = pfirst;
		}

		assert(plast != NULL);
		tcp->tcp_snd.ts_tail = plast;

		if (tcp->tcp_snd.ts_head == NULL) {
			tcp->tcp_snd.ts_head = pfirst;
			assert(tcp->tcp_snd.ts_head_off == 0);
		}
		if (tcp->tcp_snd.ts_unsent == NULL) {
			tcp->tcp_snd.ts_unsent = pfirst;
			assert(tcp->tcp_snd.ts_unsent_off == 0);
		}
	}

	tcp->tcp_snd.ts_len += off;

	/*
	 * See if we can send any of the data we just enqueued. The socket is
	 * still open as we are still processing a call from userland on it;
	 * this saves us from having to deal with the cases that the following
	 * calls end up freeing the socket object.
	 */
	if (tcpsock_pcb_enqueue(tcp) &&
	    (r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) {
		/*
		 * That did not go well. Return the error immediately if we
		 * had not made any progress earlier. Otherwise, return our
		 * partial progress and leave the error to be picked up later.
		 */
		if (*offp > 0) {
			sockevent_set_error(tcpsock_get_sock(tcp), r);

			return OK;
		} else
			return r;
	}

	*offp += off;
	return (off < len) ? SUSPEND : OK;
}
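
/*
 * For reference, a sketch of the send queue maintained above, as suggested
 * by the field names and the assertions in this function (the authoritative
 * definitions live with the tcp_snd structure declaration):
 *
 *	ts_head --> [pbuf] --> [pbuf] --> ... --> [pbuf] <-- ts_tail
 *	            ^ts_head_off bytes into the head buffer
 *	                       ^ts_unsent/ts_unsent_off: first data not
 *	                        yet handed to the lwIP PCB
 *
 * ts_len counts the total number of bytes currently enqueued.
 */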

/*
 * Perform preliminary checks on a receive request.
 */
static int
tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
    int flags)
{

	/*
	 * Reject calls with unknown flags. Since libsockevent strips out the
	 * flags it handles itself here, we only have to test for ones we can
	 * not handle.
	 */
	if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0)
		return EOPNOTSUPP;

	return OK;
}

/*
 * Return TRUE if receive calls may wait for more data to come in on the
 * connection, or FALSE if we already know that that is not going to happen.
 */
static int
tcpsock_may_wait(struct tcpsock * tcp)
{

	return (tcp->tcp_pcb != NULL &&
	    !(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN));
}

/*
 * Test whether data can be received on a TCP socket, and if so, how many bytes
 * of data.
 */
static int
tcpsock_test_recv(struct sock * sock, size_t min, size_t * size)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	int may_wait;

	/* If the socket is not connected and never was, refuse the call. */
	if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED ||
	    tcp->tcp_pcb->state == LISTEN))
		return ENOTCONN;

	/*
	 * If we are certain that no more data will come in later, ignore the
	 * low receive watermark. Otherwise, bound it to the size of the
	 * receive buffer, or receive calls may block forever.
	 */
	if (!(may_wait = tcpsock_may_wait(tcp)))
		min = 1;
	else if (min > tcpsock_get_rcvbuf(tcp))
		min = tcpsock_get_rcvbuf(tcp);

	if (tcp->tcp_rcv.tr_len >= min) {
		if (size != NULL)
			*size = tcp->tcp_rcv.tr_len;

		return OK;
	}

	return (may_wait) ? SUSPEND : SOCKEVENT_EOF;
}
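
/*
 * Worked example for the watermark logic above (hypothetical numbers): if
 * the low receive watermark 'min' (typically derived from SO_RCVLOWAT) is
 * 65536 while the receive buffer holds at most 32768 bytes, 'min' is clamped
 * to 32768 so that a receive call cannot wait for more data than the buffer
 * can ever contain; once a FIN has been received, 'min' drops to 1 so that
 * any remaining buffered data is handed out before EOF is reported.
 */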

/*
 * Receive data on a TCP socket.
 */
static int
tcpsock_recv(struct sock * sock, const struct sockdriver_data * data,
    size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
    socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
    struct sockaddr * addr __unused, socklen_t * addr_len __unused,
    endpoint_t user_endpt __unused, int flags, size_t min,
    int * rflags __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct pbuf *ptail;
	size_t off, left;
	int r;

	/* See if we can receive at all, and if so, how much at most. */
	if ((r = tcpsock_test_recv(sock, min, NULL)) != OK)
		return r;

	if (len == 0)
		return OK; /* nothing to do */

	off = tcp->tcp_rcv.tr_len;
	if (off > len)
		off = len;

	assert(tcp->tcp_rcv.tr_head != NULL);
	assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len);

	/* Copy out the data to the caller. */
	if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head,
	    tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK)
		return r;

	/* Unless peeking, remove the data from the receive queue. */
	if (!(flags & MSG_PEEK)) {
		left = off;

		/* Dequeue and free as many entire buffers as possible. */
		while ((ptail = tcp->tcp_rcv.tr_head) != NULL &&
		    left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) {
			left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off;

			tcp->tcp_rcv.tr_head = ptail->next;
			tcp->tcp_rcv.tr_head_off = 0;

			if (tcp->tcp_rcv.tr_head == NULL)
				tcp->tcp_rcv.tr_pre_tailp = NULL;
			else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next)
				tcp->tcp_rcv.tr_pre_tailp =
				    &tcp->tcp_rcv.tr_head;

			assert(tcpsock_recvbufs > 0);
			tcpsock_recvbufs--;

			tcpsock_free_buf(ptail);
		}

		/*
		 * If only part of the (new) head buffer is consumed, adjust
		 * the saved offset into that buffer.
		 */
		if (left > 0) {
			assert(tcp->tcp_rcv.tr_head != NULL);
			assert((size_t)tcp->tcp_rcv.tr_head->len -
			    tcp->tcp_rcv.tr_head_off > left);

			tcp->tcp_rcv.tr_head_off += left;
		}

		tcp->tcp_rcv.tr_len -= off;

		if (tcp->tcp_rcv.tr_head != NULL) {
			assert(tcp->tcp_rcv.tr_pre_tailp != NULL);
			assert(tcp->tcp_rcv.tr_len > 0);
		} else {
			assert(tcp->tcp_rcv.tr_pre_tailp == NULL);
			assert(tcp->tcp_rcv.tr_len == 0);
		}

		/*
		 * The receive buffer has shrunk, so there may now be space to
		 * receive more data.
		 */
		if (tcp->tcp_pcb != NULL)
			tcpsock_ack_recv(tcp);
	} else
		flags &= ~MSG_WAITALL; /* for the check below */

	/* Advance the current copy position, and see if we are done. */
	*offp += off;
	if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp))
		return SUSPEND;
	else
		return OK;
}
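
/*
 * Illustrative userland view of the tail end of tcpsock_recv(): a
 * recv(fd, buf, len, MSG_WAITALL) call keeps suspending until 'len' bytes
 * have been gathered, unless the connection can no longer produce more
 * data. Combining MSG_PEEK with MSG_WAITALL does not wait (hence the
 * MSG_WAITALL clearing above): peeking never consumes data, so waiting for
 * the full amount could block indefinitely once the receive buffer limit is
 * reached.
 */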

/*
 * Update the set of flag-type socket options on a TCP socket.
 */
static void
tcpsock_setsockmask(struct sock * sock, unsigned int mask)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL)
		return;

	if (mask & SO_REUSEADDR)
		ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR);
	else
		ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR);

	if (mask & SO_KEEPALIVE)
		ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE);
	else
		ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE);
}

/*
 * Prepare a helper structure for IP-level option processing.
 */
static void
tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts)
{

	ipopts->local_ip = &tcp->tcp_pcb->local_ip;
	ipopts->remote_ip = &tcp->tcp_pcb->remote_ip;
	ipopts->tos = &tcp->tcp_pcb->tos;
	ipopts->ttl = &tcp->tcp_pcb->ttl;
	ipopts->sndmin = TCP_SNDBUF_MIN;
	ipopts->sndmax = TCP_SNDBUF_MAX;
	ipopts->rcvmin = TCP_RCVBUF_MIN;
	ipopts->rcvmax = TCP_RCVBUF_MAX;
}
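
/*
 * The structure filled above is handed to ipsock_setsockopt() and
 * ipsock_getsockopt() further below: the pointer fields give those functions
 * direct access to the PCB's address, TOS, and TTL state, and the
 * sndmin/sndmax/rcvmin/rcvmax fields (judging by their names) bound the
 * SO_SNDBUF/SO_RCVBUF value range accepted for TCP sockets.
 */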

/*
 * Set socket options on a TCP socket.
 */
static int
tcpsock_setsockopt(struct sock * sock, int level, int name,
    const struct sockdriver_data * data, socklen_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct ipopts ipopts;
	uint32_t uval;
	int r, val;

	if (tcp->tcp_pcb == NULL)
		return ECONNRESET;

	/* Handle TCP-level options. */
	switch (level) {
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_RECVTCLASS:
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * This option is not supported for TCP sockets; it
			 * would not even make sense. However, named(8)
			 * insists on trying to set it anyway. We accept the
			 * request but ignore the value, not even returning
			 * what was set through getsockopt(2).
			 */
			return OK;

		case IPV6_FAITH:
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * This option is not supported at all, but to save
			 * ourselves from having to remember the current state
			 * for getsockopt(2), we also refuse to enable it.
			 */
			if (val != 0)
				return EINVAL;

			return OK;
		}

		break;

	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			/*
			 * lwIP's listening TCP PCBs do not have this field.
			 * If this ever becomes an issue, we can create our own
			 * shadow flag and do the inheritance ourselves.
			 */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val)
				tcp_nagle_disable(tcp->tcp_pcb);
			else
				tcp_nagle_enable(tcp->tcp_pcb);

			return OK;

		case TCP_KEEPIDLE:
		case TCP_KEEPINTVL:
			/*
			 * lwIP's listening TCP PCBs do not have these fields.
			 */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val == 0)
				return EINVAL;

			/*
			 * The given value is unsigned, but lwIP stores the
			 * value in milliseconds in a uint32_t field, so we
			 * have to limit large values to whatever fits in the
			 * field anyway.
			 */
			if (val < 0 || (uint32_t)val > UINT32_MAX / 1000)
				uval = UINT32_MAX;
			else
				uval = (uint32_t)val * 1000;

			if (name == TCP_KEEPIDLE)
				tcp->tcp_pcb->keep_idle = uval;
			else
				tcp->tcp_pcb->keep_intvl = uval;

			return OK;

		case TCP_KEEPCNT:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val == 0)
				return EINVAL;

			tcp->tcp_pcb->keep_cnt = (uint32_t)val;

			return OK;
		}

		return EOPNOTSUPP;
	}

	/* Handle all other options at the IP level. */
	tcpsock_get_ipopts(tcp, &ipopts);

	return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data,
	    len, &ipopts);
}
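
/*
 * Worked example for the keepalive conversion above (hypothetical values):
 * setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &(int){ 75 }, sizeof(int))
 * stores 75 * 1000 = 75000 in the PCB's keep_idle field, since lwIP keeps
 * these values in milliseconds; any value above UINT32_MAX / 1000 seconds,
 * or below zero, is clamped to UINT32_MAX milliseconds.
 */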

/*
 * Retrieve socket options on a TCP socket.
 */
static int
tcpsock_getsockopt(struct sock * sock, int level, int name,
    const struct sockdriver_data * data, socklen_t * len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct ipopts ipopts;
	int val;

	if (tcp->tcp_pcb == NULL)
		return ECONNRESET;

	/* Handle TCP-level options. */
	switch (level) {
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_RECVTCLASS:
		case IPV6_FAITH:
			val = 0;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);
		}

		break;

	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = tcp_nagle_disabled(tcp->tcp_pcb);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_MAXSEG:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			/* This option is read-only at this time. */
			val = tcp->tcp_pcb->mss;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPIDLE:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)(tcp->tcp_pcb->keep_idle / 1000);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPINTVL:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)(tcp->tcp_pcb->keep_intvl / 1000);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPCNT:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)tcp->tcp_pcb->keep_cnt;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);
		}

		return EOPNOTSUPP;
	}

	/* Handle all other options at the IP level. */
	tcpsock_get_ipopts(tcp, &ipopts);

	return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data,
	    len, &ipopts);
}

/*
 * Retrieve the local socket address of a TCP socket.
 */
static int
tcpsock_getsockname(struct sock * sock, struct sockaddr * addr,
    socklen_t * addr_len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL)
		return EINVAL;

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port);

	return OK;
}

/*
 * Retrieve the remote socket address of a TCP socket.
 */
static int
tcpsock_getpeername(struct sock * sock, struct sockaddr * addr,
    socklen_t * addr_len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED ||
	    tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT)
		return ENOTCONN;

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);

	return OK;
}

/*
 * Perform a TCP half-close on a TCP socket. This operation may not complete
 * immediately due to memory conditions, in which case it will be completed at
 * a later time.
 */
static void
tcpsock_send_fin(struct tcpsock * tcp)
{

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR);

	/*
	 * Attempt to send the FIN. If a fatal error occurs as a result, raise
	 * it as an asynchronous error, because this function's callers cannot
	 * do much with it. That happens to match the way these functions are
	 * used elsewhere. In any case, as a result, the PCB may be closed.
	 * However, we are never called from a situation where the socket is
	 * being closed here, so the socket object will not be freed either.
	 */
	if (tcpsock_pcb_enqueue(tcp)) {
		assert(!sockevent_is_closing(tcpsock_get_sock(tcp)));

		if (tcpsock_may_close(tcp))
			tcpsock_finish_close(tcp);
		else
			(void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/);
	}
}

/*
 * Shut down a TCP socket for reading and/or writing.
 */
static int
tcpsock_shutdown(struct sock * sock, unsigned int mask)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	/*
	 * If the PCB is gone, we want to allow shutdowns for reading but not
	 * writing: shutting down for writing affects the PCB, shutting down
	 * for reading does not. Also, if the PCB is in CLOSED state, we would
	 * not know how to deal with subsequent operations after a shutdown for
	 * writing, so forbid such calls altogether.
	 */
	if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) &&
	    (mask & SFL_SHUT_WR))
		return ENOTCONN;

	/*
	 * Handle listening sockets as a special case. Shutting down a
	 * listening socket frees its PCB. Sockets pending on the accept queue
	 * may still be accepted, but after that, accept(2) will start
	 * returning ECONNABORTED. This feature allows multi-process server
	 * applications to shut down gracefully, supposedly..
	 */
	if (tcpsock_is_listening(tcp)) {
		if (tcp->tcp_pcb != NULL)
			tcpsock_pcb_close(tcp);

		return OK;
	}

	/*
	 * We control shutdown-for-reading locally, and intentionally do not
	 * tell lwIP about it: if we do that and also shut down for writing,
	 * the PCB may disappear (now or eventually), which is not what we
	 * want. Instead, we only tell lwIP to shut down for reading once we
	 * actually want to get rid of the PCB, using tcp_close(). In the
	 * meantime, if the socket is shut down for reading by the user, we
	 * simply discard received data as fast as we can--one out of a number
	 * of possible design choices there, and (reportedly) the one used by
	 * the BSDs.
	 */
	if (mask & SFL_SHUT_RD)
		(void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/);

	/*
	 * Shutting down for writing a connecting socket simply closes its PCB.
	 * Closing a PCB in SYN_SENT state simply deallocates it, so this can
	 * not fail. On the other hand, for connected sockets we want to send
	 * a FIN, which may fail due to memory shortage, in which case we have
	 * to try again later..
	 */
	if (mask & SFL_SHUT_WR) {
		if (tcp->tcp_pcb->state == SYN_SENT)
			tcpsock_pcb_close(tcp);
		else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
			tcpsock_send_fin(tcp);
	}

	return OK;
}
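
/*
 * Illustrative userland view: shutdown(fd, SHUT_WR) arrives here with
 * SFL_SHUT_WR set, queueing a FIN while leaving the receive side intact,
 * which is the usual way for a client to signal end-of-request while still
 * reading the server's response; shutdown(fd, SHUT_RD) merely makes us
 * discard incoming data locally, as described above.
 */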

/*
 * Close a TCP socket. Complete the operation immediately if possible, or
 * otherwise initiate the closing process and complete it later, notifying
 * libsockevent about that as well. Depending on linger settings, this
 * function may be called twice on the same socket: the first time with the
 * 'force' flag cleared, and the second time with the 'force' flag set.
 */
static int
tcpsock_close(struct sock * sock, int force)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcpsock *queued;
	size_t rlen;

	assert(tcp->tcp_listener == NULL);

	/*
	 * If this was a listening socket, abort and clean up any and all
	 * connections on its listener queue. Note that the listening socket
	 * may or may not have a PCB at this point.
	 */
	if (tcpsock_is_listening(tcp)) {
		while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) {
			queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head);

			tcpsock_pcb_abort(queued);

			(void)tcpsock_cleanup(queued, TRUE /*may_free*/);
		}
	}

	/*
	 * Clear the receive queue, and make sure that we no longer add new
	 * data to it. The latter is relevant only for the case that we end up
	 * returning SUSPEND below. Remember whether there were bytes left,
	 * because we should reset the connection if there were.
	 */
	rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD);

	/*
	 * If the socket is connected, perform a graceful shutdown, unless 1)
	 * we are asked to force-close the socket, or 2) the local side has
	 * not consumed all data, as per RFC 1122 Sec.4.2.2.13. Normally lwIP
	 * would take care of the second point, but we may have data in our
	 * receive buffer of which lwIP is not aware.
	 *
	 * Implementing proper linger support is somewhat difficult with lwIP.
	 * In particular, we cannot reliably wait for our FIN to be ACK'ed by
	 * the other side in all cases:
	 *
	 * - the lwIP TCP transition from states CLOSING to TIME_WAIT does not
	 *   trigger any event and once in the TIME_WAIT state, the poll event
	 *   no longer triggers either;
	 * - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to
	 *   TIME_WAIT will trigger a receive event, but it is not clear
	 *   whether we can reliably check that our FIN was ACK'ed from there.
	 *
	 * That means we have to compromise. Instead of the proper approach,
	 * we complete our side of the close operation whenever:
	 *
	 * 1. all of our data was acknowledged, AND,
	 * 2. our FIN was sent, AND,
	 * 3a. our FIN was acknowledged, OR,
	 * 3b. we received a FIN from the other side.
	 *
	 * With the addition of the rule 3b, we do not run into the above
	 * reliability problems, but we may return from SO_LINGER-blocked close
	 * calls too early and thus give callers a false impression of success.
	 * TODO: if lwIP ever gets improved on this point, the code in this
	 * module should be rewritten to make use of the improvements.
	 *
	 * The set of rules is basically the same as for closing the PCB early
	 * as per tcpsock_may_close(), except with the check for our FIN being
	 * acknowledged. Unfortunately only the FIN_WAIT_2, TIME_WAIT, and
	 * (reentered) CLOSED TCP states guarantee that there are no
	 * unacknowledged data segments anymore, so we may have to wait for
	 * reaching any one of these before we can actually finish closing the
	 * socket with tcp_close().
	 *
	 * In addition, lwIP does not tell us when our FIN gets acknowledged,
	 * so we have to use polling and direct access to lwIP's PCB fields
	 * instead, just like lwIP's BSD API does. There is no other way.
	 * Also, we may not even be able to send the FIN right away, in which
	 * case we must defer that until later.
	 */
	if (tcp->tcp_pcb != NULL) {
		switch (tcp->tcp_pcb->state) {
		case CLOSE_WAIT:
		case CLOSING:
		case LAST_ACK:
			assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);

			/* FALLTHROUGH */
		case SYN_RCVD:
		case ESTABLISHED:
		case FIN_WAIT_1:
			/* First check if we should abort the connection. */
			if (force || rlen > 0)
				break;

			/*
			 * If we have not sent a FIN yet, try sending it now;
			 * if all other conditions are met for closing the
			 * socket, successful FIN transmission will complete
			 * the close. Otherwise, perform the close check
			 * explicitly.
			 */
			if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
				tcpsock_send_fin(tcp);
			else if (tcpsock_may_close(tcp))
				tcpsock_pcb_close(tcp);

			/*
			 * If at this point the PCB is gone, we managed to
			 * close the connection immediately, and the socket has
			 * already been cleaned up by now. This may occur if
			 * there is no unacknowledged data and we already
			 * received a FIN earlier on.
			 */
			if (tcp->tcp_pcb == NULL)
				return OK;

			/*
			 * Complete the close operation at a later time.
			 * Adjust the polling interval, so that we can detect
			 * completion of the close as quickly as possible.
			 */
			tcp_poll(tcp->tcp_pcb, tcpsock_event_poll,
			    TCP_POLL_CLOSE_INTERVAL);

			return SUSPEND;

		default:
			/*
			 * The connection is either not yet established, or
			 * already in a state where we can close it right now.
			 */
			tcpsock_pcb_close(tcp);
		}
	}

	/*
	 * Abort the connection if the PCB is still around, and clean up the
	 * socket. We cannot let tcpsock_cleanup() free the socket object yet,
	 * because we are still in the callback from libsockevent, and the
	 * latter cannot handle the socket object being freed from here.
	 */
	if (tcp->tcp_pcb != NULL)
		tcpsock_pcb_abort(tcp);

	(void)tcpsock_cleanup(tcp, FALSE /*may_free*/);

	return OK;
}
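
/*
 * Illustrative note on the two-call close protocol described above: with
 * SO_LINGER enabled and a nonzero timeout, the SUSPEND result makes close(2)
 * wait for the graceful close to complete; if the linger timer expires
 * first, this function is invoked once more with 'force' set, which takes
 * the "break" path above and thus aborts (resets) the connection.
 */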

/*
 * Free up a closed TCP socket.
 */
static void
tcpsock_free(struct sock * sock)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	assert(tcp->tcp_pcb == NULL);
	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_snd.ts_head == NULL);
	assert(tcp->tcp_rcv.tr_len == 0);
	assert(tcp->tcp_rcv.tr_head == NULL);

	TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next);
}

/* This table maps TCP states from lwIP numbers to NetBSD numbers. */
static const struct {
	int tsm_tstate;
	int tsm_sostate;
} tcpsock_statemap[] = {
	[CLOSED]	= { TCPS_CLOSED,	SS_ISDISCONNECTED },
	[LISTEN]	= { TCPS_LISTEN,	0 },
	[SYN_SENT]	= { TCPS_SYN_SENT,	SS_ISCONNECTING },
	[SYN_RCVD]	= { TCPS_SYN_RECEIVED,	SS_ISCONNECTING },
	[ESTABLISHED]	= { TCPS_ESTABLISHED,	SS_ISCONNECTED },
	[FIN_WAIT_1]	= { TCPS_FIN_WAIT_1,	SS_ISDISCONNECTING },
	[FIN_WAIT_2]	= { TCPS_FIN_WAIT_2,	SS_ISDISCONNECTING },
	[CLOSE_WAIT]	= { TCPS_CLOSE_WAIT,	SS_ISCONNECTED },
	[CLOSING]	= { TCPS_CLOSING,	SS_ISDISCONNECTING },
	[LAST_ACK]	= { TCPS_LAST_ACK,	SS_ISDISCONNECTING },
	[TIME_WAIT]	= { TCPS_TIME_WAIT,	SS_ISDISCONNECTED },
};
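
/*
 * For example, a PCB that lwIP holds in its ESTABLISHED state is reported
 * through sysctl(7) as TCPS_ESTABLISHED with SS_ISCONNECTED set, which is
 * the representation that NetBSD userland tools such as netstat(1) expect.
 */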

/*
 * Fill the given kinfo_pcb sysctl(7) structure with information about the TCP
 * PCB identified by the given pointer.
 */
static void
tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr)
{
	const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr;
	struct tcpsock *tcp;

	/*
	 * Not all TCP PCBs have an associated tcpsock structure. We take
	 * care to clear the callback argument for PCBs on any of the TCP
	 * lists, so we can use that callback argument to determine whether
	 * there is an associated tcpsock structure, although with one
	 * exception: PCBs for incoming connections that have not yet been
	 * fully established (i.e., in SYN_RCVD state). These will have the
	 * callback argument of the listening socket (which itself may already
	 * have been deallocated at this point) but should not be considered as
	 * associated with the listening socket's tcpsock structure.
	 */
	if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) {
		tcp = (struct tcpsock *)pcb->callback_arg;
		assert(tcp >= tcp_array &&
		    tcp < &tcp_array[__arraycount(tcp_array)]);

		/* TODO: change this so that sockstat(1) may work one day. */
		ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp);
	} else {
		/* No tcpsock. Could also be in TIME_WAIT state etc. */
		tcp = NULL;

		ki->ki_sostate = SS_NOFDREF;
	}

	ki->ki_type = SOCK_STREAM;

	if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) {
		ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate;
		/* TODO: this needs work, but does anything rely on it? */
		ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate;
	}

	/* Careful with the LISTEN state here (see below). */
	ipsock_get_info(ki, &pcb->local_ip, pcb->local_port,
	    &pcb->remote_ip, (pcb->state != LISTEN) ? pcb->remote_port : 0);

	/*
	 * The PCBs for listening sockets are actually smaller. Thus, for
	 * listening sockets, do not attempt to access any of the fields beyond
	 * those provided in the smaller structure.
	 */
	if (pcb->state == LISTEN) {
		assert(tcp != NULL);
		ki->ki_refs =
		    (uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head);
	} else {
		if (tcp_nagle_disabled(pcb))
			ki->ki_tflags |= NETBSD_TF_NODELAY;

		if (tcp != NULL) {
			ki->ki_rcvq = tcp->tcp_rcv.tr_len;
			ki->ki_sndq = tcp->tcp_snd.ts_len;

			if (tcp->tcp_listener != NULL)
				ki->ki_nextref = (uint64_t)(uintptr_t)
				    TAILQ_NEXT(tcp, tcp_queue.tq_next);
		}
	}
}

/*
 * Given either NULL or a previously returned TCP PCB pointer, return the first
 * or next TCP PCB pointer, or NULL if there are no more. The current
 * implementation supports only one iteration at a time.
 */
static const void *
tcpsock_enum(const void * last)
{
	static struct {
		unsigned int i;
		const struct tcp_pcb *pcb;
	} iter;

	if (last != NULL && (iter.pcb = iter.pcb->next) != NULL)
		return (const void *)iter.pcb;

	for (iter.i = (last != NULL) ? iter.i + 1 : 0;
	    iter.i < __arraycount(tcp_pcb_lists); iter.i++) {
		if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL)
			return (const void *)iter.pcb;
	}

	return NULL;
}
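
/*
 * Sketch of how a caller, such as util_pcblist() below, can drive the
 * iterator (the loop body is illustrative only):
 *
 *	for (ptr = tcpsock_enum(NULL); ptr != NULL; ptr = tcpsock_enum(ptr))
 *		tcpsock_get_info(&ki, ptr);
 *
 * Since the iterator state is static, two such loops must never be in
 * progress at the same time.
 */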

/*
 * Obtain the list of TCP protocol control blocks, for sysctl(7).
 */
static ssize_t
tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
    struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{

	return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info);
}

static const struct sockevent_ops tcpsock_ops = {
	.sop_bind = tcpsock_bind,
	.sop_listen = tcpsock_listen,
	.sop_connect = tcpsock_connect,
	.sop_accept = tcpsock_accept,
	.sop_test_accept = tcpsock_test_accept,
	.sop_pre_send = tcpsock_pre_send,
	.sop_send = tcpsock_send,
	.sop_test_send = tcpsock_test_send,
	.sop_pre_recv = tcpsock_pre_recv,
	.sop_recv = tcpsock_recv,
	.sop_test_recv = tcpsock_test_recv,
	.sop_ioctl = ifconf_ioctl,
	.sop_setsockmask = tcpsock_setsockmask,
	.sop_setsockopt = tcpsock_setsockopt,
	.sop_getsockopt = tcpsock_getsockopt,
	.sop_getsockname = tcpsock_getsockname,
	.sop_getpeername = tcpsock_getpeername,
	.sop_shutdown = tcpsock_shutdown,
	.sop_close = tcpsock_close,
	.sop_free = tcpsock_free
};