/* LWIP service - tcpsock.c - TCP sockets */
/*
 * This module implements support for TCP sockets based on lwIP's core TCP
 * PCB module, which largely but not fully cooperates with what we want to
 * achieve; as a result, this module is rather complicated.
 *
 * Each socket has a send queue and a receive queue.  Both use lwIP's own
 * (pbuf) buffers, which largely come out of the main 512-byte buffer pool.
 * The buffers on the send queue are allocated and freed by us--the latter
 * only once they are no longer in use by lwIP as well.  A bit
 * counterintuitively, we deliberately use a smaller lwIP per-PCB TCP send
 * buffer limit (TCP_SND_BUF) in the lwIP configuration (lwipopts.h) in
 * order to more easily trigger conditions where we cannot enqueue data (or
 * the final FIN) right away.  This way, we get to test the internal logic
 * of this module a lot more easily.  The small lwIP send queue size should
 * not have any impact on performance, as our own per-socket send queues can
 * be much larger, and we enqueue more of that data on the lwIP PCB as soon
 * as we can in all cases.
 *
 * The receive queue consists of whatever buffers were given to us by lwIP,
 * but since those may be many buffers with small amounts of data each, we
 * perform fairly aggressive merging of consecutive buffers.  The intended
 * result is that we waste no more than 50% of memory within the receive
 * queue.  Merging requires memory copies, which makes it expensive, but we
 * do not configure lwIP with enough buffers to make running out of buffers
 * a non-issue, so this trade-off is necessary.  Practical experience and
 * measurements of the merge policy will have to show whether and how the
 * current policy may be improved.
 *
 * As can be expected, the connection close semantics are by far the most
 * complicated part of this module.  We attempt to get rid of the lwIP PCB
 * as soon as we can, letting lwIP take care of the TIME_WAIT state for
 * example.  However, various conditions have to be met before we can forget
 * about the PCB here--most importantly, that none of our sent data blocks
 * are still referenced by lwIP because they have not yet been sent or
 * acknowledged.  We can only free the data blocks once lwIP is done with
 * them.
 *
 * We do consider the TCP state of lwIP's PCB, in order to avoid duplicating
 * full state tracking here.  However, we do not look at a socket's TCP
 * state while in a lwIP-generated event for that socket, because the state
 * may not necessarily reflect the (correct or new) TCP state of the
 * connection, nor may the PCB be available--this is the case for error
 * events.  For these reasons we use a few internal TCPF_ flags to perform
 * partial state tracking.
 *
 * More generally, we tend to access lwIP PCB fields directly only when
 * lwIP's own BSD API implementation does that too and there is no better
 * alternative.  One example of this is the check to see if our FIN was
 * acknowledged, for SO_LINGER support.  In terms of maintenance, our hope
 * is that if lwIP's API changes later, we can change our code to imitate
 * whatever lwIP's BSD API implementation does at that point.
 */

#include <string.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>

/*
 * Unfortunately, NetBSD and lwIP have different definitions of a few
 * relevant preprocessor variables.  Make sure we do not attempt to use the
 * NetBSD one where it matters.  We do need one of the NetBSD definitions
 * though.
 */
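/*
 * As an aside, the save-and-undef pattern used directly below, shown in
 * isolation (an illustrative sketch, not part of the build):
 */
#if 0
static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY; /* NetBSD value */
#undef TF_NODELAY	/* lwIP's headers define their own TF_NODELAY */
#undef TCP_MSS		/* likewise for the default maximum segment size */
#include "lwip/tcp.h"	/* from here on, the lwIP definitions apply */
#endif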
static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY;
#undef TF_NODELAY
#undef TCP_MSS

#include "lwip.h"
#include "tcpisn.h"

#include "lwip/tcp.h"
#include "lwip/priv/tcp_priv.h"	/* for tcp_pcb_lists */

/*
 * The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP
 * configuration.
 */

/*
 * We fully control the send buffer, so we can let its size be set to
 * whatever we want.  The receive buffer is different: if it is smaller than
 * the window size, we may have to refuse data that lwIP hands us, at which
 * point more incoming data will cause lwIP to abort the TCP connection--
 * even aside from performance issues.  Therefore, we must make sure that
 * the receive buffer is never smaller than the TCP window.
 */
#define TCP_SNDBUF_MIN	1		/* minimum TCP send buffer size */
#define TCP_SNDBUF_DEF	32768		/* default TCP send buffer size */
#define TCP_SNDBUF_MAX	131072		/* maximum TCP send buffer size */

#define TCP_RCVBUF_MIN	TCP_WND		/* minimum TCP receive buffer size */
#define TCP_RCVBUF_DEF	MAX(TCP_WND, 32768)  /* default TCP recv buffer size */
#define TCP_RCVBUF_MAX	MAX(TCP_WND, 131072) /* maximum TCP recv buffer size */

/*
 * The total number of buffers that may be in use for TCP socket send
 * queues.  The goal is to allow at least some progress to be made on
 * receiving from TCP sockets and on differently-typed sockets, at least as
 * long as the LWIP service can manage to allocate the memory it wants.  If
 * it cannot, we can only reactively kill off TCP sockets and/or free
 * enqueued Ethernet packets, neither of which is currently implemented
 * (TODO).
 */
#define TCP_MAX_SENDBUFS	(mempool_max_buffers() * 3 / 4)

/* Polling intervals, in 500-millisecond units. */
#define TCP_POLL_REG_INTERVAL	10	/* interval for reattempting sends */
#define TCP_POLL_CLOSE_INTERVAL	1	/* interval while closing connection */

static struct tcpsock {
	struct ipsock tcp_ipsock;		/* IP socket, MUST be first */
	struct tcp_pcb *tcp_pcb;		/* lwIP TCP control block */
	union pxfer_tcp_queue {			/* free/accept queue */
		TAILQ_ENTRY(tcpsock) tq_next;	/* next in queue */
		TAILQ_HEAD(, tcpsock) tq_head;	/* head of queue */
	} tcp_queue;
	struct tcpsock *tcp_listener;		/* listener if on accept q. */
	struct {				/* send queue */
		struct pbuf *ts_head;		/* first pbuf w/unacked data */
		struct pbuf *ts_unsent;		/* first pbuf w/unsent data */
		struct pbuf *ts_tail;		/* most recently added data */
		size_t ts_len;			/* total sent + unsent */
		unsigned short ts_head_off;	/* offset into head pbuf */
		unsigned short ts_unsent_off;	/* offset into unsent pbuf */
	} tcp_snd;
	struct {				/* receive queue */
		struct pbuf *tr_head;		/* first pbuf w/unrecvd data */
		struct pbuf **tr_pre_tailp;	/* ptr-ptr to newest pbuf */
		size_t tr_len;			/* bytes on receive queue */
		unsigned short tr_head_off;	/* offset into head pbuf */
		unsigned short tr_unacked;	/* current window reduction */
	} tcp_rcv;
} tcp_array[NR_TCPSOCK];

static TAILQ_HEAD(, tcpsock) tcp_freelist;	/* list of free TCP sockets */

static const struct sockevent_ops tcpsock_ops;

static unsigned int tcpsock_sendbufs;		/* # send buffers in use */
static unsigned int tcpsock_recvbufs;		/* # receive buffers in use */
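/*
 * To illustrate the send queue fields above: a hypothetical helper (a
 * sketch, not used by this module) that computes how many bytes on a
 * socket's send queue have not yet been handed to lwIP, starting from
 * 'ts_unsent'.
 */
#if 0
static size_t
tcpsock_count_unsent(const struct tcpsock * tcp)
{
	const struct pbuf *pbuf;
	size_t len;

	if ((pbuf = tcp->tcp_snd.ts_unsent) == NULL)
		return 0;

	/* The first buffer may already be partially enqueued with lwIP. */
	len = (size_t)pbuf->len - tcp->tcp_snd.ts_unsent_off;

	for (pbuf = pbuf->next; pbuf != NULL; pbuf = pbuf->next)
		len += pbuf->len;

	return len;
}
#endif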
/* A bunch of macros that are just for convenience. */
#define tcpsock_get_id(tcp)	(SOCKID_TCP | (sockid_t)((tcp) - tcp_array))
#define tcpsock_get_ipsock(tcp)	(&(tcp)->tcp_ipsock)
#define tcpsock_get_sock(tcp)	(ipsock_get_sock(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_sndbuf(tcp)	(ipsock_get_sndbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_rcvbuf(tcp)	(ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_ipv6(tcp)	(ipsock_is_ipv6(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_shutdown(tcp,fl)	\
	(sockevent_is_shutdown(tcpsock_get_sock(tcp), fl))
#define tcpsock_is_listening(tcp)	\
	(sockevent_is_listening(tcpsock_get_sock(tcp)))
#define tcpsock_get_flags(tcp)	(ipsock_get_flags(tcpsock_get_ipsock(tcp)))
#define tcpsock_set_flag(tcp,fl)	\
	(ipsock_set_flag(tcpsock_get_ipsock(tcp), fl))
#define tcpsock_clear_flag(tcp,fl)	\
	(ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl))

static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *,
	struct rmib_oldp *, struct rmib_newp *);

/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */
/* TODO: add many more and make some of them writable.. */
static struct rmib_node net_inet_tcp_table[] = {
/* 2*/	[TCPCTL_SENDSPACE]	= RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF,
	    "sendspace", "Default TCP send buffer size"),
/* 3*/	[TCPCTL_RECVSPACE]	= RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF,
	    "recvspace", "Default TCP receive buffer size"),
/*29*/	[TCPCTL_LOOPBACKCKSUM]	= RMIB_FUNC(RMIB_RW | CTLTYPE_INT,
	    sizeof(int), loopif_cksum, "do_loopback_cksum",
	    "Perform TCP checksum on loopback"),
/*+0*/	[TCPCTL_MAXID]		= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0,
	    tcpsock_pcblist, "pcblist", "TCP protocol control block list"),
/*+1*/	[TCPCTL_MAXID + 1]	= RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE |
	    CTLFLAG_HIDDEN | CTLTYPE_STRING, TCPISN_SECRET_HEX_LENGTH,
	    tcpisn_secret, "isn_secret", "TCP ISN secret (MINIX 3 specific)")
};

static struct rmib_node net_inet_tcp_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings");
static struct rmib_node net_inet6_tcp6_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings");

/*
 * Initialize the TCP sockets module.
 */
void
tcpsock_init(void)
{
	unsigned int slot;

	/* Initialize the list of free TCP sockets. */
	TAILQ_INIT(&tcp_freelist);

	for (slot = 0; slot < __arraycount(tcp_array); slot++)
		TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot],
		    tcp_queue.tq_next);

	/* Initialize other variables. */
	tcpsock_sendbufs = 0;
	tcpsock_recvbufs = 0;

	/* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */
	mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node);
	mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node);
}

/*
 * Initialize the state of a TCP socket's send queue.
 */
static void
tcpsock_reset_send(struct tcpsock * tcp)
{

	tcp->tcp_snd.ts_tail = NULL;
	tcp->tcp_snd.ts_unsent = NULL;
	tcp->tcp_snd.ts_head = NULL;
	tcp->tcp_snd.ts_len = 0;
	tcp->tcp_snd.ts_unsent_off = 0;
	tcp->tcp_snd.ts_head_off = 0;
}

/*
 * Initialize the state of a TCP socket's receive queue.
 */
static void
tcpsock_reset_recv(struct tcpsock * tcp)
{

	tcp->tcp_rcv.tr_pre_tailp = NULL;
	tcp->tcp_rcv.tr_head = NULL;
	tcp->tcp_rcv.tr_len = 0;
	tcp->tcp_rcv.tr_head_off = 0;
	tcp->tcp_rcv.tr_unacked = 0;
}

/*
 * Create a TCP socket.
 */
sockid_t
tcpsock_socket(int domain, int protocol, struct sock ** sockp,
	const struct sockevent_ops ** ops)
{
	struct tcpsock *tcp;
	uint8_t ip_type;

	switch (protocol) {
	case 0:
	case IPPROTO_TCP:
		break;

	default:
		return EPROTONOSUPPORT;
	}

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);
	/*
	 * Initialize the structure.  Do not memset it to zero, as it is
	 * still part of the linked free list.  Initialization may still
	 * fail.  When adding new fields, make sure to change
	 * tcpsock_clone() accordingly.
	 */

	ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain,
	    TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp);

	if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL)
		return ENOBUFS;
	tcp_arg(tcp->tcp_pcb, tcp);

	tcp->tcp_listener = NULL;

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	*ops = &tcpsock_ops;
	return tcpsock_get_id(tcp);
}

/*
 * Create a TCP socket for the TCP PCB 'pcb' which identifies a new
 * connection incoming on listening socket 'listener'.  The new socket is
 * essentially a "clone" of the listening TCP socket, in that it should
 * inherit any settings from the listening socket.  The socket has not yet
 * been accepted by userland, so add it to the queue of connections pending
 * for the listening socket.  On success, return OK.  On failure, return a
 * negative error code.
 */
static int
tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb)
{
	struct tcpsock *tcp;

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure.  Do not memset it to zero, as it is
	 * still part of the linked free list.  Initialization may still
	 * fail.  Most settings should be inherited from the listening
	 * socket here, rather than being initialized to their default
	 * state.
	 */

	ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp),
	    tcpsock_get_id(tcp));

	tcp->tcp_pcb = pcb;
	tcp_arg(pcb, tcp);

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	/*
	 * Remove the new socket from the free list, and add it to the queue
	 * of the listening socket--in this order, because the same next
	 * pointer is used for both.
	 */
	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp,
	    tcp_queue.tq_next);
	tcp->tcp_listener = listener;

	return OK;
}

/*
 * Allocate a buffer from the pool, using the standard pool size.  The
 * returned buffer is a single element--never a chain.
 */
static struct pbuf *
tcpsock_alloc_buf(void)
{
	struct pbuf *pbuf;

	pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM);

	assert(pbuf == NULL || pbuf->len == pbuf->tot_len);

	return pbuf;
}

/*
 * Free the given buffer.  Ensure that pbuf_free() will not attempt to free
 * the next buffer(s) in the chain as well.  This may be called for pbufs
 * other than those allocated with tcpsock_alloc_buf().
 */
static void
tcpsock_free_buf(struct pbuf * pbuf)
{

	/*
	 * Resetting the length is currently not necessary, but better safe
	 * than sorry..
	 */
	pbuf->len = pbuf->tot_len;
	pbuf->next = NULL;

	pbuf_free(pbuf);
}

/*
 * Clear the send queue of a TCP socket.  The caller must ensure that lwIP
 * will no longer access any of the data on the send queue.
 */
static void
tcpsock_clear_send(struct tcpsock * tcp)
{
	struct pbuf *phead;

	assert(tcp->tcp_pcb == NULL);

	while ((phead = tcp->tcp_snd.ts_head) != NULL) {
		tcp->tcp_snd.ts_head = phead->next;

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	tcpsock_reset_send(tcp);
}

/*
 * Clear the receive queue of a TCP socket.  If 'ack_data' is set, also
 * acknowledge the previous contents of the receive queue to lwIP.
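 */

/*
 * A sketch of the dequeue-and-free pattern used by the queue-clearing code
 * in this module: because tcpsock_free_buf() clears 'next' before calling
 * pbuf_free(), each buffer is freed individually rather than as a chain
 * ('queue_head' is a hypothetical name here).
 */
#if 0
	while ((phead = queue_head) != NULL) {
		queue_head = phead->next;	/* unhook the buffer first.. */

		tcpsock_free_buf(phead);	/* ..then free only 'phead' */
	}
#endif

/*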
*/ static size_t tcpsock_clear_recv(struct tcpsock * tcp, int ack_data) { struct pbuf *phead; size_t rlen; rlen = tcp->tcp_rcv.tr_len; while ((phead = tcp->tcp_rcv.tr_head) != NULL) { tcp->tcp_rcv.tr_head = phead->next; assert(tcpsock_recvbufs > 0); tcpsock_recvbufs--; tcpsock_free_buf(phead); } /* * From now on, we will basically be discarding incoming data as fast * as possible, to keep the full window open at all times. */ if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0) tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked); tcpsock_reset_recv(tcp); return rlen; } /* * The TCP socket's PCB has been detached from the socket, typically because * the connection was aborted, either by us or by lwIP. Either way, any TCP * connection is gone. Clear the socket's send queue, remove the socket from * a listening socket's queue, and if the socket itself is ready and allowed to * be freed, free it now. The socket is ready to be freed if it was either on * a listening queue or being closed already. The socket is allowed to be * freed only if 'may_free' is TRUE. If the socket is not freed, its receive * queue is left as is, as it may still have data to be received by userland. */ static int tcpsock_cleanup(struct tcpsock * tcp, int may_free) { int destroy; assert(tcp->tcp_pcb == NULL); /* * Free any data on the send queue. This is safe to do right now, * because the PCB has been aborted (or was already gone). We must be * very careful about clearing the send queue in all other situations. */ tcpsock_clear_send(tcp); /* * If this was a socket pending acceptance, remove it from the * corresponding listener socket's queue, and free it. Otherwise, free * the socket only if it suspended a graceful close operation. */ if (tcp->tcp_listener != NULL) { TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next); tcp->tcp_listener = NULL; /* * The listener socket's backlog count should be adjusted by * lwIP whenever the PCB is freed up, so we need (and must) not * attempt to do that here. */ destroy = TRUE; } else destroy = sockevent_is_closing(tcpsock_get_sock(tcp)); /* * Do not free the socket if 'may_free' is FALSE. That flag may be set * if we are currently in the second tcpsock_close() call on the * socket, in which case sockevent_is_closing() is TRUE but we must * still not free the socket now: doing so would derail libsockevent. */ if (destroy && may_free) { (void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/); sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE); } return destroy; } /* * Abort the lwIP PCB for the given socket, using tcp_abort(). If the PCB is * connected, this will cause the connection to be reset. The PCB, which must * have still been present before the call, will be gone after the call. */ static void tcpsock_pcb_abort(struct tcpsock * tcp) { assert(tcp->tcp_pcb != NULL); assert(!tcpsock_is_listening(tcp)); tcp_recv(tcp->tcp_pcb, NULL); tcp_sent(tcp->tcp_pcb, NULL); tcp_err(tcp->tcp_pcb, NULL); tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL); tcp_arg(tcp->tcp_pcb, NULL); tcp_abort(tcp->tcp_pcb); tcp->tcp_pcb = NULL; } /* * Close the lwIP PCB for the given socket, using tcp_close(). If the PCB is * connected, its graceful close will be finished by lwIP in the background. * The PCB, which must have still been present before the call, will be gone * after the call. 
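 */

/*
 * For contrast with the graceful close below, a sketch of how the abortive
 * path is typically driven elsewhere in this module (compare the callback
 * code further down): after the abort, the cleanup call may free the
 * socket object altogether.
 */
#if 0
	tcpsock_pcb_abort(tcp);

	if (tcpsock_cleanup(tcp, TRUE /*may_free*/))
		return;	/* the socket object is gone; do not touch it */
#endif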
static void
tcpsock_pcb_close(struct tcpsock * tcp)
{
	err_t err;

	assert(tcp->tcp_pcb != NULL);
	assert(tcp->tcp_snd.ts_len == 0);

	if (!tcpsock_is_listening(tcp)) {
		tcp_recv(tcp->tcp_pcb, NULL);
		tcp_sent(tcp->tcp_pcb, NULL);
		tcp_err(tcp->tcp_pcb, NULL);
		tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);
	}

	tcp_arg(tcp->tcp_pcb, NULL);

	if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK)
		panic("unexpected TCP close failure: %d", err);

	tcp->tcp_pcb = NULL;
}

/*
 * Return TRUE if all conditions are met for closing the TCP socket's PCB,
 * or FALSE if they are not.  Upon calling this function, the socket's PCB
 * must still be around.
 */
static int
tcpsock_may_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);

	/*
	 * Regular closing of the PCB requires three conditions to be met:
	 *
	 * 1. all our data has been transmitted AND acknowledged, so that we
	 *    do not risk corruption in case there are still unsent or
	 *    unack'ed data buffers that may otherwise be recycled too soon;
	 * 2. we have sent our FIN to the peer; and,
	 * 3. we have received a FIN from the peer.
	 */
	return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) ==
	    (TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0);
}

/*
 * The given socket is ready to be closed as per the tcpsock_may_close()
 * rules.  This implies that its send queue is already empty.  Gracefully
 * close the PCB.  In addition, if the socket is being closed gracefully,
 * meaning we suspended an earlier tcpsock_close() call (and as such already
 * emptied the receive queue as well), then tell libsockevent that the close
 * is finished, freeing the socket.  Return TRUE if the socket has indeed
 * been freed this way, or FALSE if the socket is still around.
 */
static int
tcpsock_finish_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_listener == NULL);

	/*
	 * If we get here, we have already shut down the sending side of the
	 * PCB.  Technically, we are interested only in shutting down the
	 * receiving side of the PCB here, so that lwIP may decide to
	 * recycle the socket later etcetera.  We call tcp_close() because
	 * we do not want to rely on tcp_shutdown(RX) doing the exact same
	 * thing.  However, we do rely on the fact that the PCB is not
	 * immediately destroyed by the tcp_close() call: otherwise we may
	 * have to return ERR_ABRT if this function is called from a
	 * lwIP-generated event.
	 */
	tcpsock_pcb_close(tcp);

	/*
	 * If we suspended an earlier tcpsock_close() call, we have to tell
	 * libsockevent that the close operation is now complete.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		assert(tcp->tcp_rcv.tr_len == 0);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);

		return TRUE;
	} else
		return FALSE;
}

/*
 * Attempt to start or resume enqueuing data and/or a FIN to send on the
 * given TCP socket.  Return TRUE if anything at all could be newly enqueued
 * on the lwIP PCB, even if less than desired.  In that case, the caller
 * should try to send whatever was enqueued, and if applicable, check if the
 * socket may now be closed (due to the FIN being enqueued).  In particular,
 * in any situation where the socket may be in the process of being closed,
 * the caller must use tcpsock_may_close() if TRUE is returned.  Return
 * FALSE if nothing new could be enqueued, in which case no send attempt
 * needs to be made either.
*/ static int tcpsock_pcb_enqueue(struct tcpsock * tcp) { struct pbuf *punsent; size_t space, chunk; unsigned int flags; err_t err; int enqueued; assert(tcp->tcp_pcb != NULL); if (tcpsock_get_flags(tcp) & TCPF_FULL) return FALSE; /* * Attempt to enqueue more unsent data, if any, on the PCB's send * queue. */ enqueued = FALSE; while (tcp->tcp_snd.ts_unsent != NULL) { if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0) break; /* * We may maintain a non-NULL unsent pointer even when there is * nothing more to send right now, because the tail buffer may * be filled up further later on. */ punsent = tcp->tcp_snd.ts_unsent; assert(punsent->len >= tcp->tcp_snd.ts_unsent_off); chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off; if (chunk == 0) break; if (chunk > space) chunk = space; /* Try to enqueue more data for sending. */ if (chunk < punsent->len || punsent->next != NULL) flags = TCP_WRITE_FLAG_MORE; else flags = 0; err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload + tcp->tcp_snd.ts_unsent_off, chunk, flags); /* * Since tcp_write() enqueues data only, it should only return * out-of-memory errors; no fatal ones. In any case, stop. */ if (err != ERR_OK) { assert(err == ERR_MEM); break; } /* We have successfully enqueued data. */ enqueued = TRUE; tcp->tcp_snd.ts_unsent_off += chunk; if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) { assert(tcp->tcp_snd.ts_unsent_off < punsent->len || punsent->next == NULL); break; } tcp->tcp_snd.ts_unsent = punsent->next; tcp->tcp_snd.ts_unsent_off = 0; } /* * If all pending data has been enqueued for sending, and we should * shut down the sending end of the socket, try that now. */ if ((tcp->tcp_snd.ts_unsent == NULL || tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) && tcpsock_is_shutdown(tcp, SFL_SHUT_WR) && !(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) { err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/); if (err == ERR_OK) { /* * We have successfully enqueued a FIN. The caller is * now responsible for checking whether the PCB and * possibly even the socket object can now be freed. */ tcpsock_set_flag(tcp, TCPF_SENT_FIN); enqueued = TRUE; } else { assert(err == ERR_MEM); /* * FIXME: the resolution for lwIP bug #47485 has taken * away even more control over the closing process from * us, making tracking sockets especially for SO_LINGER * even harder. For now, we simply effectively undo * the patch by clearing TF_CLOSEPEND if tcp_shutdown() * returns ERR_MEM. This will not be sustainable in * the long term, though. */ tcp->tcp_pcb->flags &= ~TF_CLOSEPEND; tcpsock_set_flag(tcp, TCPF_FULL); } } return enqueued; } /* * Request lwIP to start sending any enqueued data and/or FIN on the TCP * socket's lwIP PCB. On success, return OK. On failure, return a negative * error code, after cleaning up the socket, freeing the PCB. If the socket * was already being closed, also free the socket object in that case; the * caller must then not touch the socket object anymore upon return. If the * socket object is not freed, and if 'raise_error' is TRUE, raise the error * on the socket object. */ static int tcpsock_pcb_send(struct tcpsock * tcp, int raise_error) { err_t err; int r; assert(tcp->tcp_pcb != NULL); /* * If we have enqueued something, ask lwIP to send TCP packets now. * This may result in a fatal error, in which case we clean up the * socket and return the error to the caller. 
	 * Since cleaning up the socket may free the socket object, and the
	 * caller cannot tell whether that will happen or has happened, also
	 * possibly raise the error on the socket object if it is not gone.
	 * As such, callers that set 'raise_error' to FALSE must know for
	 * sure that the socket was not being closed, for example because
	 * the caller is processing a (send) call from userland.
	 */
	err = tcp_output(tcp->tcp_pcb);

	if (err != ERR_OK && err != ERR_MEM) {
		tcpsock_pcb_abort(tcp);

		r = util_convert_err(err);

		if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
			if (raise_error)
				sockevent_set_error(tcpsock_get_sock(tcp), r);
		}
		/* Otherwise, do not touch the socket object anymore! */

		return r;
	} else
		return OK;
}

/*
 * Callback from lwIP.  The given number of data bytes have been
 * acknowledged as received by the remote end.  Dequeue and free data from
 * the TCP socket's send queue as appropriate.
 */
static err_t
tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *phead;
	size_t left;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);
	assert(len > 0);

	assert(tcp->tcp_snd.ts_len >= len);
	assert(tcp->tcp_snd.ts_head != NULL);

	left = len;

	/*
	 * First see if we can free up whole buffers.  Check against the
	 * head buffer's 'len' rather than 'tot_len', or we may end up
	 * leaving an empty buffer on the chain.
	 */
	while ((phead = tcp->tcp_snd.ts_head) != NULL &&
	    left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) {
		left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off;

		tcp->tcp_snd.ts_head = phead->next;
		tcp->tcp_snd.ts_head_off = 0;

		if (phead == tcp->tcp_snd.ts_unsent) {
			assert(tcp->tcp_snd.ts_unsent_off == phead->len);

			tcp->tcp_snd.ts_unsent = phead->next;
			tcp->tcp_snd.ts_unsent_off = 0;
		}

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * The rest of the given length covers only part of the current head
	 * buffer.
	 */
	if (left > 0) {
		assert(tcp->tcp_snd.ts_head != NULL);
		assert((size_t)tcp->tcp_snd.ts_head->len -
		    tcp->tcp_snd.ts_head_off > left);

		tcp->tcp_snd.ts_head_off += left;
	}

	tcp->tcp_snd.ts_len -= (size_t)len;

	if (tcp->tcp_snd.ts_head == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);
		assert(tcp->tcp_snd.ts_unsent == NULL);
		tcp->tcp_snd.ts_tail = NULL;
	} else
		assert(tcp->tcp_snd.ts_len > 0);

	/*
	 * If we emptied the send queue, and we already managed to send a
	 * FIN earlier, we may now have met all requirements to close the
	 * socket's PCB.  Otherwise, we may also be able to send more now,
	 * so try to resume sending.  Since we are invoked from the "sent"
	 * event, tcp_output() will not actually process anything, and so we
	 * do not call it either.  If we did, we would have to deal with
	 * errors here.
	 */
	if (tcpsock_may_close(tcp)) {
		if (tcpsock_finish_close(tcp))
			return ERR_OK;
	} else {
		tcpsock_clear_flag(tcp, TCPF_FULL);

		/*
		 * If we now manage to enqueue a FIN, we may be ready to
		 * close the PCB after all.
		 */
		if (tcpsock_pcb_enqueue(tcp)) {
			if (tcpsock_may_close(tcp) &&
			    tcpsock_finish_close(tcp))
				return ERR_OK;
		}
	}

	/* The user may also be able to send more now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

	return ERR_OK;
}

/*
 * Check whether any (additional) data previously received on a TCP socket
 * should be acknowledged, possibly allowing the remote end to send
 * additional data as a result.
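 */

/*
 * A worked example of the window bookkeeping performed by the function
 * below, with hypothetical numbers: assume TCP_WND is 16384, the receive
 * buffer size is 32768, tr_len is 1000, and tr_unacked is 4000.
 */
#if 0
	left = TCP_WND - tr_unacked;	/* 12384 more bytes may arrive */
	delta = rcvbuf - tr_len;	/* 31768 bytes of queue space left */
	ack = delta - left;		/* 19384, but capped below.. */
	if (ack > tr_unacked)
		ack = tr_unacked;	/* ..to 4000: acknowledge all of it */
	/* tcp_recved(pcb, 4000) then fully reopens the receive window. */
#endif

/*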
*/ static void tcpsock_ack_recv(struct tcpsock * tcp) { size_t rcvbuf, left, delta, ack; assert(tcp->tcp_pcb != NULL); /* * We must make sure that at all times, we can still add an entire * window's worth of data to the receive queue. If the amount of free * space drops below that threshold, we stop acknowledging received * data. The user may change the receive buffer size at all times; we * update the window size lazily as appropriate. */ rcvbuf = tcpsock_get_rcvbuf(tcp); if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) { /* * The number of bytes that lwIP can still give us at any time * is represented as 'left'. The number of bytes that we still * allow to be stored in the receive queue is represented as * 'delta'. We must make sure that 'left' does not ever exceed * 'delta' while acknowledging as many bytes as possible under * that rule. */ left = TCP_WND - tcp->tcp_rcv.tr_unacked; delta = rcvbuf - tcp->tcp_rcv.tr_len; if (left < delta) { ack = delta - left; if (ack > tcp->tcp_rcv.tr_unacked) ack = tcp->tcp_rcv.tr_unacked; tcp_recved(tcp->tcp_pcb, ack); tcp->tcp_rcv.tr_unacked -= ack; assert(tcp->tcp_rcv.tr_len + TCP_WND - tcp->tcp_rcv.tr_unacked <= rcvbuf); } } } /* * Attempt to merge two consecutive underfilled buffers in the receive queue of * a TCP socket, freeing up one of the two buffers as a result. The first * (oldest) buffer is 'ptail', and the pointer to this buffer is stored at * 'pnext'. The second (new) buffer is 'pbuf', which is already attached to * the first buffer. The second buffer may be followed by additional buffers * with even more new data. Return TRUE if buffers have been merged, in which * case the pointer at 'pnext' may have changed, and no assumptions should be * made about whether 'ptail' and 'pbuf' still exist in any form. Return FALSE * if no merging was necessary or if no new buffer could be allocated. */ static int tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail, struct pbuf * pbuf) { struct pbuf *pnew; assert(*pnext == ptail); assert(ptail->next == pbuf); /* * Unfortunately, we cannot figure out what kind of pbuf we were given * by the lower layers, so we cannot merge two buffers without first * allocating a third. Once we have done that, though, we can easily * merge more into that new buffer. For now we use the following * policies: * * 1. if two consecutive lwIP-provided buffers are both used less than * half the size of a full buffer, try to allocate a new buffer and * copy both lwIP-provided buffers into that new buffer, freeing up * the pair afterwards; * 2. if the tail buffer on the chain is allocated by us and not yet * full, and the next buffer's contents can be added to the tail * buffer in their entirety, do just that. * * Obviously there is a trade-off between the performance overhead of * copying and the resource overhead of keeping less-than-full buffers * on the receive queue, but this policy should both keep actual memory * usage to no more than twice the receive queue length and prevent * excessive copying. The policy deliberately performs more aggressive * merging into a buffer that we allocated ourselves. */ if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 && pbuf->len <= MEMPOOL_BUFSIZE / 2) { /* * Case #1. 
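		 *
		 * A worked example of the two policies, with hypothetical
		 * sizes (MEMPOOL_BUFSIZE being 512): a tail buffer holding
		 * 200 bytes followed by a new 180-byte buffer falls under
		 * case #1; both are copied into one fresh buffer, which
		 * then holds 380 bytes with 132 to spare.  If a 100-byte
		 * buffer arrives next, case #2 applies: it fits in the
		 * merge buffer's 132 free bytes and is copied in directly.
		 */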
		assert(ptail->tot_len == ptail->len);
		assert(pbuf->tot_len == pbuf->len);

		pnew = tcpsock_alloc_buf();
		if (pnew == NULL)
			return FALSE;

		memcpy(pnew->payload, ptail->payload, ptail->len);
		memcpy((char *)pnew->payload + ptail->len, pbuf->payload,
		    pbuf->len);
		pnew->len = ptail->len + pbuf->len;
		assert(pnew->len <= pnew->tot_len);

		pnew->next = pbuf->next;
		/* For now, we need not inherit any flags from either pbuf. */

		*pnext = pnew;

		/* One allocated, two about to be deallocated. */
		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(ptail);
		tcpsock_free_buf(pbuf);

		return TRUE;
	} else if (ptail->tot_len - ptail->len >= pbuf->len) {
		/*
		 * Case #2.
		 */
		memcpy((char *)ptail->payload + ptail->len, pbuf->payload,
		    pbuf->len);

		ptail->len += pbuf->len;

		ptail->next = pbuf->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(pbuf);

		return TRUE;
	} else
		return FALSE;
}

/*
 * Callback from lwIP.  New data or flags have been received on a TCP
 * socket.
 */
static err_t
tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused,
	struct pbuf * pbuf, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *ptail, **pprevp;
	size_t len;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * lwIP should never provide anything other than ERR_OK in 'err',
	 * and it is not clear what we should do if it would.  If lwIP ever
	 * changes in this regard, we will likely have to change this code
	 * accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP receive event with error: %d", err);

	/* If the given buffer is NULL, we have received a FIN. */
	if (pbuf == NULL) {
		tcpsock_set_flag(tcp, TCPF_RCVD_FIN);

		/* Userland may now receive EOF. */
		if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD))
			sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

		/*
		 * If we were in the process of closing the socket, and we
		 * receive a FIN before our FIN got acknowledged, we close
		 * the socket anyway, as described in tcpsock_close().
		 * However, if there is still unacknowledged outgoing data
		 * or we did not even manage to send our FIN yet, hold off
		 * closing the socket for now.
		 */
		if (tcpsock_may_close(tcp))
			(void)tcpsock_finish_close(tcp);

		return ERR_OK;
	}

	/*
	 * If the socket is being closed, receiving new data should cause a
	 * reset.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		tcpsock_pcb_abort(tcp);

		(void)tcpsock_cleanup(tcp, TRUE /*may_free*/);
		/* Do not touch the socket object anymore! */

		pbuf_free(pbuf);

		return ERR_ABRT;
	}

	/*
	 * If the socket has already been shut down for reading, discard the
	 * incoming data and do nothing else.
	 */
	if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) {
		tcp_recved(tcp->tcp_pcb, pbuf->tot_len);

		pbuf_free(pbuf);

		return ERR_OK;
	}

	/*
	 * We deliberately ignore the PBUF_FLAG_PUSH flag.  This flag would
	 * enable the receive functionality to delay delivering "un-pushed"
	 * data to applications.  The implementation of this scheme could
	 * track the amount of data up to and including the last-pushed
	 * segment using a "tr_push_len" field or so.  Deciding when to
	 * deliver "un-pushed" data after all is a bit trickier though.  As
	 * far as I can tell, the BSDs do not implement anything like that.
	 * Windows does, and this results in interaction problems with even
	 * more lightweight TCP/IP stacks that do not send the TCP PSH flag.
	 * Currently, there is no obvious benefit for us to support delaying
	 * data delivery like that.  In addition, testing its implementation
	 * reliably would be difficult.
	 */

	len = (size_t)pbuf->tot_len;

	/*
	 * Count the number of buffers that are now owned by us.
The new total * of buffers owned by us must not exceed the size of the memory pool. * Any more would indicate an accounting error. Note that * tcpsock_recvbufs is currently used for debugging only! */ tcpsock_recvbufs += pbuf_clen(pbuf); assert(tcpsock_recvbufs < mempool_cur_buffers()); /* * The pre-tail pointer points to whatever is pointing to the tail * buffer. The latter pointer may be the 'tr_head' field in our * tcpsock structure, or the 'next' field in the penultimate buffer, * or NULL if there are currently no buffers on the receive queue. */ if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) { ptail = *pprevp; assert(ptail != NULL); assert(ptail->next == NULL); assert(tcp->tcp_rcv.tr_head != NULL); ptail->next = pbuf; pbuf->tot_len = pbuf->len; /* to help freeing on merges */ if (tcpsock_try_merge(pprevp, ptail, pbuf)) { ptail = *pprevp; pbuf = ptail->next; } if (pbuf != NULL) pprevp = &ptail->next; } else { assert(tcp->tcp_rcv.tr_head == NULL); assert(tcp->tcp_rcv.tr_head_off == 0); tcp->tcp_rcv.tr_head = pbuf; pprevp = &tcp->tcp_rcv.tr_head; } /* * Chop up the chain into individual buffers. This is necessary as we * overload 'tot_len' to mean "space available in the buffer", as we * want for buffers allocated by us as part of buffer merges. Also get * a pointer to the pointer to the new penultimate tail buffer. Due to * merging, the chain may already be empty by now, though. */ if (pbuf != NULL) { for (; pbuf->next != NULL; pbuf = pbuf->next) { pbuf->tot_len = pbuf->len; pprevp = &pbuf->next; } assert(pbuf->len == pbuf->tot_len); } assert(*pprevp != NULL); assert((*pprevp)->next == NULL); tcp->tcp_rcv.tr_pre_tailp = pprevp; tcp->tcp_rcv.tr_len += len; tcp->tcp_rcv.tr_unacked += len; assert(tcp->tcp_rcv.tr_unacked <= TCP_WND); /* * Note that tr_len may now exceed the receive buffer size in the * highly exceptional case that the user is adjusting the latter after * the socket had already received data. */ /* See if we can immediately acknowledge some or all of the data. */ tcpsock_ack_recv(tcp); /* Also wake up any receivers now. */ sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV); return ERR_OK; } /* * Callback from lwIP. The PCB corresponding to the socket identified by 'arg' * has been closed by lwIP, with the reason specified in 'err': either the * connection has been aborted locally (ERR_ABRT), it has been reset by the * remote end (ERR_RST), or it is closed due to state transitions (ERR_CLSD). */ static void tcpsock_event_err(void * arg, err_t err) { struct tcpsock *tcp = (struct tcpsock *)arg; int r; assert(tcp != NULL); assert(tcp->tcp_pcb != NULL); assert(err != ERR_OK); /* The original PCB is now gone, or will be shortly. */ tcp->tcp_pcb = NULL; /* * Clean up the socket. As a result it may be freed, in which case we * must not touch it anymore. No need to return ERR_ABRT from here, as * the PCB has been aborted already. */ if (tcpsock_cleanup(tcp, TRUE /*may_free*/)) return; if (err == ERR_CLSD) { /* * We may get here if the socket is shut down for writing and * we already received a FIN from the remote side, thus putting * the socket in LAST_ACK state, and we receive that last * acknowledgment. There is nothing more we need to do. * * We will never get here in the other case that ERR_CLSD is * raised, which is when the socket is reset because of * unacknowledged data while closing: we handle the * reset-on-ACK case ourselves in tcpsock_close(), and the * socket is in closing state after that. 
		 */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
	} else {
		/*
		 * Anything else should be an error directly from lwIP;
		 * currently either ERR_ABRT or ERR_RST.  Convert it to a
		 * regular error and set it on the socket.  Doing so will
		 * also raise the appropriate events.
		 */
		/*
		 * Unfortunately, lwIP does not throw accurate errors even
		 * when it can.  We convert some errors to reflect more
		 * accurately the most likely cause.
		 *
		 * TODO: fix lwIP in this regard..
		 */
		r = util_convert_err(err);

		if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) {
			switch (err) {
			case ERR_ABRT:	r = ETIMEDOUT;		break;
			case ERR_RST:	r = ECONNREFUSED;	break;
			}
		}

		sockevent_set_error(tcpsock_get_sock(tcp), r);
	}
}

/*
 * Callback from lwIP.  Perform regular checks on a TCP socket.  This
 * function is called once per five seconds on connected sockets, and twice
 * per second on closing sockets.
 */
static err_t
tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	err_t err;
	int r;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * If we ended up running out of buffers earlier, try resuming any
	 * send requests now, both for enqueuing TCP data with lwIP and for
	 * user requests.
	 */
	if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) {
		tcpsock_clear_flag(tcp, TCPF_FULL);
		tcpsock_clear_flag(tcp, TCPF_OOM);

		/* See if we can enqueue more data with lwIP. */
		if (tcpsock_pcb_enqueue(tcp)) {
			/* In some cases, we can now close the PCB. */
			if (tcpsock_may_close(tcp)) {
				(void)tcpsock_finish_close(tcp);

				/*
				 * The PCB is definitely gone here, and the
				 * entire socket object may be gone now too.
				 * Do not touch either anymore!
				 */
				return ERR_OK;
			}

			/*
			 * If actually sending the data fails, the PCB will
			 * be gone, and the socket object may be gone as
			 * well.  Do not touch either anymore in that case!
			 */
			if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK)
				return ERR_ABRT;
		}

		/*
		 * If we ran out of buffers earlier, it may be possible to
		 * take in more data from a user process now, even if we did
		 * not manage to enqueue any more pending data with lwIP.
		 */
		sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

		assert(tcp->tcp_pcb != NULL);
	} else if (tcp->tcp_snd.ts_unsent != NULL &&
	    tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) {
		/*
		 * If the send buffer is full, we will no longer call
		 * tcp_output(), which means we may also miss out on fatal
		 * errors that would otherwise kill the connection (e.g., no
		 * route).  As a result, the connection may erroneously
		 * continue to exist for a long time.  To avoid this, we
		 * call tcp_output() every once in a while while there is
		 * still unsent data.
		 */
		err = tcp_output(tcp->tcp_pcb);

		if (err != ERR_OK && err != ERR_MEM) {
			tcpsock_pcb_abort(tcp);

			if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
				r = util_convert_err(err);

				sockevent_set_error(tcpsock_get_sock(tcp), r);
			}
			/* Otherwise do not touch the socket object anymore! */

			return ERR_ABRT;
		}
	}

	/*
	 * If we are closing the socket, and we sent a FIN, see if the FIN
	 * got acknowledged.  If so, finish closing the socket.
	 * Unfortunately, we can perform this check by polling only.
	 * TODO: change lwIP..
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp)) &&
	    (tcpsock_get_flags(tcp) & TCPF_SENT_FIN) &&
	    tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);

		tcpsock_finish_close(tcp);
	}

	return ERR_OK;
}
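/*
 * A note on the poll intervals used throughout (a sketch; the numbers
 * follow from the definitions near the top of this file): lwIP's poll
 * argument is expressed in coarse TCP timer ticks of 500 milliseconds
 * each, so these two calls yield callbacks roughly every five seconds and
 * roughly twice per second, respectively.
 */
#if 0
	tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);   /* ~5 s */
	tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_CLOSE_INTERVAL); /* ~0.5 s */
#endif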
/*
 * Bind a TCP socket to a local address.
 */
static int
tcpsock_bind(struct sock * sock, const struct sockaddr * addr,
	socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t ipaddr;
	uint16_t port;
	err_t err;
	int r;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)
		return EINVAL;

	if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port,
	    FALSE /*allow_mcast*/, &ipaddr, &port)) != OK)
		return r;

	err = tcp_bind(tcp->tcp_pcb, &ipaddr, port);

	return util_convert_err(err);
}

/*
 * Callback from lwIP.  A new connection 'pcb' has arrived on the listening
 * socket identified by 'arg'.  Note that 'pcb' may be NULL in the case that
 * lwIP could not accept the connection itself.
 */
static err_t
tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(tcpsock_is_listening(tcp));

	/*
	 * If the given PCB is NULL, then lwIP ran out of memory allocating
	 * a PCB for the new connection.  There is nothing we can do with
	 * that information.  Also check 'err' just to make sure.
	 */
	if (pcb == NULL || err != ERR_OK)
		return ERR_OK;

	/*
	 * The TCP socket is the listening socket, but the PCB is for the
	 * incoming connection.
	 */
	if (tcpsock_clone(tcp, pcb) != OK) {
		/*
		 * We could not allocate the resources necessary to accept
		 * the connection.  Abort it immediately.
		 */
		tcp_abort(pcb);

		return ERR_ABRT;
	}

	/*
	 * The connection has not yet been accepted, and thus should still
	 * be considered on the listen queue.
	 */
	tcp_backlog_delayed(pcb);

	/* Set the callback functions. */
	tcp_recv(pcb, tcpsock_event_recv);
	tcp_sent(pcb, tcpsock_event_sent);
	tcp_err(pcb, tcpsock_event_err);
	tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT);

	return ERR_OK;
}

/*
 * Put a TCP socket in listening mode.
 */
static int
tcpsock_listen(struct sock * sock, int backlog)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcp_pcb *pcb;
	err_t err;

	/* The maximum backlog value must not exceed its field size. */
	assert(SOMAXCONN <= UINT8_MAX);

	/*
	 * Allow only CLOSED sockets to enter listening mode.  If the socket
	 * was already in listening mode, allow its backlog value to be
	 * updated, even if it was shut down already (making this a no-op).
	 */
	if (!tcpsock_is_listening(tcp) &&
	    (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED))
		return EINVAL;

	/*
	 * If the socket was not already in listening mode, put it in that
	 * mode now.  That involves switching PCBs, as lwIP attempts to save
	 * memory by replacing the original PCB with a smaller one.  If the
	 * socket was already in listening mode, simply update its backlog
	 * value--this has no effect on the sockets already in the backlog.
	 */
	if (!tcpsock_is_listening(tcp)) {
		assert(tcp->tcp_pcb != NULL);

		/*
		 * If the socket has not been bound to a port yet, do that
		 * first.  This does mean that the listen call may fail with
		 * side effects, but that is acceptable in this case.
		 */
		if (tcp->tcp_pcb->local_port == 0) {
			err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip,
			    0 /*port*/);

			if (err != ERR_OK)
				return util_convert_err(err);
		}

		/*
		 * Clear the argument on the PCB that is about to be
		 * replaced, because if we do not, once the PCB is reused
		 * (which does not clear the argument), we might get weird
		 * events.  Do this before the tcp_listen() call, because we
		 * should no longer access the old PCB afterwards (even if
		 * we can).
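		 */

#if 0
		/*
		 * The swap performed below, in isolation (a sketch with
		 * hypothetical 'old_pcb'/'new_pcb' names): on success, lwIP
		 * frees the original PCB and returns a smaller listening
		 * PCB, so the old pointer must no longer be used.
		 */
		tcp_arg(old_pcb, NULL);		/* detach our state first */
		new_pcb = tcp_listen_with_backlog_and_err(old_pcb, backlog,
		    &err);
		if (new_pcb == NULL)
			tcp_arg(old_pcb, tcp);	/* failed: old PCB remains */
#endif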
		tcp_arg(tcp->tcp_pcb, NULL);

		pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog,
		    &err);

		if (pcb == NULL) {
			tcp_arg(tcp->tcp_pcb, tcp);	/* oops, undo. */

			return util_convert_err(err);
		}

		tcp_arg(pcb, tcp);
		tcp->tcp_pcb = pcb;

		tcp_accept(pcb, tcpsock_event_accept);

		/* Initialize the queue head for sockets pending acceptance. */
		TAILQ_INIT(&tcp->tcp_queue.tq_head);
	} else if (tcp->tcp_pcb != NULL)
		tcp_backlog_set(tcp->tcp_pcb, backlog);

	return OK;
}

/*
 * Callback from lwIP.  A socket connection attempt has succeeded.  Note
 * that failed connection attempts will trigger the tcpsock_event_err()
 * callback instead.
 */
static err_t
tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);
	assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING);

	/*
	 * If lwIP ever changes so that this callback is called for connect
	 * failures as well, then we need to change the code here
	 * accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP connected event with error: %d", err);

	tcpsock_clear_flag(tcp, TCPF_CONNECTING);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND);

	return ERR_OK;
}

/*
 * Connect a TCP socket to a remote address.
 */
static int
tcpsock_connect(struct sock * sock, const struct sockaddr * addr,
	socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t dst_addr;
	uint16_t dst_port;
	err_t err;
	int r;

	/*
	 * Listening sockets may not have a PCB, so we use higher-level
	 * flags to throw the correct error code for those instead.
	 */
	if (tcpsock_is_listening(tcp))
		return EOPNOTSUPP;

	/*
	 * If there is no longer any PCB, we obviously cannot perform the
	 * connection, but POSIX is not clear on which error to return.  We
	 * copy NetBSD's.
	 */
	if (tcp->tcp_pcb == NULL)
		return EINVAL;

	/*
	 * The only state from which a connection can be initiated, is
	 * CLOSED.  Some of the other states require distinct error codes,
	 * though.
	 */
	switch (tcp->tcp_pcb->state) {
	case CLOSED:
		break;

	case SYN_SENT:
		return EALREADY;

	case LISTEN:
		assert(0);	/* we just checked.. */

	default:
		return EISCONN;
	}

	/*
	 * Get the destination address, and attempt to start connecting.  If
	 * the socket was not bound before, or it was bound to a port only,
	 * then lwIP will select a source address for us.  We cannot do this
	 * ourselves even if we wanted to: it is impossible to re-bind a TCP
	 * PCB in the case it was previously bound to a port only.
	 */
	if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK)
		return r;

	err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port,
	    tcpsock_event_connected);

	/*
	 * Note that various tcp_connect() error cases will leave the PCB
	 * with a newly set local and remote IP address anyway.  We should
	 * be careful not to rely on the addresses being as they were
	 * before.
	 */
	if (err != ERR_OK)
		return util_convert_err(err);

	/* Set the other callback functions. */
	tcp_recv(tcp->tcp_pcb, tcpsock_event_recv);
	tcp_sent(tcp->tcp_pcb, tcpsock_event_sent);
	tcp_err(tcp->tcp_pcb, tcpsock_event_err);
	tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	/*
	 * Set a flag so that we can correct lwIP's error codes in case the
	 * connection fails.
	 */
	tcpsock_set_flag(tcp, TCPF_CONNECTING);

	return SUSPEND;
}

/*
 * Test whether any new connections are pending on a listening TCP socket.
 */
static int
tcpsock_test_accept(struct sock * sock)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	/* Is this socket in listening mode at all?
*/ if (!tcpsock_is_listening(tcp)) return EINVAL; /* Are there any connections to accept right now? */ if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) return OK; /* If the socket has been shut down, we return ECONNABORTED. */ if (tcp->tcp_pcb == NULL) return ECONNABORTED; /* Otherwise, wait for a new connection first. */ return SUSPEND; } /* * Accept a connection on a listening TCP socket, creating a new TCP socket. */ static sockid_t tcpsock_accept(struct sock * sock, struct sockaddr * addr, socklen_t * addr_len, endpoint_t user_endpt __unused, struct sock ** newsockp) { struct tcpsock *listener = (struct tcpsock *)sock; struct tcpsock *tcp; int r; if ((r = tcpsock_test_accept(sock)) != OK) return r; /* Below, we must not assume that the listener has a PCB. */ tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head); assert(tcp->tcp_listener == listener); assert(tcp->tcp_pcb != NULL); TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next); tcp->tcp_listener = NULL; tcp_backlog_accepted(tcp->tcp_pcb); ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len, &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port); /* * Set 'newsockp' to NULL so that libsockevent knows we already cloned * the socket, and it must not be reinitialized anymore. */ *newsockp = NULL; return tcpsock_get_id(tcp); } /* * Perform preliminary checks on a send request. */ static int tcpsock_pre_send(struct sock * sock, size_t len __unused, socklen_t ctl_len __unused, const struct sockaddr * addr __unused, socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags) { /* * Reject calls with unknown flags. Since libsockevent strips out the * flags it handles itself here, we only have to test for ones we can * not handle. Currently, there are no send flags that we support. */ if (flags != 0) return EOPNOTSUPP; return OK; } /* * Test whether the given number of data bytes can be sent on a TCP socket. */ static int tcpsock_test_send(struct sock * sock, size_t min) { struct tcpsock *tcp = (struct tcpsock *)sock; size_t sndbuf; if (tcp->tcp_pcb == NULL) return EPIPE; switch (tcp->tcp_pcb->state) { case CLOSED: /* new */ case LISTEN: /* listening */ return ENOTCONN; case SYN_SENT: /* connecting */ case SYN_RCVD: /* simultaneous open, maybe someday? */ return SUSPEND; case ESTABLISHED: /* connected */ case CLOSE_WAIT: /* closed remotely */ break; default: /* shut down locally */ assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR)); return EPIPE; } sndbuf = tcpsock_get_sndbuf(tcp); if (min > sndbuf) min = sndbuf; if (tcp->tcp_snd.ts_len + min > sndbuf) return SUSPEND; else return OK; } /* * Send data on a TCP socket. */ static int tcpsock_send(struct sock * sock, const struct sockdriver_data * data, size_t len, size_t * offp, const struct sockdriver_data * ctl __unused, socklen_t ctl_len __unused, socklen_t * ctl_off __unused, const struct sockaddr * addr __unused, socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags __unused, size_t min) { struct tcpsock *tcp = (struct tcpsock *)sock; struct pbuf *ptail, *pfirst, *pnext, *plast; size_t off, tail_off, chunk, left, sndbuf; int r; if ((r = tcpsock_test_send(sock, min)) != OK) return r; if (len == 0) return OK; /* nothing to do */ sndbuf = tcpsock_get_sndbuf(tcp); if (min > sndbuf) min = sndbuf; assert(min > 0); assert(sndbuf > tcp->tcp_snd.ts_len); left = sndbuf - tcp->tcp_snd.ts_len; if (left > len) left = len; /* * First see if we can fit any more data in the current tail buffer. 
	 * If so, we set 'ptail' to point to it and 'tail_off' to the
	 * previous length of the tail buffer, while optimistically
	 * extending it to include the new data.  If not, we set them to
	 * NULL/0.
	 */
	if ((ptail = tcp->tcp_snd.ts_tail) != NULL &&
	    ptail->len < ptail->tot_len) {
		assert(ptail->len > 0);
		tail_off = (size_t)ptail->len;

		/*
		 * Optimistically extend the tail buffer to include whatever
		 * fits in it.  This is needed for util_copy_data().
		 */
		assert(ptail->tot_len > ptail->len);
		off = (size_t)ptail->tot_len - (size_t)ptail->len;
		if (off > left)
			off = left;
		ptail->len += off;
	} else {
		ptail = NULL;
		tail_off = 0;
		off = 0;
	}

	/*
	 * Then, if there is more to send, allocate new buffers as needed.
	 * If we run out of memory, work with whatever we did manage to
	 * grab.
	 */
	pfirst = NULL;
	plast = NULL;
	while (off < left) {
		if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS ||
		    (pnext = tcpsock_alloc_buf()) == NULL) {
			/*
			 * Chances are that we will end up suspending this
			 * send request because of being out of buffers.  We
			 * try to resume such requests from the polling
			 * function.
			 */
			tcpsock_set_flag(tcp, TCPF_OOM);

			break;
		}

		tcpsock_sendbufs++;

		if (pfirst == NULL)
			pfirst = pnext;
		else
			plast->next = pnext;
		plast = pnext;

		chunk = (size_t)pnext->tot_len;
		if (chunk > left - off)
			chunk = left - off;
		pnext->len = chunk;

		off += chunk;
	}

	/*
	 * Copy in the data and continue, unless we did not manage to find
	 * enough space to even meet the low send watermark, in which case
	 * we undo any allocation and suspend the call until later.
	 */
	if (off >= min) {
		/*
		 * Optimistically attach the new buffers to the tail, also
		 * for util_copy_data().  We undo all this if the copy
		 * fails.
		 */
		if (ptail != NULL) {
			ptail->next = pfirst;

			pnext = ptail;
		} else
			pnext = pfirst;

		assert(pnext != NULL);

		r = util_copy_data(data, off, *offp, pnext, tail_off,
		    TRUE /*copy_in*/);
	} else
		r = SUSPEND;

	if (r != OK) {
		/* Undo the modifications made so far. */
		while (pfirst != NULL) {
			pnext = pfirst->next;

			assert(tcpsock_sendbufs > 0);
			tcpsock_sendbufs--;

			tcpsock_free_buf(pfirst);

			pfirst = pnext;
		}

		if (ptail != NULL) {
			ptail->next = NULL;

			ptail->len = tail_off;
		}

		return r;
	}

	/* Attach the new buffers, if any, to the buffer tail. */
	if (pfirst != NULL) {
		if ((ptail = tcp->tcp_snd.ts_tail) != NULL) {
			assert(ptail->len == ptail->tot_len);

			/*
			 * Due to our earlier optimistic modifications, this
			 * may or may not be redundant.
			 */
			ptail->next = pfirst;
		}

		assert(plast != NULL);
		tcp->tcp_snd.ts_tail = plast;

		if (tcp->tcp_snd.ts_head == NULL) {
			tcp->tcp_snd.ts_head = pfirst;
			assert(tcp->tcp_snd.ts_head_off == 0);
		}
		if (tcp->tcp_snd.ts_unsent == NULL) {
			tcp->tcp_snd.ts_unsent = pfirst;
			assert(tcp->tcp_snd.ts_unsent_off == 0);
		}
	}

	tcp->tcp_snd.ts_len += off;

	/*
	 * See if we can send any of the data we just enqueued.  The socket
	 * is still open, as we are still processing a call from userland on
	 * it; this saves us from having to deal with the cases that the
	 * following calls end up freeing the socket object.
	 */
	if (tcpsock_pcb_enqueue(tcp) &&
	    (r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) {
		/*
		 * That did not go well.  Return the error immediately if we
		 * had not made any progress earlier.  Otherwise, return our
		 * partial progress and leave the error to be picked up
		 * later.
		 */
		if (*offp > 0) {
			sockevent_set_error(tcpsock_get_sock(tcp), r);

			return OK;
		} else
			return r;
	}

	*offp += off;
	return (off < len) ? SUSPEND : OK;
}
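/*
 * A worked example of the send-side bookkeeping above, with hypothetical
 * numbers: given a 32768-byte send buffer with ts_len at 30000, a
 * 5000-byte send call may grab at most left = 2768 bytes now.  Assuming
 * that meets the low send watermark, the data is copied in and the call
 * returns SUSPEND with its offset advanced by 2768; it is resumed later,
 * as acknowledged data frees up queue space again.
 */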
/*
 * Perform preliminary checks on a receive request.
 */
static int
tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
	int flags)
{

	/*
	 * Reject calls with unknown flags.  Since libsockevent strips out
	 * the flags it handles itself here, we only have to test for ones
	 * we cannot handle.
	 */
	if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0)
		return EOPNOTSUPP;

	return OK;
}

/*
 * Return TRUE if receive calls may wait for more data to come in on the
 * connection, or FALSE if we already know that that is not going to happen.
 */
static int
tcpsock_may_wait(struct tcpsock * tcp)
{

	return (tcp->tcp_pcb != NULL &&
	    !(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN));
}

/*
 * Test whether data can be received on a TCP socket, and if so, how many
 * bytes of data.
 */
static int
tcpsock_test_recv(struct sock * sock, size_t min, size_t * size)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	int may_wait;

	/*
	 * If the socket is not connected and never was connected, refuse
	 * the call altogether.
	 */
	if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED ||
	    tcp->tcp_pcb->state == LISTEN))
		return ENOTCONN;

	/*
	 * If we are certain that no more data will come in later, ignore
	 * the low receive watermark.  Otherwise, bound it to the size of
	 * the receive buffer, or receive calls may block forever.
	 */
	if (!(may_wait = tcpsock_may_wait(tcp)))
		min = 1;
	else if (min > tcpsock_get_rcvbuf(tcp))
		min = tcpsock_get_rcvbuf(tcp);

	if (tcp->tcp_rcv.tr_len >= min) {
		if (size != NULL)
			*size = tcp->tcp_rcv.tr_len;

		return OK;
	}

	return (may_wait) ? SUSPEND : SOCKEVENT_EOF;
}

/*
 * Receive data on a TCP socket.
 */
static int
tcpsock_recv(struct sock * sock, const struct sockdriver_data * data,
	size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
	socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
	struct sockaddr * addr __unused, socklen_t * addr_len __unused,
	endpoint_t user_endpt __unused, int flags, size_t min,
	int * rflags __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct pbuf *ptail;
	size_t off, left;
	int r;

	/* See if we can receive at all, and if so, how much at most. */
	if ((r = tcpsock_test_recv(sock, min, NULL)) != OK)
		return r;

	if (len == 0)
		return OK;	/* nothing to do */

	off = tcp->tcp_rcv.tr_len;
	if (off > len)
		off = len;

	assert(tcp->tcp_rcv.tr_head != NULL);
	assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len);

	/* Copy out the data to the caller. */
	if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head,
	    tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK)
		return r;

	/* Unless peeking, remove the data from the receive queue. */
	if (!(flags & MSG_PEEK)) {
		left = off;

		/* Dequeue and free as many entire buffers as possible. */
		while ((ptail = tcp->tcp_rcv.tr_head) != NULL &&
		    left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) {
			left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off;

			tcp->tcp_rcv.tr_head = ptail->next;
			tcp->tcp_rcv.tr_head_off = 0;

			if (tcp->tcp_rcv.tr_head == NULL)
				tcp->tcp_rcv.tr_pre_tailp = NULL;
			else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next)
				tcp->tcp_rcv.tr_pre_tailp =
				    &tcp->tcp_rcv.tr_head;

			assert(tcpsock_recvbufs > 0);
			tcpsock_recvbufs--;

			tcpsock_free_buf(ptail);
		}

		/*
		 * If only part of the (new) head buffer is consumed, adjust
		 * the saved offset into that buffer.
		 */
		if (left > 0) {
			assert(tcp->tcp_rcv.tr_head != NULL);
			assert((size_t)tcp->tcp_rcv.tr_head->len -
			    tcp->tcp_rcv.tr_head_off > left);

			tcp->tcp_rcv.tr_head_off += left;
		}

		tcp->tcp_rcv.tr_len -= off;

		if (tcp->tcp_rcv.tr_head != NULL) {
			assert(tcp->tcp_rcv.tr_pre_tailp != NULL);
			assert(tcp->tcp_rcv.tr_len > 0);
		} else {
			assert(tcp->tcp_rcv.tr_pre_tailp == NULL);
			assert(tcp->tcp_rcv.tr_len == 0);
		}

		/*
		 * The receive buffer has shrunk, so there may now be space to
		 * receive more data.
		 */
		if (tcp->tcp_pcb != NULL)
			tcpsock_ack_recv(tcp);
	} else
		flags &= ~MSG_WAITALL;	/* for the check below */

	/* Advance the current copy position, and see if we are done. */
	*offp += off;
	if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp))
		return SUSPEND;
	else
		return OK;
}

/*
 * Update the set of flag-type socket options on a TCP socket.
 */
static void
tcpsock_setsockmask(struct sock * sock, unsigned int mask)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL)
		return;

	if (mask & SO_REUSEADDR)
		ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR);
	else
		ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR);

	if (mask & SO_KEEPALIVE)
		ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE);
	else
		ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE);
}

/*
 * Prepare a helper structure for IP-level option processing.
 */
static void
tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts)
{

	ipopts->local_ip = &tcp->tcp_pcb->local_ip;
	ipopts->remote_ip = &tcp->tcp_pcb->remote_ip;
	ipopts->tos = &tcp->tcp_pcb->tos;
	ipopts->ttl = &tcp->tcp_pcb->ttl;
	ipopts->sndmin = TCP_SNDBUF_MIN;
	ipopts->sndmax = TCP_SNDBUF_MAX;
	ipopts->rcvmin = TCP_RCVBUF_MIN;
	ipopts->rcvmax = TCP_RCVBUF_MAX;
}

/*
 * Set socket options on a TCP socket.
 */
static int
tcpsock_setsockopt(struct sock * sock, int level, int name,
	const struct sockdriver_data * data, socklen_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct ipopts ipopts;
	uint32_t uval;
	int r, val;

	if (tcp->tcp_pcb == NULL)
		return ECONNRESET;

	/* Handle TCP-level and IPv6-level options here. */
	switch (level) {
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_RECVTCLASS:
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * This option is not supported for TCP sockets; it
			 * would not even make sense.  However, named(8)
			 * insists on trying to set it anyway.  We accept the
			 * request but ignore the value, not even returning
			 * what was set through getsockopt(2).
			 */
			return OK;

		case IPV6_FAITH:
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * This option is not supported at all, but to save
			 * ourselves from having to remember the current state
			 * for getsockopt(2), we also refuse to enable it.
			 */
			if (val != 0)
				return EINVAL;

			return OK;
		}

		break;

	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			/*
			 * lwIP's listening TCP PCBs do not have this field.
			 * If this ever becomes an issue, we can create our own
			 * shadow flag and do the inheritance ourselves.
			 */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val)
				tcp_nagle_disable(tcp->tcp_pcb);
			else
				tcp_nagle_enable(tcp->tcp_pcb);

			return OK;

		case TCP_KEEPIDLE:
		case TCP_KEEPINTVL:
			/*
			 * lwIP's listening TCP PCBs do not have these fields.
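			 * A worked example of the conversion performed below
			 * (added for clarity): setting TCP_KEEPIDLE to 75
			 * stores keep_idle = 75 * 1000 = 75000 milliseconds,
			 * while any value above UINT32_MAX / 1000 seconds is
			 * clamped to UINT32_MAX milliseconds rather than
			 * being rejected.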
			 */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val == 0)
				return EINVAL;

			/*
			 * The given value is in seconds, but lwIP stores the
			 * value in milliseconds in a uint32_t field, so we
			 * have to limit large values to whatever fits in the
			 * field anyway.
			 */
			if (val < 0 || (uint32_t)val > UINT32_MAX / 1000)
				uval = UINT32_MAX;
			else
				uval = (uint32_t)val * 1000;

			if (name == TCP_KEEPIDLE)
				tcp->tcp_pcb->keep_idle = uval;
			else
				tcp->tcp_pcb->keep_intvl = uval;

			return OK;

		case TCP_KEEPCNT:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val == 0)
				return EINVAL;

			tcp->tcp_pcb->keep_cnt = (uint32_t)val;

			return OK;
		}

		return EOPNOTSUPP;
	}

	/* Handle all other options at the IP level. */
	tcpsock_get_ipopts(tcp, &ipopts);

	return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data,
	    len, &ipopts);
}

/*
 * Retrieve socket options on a TCP socket.
 */
static int
tcpsock_getsockopt(struct sock * sock, int level, int name,
	const struct sockdriver_data * data, socklen_t * len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct ipopts ipopts;
	int val;

	if (tcp->tcp_pcb == NULL)
		return ECONNRESET;

	/* Handle TCP-level and IPv6-level options here. */
	switch (level) {
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_RECVTCLASS:
		case IPV6_FAITH:
			val = 0;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);
		}

		break;

	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = tcp_nagle_disabled(tcp->tcp_pcb);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_MAXSEG:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			/* This option is read-only at this time. */
			val = tcp->tcp_pcb->mss;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPIDLE:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)(tcp->tcp_pcb->keep_idle / 1000);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPINTVL:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)(tcp->tcp_pcb->keep_intvl / 1000);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPCNT:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)tcp->tcp_pcb->keep_cnt;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);
		}

		return EOPNOTSUPP;
	}

	/* Handle all other options at the IP level. */
	tcpsock_get_ipopts(tcp, &ipopts);

	return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data,
	    len, &ipopts);
}

/*
 * Retrieve the local socket address of a TCP socket.
 */
static int
tcpsock_getsockname(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL)
		return EINVAL;

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port);

	return OK;
}

/*
 * Retrieve the remote socket address of a TCP socket.
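 *
 * For illustration: a getpeername(2) call made while a nonblocking
 * connect(2) is still in progress (SYN_SENT), or on a listening socket,
 * fails with ENOTCONN below, just as when the PCB is gone altogether.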
 */
static int
tcpsock_getpeername(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED ||
	    tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT)
		return ENOTCONN;

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);

	return OK;
}

/*
 * Perform a TCP half-close on a TCP socket.  This operation may not complete
 * immediately due to memory conditions, in which case it will be completed at
 * a later time.
 */
static void
tcpsock_send_fin(struct tcpsock * tcp)
{

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR);

	/*
	 * Attempt to send the FIN.  If a fatal error occurs as a result,
	 * raise it as an asynchronous error, because this function's callers
	 * cannot do much with it.  That happens to match the way these
	 * functions are used elsewhere.  In any case, as a result, the PCB
	 * may be closed.  However, we are never called from a situation where
	 * the socket is being closed here, so the socket object will not be
	 * freed either.
	 */
	if (tcpsock_pcb_enqueue(tcp)) {
		assert(!sockevent_is_closing(tcpsock_get_sock(tcp)));

		if (tcpsock_may_close(tcp))
			tcpsock_finish_close(tcp);
		else
			(void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/);
	}
}

/*
 * Shut down a TCP socket for reading and/or writing.
 */
static int
tcpsock_shutdown(struct sock * sock, unsigned int mask)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	/*
	 * If the PCB is gone, we want to allow shutdowns for reading but not
	 * writing: shutting down for writing affects the PCB, shutting down
	 * for reading does not.  Also, if the PCB is in CLOSED state, we
	 * would not know how to deal with subsequent operations after a
	 * shutdown for writing, so forbid such calls altogether.
	 */
	if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) &&
	    (mask & SFL_SHUT_WR))
		return ENOTCONN;

	/*
	 * Handle listening sockets as a special case.  Shutting down a
	 * listening socket frees its PCB.  Sockets pending on the accept
	 * queue may still be accepted, but after that, accept(2) will start
	 * returning ECONNABORTED.  This feature allows multi-process server
	 * applications to shut down gracefully, supposedly.
	 */
	if (tcpsock_is_listening(tcp)) {
		if (tcp->tcp_pcb != NULL)
			tcpsock_pcb_close(tcp);

		return OK;
	}

	/*
	 * We control shutdown-for-reading locally, and intentionally do not
	 * tell lwIP about it: if we do that and also shut down for writing,
	 * the PCB may disappear (now or eventually), which is not what we
	 * want.  Instead, we only tell lwIP to shut down for reading once we
	 * actually want to get rid of the PCB, using tcp_close().  In the
	 * meantime, if the socket is shut down for reading by the user, we
	 * simply discard received data as fast as we can--one out of a number
	 * of possible design choices there, and (reportedly) the one used by
	 * the BSDs.
	 */
	if (mask & SFL_SHUT_RD)
		(void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/);

	/*
	 * Shutting down a connecting socket for writing simply closes its
	 * PCB.  Closing a PCB in SYN_SENT state simply deallocates it, so
	 * this cannot fail.  On the other hand, for connected sockets we want
	 * to send a FIN, which may fail due to memory shortage, in which case
	 * we have to try again later.
	 */
	if (mask & SFL_SHUT_WR) {
		if (tcp->tcp_pcb->state == SYN_SENT)
			tcpsock_pcb_close(tcp);
		else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
			tcpsock_send_fin(tcp);
	}

	return OK;
}

/*
 * Close a TCP socket.
 * Complete the operation immediately if possible, or otherwise initiate the
 * closing process and complete it later, notifying libsockevent about that
 * as well.  Depending on linger settings, this function may be called twice
 * on the same socket: the first time with the 'force' flag cleared, and the
 * second time with the 'force' flag set.
 */
static int
tcpsock_close(struct sock * sock, int force)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcpsock *queued;
	size_t rlen;

	assert(tcp->tcp_listener == NULL);

	/*
	 * If this is a listening socket, abort and clean up any and all
	 * connections on its listener queue.  Note that the listening socket
	 * may or may not have a PCB at this point.
	 */
	if (tcpsock_is_listening(tcp)) {
		while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) {
			queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head);

			tcpsock_pcb_abort(queued);

			(void)tcpsock_cleanup(queued, TRUE /*may_free*/);
		}
	}

	/*
	 * Clear the receive queue, and make sure that we no longer add new
	 * data to it.  The latter is relevant only for the case that we end
	 * up returning SUSPEND below.  Remember whether there were bytes
	 * left, because we should reset the connection if there were.
	 */
	rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD);

	/*
	 * If the socket is connected, perform a graceful shutdown, unless 1)
	 * we are asked to force-close the socket, or 2) the local side has
	 * not consumed all data, as per RFC 1122 Sec. 4.2.2.13.  Normally
	 * lwIP would take care of the second point, but we may have data in
	 * our receive buffer of which lwIP is not aware.
	 *
	 * Implementing proper linger support is somewhat difficult with lwIP.
	 * In particular, we cannot reliably wait for our FIN to be ACK'ed by
	 * the other side in all cases:
	 *
	 * - the lwIP TCP transition from the CLOSING state to TIME_WAIT does
	 *   not trigger any event, and once in the TIME_WAIT state, the poll
	 *   event no longer triggers either;
	 * - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to
	 *   TIME_WAIT will trigger a receive event, but it is not clear
	 *   whether we can reliably check that our FIN was ACK'ed from there.
	 *
	 * That means we have to compromise.  Instead of the proper approach,
	 * we complete our side of the close operation whenever:
	 *
	 * 1. all of our data was acknowledged, AND,
	 * 2. our FIN was sent, AND,
	 * 3a. our FIN was acknowledged, OR,
	 * 3b. we received a FIN from the other side.
	 *
	 * With the addition of rule 3b, we do not run into the above
	 * reliability problems, but we may return from SO_LINGER-blocked
	 * close calls too early and thus give callers a false impression of
	 * success.  TODO: if lwIP ever gets improved on this point, the code
	 * in this module should be rewritten to make use of the improvements.
	 *
	 * The set of rules is basically the same as for closing the PCB early
	 * as per tcpsock_may_close(), except with the check for our FIN being
	 * acknowledged.  Unfortunately only the FIN_WAIT_2, TIME_WAIT, and
	 * (reentered) CLOSED TCP states guarantee that there are no
	 * unacknowledged data segments anymore, so we may have to wait for
	 * reaching any one of these before we can actually finish closing the
	 * socket with tcp_close().
	 *
	 * In addition, lwIP does not tell us when our FIN gets acknowledged,
	 * so we have to use polling and direct access to lwIP's PCB fields
	 * instead, just like lwIP's BSD API does.  There is no other way.
	 * Also, we may not even be able to send the FIN right away, in which
	 * case we must defer that until later.
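	 *
	 * To make these rules concrete: a close(2) on an ESTABLISHED
	 * connection sends our FIN (rule 2) and then suspends until polling
	 * shows the FIN as acknowledged (rule 3a).  On a CLOSE_WAIT
	 * connection, where the other side's FIN has already arrived (rule
	 * 3b), the close can instead complete as soon as all of our data has
	 * been acknowledged and the FIN has been enqueued.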
	 */
	if (tcp->tcp_pcb != NULL) {
		switch (tcp->tcp_pcb->state) {
		case CLOSE_WAIT:
		case CLOSING:
		case LAST_ACK:
			assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);

			/* FALLTHROUGH */
		case SYN_RCVD:
		case ESTABLISHED:
		case FIN_WAIT_1:
			/* First check if we should abort the connection. */
			if (force || rlen > 0)
				break;

			/*
			 * If we have not sent a FIN yet, try sending it now;
			 * if all other conditions are met for closing the
			 * socket, successful FIN transmission will complete
			 * the close.  Otherwise, perform the close check
			 * explicitly.
			 */
			if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
				tcpsock_send_fin(tcp);
			else if (tcpsock_may_close(tcp))
				tcpsock_pcb_close(tcp);

			/*
			 * If at this point the PCB is gone, we managed to
			 * close the connection immediately, and the socket
			 * has already been cleaned up by now.  This may occur
			 * if there is no unacknowledged data and we already
			 * received a FIN earlier on.
			 */
			if (tcp->tcp_pcb == NULL)
				return OK;

			/*
			 * Complete the close operation at a later time.
			 * Adjust the polling interval, so that we can detect
			 * completion of the close as quickly as possible.
			 */
			tcp_poll(tcp->tcp_pcb, tcpsock_event_poll,
			    TCP_POLL_CLOSE_INTERVAL);

			return SUSPEND;

		default:
			/*
			 * The connection is either not yet established, or
			 * already in a state where we can close it right now.
			 */
			tcpsock_pcb_close(tcp);
		}
	}

	/*
	 * Abort the connection if the PCB is still around, and clean up the
	 * socket.  We cannot let tcpsock_cleanup() free the socket object
	 * yet, because we are still in the callback from libsockevent, and
	 * the latter cannot handle the socket object being freed from here.
	 */
	if (tcp->tcp_pcb != NULL)
		tcpsock_pcb_abort(tcp);

	(void)tcpsock_cleanup(tcp, FALSE /*may_free*/);

	return OK;
}

/*
 * Free up a closed TCP socket.
 */
static void
tcpsock_free(struct sock * sock)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	assert(tcp->tcp_pcb == NULL);
	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_snd.ts_head == NULL);
	assert(tcp->tcp_rcv.tr_len == 0);
	assert(tcp->tcp_rcv.tr_head == NULL);

	TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next);
}

/* This table maps TCP states from lwIP numbers to NetBSD numbers. */
static const struct {
	int tsm_tstate;
	int tsm_sostate;
} tcpsock_statemap[] = {
	[CLOSED]	= { TCPS_CLOSED,	SS_ISDISCONNECTED	},
	[LISTEN]	= { TCPS_LISTEN,	0			},
	[SYN_SENT]	= { TCPS_SYN_SENT,	SS_ISCONNECTING		},
	[SYN_RCVD]	= { TCPS_SYN_RECEIVED,	SS_ISCONNECTING		},
	[ESTABLISHED]	= { TCPS_ESTABLISHED,	SS_ISCONNECTED		},
	[FIN_WAIT_1]	= { TCPS_FIN_WAIT_1,	SS_ISDISCONNECTING	},
	[FIN_WAIT_2]	= { TCPS_FIN_WAIT_2,	SS_ISDISCONNECTING	},
	[CLOSE_WAIT]	= { TCPS_CLOSE_WAIT,	SS_ISCONNECTED		},
	[CLOSING]	= { TCPS_CLOSING,	SS_ISDISCONNECTING	},
	[LAST_ACK]	= { TCPS_LAST_ACK,	SS_ISDISCONNECTING	},
	[TIME_WAIT]	= { TCPS_TIME_WAIT,	SS_ISDISCONNECTED	},
};

/*
 * Fill the given kinfo_pcb sysctl(7) structure with information about the TCP
 * PCB identified by the given pointer.
 */
static void
tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr)
{
	const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr;
	struct tcpsock *tcp;

	/*
	 * Not all TCP PCBs have an associated tcpsock structure.  We take
	 * care to clear the callback argument for PCBs on any of the TCP
	 * lists, so we can use that callback argument to determine whether
	 * there is an associated tcpsock structure--with one exception: PCBs
	 * for incoming connections that have not yet been fully established
	 * (i.e., in SYN_RCVD state).
	 * These will have the callback argument of the listening socket
	 * (which itself may already have been deallocated at this point) but
	 * should not be considered as associated with the listening socket's
	 * tcpsock structure.
	 */
	if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) {
		tcp = (struct tcpsock *)pcb->callback_arg;
		assert(tcp >= tcp_array &&
		    tcp < &tcp_array[__arraycount(tcp_array)]);

		/* TODO: change this so that sockstat(1) may work one day. */
		ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp);
	} else {
		/* No tcpsock.  Could also be in TIME_WAIT state etc. */
		tcp = NULL;

		ki->ki_sostate = SS_NOFDREF;
	}

	ki->ki_type = SOCK_STREAM;

	if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) {
		ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate;
		/* TODO: this needs work, but does anything rely on it? */
		ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate;
	}

	/* Careful with the LISTEN state here (see below). */
	ipsock_get_info(ki, &pcb->local_ip, pcb->local_port, &pcb->remote_ip,
	    (pcb->state != LISTEN) ? pcb->remote_port : 0);

	/*
	 * The PCBs for listening sockets are actually smaller.  Thus, for
	 * listening sockets, do not attempt to access any of the fields
	 * beyond those provided in the smaller structure.
	 */
	if (pcb->state == LISTEN) {
		assert(tcp != NULL);
		ki->ki_refs =
		    (uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head);
	} else {
		if (tcp_nagle_disabled(pcb))
			ki->ki_tflags |= NETBSD_TF_NODELAY;

		if (tcp != NULL) {
			ki->ki_rcvq = tcp->tcp_rcv.tr_len;
			ki->ki_sndq = tcp->tcp_snd.ts_len;

			if (tcp->tcp_listener != NULL)
				ki->ki_nextref = (uint64_t)(uintptr_t)
				    TAILQ_NEXT(tcp, tcp_queue.tq_next);
		}
	}
}

/*
 * Given either NULL or a previously returned TCP PCB pointer, return the
 * first or next TCP PCB pointer, or NULL if there are no more.  The current
 * implementation supports only one iteration at a time.
 */
static const void *
tcpsock_enum(const void * last)
{
	static struct {
		unsigned int i;
		const struct tcp_pcb *pcb;
	} iter;

	if (last != NULL && (iter.pcb = iter.pcb->next) != NULL)
		return (const void *)iter.pcb;

	for (iter.i = (last != NULL) ? iter.i + 1 : 0;
	    iter.i < __arraycount(tcp_pcb_lists); iter.i++) {
		if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL)
			return (const void *)iter.pcb;
	}

	return NULL;
}

/*
 * Obtain the list of TCP protocol control blocks, for sysctl(7).
 */
static ssize_t
tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
	struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{

	return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info);
}

static const struct sockevent_ops tcpsock_ops = {
	.sop_bind		= tcpsock_bind,
	.sop_listen		= tcpsock_listen,
	.sop_connect		= tcpsock_connect,
	.sop_accept		= tcpsock_accept,
	.sop_test_accept	= tcpsock_test_accept,
	.sop_pre_send		= tcpsock_pre_send,
	.sop_send		= tcpsock_send,
	.sop_test_send		= tcpsock_test_send,
	.sop_pre_recv		= tcpsock_pre_recv,
	.sop_recv		= tcpsock_recv,
	.sop_test_recv		= tcpsock_test_recv,
	.sop_ioctl		= ifconf_ioctl,
	.sop_setsockmask	= tcpsock_setsockmask,
	.sop_setsockopt		= tcpsock_setsockopt,
	.sop_getsockopt		= tcpsock_getsockopt,
	.sop_getsockname	= tcpsock_getsockname,
	.sop_getpeername	= tcpsock_getpeername,
	.sop_shutdown		= tcpsock_shutdown,
	.sop_close		= tcpsock_close,
	.sop_free		= tcpsock_free
};
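/*
 * Illustrative sketch only, deliberately kept out of the build: how the
 * tcpsock_enum() and tcpsock_get_info() helpers above are driven together,
 * in the way util_pcblist() is expected to do it.  The function name and the
 * copy-out step below are hypothetical; see the util_pcblist()
 * implementation for the real thing.
 */
#if 0
static void
tcpsock_dump_pcbs(void)
{
	struct kinfo_pcb ki;
	const void *ptr;

	/* Walk all lwIP TCP PCB lists, one PCB at a time. */
	for (ptr = tcpsock_enum(NULL); ptr != NULL; ptr = tcpsock_enum(ptr)) {
		memset(&ki, 0, sizeof(ki));

		tcpsock_get_info(&ki, ptr);

		/* ...copy 'ki' out to the sysctl(7) caller here... */
	}
}
#endif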