phunix/minix/net/lwip/rawsock.c
David van Moolenbroek ef8d499e2d Add lwip: a new lwIP-based TCP/IP service
This commit adds a new TCP/IP service to MINIX 3.  As its core, the
service uses the lwIP TCP/IP stack for maintenance reasons.  The
service aims to be compatible with NetBSD userland, including its
low-level network management utilities.  It also aims to support
modern features such as IPv6.  In summary, the new LWIP service has
support for the following main features:

- TCP, UDP, RAW sockets with mostly standard BSD API semantics;
- IPv6 support: host mode (complete) and router mode (partial);
- most of the standard BSD API socket options (SO_);
- all of the standard BSD API message flags (MSG_);
- the most used protocol-specific socket and control options;
- a default loopback interface and the ability to create one more;
- configuration-free ethernet interfaces and driver tracking;
- queuing and multiple concurrent requests to each ethernet driver;
- standard ioctl(2)-based BSD interface management;
- radix tree backed, destination-based routing;
- routing sockets for standard BSD route reporting and management;
- multicast traffic and multicast group membership tracking;
- Berkeley Packet Filter (BPF) devices;
- standard and custom sysctl(7) nodes for many internals;
- a slab allocation based, hybrid static/dynamic memory pool model.

Many of its modules come with fairly elaborate comments that cover
many aspects of what is going on.  The service is primarily a socket
driver built on top of the libsockdriver library, but for BPF devices
it is at the same time also a character driver.

Change-Id: Ib0c02736234b21143915e5fcc0fda8fe408f046f
2017-04-30 13:16:03 +00:00

1342 lines
36 KiB
C

/* LWIP service - rawsock.c - RAW sockets */
/*
* For IPv6 sockets, this module attempts to implement a part of RFC 3542, but
* currently not more than what is supported by lwIP and/or what is expected by
* a handful of standard utilities (dhcpcd, ping6, traceroute6..).
*
* For general understanding, be aware that IPv4 raw sockets always receive
* packets including the IP header, and may be used to send packets including
* the IP header if IP_HDRINCL is set, while IPv6 raw sockets always send and
* receive actual payloads only, using ancillary (control) data to set and
* retrieve per-packet IP header fields.
*
* For packet headers we follow general BSD semantics. For example, some IPv4
* header fields are swapped both when sending and when receiving. Also, like
* on NetBSD, IPPROTO_RAW is not a special value in any way.
*/
#include "lwip.h"
#include "ifaddr.h"
#include "pktsock.h"
#include "lwip/raw.h"
#include "lwip/inet_chksum.h"
#include <net/route.h>
#include <netinet/icmp6.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
/* The number of RAW sockets. Inherited from the lwIP configuration. */
#define NR_RAWSOCK MEMP_NUM_RAW_PCB
/*
* Outgoing packets are not getting buffered, so the send buffer size simply
* determines the maximum size for sent packets. The send buffer maximum is
* therefore limited to the maximum size of a single packet (64K-1 bytes),
* which is already enforced by lwIP's 16-bit length parameter to pbuf_alloc().
*
* The actual transmission may enforce a lower limit, though. The full packet
* size must not exceed the same 64K-1 limit, and that includes any headers
* that still have to be prepended to the given packet. The size of those
* headers depends on the socket type (IPv4/IPv6) and the IP_HDRINCL setting.
*
* The default is equal to the maximum here, because if a (by definition,
* privileged) application wishes to send large raw packets, it probably has a
* good reason, and we do not want to get in its way.
*/
#define RAW_MAX_PAYLOAD (UINT16_MAX)
#define RAW_SNDBUF_MIN 1 /* minimum RAW send buffer size */
#define RAW_SNDBUF_DEF RAW_MAX_PAYLOAD /* default RAW send buffer size */
#define RAW_SNDBUF_MAX RAW_MAX_PAYLOAD /* maximum RAW send buffer size */
#define RAW_RCVBUF_MIN MEMPOOL_BUFSIZE /* minimum RAW receive buffer size */
#define RAW_RCVBUF_DEF 32768 /* default RAW receive buffer size */
#define RAW_RCVBUF_MAX 65536 /* maximum RAW receive buffer size */
static struct rawsock {
struct pktsock raw_pktsock; /* packet socket object */
struct raw_pcb *raw_pcb; /* lwIP RAW control block */
TAILQ_ENTRY(rawsock) raw_next; /* next in active/free list */
struct icmp6_filter raw_icmp6filter; /* ICMPv6 type filter */
} raw_array[NR_RAWSOCK];
static TAILQ_HEAD(, rawsock) raw_freelist; /* list of free RAW sockets */
static TAILQ_HEAD(, rawsock) raw_activelist; /* list, in-use RAW sockets */
static const struct sockevent_ops rawsock_ops;
#define rawsock_get_sock(raw) (ipsock_get_sock(rawsock_get_ipsock(raw)))
#define rawsock_get_ipsock(raw) (pktsock_get_ipsock(&(raw)->raw_pktsock))
#define rawsock_is_ipv6(raw) (ipsock_is_ipv6(rawsock_get_ipsock(raw)))
#define rawsock_is_v6only(raw) (ipsock_is_v6only(rawsock_get_ipsock(raw)))
#define rawsock_is_conn(raw) \
(raw_flags((raw)->raw_pcb) & RAW_FLAGS_CONNECTED)
#define rawsock_is_hdrincl(raw) \
(raw_flags((raw)->raw_pcb) & RAW_FLAGS_HDRINCL)
static ssize_t rawsock_pcblist(struct rmib_call *, struct rmib_node *,
struct rmib_oldp *, struct rmib_newp *);
/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_RAW subtree. */
/* All dynamically numbered; the sendspace/recvspace entries are ours. */
static struct rmib_node net_inet_raw_table[] = {
RMIB_INT(RMIB_RO, RAW_SNDBUF_DEF, "sendspace",
"Default RAW send buffer size"),
RMIB_INT(RMIB_RO, RAW_RCVBUF_DEF, "recvspace",
"Default RAW receive buffer size"),
RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, rawsock_pcblist, "pcblist",
"RAW IP protocol control block list"),
};
static struct rmib_node net_inet_raw_node =
RMIB_NODE(RMIB_RO, net_inet_raw_table, "raw", "RAW IPv4 settings");
static struct rmib_node net_inet6_raw6_node =
RMIB_NODE(RMIB_RO, net_inet_raw_table, "raw6", "RAW IPv6 settings");
/*
* Initialize the raw sockets module.
*/
void
rawsock_init(void)
{
unsigned int slot;
/* Initialize the list of free RAW sockets. */
TAILQ_INIT(&raw_freelist);
for (slot = 0; slot < __arraycount(raw_array); slot++)
TAILQ_INSERT_TAIL(&raw_freelist, &raw_array[slot], raw_next);
/* Initialize the list of active RAW sockets. */
TAILQ_INIT(&raw_activelist);
/* Register the net.inet.raw and net.inet6.raw6 RMIB subtrees. */
mibtree_register_inet(PF_INET, IPPROTO_RAW, &net_inet_raw_node);
mibtree_register_inet(PF_INET6, IPPROTO_RAW, &net_inet6_raw6_node);
}
/*
* Check whether the given arrived IPv6 packet is fit to be received on the
* given raw socket.
*/
static int
rawsock_check_v6(struct rawsock * raw, struct pbuf * pbuf)
{
uint8_t type;
assert(rawsock_is_ipv6(raw));
/*
* For ICMPv6 packets, test against the configured type filter.
*/
if (raw->raw_pcb->protocol == IPPROTO_ICMPV6) {
if (pbuf->len < offsetof(struct icmp6_hdr, icmp6_dataun))
return FALSE;
memcpy(&type, &((struct icmp6_hdr *)pbuf->payload)->icmp6_type,
sizeof(type));
if (!ICMP6_FILTER_WILLPASS((int)type, &raw->raw_icmp6filter))
return FALSE;
}
/*
* For ICMPv6 packets, or if IPV6_CHECKSUM is enabled, we have to
* verify the checksum of the packet before passing it to the user.
* This is costly, but it needs to be done and lwIP is not doing it for
* us (as of writing, anyway), even though it maintains the offset..
*/
if (raw->raw_pcb->chksum_reqd &&
(pbuf->tot_len < raw->raw_pcb->chksum_offset + sizeof(uint16_t) ||
ip6_chksum_pseudo(pbuf, raw->raw_pcb->protocol, pbuf->tot_len,
ip6_current_src_addr(), ip6_current_dest_addr()) != 0)) {
return FALSE;
}
/* No reason to filter out this packet. */
return TRUE;
}
/*
* Adjust the given arrived IPv4 packet by changing the length and offset
* fields to host-byte order, as is done by the BSDs. This effectively mirrors
* the swapping part of the preparation done on IPv4 packets being sent if the
* IP_HDRINCL socket option is enabled.
*/
static void
rawsock_adjust_v4(struct pbuf * pbuf)
{
struct ip_hdr *iphdr;
if (pbuf->len < sizeof(struct ip_hdr))
return;
iphdr = (struct ip_hdr *)pbuf->payload;
/*
* W. Richard Stevens also mentions ip_id, but at least on NetBSD that
* field seems to be swapped neither when sending nor when receiving..
*/
IPH_LEN(iphdr) = htons(IPH_LEN(iphdr));
IPH_OFFSET(iphdr) = htons(IPH_OFFSET(iphdr));
}
/*
* A packet has arrived on a raw socket. Since the same packet may have to be
* delivered to multiple raw sockets, we always return 0 (= not consumed) from
* this function. As such, we must make a copy of the given packet if we want
* to keep it, and never free it.
*/
static uint8_t
rawsock_input(void * arg, struct raw_pcb * pcb __unused, struct pbuf * psrc,
const ip_addr_t * srcaddr)
{
struct rawsock *raw = (struct rawsock *)arg;
struct pbuf *pbuf;
int off, hdrlen;
assert(raw->raw_pcb == pcb);
/*
* If adding this packet would cause the receive buffer to go beyond
* the current limit, drop the new packet. This is just an estimation,
* because the copy we are about to make may not take the exact same
* amount of memory, due to the fact that 1) the pbuf we're given has
* an unknown set of headers in front of it, and 2) we need to store
* extra information in our copy. The return value of this call, if
* not -1, is the number of bytes we need to reserve to store that
* extra information.
*/
if ((hdrlen = pktsock_test_input(&raw->raw_pktsock, psrc)) < 0)
return 0;
/*
* Raw IPv6 sockets receive only the actual packet data, whereas raw
* IPv4 sockets receive the IP header as well.
*/
if (ip_current_is_v6()) {
off = ip_current_header_tot_len();
util_pbuf_header(psrc, -off);
if (!rawsock_check_v6(raw, psrc)) {
util_pbuf_header(psrc, off);
return 0;
}
} else {
/*
* For IPv6 sockets, drop the packet if it was sent as an IPv4
* packet and checksumming is enabled (this includes ICMPv6).
* Otherwise, the packet would bypass the above checks that we
* perform on IPv6 packets. Applications that want to use a
* dual-stack protocol with checksumming will have to do the
* checksum verification part themselves. Presumably the two
* different pseudoheaders would result in different checksums
* anyhow, so it would be useless to try to support that.
*
* Beyond that, for IPv4 packets on IPv6 sockets, hide the IPv4
* header.
*/
if (rawsock_is_ipv6(raw)) {
if (raw->raw_pcb->chksum_reqd)
return 0;
off = IP_HLEN;
util_pbuf_header(psrc, -off);
} else
off = 0;
}
/*
* We need to make a copy of the incoming packet. If we eat the one
* given to us, this will 1) stop any other raw sockets from getting
* the same packet, 2) allow a single raw socket to discard all TCP/UDP
* traffic, and 3) present us with a problem on how to store ancillary
* data. Raw sockets are not that performance critical so the extra
* copy -even when not always necessary- is not that big of a deal.
*/
if ((pbuf = pchain_alloc(PBUF_RAW, hdrlen + psrc->tot_len)) == NULL) {
if (off > 0)
util_pbuf_header(psrc, off);
return 0;
}
util_pbuf_header(pbuf, -hdrlen);
if (pbuf_copy(pbuf, psrc) != ERR_OK)
panic("unexpected pbuf copy failure");
pbuf->flags |= psrc->flags & (PBUF_FLAG_LLMCAST | PBUF_FLAG_LLBCAST);
if (off > 0)
util_pbuf_header(psrc, off);
if (!rawsock_is_ipv6(raw))
rawsock_adjust_v4(pbuf);
pktsock_input(&raw->raw_pktsock, pbuf, srcaddr, 0);
return 0;
}
/*
* Create a raw socket.
*/
sockid_t
rawsock_socket(int domain, int protocol, struct sock ** sockp,
const struct sockevent_ops ** ops)
{
struct rawsock *raw;
unsigned int flags;
uint8_t ip_type;
if (protocol < 0 || protocol > UINT8_MAX)
return EPROTONOSUPPORT;
if (TAILQ_EMPTY(&raw_freelist))
return ENOBUFS;
raw = TAILQ_FIRST(&raw_freelist);
/*
* Initialize the structure. Do not memset it to zero, as it is still
* part of the linked free list. Initialization may still fail.
*/
ip_type = pktsock_socket(&raw->raw_pktsock, domain, RAW_SNDBUF_DEF,
RAW_RCVBUF_DEF, sockp);
/* We should have enough PCBs so this call should not fail.. */
if ((raw->raw_pcb = raw_new_ip_type(ip_type, protocol)) == NULL)
return ENOBUFS;
raw_recv(raw->raw_pcb, rawsock_input, (void *)raw);
/* By default, the multicast TTL is 1 and looping is enabled. */
raw_set_multicast_ttl(raw->raw_pcb, 1);
flags = raw_flags(raw->raw_pcb);
raw_setflags(raw->raw_pcb, flags | RAW_FLAGS_MULTICAST_LOOP);
/*
* For ICMPv6, checksum generation and verification is mandatory and
* type filtering of incoming packets is supported (RFC 3542). For all
* other IPv6 protocols, checksumming may be turned on by the user.
*/
if (rawsock_is_ipv6(raw) && protocol == IPPROTO_ICMPV6) {
raw->raw_pcb->chksum_reqd = 1;
raw->raw_pcb->chksum_offset =
offsetof(struct icmp6_hdr, icmp6_cksum);
ICMP6_FILTER_SETPASSALL(&raw->raw_icmp6filter);
} else
raw->raw_pcb->chksum_reqd = 0;
TAILQ_REMOVE(&raw_freelist, raw, raw_next);
TAILQ_INSERT_TAIL(&raw_activelist, raw, raw_next);
*ops = &rawsock_ops;
return SOCKID_RAW | (sockid_t)(raw - raw_array);
}
/*
* Bind a raw socket to a local address.
*/
static int
rawsock_bind(struct sock * sock, const struct sockaddr * addr,
socklen_t addr_len, endpoint_t user_endpt)
{
struct rawsock *raw = (struct rawsock *)sock;
ip_addr_t ipaddr;
err_t err;
int r;
/*
* Raw sockets may be rebound even if that is not too useful. However,
* we do not allow (re)binding when the socket is connected, so as to
* eliminate any problems with source and destination type mismatches:
* such mismatches are detected at connect time, and rebinding would
* avoid those, possibly triggering lwIP asserts as a result.
*/
if (rawsock_is_conn(raw))
return EINVAL;
if ((r = ipsock_get_src_addr(rawsock_get_ipsock(raw), addr, addr_len,
user_endpt, &raw->raw_pcb->local_ip, 0 /*local_port*/,
TRUE /*allow_mcast*/, &ipaddr, NULL /*portp*/)) != OK)
return r;
err = raw_bind(raw->raw_pcb, &ipaddr);
return util_convert_err(err);
}
/*
* Connect a raw socket to a remote address.
*/
static int
rawsock_connect(struct sock * sock, const struct sockaddr * addr,
socklen_t addr_len, endpoint_t user_endpt __unused)
{
struct rawsock *raw = (struct rawsock *)sock;
const ip_addr_t *src_addr;
ip_addr_t dst_addr;
struct ifdev *ifdev;
uint32_t ifindex, ifindex2;
err_t err;
int r;
/*
* One may "unconnect" socket by providing an address with family
* AF_UNSPEC.
*/
if (addr_is_unspec(addr, addr_len)) {
raw_disconnect(raw->raw_pcb);
return OK;
}
if ((r = ipsock_get_dst_addr(rawsock_get_ipsock(raw), addr, addr_len,
&raw->raw_pcb->local_ip, &dst_addr, NULL /*dst_port*/)) != OK)
return r;
/*
* Bind explicitly to a source address if the PCB is not bound to one
* yet. This is expected in the BSD socket API, but lwIP does not do
* it for us.
*/
if (ip_addr_isany(&raw->raw_pcb->local_ip)) {
/* Help the multicast case a bit, if possible. */
ifdev = NULL;
if (ip_addr_ismulticast(&dst_addr)) {
ifindex = pktsock_get_ifindex(&raw->raw_pktsock);
ifindex2 = raw_get_multicast_netif_index(raw->raw_pcb);
if (ifindex == 0)
ifindex = ifindex2;
if (ifindex != 0) {
ifdev = ifdev_get_by_index(ifindex);
if (ifdev == NULL)
return ENXIO;
}
}
src_addr = ifaddr_select(&dst_addr, ifdev, NULL /*ifdevp*/);
if (src_addr == NULL)
return EHOSTUNREACH;
err = raw_bind(raw->raw_pcb, src_addr);
if (err != ERR_OK)
return util_convert_err(err);
}
/*
* Connecting a raw socket serves two main purposes: 1) the socket uses
* the address as destination when sending, and 2) the socket receives
* packets from only the connected address.
*/
err = raw_connect(raw->raw_pcb, &dst_addr);
if (err != ERR_OK)
return util_convert_err(err);
return OK;
}
/*
* Perform preliminary checks on a send request.
*/
static int
rawsock_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused,
const struct sockaddr * addr, socklen_t addr_len __unused,
endpoint_t user_endpt __unused, int flags)
{
struct rawsock *raw = (struct rawsock *)sock;
if ((flags & ~MSG_DONTROUTE) != 0)
return EOPNOTSUPP;
if (!rawsock_is_conn(raw) && addr == NULL)
return EDESTADDRREQ;
/*
* This is only one part of the length check. The rest is done from
* rawsock_send(), once we have more information.
*/
if (len > ipsock_get_sndbuf(rawsock_get_ipsock(raw)))
return EMSGSIZE;
return OK;
}
/*
* Swap IP-level options between the RAW PCB and the packet options structure,
* for all options that have their flag set in the packet options structure.
* This function is called twice when sending a packet. The result is that the
* flagged options are overridden for only the packet being sent.
*/
static void
rawsock_swap_opt(struct rawsock * raw, struct pktopt * pkto)
{
uint8_t tos, ttl, mcast_ttl;
if (pkto->pkto_flags & PKTOF_TOS) {
tos = raw->raw_pcb->tos;
raw->raw_pcb->tos = pkto->pkto_tos;
pkto->pkto_tos = tos;
}
if (pkto->pkto_flags & PKTOF_TTL) {
ttl = raw->raw_pcb->ttl;
mcast_ttl = raw_get_multicast_ttl(raw->raw_pcb);
raw->raw_pcb->ttl = pkto->pkto_ttl;
raw_set_multicast_ttl(raw->raw_pcb, pkto->pkto_ttl);
pkto->pkto_ttl = ttl;
pkto->pkto_mcast_ttl = mcast_ttl;
}
}
/*
* We are about to send the given packet that already includes an IPv4 header,
* because the IP_HDRINCL option is enabled on a raw IPv4 socket. Prepare the
* IPv4 header for sending, by modifying a few fields in it, as expected by
* userland.
*/
static int
rawsock_prepare_hdrincl(struct rawsock * raw, struct pbuf * pbuf,
const ip_addr_t * src_addr)
{
struct ip_hdr *iphdr;
size_t hlen;
/*
* lwIP obtains the destination address from the IP packet header in
* this case, so make sure the packet has a full-sized header.
*/
if (pbuf->len < sizeof(struct ip_hdr))
return EINVAL;
iphdr = (struct ip_hdr *)pbuf->payload;
/*
* Fill in the source address if it is not set, and do the byte
* swapping and checksum computation common for the BSDs, without which
* ping(8) and traceroute(8) do not work properly. We consider this a
* convenience feature, so malformed packets are simply sent as is.
* TODO: deal with type punning..
*/
hlen = (size_t)IPH_HL(iphdr) << 2;
if (pbuf->len >= hlen) {
/* Fill in the source address if it is blank. */
if (iphdr->src.addr == PP_HTONL(INADDR_ANY)) {
assert(IP_IS_V4(src_addr));
iphdr->src.addr = ip_addr_get_ip4_u32(src_addr);
}
IPH_LEN(iphdr) = htons(IPH_LEN(iphdr));
IPH_OFFSET(iphdr) = htons(IPH_OFFSET(iphdr));
IPH_CHKSUM(iphdr) = 0;
IPH_CHKSUM(iphdr) = inet_chksum(iphdr, hlen);
}
return OK;
}
/*
* Send a packet on a raw socket.
*/
static int
rawsock_send(struct sock * sock, const struct sockdriver_data * data,
size_t len, size_t * off, const struct sockdriver_data * ctl __unused,
socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
const struct sockaddr * addr, socklen_t addr_len,
endpoint_t user_endpt __unused, int flags, size_t min __unused)
{
struct rawsock *raw = (struct rawsock *)sock;
struct pktopt pktopt;
struct pbuf *pbuf;
struct ifdev *ifdev;
struct netif *netif;
const ip_addr_t *dst_addrp, *src_addrp;
ip_addr_t src_addr, dst_addr; /* for storage only; not always used! */
size_t hdrlen;
uint32_t ifindex;
err_t err;
int r;
/* Copy in and parse any packet options. */
pktopt.pkto_flags = 0;
if ((r = pktsock_get_ctl(&raw->raw_pktsock, ctl, ctl_len,
&pktopt)) != OK)
return r;
/*
* For a more in-depth explanation of what is going on here, see the
* udpsock module, which has largely the same code but with more
* elaborate comments.
*/
/*
* Start by checking whether the source address and/or the outgoing
* interface are overridden using sticky and/or ancillary options.
*/
if ((r = pktsock_get_pktinfo(&raw->raw_pktsock, &pktopt, &ifdev,
&src_addr)) != OK)
return r;
if (ifdev != NULL && !ip_addr_isany(&src_addr)) {
/* This is guaranteed to be a proper local unicast address. */
src_addrp = &src_addr;
} else {
src_addrp = &raw->raw_pcb->local_ip;
/*
* If the socket is bound to a multicast address, use the
* unspecified ('any') address as source address instead. A
* real source address will then be selected further below.
*/
if (ip_addr_ismulticast(src_addrp))
src_addrp = IP46_ADDR_ANY(IP_GET_TYPE(src_addrp));
}
/*
* Determine the destination address to use. If the socket is
* connected, always ignore any address provided in the send call.
*/
if (!rawsock_is_conn(raw)) {
assert(addr != NULL); /* already checked in pre_send */
if ((r = ipsock_get_dst_addr(rawsock_get_ipsock(raw), addr,
addr_len, src_addrp, &dst_addr, NULL /*dst_port*/)) != OK)
return r;
dst_addrp = &dst_addr;
} else
dst_addrp = &raw->raw_pcb->remote_ip;
/*
* If the destination is a multicast address, select the outgoing
* interface based on the multicast interface index, if one is set.
* This must however *not* override an interface index already
* specified using IPV6_PKTINFO, as per RFC 3542 Sec. 6.7.
*/
if (ifdev == NULL && ip_addr_ismulticast(dst_addrp)) {
ifindex = raw_get_multicast_netif_index(raw->raw_pcb);
if (ifindex != NETIF_NO_INDEX)
ifdev = ifdev_get_by_index(ifindex); /* (may fail) */
}
/*
* If an interface has been determined already now, the send operation
* will bypass routing. In that case, we must perform our own checks
* on address zone violations, because those will not be made anywhere
* else. Subsequent steps below will never introduce violations.
*/
if (ifdev != NULL && IP_IS_V6(dst_addrp)) {
if (ifaddr_is_zone_mismatch(ip_2_ip6(dst_addrp), ifdev))
return EHOSTUNREACH;
if (IP_IS_V6(src_addrp) &&
ifaddr_is_zone_mismatch(ip_2_ip6(src_addrp), ifdev))
return EHOSTUNREACH;
}
/*
* If we do not yet have an interface at this point, perform a route
* lookup to determine the outgoing interface, unless MSG_DONTROUTE is
* set.
*/
if (ifdev == NULL) {
if (!(flags & MSG_DONTROUTE)) {
/*
* ip_route() should never be called with an
* IPADDR_TYPE_ANY type address. This is a lwIP-
* internal requirement; while we override both routing
* functions, we do not deviate from it.
*/
if (IP_IS_ANY_TYPE_VAL(*src_addrp))
src_addrp =
IP46_ADDR_ANY(IP_GET_TYPE(dst_addrp));
/* Perform the route lookup. */
if ((netif = ip_route(src_addrp, dst_addrp)) == NULL)
return EHOSTUNREACH;
ifdev = netif_get_ifdev(netif);
} else {
if ((ifdev = ifaddr_map_by_subnet(dst_addrp)) == NULL)
return EHOSTUNREACH;
}
}
/*
* At this point we have an outgoing interface. If we do not have a
* source address yet, pick one now. As a sidenote, if the destination
* address is scoped but has no zone, we could also fill in the zone
* now. We let lwIP handle that instead, though.
*/
assert(ifdev != NULL);
if (ip_addr_isany(src_addrp)) {
src_addrp = ifaddr_select(dst_addrp, ifdev, NULL /*ifdevp*/);
if (src_addrp == NULL)
return EHOSTUNREACH;
}
/*
* Now that we know the full conditions of what we are about to send,
* check whether the packet size leaves enough room for lwIP to prepend
* headers. If so, allocate a chain of pbufs for the packet.
*/
assert(len <= RAW_MAX_PAYLOAD);
if (rawsock_is_hdrincl(raw))
hdrlen = 0;
else if (IP_IS_V6(dst_addrp))
hdrlen = IP6_HLEN;
else
hdrlen = IP_HLEN;
if (hdrlen + len > RAW_MAX_PAYLOAD)
return EMSGSIZE;
if ((pbuf = pchain_alloc(PBUF_IP, len)) == NULL)
return ENOBUFS;
/* Copy in the packet data. */
if ((r = pktsock_get_data(&raw->raw_pktsock, data, len, pbuf)) != OK) {
pbuf_free(pbuf);
return r;
}
/*
* If the user has turned on IPV6_CHECKSUM, ensure that the packet is
* not only large enough to have the checksum stored at the configured
* place, but also that the checksum fits within the first pbuf: if we
* do not test this here, an assert will trigger in lwIP later. Also
* zero out the checksum field first, because lwIP does not do that.
*/
if (raw->raw_pcb->chksum_reqd) {
if (pbuf->len < raw->raw_pcb->chksum_offset +
sizeof(uint16_t)) {
pbuf_free(pbuf);
return EINVAL;
}
memset((char *)pbuf->payload + raw->raw_pcb->chksum_offset, 0,
sizeof(uint16_t));
}
/*
* For sockets where an IPv4 header is already included in the packet,
* we need to alter a few header fields to be compatible with BSD.
*/
if (rawsock_is_hdrincl(raw) &&
(r = rawsock_prepare_hdrincl(raw, pbuf, src_addrp)) != OK) {
pbuf_free(pbuf);
return r;
}
/* Set broadcast/multicast flags for accounting purposes. */
if (ip_addr_ismulticast(dst_addrp))
pbuf->flags |= PBUF_FLAG_LLMCAST;
else if (ip_addr_isbroadcast(dst_addrp, ifdev_get_netif(ifdev)))
pbuf->flags |= PBUF_FLAG_LLBCAST;
/* Send the packet. */
rawsock_swap_opt(raw, &pktopt);
assert(!ip_addr_isany(src_addrp));
assert(!ip_addr_ismulticast(src_addrp));
err = raw_sendto_if_src(raw->raw_pcb, pbuf, dst_addrp,
ifdev_get_netif(ifdev), src_addrp);
rawsock_swap_opt(raw, &pktopt);
/* Free the pbuf again. */
pbuf_free(pbuf);
/*
* On success, make sure to return the size of the sent packet as well.
* As an aside: ctl_off need not be updated, as it is not returned.
*/
if ((r = util_convert_err(err)) == OK)
*off = len;
return r;
}
/*
* Update the set of flag-type socket options on a raw socket.
*/
static void
rawsock_setsockmask(struct sock * sock, unsigned int mask)
{
struct rawsock *raw = (struct rawsock *)sock;
/*
* FIXME: raw sockets are not supposed to have a broardcast check, so
* perhaps just remove this and instead always set SOF_BROADCAST?
*/
if (mask & SO_BROADCAST)
ip_set_option(raw->raw_pcb, SOF_BROADCAST);
else
ip_reset_option(raw->raw_pcb, SOF_BROADCAST);
}
/*
* Prepare a helper structure for IP-level option processing.
*/
static void
rawsock_get_ipopts(struct rawsock * raw, struct ipopts * ipopts)
{
ipopts->local_ip = &raw->raw_pcb->local_ip;
ipopts->remote_ip = &raw->raw_pcb->remote_ip;
ipopts->tos = &raw->raw_pcb->tos;
ipopts->ttl = &raw->raw_pcb->ttl;
ipopts->sndmin = RAW_SNDBUF_MIN;
ipopts->sndmax = RAW_SNDBUF_MAX;
ipopts->rcvmin = RAW_RCVBUF_MIN;
ipopts->rcvmax = RAW_RCVBUF_MAX;
}
/*
* Set socket options on a raw socket.
*/
static int
rawsock_setsockopt(struct sock * sock, int level, int name,
const struct sockdriver_data * data, socklen_t len)
{
struct rawsock *raw = (struct rawsock *)sock;
struct ipopts ipopts;
struct icmp6_filter filter;
ip_addr_t ipaddr;
struct in_addr in_addr;
struct ifdev *ifdev;
unsigned int flags;
uint32_t ifindex;
uint8_t byte;
int r, val;
/*
* Unfortunately, we have to duplicate most of the multicast options
* rather than sharing them with udpsock at the pktsock level. The
* reason is that each of the PCBs have their own multicast abstraction
* functions and so we cannot merge the rest. Same for getsockopt.
*/
switch (level) {
case IPPROTO_IP:
if (rawsock_is_ipv6(raw))
break;
switch (name) {
case IP_HDRINCL:
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val) {
raw_setflags(raw->raw_pcb,
raw_flags(raw->raw_pcb) |
RAW_FLAGS_HDRINCL);
} else {
raw_setflags(raw->raw_pcb,
raw_flags(raw->raw_pcb) &
~RAW_FLAGS_HDRINCL);
}
return OK;
case IP_MULTICAST_IF:
pktsock_set_mcaware(&raw->raw_pktsock);
if ((r = sockdriver_copyin_opt(data, &in_addr,
sizeof(in_addr), len)) != OK)
return r;
ip_addr_set_ip4_u32(&ipaddr, in_addr.s_addr);
if ((ifdev = ifaddr_map_by_addr(&ipaddr)) == NULL)
return EADDRNOTAVAIL;
raw_set_multicast_netif_index(raw->raw_pcb,
ifdev_get_index(ifdev));
return OK;
case IP_MULTICAST_LOOP:
pktsock_set_mcaware(&raw->raw_pktsock);
if ((r = sockdriver_copyin_opt(data, &byte,
sizeof(byte), len)) != OK)
return r;
flags = raw_flags(raw->raw_pcb);
if (byte)
flags |= RAW_FLAGS_MULTICAST_LOOP;
else
flags &= ~RAW_FLAGS_MULTICAST_LOOP;
raw_setflags(raw->raw_pcb, flags);
return OK;
case IP_MULTICAST_TTL:
pktsock_set_mcaware(&raw->raw_pktsock);
if ((r = sockdriver_copyin_opt(data, &byte,
sizeof(byte), len)) != OK)
return r;
raw_set_multicast_ttl(raw->raw_pcb, byte);
return OK;
}
break;
case IPPROTO_IPV6:
if (!rawsock_is_ipv6(raw))
break;
switch (name) {
case IPV6_CHECKSUM:
/* ICMPv6 checksums are always computed. */
if (raw->raw_pcb->protocol == IPPROTO_ICMPV6)
return EINVAL;
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val == -1) {
raw->raw_pcb->chksum_reqd = 0;
return OK;
} else if (val >= 0 && !(val & 1)) {
raw->raw_pcb->chksum_reqd = 1;
raw->raw_pcb->chksum_offset = val;
return OK;
} else
return EINVAL;
case IPV6_MULTICAST_IF:
pktsock_set_mcaware(&raw->raw_pktsock);
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val != 0) {
ifindex = (uint32_t)val;
ifdev = ifdev_get_by_index(ifindex);
if (ifdev == NULL)
return ENXIO;
} else
ifindex = NETIF_NO_INDEX;
raw_set_multicast_netif_index(raw->raw_pcb, ifindex);
return OK;
case IPV6_MULTICAST_LOOP:
pktsock_set_mcaware(&raw->raw_pktsock);
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val < 0 || val > 1)
return EINVAL;
flags = raw_flags(raw->raw_pcb);
if (val)
flags |= RAW_FLAGS_MULTICAST_LOOP;
else
flags &= ~RAW_FLAGS_MULTICAST_LOOP;
/*
* lwIP's IPv6 functionality does not actually check
* this flag at all yet. We set it in the hope that
* one day this will magically start working.
*/
raw_setflags(raw->raw_pcb, flags);
return OK;
case IPV6_MULTICAST_HOPS:
pktsock_set_mcaware(&raw->raw_pktsock);
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val < -1 || val > UINT8_MAX)
return EINVAL;
if (val == -1)
val = 1;
raw_set_multicast_ttl(raw->raw_pcb, val);
return OK;
}
break;
case IPPROTO_ICMPV6:
if (!rawsock_is_ipv6(raw) ||
raw->raw_pcb->protocol != IPPROTO_ICMPV6)
break;
switch (name) {
case ICMP6_FILTER:
/* Who comes up with these stupid exceptions? */
if (len == 0) {
ICMP6_FILTER_SETPASSALL(&raw->raw_icmp6filter);
return OK;
}
if ((r = sockdriver_copyin_opt(data, &filter,
sizeof(filter), len)) != OK)
return r;
/*
* As always, never copy in the data into the actual
* destination, as any copy may run into a copy fault
* halfway through, potentially leaving the destination
* in a half-updated and thus corrupted state.
*/
memcpy(&raw->raw_icmp6filter, &filter, sizeof(filter));
return OK;
}
}
rawsock_get_ipopts(raw, &ipopts);
return pktsock_setsockopt(&raw->raw_pktsock, level, name, data, len,
&ipopts);
}
/*
* Retrieve socket options on a raw socket.
*/
static int
rawsock_getsockopt(struct sock * sock, int level, int name,
const struct sockdriver_data * data, socklen_t * len)
{
struct rawsock *raw = (struct rawsock *)sock;
struct ipopts ipopts;
const ip4_addr_t *ip4addr;
struct in_addr in_addr;
struct ifdev *ifdev;
unsigned int flags;
uint32_t ifindex;
uint8_t byte;
int val;
switch (level) {
case IPPROTO_IP:
if (rawsock_is_ipv6(raw))
break;
switch (name) {
case IP_HDRINCL:
val = !!rawsock_is_hdrincl(raw);
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case IP_MULTICAST_IF:
ifindex = raw_get_multicast_netif_index(raw->raw_pcb);
/*
* Map back from the interface index to the IPv4
* address assigned to the corresponding interface.
* Should this not work out, return the 'any' address.
*/
if (ifindex != NETIF_NO_INDEX &&
(ifdev = ifdev_get_by_index(ifindex)) != NULL) {
ip4addr =
netif_ip4_addr(ifdev_get_netif(ifdev));
in_addr.s_addr = ip4_addr_get_u32(ip4addr);
} else
in_addr.s_addr = PP_HTONL(INADDR_ANY);
return sockdriver_copyout_opt(data, &in_addr,
sizeof(in_addr), len);
case IP_MULTICAST_LOOP:
flags = raw_flags(raw->raw_pcb);
byte = !!(flags & RAW_FLAGS_MULTICAST_LOOP);
return sockdriver_copyout_opt(data, &byte,
sizeof(byte), len);
case IP_MULTICAST_TTL:
byte = raw_get_multicast_ttl(raw->raw_pcb);
return sockdriver_copyout_opt(data, &byte,
sizeof(byte), len);
}
break;
case IPPROTO_IPV6:
if (!rawsock_is_ipv6(raw))
break;
switch (name) {
case IPV6_CHECKSUM:
if (raw->raw_pcb->chksum_reqd)
val = raw->raw_pcb->chksum_offset;
else
val = -1;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case IPV6_MULTICAST_IF:
ifindex = raw_get_multicast_netif_index(raw->raw_pcb);
val = (int)ifindex;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case IPV6_MULTICAST_LOOP:
flags = raw_flags(raw->raw_pcb);
val = !!(flags & RAW_FLAGS_MULTICAST_LOOP);
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case IPV6_MULTICAST_HOPS:
val = raw_get_multicast_ttl(raw->raw_pcb);
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
}
break;
case IPPROTO_ICMPV6:
if (!rawsock_is_ipv6(raw) ||
raw->raw_pcb->protocol != IPPROTO_ICMPV6)
break;
switch (name) {
case ICMP6_FILTER:
return sockdriver_copyout_opt(data,
&raw->raw_icmp6filter,
sizeof(raw->raw_icmp6filter), len);
}
break;
}
rawsock_get_ipopts(raw, &ipopts);
return pktsock_getsockopt(&raw->raw_pktsock, level, name, data, len,
&ipopts);
}
/*
* Retrieve the local socket address of a raw socket.
*/
static int
rawsock_getsockname(struct sock * sock, struct sockaddr * addr,
socklen_t * addr_len)
{
struct rawsock *raw = (struct rawsock *)sock;
ipsock_put_addr(rawsock_get_ipsock(raw), addr, addr_len,
&raw->raw_pcb->local_ip, 0 /*port*/);
return OK;
}
/*
* Retrieve the remote socket address of a raw socket.
*/
static int
rawsock_getpeername(struct sock * sock, struct sockaddr * addr,
socklen_t * addr_len)
{
struct rawsock *raw = (struct rawsock *)sock;
if (!rawsock_is_conn(raw))
return ENOTCONN;
ipsock_put_addr(rawsock_get_ipsock(raw), addr, addr_len,
&raw->raw_pcb->remote_ip, 0 /*port*/);
return OK;
}
/*
* Shut down a raw socket for reading and/or writing.
*/
static int
rawsock_shutdown(struct sock * sock, unsigned int mask)
{
struct rawsock *raw = (struct rawsock *)sock;
if (mask & SFL_SHUT_RD)
raw_recv(raw->raw_pcb, NULL, NULL);
pktsock_shutdown(&raw->raw_pktsock, mask);
return OK;
}
/*
* Close a raw socket.
*/
static int
rawsock_close(struct sock * sock, int force __unused)
{
struct rawsock *raw = (struct rawsock *)sock;
raw_recv(raw->raw_pcb, NULL, NULL);
raw_remove(raw->raw_pcb);
raw->raw_pcb = NULL;
pktsock_close(&raw->raw_pktsock);
return OK;
}
/*
* Free up a closed raw socket.
*/
static void
rawsock_free(struct sock * sock)
{
struct rawsock *raw = (struct rawsock *)sock;
assert(raw->raw_pcb == NULL);
TAILQ_REMOVE(&raw_activelist, raw, raw_next);
TAILQ_INSERT_HEAD(&raw_freelist, raw, raw_next);
}
/*
* Fill the given kinfo_pcb sysctl(7) structure with information about the RAW
* PCB identified by the given pointer.
*/
static void
rawsock_get_info(struct kinfo_pcb * ki, const void * ptr)
{
const struct raw_pcb *pcb = (const struct raw_pcb *)ptr;
struct rawsock *raw;
/* We iterate our own list so we can't find "strange" PCBs. */
raw = (struct rawsock *)pcb->recv_arg;
assert(raw >= raw_array &&
raw < &raw_array[__arraycount(raw_array)]);
ki->ki_type = SOCK_RAW;
ki->ki_protocol = pcb->protocol;
ipsock_get_info(ki, &pcb->local_ip, 0 /*local_port*/,
&raw->raw_pcb->remote_ip, 0 /*remote_port*/);
/* TODO: change this so that sockstat(1) may work one day. */
ki->ki_sockaddr = (uint64_t)(uintptr_t)rawsock_get_sock(raw);
ki->ki_rcvq = pktsock_get_recvlen(&raw->raw_pktsock);
if (rawsock_is_hdrincl(raw))
ki->ki_pflags |= INP_HDRINCL;
}
/*
* Given either NULL or a previously returned RAW PCB pointer, return the first
* or next RAW PCB pointer, or NULL if there are no more. lwIP does not expose
* 'raw_pcbs', but other modules in this service may also use RAW PCBs (which
* should then stay hidden), so we iterate through our own list instead.
*/
static const void *
rawsock_enum(const void * last)
{
const struct raw_pcb *pcb;
struct rawsock *raw;
if (last != NULL) {
pcb = (const struct raw_pcb *)last;
raw = (struct rawsock *)pcb->recv_arg;
assert(raw >= raw_array &&
raw < &raw_array[__arraycount(raw_array)]);
raw = TAILQ_NEXT(raw, raw_next);
} else
raw = TAILQ_FIRST(&raw_activelist);
if (raw != NULL)
return raw->raw_pcb;
else
return NULL;
}
/*
* Obtain the list of RAW protocol control blocks, for sysctl(7).
*/
static ssize_t
rawsock_pcblist(struct rmib_call * call, struct rmib_node * node,
struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{
return util_pcblist(call, oldp, rawsock_enum, rawsock_get_info);
}
static const struct sockevent_ops rawsock_ops = {
.sop_bind = rawsock_bind,
.sop_connect = rawsock_connect,
.sop_pre_send = rawsock_pre_send,
.sop_send = rawsock_send,
.sop_pre_recv = pktsock_pre_recv,
.sop_recv = pktsock_recv,
.sop_test_recv = pktsock_test_recv,
.sop_ioctl = ifconf_ioctl,
.sop_setsockmask = rawsock_setsockmask,
.sop_setsockopt = rawsock_setsockopt,
.sop_getsockopt = rawsock_getsockopt,
.sop_getsockname = rawsock_getsockname,
.sop_getpeername = rawsock_getpeername,
.sop_shutdown = rawsock_shutdown,
.sop_close = rawsock_close,
.sop_free = rawsock_free
};