/* LWIP service - pktsock.c - packet code shared between UDP and RAW */ #include "lwip.h" #include "pktsock.h" #include "ifaddr.h" /* * This buffer should be much bigger (at least 10KB, according to RFC 3542), * but we do not support the ancillary options that take so much space anyway. */ #define PKTSOCK_CTLBUF_SIZE 256 static char pktsock_ctlbuf[PKTSOCK_CTLBUF_SIZE]; /* * Header structures with ancillary data for received packets. The reason that * we do not simply use a generic pkthdr structure with ip_addr_t source and * destination addresses, is that for UDP packets, we put this structure in * place of the received (ethernet and IP headers), and such a full structure * (including IPv6-size addresses) would not fit in the header space for IPv4 * packets. So instead we use two address structures, one for IPv4 and one for * IPv6, and a generic header structure on top of it, which also identifies * which address structure is underneath. The combination of the address * structure and the header structure must fit in the IP header. The IPv6 * packet header is already so close to the limit here that we have to use * packed addresses. For IPv4 we use the regular addresses for simplicity. */ struct pkthdr { uint16_t port; /* source port number (UDP only) */ uint8_t dstif; /* interface that received the pkt */ uint8_t addrif; /* interface that accepted the pkt */ uint8_t tos; /* TOS/TC value from the IP header */ uint8_t ttl; /* TTL/HL value from the IP header */ uint8_t flags; /* packet flags (PKTHF_) */ uint8_t _unused; /* all that is still available.. */ }; #define PKTHF_IPV6 0x01 /* packet has IPv6 header */ #define PKTHF_MCAST 0x02 /* packet has multicast destination */ #define PKTHF_BCAST 0x04 /* packet has broadcast destination */ struct pktaddr4 { ip4_addr_t srcaddr; ip4_addr_t dstaddr; }; struct pktaddr6 { ip6_addr_p_t srcaddr; ip6_addr_p_t dstaddr; }; /* * Create a packet socket. Relay parameters and return values to and from the * IP module's socket creation function. This function must not allocate any * resources in any form, as socket creation may still fail later, in which * case no destruction function is called. */ int pktsock_socket(struct pktsock * pkt, int domain, size_t sndbuf, size_t rcvbuf, struct sock ** sockp) { pkt->pkt_rcvhead = NULL; pkt->pkt_rcvtailp = &pkt->pkt_rcvhead; pkt->pkt_rcvlen = 0; mcast_reset(&pkt->pkt_mcast); memset(&pkt->pkt_srcaddr, 0, sizeof(pkt->pkt_srcaddr)); pkt->pkt_ifindex = 0; /* * Any PKTF_ type flags should be initialized on the socket only after * the following call, as this call will clear the flags field. For * now, no PKTF_ flags need to be set by default, though. */ return ipsock_socket(&pkt->pkt_ipsock, domain, sndbuf, rcvbuf, sockp); } /* * Return TRUE if the given packet can and should be received on the given * socket, or FALSE if there is a reason not to receive the packet. */ static int pktsock_may_recv(struct pktsock * pkt, struct pbuf * pbuf) { /* * By policy, multicast packets should not be received on sockets of * which the owning application is not multicast aware. */ if (ip_addr_ismulticast(ip_current_dest_addr()) && !(ipsock_get_flag(&pkt->pkt_ipsock, PKTF_MCAWARE))) return FALSE; /* * Due to fragment reassembly, we might end up with packets that take * up more buffer space than their byte size, even after rounding up * the latter. The user probably does not want packets to get dropped * for that reason, e.g. when they set a 64K limit and the packet ends * up being estimated as 65K and dropped. So, we test against * 'pbuf->tot_len' rather than the rounded-up packet size. However, * 'pkt->pkt_rcvlen' itself is increased by the rounded-up packet size * when enqueuing the packet, so that we still count the memory * consumption (generally) conservatively, which is what we want. */ return (pkt->pkt_rcvlen + pbuf->tot_len <= ipsock_get_rcvbuf(&pkt->pkt_ipsock)); } /* * Check whether the given packet can and should be received on the given * socket. If so, return the amount of space for ancillary information that * will be necessary for the packet. If not, return a negative value. */ int pktsock_test_input(struct pktsock * pkt, struct pbuf * pbuf) { /* * This check will be done again in pktsock_input(), but this function * is called for raw packets only (not for UDP packets) and, if this * (cheap) check fails, we can avoid a (rather expensive) packet copy. */ if (!pktsock_may_recv(pkt, pbuf)) return -1; if (ip_current_is_v6()) return (int)(sizeof(struct pktaddr6) + sizeof(struct pkthdr)); else return (int)(sizeof(struct pktaddr4) + sizeof(struct pkthdr)); } /* * A packet has arrived on a packet socket. We own the given packet buffer, * and so we must free it if we do not want to keep it. */ void pktsock_input(struct pktsock * pkt, struct pbuf * pbuf, const ip_addr_t * srcaddr, uint16_t port) { struct pktaddr4 pktaddr4; struct pktaddr6 pktaddr6; struct pkthdr pkthdr; void *pktaddr; struct ifdev *ifdev; size_t pktaddrlen; /* * We are going to mess with the packet's header and contents, so we * must be the exclusive owner of the packet. For UDP packets, lwIP * must have made a copy for us in case of non-exclusive delivery * (e.g., multicast packets). For raw packets, we have made a copy of * the packet ourselves just before the call to this function. */ if (pbuf->ref != 1) panic("input packet has multiple references!"); /* If the packet should not be received on this socket, drop it. */ if (!pktsock_may_recv(pkt, pbuf)) { pbuf_free(pbuf); return; } /* * Enqueue the packet. Overwrite the leading IP header with packet * information that is used at the time of receipt by userland. The * data structures are such that the information always fits in what * was the IP header. The reference count check earlier ensures that * we never overwrite part of a packet that is still in use elsewhere. */ if (ip_current_is_v6()) { assert(IP_IS_V6(srcaddr)); assert(ip6_current_dest_addr() != NULL); ip6_addr_copy_to_packed(pktaddr6.srcaddr, *ip_2_ip6(srcaddr)); ip6_addr_copy_to_packed(pktaddr6.dstaddr, *ip6_current_dest_addr()); pktaddr = &pktaddr6; pktaddrlen = sizeof(pktaddr6); assert(pktaddrlen + sizeof(pkthdr) <= IP6_HLEN); pkthdr.tos = IP6H_TC(ip6_current_header()); pkthdr.ttl = IP6H_HOPLIM(ip6_current_header()); pkthdr.flags = PKTHF_IPV6; } else { assert(IP_IS_V4(srcaddr)); assert(ip4_current_dest_addr() != NULL); memcpy(&pktaddr4.srcaddr, ip_2_ip4(srcaddr), sizeof(pktaddr4.srcaddr)); memcpy(&pktaddr4.dstaddr, ip4_current_dest_addr(), sizeof(pktaddr4.srcaddr)); pktaddr = &pktaddr4; pktaddrlen = sizeof(pktaddr4); assert(pktaddrlen + sizeof(pkthdr) <= IP_HLEN); pkthdr.tos = IPH_TOS(ip4_current_header()); pkthdr.ttl = IPH_TTL(ip4_current_header()); pkthdr.flags = 0; } /* * Save both the interface on which the packet was received (for * PKTINFO) and the interface that owns the destination address of the * packet (for the source address's zone ID). */ assert(ip_current_input_netif() != NULL); ifdev = netif_get_ifdev(ip_current_input_netif()); pkthdr.dstif = (uint16_t)ifdev_get_index(ifdev); assert(ip_current_netif() != NULL); ifdev = netif_get_ifdev(ip_current_netif()); pkthdr.addrif = (uint16_t)ifdev_get_index(ifdev); if ((pbuf->flags & PBUF_FLAG_LLMCAST) || ip_addr_ismulticast(ip_current_dest_addr())) pkthdr.flags |= PKTHF_MCAST; else if ((pbuf->flags & PBUF_FLAG_LLBCAST) || ip_addr_isbroadcast(ip_current_dest_addr(), ip_current_netif())) pkthdr.flags |= PKTHF_BCAST; pkthdr.port = port; util_pbuf_header(pbuf, sizeof(pkthdr)); memcpy(pbuf->payload, &pkthdr, sizeof(pkthdr)); util_pbuf_header(pbuf, pktaddrlen); memcpy(pbuf->payload, pktaddr, pktaddrlen); util_pbuf_header(pbuf, -(int)(sizeof(pkthdr) + pktaddrlen)); *pkt->pkt_rcvtailp = pbuf; pkt->pkt_rcvtailp = pchain_end(pbuf); pkt->pkt_rcvlen += pchain_size(pbuf); sockevent_raise(ipsock_get_sock(&pkt->pkt_ipsock), SEV_RECV); } /* * Obtain interface and source address information for an outgoing packet. In * particular, parse any IPV6_PKTINFO options provided as either sticky options * on the socket 'pkt' or as ancillary options in the packet options 'pkto'. * On success, return OK, with 'ifdevp' set to either the outgoing interface to * use for the packet, or NULL if no outgoing interface was specified using * either of the aforementioned options. If, and only if, 'ifdevp' is set to * an actual interface (i.e., not NULL), then 'src_addrp' is filled with either * a locally owned, validated, unicast address to use as source of the packet, * or the unspecified ('any') address if no source address was specified using * the options. On failure, return a negative error code. */ int pktsock_get_pktinfo(struct pktsock * pkt, struct pktopt * pkto, struct ifdev ** ifdevp, ip_addr_t * src_addrp) { struct ifdev *ifdev, *ifdev2; ip_addr_t ipaddr; uint32_t ifindex; int r; /* We support only IPV6_PKTINFO. IP_PKTINFO is not supported. */ if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) { *ifdevp = NULL; return OK; } /* * TODO: we are spending a lot of effort on initializing and copying * stuff around, even just to find out whether there is anything to do * at all here. See if this can be optimized. */ ip_addr_set_zero_ip6(&ipaddr); /* * Ancillary data takes precedence over sticky options. We treat the * source address and interface index fields as separate, overriding * each earlier value only if non-zero. TODO: is that correct? */ if (pkto->pkto_flags & PKTOF_PKTINFO) { memcpy(ip_2_ip6(&ipaddr)->addr, &pkto->pkto_srcaddr.addr, sizeof(ip_2_ip6(&ipaddr)->addr)); ifindex = pkto->pkto_ifindex; } else ifindex = 0; if (ip6_addr_isany(ip_2_ip6(&ipaddr))) memcpy(ip_2_ip6(&ipaddr)->addr, &pkt->pkt_srcaddr.addr, sizeof(ip_2_ip6(&ipaddr)->addr)); if (ifindex == 0) ifindex = pkt->pkt_ifindex; /* If both fields are blank, there is nothing more to do. */ if (ip6_addr_isany(ip_2_ip6(&ipaddr)) && ifindex == 0) { *ifdevp = NULL; return OK; } /* If an interface index is specified, it must be valid. */ ifdev = NULL; if (ifindex != 0 && (ifdev = ifdev_get_by_index(ifindex)) == NULL) return ENXIO; /* * Use the interface index to set a zone on the source address, if the * source address has a scope. */ if (ip6_addr_has_scope(ip_2_ip6(&ipaddr), IP6_UNKNOWN)) { if (ifindex == 0) return EADDRNOTAVAIL; ip6_addr_set_zone(ip_2_ip6(&ipaddr), ifindex); } /* * We need to validate the given address just as thoroughly as an * address given through bind(). If we don't, we could allow forged * source addresses etcetera. To be sure: this call may change the * address to an IPv4 type address if needed. */ if ((r = ipsock_check_src_addr(pktsock_get_ipsock(pkt), &ipaddr, FALSE /*allow_mcast*/, &ifdev2)) != OK) return r; if (ifdev2 != NULL) { if (ifdev == NULL) ifdev = ifdev2; else if (ifdev != ifdev2) return EADDRNOTAVAIL; } else { /* * There should be no cases where the (non-multicast) address * successfully parsed, is not unspecified, and yet did not map * to an interface. Eliminate the possibility anyway by * throwing an error for this case. As a result, we are left * with one of two cases: * * 1) ifdevp is not NULL, and src_addrp is unspecified; * 2) ifdevp is not NULL, and src_addrp is a locally assigned * (unicast) address. * * This is why we need not fill src_addrp when ifdevp is NULL. */ if (!ip_addr_isany(&ipaddr)) return EADDRNOTAVAIL; } *ifdevp = ifdev; if (ifdev != NULL) *src_addrp = ipaddr; return OK; } /* * Parse a chunk of user-provided control data, on an IPv4 socket provided as * 'pkt'. The control chunk is given as 'cmsg', and the length of the data * following the control header (possibly zero) is given as 'len'. On success, * return OK, with any parsed options merged into the set of packet options * 'pkto'. On failure, return a negative error code. */ static int pktsock_parse_ctl_v4(struct pktsock * pkt __unused, struct cmsghdr * cmsg, socklen_t len, struct pktopt * pkto) { uint8_t byte; int val; if (cmsg->cmsg_level != IPPROTO_IP) return EAFNOSUPPORT; switch (cmsg->cmsg_type) { case IP_TOS: /* * Some userland code (bind's libisc in particular) supplies * a single byte instead of a full integer for this option. * We go out of our way to accept that format, too. */ if (len != sizeof(val) && len != sizeof(byte)) return EINVAL; if (len == sizeof(byte)) { memcpy(&byte, CMSG_DATA(cmsg), sizeof(byte)); val = (int)byte; } else memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); if (val < 0 || val > UINT8_MAX) return EINVAL; pkto->pkto_flags |= PKTOF_TOS; pkto->pkto_tos = (uint8_t)val; return OK; case IP_TTL: if (len != sizeof(val)) return EINVAL; memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); if (val < 0 || val > UINT8_MAX) return EINVAL; pkto->pkto_flags |= PKTOF_TTL; pkto->pkto_ttl = (uint8_t)val; return OK; /* * Implementing IP_PKTINFO might be a bit harder than its IPV6_PKTINFO * sibling, because it would require the use of zone IDs (interface * indices) for IPv4, which is not supported yet. */ } return EINVAL; } /* * Parse a chunk of user-provided control data, on an IPv6 socket provided as * 'pkt'. The control chunk is given as 'cmsg', and the length of the data * following the control header (possibly zero) is given as 'len'. On success, * return OK, with any parsed options merged into the set of packet options * 'pkto'. On failure, return a negative error code. */ static int pktsock_parse_ctl_v6(struct pktsock * pkt, struct cmsghdr * cmsg, socklen_t len, struct pktopt * pkto) { struct in6_pktinfo ipi6; int val; if (cmsg->cmsg_level != IPPROTO_IPV6) return EAFNOSUPPORT; switch (cmsg->cmsg_type) { case IPV6_TCLASS: if (len != sizeof(val)) return EINVAL; memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); if (val < -1 || val > UINT8_MAX) return EINVAL; if (val == -1) val = 0; pkto->pkto_flags |= PKTOF_TOS; pkto->pkto_tos = (uint8_t)val; return OK; case IPV6_HOPLIMIT: if (len != sizeof(val)) return EINVAL; memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); if (val < -1 || val > UINT8_MAX) return EINVAL; if (val == -1) val = IP_DEFAULT_TTL; pkto->pkto_flags |= PKTOF_TTL; pkto->pkto_ttl = (uint8_t)val; return OK; case IPV6_PKTINFO: if (len != sizeof(ipi6)) return EINVAL; memcpy(&ipi6, CMSG_DATA(cmsg), sizeof(ipi6)); pkto->pkto_flags |= PKTOF_PKTINFO; memcpy(&pkto->pkto_srcaddr.addr, &ipi6.ipi6_addr, sizeof(pkto->pkto_srcaddr.addr)); pkto->pkto_ifindex = ipi6.ipi6_ifindex; return OK; case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return EINVAL; memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); if (val < -1 || val > 1) return EINVAL; /* TODO: not supported by lwIP, but needed by applications. */ return OK; } return EINVAL; } /* * Copy in and parse control data, as part of sending a packet on socket 'pkt'. * The control data is accessible through 'ctl', with a user-provided length of * 'ctl_len'. On success, return OK, with any parsed packet options stored in * 'pkto'. On failure, return a negative error code. */ int pktsock_get_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl, socklen_t ctl_len, struct pktopt * pkto) { struct msghdr msghdr; struct cmsghdr *cmsg; socklen_t left, len; int r; /* The default: no packet options are being overridden. */ assert(pkto->pkto_flags == 0); /* If no control length is given, we are done here. */ if (ctl_len == 0) return OK; /* * For now, we put a rather aggressive limit on the size of the control * data. We copy in and parse the whole thing in a single buffer. */ if (ctl_len > sizeof(pktsock_ctlbuf)) { printf("LWIP: too much control data given (%u bytes)\n", ctl_len); return ENOBUFS; } if ((r = sockdriver_copyin(ctl, 0, pktsock_ctlbuf, ctl_len)) != OK) return r; memset(&msghdr, 0, sizeof(msghdr)); msghdr.msg_control = pktsock_ctlbuf; msghdr.msg_controllen = ctl_len; for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL; cmsg = CMSG_NXTHDR(&msghdr, cmsg)) { /* Check for bogus lengths. */ assert((socklen_t)((char *)cmsg - pktsock_ctlbuf) <= ctl_len); left = ctl_len - (socklen_t)((char *)cmsg - pktsock_ctlbuf); assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */ if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) { printf("LWIP: malformed control data rejected\n"); return EINVAL; } len = cmsg->cmsg_len - CMSG_LEN(0); if (ipsock_is_ipv6(&pkt->pkt_ipsock)) r = pktsock_parse_ctl_v6(pkt, cmsg, len, pkto); else r = pktsock_parse_ctl_v4(pkt, cmsg, len, pkto); if (r != OK) return r; } return OK; } /* * Copy in the packet data from the calling user process, and store it in the * buffer 'pbuf' that must already have been allocated with the appropriate * size. */ int pktsock_get_data(struct pktsock * pkt, const struct sockdriver_data * data, size_t len, struct pbuf * pbuf) { return util_copy_data(data, len, 0, pbuf, 0, TRUE /*copy_in*/); } /* * Dequeue and free the head of the receive queue of a packet socket. */ static void pktsock_dequeue(struct pktsock * pkt) { struct pbuf *pbuf, **pnext; size_t size; pbuf = pkt->pkt_rcvhead; assert(pbuf != NULL); pnext = pchain_end(pbuf); size = pchain_size(pbuf); if ((pkt->pkt_rcvhead = *pnext) == NULL) pkt->pkt_rcvtailp = &pkt->pkt_rcvhead; assert(pkt->pkt_rcvlen >= size); pkt->pkt_rcvlen -= size; *pnext = NULL; pbuf_free(pbuf); } /* * Perform preliminary checks on a receive request. */ int pktsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused, int flags) { /* * We accept the same flags across all socket types in LWIP, and then * simply ignore the ones we do not support for packet sockets. */ if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0) return EOPNOTSUPP; return OK; } /* * Add a chunk of control data to the global control buffer, starting from * offset 'off'. The chunk has the given level and type, and its data is given * in the buffer 'ptr' with size 'len'. Return the (padded) size of the chunk * that was generated as a result. */ static size_t pktsock_add_ctl(int level, int type, void * ptr, socklen_t len, size_t off) { struct cmsghdr cmsg; size_t size; size = CMSG_SPACE(len); /* * The global control buffer must be large enough to store one chunk * of each of the supported options. If this panic triggers, increase * PKTSOCK_CTLBUF_SIZE by as much as needed. */ if (off + size > sizeof(pktsock_ctlbuf)) panic("control buffer too small, increase " "PKTSOCK_CTLBUF_SIZE"); memset(&cmsg, 0, sizeof(cmsg)); cmsg.cmsg_len = CMSG_LEN(len); cmsg.cmsg_level = level; cmsg.cmsg_type = type; /* * Clear any padding space. This can be optimized, but in any case we * must be careful not to copy out any bytes that have not been * initialized at all. */ memset(&pktsock_ctlbuf[off], 0, size); memcpy(&pktsock_ctlbuf[off], &cmsg, sizeof(cmsg)); memcpy(CMSG_DATA((struct cmsghdr *)&pktsock_ctlbuf[off]), ptr, len); return size; } /* * Generate and copy out control data, as part of delivering a packet from * socket 'pkt' to userland. The control data buffer is given as 'ctl', with * a user-given length of 'ctl_len' bytes. The packet's header information is * provided as 'pkthdr', and its source and destination addresses as 'pktaddr', * which maybe a pktaddr4 or pktaddr6 structure depending on the value of the * PKTHF_IPV6 flag in the 'flags' field in 'pkthdr'. Note that we support * dual-stack sockets, and as such it is possible that the socket is of domain * AF_INET6 while the received packet is an IPv4 packet. On success, return * the size of the control data copied out (possibly zero). If more control * data were generated than copied out, also merge the MSG_CTRUNC flag into * 'rflags'. On failure, return a negative error code. */ static int pktsock_put_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl, socklen_t ctl_len, struct pkthdr * pkthdr, void * pktaddr, int * rflags) { struct pktaddr6 *pktaddr6; struct pktaddr4 *pktaddr4; struct in_pktinfo ipi; struct in6_pktinfo ipi6; ip_addr_t ipaddr; unsigned int flags; uint8_t byte; size_t off; int r, val; flags = ipsock_get_flags(&pkt->pkt_ipsock); if (!(flags & (PKTF_RECVINFO | PKTF_RECVTOS | PKTF_RECVTTL))) return 0; /* * Important: all generated control chunks must fit in the global * control buffer together. When adding more options here, ensure that * the control buffer remains large enough to receive all options at * once. See also the panic in pktsock_add_ctl(). */ off = 0; /* * IPv6 sockets may receive IPv4 packets. The ancillary data is in the * format corresponding to the socket, which means we may have to * convert any IPv4 addresses from the packet to IPv4-mapped IPv6 * addresses for the ancillary data, just like the source address. */ if (ipsock_is_ipv6(&pkt->pkt_ipsock)) { if (flags & PKTF_RECVTTL) { val = pkthdr->ttl; off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_HOPLIMIT, &val, sizeof(val), off); } if (flags & PKTF_RECVTOS) { val = pkthdr->tos; off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_TCLASS, &val, sizeof(val), off); } if (flags & PKTF_RECVINFO) { memset(&ipi6, 0, sizeof(ipi6)); if (pkthdr->flags & PKTHF_IPV6) { pktaddr6 = (struct pktaddr6 *)pktaddr; memcpy(&ipi6.ipi6_addr, &pktaddr6->dstaddr, sizeof(ipi6.ipi6_addr)); } else { pktaddr4 = (struct pktaddr4 *)pktaddr; addr_make_v4mapped_v6(&ipaddr, &pktaddr4->dstaddr); memcpy(&ipi6.ipi6_addr, ip_2_ip6(&ipaddr)->addr, sizeof(ipi6.ipi6_addr)); } ipi6.ipi6_ifindex = pkthdr->dstif; off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_PKTINFO, &ipi6, sizeof(ipi6), off); } } else { if (flags & PKTF_RECVTTL) { byte = pkthdr->ttl; off += pktsock_add_ctl(IPPROTO_IP, IP_TTL, &byte, sizeof(byte), off); } if (flags & PKTF_RECVINFO) { assert(!(pkthdr->flags & PKTHF_IPV6)); pktaddr4 = (struct pktaddr4 *)pktaddr; memset(&ipi, 0, sizeof(ipi)); memcpy(&ipi.ipi_addr, &pktaddr4->dstaddr, sizeof(ipi.ipi_addr)); ipi.ipi_ifindex = pkthdr->dstif; off += pktsock_add_ctl(IPPROTO_IP, IP_PKTINFO, &ipi, sizeof(ipi), off); } } assert(off > 0); if (ctl_len >= off) ctl_len = off; else *rflags |= MSG_CTRUNC; if (ctl_len > 0 && (r = sockdriver_copyout(ctl, 0, pktsock_ctlbuf, ctl_len)) != OK) return r; return ctl_len; } /* * Receive data on a packet socket. */ int pktsock_recv(struct sock * sock, const struct sockdriver_data * data, size_t len, size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len, socklen_t * ctl_off, struct sockaddr * addr, socklen_t * addr_len, endpoint_t user_endpt __unused, int flags, size_t min __unused, int * rflags) { struct pktsock *pkt = (struct pktsock *)sock; struct pktaddr4 pktaddr4; struct pktaddr6 pktaddr6; struct pkthdr pkthdr; void *pktaddr; struct pbuf *pbuf; ip_addr_t srcaddr; int r; if ((pbuf = pkt->pkt_rcvhead) == NULL) return SUSPEND; /* * Get the ancillary data for the packet. The format of the ancillary * data depends on the received packet type, which may be different * from the socket type. */ util_pbuf_header(pbuf, sizeof(pkthdr)); memcpy(&pkthdr, pbuf->payload, sizeof(pkthdr)); if (pkthdr.flags & PKTHF_IPV6) { util_pbuf_header(pbuf, sizeof(pktaddr6)); memcpy(&pktaddr6, pbuf->payload, sizeof(pktaddr6)); pktaddr = &pktaddr6; ip_addr_copy_from_ip6_packed(srcaddr, pktaddr6.srcaddr); if (ip6_addr_has_scope(ip_2_ip6(&srcaddr), IP6_UNICAST)) ip6_addr_set_zone(ip_2_ip6(&srcaddr), pkthdr.addrif); util_pbuf_header(pbuf, -(int)(sizeof(pkthdr) + sizeof(pktaddr6))); } else { util_pbuf_header(pbuf, sizeof(pktaddr4)); memcpy(&pktaddr4, pbuf->payload, sizeof(pktaddr4)); pktaddr = &pktaddr4; ip_addr_copy_from_ip4(srcaddr, pktaddr4.srcaddr); util_pbuf_header(pbuf, -(int)(sizeof(pkthdr) + sizeof(pktaddr4))); } /* Copy out the packet data to the calling user process. */ if (len >= pbuf->tot_len) len = pbuf->tot_len; else *rflags |= MSG_TRUNC; r = util_copy_data(data, len, 0, pbuf, 0, FALSE /*copy_in*/); if (r != OK) return r; /* Generate and copy out ancillary (control) data, if requested. */ if ((r = pktsock_put_ctl(pkt, ctl, ctl_len, &pkthdr, pktaddr, rflags)) < 0) return r; /* Store the source IP address. */ ipsock_put_addr(&pkt->pkt_ipsock, addr, addr_len, &srcaddr, pkthdr.port); /* Set multicast or broadcast message flag, if applicable. */ if (pkthdr.flags & PKTHF_MCAST) *rflags |= MSG_MCAST; else if (pkthdr.flags & PKTHF_BCAST) *rflags |= MSG_BCAST; /* Discard the packet now, unless we were instructed to peek only. */ if (!(flags & MSG_PEEK)) pktsock_dequeue(pkt); /* Return the received part of the packet length. */ *off = len; *ctl_off = r; return OK; } /* * Test whether data can be received on a packet socket, and if so, how many * bytes of data. */ int pktsock_test_recv(struct sock * sock, size_t min __unused, size_t * size) { struct pktsock *pkt = (struct pktsock *)sock; if (pkt->pkt_rcvhead == NULL) return SUSPEND; if (size != NULL) *size = pkt->pkt_rcvhead->tot_len; return OK; } /* * The caller has performed a multicast operation on the given socket. Thus, * the caller is multicast aware. Remember this, because that means the socket * may also receive traffic to multicast destinations. */ void pktsock_set_mcaware(struct pktsock * pkt) { ipsock_set_flag(&pkt->pkt_ipsock, PKTF_MCAWARE); } /* * Set socket options on a packet socket. */ int pktsock_setsockopt(struct pktsock * pkt, int level, int name, const struct sockdriver_data * data, socklen_t len, struct ipopts * ipopts) { struct ip_mreq imr; struct ipv6_mreq ipv6mr; struct in6_pktinfo ipi6; ip_addr_t ipaddr, ifaddr; struct ifdev *ifdev; unsigned int flag; uint32_t ifindex; int r, val, has_scope; switch (level) { case IPPROTO_IP: if (ipsock_is_ipv6(&pkt->pkt_ipsock)) break; switch (name) { case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: pktsock_set_mcaware(pkt); if ((r = sockdriver_copyin_opt(data, &imr, sizeof(imr), len)) != OK) return r; ip_addr_set_ip4_u32(&ipaddr, imr.imr_multiaddr.s_addr); ip_addr_set_ip4_u32(&ifaddr, imr.imr_interface.s_addr); if (!ip_addr_isany(&ifaddr)) { ifdev = ifaddr_map_by_addr(&ifaddr); if (ifdev == NULL) return EADDRNOTAVAIL; } else ifdev = NULL; if (name == IP_ADD_MEMBERSHIP) r = mcast_join(&pkt->pkt_mcast, &ipaddr, ifdev); else r = mcast_leave(&pkt->pkt_mcast, &ipaddr, ifdev); return r; case IP_RECVTTL: case IP_RECVPKTINFO: if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), len)) != OK) return r; switch (name) { case IP_RECVTTL: flag = PKTF_RECVTTL; break; case IP_RECVPKTINFO: flag = PKTF_RECVINFO; break; default: flag = 0; assert(0); break; } if (val) ipsock_set_flag(&pkt->pkt_ipsock, flag); else ipsock_clear_flag(&pkt->pkt_ipsock, flag); return OK; } break; case IPPROTO_IPV6: if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) break; switch (name) { case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: pktsock_set_mcaware(pkt); if ((r = sockdriver_copyin_opt(data, &ipv6mr, sizeof(ipv6mr), len)) != OK) return r; ip_addr_set_zero_ip6(&ipaddr); memcpy(ip_2_ip6(&ipaddr)->addr, &ipv6mr.ipv6mr_multiaddr, sizeof(ip_2_ip6(&ipaddr)->addr)); /* * We currently do not support joining IPv4 multicast * groups on IPv6 sockets. The reason for this is that * this would require decisions on what to do if the * socket is set to V6ONLY later, as well as various * additional exceptions for a case that hopefully * doesn't occur in practice anyway. */ if (ip6_addr_isipv4mappedipv6(ip_2_ip6(&ipaddr))) return EADDRNOTAVAIL; has_scope = ip6_addr_has_scope(ip_2_ip6(&ipaddr), IP6_UNKNOWN); if ((ifindex = ipv6mr.ipv6mr_interface) != 0) { ifdev = ifdev_get_by_index(ifindex); if (ifdev == NULL) return ENXIO; if (has_scope) ip6_addr_set_zone(ip_2_ip6(&ipaddr), ifindex); } else { if (has_scope) return EADDRNOTAVAIL; ifdev = NULL; } if (name == IPV6_JOIN_GROUP) r = mcast_join(&pkt->pkt_mcast, &ipaddr, ifdev); else r = mcast_leave(&pkt->pkt_mcast, &ipaddr, ifdev); return r; case IPV6_USE_MIN_MTU: if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), len)) != OK) return r; if (val < -1 || val > 1) return EINVAL; /* * lwIP does not support path MTU discovery, so do * nothing. TODO: see if this is actually good enough. */ return OK; case IPV6_PKTINFO: if ((r = sockdriver_copyin_opt(data, &ipi6, sizeof(ipi6), len)) != OK) return r; /* * Simply copy in what is given. The values will be * parsed only once a packet is sent, in * pktsock_get_pktinfo(). Otherwise, if we perform * checks here, they may be outdated by the time the * values are actually used. */ memcpy(&pkt->pkt_srcaddr.addr, &ipi6.ipi6_addr, sizeof(pkt->pkt_srcaddr.addr)); pkt->pkt_ifindex = ipi6.ipi6_ifindex; return OK; case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVTCLASS: if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), len)) != OK) return r; switch (name) { case IPV6_RECVPKTINFO: flag = PKTF_RECVINFO; break; case IPV6_RECVHOPLIMIT: flag = PKTF_RECVTTL; break; case IPV6_RECVTCLASS: flag = PKTF_RECVTOS; break; default: flag = 0; assert(0); break; } if (val) ipsock_set_flag(&pkt->pkt_ipsock, flag); else ipsock_clear_flag(&pkt->pkt_ipsock, flag); return OK; } break; } return ipsock_setsockopt(&pkt->pkt_ipsock, level, name, data, len, ipopts); } /* * Retrieve socket options on a packet socket. */ int pktsock_getsockopt(struct pktsock * pkt, int level, int name, const struct sockdriver_data * data, socklen_t * len, struct ipopts * ipopts) { struct in6_pktinfo ipi6; unsigned int flag; int val; switch (level) { case IPPROTO_IP: if (ipsock_is_ipv6(&pkt->pkt_ipsock)) break; switch (name) { case IP_RECVTTL: case IP_RECVPKTINFO: switch (name) { case IP_RECVTTL: flag = PKTF_RECVTTL; break; case IP_RECVPKTINFO: flag = PKTF_RECVINFO; break; default: flag = 0; assert(0); break; } val = !!(ipsock_get_flag(&pkt->pkt_ipsock, flag)); return sockdriver_copyout_opt(data, &val, sizeof(val), len); } break; case IPPROTO_IPV6: if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) break; switch (name) { case IPV6_USE_MIN_MTU: /* * TODO: sort out exactly what lwIP actually supports * in the way of path MTU discovery. Value 1 means * that path MTU discovery is disabled and packets are * sent at the minimum MTU (RFC 3542). */ val = 1; return sockdriver_copyout_opt(data, &val, sizeof(val), len); case IPV6_PKTINFO: memset(&ipi6, 0, sizeof(ipi6)); /* * Simply copy out whatever was given before. These * fields are initialized to zero on socket creation. */ memcpy(&ipi6.ipi6_addr, &pkt->pkt_srcaddr.addr, sizeof(ipi6.ipi6_addr)); ipi6.ipi6_ifindex = pkt->pkt_ifindex; return sockdriver_copyout_opt(data, &ipi6, sizeof(ipi6), len); case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVTCLASS: switch (name) { case IPV6_RECVPKTINFO: flag = PKTF_RECVINFO; break; case IPV6_RECVHOPLIMIT: flag = PKTF_RECVTTL; break; case IPV6_RECVTCLASS: flag = PKTF_RECVTOS; break; default: flag = 0; assert(0); break; } val = !!(ipsock_get_flag(&pkt->pkt_ipsock, flag)); return sockdriver_copyout_opt(data, &val, sizeof(val), len); } break; } return ipsock_getsockopt(&pkt->pkt_ipsock, level, name, data, len, ipopts); } /* * Drain the receive queue of a packet socket. */ static void pktsock_drain(struct pktsock * pkt) { while (pkt->pkt_rcvhead != NULL) pktsock_dequeue(pkt); assert(pkt->pkt_rcvlen == 0); assert(pkt->pkt_rcvtailp == &pkt->pkt_rcvhead); } /* * Shut down a packet socket for reading and/or writing. */ void pktsock_shutdown(struct pktsock * pkt, unsigned int mask) { if (mask & SFL_SHUT_RD) pktsock_drain(pkt); } /* * Close a packet socket. */ void pktsock_close(struct pktsock * pkt) { pktsock_drain(pkt); mcast_leave_all(&pkt->pkt_mcast); } /* * Return the rounded-up number of bytes in the packet socket's receive queue, * for sysctl(7). NetBSD returns the used portion of each buffer, but that * would be quite some extra effort for us (TODO). */ size_t pktsock_get_recvlen(struct pktsock * pkt) { return pkt->pkt_rcvlen; }