
This commit adds a new TCP/IP service to MINIX 3. At its core, the service uses the lwIP TCP/IP stack for maintenance reasons. The service aims to be compatible with NetBSD userland, including its low-level network management utilities. It also aims to support modern features such as IPv6. In summary, the new LWIP service has support for the following main features:

- TCP, UDP, RAW sockets with mostly standard BSD API semantics;
- IPv6 support: host mode (complete) and router mode (partial);
- most of the standard BSD API socket options (SO_);
- all of the standard BSD API message flags (MSG_);
- the most used protocol-specific socket and control options;
- a default loopback interface and the ability to create one more;
- configuration-free ethernet interfaces and driver tracking;
- queuing and multiple concurrent requests to each ethernet driver;
- standard ioctl(2)-based BSD interface management;
- radix tree backed, destination-based routing;
- routing sockets for standard BSD route reporting and management;
- multicast traffic and multicast group membership tracking;
- Berkeley Packet Filter (BPF) devices;
- standard and custom sysctl(7) nodes for many internals;
- a slab allocation based, hybrid static/dynamic memory pool model.

Many of its modules come with fairly elaborate comments that cover many aspects of what is going on. The service is primarily a socket driver built on top of the libsockdriver library, but for BPF devices it is at the same time also a character driver.

Change-Id: Ib0c02736234b21143915e5fcc0fda8fe408f046f
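For context (not part of the original commit), below is a minimal, illustrative sketch of how a NetBSD-style userland program would typically use one of the BPF devices implemented in the file that follows. The interface name "lo0" and the error handling are placeholders; the ioctls shown (BIOCSETIF, BIOCIMMEDIATE, BIOCGBLEN) are the standard bpf(4) ones handled by bpfdev_ioctl() below.

#include <sys/ioctl.h>
#include <sys/types.h>
#include <net/bpf.h>
#include <net/if.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	struct bpf_hdr *bh;
	u_int bufsize, on = 1;
	char *buf, *p;
	ssize_t len;
	int fd;

	/* Opening /dev/bpf yields an independent, cloned BPF device. */
	if ((fd = open("/dev/bpf", O_RDWR)) < 0)
		err(EXIT_FAILURE, "open");

	/* Attach the device to an interface; the capture buffers are allocated here. */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));	/* placeholder interface */
	if (ioctl(fd, BIOCSETIF, &ifr) < 0)
		err(EXIT_FAILURE, "BIOCSETIF");

	/* Deliver packets as soon as they arrive rather than per full buffer. */
	if (ioctl(fd, BIOCIMMEDIATE, &on) < 0)
		err(EXIT_FAILURE, "BIOCIMMEDIATE");

	/* Reads must be exactly as large as the configured buffer size. */
	if (ioctl(fd, BIOCGBLEN, &bufsize) < 0)
		err(EXIT_FAILURE, "BIOCGBLEN");
	if ((buf = malloc(bufsize)) == NULL)
		err(EXIT_FAILURE, "malloc");

	/* Each read returns zero or more word-aligned capture records. */
	if ((len = read(fd, buf, bufsize)) < 0)
		err(EXIT_FAILURE, "read");
	for (p = buf; p < buf + len;
	    p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen)) {
		bh = (struct bpf_hdr *)p;
		printf("captured %u of %u bytes\n", bh->bh_caplen,
		    bh->bh_datalen);
	}

	free(buf);
	close(fd);
	return EXIT_SUCCESS;
}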
/* LWIP service - bpfdev.c - Berkeley Packet Filter (/dev/bpf) interface */
/*
 * BPF is a cloning device: opening /dev/bpf returns a new BPF device which is
 * independent from any other opened BPF devices. We assume that each BPF
 * device is used by one single user process, and this implementation therefore
 * does not support multiple concurrent device calls on the same BPF device.
 *
 * Packet buffering basically follows the BSD model: each BPF device that is
 * configured (that is, it has been attached to an interface) has two buffers,
 * each of the configured size: a store buffer, where new packets are stored,
 * and a hold buffer, which is typically full and awaiting retrieval through a
 * read call from userland. The buffers are swapped ("rotated") when the store
 * buffer is filled up and the hold buffer is empty - if the hold buffer is not
 * empty at that point, additional packets are dropped.
 *
 * These buffers are allocated when the BPF device is attached to an interface.
 * The interface may later disappear, in which case the BPF device is detached
 * from it, allowing any final packets to be read before read requests start
 * returning I/O errors. The buffers are freed only when the device is closed.
 */

#include "lwip.h"
#include "bpfdev.h"

#include <minix/chardriver.h>
#include <net/if.h>
#include <net/bpfdesc.h>
#include <minix/bpf.h>
#include <sys/mman.h>

/*
 * Make sure that our implementation matches the BPF version in the NetBSD
 * headers. If they change the version number, we may have to make changes
 * here accordingly.
 */
#if BPF_MAJOR_VERSION != 1 || BPF_MINOR_VERSION != 1
#error "NetBSD BPF version has changed"
#endif

/* The number of BPF devices. */
#define NR_BPFDEV		16

/* BPF receive buffer size: allowed range and default. */
#define BPF_BUF_MIN		BPF_WORDALIGN(sizeof(struct bpf_hdr))
#define BPF_BUF_DEF		32768
#define BPF_BUF_MAX		262144

/*
 * By opening /dev/bpf, one will obtain a cloned device with a different minor
 * number, which maps to one of the BPF devices.
 */
#define BPFDEV_MINOR		0	/* minor number of /dev/bpf */
#define BPFDEV_BASE_MINOR	1	/* base minor number for BPF devices */

static struct bpfdev {
	struct bpfdev_link bpf_link;	/* structure link, MUST be first */
	TAILQ_ENTRY(bpfdev) bpf_next;	/* next on free or interface list */
	struct ifdev *bpf_ifdev;	/* associated interface, or NULL */
	unsigned int bpf_flags;		/* flags (BPFF_) */
	size_t bpf_size;		/* size of packet buffers */
	char *bpf_sbuf;			/* store buffer (mmap'd, or NULL) */
	char *bpf_hbuf;			/* hold buffer (mmap'd, or NULL) */
	size_t bpf_slen;		/* used part of store buffer */
	size_t bpf_hlen;		/* used part of hold buffer */
	struct bpf_insn *bpf_filter;	/* verified BPF filter, or NULL */
	size_t bpf_filterlen;		/* length of filter, for munmap */
	pid_t bpf_pid;			/* process ID of last using process */
	clock_t bpf_timeout;		/* timeout for read calls (0 = none) */
	struct {			/* state for pending read request */
		endpoint_t br_endpt;	/* reading endpoint, or NONE */
		cp_grant_id_t br_grant;	/* grant for reader's buffer */
		cdev_id_t br_id;	/* read request identifier */
		minix_timer_t br_timer;	/* timer for read timeout */
	} bpf_read;
	struct {			/* state for pending select request */
		endpoint_t bs_endpt;	/* selecting endpoint, or NONE */
		unsigned int bs_selops;	/* pending select operations */
	} bpf_select;
	struct {			/* packet capture statistics */
		uint64_t bs_recv;	/* # of packets run through filter */
		uint64_t bs_drop;	/* # of packets dropped: buffer full */
		uint64_t bs_capt;	/* # of packets accepted by filter */
	} bpf_stat;
} bpf_array[NR_BPFDEV];

#define BPFF_IN_USE	0x01	/* this BPF device object is in use */
#define BPFF_PROMISC	0x02	/* promiscuous mode enabled */
#define BPFF_IMMEDIATE	0x04	/* immediate mode is enabled */
#define BPFF_SEESENT	0x08	/* also process host-sent packets */
#define BPFF_HDRCMPLT	0x10	/* do not fill in link-layer source */
#define BPFF_FEEDBACK	0x20	/* feed back written packet as input */

static TAILQ_HEAD(, bpfdev_link) bpfl_freelist;	/* list of free BPF devices */

static struct bpf_stat bpf_stat;

static ssize_t bpfdev_peers(struct rmib_call *, struct rmib_node *,
	struct rmib_oldp *, struct rmib_newp *);

/* The CTL_NET NET_BPF subtree. All nodes are dynamically numbered. */
static struct rmib_node net_bpf_table[] = {
	RMIB_INT(RMIB_RO, BPF_BUF_MAX, "maxbufsize",
	    "Maximum size for data capture buffer"),	/* TODO: read-write */
	RMIB_STRUCT(RMIB_RO, sizeof(bpf_stat), &bpf_stat, "stats",
	    "BPF stats"),
	RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, bpfdev_peers, "peers",
	    "BPF peers"),
};

static struct rmib_node net_bpf_node =
    RMIB_NODE(RMIB_RO, net_bpf_table, "bpf", "BPF options");

/*
 * Initialize the BPF module.
 */
void
bpfdev_init(void)
{
	const int mib[] = { CTL_NET, NET_BPF };
	unsigned int slot;
	int r;

	/* Initialize data structures. */
	TAILQ_INIT(&bpfl_freelist);

	for (slot = 0; slot < __arraycount(bpf_array); slot++) {
		bpf_array[slot].bpf_flags = 0;

		TAILQ_INSERT_TAIL(&bpfl_freelist, &bpf_array[slot].bpf_link,
		    bpfl_next);
	}

	memset(&bpf_stat, 0, sizeof(bpf_stat));

	/* Register the "net.bpf" subtree with the MIB service. */
	if ((r = rmib_register(mib, __arraycount(mib), &net_bpf_node)) != OK)
		panic("unable to register net.bpf RMIB tree: %d", r);
}
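
/*
 * Illustrative sketch (not part of the original source): userland can query
 * the "net.bpf" subtree registered above through sysctl(3), for example:
 *
 *	int max;
 *	size_t len = sizeof(max);
 *
 *	if (sysctlbyname("net.bpf.maxbufsize", &max, &len, NULL, 0) == 0)
 *		printf("net.bpf.maxbufsize = %d\n", max);
 *
 * netstat(1) uses the "net.bpf.peers" node in a similar fashion; see
 * bpfdev_peers() further down.
 */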

/*
 * Given a BPF device object, return the corresponding minor number.
 */
static devminor_t
bpfdev_get_minor(struct bpfdev * bpfdev)
{

	assert(bpfdev != NULL);

	return BPFDEV_BASE_MINOR + (devminor_t)(bpfdev - bpf_array);
}

/*
 * Given a minor number, return the corresponding BPF device object, or NULL if
 * the minor number does not identify a BPF device.
 */
static struct bpfdev *
bpfdev_get_by_minor(devminor_t minor)
{

	if (minor < BPFDEV_BASE_MINOR ||
	    (unsigned int)minor >= BPFDEV_BASE_MINOR + __arraycount(bpf_array))
		return NULL;

	return &bpf_array[minor - BPFDEV_BASE_MINOR];
}

/*
 * Open a BPF device, returning a cloned device instance.
 */
static int
bpfdev_open(devminor_t minor, int access __unused, endpoint_t user_endpt)
{
	struct bpfdev_link *bpfl;
	struct bpfdev *bpf;

	/* Disallow opening cloned devices through device nodes. */
	if (minor != BPFDEV_MINOR)
		return ENXIO;

	if (TAILQ_EMPTY(&bpfl_freelist))
		return ENOBUFS;

	bpfl = TAILQ_FIRST(&bpfl_freelist);
	TAILQ_REMOVE(&bpfl_freelist, bpfl, bpfl_next);

	bpf = (struct bpfdev *)bpfl;

	memset(bpf, 0, sizeof(*bpf));

	bpf->bpf_flags = BPFF_IN_USE | BPFF_SEESENT;
	bpf->bpf_size = BPF_BUF_DEF;
	bpf->bpf_pid = getnpid(user_endpt);
	bpf->bpf_read.br_endpt = NONE;
	bpf->bpf_select.bs_endpt = NONE;

	return CDEV_CLONED | bpfdev_get_minor(bpf);
}

/*
 * Close a BPF device.
 */
static int
bpfdev_close(devminor_t minor)
{
	struct bpfdev *bpf;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	/*
	 * There cannot possibly be a pending read request, so we never need to
	 * cancel the read timer from here either.
	 */
	assert(bpf->bpf_read.br_endpt == NONE);

	if (bpf->bpf_sbuf != NULL) {
		assert(bpf->bpf_hbuf != NULL);

		if (munmap(bpf->bpf_sbuf, bpf->bpf_size) != 0)
			panic("munmap failed: %d", -errno);
		if (munmap(bpf->bpf_hbuf, bpf->bpf_size) != 0)
			panic("munmap failed: %d", -errno);

		bpf->bpf_sbuf = NULL;
		bpf->bpf_hbuf = NULL;
	} else
		assert(bpf->bpf_hbuf == NULL);

	if (bpf->bpf_filter != NULL) {
		assert(bpf->bpf_filterlen > 0);

		if (munmap(bpf->bpf_filter, bpf->bpf_filterlen) != 0)
			panic("munmap failed: %d", -errno);

		bpf->bpf_filter = NULL;
	}

	/*
	 * If the BPF device was attached to an interface, and that interface
	 * has not disappeared in the meantime, detach from it now.
	 */
	if (bpf->bpf_ifdev != NULL) {
		if (bpf->bpf_flags & BPFF_PROMISC)
			ifdev_clear_promisc(bpf->bpf_ifdev);

		ifdev_detach_bpf(bpf->bpf_ifdev, &bpf->bpf_link);

		bpf->bpf_ifdev = NULL;
	}

	bpf->bpf_flags = 0;		/* mark as no longer in use */

	TAILQ_INSERT_HEAD(&bpfl_freelist, &bpf->bpf_link, bpfl_next);

	return OK;
}

/*
 * Rotate buffers for the BPF device, by swapping the store buffer and the hold
 * buffer.
 */
static void
bpfdev_rotate(struct bpfdev * bpf)
{
	char *buf;
	size_t len;

	/*
	 * When rotating, the store buffer may or may not be empty, but the
	 * hold buffer must always be empty.
	 */
	assert(bpf->bpf_hlen == 0);

	buf = bpf->bpf_sbuf;
	len = bpf->bpf_slen;
	bpf->bpf_sbuf = bpf->bpf_hbuf;
	bpf->bpf_slen = bpf->bpf_hlen;
	bpf->bpf_hbuf = buf;
	bpf->bpf_hlen = len;
}

/*
 * Test whether any of the given select operations are ready on the BPF device,
 * and return the set of ready operations.
 */
static unsigned int
bpfdev_test_select(struct bpfdev * bpf, unsigned int ops)
{
	unsigned int ready_ops;

	ready_ops = 0;

	/*
	 * The BPF device is ready for reading if the hold buffer is not empty
	 * (i.e.: the store buffer has been filled up completely and was
	 * therefore rotated) or if immediate mode is set and the store buffer
	 * is not empty (i.e.: any packet is available at all). In the latter
	 * case, the buffers will be rotated during the read. We do not
	 * support applying the read timeout to selects and maintaining state
	 * between the select and the following read, because even though
	 * libpcap claims that this is the right behavior, it is just insane.
	 */
	if (ops & CDEV_OP_RD) {
		if (bpf->bpf_ifdev == NULL)
			ready_ops |= CDEV_OP_RD;
		else if (bpf->bpf_hlen > 0)
			ready_ops |= CDEV_OP_RD;
		else if ((bpf->bpf_flags & BPFF_IMMEDIATE) &&
		    bpf->bpf_slen > 0)
			ready_ops |= CDEV_OP_RD;
	}

	if (ops & CDEV_OP_WR)
		ready_ops |= CDEV_OP_WR;

	return ready_ops;
}

/*
 * There has been a state change on the BPF device. If now possible, resume a
 * pending select query, if any.
 */
static void
bpfdev_resume_select(struct bpfdev * bpf)
{
	unsigned int ops, ready_ops;
	endpoint_t endpt;

	/* First see if there is a pending select request at all. */
	if ((endpt = bpf->bpf_select.bs_endpt) == NONE)
		return;
	ops = bpf->bpf_select.bs_selops;

	assert(ops != 0);

	/* Then see if any of the pending operations are now ready. */
	if ((ready_ops = bpfdev_test_select(bpf, ops)) == 0)
		return;

	/* If so, notify VFS about the ready operations. */
	chardriver_reply_select(bpf->bpf_select.bs_endpt,
	    bpfdev_get_minor(bpf), ready_ops);

	/*
	 * Forget about the ready operations. If that leaves no pending
	 * operations, forget about the select request altogether.
	 */
	if ((bpf->bpf_select.bs_selops &= ~ready_ops) == 0)
		bpf->bpf_select.bs_endpt = NONE;
}

/*
 * There has been a state change on the BPF device. If now possible, resume a
 * pending read request, if any. If the call is a result of a timeout,
 * 'is_timeout' is set. In that case, the read request must be resumed with an
 * EAGAIN error if no packets are available, and the running timer must be
 * canceled. Otherwise, the resumption is due to a full buffer or a
 * disappeared interface, and 'is_timeout' is not set. In this case, the read
 * request must be resumed with an I/O error if no packets are available.
 */
static void
bpfdev_resume_read(struct bpfdev * bpf, int is_timeout)
{
	ssize_t r;

	assert(bpf->bpf_read.br_endpt != NONE);

	/*
	 * If the hold buffer is still empty, see if the store buffer has
	 * any packets to copy out.
	 */
	if (bpf->bpf_hlen == 0)
		bpfdev_rotate(bpf);

	/* Return any available packets, or otherwise an error. */
	if (bpf->bpf_hlen > 0) {
		assert(bpf->bpf_hlen <= bpf->bpf_size);

		r = sys_safecopyto(bpf->bpf_read.br_endpt,
		    bpf->bpf_read.br_grant, 0, (vir_bytes)bpf->bpf_hbuf,
		    bpf->bpf_hlen);

		if (r == OK) {
			r = (ssize_t)bpf->bpf_hlen;

			bpf->bpf_hlen = 0;

			assert(bpf->bpf_slen != bpf->bpf_size);

			/*
			 * Allow readers to get the last packets after the
			 * interface has disappeared, before getting errors.
			 */
			if (bpf->bpf_ifdev == NULL)
				bpfdev_rotate(bpf);
		}
	} else
		r = (is_timeout) ? EAGAIN : EIO;

	chardriver_reply_task(bpf->bpf_read.br_endpt, bpf->bpf_read.br_id, r);

	bpf->bpf_read.br_endpt = NONE;

	/* Was there still a timer running? Then cancel it now. */
	if (bpf->bpf_timeout > 0 && !is_timeout)
		cancel_timer(&bpf->bpf_read.br_timer);
}

/*
 * A read timeout has triggered for the BPF device. Wake up the pending read
 * request.
 */
static void
bpfdev_timeout(int arg)
{
	struct bpfdev *bpf;

	assert(arg >= 0 && (unsigned int)arg < __arraycount(bpf_array));

	bpf = &bpf_array[arg];

	assert(bpf->bpf_read.br_endpt != NONE);

	bpfdev_resume_read(bpf, TRUE /*is_timeout*/);
}

/*
 * Read from a BPF device.
 */
static ssize_t
bpfdev_read(devminor_t minor, uint64_t position, endpoint_t endpt,
	cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
{
	struct bpfdev *bpf;
	ssize_t r;
	int suspend;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	/* Allow only one read call at a time. */
	if (bpf->bpf_read.br_endpt != NONE)
		return EIO;

	/* Has this BPF device been configured at all yet? */
	if (bpf->bpf_sbuf == NULL)
		return EINVAL;

	/*
	 * Does the read call size match the entire buffer size? This is a
	 * ridiculous requirement but it makes our job quite a bit easier..
	 */
	if (size != bpf->bpf_size)
		return EINVAL;

	/*
	 * Following standard receive semantics, if the interface is gone,
	 * return all the packets that were pending before returning an error.
	 * This requires extra buffer rotations after read completion, too.
	 */
	if (bpf->bpf_ifdev == NULL && bpf->bpf_hlen == 0)
		return EIO;

	/*
	 * If immediate mode is not enabled, we should always suspend the read
	 * call if the hold buffer is empty. If immediate mode is enabled, we
	 * should only suspend the read call if both buffers are empty, and
	 * return data from the hold buffer or otherwise the store buffer,
	 * whichever is not empty. A non-blocking call behaves as though
	 * immediate mode is enabled, except it will return EAGAIN instead of
	 * suspending the read call if both buffers are empty. Thus, we may
	 * have to rotate buffers for both immediate mode and non-blocking
	 * calls. The latter is necessary for libpcap to behave correctly.
	 */
	if ((flags & CDEV_NONBLOCK) || (bpf->bpf_flags & BPFF_IMMEDIATE))
		suspend = (bpf->bpf_hlen == 0 && bpf->bpf_slen == 0);
	else
		suspend = (bpf->bpf_hlen == 0);

	if (suspend) {
		if (flags & CDEV_NONBLOCK)
			return EAGAIN;

		/* Suspend the read call for later. */
		bpf->bpf_read.br_endpt = endpt;
		bpf->bpf_read.br_grant = grant;
		bpf->bpf_read.br_id = id;

		/* Set a timer if requested. */
		if (bpf->bpf_timeout > 0)
			set_timer(&bpf->bpf_read.br_timer, bpf->bpf_timeout,
			    bpfdev_timeout, (int)(bpf - bpf_array));

		return EDONTREPLY;
	}

	/* If we get here, either buffer has data; rotate buffers if needed. */
	if (bpf->bpf_hlen == 0)
		bpfdev_rotate(bpf);
	assert(bpf->bpf_hlen > 0);

	if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)bpf->bpf_hbuf,
	    bpf->bpf_hlen)) != OK)
		return r;

	r = (ssize_t)bpf->bpf_hlen;

	bpf->bpf_hlen = 0;

	/*
	 * If the store buffer is exactly full, rotate it now. Also, if the
	 * interface has disappeared, the store buffer will never fill up.
	 * Rotate it so that the application will get any remaining data before
	 * getting errors about the interface being gone.
	 */
	if (bpf->bpf_slen == bpf->bpf_size || bpf->bpf_ifdev == NULL)
		bpfdev_rotate(bpf);

	return r;
}

/*
 * Write to a BPF device.
 */
static ssize_t
bpfdev_write(devminor_t minor, uint64_t position, endpoint_t endpt,
	cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
{
	struct bpfdev *bpf;
	struct pbuf *pbuf, *pptr, *pcopy;
	size_t off;
	err_t err;
	int r;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	if (bpf->bpf_ifdev == NULL)
		return EINVAL;

	/* VFS skips zero-sized I/O calls right now, but that may change. */
	if (size == 0)
		return 0;	/* nothing to do */

	if (size > ifdev_get_hdrlen(bpf->bpf_ifdev) +
	    ifdev_get_mtu(bpf->bpf_ifdev))
		return EMSGSIZE;

	if ((pbuf = pchain_alloc(PBUF_LINK, size)) == NULL)
		return ENOMEM;

	/* TODO: turn this into a series of vector copies. */
	off = 0;
	for (pptr = pbuf; pptr != NULL; pptr = pptr->next) {
		if ((r = sys_safecopyfrom(endpt, grant, off,
		    (vir_bytes)pptr->payload, pptr->len)) != OK) {
			pbuf_free(pbuf);

			return r;
		}
		off += pptr->len;
	}
	assert(off == size);

	/*
	 * In feedback mode, we cannot use the same packet buffers for both
	 * output and input, so make a copy. We do this before calling the
	 * output function, which may change part of the buffers, because the
	 * BSDs take this approach as well.
	 */
	if (bpf->bpf_flags & BPFF_FEEDBACK) {
		if ((pcopy = pchain_alloc(PBUF_LINK, size)) == NULL) {
			pbuf_free(pbuf);

			return ENOMEM;
		}

		if (pbuf_copy(pcopy, pbuf) != ERR_OK)
			panic("unexpected pbuf copy failure");
	} else
		pcopy = NULL;

	/* Pass in the packet as output, and free it again. */
	err = ifdev_output(bpf->bpf_ifdev, pbuf, NULL /*netif*/,
	    TRUE /*to_bpf*/, !!(bpf->bpf_flags & BPFF_HDRCMPLT));

	pbuf_free(pbuf);

	/* In feedback mode, pass in the copy as input, if output succeeded. */
	if (err == ERR_OK && (bpf->bpf_flags & BPFF_FEEDBACK))
		ifdev_input(bpf->bpf_ifdev, pcopy, NULL /*netif*/,
		    FALSE /*to_bpf*/);
	else if (pcopy != NULL)
		pbuf_free(pcopy);

	return (err == ERR_OK) ? (ssize_t)size : util_convert_err(err);
}
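
/*
 * Illustrative sketch (not part of the original source): a userland process
 * injects a packet on the attached interface by writing one complete
 * link-layer frame to the BPF device. With BIOCSHDRCMPLT set, the given
 * link-layer source address is kept; with BIOCSFEEDBACK set, the written
 * packet is also looped back as input, as implemented above. For example:
 *
 *	u_int on = 1;
 *
 *	if (ioctl(fd, BIOCSHDRCMPLT, &on) < 0)
 *		err(EXIT_FAILURE, "BIOCSHDRCMPLT");
 *	if (write(fd, frame, framelen) != (ssize_t)framelen)
 *		err(EXIT_FAILURE, "write");
 */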

/*
 * Attach a BPF device to a network interface, using the interface name given
 * in an ifreq structure. As side effect, allocate hold and store buffers for
 * the device. These buffers will stay allocated until the device is closed,
 * even though the interface may disappear before that. Return OK if the BPF
 * device was successfully attached to the interface, or a negative error code
 * otherwise.
 */
static int
bpfdev_attach(struct bpfdev * bpf, struct ifreq * ifr)
{
	struct ifdev *ifdev;
	void *sbuf, *hbuf;

	/* Find the interface with the given name. */
	ifr->ifr_name[sizeof(ifr->ifr_name) - 1] = '\0';
	if ((ifdev = ifdev_find_by_name(ifr->ifr_name)) == NULL)
		return ENXIO;

	/*
	 * Allocate a store buffer and a hold buffer. Preallocate the memory,
	 * or we might get killed later during low-memory conditions.
	 */
	if ((sbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED)
		return ENOMEM;

	if ((hbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) {
		(void)munmap(sbuf, bpf->bpf_size);

		return ENOMEM;
	}

	bpf->bpf_ifdev = ifdev;
	bpf->bpf_sbuf = sbuf;
	bpf->bpf_hbuf = hbuf;
	assert(bpf->bpf_slen == 0);
	assert(bpf->bpf_hlen == 0);

	ifdev_attach_bpf(ifdev, &bpf->bpf_link);

	return OK;
}

/*
 * Detach the BPF device from its interface, which is about to disappear.
 */
void
bpfdev_detach(struct bpfdev_link * bpfl)
{
	struct bpfdev *bpf = (struct bpfdev *)bpfl;

	assert(bpf->bpf_flags & BPFF_IN_USE);
	assert(bpf->bpf_ifdev != NULL);

	/*
	 * We deliberately leave the buffers allocated here, for two reasons:
	 *
	 * 1) it lets applications read any last packets in the buffers;
	 * 2) it prevents reattaching the BPF device to another interface.
	 */
	bpf->bpf_ifdev = NULL;

	/*
	 * Resume pending read and select requests, returning any data left,
	 * or an error if none.
	 */
	if (bpf->bpf_hlen == 0)
		bpfdev_rotate(bpf);

	if (bpf->bpf_read.br_endpt != NONE)
		bpfdev_resume_read(bpf, FALSE /*is_timeout*/);

	bpfdev_resume_select(bpf);
}

/*
 * Flush the given BPF device, resetting its buffer contents and statistics
 * counters.
 */
static void
bpfdev_flush(struct bpfdev * bpf)
{

	bpf->bpf_slen = 0;
	bpf->bpf_hlen = 0;

	bpf->bpf_stat.bs_recv = 0;
	bpf->bpf_stat.bs_drop = 0;
	bpf->bpf_stat.bs_capt = 0;
}

/*
 * Install a filter program on the BPF device. A new filter replaces any old
 * one. A zero-sized filter simply clears a previous filter. On success,
 * perform a flush and return OK. On failure, return a negative error code
 * without making any modifications to the current filter.
 */
static int
bpfdev_setfilter(struct bpfdev * bpf, endpoint_t endpt, cp_grant_id_t grant)
{
	struct bpf_insn *filter;
	unsigned int count;
	size_t len;
	int r;

	if ((r = sys_safecopyfrom(endpt, grant,
	    offsetof(struct minix_bpf_program, mbf_len), (vir_bytes)&count,
	    sizeof(count))) != OK)
		return r;

	if (count > BPF_MAXINSNS)
		return EINVAL;
	len = count * sizeof(struct bpf_insn);

	if (len > 0) {
		if ((filter = (struct bpf_insn *)mmap(NULL, len,
		    PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0)) ==
		    MAP_FAILED)
			return ENOMEM;

		if ((r = sys_safecopyfrom(endpt, grant,
		    offsetof(struct minix_bpf_program, mbf_insns),
		    (vir_bytes)filter, len)) != OK) {
			(void)munmap(filter, len);

			return r;
		}

		if (!bpf_validate(filter, count)) {
			(void)munmap(filter, len);

			return EINVAL;
		}
	} else
		filter = NULL;

	if (bpf->bpf_filter != NULL)
		(void)munmap(bpf->bpf_filter, bpf->bpf_filterlen);

	bpf->bpf_filter = filter;
	bpf->bpf_filterlen = len;

	bpfdev_flush(bpf);

	return OK;
}
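
/*
 * Illustrative sketch (not part of the original source): from userland, a
 * filter is installed with the standard BIOCSETF ioctl and a bpf_program
 * structure; this sketch assumes that libc repackages the request into the
 * flat minix_bpf_program layout consumed above (MINIX_BIOCSETF), since grants
 * cannot follow userland pointers. For example, an accept-all filter:
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET + BPF_K, (u_int)-1)
 *	};
 *	struct bpf_program prog = { __arraycount(insns), insns };
 *
 *	if (ioctl(fd, BIOCSETF, &prog) < 0)
 *		err(EXIT_FAILURE, "BIOCSETF");
 */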

/*
 * Process an I/O control request on the BPF device.
 */
static int
bpfdev_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
	cp_grant_id_t grant, int flags, endpoint_t user_endpt, cdev_id_t id)
{
	struct bpfdev *bpf;
	struct bpf_stat bs;
	struct bpf_version bv;
	struct bpf_dltlist bfl;
	struct timeval tv;
	struct ifreq ifr;
	unsigned int uval;
	int r, val;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	/*
	 * We do not support multiple concurrent requests in this module. That
	 * not only means that we forbid a read(2) call on a BPF device object
	 * while another read(2) is already pending: we also disallow ioctl(2)
	 * calls while such a read(2) call is in progress. This restriction
	 * should never be a problem for user programs, and allows us to rely
	 * on the fact that no settings can change between the start and end
	 * of any read call. As a side note, pending select(2) queries may be
	 * similarly affected, and will also not be fully accurate if any
	 * options are changed while pending.
	 */
	if (bpf->bpf_read.br_endpt != NONE)
		return EIO;

	bpf->bpf_pid = getnpid(user_endpt);

	/* These are in order of the NetBSD BIOC.. IOCTL numbers. */
	switch (request) {
	case BIOCGBLEN:
		uval = bpf->bpf_size;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCSBLEN:
		if (bpf->bpf_sbuf != NULL)
			return EINVAL;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval < BPF_BUF_MIN)
			uval = BPF_BUF_MIN;
		else if (uval > BPF_BUF_MAX)
			uval = BPF_BUF_MAX;

		/* Is this the right thing to do? It doesn't matter for us. */
		uval = BPF_WORDALIGN(uval);

		if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		bpf->bpf_size = uval;

		return OK;

	case MINIX_BIOCSETF:
		return bpfdev_setfilter(bpf, endpt, grant);

	case BIOCPROMISC:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		if (!(bpf->bpf_flags & BPFF_PROMISC)) {
			if (!ifdev_set_promisc(bpf->bpf_ifdev))
				return EINVAL;

			bpf->bpf_flags |= BPFF_PROMISC;
		}

		return OK;

	case BIOCFLUSH:
		bpfdev_flush(bpf);

		return OK;

	case BIOCGDLT:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		/* TODO: support for type configuration per BPF device. */
		uval = ifdev_get_dlt(bpf->bpf_ifdev);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCGETIF:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, ifdev_get_name(bpf->bpf_ifdev),
		    sizeof(ifr.ifr_name));

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&ifr,
		    sizeof(ifr));

	case BIOCSETIF:
		/*
		 * Test on the presence of a buffer rather than on an interface
		 * since the latter may disappear and thus be reset to NULL, in
		 * which case we do not want to allow rebinding to another.
		 */
		if (bpf->bpf_sbuf != NULL)
			return EINVAL;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&ifr,
		    sizeof(ifr))) != OK)
			return r;

		return bpfdev_attach(bpf, &ifr);

	case BIOCGSTATS:
		/*
		 * Why do we not embed a bpf_stat structure directly in the
		 * BPF device structure? Well, bpf_stat has massive padding..
		 */
		memset(&bs, 0, sizeof(bs));
		bs.bs_recv = bpf->bpf_stat.bs_recv;
		bs.bs_drop = bpf->bpf_stat.bs_drop;
		bs.bs_capt = bpf->bpf_stat.bs_capt;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bs,
		    sizeof(bs));

	case BIOCIMMEDIATE:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval)
			bpf->bpf_flags |= BPFF_IMMEDIATE;
		else
			bpf->bpf_flags &= ~BPFF_IMMEDIATE;

		return OK;

	case BIOCVERSION:
		memset(&bv, 0, sizeof(bv));
		bv.bv_major = BPF_MAJOR_VERSION;
		bv.bv_minor = BPF_MINOR_VERSION;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bv,
		    sizeof(bv));

	case BIOCGHDRCMPLT:
		uval = !!(bpf->bpf_flags & BPFF_HDRCMPLT);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCSHDRCMPLT:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval)
			bpf->bpf_flags |= BPFF_HDRCMPLT;
		else
			bpf->bpf_flags &= ~BPFF_HDRCMPLT;

		return OK;

	case BIOCSDLT:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		/* TODO: support for type configuration per BPF device. */
		if (uval != ifdev_get_dlt(bpf->bpf_ifdev))
			return EINVAL;

		return OK;

	case MINIX_BIOCGDLTLIST:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&bfl,
		    sizeof(bfl))) != OK)
			return r;

		if (bfl.bfl_list != NULL) {
			if (bfl.bfl_len < 1)
				return ENOMEM;

			/*
			 * Copy out the 'list', which consists of one entry.
			 * If we were to produce multiple entries, we would
			 * have to check against the MINIX_BPF_MAXDLT limit.
			 */
			uval = ifdev_get_dlt(bpf->bpf_ifdev);

			if ((r = sys_safecopyto(endpt, grant,
			    offsetof(struct minix_bpf_dltlist, mbfl_list),
			    (vir_bytes)&uval, sizeof(uval))) != OK)
				return r;
		}
		bfl.bfl_len = 1;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bfl,
		    sizeof(bfl));

	case BIOCGSEESENT:
		uval = !!(bpf->bpf_flags & BPFF_SEESENT);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCSSEESENT:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval)
			bpf->bpf_flags |= BPFF_SEESENT;
		else
			bpf->bpf_flags &= ~BPFF_SEESENT;

		return OK;

	case BIOCSRTIMEOUT:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&tv,
		    sizeof(tv))) != OK)
			return r;

		if ((r = util_timeval_to_ticks(&tv, &bpf->bpf_timeout)) != OK)
			return r;

		return OK;

	case BIOCGRTIMEOUT:
		util_ticks_to_timeval(bpf->bpf_timeout, &tv);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&tv,
		    sizeof(tv));

	case BIOCGFEEDBACK:
		uval = !!(bpf->bpf_flags & BPFF_FEEDBACK);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCSFEEDBACK:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval)
			bpf->bpf_flags |= BPFF_FEEDBACK;
		else
			bpf->bpf_flags &= ~BPFF_FEEDBACK;

		return OK;

	case FIONREAD:
		val = 0;
		if (bpf->bpf_hlen > 0)
			val = bpf->bpf_hlen;
		else if ((bpf->bpf_flags & BPFF_IMMEDIATE) &&
		    bpf->bpf_slen > 0)
			val = bpf->bpf_slen;
		else
			val = 0;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&val,
		    sizeof(val));

	default:
		return ENOTTY;
	}
}

/*
 * Cancel a previously suspended request on a BPF device. Since only read
 * requests may be suspended (select is handled differently), the cancel
 * request must be for a read request. Note that character devices currently
 * (still) behave slightly differently from socket devices here: while socket
 * drivers are supposed to respond to the original request, character drivers
 * must respond to the original request from the cancel callback.
 */
static int
bpfdev_cancel(devminor_t minor, endpoint_t endpt, cdev_id_t id)
{
	struct bpfdev *bpf;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EDONTREPLY;

	/* Is this a cancel request for the currently pending read request? */
	if (bpf->bpf_read.br_endpt != endpt || bpf->bpf_read.br_id != id)
		return EDONTREPLY;

	/* If so, cancel the read request. */
	if (bpf->bpf_timeout > 0)
		cancel_timer(&bpf->bpf_read.br_timer);

	bpf->bpf_read.br_endpt = NONE;

	return EINTR;	/* the return value for the canceled read request */
}

/*
 * Perform a select query on a BPF device.
 */
static int
bpfdev_select(devminor_t minor, unsigned int ops, endpoint_t endpt)
{
	struct bpfdev *bpf;
	unsigned int r, notify;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	notify = (ops & CDEV_NOTIFY);
	ops &= (CDEV_OP_RD | CDEV_OP_WR | CDEV_OP_ERR);

	r = bpfdev_test_select(bpf, ops);

	/*
	 * For the operations that were not immediately ready, if requested,
	 * save the select request for later.
	 */
	ops &= ~r;

	if (ops != 0 && notify) {
		if (bpf->bpf_select.bs_endpt != NONE) {
			/* Merge in the operations with any earlier request. */
			if (bpf->bpf_select.bs_endpt != endpt)
				return EIO;
			bpf->bpf_select.bs_selops |= ops;
		} else {
			bpf->bpf_select.bs_endpt = endpt;
			bpf->bpf_select.bs_selops = ops;
		}
	}

	return r;
}

/*
 * Process an incoming packet on the interface to which the given BPF device is
 * attached. If the packet passes the filter (if any), store as much as
 * requested of it in the store buffer, rotating buffers if needed and resuming
 * suspended read and select requests as appropriate. This function is also
 * called through bpfdev_output() below.
 */
void
bpfdev_input(struct bpfdev_link * bpfl, const struct pbuf * pbuf)
{
	struct bpfdev *bpf = (struct bpfdev *)bpfl;
	struct timespec ts;
	struct bpf_hdr bh;
	const struct pbuf *pptr;
	size_t caplen, hdrlen, totlen, off, chunk;
	int hfull;

	/*
	 * Apparently bs_recv is the counter of packets that were run through
	 * the filter, not the number of packets that were or could be received
	 * by the user (which is what I got from the manual page.. oh well).
	 */
	bpf->bpf_stat.bs_recv++;
	bpf_stat.bs_recv++;

	/*
	 * Run the packet through the BPF device's filter to see whether the
	 * packet should be stored and if so, how much of it. If no filter is
	 * set, all packets will be stored in their entirety.
	 */
	caplen = bpf_filter_ext(bpf->bpf_filter, pbuf, (u_char *)pbuf->payload,
	    pbuf->tot_len, pbuf->len);

	if (caplen == 0)
		return;		/* no match; ignore packet */

	if (caplen > pbuf->tot_len)
		caplen = pbuf->tot_len;

	/* Truncate packet entries to the full size of the buffers. */
	hdrlen = BPF_WORDALIGN(sizeof(bh));
	totlen = BPF_WORDALIGN(hdrlen + caplen);

	if (totlen > bpf->bpf_size) {
		totlen = bpf->bpf_size;
		caplen = totlen - hdrlen;
	}
	assert(totlen >= hdrlen);

	bpf->bpf_stat.bs_capt++;
	bpf_stat.bs_capt++;

	assert(bpf->bpf_sbuf != NULL);
	if (totlen > bpf->bpf_size - bpf->bpf_slen) {
		/*
		 * If the store buffer is full and the hold buffer is not
		 * empty, we cannot swap the two buffers, and so we must drop
		 * the current packet.
		 */
		if (bpf->bpf_hlen > 0) {
			bpf->bpf_stat.bs_drop++;
			bpf_stat.bs_drop++;

			return;
		}

		/*
		 * Rotate the buffers: the hold buffer will now be "full" and
		 * ready to be read - it may not actually be entirely full, but
		 * we could not fit this packet and we are not going to deliver
		 * packets out of order..
		 */
		bpfdev_rotate(bpf);

		hfull = TRUE;
	} else
		hfull = FALSE;

	/*
	 * Retrieve the capture time for the packet. Ideally this would be
	 * done only once per accepted packet, but we do not expect many BPF
	 * devices to be receiving the same packets often enough to make that
	 * worth it.
	 */
	clock_time(&ts);

	/*
	 * Copy the packet into the store buffer, including a newly generated
	 * header. Zero any padding areas, even if strictly not necessary.
	 */
	memset(&bh, 0, sizeof(bh));
	bh.bh_tstamp.tv_sec = ts.tv_sec;
	bh.bh_tstamp.tv_usec = ts.tv_nsec / 1000;
	bh.bh_caplen = caplen;
	bh.bh_datalen = pbuf->tot_len;
	bh.bh_hdrlen = hdrlen;

	assert(bpf->bpf_sbuf != NULL);
	off = bpf->bpf_slen;

	memcpy(&bpf->bpf_sbuf[off], &bh, sizeof(bh));
	if (hdrlen > sizeof(bh))
		memset(&bpf->bpf_sbuf[off + sizeof(bh)], 0,
		    hdrlen - sizeof(bh));
	off += hdrlen;

	for (pptr = pbuf; pptr != NULL && caplen > 0; pptr = pptr->next) {
		chunk = pptr->len;
		if (chunk > caplen)
			chunk = caplen;

		memcpy(&bpf->bpf_sbuf[off], pptr->payload, chunk);

		off += chunk;
		caplen -= chunk;
	}

	assert(off <= bpf->bpf_slen + totlen);
	if (bpf->bpf_slen + totlen > off)
		memset(&bpf->bpf_sbuf[off], 0, bpf->bpf_slen + totlen - off);

	bpf->bpf_slen += totlen;

	/*
	 * Edge case: if the hold buffer is empty and the store buffer is now
	 * exactly full, rotate buffers so that the packets can be read
	 * immediately, without waiting for the next packet to cause rotation.
	 */
	if (bpf->bpf_hlen == 0 && bpf->bpf_slen == bpf->bpf_size) {
		bpfdev_rotate(bpf);

		hfull = TRUE;
	}

	/*
	 * If the hold buffer is now full, or if immediate mode is enabled,
	 * then we now have data to deliver to userland. See if we can wake up
	 * any read or select call (either but not both here).
	 */
	if (hfull || (bpf->bpf_flags & BPFF_IMMEDIATE)) {
		if (bpf->bpf_read.br_endpt != NONE)
			bpfdev_resume_read(bpf, FALSE /*is_timeout*/);
		else
			bpfdev_resume_select(bpf);
	}
}

/*
 * Process an outgoing packet on the interface to which the given BPF device is
 * attached. If the BPF device is configured to capture outgoing packets as
 * well, attempt to capture the packet as per bpfdev_input().
 */
void
bpfdev_output(struct bpfdev_link * bpfl, const struct pbuf * pbuf)
{
	struct bpfdev *bpf = (struct bpfdev *)bpfl;

	if (bpf->bpf_flags & BPFF_SEESENT)
		bpfdev_input(bpfl, pbuf);
}

/*
 * Fill the given 'bde' structure with information about BPF device 'bpf'.
 */
static void
bpfdev_get_info(struct bpf_d_ext * bde, const struct bpfdev * bpf)
{

	bde->bde_bufsize = bpf->bpf_size;
	bde->bde_promisc = !!(bpf->bpf_flags & BPFF_PROMISC);
	bde->bde_state = BPF_IDLE;
	bde->bde_immediate = !!(bpf->bpf_flags & BPFF_IMMEDIATE);
	bde->bde_hdrcmplt = !!(bpf->bpf_flags & BPFF_HDRCMPLT);
	bde->bde_seesent = !!(bpf->bpf_flags & BPFF_SEESENT);
	/*
	 * NetBSD updates the process ID upon device open, close, ioctl, and
	 * poll. From those, only open and ioctl make sense for us. Sadly
	 * there is no way to indicate "no known PID" to netstat(1), so we
	 * cannot even save just the endpoint and look up the corresponding PID
	 * later, since the user process may be gone by then.
	 */
	bde->bde_pid = bpf->bpf_pid;
	bde->bde_rcount = bpf->bpf_stat.bs_recv;
	bde->bde_dcount = bpf->bpf_stat.bs_drop;
	bde->bde_ccount = bpf->bpf_stat.bs_capt;
	if (bpf->bpf_ifdev != NULL)
		strlcpy(bde->bde_ifname, ifdev_get_name(bpf->bpf_ifdev),
		    sizeof(bde->bde_ifname));
}

/*
 * Obtain statistics about open BPF devices ("peers"). This node may be
 * accessed by the superuser only. Used by netstat(1).
 */
static ssize_t
bpfdev_peers(struct rmib_call * call, struct rmib_node * node __unused,
	struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{
	struct bpfdev *bpf;
	struct bpf_d_ext bde;
	unsigned int slot;
	ssize_t off;
	int r, size, max;

	if (!(call->call_flags & RMIB_FLAG_AUTH))
		return EPERM;

	if (call->call_namelen != 2)
		return EINVAL;

	size = call->call_name[0];
	if (size < 0 || (size_t)size > sizeof(bde))
		return EINVAL;
	if (size == 0)
		size = sizeof(bde);
	max = call->call_name[1];

	off = 0;

	for (slot = 0; slot < __arraycount(bpf_array); slot++) {
		bpf = &bpf_array[slot];

		if (!(bpf->bpf_flags & BPFF_IN_USE))
			continue;

		if (rmib_inrange(oldp, off)) {
			memset(&bde, 0, sizeof(bde));

			bpfdev_get_info(&bde, bpf);

			if ((r = rmib_copyout(oldp, off, &bde, size)) < 0)
				return r;
		}

		off += sizeof(bde);
		if (max > 0 && --max == 0)
			break;
	}

	/* No slack needed: netstat(1) resizes its buffer as needed. */
	return off;
}

static const struct chardriver bpfdev_tab = {
	.cdr_open	= bpfdev_open,
	.cdr_close	= bpfdev_close,
	.cdr_read	= bpfdev_read,
	.cdr_write	= bpfdev_write,
	.cdr_ioctl	= bpfdev_ioctl,
	.cdr_cancel	= bpfdev_cancel,
	.cdr_select	= bpfdev_select
};

/*
 * Process a character driver request. Since the LWIP service offers character
 * devices for BPF only, it must be a request for a BPF device.
 */
void
bpfdev_process(message * m_ptr, int ipc_status)
{

	chardriver_process(&bpfdev_tab, m_ptr, ipc_status);
}