/*	$NetBSD: npf_session.c,v 1.18 2012/09/13 21:09:36 joerg Exp $	*/

/*-
 * Copyright (c) 2010-2012 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This material is based upon work partially supported by The
 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NPF session tracking for stateful filtering and translation.
 *
 * Overview
 *
 *	Session direction is identified by the direction of its first packet.
 *	Packets can be incoming or outgoing with respect to an interface.
 *	To describe a packet in the context of session direction, we use the
 *	terms "forwards stream" and "backwards stream".  All sessions have
 *	two embedded entries: npf_session_t::s_forw_entry for the forwards
 *	stream and npf_session_t::s_back_entry for the backwards stream.
 *	These entries (npf_sentry_t) contain the source and destination
 *	identifiers.  Note that an entry may contain translated values in
 *	the case of NAT.
 *
 *	Sessions can serve two purposes: "pass" or "NAT".  Sessions for the
 *	former purpose are created according to the rules with the "stateful"
 *	attribute and are used for stateful filtering.  Such sessions
 *	indicate that packets of the backwards stream should be passed
 *	without inspection of the ruleset.  The other purpose is to associate
 *	NAT with a connection (which implies connection tracking).  Such
 *	sessions are created according to the NAT policies and have a
 *	relationship with the NAT translation structure via
 *	npf_session_t::s_nat.  A single session can serve both purposes,
 *	which is a common case.
 *
 * Session life-cycle
 *
 *	Sessions are established when a packet matches a stateful rule or a
 *	NAT policy.  Both entries of an established session are inserted
 *	into the hashed tree.  A garbage collection thread periodically
 *	scans all session entries and, depending on the session properties
 *	(e.g. last activity time, protocol), removes the session entries
 *	and expires the actual sessions.
 *
 *	Each session has a reference count.  A reference is acquired on
 *	lookup and should be released by the caller.  The reference
 *	guarantees that the session will not be destroyed, although it may
 *	be expired.
 *
 * External session identifiers
 *
 *	Application-level gateways (ALGs) can inspect the packet and fill
 *	the packet cache (npf_cache_t) representing the IDs.
 *	This is done via the npf_alg_sessionid() call.  In such a case, the
 *	ALGs are responsible for correctly filling in the protocol,
 *	addresses and ports/IDs.
 *
 * Lock order
 *
 *	[ sess_lock -> ]
 *	npf_sehash_t::sh_lock ->
 *		npf_state_t::nst_lock
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: npf_session.c,v 1.18 2012/09/13 21:09:36 joerg Exp $");

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <net/pfil.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/systm.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include "npf_impl.h"

/*
 * Session structures: entry for embedding and the main structure.
 * WARNING: update npf_session_restore() when adding fields.
 */

struct npf_secomid;
typedef struct npf_secomid npf_secomid_t;

typedef struct {
	/* Session entry node and back-pointer to the actual session. */
	rb_node_t		se_rbnode;
	union {
		npf_session_t *	se_backptr;
		void *		se_common_id;
	};
	/* Size of the addresses. */
	int			se_alen;
	/* Source and destination addresses. */
	npf_addr_t		se_src_addr;
	npf_addr_t		se_dst_addr;
	/* Source and destination ports (TCP / UDP) or generic IDs. */
	uint16_t		se_src_id;
	uint16_t		se_dst_id;
} npf_sentry_t;

struct npf_session {
	/* Session "forwards" and "backwards" entries. */
	npf_sentry_t		s_forw_entry;
	npf_sentry_t		s_back_entry;
	/* Entry in the session hash or G/C list. */
	LIST_ENTRY(npf_session)	s_list;
	u_int			s_refcnt;
	/* Protocol and interface (common IDs). */
	struct npf_secomid {
		uint16_t	proto;
		uint16_t	if_idx;
	} s_common_id;
	/* Flags and the protocol state. */
	int			s_flags;
	npf_state_t		s_state;
	/* Association of rule procedure data. */
	npf_rproc_t *		s_rproc;
	/* NAT associated with this session (if any). */
	npf_nat_t *		s_nat;
	/* Last activity time (used to calculate expiration time). */
	struct timespec 	s_atime;
};

#define	SESS_HASH_BUCKETS	1024	/* XXX tune + make tunable */
#define	SESS_HASH_MASK		(SESS_HASH_BUCKETS - 1)

LIST_HEAD(npf_sesslist, npf_session);

struct npf_sehash {
	rb_tree_t		sh_tree;
	struct npf_sesslist	sh_list;
	krwlock_t		sh_lock;
	u_int			sh_count;
};

/*
 * Session flags:
 * - PFIL_IN and PFIL_OUT values are reserved for direction.
 * - SE_ACTIVE: session is active, i.e. visible on inspection.
 * - SE_PASS: a "pass" session.
 * - SE_EXPIRE: explicitly expire the session.
 * - SE_REMOVING: session is being removed (indicates need to enter G/C list).
 */
CTASSERT(PFIL_ALL == (0x001 | 0x002));
#define	SE_ACTIVE		0x004
#define	SE_PASS			0x008
#define	SE_EXPIRE		0x010
#define	SE_REMOVING		0x020

/*
 * Session tracking state: disabled (off), enabled (on) or flush request.
 */
enum { SESS_TRACKING_OFF, SESS_TRACKING_ON, SESS_TRACKING_FLUSH };
static int			sess_tracking	__cacheline_aligned;

/* Session hash table, lock and session cache. */
static npf_sehash_t *		sess_hashtbl	__read_mostly;
static pool_cache_t		sess_cache	__read_mostly;
static kmutex_t			sess_lock;
static kcondvar_t		sess_cv;
static lwp_t *			sess_gc_lwp;

#define	SESS_GC_INTERVAL	5	/* 5 sec */

static void	sess_tracking_stop(void);
static void	npf_session_destroy(npf_session_t *);
static void	npf_session_worker(void *) __dead;

/*
 * npf_session_sys{init,fini}: initialise/destroy session handling structures.
 *
 * The session table and G/C thread are initialised when session tracking
 * actually gets enabled via the npf_session_tracking() interface.
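 *
 * A minimal sketch of the expected call sequence (the attach/detach
 * context shown is an assumption, not part of this file):
 *
 *	npf_session_sysinit();			- module attach
 *	npf_session_tracking(true);		- create table, start G/C thread
 *	...
 *	npf_session_tracking(false);		- flush sessions, stop tracking
 *	npf_session_sysfini();			- module detach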
 */
void
npf_session_sysinit(void)
{
	sess_cache = pool_cache_init(sizeof(npf_session_t), coherency_unit,
	    0, 0, "npfsespl", NULL, IPL_NET, NULL, NULL, NULL);
	mutex_init(&sess_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&sess_cv, "npfgccv");
	sess_hashtbl = NULL;
	sess_gc_lwp = NULL;
	sess_tracking = SESS_TRACKING_OFF;
}

void
npf_session_sysfini(void)
{
	/* Disable tracking, flush all sessions. */
	npf_session_tracking(false);
	KASSERT(sess_tracking == SESS_TRACKING_OFF);
	KASSERT(sess_gc_lwp == NULL);

	/* Sessions might have been restored while the tracking is off. */
	if (sess_hashtbl) {
		sess_htable_destroy(sess_hashtbl);
	}
	pool_cache_destroy(sess_cache);
	cv_destroy(&sess_cv);
	mutex_destroy(&sess_lock);
}

/*
 * Session hash table and RB-tree helper routines.
 * The order is (src.id, dst.id, src.addr, dst.addr, common_id),
 * where (node1 < node2) shall be negative.
 */

static signed int
sess_rbtree_cmp_nodes(void *ctx, const void *n1, const void *n2)
{
	const npf_sentry_t * const sen1 = n1;
	const npf_sentry_t * const sen2 = n2;
	const int sz = sen1->se_alen;
	int ret;

	/*
	 * Ports are expected to vary most, therefore they are first.
	 */
	if (sen1->se_src_id != sen2->se_src_id) {
		return (sen1->se_src_id < sen2->se_src_id) ? -1 : 1;
	}
	if (sen1->se_dst_id != sen2->se_dst_id) {
		return (sen1->se_dst_id < sen2->se_dst_id) ? -1 : 1;
	}

	/*
	 * Note that the hash should minimise differentiation on addresses.
	 */
	if (sen1->se_alen != sen2->se_alen) {
		return (sen1->se_alen < sen2->se_alen) ? -1 : 1;
	}
	if ((ret = memcmp(&sen1->se_src_addr, &sen2->se_src_addr, sz)) != 0) {
		return ret;
	}
	if ((ret = memcmp(&sen1->se_dst_addr, &sen2->se_dst_addr, sz)) != 0) {
		return ret;
	}

	const npf_secomid_t *id1 = &sen1->se_backptr->s_common_id;
	const npf_secomid_t *id2 = ctx ? ctx : &sen2->se_backptr->s_common_id;
	return memcmp(id1, id2, sizeof(npf_secomid_t));
}

static signed int
sess_rbtree_cmp_key(void *ctx, const void *n1, const void *key)
{
	const npf_sentry_t * const sen1 = n1;
	const npf_sentry_t * const sen2 = key;

	KASSERT(sen1->se_alen != 0 && sen2->se_alen != 0);
	return sess_rbtree_cmp_nodes(sen2->se_common_id, sen1, sen2);
}

static const rb_tree_ops_t sess_rbtree_ops = {
	.rbto_compare_nodes	= sess_rbtree_cmp_nodes,
	.rbto_compare_key	= sess_rbtree_cmp_key,
	.rbto_node_offset	= offsetof(npf_sentry_t, se_rbnode),
	.rbto_context		= NULL
};

static inline npf_sehash_t *
sess_hash_bucket(npf_sehash_t *stbl, const npf_secomid_t *scid,
    const npf_sentry_t *sen)
{
	const int sz = sen->se_alen;
	uint32_t hash, mix;

	/*
	 * Sum the protocol, interface and both addresses (the same value
	 * for both directions).
	 */
	mix = scid->proto + scid->if_idx;
	mix += npf_addr_sum(sz, &sen->se_src_addr, &sen->se_dst_addr);
	hash = hash32_buf(&mix, sizeof(uint32_t), HASH32_BUF_INIT);
	return &stbl[hash & SESS_HASH_MASK];
}

npf_sehash_t *
sess_htable_create(void)
{
	npf_sehash_t *stbl, *sh;
	u_int i;

	stbl = kmem_zalloc(SESS_HASH_BUCKETS * sizeof(*sh), KM_SLEEP);
	if (stbl == NULL) {
		return NULL;
	}
	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
		sh = &stbl[i];
		LIST_INIT(&sh->sh_list);
		rb_tree_init(&sh->sh_tree, &sess_rbtree_ops);
		rw_init(&sh->sh_lock);
		sh->sh_count = 0;
	}
	return stbl;
}

void
sess_htable_destroy(npf_sehash_t *stbl)
{
	npf_sehash_t *sh;
	u_int i;

	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
		sh = &stbl[i];
		KASSERT(sh->sh_count == 0);
		KASSERT(LIST_EMPTY(&sh->sh_list));
		KASSERT(!rb_tree_iterate(&sh->sh_tree, NULL, RB_DIR_LEFT));
		rw_destroy(&sh->sh_lock);
	}
	kmem_free(stbl, SESS_HASH_BUCKETS * sizeof(*sh));
}

void
sess_htable_reload(npf_sehash_t *stbl)
{
	npf_sehash_t *oldstbl;

	/*
	 * Flush all existing entries.
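	 *
	 * The flush is a handshake with the G/C thread: request the flush,
	 * wake the thread and wait until it has collected all sessions and
	 * has set the state back to SESS_TRACKING_ON.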
	 */
	mutex_enter(&sess_lock);
	if (sess_gc_lwp) {
		sess_tracking = SESS_TRACKING_FLUSH;
		cv_broadcast(&sess_cv);
	}
	while (sess_tracking == SESS_TRACKING_FLUSH) {
		cv_wait(&sess_cv, &sess_lock);
	}

	/* Set a new session table. */
	oldstbl = sess_hashtbl;
	sess_hashtbl = stbl;
	mutex_exit(&sess_lock);

	/* Destroy the old table. */
	if (oldstbl) {
		sess_htable_destroy(oldstbl);
	}
}

/*
 * Session tracking routines.  Note: these manage the tracking structures.
 */

static int
sess_tracking_start(void)
{
	npf_sehash_t *nstbl;

	nstbl = sess_htable_create();
	if (nstbl == NULL) {
		return ENOMEM;
	}

	/* Note: should be visible before the thread starts. */
	mutex_enter(&sess_lock);
	if (sess_tracking != SESS_TRACKING_OFF) {
		mutex_exit(&sess_lock);
		sess_htable_destroy(nstbl);
		return EEXIST;
	}
	sess_hashtbl = nstbl;
	sess_tracking = SESS_TRACKING_ON;
	mutex_exit(&sess_lock);

	if (kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
	    npf_session_worker, NULL, &sess_gc_lwp, "npfgc")) {
		sess_tracking_stop();
		return ENOMEM;
	}
	return 0;
}

static void
sess_tracking_stop(void)
{
	npf_sehash_t *stbl;

	mutex_enter(&sess_lock);
	if (sess_tracking == SESS_TRACKING_OFF) {
		mutex_exit(&sess_lock);
		return;
	}

	/* Notify the G/C thread to flush all sessions. */
	sess_tracking = SESS_TRACKING_OFF;
	cv_broadcast(&sess_cv);

	/* Wait for the exit. */
	while (sess_gc_lwp != NULL) {
		cv_wait(&sess_cv, &sess_lock);
	}
	stbl = sess_hashtbl;
	sess_hashtbl = NULL;
	mutex_exit(&sess_lock);

	sess_htable_destroy(stbl);
	pool_cache_invalidate(sess_cache);
}

/*
 * npf_session_tracking: enable/disable session tracking.
 */
int
npf_session_tracking(bool track)
{
	if (sess_tracking == SESS_TRACKING_OFF && track) {
		/* Disabled -> Enable. */
		return sess_tracking_start();
	}
	if (sess_tracking == SESS_TRACKING_ON && !track) {
		/* Enabled -> Disable. */
		sess_tracking_stop();
		return 0;
	}
	return 0;
}

/*
 * npf_session_inspect: look up an established session (connection).
 *
 * => If found, we will hold a reference for the caller.
 */
npf_session_t *
npf_session_inspect(npf_cache_t *npc, nbuf_t *nbuf, const ifnet_t *ifp,
    const int di, int *error)
{
	npf_sehash_t *sh;
	npf_sentry_t *sen;
	npf_session_t *se;
	int flags;

	/*
	 * Check if session tracking is on.  If layer 3 and 4 are not
	 * cached, the protocol is not supported or the packet is invalid.
	 */
	if (sess_tracking == SESS_TRACKING_OFF) {
		return NULL;
	}
	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
		return NULL;
	}

	/*
	 * Construct a key for the hash and tree lookup.  Execute the ALG
	 * session helpers, which may construct a custom key.
	 */
	npf_cache_t algkey = { .npc_info = 0 }, *key;
	npf_sentry_t senkey;

	if (!npf_alg_sessionid(npc, nbuf, &algkey)) {
		/* Default: use the cache data of the original packet. */
		key = npc;
	} else {
		/* Unique IDs filled by the ALG in a separate key cache. */
		key = &algkey;
	}

	/*
	 * Note: take the protocol from the key.
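	 * An ALG may have supplied a key for a different protocol than
	 * that of the inspected packet (e.g. IDs reconstructed from an
	 * ICMP error payload), so the key cache, not npc, is
	 * authoritative here.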
	 */
	const u_int proto = npf_cache_ipproto(key);

	switch (proto) {
	case IPPROTO_TCP: {
		const struct tcphdr *th = &key->npc_l4.tcp;
		senkey.se_src_id = th->th_sport;
		senkey.se_dst_id = th->th_dport;
		break;
	}
	case IPPROTO_UDP: {
		const struct udphdr *uh = &key->npc_l4.udp;
		senkey.se_src_id = uh->uh_sport;
		senkey.se_dst_id = uh->uh_dport;
		break;
	}
	case IPPROTO_ICMP:
		if (npf_iscached(key, NPC_ICMP_ID)) {
			const struct icmp *ic = &key->npc_l4.icmp;
			senkey.se_src_id = ic->icmp_id;
			senkey.se_dst_id = ic->icmp_id;
			break;
		}
		return NULL;
	case IPPROTO_ICMPV6:
		if (npf_iscached(key, NPC_ICMP_ID)) {
			const struct icmp6_hdr *ic6 = &key->npc_l4.icmp6;
			senkey.se_src_id = ic6->icmp6_id;
			senkey.se_dst_id = ic6->icmp6_id;
			break;
		}
		return NULL;
	default:
		/* Unsupported protocol. */
		return NULL;
	}

	KASSERT(key->npc_srcip && key->npc_dstip && key->npc_alen > 0);
	memcpy(&senkey.se_src_addr, key->npc_srcip, key->npc_alen);
	memcpy(&senkey.se_dst_addr, key->npc_dstip, key->npc_alen);
	senkey.se_alen = key->npc_alen;

	/*
	 * Note: this is a special case where we use the common ID pointer
	 * to pass the structure to the key comparator.
	 */
	npf_secomid_t scid;
	memset(&scid, 0, sizeof(npf_secomid_t));
	scid = (npf_secomid_t){ .proto = proto, .if_idx = ifp->if_index };
	senkey.se_common_id = &scid;

	/*
	 * Get the hash bucket from the cached key data.
	 * Pre-check if there are any entries in the hash table.
	 */
	sh = sess_hash_bucket(sess_hashtbl, &scid, &senkey);
	if (sh->sh_count == 0) {
		return NULL;
	}

	/* Look up the tree for a session entry and get the actual session. */
	rw_enter(&sh->sh_lock, RW_READER);
	sen = rb_tree_find_node(&sh->sh_tree, &senkey);
	if (sen == NULL) {
		rw_exit(&sh->sh_lock);
		return NULL;
	}
	se = sen->se_backptr;
	KASSERT(se->s_common_id.proto == proto);
	KASSERT(se->s_common_id.if_idx == ifp->if_index);
	flags = se->s_flags;

	/* Check if the session is active and not expired. */
	if (__predict_false((flags & (SE_ACTIVE | SE_EXPIRE)) != SE_ACTIVE)) {
		rw_exit(&sh->sh_lock);
		return NULL;
	}

	/* Match the directions of the session entry and the packet. */
	const bool sforw = (sen == &se->s_forw_entry);
	const bool pforw = (flags & PFIL_ALL) == di;
	if (__predict_false(sforw != pforw)) {
		rw_exit(&sh->sh_lock);
		return NULL;
	}

	/* Inspect the protocol data and handle state changes. */
	if (npf_state_inspect(npc, nbuf, &se->s_state, sforw)) {
		/* Update the last activity time and hold a reference. */
		getnanouptime(&se->s_atime);
		atomic_inc_uint(&se->s_refcnt);
	} else {
		/* Silently block invalid packets. */
		npf_stats_inc(NPF_STAT_INVALID_STATE);
		*error = ENETUNREACH;
		se = NULL;
	}
	rw_exit(&sh->sh_lock);
	return se;
}

/*
 * npf_session_establish: create a new session, insert into the global list.
 *
 * => Session is created with the reference held for the caller.
 * => Session will be activated on the first reference release.
 */
npf_session_t *
npf_session_establish(const npf_cache_t *npc, nbuf_t *nbuf,
    const ifnet_t *ifp, const int di)
{
	const struct tcphdr *th;
	const struct udphdr *uh;
	npf_sentry_t *fw, *bk;
	npf_sehash_t *sh;
	npf_session_t *se;
	u_int proto, alen;
	bool ok;

	/*
	 * Check if session tracking is on.  If layer 3 and 4 are not
	 * cached, the protocol is not supported or the packet is invalid.
	 */
	if (sess_tracking == SESS_TRACKING_OFF) {
		return NULL;
	}
	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
		return NULL;
	}

	/*
	 * Allocate and initialise the new state.
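	 * PR_NOWAIT is used since we are in the packet path and may not
	 * sleep; on failure, NULL is returned and no session is tracked
	 * for this packet.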
	 */
	se = pool_cache_get(sess_cache, PR_NOWAIT);
	if (__predict_false(se == NULL)) {
		return NULL;
	}
	NPF_PRINTF(("NPF: create se %p\n", se));
	npf_stats_inc(NPF_STAT_SESSION_CREATE);

	/* Reference count and flags (indicate direction). */
	se->s_refcnt = 1;
	se->s_flags = (di & PFIL_ALL);
	se->s_rproc = NULL;
	se->s_nat = NULL;

	/* Initialise the protocol state. */
	if (!npf_state_init(npc, nbuf, &se->s_state)) {
		ok = false;
		goto out;
	}

	/* Unique IDs: IP addresses.  Set up the "forwards" entry first. */
	KASSERT(npf_iscached(npc, NPC_IP46));
	alen = npc->npc_alen;
	fw = &se->s_forw_entry;
	memcpy(&fw->se_src_addr, npc->npc_srcip, alen);
	memcpy(&fw->se_dst_addr, npc->npc_dstip, alen);

	/* Protocol and interface. */
	proto = npf_cache_ipproto(npc);
	memset(&se->s_common_id, 0, sizeof(npf_secomid_t));
	se->s_common_id.proto = proto;
	se->s_common_id.if_idx = ifp->if_index;

	switch (proto) {
	case IPPROTO_TCP:
		KASSERT(npf_iscached(npc, NPC_TCP));
		th = &npc->npc_l4.tcp;
		/* Additional IDs: ports. */
		fw->se_src_id = th->th_sport;
		fw->se_dst_id = th->th_dport;
		break;
	case IPPROTO_UDP:
		KASSERT(npf_iscached(npc, NPC_UDP));
		/* Additional IDs: ports. */
		uh = &npc->npc_l4.udp;
		fw->se_src_id = uh->uh_sport;
		fw->se_dst_id = uh->uh_dport;
		break;
	case IPPROTO_ICMP:
		if (npf_iscached(npc, NPC_ICMP_ID)) {
			/* ICMP query ID. */
			const struct icmp *ic = &npc->npc_l4.icmp;
			fw->se_src_id = ic->icmp_id;
			fw->se_dst_id = ic->icmp_id;
			break;
		}
		ok = false;
		goto out;
	case IPPROTO_ICMPV6:
		if (npf_iscached(npc, NPC_ICMP_ID)) {
			/* ICMP query ID. */
			const struct icmp6_hdr *ic6 = &npc->npc_l4.icmp6;
			fw->se_src_id = ic6->icmp6_id;
			fw->se_dst_id = ic6->icmp6_id;
			break;
		}
		ok = false;
		goto out;
	default:
		/* Unsupported. */
		ok = false;
		goto out;
	}

	/* Set the last activity time for the new session. */
	getnanouptime(&se->s_atime);

	/* Set up the inverted "backwards" entry. */
	bk = &se->s_back_entry;
	memcpy(&bk->se_src_addr, &fw->se_dst_addr, alen);
	memcpy(&bk->se_dst_addr, &fw->se_src_addr, alen);
	bk->se_src_id = fw->se_dst_id;
	bk->se_dst_id = fw->se_src_id;

	/* Finish the setup of the entries. */
	fw->se_backptr = bk->se_backptr = se;
	fw->se_alen = bk->se_alen = alen;

	/*
	 * Insert the session and both entries into the tree.
	 */
	sh = sess_hash_bucket(sess_hashtbl, &se->s_common_id, fw);
	KASSERT(sh == sess_hash_bucket(sess_hashtbl, &se->s_common_id, bk));

	rw_enter(&sh->sh_lock, RW_WRITER);
	ok = (rb_tree_insert_node(&sh->sh_tree, fw) == fw);
	if (__predict_true(ok)) {
		ok = (rb_tree_insert_node(&sh->sh_tree, bk) == bk);
		if (__predict_true(ok)) {
			/* Success: insert the session, count both entries. */
			LIST_INSERT_HEAD(&sh->sh_list, se, s_list);
			sh->sh_count += 2;
			NPF_PRINTF(("NPF: establish se %p\n", se));
		} else {
			/* Race with a duplicate packet. */
			rb_tree_remove_node(&sh->sh_tree, fw);
			npf_stats_inc(NPF_STAT_RACE_SESSION);
		}
	}
	rw_exit(&sh->sh_lock);
out:
	if (__predict_false(!ok)) {
		npf_session_destroy(se);
		return NULL;
	}
	return se;
}

static void
npf_session_destroy(npf_session_t *se)
{
	if (se->s_nat) {
		/* Release any NAT related structures. */
		npf_nat_expire(se->s_nat);
	}
	if (se->s_rproc) {
		/* Release the rule procedure. */
		npf_rproc_release(se->s_rproc);
	}

	/* Destroy the state. */
	npf_state_destroy(&se->s_state);

	/* Free the structure, increase the counter. */
	pool_cache_put(sess_cache, se);
	npf_stats_inc(NPF_STAT_SESSION_DESTROY);
	NPF_PRINTF(("NPF: se %p destroyed\n", se));
}

/*
 * npf_session_setnat: associate a NAT entry with the session, update
 * and re-insert the session entry accordingly.
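 *
 * A hedged sketch of the expected call sequence from the packet path
 * (the surrounding handler is assumed, not part of this file):
 *
 *	se = npf_session_establish(npc, nbuf, ifp, di);
 *	error = npf_session_setnat(se, nt, di);
 *	... on EISCONN, the caller drops the packet and the structures
 *	associated with it ...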
 */
int
npf_session_setnat(npf_session_t *se, npf_nat_t *nt, const int di)
{
	npf_sehash_t *sh;
	npf_sentry_t *sen;
	npf_addr_t *taddr;
	in_port_t tport;
	bool ok;

	KASSERT(se->s_refcnt > 0);

	/* First, atomically check and associate the NAT entry. */
	if (atomic_cas_ptr(&se->s_nat, NULL, nt) != NULL) {
		/* Race: see below for the description. */
		npf_stats_inc(NPF_STAT_RACE_NAT);
		return EISCONN;
	}

	/*
	 * Update, re-hash and re-insert the "backwards" entry according to
	 * the translation.  First, remove the entry from the tree.  Note
	 * that a duplicate packet may establish a duplicate session while
	 * the lock is released.  In such a case, the caller will drop this
	 * packet and the structures associated with it.  Such a race
	 * condition should never happen in practice, though.
	 */
	sen = &se->s_back_entry;
	sh = sess_hash_bucket(sess_hashtbl, &se->s_common_id, sen);
	rw_enter(&sh->sh_lock, RW_WRITER);
	rb_tree_remove_node(&sh->sh_tree, sen);
	sh->sh_count--;
	rw_exit(&sh->sh_lock);

	/*
	 * New source/destination and hash.  Note that the source and
	 * destination are inverted, since we are handling the "backwards"
	 * entry.
	 */
	npf_nat_gettrans(nt, &taddr, &tport);
	if (di == PFIL_OUT) {
		/* NPF_NATOUT: source in "forwards" = destination. */
		memcpy(&sen->se_dst_addr, taddr, sen->se_alen);
		if (tport) {
			sen->se_dst_id = tport;
		}
	} else {
		/* NPF_NATIN: destination in "forwards" = source. */
		memcpy(&sen->se_src_addr, taddr, sen->se_alen);
		if (tport) {
			sen->se_src_id = tport;
		}
	}
	sh = sess_hash_bucket(sess_hashtbl, &se->s_common_id, sen);

	/* Insert into the new bucket. */
	rw_enter(&sh->sh_lock, RW_WRITER);
	ok = (rb_tree_insert_node(&sh->sh_tree, sen) == sen);
	if (__predict_true(ok)) {
		sh->sh_count++;
		NPF_PRINTF(("NPF: se %p assoc with nat %p\n", se, se->s_nat));
	} else {
		/* FIXMEgc */
		printf("npf_session_setnat: Houston, we've had a problem.\n");
	}
	rw_exit(&sh->sh_lock);
	return ok ? 0 : EISCONN;
}

/*
 * npf_session_expire: explicitly mark the session as expired.
 */
void
npf_session_expire(npf_session_t *se)
{
	/* KASSERT(se->s_refcnt > 0); XXX: npf_nat_freepolicy() */
	atomic_or_uint(&se->s_flags, SE_EXPIRE);
}

/*
 * npf_session_pass: return true if the session is a "pass" one,
 * otherwise false.
 */
bool
npf_session_pass(const npf_session_t *se, npf_rproc_t **rp)
{
	KASSERT(se->s_refcnt > 0);
	if ((se->s_flags & SE_PASS) != 0) {
		*rp = se->s_rproc;
		return true;
	}
	return false;
}

/*
 * npf_session_setpass: mark the session as a "pass" one and associate a
 * rule procedure with it.
 */
void
npf_session_setpass(npf_session_t *se, npf_rproc_t *rp)
{
	KASSERT((se->s_flags & SE_ACTIVE) == 0);
	KASSERT(se->s_refcnt > 0);
	KASSERT(se->s_rproc == NULL);

	/* No need for atomics, since the session is not yet active. */
	se->s_flags |= SE_PASS;
	se->s_rproc = rp;
}

/*
 * npf_session_release: release a reference, which might allow the G/C
 * thread to destroy this session.
 */
void
npf_session_release(npf_session_t *se)
{
	KASSERT(se->s_refcnt > 0);
	if ((se->s_flags & SE_ACTIVE) == 0) {
		/* Activate: after this point, session is globally visible. */
		se->s_flags |= SE_ACTIVE;
	}
	atomic_dec_uint(&se->s_refcnt);
}

/*
 * npf_session_retnat: return the associated NAT data entry and indicate
 * whether it is a "forwards" or "backwards" stream.
 */
npf_nat_t *
npf_session_retnat(npf_session_t *se, const int di, bool *forw)
{
	KASSERT(se->s_refcnt > 0);
	*forw = (se->s_flags & PFIL_ALL) == di;
	return se->s_nat;
}

/*
 * npf_session_expired: criterion to check if the session is expired.
 */
static inline bool
npf_session_expired(const npf_session_t *se, const struct timespec *tsnow)
{
	const u_int proto = se->s_common_id.proto;
	const int etime = npf_state_etime(&se->s_state, proto);
	struct timespec tsdiff;

	if (__predict_false(se->s_flags & SE_EXPIRE)) {
		/* Explicitly marked to be expired. */
		return true;
	}
	timespecsub(tsnow, &se->s_atime, &tsdiff);
	return __predict_false(tsdiff.tv_sec > etime);
}

/*
 * npf_session_gc: scan all sessions and insert all expired ones into
 * the G/C list.
 */
static void
npf_session_gc(struct npf_sesslist *gc_list, bool flushall)
{
	struct timespec tsnow;
	npf_sentry_t *sen, *nsen;
	npf_session_t *se;
	u_int i;

	getnanouptime(&tsnow);

	/* Scan each session entry in the hash table. */
	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
		npf_sehash_t *sh;

		sh = &sess_hashtbl[i];
		if (sh->sh_count == 0) {
			continue;
		}
		rw_enter(&sh->sh_lock, RW_WRITER);
		/* For each (left -> right) ... */
		sen = rb_tree_iterate(&sh->sh_tree, NULL, RB_DIR_LEFT);
		while (sen != NULL) {
			/* Get the session, pre-iterate, skip if not expired. */
			se = sen->se_backptr;
			nsen = rb_tree_iterate(&sh->sh_tree, sen, RB_DIR_RIGHT);
			if (!npf_session_expired(se, &tsnow) && !flushall) {
				KASSERT((se->s_flags & SE_REMOVING) == 0);
				sen = nsen;
				continue;
			}

			/* Expired - remove from the tree. */
			rb_tree_remove_node(&sh->sh_tree, sen);
			sh->sh_count--;

			/*
			 * Set the removal bit when the first entry is
			 * removed.  If it is already set, then the second
			 * entry has also been removed, therefore move the
			 * session into the G/C list.
			 */
			if (se->s_flags & SE_REMOVING) {
				LIST_REMOVE(se, s_list);
				LIST_INSERT_HEAD(gc_list, se, s_list);
			} else {
				atomic_or_uint(&se->s_flags, SE_REMOVING);
			}

			/* Next.. */
			sen = nsen;
		}
		KASSERT(!flushall || sh->sh_count == 0);
		rw_exit(&sh->sh_lock);
	}
}

/*
 * npf_session_freelist: destroy all sessions in the given G/C list
 * which have no references.
 */
static void
npf_session_freelist(struct npf_sesslist *gc_list)
{
	npf_session_t *se, *nse;

	se = LIST_FIRST(gc_list);
	while (se != NULL) {
		nse = LIST_NEXT(se, s_list);
		if (se->s_refcnt == 0) {
			/* Destroy only if there are no references. */
			LIST_REMOVE(se, s_list);
			npf_session_destroy(se);
		}
		se = nse;
	}
}

/*
 * npf_session_worker: G/C worker thread.
 */
static void
npf_session_worker(void *arg)
{
	struct npf_sesslist gc_list;
	bool flushreq = false;

	LIST_INIT(&gc_list);
	do {
		/* Periodically wake up, unless notified earlier. */
		mutex_enter(&sess_lock);
		/* Note: the interval is in seconds; cv_timedwait() takes ticks. */
		(void)cv_timedwait(&sess_cv, &sess_lock, SESS_GC_INTERVAL * hz);
		flushreq = (sess_tracking != SESS_TRACKING_ON);
		npf_session_gc(&gc_list, flushreq);
		if (sess_tracking == SESS_TRACKING_FLUSH) {
			/* Flush was requested - on again; notify the waiter. */
			sess_tracking = SESS_TRACKING_ON;
			cv_broadcast(&sess_cv);
		}
		mutex_exit(&sess_lock);
		npf_session_freelist(&gc_list);
	} while (sess_tracking != SESS_TRACKING_OFF);

	/* Wait for any referenced sessions to be released. */
	while (!LIST_EMPTY(&gc_list)) {
		kpause("npfgcfr", false, 1, NULL);
		npf_session_freelist(&gc_list);
	}

	/* Notify that we are done. */
	mutex_enter(&sess_lock);
	sess_gc_lwp = NULL;
	cv_broadcast(&sess_cv);
	mutex_exit(&sess_lock);
	kthread_exit(0);
}

/*
 * npf_session_save: construct a list of sessions prepared for saving.
 * Note: this is expected to be an expensive operation.
 */
int
npf_session_save(prop_array_t selist, prop_array_t nplist)
{
	npf_sehash_t *sh;
	npf_session_t *se;
	int error = 0, i;

	/*
	 * If not tracking - empty.
	 */
	mutex_enter(&sess_lock);
	if (sess_tracking == SESS_TRACKING_OFF) {
		mutex_exit(&sess_lock);
		return 0;
	}

	/*
	 * Note: hold the session lock to prevent the G/C thread from
	 * expiring and removing the sessions.  Therefore, there is no
	 * need to exclusively lock the entire hash table.
	 */
	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
		sh = &sess_hashtbl[i];
		if (sh->sh_count == 0) {
			/* Empty bucket, next. */
			continue;
		}
		rw_enter(&sh->sh_lock, RW_READER);
		LIST_FOREACH(se, &sh->sh_list, s_list) {
			prop_dictionary_t sedict;
			prop_data_t sd;

			/*
			 * Create a copy of the npf_session_t binary data
			 * and the unique identifier, which may be a pointer
			 * value.  Set the data, insert into the array.
			 */
			sedict = prop_dictionary_create();
			sd = prop_data_create_data(se, sizeof(npf_session_t));
			prop_dictionary_set(sedict, "data", sd);
			prop_object_release(sd);

			CTASSERT(sizeof(uintptr_t) <= sizeof(uint64_t));
			prop_dictionary_set_uint64(
			    sedict, "id-ptr", (uintptr_t)se);

			if (se->s_nat) {
				/* Save the NAT entry and policy, if any. */
				error = npf_nat_save(sedict, nplist, se->s_nat);
				if (error) {
					prop_object_release(sedict);
					break;
				}
			}
			prop_array_add(selist, sedict);
			prop_object_release(sedict);
		}
		rw_exit(&sh->sh_lock);
		if (error) {
			/* Note: the caller will free the array. */
			break;
		}
	}
	mutex_exit(&sess_lock);
	return error;
}

/*
 * npf_session_restore: fully reconstruct a single session from a dictionary
 * and insert it into the given hash table.
 */
int
npf_session_restore(npf_sehash_t *stbl, prop_dictionary_t sedict)
{
	npf_session_t *se;
	npf_sehash_t *fsh, *bsh;
	npf_sentry_t *fw, *bk;
	prop_object_t obj;
	npf_state_t *nst;
	const void *d;
	int error = 0;

	/* Get the pointer to the npf_session_t data and check the size. */
	obj = prop_dictionary_get(sedict, "data");
	d = prop_data_data_nocopy(obj);
	if (d == NULL || prop_data_size(obj) != sizeof(npf_session_t)) {
		return EINVAL;
	}

	/*
	 * Copy the binary data of the structure.  Warning: must reset the
	 * reference count, the rule procedure and the state lock.
	 */
	se = pool_cache_get(sess_cache, PR_WAITOK);
	memcpy(se, d, sizeof(npf_session_t));
	se->s_refcnt = 0;
	se->s_rproc = NULL;
	nst = &se->s_state;
	mutex_init(&nst->nst_lock, MUTEX_DEFAULT, IPL_SOFTNET);

	/*
	 * Reconstruct the NAT association, if any (npf_nat_restore()
	 * returns NULL otherwise).  Warning: must not leave a stale entry.
	 */
	se->s_nat = npf_nat_restore(sedict, se);

	/*
	 * Find the hash bucket and insert each entry.
	 * Warning: must reset the back pointers.
	 */
	fw = &se->s_forw_entry;
	fw->se_backptr = se;
	fsh = sess_hash_bucket(stbl, &se->s_common_id, fw);
	if (rb_tree_insert_node(&fsh->sh_tree, fw) != fw) {
		error = EINVAL;
		goto out;
	}
	fsh->sh_count++;

	bk = &se->s_back_entry;
	bk->se_backptr = se;
	bsh = sess_hash_bucket(stbl, &se->s_common_id, bk);
	if (rb_tree_insert_node(&bsh->sh_tree, bk) != bk) {
		rb_tree_remove_node(&fsh->sh_tree, fw);
		fsh->sh_count--;
		error = EINVAL;
		goto out;
	}
	bsh->sh_count++;

	/* Note: the bucket of the forwards entry is for the session list. */
	LIST_INSERT_HEAD(&fsh->sh_list, se, s_list);
out:
	if (error) {
		/*
		 * Drop, in the case of a duplicate.
		 */
		npf_session_destroy(se);
	}
	return error;
}

#if defined(DDB) || defined(_NPF_TESTING)

void
npf_sessions_dump(void)
{
	npf_sehash_t *sh;
	npf_sentry_t *sen;
	npf_session_t *se;
	struct timespec tsnow;

	getnanouptime(&tsnow);
	for (u_int i = 0; i < SESS_HASH_BUCKETS; i++) {
		sh = &sess_hashtbl[i];
		if (sh->sh_count == 0) {
			KASSERT(rb_tree_iterate(&sh->sh_tree,
			    NULL, RB_DIR_LEFT) == NULL);
			continue;
		}
		printf("s_bucket %d (%p, count %d)\n", i, sh, sh->sh_count);
		RB_TREE_FOREACH(sen, &sh->sh_tree) {
			struct timespec tsdiff;
			struct in_addr ip;
			int proto, etime;

			se = sen->se_backptr;
			proto = se->s_common_id.proto;
			timespecsub(&tsnow, &se->s_atime, &tsdiff);
			etime = npf_state_etime(&se->s_state, proto);

			printf(" %p[%p]: %s proto %d flags 0x%x tsdiff %d "
			    "etime %d\n", sen, se, sen == &se->s_forw_entry ?
			    "forw" : "back", proto, se->s_flags,
			    (int)tsdiff.tv_sec, etime);
			memcpy(&ip, &sen->se_src_addr, sizeof(ip));
			printf("\tsrc (%s, %d) ",
			    inet_ntoa(ip), ntohs(sen->se_src_id));
			memcpy(&ip, &sen->se_dst_addr, sizeof(ip));
			printf("dst (%s, %d)\n",
			    inet_ntoa(ip), ntohs(sen->se_dst_id));
			npf_state_dump(&se->s_state);
			if (se->s_nat != NULL) {
				npf_nat_dump(se->s_nat);
			}
		}
	}
}

#endif