/*	$NetBSD: npf_session.c,v 1.18 2012/09/13 21:09:36 joerg Exp $	*/

/*-
 * Copyright (c) 2010-2012 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This material is based upon work partially supported by The
 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NPF session tracking for stateful filtering and translation.
 *
 * Overview
 *
 *	Session direction is identified by the direction of its first packet.
 *	Packets can be incoming or outgoing with respect to an interface.
 *	To describe a packet in the context of session direction, we use the
 *	terms "forwards stream" and "backwards stream".  All sessions have
 *	two embedded entries: npf_session_t::s_forw_entry for the forwards
 *	stream and npf_session_t::s_back_entry for the backwards stream.
 *	These entries (npf_sentry_t) contain the source and destination
 *	identifiers.  Note that an entry may contain translated values in
 *	the case of NAT.
 *
 *	Sessions can serve two purposes: "pass" or "NAT".  Sessions for the
 *	former purpose are created according to the rules with the "stateful"
 *	attribute and are used for stateful filtering.  Such sessions
 *	indicate that packets of the backwards stream should be passed
 *	without inspection of the ruleset.  The other purpose is to associate
 *	NAT with a connection (which implies connection tracking).  Such
 *	sessions are created according to the NAT policies and have a
 *	relationship with the NAT translation structure via
 *	npf_session_t::s_nat.  A single session can serve both purposes,
 *	which is a common case.
 *
 * Session life-cycle
 *
 *	Sessions are established when a packet matches a stateful rule or a
 *	NAT policy.  Both entries of an established session are inserted
 *	into the hashed tree.  A garbage collection thread periodically
 *	scans all session entries and, depending on the session properties
 *	(e.g. last activity time, protocol), removes the session entries
 *	and expires the actual sessions.
 *
 *	Each session has a reference count.  A reference is acquired on
 *	lookup and should be released by the caller.  The reference
 *	guarantees that the session will not be destroyed, although it may
 *	be expired.
 *
 * External session identifiers
 *
 *	Application-level gateways (ALGs) can inspect the packet and fill
 *	the packet cache (npf_cache_t) representing the IDs.
 *	This is done via the npf_alg_sessionid() call.  In such a case, the
 *	ALGs are responsible for correctly filling in the protocol,
 *	addresses and ports/IDs.
 *
 * Lock order
 *
 *	[ sess_lock -> ]
 *	npf_sehash_t::sh_lock ->
 *		npf_state_t::nst_lock
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: npf_session.c,v 1.18 2012/09/13 21:09:36 joerg Exp $");

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <net/pfil.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/systm.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include "npf_impl.h"

/*
 * Session structures: entry for embedding and the main structure.
 * WARNING: update npf_session_restore() when adding fields.
 */

struct npf_secomid;
typedef struct npf_secomid npf_secomid_t;

typedef struct {
	/* Session entry node and back-pointer to the actual session. */
	rb_node_t		se_rbnode;
	union {
		npf_session_t *	se_backptr;
		void *		se_common_id;
	};
	/* Size of the addresses. */
	int			se_alen;
	/* Source and destination addresses. */
	npf_addr_t		se_src_addr;
	npf_addr_t		se_dst_addr;
	/* Source and destination ports (TCP / UDP) or generic IDs. */
	uint16_t		se_src_id;
	uint16_t		se_dst_id;
} npf_sentry_t;

struct npf_session {
	/* Session "forwards" and "backwards" entries. */
	npf_sentry_t		s_forw_entry;
	npf_sentry_t		s_back_entry;
	/* Entry in the session hash or G/C list. */
	LIST_ENTRY(npf_session)	s_list;
	u_int			s_refcnt;
	/* Protocol and interface (common IDs). */
	struct npf_secomid {
		uint16_t	proto;
		uint16_t	if_idx;
	} s_common_id;
	/* Flags and the protocol state. */
	int			s_flags;
	npf_state_t		s_state;
	/* Association of rule procedure data. */
	npf_rproc_t *		s_rproc;
	/* NAT associated with this session (if any). */
	npf_nat_t *		s_nat;
	/* Last activity time (used to calculate expiration time). */
	struct timespec 	s_atime;
};

#define	SESS_HASH_BUCKETS	1024	/* XXX tune + make tunable */
#define	SESS_HASH_MASK		(SESS_HASH_BUCKETS - 1)

LIST_HEAD(npf_sesslist, npf_session);

struct npf_sehash {
	rb_tree_t		sh_tree;
	struct npf_sesslist	sh_list;
	krwlock_t		sh_lock;
	u_int			sh_count;
};

/*
 * Session flags:
 * - PFIL_IN and PFIL_OUT values are reserved for direction.
 * - SE_ACTIVE: session is active, i.e. visible on inspection.
 * - SE_PASS: a "pass" session.
 * - SE_EXPIRE: explicitly expire the session.
 * - SE_REMOVING: session is being removed (indicates need to enter G/C list).
 */
CTASSERT(PFIL_ALL == (0x001 | 0x002));
#define	SE_ACTIVE		0x004
#define	SE_PASS			0x008
#define	SE_EXPIRE		0x010
#define	SE_REMOVING		0x020

/*
 * Session tracking state: disabled (off), enabled (on) or flush request.
 */
enum { SESS_TRACKING_OFF, SESS_TRACKING_ON, SESS_TRACKING_FLUSH };
static int			sess_tracking	__cacheline_aligned;

/* Session hash table, lock and session cache. */
static npf_sehash_t *		sess_hashtbl	__read_mostly;
static pool_cache_t		sess_cache	__read_mostly;
static kmutex_t			sess_lock;
static kcondvar_t		sess_cv;
static lwp_t *			sess_gc_lwp;

#define	SESS_GC_INTERVAL	5	/* 5 sec */

static void	sess_tracking_stop(void);
static void	npf_session_destroy(npf_session_t *);
static void	npf_session_worker(void *) __dead;

/*
 * npf_session_sys{init,fini}: initialise/destroy session handling structures.
 *
 * The session table and G/C thread are initialised when session tracking
 * actually gets enabled via the npf_session_tracking() interface.
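 *
 * A minimal sketch of the expected call sequence (the attach/detach
 * context shown is an assumption, not part of this file):
 *
 *	npf_session_sysinit();			- module attach
 *	npf_session_tracking(true);		- create table, start G/C thread
 *	...
 *	npf_session_tracking(false);		- flush sessions, stop tracking
 *	npf_session_sysfini();			- module detach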
 */
void
npf_session_sysinit(void)
{
	sess_cache = pool_cache_init(sizeof(npf_session_t), coherency_unit,
	    0, 0, "npfsespl", NULL, IPL_NET, NULL, NULL, NULL);
	mutex_init(&sess_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&sess_cv, "npfgccv");
	sess_hashtbl = NULL;
	sess_gc_lwp = NULL;
	sess_tracking = SESS_TRACKING_OFF;
}

void
npf_session_sysfini(void)
{
	/* Disable tracking, flush all sessions. */
	npf_session_tracking(false);
	KASSERT(sess_tracking == SESS_TRACKING_OFF);
	KASSERT(sess_gc_lwp == NULL);

	/* Sessions might have been restored while the tracking is off. */
	if (sess_hashtbl) {
		sess_htable_destroy(sess_hashtbl);
	}
	pool_cache_destroy(sess_cache);
	cv_destroy(&sess_cv);
	mutex_destroy(&sess_lock);
}

/*
 * Session hash table and RB-tree helper routines.
 * The order is (src.id, dst.id, src.addr, dst.addr, common_id),
 * where (node1 < node2) shall be negative.
 */

static signed int
sess_rbtree_cmp_nodes(void *ctx, const void *n1, const void *n2)
{
	const npf_sentry_t * const sen1 = n1;
	const npf_sentry_t * const sen2 = n2;
	const int sz = sen1->se_alen;
	int ret;

	/*
	 * Ports are expected to vary most, therefore they are first.
	 */
	if (sen1->se_src_id != sen2->se_src_id) {
		return (sen1->se_src_id < sen2->se_src_id) ? -1 : 1;
	}
	if (sen1->se_dst_id != sen2->se_dst_id) {
		return (sen1->se_dst_id < sen2->se_dst_id) ? -1 : 1;
	}

	/*
	 * Note that the hash should minimise differentiation on addresses.
	 */
	if (sen1->se_alen != sen2->se_alen) {
		return (sen1->se_alen < sen2->se_alen) ? -1 : 1;
	}
	if ((ret = memcmp(&sen1->se_src_addr, &sen2->se_src_addr, sz)) != 0) {
		return ret;
	}
	if ((ret = memcmp(&sen1->se_dst_addr, &sen2->se_dst_addr, sz)) != 0) {
		return ret;
	}

	const npf_secomid_t *id1 = &sen1->se_backptr->s_common_id;
	const npf_secomid_t *id2 = ctx ? ctx : &sen2->se_backptr->s_common_id;
	return memcmp(id1, id2, sizeof(npf_secomid_t));
}

static signed int
sess_rbtree_cmp_key(void *ctx, const void *n1, const void *key)
{
	const npf_sentry_t * const sen1 = n1;
	const npf_sentry_t * const sen2 = key;

	KASSERT(sen1->se_alen != 0 && sen2->se_alen != 0);
	return sess_rbtree_cmp_nodes(sen2->se_common_id, sen1, sen2);
}

static const rb_tree_ops_t sess_rbtree_ops = {
	.rbto_compare_nodes	= sess_rbtree_cmp_nodes,
	.rbto_compare_key	= sess_rbtree_cmp_key,
	.rbto_node_offset	= offsetof(npf_sentry_t, se_rbnode),
	.rbto_context		= NULL
};

static inline npf_sehash_t *
sess_hash_bucket(npf_sehash_t *stbl, const npf_secomid_t *scid,
    const npf_sentry_t *sen)
{
	const int sz = sen->se_alen;
	uint32_t hash, mix;

	/*
	 * Sum the protocol, interface and both addresses (the same value
	 * for both directions).
	 */
	mix = scid->proto + scid->if_idx;
	mix += npf_addr_sum(sz, &sen->se_src_addr, &sen->se_dst_addr);
	hash = hash32_buf(&mix, sizeof(uint32_t), HASH32_BUF_INIT);
	return &stbl[hash & SESS_HASH_MASK];
}

npf_sehash_t *
sess_htable_create(void)
{
	npf_sehash_t *stbl, *sh;
	u_int i;

	stbl = kmem_zalloc(SESS_HASH_BUCKETS * sizeof(*sh), KM_SLEEP);
	if (stbl == NULL) {
		return NULL;
	}
	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
		sh = &stbl[i];
		LIST_INIT(&sh->sh_list);
		rb_tree_init(&sh->sh_tree, &sess_rbtree_ops);
		rw_init(&sh->sh_lock);
		sh->sh_count = 0;
	}
	return stbl;
}

void
sess_htable_destroy(npf_sehash_t *stbl)
{
	npf_sehash_t *sh;
	u_int i;

	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
		sh = &stbl[i];
		KASSERT(sh->sh_count == 0);
		KASSERT(LIST_EMPTY(&sh->sh_list));
		KASSERT(!rb_tree_iterate(&sh->sh_tree, NULL, RB_DIR_LEFT));
		rw_destroy(&sh->sh_lock);
	}
	kmem_free(stbl, SESS_HASH_BUCKETS * sizeof(*sh));
}

void
sess_htable_reload(npf_sehash_t *stbl)
{
	npf_sehash_t *oldstbl;

	/*
	 * Flush all existing entries.
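	 *
	 * The flush is a handshake with the G/C thread: request the flush,
	 * wake the thread and wait until it has collected all sessions and
	 * has set the state back to SESS_TRACKING_ON.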
	 */
	mutex_enter(&sess_lock);
	if (sess_gc_lwp) {
		sess_tracking = SESS_TRACKING_FLUSH;
		cv_broadcast(&sess_cv);
	}
	while (sess_tracking == SESS_TRACKING_FLUSH) {
		cv_wait(&sess_cv, &sess_lock);
	}

	/* Set a new session table. */
	oldstbl = sess_hashtbl;
	sess_hashtbl = stbl;
	mutex_exit(&sess_lock);

	/* Destroy the old table. */
	if (oldstbl) {
		sess_htable_destroy(oldstbl);
	}
}

/*
 * Session tracking routines.  Note: these manage the tracking structures.
 */

static int
sess_tracking_start(void)
{
	npf_sehash_t *nstbl;

	nstbl = sess_htable_create();
	if (nstbl == NULL) {
		return ENOMEM;
	}

	/* Note: should be visible before the thread starts. */
	mutex_enter(&sess_lock);
	if (sess_tracking != SESS_TRACKING_OFF) {
		mutex_exit(&sess_lock);
		sess_htable_destroy(nstbl);
		return EEXIST;
	}
	sess_hashtbl = nstbl;
	sess_tracking = SESS_TRACKING_ON;
	mutex_exit(&sess_lock);

	if (kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
	    npf_session_worker, NULL, &sess_gc_lwp, "npfgc")) {
		sess_tracking_stop();
		return ENOMEM;
	}
	return 0;
}

static void
sess_tracking_stop(void)
{
	npf_sehash_t *stbl;

	mutex_enter(&sess_lock);
	if (sess_tracking == SESS_TRACKING_OFF) {
		mutex_exit(&sess_lock);
		return;
	}

	/* Notify the G/C thread to flush all sessions. */
	sess_tracking = SESS_TRACKING_OFF;
	cv_broadcast(&sess_cv);

	/* Wait for the exit. */
	while (sess_gc_lwp != NULL) {
		cv_wait(&sess_cv, &sess_lock);
	}
	stbl = sess_hashtbl;
	sess_hashtbl = NULL;
	mutex_exit(&sess_lock);

	sess_htable_destroy(stbl);
	pool_cache_invalidate(sess_cache);
}

/*
 * npf_session_tracking: enable/disable session tracking.
 */
int
npf_session_tracking(bool track)
{
	if (sess_tracking == SESS_TRACKING_OFF && track) {
		/* Disabled -> Enable. */
		return sess_tracking_start();
	}
	if (sess_tracking == SESS_TRACKING_ON && !track) {
		/* Enabled -> Disable. */
		sess_tracking_stop();
		return 0;
	}
	return 0;
}

/*
 * npf_session_inspect: look up an established session (connection).
 *
 * => If found, we will hold a reference for the caller.
 */
npf_session_t *
npf_session_inspect(npf_cache_t *npc, nbuf_t *nbuf, const ifnet_t *ifp,
    const int di, int *error)
{
	npf_sehash_t *sh;
	npf_sentry_t *sen;
	npf_session_t *se;
	int flags;

	/*
	 * Check if session tracking is on.  If layer 3 and 4 are not
	 * cached, the protocol is not supported or the packet is invalid.
	 */
	if (sess_tracking == SESS_TRACKING_OFF) {
		return NULL;
	}
	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
		return NULL;
	}

	/*
	 * Construct a key for the hash and tree lookup.  Execute the ALG
	 * session helpers, which may construct a custom key.
	 */
	npf_cache_t algkey = { .npc_info = 0 }, *key;
	npf_sentry_t senkey;

	if (!npf_alg_sessionid(npc, nbuf, &algkey)) {
		/* Default: use the cache data of the original packet. */
		key = npc;
	} else {
		/* Unique IDs filled by the ALG in a separate key cache. */
		key = &algkey;
	}

	/*
	 * Note: take the protocol from the key.
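	 * An ALG may have supplied a key for a different protocol than
	 * that of the inspected packet (e.g. IDs reconstructed from an
	 * ICMP error payload), so the key cache, not npc, is
	 * authoritative here.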
	 */
	const u_int proto = npf_cache_ipproto(key);

	switch (proto) {
	case IPPROTO_TCP: {
		const struct tcphdr *th = &key->npc_l4.tcp;
		senkey.se_src_id = th->th_sport;
		senkey.se_dst_id = th->th_dport;
		break;
	}
	case IPPROTO_UDP: {
		const struct udphdr *uh = &key->npc_l4.udp;
		senkey.se_src_id = uh->uh_sport;
		senkey.se_dst_id = uh->uh_dport;
		break;
	}
	case IPPROTO_ICMP:
		if (npf_iscached(key, NPC_ICMP_ID)) {
			const struct icmp *ic = &key->npc_l4.icmp;
			senkey.se_src_id = ic->icmp_id;
			senkey.se_dst_id = ic->icmp_id;
			break;
		}
		return NULL;
	case IPPROTO_ICMPV6:
		if (npf_iscached(key, NPC_ICMP_ID)) {
			const struct icmp6_hdr *ic6 = &key->npc_l4.icmp6;
			senkey.se_src_id = ic6->icmp6_id;
			senkey.se_dst_id = ic6->icmp6_id;
			break;
		}
		return NULL;
	default:
		/* Unsupported protocol. */
		return NULL;
	}

	KASSERT(key->npc_srcip && key->npc_dstip && key->npc_alen > 0);
	memcpy(&senkey.se_src_addr, key->npc_srcip, key->npc_alen);
	memcpy(&senkey.se_dst_addr, key->npc_dstip, key->npc_alen);
	senkey.se_alen = key->npc_alen;

	/*
	 * Note: this is a special case where we use the common ID pointer
	 * to pass the structure to the key comparator.
	 */
	npf_secomid_t scid;
	memset(&scid, 0, sizeof(npf_secomid_t));
	scid = (npf_secomid_t){ .proto = proto, .if_idx = ifp->if_index };
	senkey.se_common_id = &scid;

	/*
	 * Get the hash bucket from the cached key data.
	 * Pre-check if there are any entries in the hash table.
	 */
	sh = sess_hash_bucket(sess_hashtbl, &scid, &senkey);
	if (sh->sh_count == 0) {
		return NULL;
	}

	/* Look up the tree for a session entry and get the actual session. */
	rw_enter(&sh->sh_lock, RW_READER);
	sen = rb_tree_find_node(&sh->sh_tree, &senkey);
	if (sen == NULL) {
		rw_exit(&sh->sh_lock);
		return NULL;
	}
	se = sen->se_backptr;
	KASSERT(se->s_common_id.proto == proto);
	KASSERT(se->s_common_id.if_idx == ifp->if_index);
	flags = se->s_flags;

	/* Check if the session is active and not expired. */
	if (__predict_false((flags & (SE_ACTIVE | SE_EXPIRE)) != SE_ACTIVE)) {
		rw_exit(&sh->sh_lock);
		return NULL;
	}

	/* Match the directions of the session entry and the packet. */
	const bool sforw = (sen == &se->s_forw_entry);
	const bool pforw = (flags & PFIL_ALL) == di;
	if (__predict_false(sforw != pforw)) {
		rw_exit(&sh->sh_lock);
		return NULL;
	}

	/* Inspect the protocol data and handle state changes. */
	if (npf_state_inspect(npc, nbuf, &se->s_state, sforw)) {
		/* Update the last activity time and hold a reference. */
		getnanouptime(&se->s_atime);
		atomic_inc_uint(&se->s_refcnt);
	} else {
		/* Silently block invalid packets. */
		npf_stats_inc(NPF_STAT_INVALID_STATE);
		*error = ENETUNREACH;
		se = NULL;
	}
	rw_exit(&sh->sh_lock);
	return se;
}

/*
 * npf_session_establish: create a new session, insert into the global list.
 *
 * => Session is created with the reference held for the caller.
 * => Session will be activated on the first reference release.
 */
npf_session_t *
npf_session_establish(const npf_cache_t *npc, nbuf_t *nbuf,
    const ifnet_t *ifp, const int di)
{
	const struct tcphdr *th;
	const struct udphdr *uh;
	npf_sentry_t *fw, *bk;
	npf_sehash_t *sh;
	npf_session_t *se;
	u_int proto, alen;
	bool ok;

	/*
	 * Check if session tracking is on.  If layer 3 and 4 are not
	 * cached, the protocol is not supported or the packet is invalid.
	 */
	if (sess_tracking == SESS_TRACKING_OFF) {
		return NULL;
	}
	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
		return NULL;
	}

	/*
	 * Allocate and initialise the new state.
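	 * PR_NOWAIT is used since we are in the packet path and may not
	 * sleep; on failure, NULL is returned and no session is tracked
	 * for this packet.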
	 */
	se = pool_cache_get(sess_cache, PR_NOWAIT);
	if (__predict_false(se == NULL)) {
		return NULL;
	}
	NPF_PRINTF(("NPF: create se %p\n", se));
	npf_stats_inc(NPF_STAT_SESSION_CREATE);

	/* Reference count and flags (indicate direction). */
	se->s_refcnt = 1;
	se->s_flags = (di & PFIL_ALL);
	se->s_rproc = NULL;
	se->s_nat = NULL;

	/* Initialise the protocol state. */
	if (!npf_state_init(npc, nbuf, &se->s_state)) {
		ok = false;
		goto out;
	}

	/* Unique IDs: IP addresses.  Set up the "forwards" entry first. */
	KASSERT(npf_iscached(npc, NPC_IP46));
	alen = npc->npc_alen;
	fw = &se->s_forw_entry;
	memcpy(&fw->se_src_addr, npc->npc_srcip, alen);
	memcpy(&fw->se_dst_addr, npc->npc_dstip, alen);

	/* Protocol and interface. */
	proto = npf_cache_ipproto(npc);
	memset(&se->s_common_id, 0, sizeof(npf_secomid_t));
	se->s_common_id.proto = proto;
	se->s_common_id.if_idx = ifp->if_index;

	switch (proto) {
	case IPPROTO_TCP:
		KASSERT(npf_iscached(npc, NPC_TCP));
		th = &npc->npc_l4.tcp;
		/* Additional IDs: ports. */
		fw->se_src_id = th->th_sport;
		fw->se_dst_id = th->th_dport;
		break;
	case IPPROTO_UDP:
		KASSERT(npf_iscached(npc, NPC_UDP));
		/* Additional IDs: ports. */
		uh = &npc->npc_l4.udp;
		fw->se_src_id = uh->uh_sport;
		fw->se_dst_id = uh->uh_dport;
		break;
	case IPPROTO_ICMP:
		if (npf_iscached(npc, NPC_ICMP_ID)) {
			/* ICMP query ID. */
			const struct icmp *ic = &npc->npc_l4.icmp;
			fw->se_src_id = ic->icmp_id;
			fw->se_dst_id = ic->icmp_id;
			break;
		}
		ok = false;
		goto out;
	case IPPROTO_ICMPV6:
		if (npf_iscached(npc, NPC_ICMP_ID)) {
			/* ICMP query ID. */
			const struct icmp6_hdr *ic6 = &npc->npc_l4.icmp6;
			fw->se_src_id = ic6->icmp6_id;
			fw->se_dst_id = ic6->icmp6_id;
			break;
		}
		ok = false;
		goto out;
	default:
		/* Unsupported. */
		ok = false;
		goto out;
	}

	/* Set the last activity time for the new session. */
	getnanouptime(&se->s_atime);

	/* Set up the inverted "backwards" entry. */
	bk = &se->s_back_entry;
	memcpy(&bk->se_src_addr, &fw->se_dst_addr, alen);
	memcpy(&bk->se_dst_addr, &fw->se_src_addr, alen);
	bk->se_src_id = fw->se_dst_id;
	bk->se_dst_id = fw->se_src_id;

	/* Finish the setup of the entries. */
	fw->se_backptr = bk->se_backptr = se;
	fw->se_alen = bk->se_alen = alen;

	/*
	 * Insert the session and both entries into the tree.
	 */
	sh = sess_hash_bucket(sess_hashtbl, &se->s_common_id, fw);
	KASSERT(sh == sess_hash_bucket(sess_hashtbl, &se->s_common_id, bk));

	rw_enter(&sh->sh_lock, RW_WRITER);
	ok = (rb_tree_insert_node(&sh->sh_tree, fw) == fw);
	if (__predict_true(ok)) {
		ok = (rb_tree_insert_node(&sh->sh_tree, bk) == bk);
		if (__predict_true(ok)) {
			/* Success: insert the session, count both entries. */
			LIST_INSERT_HEAD(&sh->sh_list, se, s_list);
			sh->sh_count += 2;
			NPF_PRINTF(("NPF: establish se %p\n", se));
		} else {
			/* Race with a duplicate packet. */
			rb_tree_remove_node(&sh->sh_tree, fw);
			npf_stats_inc(NPF_STAT_RACE_SESSION);
		}
	}
	rw_exit(&sh->sh_lock);
out:
	if (__predict_false(!ok)) {
		npf_session_destroy(se);
		return NULL;
	}
	return se;
}

static void
npf_session_destroy(npf_session_t *se)
{
	if (se->s_nat) {
		/* Release any NAT related structures. */
		npf_nat_expire(se->s_nat);
	}
	if (se->s_rproc) {
		/* Release the rule procedure. */
		npf_rproc_release(se->s_rproc);
	}

	/* Destroy the state. */
	npf_state_destroy(&se->s_state);

	/* Free the structure, increase the counter. */
	pool_cache_put(sess_cache, se);
	npf_stats_inc(NPF_STAT_SESSION_DESTROY);
	NPF_PRINTF(("NPF: se %p destroyed\n", se));
}

/*
 * npf_session_setnat: associate a NAT entry with the session, update
 * and re-insert the session entry accordingly.
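 *
 * A hedged sketch of the expected call sequence from the packet path
 * (the surrounding handler is assumed, not part of this file):
 *
 *	se = npf_session_establish(npc, nbuf, ifp, di);
 *	error = npf_session_setnat(se, nt, di);
 *	... on EISCONN, the caller drops the packet and the structures
 *	associated with it ...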
 */
int
npf_session_setnat(npf_session_t *se, npf_nat_t *nt, const int di)
{
	npf_sehash_t *sh;
	npf_sentry_t *sen;
	npf_addr_t *taddr;
	in_port_t tport;
	bool ok;

	KASSERT(se->s_refcnt > 0);

	/* First, atomically check and associate the NAT entry. */
	if (atomic_cas_ptr(&se->s_nat, NULL, nt) != NULL) {
		/* Race: see below for the description. */
		npf_stats_inc(NPF_STAT_RACE_NAT);
		return EISCONN;
	}

	/*
	 * Update, re-hash and re-insert the "backwards" entry according to
	 * the translation.  First, remove the entry from the tree.  Note
	 * that a duplicate packet may establish a duplicate session while
	 * the lock is released.  In such a case, the caller will drop this
	 * packet and the structures associated with it.  Such a race
	 * condition should never happen in practice, though.
	 */
	sen = &se->s_back_entry;
	sh = sess_hash_bucket(sess_hashtbl, &se->s_common_id, sen);
	rw_enter(&sh->sh_lock, RW_WRITER);
	rb_tree_remove_node(&sh->sh_tree, sen);
	sh->sh_count--;
	rw_exit(&sh->sh_lock);

	/*
	 * New source/destination and hash.  Note that the source and
	 * destination are inverted, since we are handling the "backwards"
	 * entry.
	 */
	npf_nat_gettrans(nt, &taddr, &tport);
	if (di == PFIL_OUT) {
		/* NPF_NATOUT: source in "forwards" = destination. */
		memcpy(&sen->se_dst_addr, taddr, sen->se_alen);
		if (tport) {
			sen->se_dst_id = tport;
		}
	} else {
		/* NPF_NATIN: destination in "forwards" = source. */
		memcpy(&sen->se_src_addr, taddr, sen->se_alen);
		if (tport) {
			sen->se_src_id = tport;
		}
	}
	sh = sess_hash_bucket(sess_hashtbl, &se->s_common_id, sen);

	/* Insert into the new bucket. */
	rw_enter(&sh->sh_lock, RW_WRITER);
	ok = (rb_tree_insert_node(&sh->sh_tree, sen) == sen);
	if (__predict_true(ok)) {
		sh->sh_count++;
		NPF_PRINTF(("NPF: se %p assoc with nat %p\n", se, se->s_nat));
	} else {
		/* FIXMEgc */
		printf("npf_session_setnat: Houston, we've had a problem.\n");
	}
	rw_exit(&sh->sh_lock);
	return ok ? 0 : EISCONN;
}

/*
 * npf_session_expire: explicitly mark the session as expired.
 */
void
npf_session_expire(npf_session_t *se)
{
	/* KASSERT(se->s_refcnt > 0); XXX: npf_nat_freepolicy() */
	atomic_or_uint(&se->s_flags, SE_EXPIRE);
}

/*
 * npf_session_pass: return true if the session is a "pass" one,
 * otherwise false.
 */
bool
npf_session_pass(const npf_session_t *se, npf_rproc_t **rp)
{
	KASSERT(se->s_refcnt > 0);
	if ((se->s_flags & SE_PASS) != 0) {
		*rp = se->s_rproc;
		return true;
	}
	return false;
}

/*
 * npf_session_setpass: mark the session as a "pass" one and associate a
 * rule procedure with it.
 */
void
npf_session_setpass(npf_session_t *se, npf_rproc_t *rp)
{
	KASSERT((se->s_flags & SE_ACTIVE) == 0);
	KASSERT(se->s_refcnt > 0);
	KASSERT(se->s_rproc == NULL);

	/* No need for atomics, since the session is not yet active. */
	se->s_flags |= SE_PASS;
	se->s_rproc = rp;
}

/*
 * npf_session_release: release a reference, which might allow the G/C
 * thread to destroy this session.
 */
void
npf_session_release(npf_session_t *se)
{
	KASSERT(se->s_refcnt > 0);
	if ((se->s_flags & SE_ACTIVE) == 0) {
		/* Activate: after this point, session is globally visible. */
		se->s_flags |= SE_ACTIVE;
	}
	atomic_dec_uint(&se->s_refcnt);
}

/*
 * npf_session_retnat: return the associated NAT data entry and indicate
 * whether it is a "forwards" or "backwards" stream.
 */
npf_nat_t *
npf_session_retnat(npf_session_t *se, const int di, bool *forw)
{
	KASSERT(se->s_refcnt > 0);
	*forw = (se->s_flags & PFIL_ALL) == di;
	return se->s_nat;
}

/*
 * npf_session_expired: criterion to check if the session is expired.
 */
static inline bool
npf_session_expired(const npf_session_t *se, const struct timespec *tsnow)
{
	const u_int proto = se->s_common_id.proto;
	const int etime = npf_state_etime(&se->s_state, proto);
	struct timespec tsdiff;

	if (__predict_false(se->s_flags & SE_EXPIRE)) {
		/* Explicitly marked to be expired. */
		return true;
	}
	timespecsub(tsnow, &se->s_atime, &tsdiff);
	return __predict_false(tsdiff.tv_sec > etime);
}

/*
 * npf_session_gc: scan all sessions and insert all expired ones into
 * the G/C list.
 */
static void
npf_session_gc(struct npf_sesslist *gc_list, bool flushall)
{
	struct timespec tsnow;
	npf_sentry_t *sen, *nsen;
	npf_session_t *se;
	u_int i;

	getnanouptime(&tsnow);

	/* Scan each session entry in the hash table. */
	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
		npf_sehash_t *sh;

		sh = &sess_hashtbl[i];
		if (sh->sh_count == 0) {
			continue;
		}
		rw_enter(&sh->sh_lock, RW_WRITER);
		/* For each (left -> right) ... */
		sen = rb_tree_iterate(&sh->sh_tree, NULL, RB_DIR_LEFT);
		while (sen != NULL) {
			/* Get the session, pre-iterate, skip if not expired. */
			se = sen->se_backptr;
			nsen = rb_tree_iterate(&sh->sh_tree, sen, RB_DIR_RIGHT);
			if (!npf_session_expired(se, &tsnow) && !flushall) {
				KASSERT((se->s_flags & SE_REMOVING) == 0);
				sen = nsen;
				continue;
			}

			/* Expired - remove from the tree. */
			rb_tree_remove_node(&sh->sh_tree, sen);
			sh->sh_count--;

			/*
			 * Set the removal bit when the first entry is
			 * removed.  If it is already set, then the second
			 * entry has also been removed, therefore move the
			 * session into the G/C list.
			 */
			if (se->s_flags & SE_REMOVING) {
				LIST_REMOVE(se, s_list);
				LIST_INSERT_HEAD(gc_list, se, s_list);
			} else {
				atomic_or_uint(&se->s_flags, SE_REMOVING);
			}

			/* Next.. */
			sen = nsen;
		}
		KASSERT(!flushall || sh->sh_count == 0);
		rw_exit(&sh->sh_lock);
	}
}

/*
 * npf_session_freelist: destroy all sessions in the given G/C list
 * which have no references.
 */
static void
npf_session_freelist(struct npf_sesslist *gc_list)
{
	npf_session_t *se, *nse;

	se = LIST_FIRST(gc_list);
	while (se != NULL) {
		nse = LIST_NEXT(se, s_list);
		if (se->s_refcnt == 0) {
			/* Destroy only if there are no references. */
			LIST_REMOVE(se, s_list);
			npf_session_destroy(se);
		}
		se = nse;
	}
}

/*
 * npf_session_worker: G/C worker thread.
 */
static void
npf_session_worker(void *arg)
{
	struct npf_sesslist gc_list;
	bool flushreq = false;

	LIST_INIT(&gc_list);
	do {
		/* Periodically wake up, unless notified earlier. */
		mutex_enter(&sess_lock);
		/* Note: the interval is in seconds; cv_timedwait() takes ticks. */
		(void)cv_timedwait(&sess_cv, &sess_lock, SESS_GC_INTERVAL * hz);
		flushreq = (sess_tracking != SESS_TRACKING_ON);
		npf_session_gc(&gc_list, flushreq);
		if (sess_tracking == SESS_TRACKING_FLUSH) {
			/* Flush was requested - on again; notify the waiter. */
			sess_tracking = SESS_TRACKING_ON;
			cv_broadcast(&sess_cv);
		}
		mutex_exit(&sess_lock);
		npf_session_freelist(&gc_list);
	} while (sess_tracking != SESS_TRACKING_OFF);

	/* Wait for any referenced sessions to be released. */
	while (!LIST_EMPTY(&gc_list)) {
		kpause("npfgcfr", false, 1, NULL);
		npf_session_freelist(&gc_list);
	}

	/* Notify that we are done. */
	mutex_enter(&sess_lock);
	sess_gc_lwp = NULL;
	cv_broadcast(&sess_cv);
	mutex_exit(&sess_lock);
	kthread_exit(0);
}

/*
 * npf_session_save: construct a list of sessions prepared for saving.
 * Note: this is expected to be an expensive operation.
 */
int
npf_session_save(prop_array_t selist, prop_array_t nplist)
{
	npf_sehash_t *sh;
	npf_session_t *se;
	int error = 0, i;

	/*
	 * If not tracking - empty.
	 */
	mutex_enter(&sess_lock);
	if (sess_tracking == SESS_TRACKING_OFF) {
		mutex_exit(&sess_lock);
		return 0;
	}

	/*
	 * Note: hold the session lock to prevent the G/C thread from
	 * expiring and removing the sessions.  Therefore, there is no
	 * need to exclusively lock the entire hash table.
	 */
	for (i = 0; i < SESS_HASH_BUCKETS; i++) {
		sh = &sess_hashtbl[i];
		if (sh->sh_count == 0) {
			/* Empty bucket, next. */
			continue;
		}
		rw_enter(&sh->sh_lock, RW_READER);
		LIST_FOREACH(se, &sh->sh_list, s_list) {
			prop_dictionary_t sedict;
			prop_data_t sd;

			/*
			 * Create a copy of the npf_session_t binary data
			 * and the unique identifier, which may be a pointer
			 * value.  Set the data, insert into the array.
			 */
			sedict = prop_dictionary_create();
			sd = prop_data_create_data(se, sizeof(npf_session_t));
			prop_dictionary_set(sedict, "data", sd);
			prop_object_release(sd);

			CTASSERT(sizeof(uintptr_t) <= sizeof(uint64_t));
			prop_dictionary_set_uint64(
			    sedict, "id-ptr", (uintptr_t)se);

			if (se->s_nat) {
				/* Save the NAT entry and policy, if any. */
				error = npf_nat_save(sedict, nplist, se->s_nat);
				if (error) {
					prop_object_release(sedict);
					break;
				}
			}
			prop_array_add(selist, sedict);
			prop_object_release(sedict);
		}
		rw_exit(&sh->sh_lock);
		if (error) {
			/* Note: the caller will free the array. */
			break;
		}
	}
	mutex_exit(&sess_lock);
	return error;
}

/*
 * npf_session_restore: fully reconstruct a single session from a dictionary
 * and insert it into the given hash table.
 */
int
npf_session_restore(npf_sehash_t *stbl, prop_dictionary_t sedict)
{
	npf_session_t *se;
	npf_sehash_t *fsh, *bsh;
	npf_sentry_t *fw, *bk;
	prop_object_t obj;
	npf_state_t *nst;
	const void *d;
	int error = 0;

	/* Get the pointer to the npf_session_t data and check the size. */
	obj = prop_dictionary_get(sedict, "data");
	d = prop_data_data_nocopy(obj);
	if (d == NULL || prop_data_size(obj) != sizeof(npf_session_t)) {
		return EINVAL;
	}

	/*
	 * Copy the binary data of the structure.  Warning: must reset the
	 * reference count, the rule procedure and the state lock.
	 */
	se = pool_cache_get(sess_cache, PR_WAITOK);
	memcpy(se, d, sizeof(npf_session_t));
	se->s_refcnt = 0;
	se->s_rproc = NULL;
	nst = &se->s_state;
	mutex_init(&nst->nst_lock, MUTEX_DEFAULT, IPL_SOFTNET);

	/*
	 * Reconstruct the NAT association, if any (npf_nat_restore()
	 * returns NULL otherwise).  Warning: must not leave a stale entry.
	 */
	se->s_nat = npf_nat_restore(sedict, se);

	/*
	 * Find the hash bucket and insert each entry.
	 * Warning: must reset the back pointers.
	 */
	fw = &se->s_forw_entry;
	fw->se_backptr = se;
	fsh = sess_hash_bucket(stbl, &se->s_common_id, fw);
	if (rb_tree_insert_node(&fsh->sh_tree, fw) != fw) {
		error = EINVAL;
		goto out;
	}
	fsh->sh_count++;

	bk = &se->s_back_entry;
	bk->se_backptr = se;
	bsh = sess_hash_bucket(stbl, &se->s_common_id, bk);
	if (rb_tree_insert_node(&bsh->sh_tree, bk) != bk) {
		rb_tree_remove_node(&fsh->sh_tree, fw);
		fsh->sh_count--;
		error = EINVAL;
		goto out;
	}
	bsh->sh_count++;

	/* Note: the bucket of the forwards entry is for the session list. */
	LIST_INSERT_HEAD(&fsh->sh_list, se, s_list);
out:
	if (error) {
		/*
		 * Drop, in the case of a duplicate.
		 */
		npf_session_destroy(se);
	}
	return error;
}

#if defined(DDB) || defined(_NPF_TESTING)

void
npf_sessions_dump(void)
{
	npf_sehash_t *sh;
	npf_sentry_t *sen;
	npf_session_t *se;
	struct timespec tsnow;

	getnanouptime(&tsnow);
	for (u_int i = 0; i < SESS_HASH_BUCKETS; i++) {
		sh = &sess_hashtbl[i];
		if (sh->sh_count == 0) {
			KASSERT(rb_tree_iterate(&sh->sh_tree,
			    NULL, RB_DIR_LEFT) == NULL);
			continue;
		}
		printf("s_bucket %d (%p, count %d)\n", i, sh, sh->sh_count);
		RB_TREE_FOREACH(sen, &sh->sh_tree) {
			struct timespec tsdiff;
			struct in_addr ip;
			int proto, etime;

			se = sen->se_backptr;
			proto = se->s_common_id.proto;
			timespecsub(&tsnow, &se->s_atime, &tsdiff);
			etime = npf_state_etime(&se->s_state, proto);

			printf(" %p[%p]: %s proto %d flags 0x%x tsdiff %d "
			    "etime %d\n", sen, se, sen == &se->s_forw_entry ?
			    "forw" : "back", proto, se->s_flags,
			    (int)tsdiff.tv_sec, etime);
			memcpy(&ip, &sen->se_src_addr, sizeof(ip));
			printf("\tsrc (%s, %d) ",
			    inet_ntoa(ip), ntohs(sen->se_src_id));
			memcpy(&ip, &sen->se_dst_addr, sizeof(ip));
			printf("dst (%s, %d)\n",
			    inet_ntoa(ip), ntohs(sen->se_dst_id));
			npf_state_dump(&se->s_state);
			if (se->s_nat != NULL) {
				npf_nat_dump(se->s_nat);
			}
		}
	}
}

#endif