mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 02:24:32 +01:00
netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation
Ulrich reports a regression with nfqueue:
If an application did not set the 'F_GSO' capability flag and a gso
packet with an unconfirmed nf_conn entry is received all packets are
now dropped instead of queued, because the check happens after
skb_gso_segment(). In that case, we did have exclusive ownership
of the skb and its associated conntrack entry. The elevated use
count is due to skb_clone happening via skb_gso_segment().
Move the check so that it is performed against the aggregated packet.
Then, annotate the individual segments except the first one so we
can do a 2nd check at reinject time.
For the normal case, where userspace does in-order reinjects, this avoids
packet drops: first reinjected segment continues traversal and confirms
entry, remaining segments observe the confirmed entry.
While at it, simplify nf_ct_drop_unconfirmed(): We only care about
unconfirmed entries with a refcnt > 1, there is no need to special-case
dying entries.
This only happens with UDP. With TCP, the only unconfirmed packet will
be the TCP SYN, those aren't aggregated by GRO.
Next patch adds a udpgro test case to cover this scenario.
Reported-by: Ulrich Weber <ulrich.weber@gmail.com>
Fixes: 7d8dc1c7be ("netfilter: nf_queue: drop packets with cloned unconfirmed conntracks")
Signed-off-by: Florian Westphal <fw@strlen.de>
This commit is contained in:
parent
35f83a7552
commit
207b3ebacb
2 changed files with 75 additions and 49 deletions
|
|
@ -21,6 +21,7 @@ struct nf_queue_entry {
|
|||
struct net_device *physout;
|
||||
#endif
|
||||
struct nf_hook_state state;
|
||||
bool nf_ct_is_unconfirmed;
|
||||
u16 size; /* sizeof(entry) + saved route keys */
|
||||
u16 queue_num;
|
||||
|
||||
|
|
|
|||
|
|
@ -435,6 +435,34 @@ next_hook:
|
|||
nf_queue_entry_free(entry);
|
||||
}
|
||||
|
||||
/* return true if the entry has an unconfirmed conntrack attached that isn't owned by us
|
||||
 * exclusively.
 *
 * If @is_unconfirmed is non-NULL it is set to true whenever an unconfirmed
 * entry is attached, even one we own exclusively, so the caller can annotate
 * queued GSO segments for a second ownership check at reinject time.
 */
|
||||
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry, bool *is_unconfirmed)
|
||||
{
|
||||
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
|
||||
struct nf_conn *ct = (void *)skb_nfct(entry->skb);
|
||||
|
||||
/* No conntrack, or already in the hash table: queueing is always safe. */
if (!ct || nf_ct_is_confirmed(ct))
|
||||
return false;
|
||||
|
||||
if (is_unconfirmed)
|
||||
*is_unconfirmed = true;
|
||||
|
||||
/* in some cases skb_clone() can occur after initial conntrack
|
||||
 * pickup, but conntrack assumes exclusive skb->_nfct ownership for
|
||||
 * unconfirmed entries.
|
||||
 *
|
||||
 * This happens for br_netfilter and with ip multicast routing.
|
||||
 * This can't be solved with serialization here because one clone
|
||||
 * could have been queued for local delivery or could be transmitted
|
||||
 * in parallel on another CPU.
|
||||
 */
|
||||
/* use > 1 means another clone shares this unconfirmed entry: drop. */
return refcount_read(&ct->ct_general.use) > 1;
|
||||
#endif
|
||||
/* !CONFIG_NF_CONNTRACK: no conntrack is ever attached, never drop. */
return false;
|
||||
}
|
||||
|
||||
static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
|
||||
{
|
||||
const struct nf_ct_hook *ct_hook;
|
||||
|
|
@ -462,6 +490,24 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (verdict != NF_DROP && entry->nf_ct_is_unconfirmed) {
|
||||
/* If first queued segment was already reinjected then
|
||||
* there is a good chance the ct entry is now confirmed.
|
||||
*
|
||||
* Handle the rare cases:
|
||||
* - out-of-order verdict
|
||||
* - threaded userspace reinjecting in parallel
|
||||
* - first segment was dropped
|
||||
*
|
||||
* In all of those cases we can't handle this packet
|
||||
* because we can't be sure that another CPU won't modify
|
||||
* nf_conn->ext in parallel which isn't allowed.
|
||||
*/
|
||||
if (nf_ct_drop_unconfirmed(entry, NULL))
|
||||
verdict = NF_DROP;
|
||||
}
|
||||
|
||||
nf_reinject(entry, verdict);
|
||||
}
|
||||
|
||||
|
|
@ -891,49 +937,6 @@ nlmsg_failure:
|
|||
return NULL;
|
||||
}
|
||||
|
||||
/* Return true (and steal the skb's conntrack reference) if the attached
 * unconfirmed entry is shared with another clone; queueing such an skb
 * is unsafe because conntrack assumes exclusive ownership of unconfirmed
 * entries.
 */
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
|
||||
{
|
||||
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
|
||||
static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
|
||||
struct nf_conn *ct = (void *)skb_nfct(entry->skb);
|
||||
unsigned long status;
|
||||
unsigned int use;
|
||||
|
||||
if (!ct)
|
||||
return false;
|
||||
|
||||
/* Dying but never confirmed: entry is already being torn down, drop. */
status = READ_ONCE(ct->status);
|
||||
if ((status & flags) == IPS_DYING)
|
||||
return true;
|
||||
|
||||
/* Confirmed entries are in the hash table; shared refs are fine. */
if (status & IPS_CONFIRMED)
|
||||
return false;
|
||||
|
||||
/* in some cases skb_clone() can occur after initial conntrack
|
||||
 * pickup, but conntrack assumes exclusive skb->_nfct ownership for
|
||||
 * unconfirmed entries.
|
||||
 *
|
||||
 * This happens for br_netfilter and with ip multicast routing.
|
||||
 * This can't be solved with serialization here because one clone could
|
||||
 * have been queued for local delivery.
|
||||
 */
|
||||
use = refcount_read(&ct->ct_general.use);
|
||||
if (likely(use == 1))
|
||||
return false;
|
||||
|
||||
/* Can't decrement further? Exclusive ownership. */
|
||||
if (!refcount_dec_not_one(&ct->ct_general.use))
|
||||
return false;
|
||||
|
||||
/* Detach the (already released) reference from the skb before drop. */
skb_set_nfct(entry->skb, 0);
|
||||
/* No nf_ct_put(): we already decremented .use and it cannot
|
||||
 * drop down to 0.
|
||||
 */
|
||||
return true;
|
||||
#endif
|
||||
/* !CONFIG_NF_CONNTRACK: no conntrack is ever attached, never drop. */
return false;
|
||||
}
|
||||
|
||||
static int
|
||||
__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
|
||||
struct nf_queue_entry *entry)
|
||||
|
|
@ -950,9 +953,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
|
|||
}
|
||||
spin_lock_bh(&queue->lock);
|
||||
|
||||
if (nf_ct_drop_unconfirmed(entry))
|
||||
goto err_out_free_nskb;
|
||||
|
||||
if (queue->queue_total >= queue->queue_maxlen)
|
||||
goto err_out_queue_drop;
|
||||
|
||||
|
|
@ -995,7 +995,6 @@ err_out_queue_drop:
|
|||
else
|
||||
net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
|
||||
}
|
||||
err_out_free_nskb:
|
||||
kfree_skb(nskb);
|
||||
err_out_unlock:
|
||||
spin_unlock_bh(&queue->lock);
|
||||
|
|
@ -1074,9 +1073,10 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
|
|||
static int
|
||||
nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
|
||||
{
|
||||
unsigned int queued;
|
||||
struct nfqnl_instance *queue;
|
||||
struct sk_buff *skb, *segs, *nskb;
|
||||
bool ct_is_unconfirmed = false;
|
||||
struct nfqnl_instance *queue;
|
||||
unsigned int queued;
|
||||
int err = -ENOBUFS;
|
||||
struct net *net = entry->state.net;
|
||||
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
|
||||
|
|
@ -1100,6 +1100,15 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
|
|||
break;
|
||||
}
|
||||
|
||||
/* Check if someone already holds another reference to
|
||||
* unconfirmed ct. If so, we cannot queue the skb:
|
||||
* concurrent modifications of nf_conn->ext are not
|
||||
* allowed and we can't know if another CPU isn't
|
||||
* processing the same nf_conn entry in parallel.
|
||||
*/
|
||||
if (nf_ct_drop_unconfirmed(entry, &ct_is_unconfirmed))
|
||||
return -EINVAL;
|
||||
|
||||
if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
|
||||
return __nfqnl_enqueue_packet(net, queue, entry);
|
||||
|
||||
|
|
@ -1113,7 +1122,23 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
|
|||
goto out_err;
|
||||
queued = 0;
|
||||
err = 0;
|
||||
|
||||
skb_list_walk_safe(segs, segs, nskb) {
|
||||
if (ct_is_unconfirmed && queued > 0) {
|
||||
/* skb_gso_segment() increments the ct refcount.
|
||||
* This is a problem for unconfirmed (not in hash)
|
||||
* entries, those can race when reinjections happen
|
||||
* in parallel.
|
||||
*
|
||||
* Annotate this for all queued entries except the
|
||||
* first one.
|
||||
*
|
||||
* As long as the first one is reinjected first it
|
||||
* will do the confirmation for us.
|
||||
*/
|
||||
entry->nf_ct_is_unconfirmed = ct_is_unconfirmed;
|
||||
}
|
||||
|
||||
if (err == 0)
|
||||
err = __nfqnl_enqueue_packet_gso(net, queue,
|
||||
segs, entry);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue