diff --cc net/mptcp/protocol.c
index 5d747c6a610e,34c037731f35..000000000000
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@@ -112,64 -112,205 +112,205 @@@ static int __mptcp_socket_create(struc
return 0;
}
- static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
- struct sk_buff *skb,
- unsigned int offset, size_t copy_len)
+ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
+ {
+ sk_drops_add(sk, skb);
+ __kfree_skb(skb);
+ }
+
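+ /* Try to merge @from into the tail skb @to. Refuse when @from carries
+ * an offset, as coalescing would splice in the skipped prefix as well.
+ * On success, extend @to's mapping to cover @from and charge the
+ * truesize delta to the socket's receive memory.
+ */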
+ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+ struct sk_buff *from)
+ {
+ bool fragstolen;
+ int delta;
+
+ if (MPTCP_SKB_CB(from)->offset ||
+ !skb_try_coalesce(to, from, &fragstolen, &delta))
+ return false;
+
+ pr_debug("colesced seq %llx into %llx new len %d new end seq %llx",
+ MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
+ to->len, MPTCP_SKB_CB(from)->end_seq);
+ MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
+ kfree_skb_partial(from, fragstolen);
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ return true;
+ }
+
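+ /* Coalesce only if @from starts exactly where @to ends in MPTCP
+ * sequence space, i.e. the two skbs are adjacent.
+ */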
+ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
+ struct sk_buff *from)
+ {
+ if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
+ return false;
+
+ return mptcp_try_coalesce((struct sock *)msk, to, from);
+ }
+
+ /* "inspired" by tcp_data_queue_ofo(), main differences:
+ * - use mptcp seqs
+ * - don't cope with sacks
+ */
+ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
struct sock *sk = (struct sock *)msk;
- struct sk_buff *tail;
+ struct rb_node **p, *parent;
+ u64 seq, end_seq, max_seq;
+ struct sk_buff *skb1;
+ int space;
+
+ seq = MPTCP_SKB_CB(skb)->map_seq;
+ end_seq = MPTCP_SKB_CB(skb)->end_seq;
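+ /* bound how far past ack_seq we are willing to buffer, based on
+ * the currently available receive space
+ */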
+ space = tcp_space(sk);
+ max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq;
+
+ pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
+ RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ if (after64(seq, max_seq)) {
+ /* out of window */
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
+ return;
+ }
- __skb_unlink(skb, &ssk->sk_receive_queue);
+ p = &msk->out_of_order_queue.rb_node;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
+ if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
+ rb_link_node(&skb->rbnode, NULL, p);
+ rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
+ msk->ooo_last_skb = skb;
+ goto end;
+ }
- skb_ext_reset(skb);
- skb_orphan(skb);
- WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
+ /* with 2 subflows, adding at the end of the ooo queue is quite
+ * likely. Using ooo_last_skb avoids the O(log N) rbtree lookup.
+ */
+ if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
+ return;
+ }
- tail = skb_peek_tail(&sk->sk_receive_queue);
- if (offset == 0 && tail) {
- bool fragstolen;
- int delta;
+ /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+ if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
+ parent = &msk->ooo_last_skb->rbnode;
+ p = &parent->rb_right;
+ goto insert;
+ }
- if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
- kfree_skb_partial(skb, fragstolen);
- atomic_add(delta, &sk->sk_rmem_alloc);
- sk_mem_charge(sk, delta);
+ /* Find place to insert this segment. Handle overlaps on the way. */
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ skb1 = rb_to_skb(parent);
+ if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ p = &parent->rb_left;
+ continue;
+ }
+ if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ return;
+ }
+ if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ /* partial overlap:
+ * | skb |
+ * | skb1 |
+ * continue traversing
+ */
+ } else {
+ /* skb's seq == skb1's seq and skb covers skb1.
+ * Replace skb1 with skb.
+ */
+ rb_replace_node(&skb1->rbnode, &skb->rbnode,
+ &msk->out_of_order_queue);
+ mptcp_drop(sk, skb1);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ goto merge_right;
+ }
+ } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
return;
}
+ p = &parent->rb_right;
}
- skb_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- MPTCP_SKB_CB(skb)->offset = offset;
- }
+ insert:
+ /* Insert segment into RB tree. */
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
- static void mptcp_stop_timer(struct sock *sk)
- {
- struct inet_connection_sock *icsk = inet_csk(sk);
+ merge_right:
+ /* Remove other segments covered by skb. */
+ while ((skb1 = skb_rb_next(skb)) != NULL) {
+ if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
+ break;
+ rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
+ mptcp_drop(sk, skb1);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ }
+ /* If there is no skb after us, we are the last_skb! */
+ if (!skb1)
+ msk->ooo_last_skb = skb;
- sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
- mptcp_sk(sk)->timer_ival = 0;
+ end:
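+ /* shrink over-allocated tailroom and charge the skb to the msk */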
+ skb_condense(skb);
+ skb_set_owner_r(skb, sk);
}
- /* both sockets must be locked */
- static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
- struct sock *ssk)
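+ /* Move @skb from the subflow receive queue to the msk-level queues.
+ * Return true if the skb landed in-sequence on the msk receive queue,
+ * false if it has been dropped or queued in the out-of-order rbtree.
+ */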
+ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
+ struct sk_buff *skb, unsigned int offset,
+ size_t copy_len)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *tail;
- /* revalidate data sequence number.
- *
- * mptcp_subflow_data_available() is usually called
- * without msk lock. Its unlikely (but possible)
- * that msk->ack_seq has been advanced since the last
- * call found in-sequence data.
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+
+ skb_ext_reset(skb);
+ skb_orphan(skb);
+
+ /* the skb map_seq accounts for the skb offset:
+ * mptcp_subflow_get_mapped_dsn() is based on the current
+ * tp->copied_seq value.
+ */
- if (likely(dsn == msk->ack_seq))
+ MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
+ MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
+ MPTCP_SKB_CB(skb)->offset = offset;
+
+ if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
+ /* in sequence */
- msk->ack_seq += copy_len;
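++ /* WRITE_ONCE pairs with lockless READ_ONCE() readers of ack_seq */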
++ WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (tail && mptcp_try_coalesce(sk, tail, skb))
+ return true;
+
+ skb_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
return true;
+ } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
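+ /* in-window but past ack_seq: park the skb in the ooo rbtree */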
+ mptcp_data_queue_ofo(msk, skb);
+ return false;
+ }
- subflow->data_avail = 0;
- return mptcp_subflow_data_available(ssk);
+ /* old data: keep it simple and drop the whole pkt; the sender
+ * will retransmit it if needed.
+ */
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ mptcp_drop(sk, skb);
+ return false;
+ }
+
+ static void mptcp_stop_timer(struct sock *sk)
+ {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ mptcp_sk(sk)->timer_ival = 0;
}
static void mptcp_check_data_fin_ack(struct sock *sk)