diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 12fef76..04c7e11 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3200,7 +3200,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos) */ memset(&tmp_opt, 0, sizeof(tmp_opt)); tcp_clear_options(&tmp_opt); - tcp_parse_options(skb, &tmp_opt, 0, NULL); + tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL); req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req)); memset(req, 0, sizeof(*req)); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 6b4a4db..6547d46 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -173,6 +173,8 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq, head = vq->free_head; vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT; vq->vring.desc[head].addr = virt_to_phys(desc); + /* kmemleak gives a false positive, as it's hidden by virt_to_phys */ + kmemleak_ignore(desc); vq->vring.desc[head].len = i * sizeof(struct vring_desc); /* Update free pointer */ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d686334..51a6c48 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -72,6 +72,53 @@ struct tcp_sack_block { u32 end_seq; }; +struct tcp_out_options { + u16 options; /* bit field of OPTION_* */ + u8 ws; /* window scale, 0 to disable */ + u8 num_sack_blocks;/* number of SACK blocks to include */ + u8 hash_size; /* bytes in hash_location */ + u16 mss; /* 0 to disable */ + __u8 *hash_location; /* temporary pointer, overloaded */ + __u32 tsval, tsecr; /* need to include OPTION_TS */ + struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ +#ifdef CONFIG_MPTCP + u16 mptcp_options; /* bit field of MPTCP related OPTION_* */ + u8 dss_csum:1, + add_addr_v4:1, + add_addr_v6:1; /* dss-checksum required? */ + + __u32 data_seq; /* data sequence number, for MPTCP */ + __u32 data_ack; /* data ack, for MPTCP */ + + union { + struct { + __u64 sender_key; /* sender's key for mptcp */ + __u64 receiver_key; /* receiver's key for mptcp */ + } mp_capable; + + struct { + __u64 sender_truncated_mac; + __u32 sender_nonce; + /* random number of the sender */ + __u32 token; /* token for mptcp */ + } mp_join_syns; + }; + + struct { + struct in_addr addr; + u8 addr_id; + } add_addr4; + + struct { + struct in6_addr addr; + u8 addr_id; + } add_addr6; + + u16 remove_addrs; /* list of address id */ + u8 addr_id; /* address id (mp_join or add_address) */ +#endif /* CONFIG_MPTCP */ +}; + /*These are used to set the sack_ok field in struct tcp_options_received */ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/ @@ -95,6 +142,9 @@ struct tcp_options_received { u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ }; +struct mptcp_cb; +struct mptcp_tcp_sock; + static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; @@ -123,6 +173,7 @@ struct tcp_request_sock { * FastOpen it's the seq# * after data-in-SYN. */ + u8 saw_mpc:1; }; static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) @@ -320,6 +371,35 @@ struct tcp_sock { * socket. Used to retransmit SYNACKs etc. 
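For reference, the tcp_parse_options() change that opens this patch (the cxgb4 hunk above, with the matching prototype update in include/net/tcp.h further down) simply threads a struct mptcp_options_received pointer through the option parser. A minimal sketch of the two calling styles, assuming an skb and a zeroed tmp_opt as in the cxgb4 code:

	struct tcp_options_received tmp_opt;
	struct mptcp_options_received mopt;

	/* caller that does not care about MPTCP options */
	tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL);

	/* caller that wants MP_CAPABLE, MP_JOIN & co. parsed as well;
	 * mptcp_init_mp_opt() (defined in net/mptcp.h below) resets mopt first */
	mptcp_init_mp_opt(&mopt);
	tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);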
*/ struct request_sock *fastopen_rsk; + + + struct mptcp_cb *mpcb; + struct sock *meta_sk; + /* We keep these flags even if CONFIG_MPTCP is not checked, because + * it allows checking MPTCP capability just by checking the mpc flag, + * rather than adding ifdefs everywhere. + */ + u16 mpc:1, /* Other end is multipath capable */ + inside_tk_table:1, /* Is the tcp_sock inside the token-table? */ + send_mp_fclose:1, + request_mptcp:1, /* Did we send out an MP_CAPABLE? + * (this speeds up mptcp_doit() in tcp_recvmsg) + */ + mptcp_enabled:1, /* Is MPTCP enabled from the application ? */ + pf:1, /* Potentially Failed state: when this flag is set, we + * stop using the subflow + */ + mp_killed:1, /* Killed with a tcp_done in mptcp? */ + mptcp_add_addr_ack:1, + was_meta_sk:1, /* This was a meta sk (in case of reuse) */ + close_it:1, /* Must close socket in mptcp_data_ready? */ + closing:1; + struct mptcp_tcp_sock *mptcp; +#ifdef CONFIG_MPTCP + struct hlist_nulls_node tk_table; + u32 mptcp_loc_token; + u64 mptcp_loc_key; +#endif /* CONFIG_MPTCP */ }; enum tsq_flags { @@ -331,6 +411,8 @@ enum tsq_flags { TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call * tcp_v{4|6}_mtu_reduced() */ + MPTCP_PATH_MANAGER, /* MPTCP deferred creation of new subflows */ + MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */ }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -349,6 +431,7 @@ struct tcp_timewait_sock { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif + struct mptcp_tw *mptcp_tw; }; static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h index 04642c9..27bdc88 100644 --- a/include/net/inet6_connection_sock.h +++ b/include/net/inet6_connection_sock.h @@ -28,7 +28,8 @@ extern int inet6_csk_bind_conflict(const struct sock *sk, extern struct dst_entry* inet6_csk_route_req(struct sock *sk, struct flowi6 *fl6, const struct request_sock *req); - +u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, + const u32 rnd, const u32 synq_hsize); extern struct request_sock *inet6_csk_search_req(const struct sock *sk, struct request_sock ***prevp, const __be16 rport, diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 2340087..d40eee0 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -1,6 +1,8 @@ #ifndef _INET_COMMON_H #define _INET_COMMON_H +#include + extern const struct proto_ops inet_stream_ops; extern const struct proto_ops inet_dgram_ops; @@ -13,6 +15,8 @@ struct sock; struct sockaddr; struct socket; +int inet_create(struct net *net, struct socket *sock, int protocol, int kern); +int inet6_create(struct net *net, struct socket *sock, int protocol, int kern); extern int inet_release(struct socket *sock); extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index de2c785..4ac892f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -243,6 +243,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, extern struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); +u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd, + const u32 synq_hsize); + extern struct request_sock *inet_csk_search_req(const struct sock *sk, struct request_sock ***prevp, const __be16 rport, diff --git 
a/include/net/mptcp.h b/include/net/mptcp.h new file mode 100644 index 0000000..2ad58da --- /dev/null +++ b/include/net/mptcp.h @@ -0,0 +1,1459 @@ +/* + * MPTCP implementation + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _MPTCP_H +#define _MPTCP_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if defined(__LITTLE_ENDIAN_BITFIELD) + #define ntohll(x) be64_to_cpu(x) + #define htonll(x) cpu_to_be64(x) +#elif defined(__BIG_ENDIAN_BITFIELD) + #define ntohll(x) (x) + #define htonll(x) (x) +#endif + +/* Max number of local or remote addresses we can store. + * When changing, see the bitfield below in mptcp_loc4/6. */ +#define MPTCP_MAX_ADDR 8 + +#define MPTCP_SUBFLOW_RETRY_DELAY 1000 + +struct mptcp_loc4 { + u8 id; + u8 low_prio:1; + struct in_addr addr; +}; + +struct mptcp_rem4 { + u8 id; + u8 bitfield; + u8 retry_bitfield; + __be16 port; + struct in_addr addr; +}; + +struct mptcp_loc6 { + u8 id; + u8 low_prio:1; + struct in6_addr addr; +}; + +struct mptcp_rem6 { + u8 id; + u8 bitfield; + u8 retry_bitfield; + __be16 port; + struct in6_addr addr; +}; + +struct mptcp_request_sock { + struct tcp_request_sock req; + struct mptcp_cb *mpcb; + /* Collision list in the tuple hashtable. We need to find + * the req sock when receiving the third msg of the 3-way handshake, + * since that one does not contain the token. If this makes + * the request sock too long, we can use kmalloc'ed specific entries for + * that tuple hashtable. At the moment, though, I extend the + * request_sock. + */ + struct list_head collide_tuple; + struct hlist_nulls_node collide_tk; + u32 mptcp_rem_nonce; + u32 mptcp_loc_token; + u64 mptcp_loc_key; + u64 mptcp_rem_key; + u64 mptcp_hash_tmac; + u32 mptcp_loc_nonce; + u8 loc_id; + u8 rem_id; /* Address-id in the MP_JOIN */ + u8 dss_csum:1, + low_prio:1; +}; + +struct mptcp_options_received { + u16 saw_mpc:1, + dss_csum:1, + drop_me:1, + + is_mp_join:1, + join_ack:1, + + saw_low_prio:2, /* 0x1 - low-prio set for this subflow + * 0x2 - low-prio set for another subflow + */ + low_prio:1, + + saw_add_addr:2, /* Saw at least one add_addr option: + * 0x1: IPv4 - 0x2: IPv6 + */ + more_add_addr:1, /* Saw one more add-addr. */ + + saw_rem_addr:1, /* Saw at least one rem_addr option */ + more_rem_addr:1, /* Saw one more rem-addr. 
*/ + + mp_fail:1, + mp_fclose:1; + u8 rem_id; /* Address-id in the MP_JOIN */ + u8 prio_addr_id; /* Address-id in the MP_PRIO */ + + const unsigned char *add_addr_ptr; /* Pointer to add-address option */ + const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */ + + u32 data_ack; + u32 data_seq; + u16 data_len; + + u32 mptcp_rem_token;/* Remote token */ + + /* Key inside the option (from mp_capable or fast_close) */ + u64 mptcp_key; + + u32 mptcp_recv_nonce; + u64 mptcp_recv_tmac; + u8 mptcp_recv_mac[20]; +}; + +struct mptcp_tcp_sock { + struct tcp_sock *next; /* Next subflow socket */ + struct list_head cb_list; + struct mptcp_options_received rx_opt; + + /* Those three fields record the current mapping */ + u64 map_data_seq; + u32 map_subseq; + u16 map_data_len; + u16 slave_sk:1, + nonce_set:1, /* Is the nonce set? (in order to support 0-nonce) */ + fully_established:1, + establish_increased:1, + second_packet:1, + attached:1, + send_mp_fail:1, + include_mpc:1, + mapping_present:1, + map_data_fin:1, + low_prio:1, /* use this socket as backup */ + rcv_low_prio:1, /* Peer sent low-prio option to us */ + send_mp_prio:1, /* Trigger to send mp_prio on this socket */ + pre_established:1; /* State between sending 3rd ACK and + * receiving the fourth ack of new subflows. + */ + + /* isn: needed to translate abs to relative subflow seqnums */ + u32 snt_isn; + u32 rcv_isn; + u32 last_data_seq; + u8 path_index; + u8 loc_id; + u8 rem_id; + + u32 last_rbuf_opti; /* Timestamp of last rbuf optimization */ + unsigned int sent_pkts; + + struct sk_buff *shortcut_ofoqueue; /* Shortcut to the current modified + * skb in the ofo-queue. + */ + + int init_rcv_wnd; + u32 infinite_cutoff_seq; + struct delayed_work work; + u32 mptcp_loc_nonce; + struct tcp_sock *tp; /* Where is my daddy? */ + u32 last_end_data_seq; + + /* MP_JOIN subflow: timer for retransmitting the 3rd ack */ + struct timer_list mptcp_ack_timer; + + /* HMAC of the third ack */ + char sender_mac[20]; +}; + +struct mptcp_tw { + struct list_head list; + u64 loc_key; + u64 rcv_nxt; + struct mptcp_cb __rcu *mpcb; + u8 meta_tw:1, + in_list:1; +}; + +#define MPTCP_PM_NAME_MAX 16 +struct mptcp_pm_ops { + struct list_head list; + + /* Signal the creation of a new MPTCP-session. */ + void (*new_session)(struct sock *meta_sk, int id); + void (*release_sock)(struct sock *meta_sk); + void (*fully_established)(struct sock *meta_sk); + void (*new_remote_address)(struct sock *meta_sk); + int (*get_local_id)(sa_family_t family, union inet_addr *addr, + struct net *net); + void (*addr_signal)(struct sock *sk, unsigned *size, + struct tcp_out_options *opts, struct sk_buff *skb); + + char name[MPTCP_PM_NAME_MAX]; + struct module *owner; +}; + +struct mptcp_cb { + struct sock *meta_sk; + + /* list of sockets in this multipath connection */ + struct tcp_sock *connection_list; + /* list of sockets that need a call to release_cb */ + struct list_head callback_list; + + spinlock_t tw_lock; + struct list_head tw_list; + unsigned char mptw_state; + + atomic_t mpcb_refcnt; + + /* High-order bits of 64-bit sequence numbers */ + u32 snd_high_order[2]; + u32 rcv_high_order[2]; + + u16 send_infinite_mapping:1, + in_time_wait:1, + list_rcvd:1, /* XXX TO REMOVE */ + dss_csum:1, + server_side:1, + infinite_mapping_rcv:1, + infinite_mapping_snd:1, + dfin_combined:1, /* Was the DFIN combined with subflow-fin? 
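The mptcp_pm_ops table defined above is the hook point for pluggable path managers; the registration helpers it pairs with (mptcp_register_path_manager() and friends) are declared later in this header. A minimal, purely illustrative module skeleton, assuming <linux/module.h> and <net/mptcp.h> are available; the dummy_* names are made up for the sketch:

	static void dummy_new_session(struct sock *meta_sk, int id)
	{
		/* a real path manager would record the meta socket here and
		 * kick off the creation of additional subflows */
	}

	static struct mptcp_pm_ops dummy_pm __read_mostly = {
		.new_session	= dummy_new_session,
		.name		= "dummy",
		.owner		= THIS_MODULE,
	};

	static int __init dummy_pm_register(void)
	{
		return mptcp_register_path_manager(&dummy_pm);
	}

	static void __exit dummy_pm_unregister(void)
	{
		mptcp_unregister_path_manager(&dummy_pm);
	}

	module_init(dummy_pm_register);
	module_exit(dummy_pm_unregister);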
*/ + passive_close:1, + snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */ + rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */ + + /* socket count in this connection */ + u8 cnt_subflows; + u8 cnt_established; + + u32 noneligible; /* Path mask of temporarily non + * eligible subflows by the scheduler + */ + + struct sk_buff_head reinject_queue; + + u8 dfin_path_index; + +#define MPTCP_PM_SIZE 320 + u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8); + struct mptcp_pm_ops *pm_ops; + + /* Mutex needed, because otherwise mptcp_close will complain that the + * socket is owned by the user. + * E.g., mptcp_sub_close_wq is taking the meta-lock. + */ + struct mutex mpcb_mutex; + + /* Master socket, also part of the connection_list, this + * socket is the one that the application sees. + */ + struct sock *master_sk; + + u64 csum_cutoff_seq; + + __u64 mptcp_loc_key; + __u32 mptcp_loc_token; + __u64 mptcp_rem_key; + __u32 mptcp_rem_token; + + /* Create a new subflow - necessary because the meta-sk may be IPv4, but + * the new subflow can be IPv6 + */ + struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); + + /* Remote addresses */ + struct mptcp_rem4 remaddr4[MPTCP_MAX_ADDR]; + u8 rem4_bits; + + struct mptcp_rem6 remaddr6[MPTCP_MAX_ADDR]; + u8 rem6_bits; + + u32 path_index_bits; + /* Next pi to pick up in case a new path becomes available */ + u8 next_path_index; + + /* Original snd/rcvbuf of the initial subflow. + * Used for the new subflows on the server-side to allow correct + * autotuning + */ + int orig_sk_rcvbuf; + int orig_sk_sndbuf; + u32 orig_window_clamp; +}; + +#define MPTCP_SUB_CAPABLE 0 +#define MPTCP_SUB_LEN_CAPABLE_SYN 12 +#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12 +#define MPTCP_SUB_LEN_CAPABLE_ACK 20 +#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20 + +#define MPTCP_SUB_JOIN 1 +#define MPTCP_SUB_LEN_JOIN_SYN 12 +#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12 +#define MPTCP_SUB_LEN_JOIN_SYNACK 16 +#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16 +#define MPTCP_SUB_LEN_JOIN_ACK 24 +#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24 + +#define MPTCP_SUB_DSS 2 +#define MPTCP_SUB_LEN_DSS 4 +#define MPTCP_SUB_LEN_DSS_ALIGN 4 + +/* Lengths for seq and ack are the ones without the generic MPTCP-option header, + * as they are part of the DSS-option. + * To get the total length, just add the different options together. + */ +#define MPTCP_SUB_LEN_SEQ 10 +#define MPTCP_SUB_LEN_SEQ_CSUM 12 +#define MPTCP_SUB_LEN_SEQ_ALIGN 12 + +#define MPTCP_SUB_LEN_SEQ_64 14 +#define MPTCP_SUB_LEN_SEQ_CSUM_64 16 +#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16 + +#define MPTCP_SUB_LEN_ACK 4 +#define MPTCP_SUB_LEN_ACK_ALIGN 4 + +#define MPTCP_SUB_LEN_ACK_64 8 +#define MPTCP_SUB_LEN_ACK_64_ALIGN 8 + +/* This is the "default" option-length we will send out most often. + * MPTCP DSS-header + * 32-bit data sequence number + * 32-bit data ack + * + * It is necessary to calculate the effective MSS we will be using when + * sending data. 
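The comment above is why the aligned DSS length matters: a data segment normally carries a DSS option with a 32-bit data sequence number and a 32-bit data ack, so that much TCP option space is no longer available for payload. A back-of-the-envelope sketch of the arithmetic, not the actual mptcp_current_mss() implementation (MPTCP_SUB_LEN_DSM_ALIGN is defined right below as 4 + 12 + 4 = 20; the helper name is hypothetical):

	/* usable payload per segment once the default DSS option is accounted for */
	static inline unsigned int mptcp_payload_mss(unsigned int mss_now)
	{
		return mss_now - MPTCP_SUB_LEN_DSM_ALIGN;	/* e.g. 1460 - 20 = 1440 */
	}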
+ */ +#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \ + MPTCP_SUB_LEN_SEQ_ALIGN + \ + MPTCP_SUB_LEN_ACK_ALIGN) + +#define MPTCP_SUB_ADD_ADDR 3 +#define MPTCP_SUB_LEN_ADD_ADDR4 8 +#define MPTCP_SUB_LEN_ADD_ADDR6 20 +#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8 +#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20 + +#define MPTCP_SUB_REMOVE_ADDR 4 +#define MPTCP_SUB_LEN_REMOVE_ADDR 4 + +#define MPTCP_SUB_PRIO 5 +#define MPTCP_SUB_LEN_PRIO 3 +#define MPTCP_SUB_LEN_PRIO_ADDR 4 +#define MPTCP_SUB_LEN_PRIO_ALIGN 4 + +#define MPTCP_SUB_FAIL 6 +#define MPTCP_SUB_LEN_FAIL 12 +#define MPTCP_SUB_LEN_FAIL_ALIGN 12 + +#define MPTCP_SUB_FCLOSE 7 +#define MPTCP_SUB_LEN_FCLOSE 12 +#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12 + + +#define OPTION_MPTCP (1 << 5) + +#ifdef CONFIG_MPTCP + +/* Used for checking if the mptcp initialization has been successful */ +extern bool mptcp_init_failed; + +/* MPTCP options */ +#define OPTION_TYPE_SYN (1 << 0) +#define OPTION_TYPE_SYNACK (1 << 1) +#define OPTION_TYPE_ACK (1 << 2) +#define OPTION_MP_CAPABLE (1 << 3) +#define OPTION_DATA_ACK (1 << 4) +#define OPTION_ADD_ADDR (1 << 5) +#define OPTION_MP_JOIN (1 << 6) +#define OPTION_MP_FAIL (1 << 7) +#define OPTION_MP_FCLOSE (1 << 8) +#define OPTION_REMOVE_ADDR (1 << 9) +#define OPTION_MP_PRIO (1 << 10) + +/* MPTCP flags */ +#define MPTCPHDR_ACK 0x01 +#define MPTCPHDR_SEQ 0x02 +#define MPTCPHDR_FIN 0x04 +#define MPTCPHDR_INF 0x08 +#define MPTCPHDR_SEQ64_SET 0x10 /* Did we received a 64-bit seq number */ +#define MPTCPHDR_SEQ64_OFO 0x20 /* Is it not in our circular array? */ +#define MPTCPHDR_SEQ64_INDEX 0x40 /* Index of seq in mpcb->snd_high_order */ +#define MPTCPHDR_DSS_CSUM 0x80 + +/* It is impossible, that all 8 bits of mptcp_flags are set to 1 with the above + * Thus, defining MPTCPHDR_JOIN as 0xFF is safe. 
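The MPTCPHDR_SEQ64_* bits above encode where the upper 32 bits of a received 64-bit data sequence number fall relative to the two-entry circular array of high-order bits kept in the mpcb (rcv_high_order on the receive path). The real classifier is mptcp_get_64_bit() further down in this header; this is a standalone restatement of the same logic, for illustration only:

	static u8 classify_seq64(u64 data_seq, const u32 rcv_high_order[2])
	{
		u32 high = (u32)(data_seq >> 32);

		if (rcv_high_order[0] == high)
			return 0;			/* matches the current half */
		if (rcv_high_order[1] == high)
			return MPTCPHDR_SEQ64_INDEX;	/* matches the other half */
		return MPTCPHDR_SEQ64_OFO;		/* outside the circular array */
	}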
+ */ +#define MPTCPHDR_JOIN 0xFF + +struct mptcp_option { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 ver:4, + sub:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 sub:4, + ver:4; +#else +#error "Adjust your defines" +#endif +}; + +struct mp_capable { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 ver:4, + sub:4; + __u8 h:1, + rsv:5, + b:1, + a:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 sub:4, + ver:4; + __u8 a:1, + b:1, + rsv:5, + h:1; +#else +#error "Adjust your defines" +#endif + __u64 sender_key; + __u64 receiver_key; +} __attribute__((__packed__)); + +struct mp_join { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 b:1, + rsv:3, + sub:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 sub:4, + rsv:3, + b:1; +#else +#error "Adjust your defines" +#endif + __u8 addr_id; + union { + struct { + u32 token; + u32 nonce; + } syn; + struct { + __u64 mac; + u32 nonce; + } synack; + struct { + __u8 mac[20]; + } ack; + } u; +} __attribute__((__packed__)); + +struct mp_dss { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 rsv1:4, + sub:4, + A:1, + a:1, + M:1, + m:1, + F:1, + rsv2:3; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 sub:4, + rsv1:4, + rsv2:3, + F:1, + m:1, + M:1, + a:1, + A:1; +#else +#error "Adjust your defines" +#endif +}; + +struct mp_add_addr { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 ipver:4, + sub:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 sub:4, + ipver:4; +#else +#error "Adjust your defines" +#endif + __u8 addr_id; + union { + struct { + struct in_addr addr; + __be16 port; + } v4; + struct { + struct in6_addr addr; + __be16 port; + } v6; + } u; +} __attribute__((__packed__)); + +struct mp_remove_addr { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 rsv:4, + sub:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 sub:4, + rsv:4; +#else +#error "Adjust your defines" +#endif + /* list of addr_id */ + __u8 addrs_id; +}; + +struct mp_fail { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 rsv1:4, + sub:4, + rsv2:8; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 sub:4, + rsv1:4, + rsv2:8; +#else +#error "Adjust your defines" +#endif + __be64 data_seq; +} __attribute__((__packed__)); + +struct mp_fclose { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 rsv1:4, + sub:4, + rsv2:8; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 sub:4, + rsv1:4, + rsv2:8; +#else +#error "Adjust your defines" +#endif + __u64 key; +} __attribute__((__packed__)); + +struct mp_prio { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 b:1, + rsv:3, + sub:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 sub:4, + rsv:3, + b:1; +#else +#error "Adjust your defines" +#endif + __u8 addr_id; +} __attribute__((__packed__)); + +static inline int mptcp_sub_len_dss(struct mp_dss *m, int csum) +{ + return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2); +} + +#define MPTCP_APP 2 + +extern int sysctl_mptcp_enabled; +extern int sysctl_mptcp_checksum; +extern int sysctl_mptcp_debug; +extern int sysctl_mptcp_syn_retries; + +extern struct workqueue_struct *mptcp_wq; + +#define mptcp_debug(fmt, args...) 
\ + do { \ + if (unlikely(sysctl_mptcp_debug)) \ + pr_err(__FILE__ ": " fmt, ##args); \ + } while (0) + +/* Iterates over all subflows */ +#define mptcp_for_each_tp(mpcb, tp) \ + for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next) + +#define mptcp_for_each_sk(mpcb, sk) \ + for ((sk) = (struct sock *)(mpcb)->connection_list; \ + sk; \ + sk = (struct sock *)tcp_sk(sk)->mptcp->next) + +#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) \ + for (__sk = (struct sock *)(__mpcb)->connection_list, \ + __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \ + __sk; \ + __sk = __temp, \ + __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL) + +/* Iterates over all bit set to 1 in a bitset */ +#define mptcp_for_each_bit_set(b, i) \ + for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1) + +#define mptcp_for_each_bit_unset(b, i) \ + mptcp_for_each_bit_set(~b, i) + +extern struct lock_class_key meta_key; +extern struct lock_class_key meta_slock_key; +extern u32 mptcp_secret[MD5_MESSAGE_BYTES / 4]; + +/* This is needed to ensure that two subsequent key-generation result in + * different keys if the IPs and ports are the same. + */ +extern u32 mptcp_key_seed; + +#define MPTCP_HASH_SIZE 1024 + +extern struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE]; + +/* This second hashtable is needed to retrieve request socks + * created as a result of a join request. While the SYN contains + * the token, the final ack does not, so we need a separate hashtable + * to retrieve the mpcb. + */ +extern struct list_head mptcp_reqsk_htb[MPTCP_HASH_SIZE]; +extern spinlock_t mptcp_reqsk_hlock; /* hashtable protection */ + +/* Lock, protecting the two hash-tables that hold the token. Namely, + * mptcp_reqsk_tk_htb and tk_hashtable + */ +extern spinlock_t mptcp_tk_hashlock; /* hashtable protection */ + +void mptcp_data_ready(struct sock *sk, int bytes); +void mptcp_write_space(struct sock *sk); + +void mptcp_add_meta_ofo_queue(struct sock *meta_sk, struct sk_buff *skb, + struct sock *sk); +void mptcp_ofo_queue(struct sock *meta_sk); +void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp); +void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied); +int mptcp_alloc_mpcb(struct sock *master_sk, __u64 remote_key, u32 window); +int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id, + gfp_t flags); +void mptcp_del_sock(struct sock *sk); +void mptcp_update_metasocket(struct sock *sock, struct sock *meta_sk); +void mptcp_reinject_data(struct sock *orig_sk, int clone_it); +void mptcp_update_sndbuf(struct mptcp_cb *mpcb); +struct sk_buff *mptcp_next_segment(struct sock *sk, int *reinject); +void mptcp_send_fin(struct sock *meta_sk); +void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority); +int mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp); +void mptcp_parse_options(const uint8_t *ptr, int opsize, + struct tcp_options_received *opt_rx, + struct mptcp_options_received *mopt, + const struct sk_buff *skb); +void mptcp_syn_options(struct sock *sk, struct tcp_out_options *opts, + unsigned *remaining); +void mptcp_synack_options(struct request_sock *req, + struct tcp_out_options *opts, + unsigned *remaining); +void mptcp_established_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, unsigned *size); +void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, + struct tcp_out_options *opts, + struct sk_buff *skb); +void mptcp_close(struct sock *meta_sk, long timeout); +int 
mptcp_doit(struct sock *sk); +int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window); +int mptcp_check_req_master(struct sock *sk, struct sock *child, + struct request_sock *req, + struct request_sock **prev, + struct mptcp_options_received *mopt); +struct sock *mptcp_check_req_child(struct sock *sk, struct sock *child, + struct request_sock *req, + struct request_sock **prev, + struct mptcp_options_received *mopt); +u32 __mptcp_select_window(struct sock *sk); +void mptcp_select_initial_window(int *__space, __u32 *window_clamp, + const struct sock *sk); +unsigned int mptcp_current_mss(struct sock *meta_sk); +int mptcp_select_size(const struct sock *meta_sk, bool sg); +void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn); +void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2, + u32 *hash_out); +void mptcp_clean_rtx_infinite(struct sk_buff *skb, struct sock *sk); +void mptcp_fin(struct sock *meta_sk); +void mptcp_retransmit_timer(struct sock *meta_sk); +int mptcp_write_wakeup(struct sock *meta_sk); +void mptcp_sub_close_wq(struct work_struct *work); +void mptcp_sub_close(struct sock *sk, unsigned long delay); +struct sock *mptcp_select_ack_sock(const struct sock *meta_sk, int copied); +void mptcp_fallback_meta_sk(struct sock *meta_sk); +int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb); +struct sock *mptcp_sk_clone(const struct sock *sk, int family, const gfp_t priority); +void mptcp_ack_handler(unsigned long); +void mptcp_set_keepalive(struct sock *sk, int val); +int mptcp_check_rtt(const struct tcp_sock *tp, int time); +int mptcp_check_snd_buf(const struct tcp_sock *tp); +int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, struct sk_buff *skb); +void __init mptcp_init(void); +int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len); +int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, + unsigned int mss_now, int reinject); +int mptso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, + unsigned int mss_now, gfp_t gfp, int reinject); +void mptcp_destroy_sock(struct sock *sk); +int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr, + struct sk_buff *skb, + struct mptcp_options_received *mopt); +unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, u32 mss_now, + int large_allowed); +int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw); +void mptcp_twsk_destructor(struct tcp_timewait_sock *tw); +void mptcp_update_tw_socks(const struct tcp_sock *tp, int state); +void mptcp_disconnect(struct sock *sk); +bool mptcp_should_expand_sndbuf(struct sock *meta_sk); +int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb); +void mptcp_tsq_flags(struct sock *sk); +void mptcp_tsq_sub_deferred(struct sock *meta_sk); +struct mp_join *mptcp_find_join(struct sk_buff *skb); +void mptcp_hash_remove_bh(struct tcp_sock *meta_tp); +void mptcp_hash_remove(struct tcp_sock *meta_tp); +struct sock *mptcp_hash_find(struct net *net, u32 token); +int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw); +int mptcp_do_join_short(struct sk_buff *skb, struct mptcp_options_received *mopt, + struct tcp_options_received *tmp_opt, struct net *net); +void mptcp_reqsk_destructor(struct request_sock *req); +void mptcp_reqsk_new_mptcp(struct request_sock *req, + const struct tcp_options_received *rx_opt, + const struct mptcp_options_received *mopt, + const struct sk_buff *skb); +int mptcp_check_req(struct sk_buff *skb, struct net *net); +void 
mptcp_connect_init(struct sock *sk); +void mptcp_sub_force_close(struct sock *sk); +int mptcp_sub_len_remove_addr_align(u16 bitfield); +void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb, + const struct sk_buff *skb); + +/* MPTCP-path-manager registration/initialization functions */ +int mptcp_register_path_manager(struct mptcp_pm_ops *pm); +void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm); +void mptcp_init_path_manager(struct mptcp_cb *mpcb); +void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb); +void mptcp_fallback_default(struct mptcp_cb *mpcb); +void mptcp_get_default_path_manager(char *name); +int mptcp_set_default_path_manager(const char *name); +extern struct mptcp_pm_ops mptcp_pm_default; + +static inline +struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req) +{ + return (struct mptcp_request_sock *)req; +} + +static inline +struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req) +{ + return (struct request_sock *)req; +} + +static inline bool mptcp_can_sendpage(struct sock *sk) +{ + struct sock *sk_it; + + if (tcp_sk(sk)->mpcb->dss_csum) + return false; + + mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) { + if (!(sk_it->sk_route_caps & NETIF_F_SG) || + !(sk_it->sk_route_caps & NETIF_F_ALL_CSUM)) + return false; + } + + return true; +} + +static inline void mptcp_push_pending_frames(struct sock *meta_sk) +{ + if (mptcp_next_segment(meta_sk, NULL)) { + struct tcp_sock *tp = tcp_sk(meta_sk); + + /* We don't care about the MSS, because it will be set in + * mptcp_write_xmit. + */ + __tcp_push_pending_frames(meta_sk, 0, tp->nonagle); + } +} + +static inline void mptcp_send_reset(struct sock *sk) +{ + tcp_send_active_reset(sk, GFP_ATOMIC); + mptcp_sub_force_close(sk); +} + +static inline int mptcp_is_data_seq(const struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ; +} + +static inline int mptcp_is_data_fin(const struct sk_buff *skb) +{ + return mptcp_is_data_seq(skb) && + (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN); +} + +/* Is it a data-fin while in infinite mapping mode? + * In infinite mode, a subflow-fin is in fact a data-fin. + */ +static inline int mptcp_is_data_fin2(const struct sk_buff *skb, + const struct tcp_sock *tp) +{ + return mptcp_is_data_fin(skb) || + (tp->mpcb->infinite_mapping_rcv && tcp_hdr(skb)->fin); +} + +static inline void mptcp_skb_entail_init(const struct tcp_sock *tp, + struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_SEQ; +} + +static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb) +{ + u64 data_seq_high = (u32)(data_seq >> 32); + + if (mpcb->rcv_high_order[0] == data_seq_high) + return 0; + else if (mpcb->rcv_high_order[1] == data_seq_high) + return MPTCPHDR_SEQ64_INDEX; + else + return MPTCPHDR_SEQ64_OFO; +} + +/* Sets the data_seq and returns pointer to the in-skb field of the data_seq. + * If the packet has a 64-bit dseq, the pointer points to the last 32 bits. 
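The helper defined next extracts the DSS data sequence number carried in a received skb. A typical call looks as follows (sketch only; tp is assumed to be the subflow's tcp_sock and the skb to carry MPTCPHDR_SEQ):

	u32 data_seq;
	__u32 *ptr;

	/* fills data_seq and returns a pointer into the skb's DSS option,
	 * at the (low 32 bits of the) data sequence number */
	ptr = mptcp_skb_set_data_seq(skb, &data_seq, tp->mpcb);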
+ */ +static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb, + u32 *data_seq, + struct mptcp_cb *mpcb) +{ + __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off); + + if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) { + u64 data_seq64 = get_unaligned_be64(ptr); + + if (mpcb) + TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb); + + *data_seq = (u32)data_seq64 ; + ptr++; + } else { + *data_seq = get_unaligned_be32(ptr); + } + + return ptr; +} + +static inline struct sock *mptcp_meta_sk(const struct sock *sk) +{ + return tcp_sk(sk)->meta_sk; +} + +static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp) +{ + return tcp_sk(tp->meta_sk); +} + +static inline int is_meta_tp(const struct tcp_sock *tp) +{ + return tp->mpcb && mptcp_meta_tp(tp) == tp; +} + +static inline int is_meta_sk(const struct sock *sk) +{ + return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP && + tcp_sk(sk)->mpc && mptcp_meta_sk(sk) == sk; +} + +static inline int is_master_tp(const struct tcp_sock *tp) +{ + return !tp->mpc || (!tp->mptcp->slave_sk && !is_meta_tp(tp)); +} + +static inline void mptcp_hash_request_remove(struct request_sock *req) +{ + int in_softirq = 0; + + if (list_empty(&mptcp_rsk(req)->collide_tuple)) + return; + + if (in_softirq()) { + spin_lock(&mptcp_reqsk_hlock); + in_softirq = 1; + } else { + spin_lock_bh(&mptcp_reqsk_hlock); + } + + list_del(&mptcp_rsk(req)->collide_tuple); + + if (in_softirq) + spin_unlock(&mptcp_reqsk_hlock); + else + spin_unlock_bh(&mptcp_reqsk_hlock); +} + +static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt) +{ + mopt->saw_mpc = 0; + mopt->dss_csum = 0; + mopt->drop_me = 0; + + mopt->is_mp_join = 0; + mopt->join_ack = 0; + + mopt->saw_low_prio = 0; + mopt->low_prio = 0; + + mopt->saw_add_addr = 0; + mopt->more_add_addr = 0; + + mopt->saw_rem_addr = 0; + mopt->more_rem_addr = 0; + + mopt->mp_fail = 0; + mopt->mp_fclose = 0; +} + +static inline void mptcp_reset_mopt(struct tcp_sock *tp) +{ + struct mptcp_options_received *mopt = &tp->mptcp->rx_opt; + + mopt->saw_low_prio = 0; + mopt->saw_add_addr = 0; + mopt->more_add_addr = 0; + mopt->saw_rem_addr = 0; + mopt->more_rem_addr = 0; + mopt->join_ack = 0; + mopt->mp_fail = 0; + mopt->mp_fclose = 0; +} + +static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb, + const struct mptcp_cb *mpcb) +{ + return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags & + MPTCPHDR_SEQ64_INDEX) ? 1 : 0]); +} + +static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index, + u32 data_seq_32) +{ + return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32; +} + +static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp) +{ + struct mptcp_cb *mpcb = meta_tp->mpcb; + return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, + meta_tp->rcv_nxt); +} + +static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc) +{ + if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) { + struct mptcp_cb *mpcb = meta_tp->mpcb; + mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1; + mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2; + } +} + +static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp, + u32 old_rcv_nxt) +{ + if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) { + struct mptcp_cb *mpcb = meta_tp->mpcb; + mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2; + mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 
0 : 1; + } +} + +static inline int mptcp_sk_can_send(const struct sock *sk) +{ + return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + !tcp_sk(sk)->mptcp->pre_established; +} + +static inline int mptcp_sk_can_recv(const struct sock *sk) +{ + return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCP_FIN_WAIT1 | TCP_FIN_WAIT2); +} + +static inline int mptcp_sk_can_send_ack(const struct sock *sk) +{ + return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | + TCPF_CLOSE | TCPF_LISTEN)) && + !tcp_sk(sk)->mptcp->pre_established; +} + +/* Only support GSO if all subflows supports it */ +static inline bool mptcp_sk_can_gso(const struct sock *meta_sk) +{ + struct sock *sk; + + if (tcp_sk(meta_sk)->mpcb->dss_csum) + return 0; + + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { + if (!mptcp_sk_can_send(sk)) + continue; + if (!sk_can_gso(sk)) + return false; + } + return true; +} + +static inline bool mptcp_can_sg(const struct sock *meta_sk) +{ + struct sock *sk; + + if (tcp_sk(meta_sk)->mpcb->dss_csum) + return 0; + + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { + if (!mptcp_sk_can_send(sk)) + continue; + if (!(sk->sk_route_caps & NETIF_F_SG)) + return false; + } + return true; +} + +/* Adding a new subflow to the rcv-buffer space. We make a simple addition, + * to give some space to allow traffic on the new subflow. Autotuning will + * increase it further later on. + */ +static inline void mptcp_init_buffer_space(struct sock *sk) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + int space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]); + + if (space > meta_sk->sk_rcvbuf) { + tcp_sk(meta_sk)->window_clamp += tcp_sk(sk)->window_clamp; + tcp_sk(meta_sk)->rcv_ssthresh += tcp_sk(sk)->rcv_ssthresh; + meta_sk->sk_rcvbuf = space; + } +} + +static inline void mptcp_set_rto(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sock *sk_it; + struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk)); + __u32 max_rto = 0; + + /* We are in recovery-phase on the MPTCP-level. Do not update the + * RTO, because this would kill exponential backoff. + */ + if (micsk->icsk_retransmits) + return; + + mptcp_for_each_sk(tp->mpcb, sk_it) { + if (mptcp_sk_can_send(sk_it) && + inet_csk(sk_it)->icsk_rto > max_rto) + max_rto = inet_csk(sk_it)->icsk_rto; + } + if (max_rto) { + micsk->icsk_rto = max_rto << 1; + + /* A successfull rto-measurement - reset backoff counter */ + micsk->icsk_backoff = 0; + } +} + +static inline int mptcp_sysctl_syn_retries(void) +{ + return sysctl_mptcp_syn_retries; +} + +static inline void mptcp_sub_close_passive(struct sock *sk) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk); + + /* Only close, if the app did a send-shutdown (passive close), and we + * received the data-ack of the data-fin. + */ + if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq) + mptcp_sub_close(sk, 0); +} + +static inline bool mptcp_fallback_infinite(struct sock *sk, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* If data has been acknowleged on the meta-level, fully_established + * will have been set before and thus we will not fall back to infinite + * mapping. 
+ */ + if (likely(tp->mptcp->fully_established)) + return false; + + if (!(flag & MPTCP_FLAG_DATA_ACKED)) + return false; + + /* Don't fallback twice ;) */ + if (tp->mpcb->infinite_mapping_snd) + return false; + + pr_err("%s %#x will fallback - pi %d, src %pI4 dst %pI4 from %pS\n", + __func__, tp->mpcb->mptcp_loc_token, tp->mptcp->path_index, + &inet_sk(sk)->inet_saddr, &inet_sk(sk)->inet_daddr, + __builtin_return_address(0)); + if (!is_master_tp(tp)) + return true; + + tp->mpcb->infinite_mapping_snd = 1; + tp->mpcb->infinite_mapping_rcv = 1; + tp->mptcp->fully_established = 1; + + return false; +} + +/* Find the first free index in the bitfield */ +static inline int __mptcp_find_free_index(u8 bitfield, int j, u8 base) +{ + int i; + mptcp_for_each_bit_unset(bitfield >> base, i) { + /* We wrapped at the bitfield - try from 0 on */ + if (i + base >= sizeof(bitfield) * 8) { + mptcp_for_each_bit_unset(bitfield, i) { + if (i >= sizeof(bitfield) * 8) + goto exit; + + if (i != j) + return i; + } + goto exit; + } + if (i + base >= sizeof(bitfield) * 8) + break; + + if (i + base != j) + return i + base; + } +exit: + return -1; +} + +static inline int mptcp_find_free_index(u8 bitfield) +{ + return __mptcp_find_free_index(bitfield, -1, 0); +} + +/* Find the first index whose bit in the bit-field == 0 */ +static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb) +{ + u8 base = mpcb->next_path_index; + int i; + + /* Start at 1, because 0 is reserved for the meta-sk */ + mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) { + if (i + base < 1) + continue; + if (i + base >= sizeof(mpcb->path_index_bits) * 8) + break; + i += base; + mpcb->path_index_bits |= (1 << i); + mpcb->next_path_index = i + 1; + return i; + } + mptcp_for_each_bit_unset(mpcb->path_index_bits, i) { + if (i >= sizeof(mpcb->path_index_bits) * 8) + break; + if (i < 1) + continue; + mpcb->path_index_bits |= (1 << i); + mpcb->next_path_index = i + 1; + return i; + } + + return 0; +} + +static inline int mptcp_v6_is_v4_mapped(struct sock *sk) +{ + return sk->sk_family == AF_INET6 && + ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED; +} +#else /* CONFIG_MPTCP */ +#define mptcp_debug(fmt, args...) \ + do { \ + } while (0) + +/* Without MPTCP, we just do one iteration + * over the only socket available. This assumes that + * the sk/tp arg is the socket in that case. 
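Before the !CONFIG_MPTCP stubs: the index helpers above are built on the mptcp_for_each_bit_set()/_unset() iterators from earlier in this header. A toy walk-through with made-up values, useful when reasoning about path-index and address-id allocation:

	int i;
	u8 bits = 0x2c;				/* 0b00101100: slots 2, 3 and 5 in use */

	mptcp_for_each_bit_set(bits, i)
		pr_debug("slot %d is taken\n", i);	/* prints 2, 3, 5 */

	i = mptcp_find_free_index(bits);	/* lowest free slot -> 0 */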
+ */ +#define mptcp_for_each_sk(mpcb, sk) +#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) + +static inline int mptcp_is_data_fin(const struct sk_buff *skb) +{ + return 0; +} +static inline int mptcp_is_data_seq(const struct sk_buff *skb) +{ + return 0; +} +static inline struct sock *mptcp_meta_sk(const struct sock *sk) +{ + return NULL; +} +static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp) +{ + return NULL; +} +static inline int is_meta_sk(const struct sock *sk) +{ + return 0; +} +static inline int is_master_tp(const struct tcp_sock *tp) +{ + return 0; +} +static inline void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) {} +static inline void mptcp_cleanup_rbuf(const struct sock *meta_sk, int copied) {} +static inline void mptcp_del_sock(const struct sock *sk) {} +static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {} +static inline void mptcp_init_buffer_space(const struct sock *sk) {} +static inline void mptcp_update_sndbuf(const struct mptcp_cb *mpcb) {} +static inline void mptcp_skb_entail_init(const struct tcp_sock *tp, + const struct sk_buff *skb) {} +static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb, + const struct sock *sk) {} +static inline void mptcp_retransmit_timer(const struct sock *meta_sk) {} +static inline int mptcp_write_wakeup(struct sock *meta_sk) +{ + return 0; +} +static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {} +static inline void mptcp_set_rto(const struct sock *sk) {} +static inline void mptcp_send_fin(const struct sock *meta_sk) {} +static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize, + const struct tcp_options_received *opt_rx, + const struct mptcp_options_received *mopt, + const struct sk_buff *skb) {} +static inline void mptcp_syn_options(struct sock *sk, + struct tcp_out_options *opts, + unsigned *remaining) {} +static inline void mptcp_synack_options(struct request_sock *req, + struct tcp_out_options *opts, + unsigned *remaining) {} + +static inline void mptcp_established_options(struct sock *sk, + struct sk_buff *skb, + struct tcp_out_options *opts, + unsigned *size) {} +static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, + struct tcp_out_options *opts, + struct sk_buff *skb) {} +static inline void mptcp_close(struct sock *meta_sk, long timeout) {} +static inline int mptcp_doit(struct sock *sk) +{ + return 0; +} +static inline int mptcp_check_req_master(const struct sock *sk, + const struct sock *child, + struct request_sock *req, + struct request_sock **prev, + const struct mptcp_options_received *mopt) +{ + return 1; +} +static inline struct sock *mptcp_check_req_child(struct sock *sk, + struct sock *child, + struct request_sock *req, + struct request_sock **prev, + struct mptcp_options_received *mopt) +{ + return NULL; +} +static inline u32 __mptcp_select_window(const struct sock *sk) +{ + return 0; +} +static inline void mptcp_select_initial_window(int *__space, + __u32 *window_clamp, + const struct sock *sk) {} +static inline unsigned int mptcp_current_mss(struct sock *meta_sk) +{ + return 0; +} +static inline int mptcp_select_size(const struct sock *meta_sk, bool sg) +{ + return 0; +} +static inline void mptcp_sub_close_passive(struct sock *sk) {} +static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag) +{ + return false; +} +static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {} +static inline int mptcp_check_rtt(const struct tcp_sock *tp, int time) +{ + return 
0; +} +static inline int mptcp_check_snd_buf(const struct tcp_sock *tp) +{ + return 0; +} +static inline int mptcp_sysctl_syn_retries(void) +{ + return 0; +} +static inline void mptcp_send_reset(const struct sock *sk) {} +static inline void mptcp_send_active_reset(struct sock *meta_sk, + gfp_t priority) {} +static inline int mptcp_write_xmit(struct sock *sk, unsigned int mss_now, + int nonagle, int push_one, gfp_t gfp) +{ + return 0; +} +static inline struct sock *mptcp_sk_clone(const struct sock *sk, int family, + const gfp_t priority) +{ + return NULL; +} +static inline void mptcp_set_keepalive(struct sock *sk, int val) {} +static inline int mptcp_handle_options(struct sock *sk, + const struct tcphdr *th, + struct sk_buff *skb) +{ + return 0; +} +static inline void mptcp_reset_mopt(struct tcp_sock *tp) {} +static inline void __init mptcp_init(void) {} +static inline int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) +{ + return 0; +} +static inline int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, + unsigned int mss_now, int reinject) +{ + return 0; +} +static inline int mptso_fragment(struct sock *sk, struct sk_buff *skb, + unsigned int len, unsigned int mss_now, + gfp_t gfp, int reinject) +{ + return 0; +} +static inline bool mptcp_sk_can_gso(const struct sock *sk) +{ + return false; +} +static inline bool mptcp_can_sg(const struct sock *meta_sk) +{ + return false; +} +static inline unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, + u32 mss_now, int large_allowed) +{ + return 0; +} +static inline void mptcp_destroy_sock(struct sock *sk) {} +static inline int mptcp_rcv_synsent_state_process(struct sock *sk, + struct sock **skptr, + struct sk_buff *skb, + struct mptcp_options_received *mopt) +{ + return 0; +} +static inline bool mptcp_can_sendpage(struct sock *sk) +{ + return false; +} +static inline int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw) +{ + return 0; +} +static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {} +static inline void mptcp_update_tw_socks(const struct tcp_sock *tp, int state) {} +static inline void mptcp_disconnect(struct sock *sk) {} +static inline bool mptcp_should_expand_sndbuf(struct sock *meta_sk) +{ + return false; +} +static inline void mptcp_tsq_flags(struct sock *sk) {} +static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {} +static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {} +static inline void mptcp_hash_remove(struct tcp_sock *meta_tp) {} +static inline void mptcp_reqsk_new_mptcp(struct request_sock *req, + const struct tcp_options_received *rx_opt, + const struct mptcp_options_received *mopt, + const struct sk_buff *skb) {} +static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb, + const struct sk_buff *skb) {} +#endif /* CONFIG_MPTCP */ + +#endif /* _MPTCP_H */ diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h new file mode 100644 index 0000000..3c3b5eb --- /dev/null +++ b/include/net/mptcp_v4.h @@ -0,0 +1,69 @@ +/* + * MPTCP implementation + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public 
License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef MPTCP_V4_H_ +#define MPTCP_V4_H_ + + +#include +#include +#include +#include +#include + +extern struct request_sock_ops mptcp_request_sock_ops; + +#ifdef CONFIG_MPTCP + +int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb); +int mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id); +int mptcp_v4_add_raddress(struct mptcp_cb *mpcb, const struct in_addr *addr, + __be16 port, u8 id); +void mptcp_v4_set_init_addr_bit(struct mptcp_cb *mpcb, __be32 daddr, u8 id); +struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr, + const __be32 laddr, const struct net *net); +int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc, + struct mptcp_rem4 *rem); +int mptcp_pm_v4_init(void); +void mptcp_pm_v4_undo(void); +u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, + u32 seq); +u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport); + +#else + +static inline int mptcp_v4_do_rcv(const struct sock *meta_sk, + const struct sk_buff *skb) +{ + return 0; +} + +#endif /* CONFIG_MPTCP */ + +#endif /* MPTCP_V4_H_ */ diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h new file mode 100644 index 0000000..c27e153b --- /dev/null +++ b/include/net/mptcp_v6.h @@ -0,0 +1,77 @@ +/* + * MPTCP implementation + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Jaakko Korkeaniemi + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _MPTCP_V6_H +#define _MPTCP_V6_H + +#include +#include + +#include + +extern struct request_sock_ops mptcp6_request_sock_ops; +extern struct proto mptcpv6_prot; + +struct mptcp6_request_sock { + struct mptcp_request_sock mptcp6rsk_tcp; + struct inet6_request_sock mptcp6rsk_inet6; +}; + +#ifdef CONFIG_MPTCP + +int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb); +int mptcp_v6_rem_raddress(struct mptcp_cb *mpcb, u8 id); +int mptcp_v6_add_raddress(struct mptcp_cb *mpcb, const struct in6_addr *addr, + __be16 port, u8 id); +void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb, + const struct in6_addr *daddr, u8 id); +struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr, + const struct in6_addr *laddr, const struct net *net); +int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc, + struct mptcp_rem6 *rem); +int mptcp_pm_v6_init(void); +void mptcp_pm_v6_undo(void); +struct sock *mptcp_v6v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); +__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport, u32 seq); +u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport); + +#else /* CONFIG_MPTCP */ + +static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb) +{ + return 0; +} + +#endif /* CONFIG_MPTCP */ + +#endif /* _MPTCP_V6_H */ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 9d22f08..a9f7585 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -89,6 +90,9 @@ struct net { #if IS_ENABLED(CONFIG_IPV6) struct netns_ipv6 ipv6; #endif +#if IS_ENABLED(CONFIG_MPTCP) + struct netns_mptcp mptcp; +#endif #if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE) struct netns_sctp sctp; #endif diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h new file mode 100644 index 0000000..bad418b --- /dev/null +++ b/include/net/netns/mptcp.h @@ -0,0 +1,44 @@ +/* + * MPTCP implementation - MPTCP namespace + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
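The netns_mptcp member added to struct net above gives path managers a per-namespace anchor. A hypothetical accessor (struct netns_mptcp and the MPTCP_PM_FULLMESH index are defined just below; the "fullmesh" slot and the void * payload are only an assumption about how a path manager might use the array):

	static inline void *mptcp_fm_ns(const struct net *net)
	{
		return net->mptcp.path_managers[MPTCP_PM_FULLMESH];
	}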
+ */ + +#ifndef __NETNS_MPTCP_H__ +#define __NETNS_MPTCP_H__ + +#include + +enum { + MPTCP_PM_FULLMESH = 0, + MPTCP_PM_MAX +}; + +struct netns_mptcp { + void *path_managers[MPTCP_PM_MAX]; +}; + +#endif /* __NETNS_MPTCP_H__ */ diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 59795e4..fd01bc7 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -162,8 +162,8 @@ struct request_sock_queue { */ }; -extern int reqsk_queue_alloc(struct request_sock_queue *queue, - unsigned int nr_table_entries); +int reqsk_queue_alloc(struct request_sock_queue *queue, + unsigned int nr_table_entries, gfp_t flags); extern void __reqsk_queue_destroy(struct request_sock_queue *queue); extern void reqsk_queue_destroy(struct request_sock_queue *queue); diff --git a/include/net/sock.h b/include/net/sock.h index 6e2c490..51e67fa 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -868,6 +868,16 @@ extern void sk_clear_memalloc(struct sock *sk); extern int sk_wait_data(struct sock *sk, long *timeo); +/* START - needed for MPTCP */ +extern void sock_def_error_report(struct sock *sk); +extern struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, + int family); +extern void sock_lock_init(struct sock *sk); + +extern struct lock_class_key af_callback_keys[AF_MAX]; +extern char *const af_family_clock_key_strings[AF_MAX+1]; +/* END - needed for MPTCP */ + struct request_sock_ops; struct timewait_sock_ops; struct inet_hashinfo; diff --git a/include/net/tcp.h b/include/net/tcp.h index 31c4890..36564a9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -176,6 +176,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ +#define TCPOPT_MPTCP 30 #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt @@ -234,6 +235,27 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); */ #define TFO_SERVER_ALWAYS 0x1000 +/* Flags from tcp_input.c for tcp_ack */ +#define FLAG_DATA 0x01 /* Incoming frame contained data. */ +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK. 
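A few hunks up, reqsk_queue_alloc() gained an explicit gfp_t, presumably so that MPTCP can allocate request-sock queues from contexts where GFP_KERNEL is not safe. A sketch of the updated call from a listener set up in process context; the icsk/backlog names follow the usual inet_csk_listen_start() pattern but are shown here only as an illustration:

	int err;

	err = reqsk_queue_alloc(&icsk->icsk_accept_queue, backlog, GFP_KERNEL);
	if (err)
		return err;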
*/ +#define FLAG_ECE 0x40 /* ECE in this ACK */ +#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ +#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ +#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ +#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ +#define MPTCP_FLAG_DATA_ACKED 0x8000 + +#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) +#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) +#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) + extern struct inet_timewait_death_row tcp_death_row; /* sysctl variables for tcp */ @@ -348,6 +370,108 @@ extern struct proto tcp_prot; #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) +/**** START - Exports needed for MPTCP ****/ +extern const struct inet_connection_sock_af_ops ipv4_specific; +extern const struct inet_connection_sock_af_ops ipv6_specific; +extern const struct inet_connection_sock_af_ops ipv6_mapped; +extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; +extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; + +struct mptcp_options_received; + +int tcp_close_state(struct sock *sk); +void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle); +int tcp_xmit_probe_skb(struct sock *sk, int urgent); +void tcp_cwnd_validate(struct sock *sk); +void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb); +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask); +unsigned int tcp_mss_split_point(const struct sock *sk, + const struct sk_buff *skb, + unsigned int mss_now, + unsigned int cwnd); +bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb); +bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, + unsigned int cur_mss, int nonagle); +bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, + unsigned int cur_mss); +unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb); +int tcp_mtu_probe(struct sock *sk); +int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, + unsigned int mss_now); +void __pskb_trim_head(struct sk_buff *skb, int len); +void tcp_queue_skb(struct sock *sk, struct sk_buff *skb); +void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags); +void tcp_reset(struct sock *sk); +bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, + const u32 ack_seq, const u32 nwin); +bool tcp_urg_mode(const struct tcp_sock *tp); +void tcp_ack_probe(struct sock *sk); +void tcp_rearm_rto(struct sock *sk); +int tcp_write_timeout(struct sock *sk); +bool retransmits_timed_out(struct sock *sk, unsigned int boundary, + unsigned int timeout, bool syn_set); +void tcp_write_err(struct sock *sk); +void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr); +void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, + unsigned int mss_now); + +int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req); +void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *req); +__u32 tcp_v4_init_sequence(const struct sk_buff *skb); +int 
tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + u16 queue_mapping); +void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb); +struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb); +struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb); +void tcp_v4_reqsk_destructor(struct request_sock *req); + +int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req); +void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *req); +__u32 tcp_v6_init_sequence(const struct sk_buff *skb); +int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, + struct flowi6 *fl6, struct request_sock *req, + u16 queue_mapping); +void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); +int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); +int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +void tcp_v6_destroy_sock(struct sock *sk); +void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); +void tcp_v6_hash(struct sock *sk); +struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb); +struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); +void tcp_v6_reqsk_destructor(struct request_sock *req); + +void sock_valbool_flag(struct sock *sk, int bit, int valbool); +unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, + int large_allowed); +u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb); + +void skb_clone_fraglist(struct sk_buff *skb); +void copy_skb_header(struct sk_buff *new, const struct sk_buff *old); + +void inet_twsk_free(struct inet_timewait_sock *tw); +/* These states need RST on ABORT according to RFC793 */ +static inline bool tcp_need_reset(int state) +{ + return (1 << state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | + TCPF_FIN_WAIT2 | TCPF_SYN_RECV); +} + +bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, + int hlen); +int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, + bool *fragstolen); +bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, + struct sk_buff *from, bool *fragstolen); +/**** END - Exports needed for MPTCP ****/ + extern void tcp_init_mem(struct net *net); extern void tcp_tasklet_init(void); @@ -446,6 +570,7 @@ extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); extern void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, + struct mptcp_options_received *mopt_rx, int estab, struct tcp_fastopen_cookie *foc); extern const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); @@ -713,14 +838,24 @@ void tcp_send_window_probe(struct sock *sk); */ struct tcp_skb_cb { union { - struct inet_skb_parm h4; + union { + struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; + struct inet6_skb_parm h6; +#endif + } header; /* For incoming frames */ +#ifdef CONFIG_MPTCP + __u32 path_mask; /* path indices that tried to send this skb */ #endif - } header; /* For incoming frames */ + }; __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ __u32 when; /* used to compute rtt's */ +#ifdef CONFIG_MPTCP + __u8 mptcp_flags; /* flags for the MPTCP layer */ + __u8 dss_off; /* Number of 4-byte words until + * seq-number */ +#endif __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK/FACK. 
*/ @@ -1074,7 +1209,7 @@ extern u32 tcp_default_init_rwnd(u32 mss); extern void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, - __u32 init_rcv_wnd); + __u32 init_rcv_wnd, const struct sock *sk); static inline int tcp_win_from_space(int space) { @@ -1086,12 +1221,18 @@ static inline int tcp_win_from_space(int space) /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { + if (tcp_sk(sk)->mpc) + sk = tcp_sk(sk)->meta_sk; + return tcp_win_from_space(sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)); } static inline int tcp_full_space(const struct sock *sk) { + if (tcp_sk(sk)->mpc) + sk = tcp_sk(sk)->meta_sk; + return tcp_win_from_space(sk->sk_rcvbuf); } @@ -1106,6 +1247,7 @@ static inline void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; + tcp_rsk(req)->saw_mpc = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index 1ec407b..d30e03a 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -53,6 +53,9 @@ #define IFF_ECHO 0x40000 /* echo sent packets */ +#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */ +#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */ + #define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\ IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 377f1e5..2ffcb03 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -112,6 +112,7 @@ enum { #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ #define TCP_TIMESTAMP 24 #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ +#define MPTCP_ENABLED 26 struct tcp_repair_opt { __u32 opt_code; diff --git a/net/Kconfig b/net/Kconfig index b50dacc..0871482 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -79,6 +79,7 @@ if INET source "net/ipv4/Kconfig" source "net/ipv6/Kconfig" source "net/netlabel/Kconfig" +source "net/mptcp/Kconfig" endif # if INET diff --git a/net/Makefile b/net/Makefile index 9492e8c..c41f2c6 100644 --- a/net/Makefile +++ b/net/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ obj-$(CONFIG_NET) += ipv6/ +obj-$(CONFIG_MPTCP) += mptcp/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ diff --git a/net/core/dev.c b/net/core/dev.c index 01d53d6..fad678b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5022,7 +5022,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | - IFF_AUTOMEDIA)) | + IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) | (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | IFF_ALLMULTI)); diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 4425148..e128f08 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -38,7 +38,8 @@ int sysctl_max_syn_backlog = 256; EXPORT_SYMBOL(sysctl_max_syn_backlog); int reqsk_queue_alloc(struct request_sock_queue *queue, - unsigned int nr_table_entries) + unsigned int nr_table_entries, + gfp_t flags) { size_t lopt_size = sizeof(struct 
listen_sock); struct listen_sock *lopt; @@ -48,9 +49,11 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); lopt_size += nr_table_entries * sizeof(struct request_sock *); if (lopt_size > PAGE_SIZE) - lopt = vzalloc(lopt_size); + lopt = __vmalloc(lopt_size, + flags | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); else - lopt = kzalloc(lopt_size, GFP_KERNEL); + lopt = kzalloc(lopt_size, flags); if (lopt == NULL) return -ENOMEM; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f69f2ed..6bac678 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -459,7 +459,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb) skb_drop_list(&skb_shinfo(skb)->frag_list); } -static void skb_clone_fraglist(struct sk_buff *skb) +void skb_clone_fraglist(struct sk_buff *skb) { struct sk_buff *list; @@ -882,7 +882,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) skb->inner_mac_header += off; } -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { __copy_skb_header(new, old); diff --git a/net/core/sock.c b/net/core/sock.c index ec228a3..34643a8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -231,7 +231,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX" }; -static const char *const af_family_clock_key_strings[AF_MAX+1] = { +char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , @@ -252,7 +252,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { * sk_callback_lock locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ -static struct lock_class_key af_callback_keys[AF_MAX]; +struct lock_class_key af_callback_keys[AF_MAX]; /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across @@ -608,7 +608,7 @@ out: return ret; } -static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) +void sock_valbool_flag(struct sock *sk, int bit, int valbool) { if (valbool) sock_set_flag(sk, bit); @@ -1196,7 +1196,7 @@ lenout: * * (We also register the sk_lock with the lock validator.) 
*/ -static inline void sock_lock_init(struct sock *sk) +void sock_lock_init(struct sock *sk) { sock_lock_init_class_and_name(sk, af_family_slock_key_strings[sk->sk_family], @@ -1244,7 +1244,7 @@ void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) } EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); -static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, +struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { struct sock *sk; @@ -2187,7 +2187,7 @@ static void sock_def_wakeup(struct sock *sk) rcu_read_unlock(); } -static void sock_def_error_report(struct sock *sk) +void sock_def_error_report(struct sock *sk) { struct socket_wq *wq; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 05c57f0..4413a45 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -556,6 +556,22 @@ config TCP_CONG_ILLINOIS For further details see: http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html +config TCP_CONG_COUPLED + tristate "MPTCP COUPLED CONGESTION CONTROL" + depends on MPTCP + default n + ---help--- + MultiPath TCP Coupled Congestion Control + To enable it, just put 'coupled' in tcp_congestion_control + +config TCP_CONG_OLIA + tristate "MPTCP Opportunistic Linked Increase" + depends on MPTCP + default n + ---help--- + MultiPath TCP Opportunistic Linked Increase Congestion Control + To enable it, just put 'olia' in tcp_congestion_control + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -584,6 +600,12 @@ choice config DEFAULT_WESTWOOD bool "Westwood" if TCP_CONG_WESTWOOD=y + config DEFAULT_COUPLED + bool "Coupled" if TCP_CONG_COUPLED=y + + config DEFAULT_OLIA + bool "Olia" if TCP_CONG_OLIA=y + config DEFAULT_RENO bool "Reno" @@ -605,6 +627,7 @@ config DEFAULT_TCP_CONG default "vegas" if DEFAULT_VEGAS default "westwood" if DEFAULT_WESTWOOD default "veno" if DEFAULT_VENO + default "coupled" if DEFAULT_COUPLED default "reno" if DEFAULT_RENO default "cubic" diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index cfeb85c..1cca208 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -104,6 +104,7 @@ #include #include #include +#include #include #include #include @@ -272,8 +273,7 @@ EXPORT_SYMBOL(build_ehash_secret); * Create an inet socket. */ -static int inet_create(struct net *net, struct socket *sock, int protocol, - int kern) +int inet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct inet_protosw *answer; @@ -709,6 +709,23 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) lock_sock(sk2); sock_rps_record_flow(sk2); + + if (sk2->sk_protocol == IPPROTO_TCP && tcp_sk(sk2)->mpc) { + struct sock *sk_it = sk2; + + mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it) + sock_rps_record_flow(sk_it); + + if (tcp_sk(sk2)->mpcb->master_sk) { + sk_it = tcp_sk(sk2)->mpcb->master_sk; + + write_lock_bh(&sk_it->sk_callback_lock); + sk_it->sk_wq = newsock->wq; + sk_it->sk_socket = newsock; + write_unlock_bh(&sk_it->sk_callback_lock); + } + } + WARN_ON(!((1 << sk2->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_CLOSE_WAIT | TCPF_CLOSE))); @@ -1741,6 +1758,9 @@ static int __init inet_init(void) ip_init(); + /* We must initialize MPTCP before TCP. */ + mptcp_init(); + tcp_v4_init(); /* Setup TCP slab cache for open requests. 
*/ diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6acb541..56b5a3f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -477,8 +478,8 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) +u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd, + const u32 synq_hsize) { return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); } @@ -675,7 +676,12 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, const struct request_sock *req, const gfp_t priority) { - struct sock *newsk = sk_clone_lock(sk, priority); + struct sock *newsk; + + if (sk->sk_protocol == IPPROTO_TCP && tcp_sk(sk)->mpc) + newsk = mptcp_sk_clone(sk, req->rsk_ops->family, priority); + else + newsk = sk_clone_lock(sk, priority); if (newsk != NULL) { struct inet_connection_sock *newicsk = inet_csk(newsk); @@ -752,7 +758,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) { struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries, + GFP_KERNEL); if (rc != 0) return rc; @@ -810,9 +817,14 @@ void inet_csk_listen_stop(struct sock *sk) while ((req = acc_req) != NULL) { struct sock *child = req->sk; + bool mutex_taken = false; acc_req = req->dl_next; + if (is_meta_sk(child)) { + mutex_lock(&tcp_sk(child)->mpcb->mpcb_mutex); + mutex_taken = true; + } local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); @@ -841,6 +853,8 @@ void inet_csk_listen_stop(struct sock *sk) bh_unlock_sock(child); local_bh_enable(); + if (mutex_taken) + mutex_unlock(&tcp_sk(child)->mpcb->mpcb_mutex); sock_put(child); sk_acceptq_removed(sk); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 1f27c9f..2e2cf24 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -99,7 +99,7 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, } } -static noinline void inet_twsk_free(struct inet_timewait_sock *tw) +void inet_twsk_free(struct inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; twsk_destructor((struct sock *)tw); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 15e0241..a1baa43 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -289,7 +289,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, 0, NULL); + tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL); if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) goto out; @@ -362,7 +362,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, &rcv_wscale, - dst_metric(&rt->dst, RTAX_INITRWND)); + dst_metric(&rt->dst, RTAX_INITRWND), sk); ireq->rcv_wscale = rcv_wscale; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 531ab57..72eb146 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,6 +271,7 @@ #include #include +#include #include 
#include #include @@ -603,6 +604,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; tcb->sacked = 0; + if (tp->mpc) + mptcp_skb_entail_init(tp, skb); skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; @@ -617,7 +620,7 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) tp->snd_up = tp->write_seq; } -static inline void tcp_push(struct sock *sk, int flags, int mss_now, +void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle) { if (tcp_send_head(sk)) { @@ -683,6 +686,14 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, int ret; sock_rps_record_flow(sk); + +#ifdef CONFIG_MPTCP + if (tcp_sk(sk)->mpc) { + struct sock *sk_it; + mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) + sock_rps_record_flow(sk_it); + } +#endif /* * We can't seek on a socket input */ @@ -778,8 +789,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) return NULL; } -static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, - int large_allowed) +unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); u32 xmit_size_goal, old_size_goal; @@ -829,8 +839,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; - mss_now = tcp_current_mss(sk); - *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + if (tcp_sk(sk)->mpc) { + mss_now = mptcp_current_mss(sk); + *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + } else { + mss_now = tcp_current_mss(sk); + *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + } return mss_now; } @@ -854,6 +869,26 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, goto out_err; } + if (tp->mpc) { + struct sock *sk_it = sk; + + /* We must check this with socket-lock hold because we iterate + * over the subflows. 
+ */ + if (!mptcp_can_sendpage(sk)) { + ssize_t ret; + + release_sock(sk); + ret = sock_no_sendpage(sk->sk_socket, page, offset, + size, flags); + lock_sock(sk); + return ret; + } + + mptcp_for_each_sk(tp->mpcb, sk_it) + sock_rps_record_flow(sk_it); + } + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -957,8 +992,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, { ssize_t res; - if (!(sk->sk_route_caps & NETIF_F_SG) || - !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) + /* If MPTCP is enabled, we check it later after establishment */ + if (!tcp_sk(sk)->mpc && (!(sk->sk_route_caps & NETIF_F_SG) || + !(sk->sk_route_caps & NETIF_F_ALL_CSUM))) return sock_no_sendpage(sk->sk_socket, page, offset, size, flags); @@ -974,6 +1010,9 @@ static inline int select_size(const struct sock *sk, bool sg) const struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; + if (tp->mpc) + return mptcp_select_size(sk, sg); + if (sg) { if (sk_can_gso(sk)) { /* Small frames wont use a full page: @@ -1061,6 +1100,12 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto do_error; } + if (tp->mpc) { + struct sock *sk_it = sk; + mptcp_for_each_sk(tp->mpcb, sk_it) + sock_rps_record_flow(sk_it); + } + if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); @@ -1088,7 +1133,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto out_err; - sg = !!(sk->sk_route_caps & NETIF_F_SG); + if (tp->mpc) + sg = mptcp_can_sg(sk); + else + sg = !!(sk->sk_route_caps & NETIF_F_SG); while (--iovlen >= 0) { size_t seglen = iov->iov_len; @@ -1139,8 +1187,15 @@ new_segment: /* * Check whether we can use HW checksum. + * + * If dss-csum is enabled, we do not do hw-csum. + * In case of non-mptcp we check the + * device-capabilities. + * In case of mptcp, hw-csum's will be handled + * later in mptcp_write_xmit. */ - if (sk->sk_route_caps & NETIF_F_ALL_CSUM) + if (((tp->mpc && !tp->mpcb->dss_csum) || !tp->mpc) && + (tp->mpc || sk->sk_route_caps & NETIF_F_ALL_CSUM)) skb->ip_summed = CHECKSUM_PARTIAL; skb_entail(sk, skb); @@ -1340,6 +1395,11 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + if (is_meta_sk(sk)) { + mptcp_cleanup_rbuf(sk, copied); + return; + } + WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); @@ -1577,6 +1637,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, lock_sock(sk); +#ifdef CONFIG_MPTCP + if (tp->mpc) { + struct sock *sk_it; + mptcp_for_each_sk(tp->mpcb, sk_it) + sock_rps_record_flow(sk_it); + } +#endif + err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; @@ -2024,7 +2092,7 @@ static const unsigned char new_state[16] = { /* TCP_CLOSING */ TCP_CLOSING, }; -static int tcp_close_state(struct sock *sk) +int tcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; @@ -2053,8 +2121,12 @@ void tcp_shutdown(struct sock *sk, int how) (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { /* Clear out any half completed packets. FIN if needed. 
*/ - if (tcp_close_state(sk)) - tcp_send_fin(sk); + if (tcp_close_state(sk)) { + if (!is_meta_sk(sk)) + tcp_send_fin(sk); + else + mptcp_send_fin(sk); + } } } EXPORT_SYMBOL(tcp_shutdown); @@ -2079,6 +2151,11 @@ void tcp_close(struct sock *sk, long timeout) int data_was_unread = 0; int state; + if (is_meta_sk(sk)) { + mptcp_close(sk, timeout); + return; + } + lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; @@ -2245,15 +2322,6 @@ out: } EXPORT_SYMBOL(tcp_close); -/* These states need RST on ABORT according to RFC793 */ - -static inline bool tcp_need_reset(int state) -{ - return (1 << state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | - TCPF_FIN_WAIT2 | TCPF_SYN_RECV); -} - int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2294,6 +2362,13 @@ int tcp_disconnect(struct sock *sk, int flags) if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); + if (is_meta_sk(sk)) { + mptcp_disconnect(sk); + } else { + if (tp->inside_tk_table) + mptcp_hash_remove_bh(tp); + } + sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt = 0; @@ -2553,6 +2628,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, elapsed = tp->keepalive_time - elapsed; else elapsed = 0; + if (tp->mpc) { + struct sock *sk_it = sk; + mptcp_for_each_sk(tp->mpcb, sk_it) + if (!(1 << sk->sk_state & (TCPF_CLOSE | TCPF_LISTEN))) + inet_csk_reset_keepalive_timer(sk_it, elapsed); + break; + } inet_csk_reset_keepalive_timer(sk, elapsed); } } @@ -2653,6 +2735,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level, tp->notsent_lowat = val; sk->sk_write_space(sk); break; +#ifdef CONFIG_MPTCP + case MPTCP_ENABLED: + if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_LISTEN) { + if (val) + tp->mptcp_enabled = 1; + else + tp->mptcp_enabled = 0; + } else { + err = -EPERM; + } + break; +#endif default: err = -ENOPROTOOPT; break; @@ -2872,6 +2966,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; break; +#ifdef CONFIG_MPTCP + case MPTCP_ENABLED: + val = tp->mptcp_enabled; + break; +#endif default: return -ENOPROTOOPT; } @@ -3061,8 +3160,11 @@ void tcp_done(struct sock *sk) if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + WARN_ON(sk->sk_state == TCP_CLOSE); tcp_set_state(sk, TCP_CLOSE); + tcp_clear_xmit_timers(sk); + if (req != NULL) reqsk_fastopen_remove(sk, req, false); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 068c8fb..2b3d43e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -74,6 +74,9 @@ #include #include #include +#include +#include +#include int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; @@ -99,25 +102,6 @@ int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_early_retrans __read_mostly = 3; -#define FLAG_DATA 0x01 /* Incoming frame contained data. */ -#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ -#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ -#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ -#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ -#define FLAG_DATA_SACKED 0x20 /* New SACK. 
*/ -#define FLAG_ECE 0x40 /* ECE in this ACK */ -#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ -#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ -#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ -#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ -#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ -#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ - -#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) -#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) -#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) -#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) - #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) @@ -322,10 +306,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); /* Check #1 */ - if (tp->rcv_ssthresh < tp->window_clamp && - (int)tp->rcv_ssthresh < tcp_space(sk) && + if (meta_tp->rcv_ssthresh < meta_tp->window_clamp && + (int)meta_tp->rcv_ssthresh < tcp_space(sk) && !sk_under_memory_pressure(sk)) { int incr; @@ -333,14 +319,14 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) * will fit to rcvbuf in future. */ if (tcp_win_from_space(skb->truesize) <= skb->len) - incr = 2 * tp->advmss; + incr = 2 * meta_tp->advmss; else - incr = __tcp_grow_window(sk, skb); + incr = __tcp_grow_window(meta_sk, skb); if (incr) { incr = max_t(int, incr, 2 * skb->len); - tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, - tp->window_clamp); + meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr, + meta_tp->window_clamp); inet_csk(sk)->icsk_ack.quick |= 1; } } @@ -393,6 +379,11 @@ void tcp_init_buffer_space(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_time_stamp; + + if (tp->mpc) { + mptcp_init_buffer_space(sk); + mptcp_update_sndbuf(tp->mpcb); + } } /* 5. Recalculate window clamp after socket hit its memory bounds. */ @@ -518,7 +509,10 @@ void tcp_rcv_space_adjust(struct sock *sk) goto new_measure; time = tcp_time_stamp - tp->rcvq_space.time; - if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) + if (tp->mpc) { + if (mptcp_check_rtt(tp, time)) + return; + } else if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) return; space = 2 * (tp->copied_seq - tp->rcvq_space.seq); @@ -744,6 +738,8 @@ void tcp_set_rto(struct sock *sk) * guarantees that rto is higher. */ tcp_bound_rto(sk); + if (tp->mpc) + mptcp_set_rto(sk); } __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) @@ -1328,7 +1324,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, int len; int in_sack; - if (!sk_can_gso(sk)) + /* For MPTCP we cannot shift skb-data and remove one skb from the + * send-queue, because this will make us loose the DSS-option (which + * is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing. + */ + if (!sk_can_gso(sk) || tp->mpc) goto fallback; /* Normally R but no L won't result in plain S */ @@ -2948,7 +2948,7 @@ void tcp_resume_early_retransmit(struct sock *sk) } /* If we get here, the whole TSO packet has not been acked. 
*/ -static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) +u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); u32 packets_acked; @@ -3042,6 +3042,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, */ if (!(scb->tcp_flags & TCPHDR_SYN)) { flag |= FLAG_DATA_ACKED; + if (tp->mpc && mptcp_is_data_seq(skb)) + flag |= MPTCP_FLAG_DATA_ACKED; } else { flag |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; @@ -3144,7 +3146,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, return flag; } -static void tcp_ack_probe(struct sock *sk) +void tcp_ack_probe(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -3191,9 +3193,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ -static inline bool tcp_may_update_window(const struct tcp_sock *tp, - const u32 ack, const u32 ack_seq, - const u32 nwin) +bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, + const u32 ack_seq, const u32 nwin) { return after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || @@ -3312,7 +3313,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) } /* This routine deals with incoming acks, but not outgoing ones. */ -static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3405,6 +3406,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); acked -= tp->packets_out; + if (tp->mpc) { + if (mptcp_fallback_infinite(sk, flag)) { + pr_err("%s resetting flow\n", __func__); + mptcp_send_reset(sk); + goto invalid_ack; + } + + mptcp_clean_rtx_infinite(skb, sk); + } + /* Advance cwnd if state allows */ if (tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight); @@ -3469,8 +3480,9 @@ old_ack: * the fast version below fails. */ void tcp_parse_options(const struct sk_buff *skb, - struct tcp_options_received *opt_rx, int estab, - struct tcp_fastopen_cookie *foc) + struct tcp_options_received *opt_rx, + struct mptcp_options_received *mopt, + int estab, struct tcp_fastopen_cookie *foc) { const unsigned char *ptr; const struct tcphdr *th = tcp_hdr(skb); @@ -3553,6 +3565,10 @@ void tcp_parse_options(const struct sk_buff *skb, */ break; #endif + case TCPOPT_MPTCP: + mptcp_parse_options(ptr - 2, opsize, opt_rx, + mopt, skb); + break; case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. It's valid only in @@ -3614,8 +3630,8 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, if (tcp_parse_aligned_timestamp(tp, th)) return true; } - - tcp_parse_options(skb, &tp->rx_opt, 1, NULL); + tcp_parse_options(skb, &tp->rx_opt, tp->mpc ? 
&tp->mptcp->rx_opt : NULL, + 1, NULL); if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; @@ -3788,6 +3804,8 @@ static void tcp_fin(struct sock *sk) dst = __sk_dst_get(sk); if (!dst || !dst_metric(dst, RTAX_QUICKACK)) inet_csk(sk)->icsk_ack.pingpong = 1; + if (tp->mpc) + mptcp_sub_close_passive(sk); break; case TCP_CLOSE_WAIT: @@ -3809,6 +3827,13 @@ static void tcp_fin(struct sock *sk) tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: + if (tp->mpc) { + /* The socket will get closed by mptcp_data_ready. + * We first have to process all data-sequences. + */ + tp->close_it = 1; + break; + } /* Received a FIN -- send ACK and enter TIME_WAIT. */ tcp_send_ack(sk); tcp_time_wait(sk, TCP_TIME_WAIT, 0); @@ -3833,6 +3858,10 @@ static void tcp_fin(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); + /* Don't wake up MPTCP-subflows */ + if (tp->mpc) + return; + /* Do not send POLL_HUP for half duplex close. */ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) @@ -4030,7 +4059,11 @@ static void tcp_ofo_queue(struct sock *sk) tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + /* In case of MPTCP, the segment may be empty if it's a + * non-data DATA_FIN. (see beginning of tcp_data_queue) + */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) && + !(tp->mpc && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)) { SOCK_DEBUG(sk, "ofo packet was already received\n"); __skb_unlink(skb, &tp->out_of_order_queue); __kfree_skb(skb); @@ -4054,6 +4087,9 @@ static int tcp_prune_queue(struct sock *sk); static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size) { + if (tcp_sk(sk)->mpc) + sk = mptcp_meta_sk(sk); + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || !sk_rmem_schedule(sk, skb, size)) { @@ -4084,15 +4120,16 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, * Better try to coalesce them right now to avoid future collapses. * Returns true if caller should free @from instead of queueing it */ -static bool tcp_try_coalesce(struct sock *sk, - struct sk_buff *to, - struct sk_buff *from, - bool *fragstolen) +bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from, + bool *fragstolen) { int delta; *fragstolen = false; + if (tcp_sk(sk)->mpc && !is_meta_sk(sk)) + return false; + if (tcp_hdr(from)->fin) return false; @@ -4182,7 +4219,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* Do skb overlap to previous one? */ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { - if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* MPTCP allows non-data data-fin to be in the ofo-queue */ + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) && + !(tp->mpc && end_seq == seq)) { /* All the bits are present. Drop. 
*/ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); __kfree_skb(skb); @@ -4220,6 +4259,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq); break; } + /* MPTCP allows non-data data-fin to be in the ofo-queue */ + if (tp->mpc && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq) + continue; __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); @@ -4237,8 +4279,8 @@ end: } } -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, - bool *fragstolen) +int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, + bool *fragstolen) { int eaten; struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); @@ -4300,7 +4342,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) int eaten = -1; bool fragstolen = false; - if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) + /* If no data is present, but a data_fin is in the options, we still + * have to call mptcp_queue_skb later on. */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && + !(tp->mpc && mptcp_is_data_fin(skb))) goto drop; skb_dst_drop(skb); @@ -4346,7 +4391,7 @@ queue_and_out: eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if (skb->len) + if (skb->len || mptcp_is_data_fin(skb)) tcp_event_data_recv(sk, skb); if (th->fin) tcp_fin(sk); @@ -4368,7 +4413,11 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); - if (!sock_flag(sk, SOCK_DEAD)) + if (!sock_flag(sk, SOCK_DEAD) || tp->mpc) + /* MPTCP: we always have to call data_ready, because + * we may be about to receive a data-fin, which still + * must get queued. + */ sk->sk_data_ready(sk, 0); return; } @@ -4420,6 +4469,8 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, next = skb_queue_next(list, skb); __skb_unlink(skb, list); + if (tcp_sk(sk)->mpc) + mptcp_remove_shortcuts(tcp_sk(sk)->mpcb, skb); __kfree_skb(skb); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); @@ -4592,6 +4643,18 @@ static bool tcp_prune_ofo_queue(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); bool res = false; + if (is_meta_sk(sk)) { + if (!skb_queue_empty(&tp->out_of_order_queue)) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); + mptcp_purge_ofo_queue(tp); + + /* No sack at the mptcp-level */ + sk_mem_reclaim(sk); + res = true; + } + return res; + } + if (!skb_queue_empty(&tp->out_of_order_queue)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); __skb_queue_purge(&tp->out_of_order_queue); @@ -4685,6 +4748,9 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + if (tp->mpc) + return mptcp_should_expand_sndbuf(mptcp_meta_sk(sk)); + /* If the user specified a specific send buffer setting, do * not modify it. 
*/ @@ -4721,11 +4787,27 @@ static void tcp_new_space(struct sock *sk) tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER); - int demanded = max_t(unsigned int, tp->snd_cwnd, - tp->reordering + 1); + int demanded; + + if (tp->mpc) + demanded = mptcp_check_snd_buf(tp); + else + demanded = max_t(unsigned int, tp->snd_cwnd, + tp->reordering + 1); + + /* MPTCP: After this, sndmem is the new contribution of the + * current subflow to the aggregate sndbuf + */ sndmem *= 2 * demanded; - if (sndmem > sk->sk_sndbuf) + if (sndmem > sk->sk_sndbuf) { + int old_sndbuf = sk->sk_sndbuf; sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + /* MPTCP: ok, the subflow sndbuf has grown, reflect + * this in the aggregate buffer. + */ + if (tp->mpc && old_sndbuf != sk->sk_sndbuf) + mptcp_update_sndbuf(tp->mpcb); + } tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -4736,8 +4818,9 @@ static void tcp_check_space(struct sock *sk) { if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); - if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + if (tcp_sk(sk)->mpc || + (sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))) tcp_new_space(sk); } } @@ -4862,6 +4945,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t { struct tcp_sock *tp = tcp_sk(sk); + /* MPTCP urgent data is not yet supported */ + if (tp->mpc) + return; + /* Check if we get a new urgent pointer - normally not. */ if (th->urg) tcp_check_urg(sk, th); @@ -4929,8 +5016,7 @@ static inline bool tcp_checksum_complete_user(struct sock *sk, } #ifdef CONFIG_NET_DMA -static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, - int hlen) +bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) { struct tcp_sock *tp = tcp_sk(sk); int chunk = skb->len - hlen; @@ -5039,9 +5125,15 @@ syn_challenge: goto discard; } + /* If valid: post process the received MPTCP options. */ + if (tp->mpc && mptcp_handle_options(sk, th, skb)) + goto discard; + return true; discard: + if (tp->mpc) + mptcp_reset_mopt(tp); __kfree_skb(skb); return false; } @@ -5093,6 +5185,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rx_opt.saw_tstamp = 0; + /* MPTCP: force slowpath. */ + if (tp->mpc) + goto slow_path; + /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made * 'S' will always be tp->tcp_header_len >> 2 @@ -5337,7 +5433,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, /* Get original SYNACK MSS value if user MSS sets mss_clamp */ tcp_clear_options(&opt); opt.user_mss = opt.mss_clamp = 0; - tcp_parse_options(synack, &opt, 0, NULL); + tcp_parse_options(synack, &opt, NULL, 0, NULL); mss = opt.mss_clamp; } @@ -5372,8 +5468,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; + struct mptcp_options_received mopt; + mptcp_init_mp_opt(&mopt); - tcp_parse_options(skb, &tp->rx_opt, 0, &foc); + tcp_parse_options(skb, &tp->rx_opt, + tp->mpc ? 
&tp->mptcp->rx_opt : &mopt, 0, &foc); if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; @@ -5420,6 +5519,21 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (!th->syn) goto discard_and_undo; + if (tp->request_mptcp || tp->mpc) { + int ret; + ret = mptcp_rcv_synsent_state_process(sk, &sk, + skb, &mopt); + + /* May have changed if we support MPTCP */ + tp = tcp_sk(sk); + icsk = inet_csk(sk); + + if (ret == 1) + goto reset_and_undo; + if (ret == 2) + goto discard; + } + /* rfc793: * "If the SYN bit is on ... * are acceptable then ... @@ -5432,6 +5546,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_ack(sk, skb, FLAG_SLOWPATH); + if (tp->mpc && !is_master_tp(tp)) { + /* Timer for repeating the ACK until an answer + * arrives. Used only when establishing an additional + * subflow inside of an MPTCP connection. + */ + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, + jiffies + icsk->icsk_rto); + } + /* Ok.. it's good. Set up sequence numbers and * move to established. */ @@ -5458,6 +5581,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } + if (tp->mpc) { + tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; + tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; + } + if (tcp_is_sack(tp) && sysctl_tcp_fack) tcp_enable_fack(tp); @@ -5478,7 +5606,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_rcv_fastopen_synack(sk, skb, &foc)) return -1; - if (sk->sk_write_pending || + /* With MPTCP we cannot send data on the third ack due to the + * lack of option-space */ + if ((sk->sk_write_pending && !tp->mpc) || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) { /* Save one ACK. Data will be ready after @@ -5520,6 +5650,7 @@ discard: tcp_paws_reject(&tp->rx_opt, 0)) goto discard_and_undo; + /* TODO - check this here for MPTCP */ if (th->syn) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. @@ -5536,6 +5667,11 @@ discard: tp->tcp_header_len = sizeof(struct tcphdr); } + if (tp->mpc) { + tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; + tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; + } + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; @@ -5594,6 +5730,7 @@ reset_and_undo: int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) + __releases(&sk->sk_lock.slock) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -5645,6 +5782,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_SYN_SENT: queued = tcp_rcv_synsent_state_process(sk, skb, th, len); + if (is_meta_sk(sk)) { + sk = tcp_sk(sk)->mpcb->master_sk; + tp = tcp_sk(sk); + } if (queued >= 0) return queued; @@ -5652,6 +5793,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_urg(sk, skb, th); __kfree_skb(skb); tcp_data_snd_check(sk); + if (tp->mpc && is_master_tp(tp)) + bh_unlock_sock(sk); return 0; } @@ -5714,6 +5857,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + if (tp->mpc) + tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; if (req) { /* Re-arm the timer because data may have been sent out. 
@@ -5735,6 +5880,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(sk); tcp_fast_path_on(tp); + /* Send an ACK when establishing a new + * MPTCP subflow, i.e. using an MP_JOIN + * subtype. + */ + if (tp->mpc && !is_master_tp(tp)) + tcp_send_ack(sk); break; case TCP_FIN_WAIT1: { @@ -5786,7 +5937,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); - } else if (th->fin || sock_owned_by_user(sk)) { + } else if (th->fin || mptcp_is_data_fin(skb) || + sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing * and not so rare event. We still can lose it now, @@ -5815,6 +5967,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } break; + case TCP_CLOSE: + if (tp->mp_killed) + goto discard; } /* step 6: check the URG bit */ @@ -5835,7 +5990,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (sk->sk_shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) && + !tp->mpc) { + /* In case of mptcp, the reset is handled by + * mptcp_rcv_state_process + */ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); tcp_reset(sk); return 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5031f68..d26b11f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -67,6 +67,8 @@ #include #include #include +#include +#include #include #include #include @@ -99,7 +101,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) +__u32 tcp_v4_init_sequence(const struct sk_buff *skb) { return secure_tcp_sequence_number(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, @@ -333,7 +335,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) struct inet_sock *inet; const int type = icmp_hdr(icmp_skb)->type; const int code = icmp_hdr(icmp_skb)->code; - struct sock *sk; + struct sock *sk, *meta_sk; struct sk_buff *skb; struct request_sock *req; __u32 seq; @@ -357,13 +359,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) return; } - bh_lock_sock(sk); + tp = tcp_sk(sk); + if (tp->mpc) + meta_sk = mptcp_meta_sk(sk); + else + meta_sk = sk; + + bh_lock_sock(meta_sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. * We do take care of PMTU discovery (RFC1191) special case : * we can receive locally generated ICMP messages while socket is held. 
*/ - if (sock_owned_by_user(sk)) { + if (sock_owned_by_user(meta_sk)) { if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); } @@ -376,7 +384,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } icsk = inet_csk(sk); - tp = tcp_sk(sk); req = tp->fastopen_rsk; seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && @@ -410,11 +417,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; tp->mtu_info = info; - if (!sock_owned_by_user(sk)) { + if (!sock_owned_by_user(meta_sk)) { tcp_v4_mtu_reduced(sk); } else { if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) sock_hold(sk); + if (tp->mpc) + mptcp_tsq_flags(sk); } goto out; } @@ -430,7 +439,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) /* XXX (TFO) - revisit the following logic for TFO */ - if (sock_owned_by_user(sk)) + if (sock_owned_by_user(meta_sk)) break; icsk->icsk_backoff--; @@ -472,7 +481,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) switch (sk->sk_state) { struct request_sock *req, **prev; case TCP_LISTEN: - if (sock_owned_by_user(sk)) + if (sock_owned_by_user(meta_sk)) goto out; req = inet_csk_search_req(sk, &prev, th->dest, @@ -505,7 +514,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) It can f.e. if SYNs crossed, or Fast Open. */ - if (!sock_owned_by_user(sk)) { + if (!sock_owned_by_user(meta_sk)) { sk->sk_err = err; sk->sk_error_report(sk); @@ -534,7 +543,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) */ inet = inet_sk(sk); - if (!sock_owned_by_user(sk) && inet->recverr) { + if (!sock_owned_by_user(meta_sk) && inet->recverr) { sk->sk_err = err; sk->sk_error_report(sk); } else { /* Only an error on timeout */ @@ -542,7 +551,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } out: - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); sock_put(sk); } @@ -584,7 +593,7 @@ EXPORT_SYMBOL(tcp_v4_send_check); * Exception: precedence violation. We do not implement it in any case. */ -static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) +void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -708,10 +717,10 @@ release_sk1: outside socket context is ugly, certainly. What can I do? */ -static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, +static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, - int reply_flags, u8 tos) + int reply_flags, u8 tos, int mptcp) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -720,6 +729,10 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, #ifdef CONFIG_TCP_MD5SIG + (TCPOLEN_MD5SIG_ALIGNED >> 2) #endif +#ifdef CONFIG_MPTCP + + ((MPTCP_SUB_LEN_DSS >> 2) + + (MPTCP_SUB_LEN_ACK >> 2)) +#endif ]; } rep; struct ip_reply_arg arg; @@ -764,6 +777,21 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ip_hdr(skb)->daddr, &rep.th); } #endif +#ifdef CONFIG_MPTCP + if (mptcp) { + int offset = (tsecr) ? 
3 : 0; + /* Construction of 32-bit data_ack */ + rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) | + ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) | + (0x20 << 8) | + (0x01)); + rep.opt[offset] = htonl(data_ack); + + arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK; + rep.th.doff = arg.iov[0].iov_len / 4; + } +#endif /* CONFIG_MPTCP */ + arg.flags = reply_flags; arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ @@ -782,36 +810,44 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + u32 data_ack = 0; + int mptcp = 0; + + if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) { + data_ack = (u32)tcptw->mptcp_tw->rcv_nxt; + mptcp = 1; + } tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + data_ack, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, - tw->tw_tos + tw->tw_tos, mptcp ); inet_twsk_put(tw); } -static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, - struct request_sock *req) +void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *req) { /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_rsk(req)->rcv_nxt, 0, req->rcv_wnd, tcp_time_stamp, req->ts_recent, 0, tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, - ip_hdr(skb)->tos); + ip_hdr(skb)->tos, 0); } /* @@ -819,9 +855,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, - struct request_sock *req, - u16 queue_mapping) +int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + u16 queue_mapping) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -849,7 +885,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, return err; } -static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) +int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) { int res = tcp_v4_send_synack(sk, NULL, req, 0); @@ -861,7 +897,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) /* * IPv4 request_sock destructor. */ -static void tcp_v4_reqsk_destructor(struct request_sock *req) +void tcp_v4_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->opt); } @@ -901,7 +937,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action); /* * Save and compile IPv4 options into the request_sock if needed. 
*/ -static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) +struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) { const struct ip_options *opt = &(IPCB(skb)->opt); struct ip_options_rcu *dopt = NULL; @@ -1253,7 +1289,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { }; #ifdef CONFIG_TCP_MD5SIG -static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .md5_lookup = tcp_v4_reqsk_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, }; @@ -1443,6 +1479,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tmp_opt; + struct mptcp_options_received mopt; struct request_sock *req; struct inet_request_sock *ireq; struct tcp_sock *tp = tcp_sk(sk); @@ -1457,6 +1494,22 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) struct sk_buff *skb_synack; int do_fastopen; + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = TCP_MSS_DEFAULT; + tmp_opt.user_mss = tp->rx_opt.user_mss; + mptcp_init_mp_opt(&mopt); + tcp_parse_options(skb, &tmp_opt, &mopt, 0, want_cookie ? NULL : &foc); + +#ifdef CONFIG_MPTCP + /* MPTCP structures not initialized, so clear MPTCP fields */ + if (mptcp_init_failed) + mptcp_init_mp_opt(&mopt); + + if (mopt.is_mp_join) + return mptcp_do_join_short(skb, &mopt, &tmp_opt, sock_net(sk)); + if (mopt.drop_me) + goto drop; +#endif /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; @@ -1482,7 +1535,22 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop; } - req = inet_reqsk_alloc(&tcp_request_sock_ops); +#ifdef CONFIG_MPTCP + if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled) + mopt.saw_mpc = 0; + if (mopt.saw_mpc && !want_cookie) { + req = inet_reqsk_alloc(&mptcp_request_sock_ops); + + if (!req) + goto drop; + + mptcp_rsk(req)->mpcb = NULL; + mptcp_rsk(req)->dss_csum = mopt.dss_csum; + mptcp_rsk(req)->collide_tk.pprev = NULL; + } else +#endif + req = inet_reqsk_alloc(&tcp_request_sock_ops); + if (!req) goto drop; @@ -1490,17 +1558,15 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; #endif - tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = TCP_MSS_DEFAULT; - tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); - if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); + if (mopt.saw_mpc && !want_cookie) + mptcp_reqsk_new_mptcp(req, &tmp_opt, &mopt, skb); + ireq = inet_rsk(req); ireq->loc_addr = daddr; ireq->rmt_addr = saddr; @@ -1713,7 +1779,7 @@ put_and_exit: } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); -static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { struct tcphdr *th = tcp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); @@ -1730,8 +1796,15 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { + /* Don't lock again the meta-sk. It has been locked + * before mptcp_v4_do_rcv. 
+ */ + if (tcp_sk(nsk)->mpc && !is_meta_sk(sk)) + bh_lock_sock(mptcp_meta_sk(nsk)); bh_lock_sock(nsk); + return nsk; + } inet_twsk_put(inet_twsk(nsk)); return NULL; @@ -1788,6 +1861,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto discard; #endif + if (is_meta_sk(sk)) + return mptcp_v4_do_rcv(sk, skb); + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst = sk->sk_rx_dst; @@ -1919,7 +1995,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { wake_up_interruptible_sync_poll(sk_sleep(sk), POLLIN | POLLRDNORM | POLLRDBAND); - if (!inet_csk_ack_scheduled(sk)) + if (!inet_csk_ack_scheduled(sk) && !tp->mpc) inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, (3 * tcp_rto_min(sk)) / 4, TCP_RTO_MAX); @@ -1936,7 +2012,7 @@ int tcp_v4_rcv(struct sk_buff *skb) { const struct iphdr *iph; const struct tcphdr *th; - struct sock *sk; + struct sock *sk, *meta_sk = NULL; int ret; struct net *net = dev_net(skb->dev); @@ -1969,18 +2045,42 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); +#ifdef CONFIG_MPTCP + TCP_SKB_CB(skb)->mptcp_flags = 0; + TCP_SKB_CB(skb)->dss_off = 0; +#endif TCP_SKB_CB(skb)->when = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); - if (!sk) - goto no_tcp_socket; process: - if (sk->sk_state == TCP_TIME_WAIT) + if (sk && sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; +#ifdef CONFIG_MPTCP + if (!sk && th->syn && !th->ack) { + int ret = mptcp_lookup_join(skb, NULL); + + if (ret < 0) { + tcp_v4_send_reset(NULL, skb); + goto discard_it; + } else if (ret > 0) { + return 0; + } + } + + /* Is there a pending request sock for this segment ? 
*/ + if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) { + if (sk) + sock_put(sk); + return 0; + } +#endif + if (!sk) + goto no_tcp_socket; + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1996,11 +2096,21 @@ process: sk_mark_napi_id(sk, skb); skb->dev = NULL; - bh_lock_sock_nested(sk); + if (tcp_sk(sk)->mpc) { + meta_sk = mptcp_meta_sk(sk); + + bh_lock_sock_nested(meta_sk); + if (sock_owned_by_user(meta_sk)) + skb->sk = sk; + } else { + meta_sk = sk; + bh_lock_sock_nested(sk); + } + ret = 0; - if (!sock_owned_by_user(sk)) { + if (!sock_owned_by_user(meta_sk)) { #ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(meta_sk); if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) tp->ucopy.dma_chan = net_dma_find_channel(); if (tp->ucopy.dma_chan) @@ -2008,16 +2118,16 @@ process: else #endif { - if (!tcp_prequeue(sk, skb)) + if (!tcp_prequeue(meta_sk, skb)) ret = tcp_v4_do_rcv(sk, skb); } - } else if (unlikely(sk_add_backlog(sk, skb, - sk->sk_rcvbuf + sk->sk_sndbuf))) { - bh_unlock_sock(sk); + } else if (unlikely(sk_add_backlog(meta_sk, skb, + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { + bh_unlock_sock(meta_sk); NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); goto discard_and_relse; } - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); sock_put(sk); @@ -2072,6 +2182,18 @@ do_time_wait: sk = sk2; goto process; } +#ifdef CONFIG_MPTCP + if (th->syn && !th->ack) { + int ret = mptcp_lookup_join(skb, inet_twsk(sk)); + + if (ret < 0) { + tcp_v4_send_reset(NULL, skb); + goto discard_it; + } else if (ret > 0) { + return 0; + } + } +#endif /* Fall through to ACK */ } case TCP_TW_ACK: @@ -2154,6 +2276,11 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_cleanup_congestion_control(sk); + if (tp->mpc) + mptcp_destroy_sock(sk); + if (tp->inside_tk_table) + mptcp_hash_remove(tp); + /* Cleanup up the write buffer. */ tcp_write_queue_purge(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 58a3e69..92a5fe7 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -18,11 +18,13 @@ * Jorge Cwik, */ +#include #include #include #include #include #include +#include #include #include #include @@ -95,10 +97,13 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; + struct mptcp_options_received mopt; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tcp_parse_options(skb, &tmp_opt, 0, NULL); + mptcp_init_mp_opt(&mopt); + + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; @@ -106,6 +111,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } + + if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) { + if (mopt.mptcp_key == tcptw->mptcp_tw->loc_key) + goto kill_with_rst; + } } if (tw->tw_substate == TCP_FIN_WAIT2) { @@ -128,6 +138,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (!th->ack || !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { + /* If mptcp_is_data_fin() returns true, we are sure that + * mopt has been initialized - otherwise it would not + * be a DATA_FIN. 
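+		 *
+		 * Like a subflow FIN, the DATA_FIN occupies one unit of the
+		 * data-sequence space, hence the off-by-one in
+		 *
+		 *	mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt
+		 *
+		 * below: the peer is retransmitting a DATA_FIN that was
+		 * already received, so it simply gets re-ACKed at the meta
+		 * level.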
+ */ + if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw && + mptcp_is_data_fin(skb) && + TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && + mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt) + return TCP_TW_ACK; + inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -270,6 +290,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) const struct tcp_sock *tp = tcp_sk(sk); bool recycle_ok = false; + if (is_meta_sk(sk)) { + mptcp_update_tw_socks(tp, state); + goto tcp_done; + } + if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tcp_remember_stamp(sk); @@ -290,6 +315,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; + if (tp->mpc) { + if (mptcp_time_wait(sk, tcptw)) { + inet_twsk_free(tw); + goto exit; + } + } else { + tcptw->mptcp_tw = NULL; + } + #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -349,15 +383,19 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); } +exit: tcp_update_metrics(sk); +tcp_done: tcp_done(sk); } void tcp_twsk_destructor(struct sock *sk) { -#ifdef CONFIG_TCP_MD5SIG struct tcp_timewait_sock *twsk = tcp_twsk(sk); + if (twsk->mptcp_tw) + mptcp_twsk_destructor(twsk); +#ifdef CONFIG_TCP_MD5SIG if (twsk->tw_md5_key) kfree_rcu(twsk->tw_md5_key, rcu); #endif @@ -394,6 +432,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; +#ifdef CONFIG_MPTCP + memset(&newtp->rcvq_space, 0, sizeof(newtp->rcvq_space)); +#endif tcp_prequeue_init(newtp); INIT_LIST_HEAD(&newtp->tsq_node); @@ -470,6 +511,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } + if (treq->saw_mpc) + newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; newtp->tsoffset = 0; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ @@ -506,16 +549,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool fastopen) { struct tcp_options_received tmp_opt; + struct mptcp_options_received mopt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; - BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); + BUG_ON(!tcp_sk(sk)->mpc && fastopen == (sk->sk_state == TCP_LISTEN)); tmp_opt.saw_tstamp = 0; + + mptcp_init_mp_opt(&mopt); + if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(skb, &tmp_opt, 0, NULL); + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; @@ -554,7 +601,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery. + * + * Fall back to TCP if MP_CAPABLE is not set. */ + + if (tcp_rsk(req)->saw_mpc && !mopt.saw_mpc) + tcp_rsk(req)->saw_mpc = false; + + if (!inet_rtx_syn_ack(sk, req)) req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX) + jiffies; @@ -676,7 +730,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* While TCP_DEFER_ACCEPT is active, drop bare ACK. 
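+	 *
+	 * With MPTCP that third ACK is not really bare: it echoes both keys
+	 * in its MP_CAPABLE option (kind 30, MP_CAPABLE subtype, followed by
+	 * the sender's and receiver's 64-bit keys) and must therefore reach
+	 * mptcp_check_req_master(), which is why !saw_mpc is added to the
+	 * condition below.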
*/ if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && - TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1 && + /* TODO MPTCP: + * We do this here, because otherwise options sent in the third ack, + * or duplicate fourth ack will get lost. Options like MP_PRIO, ADD_ADDR,... + * + * We could store them in request_sock, but this would mean that we + * have to put tcp_options_received and mptcp_options_received in there, + * increasing considerably the size of the request-sock. + * + * As soon as we have reworked the request-sock MPTCP-fields and + * created a mptcp_request_sock structure, we can handle options + * correclty there without increasing request_sock. + */ + !tcp_rsk(req)->saw_mpc) { inet_rsk(req)->acked = 1; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; @@ -688,10 +755,29 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); +#ifdef CONFIG_MPTCP + if (tcp_sk(sk)->mpc) + /* MPTCP: We call the mptcp-specific syn_recv_sock */ + child = tcp_sk(sk)->mpcb->syn_recv_sock(sk, skb, req, NULL); + else +#endif + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, + req, NULL); + if (child == NULL) goto listen_overflow; + if (!is_meta_sk(sk)) { + int ret = mptcp_check_req_master(sk, child, req, prev, &mopt); + if (ret < 0) + goto listen_overflow; + + /* MPTCP-supported */ + if (!ret) + return tcp_sk(child)->mpcb->master_sk; + } else { + return mptcp_check_req_child(sk, child, req, prev, &mopt); + } inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); @@ -741,8 +827,9 @@ int tcp_child_process(struct sock *parent, struct sock *child, { int ret = 0; int state = child->sk_state; + struct sock *meta_sk = tcp_sk(child)->mpc ? mptcp_meta_sk(child) : child; - if (!sock_owned_by_user(child)) { + if (!sock_owned_by_user(meta_sk)) { ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len); /* Wakeup parent, send SIGIO */ @@ -753,10 +840,14 @@ int tcp_child_process(struct sock *parent, struct sock *child, * in main socket hash table and lock on listening * socket does not protect us more. */ - __sk_add_backlog(child, skb); + if (tcp_sk(child)->mpc) + skb->sk = child; + __sk_add_backlog(meta_sk, skb); } - bh_unlock_sock(child); + if (tcp_sk(child)->mpc) + bh_unlock_sock(child); + bh_unlock_sock(meta_sk); sock_put(child); return ret; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 826fc6f..c246821 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -36,6 +36,8 @@ #define pr_fmt(fmt) "TCP: " fmt +#include +#include #include #include @@ -72,7 +74,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) +void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -211,9 +213,14 @@ u32 tcp_default_init_rwnd(u32 mss) void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, - __u32 init_rcv_wnd) + __u32 init_rcv_wnd, const struct sock *sk) { - unsigned int space = (__space < 0 ? 
0 : __space); + unsigned int space; + + if (tcp_sk(sk)->mpc) + mptcp_select_initial_window(&__space, window_clamp, sk); + + space = (__space < 0 ? 0 : __space); /* If no clamp set the clamp to the max possible scaled window */ if (*window_clamp == 0) @@ -269,7 +276,11 @@ EXPORT_SYMBOL(tcp_select_initial_window); static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 cur_win = tcp_receive_window(tp); + /* The window must never shrink at the meta-level. At the subflow we + * have to allow this. Otherwise we may announce a window too large + * for the current meta-level sk_rcvbuf. + */ + u32 cur_win = tcp_receive_window(tp->mpc ? tcp_sk(mptcp_meta_sk(sk)) : tp); u32 new_win = __tcp_select_window(sk); /* Never shrink the offered window */ @@ -283,6 +294,12 @@ static u16 tcp_select_window(struct sock *sk) */ new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } + + if (tp->mpc) { + mptcp_meta_tp(tp)->rcv_wnd = new_win; + mptcp_meta_tp(tp)->rcv_wup = mptcp_meta_tp(tp)->rcv_nxt; + } + tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; @@ -361,7 +378,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ -static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; @@ -379,7 +396,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->end_seq = seq; } -static inline bool tcp_urg_mode(const struct tcp_sock *tp) +bool tcp_urg_mode(const struct tcp_sock *tp) { return tp->snd_una != tp->snd_up; } @@ -389,17 +406,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) - -struct tcp_out_options { - u16 options; /* bit field of OPTION_* */ - u16 mss; /* 0 to disable */ - u8 ws; /* window scale, 0 to disable */ - u8 num_sack_blocks; /* number of SACK blocks to include */ - u8 hash_size; /* bytes in hash_location */ - __u8 *hash_location; /* temporary pointer, overloaded */ - __u32 tsval, tsecr; /* need to include OPTION_TS */ - struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ -}; +/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */ /* Write previously computed TCP options to the packet. * @@ -415,7 +422,7 @@ struct tcp_out_options { * (but it may well be that other scenarios fail similarly). */ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, - struct tcp_out_options *opts) + struct tcp_out_options *opts, struct sk_buff *skb) { u16 options = opts->options; /* mungable copy */ @@ -498,6 +505,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } ptr += (foc->len + 3) >> 2; } + + if (unlikely(OPTION_MPTCP & opts->options)) + mptcp_options_write(ptr, tp, opts, skb); } /* Compute TCP options for SYN packets. 
This is not the final @@ -549,6 +559,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (unlikely(!(OPTION_TS & opts->options))) remaining -= TCPOLEN_SACKPERM_ALIGNED; } + if (tp->request_mptcp || tp->mpc) + mptcp_syn_options(sk, opts, &remaining); if (fastopen && fastopen->cookie.len >= 0) { u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; @@ -622,6 +634,9 @@ static unsigned int tcp_synack_options(struct sock *sk, } } + if (tcp_rsk(req)->saw_mpc) + mptcp_synack_options(req, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -655,16 +670,22 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } + if (tp->mpc) + mptcp_established_options(sk, skb, opts, &size); eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { - const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; - opts->num_sack_blocks = - min_t(unsigned int, eff_sacks, - (remaining - TCPOLEN_SACK_BASE_ALIGNED) / - TCPOLEN_SACK_PERBLOCK); - size += TCPOLEN_SACK_BASE_ALIGNED + - opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + const unsigned remaining = MAX_TCP_OPTION_SPACE - size; + if (remaining < TCPOLEN_SACK_BASE_ALIGNED) + opts->num_sack_blocks = 0; + else + opts->num_sack_blocks = + min_t(unsigned int, eff_sacks, + (remaining - TCPOLEN_SACK_BASE_ALIGNED) / + TCPOLEN_SACK_PERBLOCK); + if (opts->num_sack_blocks) + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } return size; @@ -712,7 +733,7 @@ static void tcp_tasklet_func(unsigned long data) unsigned long flags; struct list_head *q, *n; struct tcp_sock *tp; - struct sock *sk; + struct sock *sk, *meta_sk; local_irq_save(flags); list_splice_init(&tsq->head, &list); @@ -723,15 +744,31 @@ static void tcp_tasklet_func(unsigned long data) list_del(&tp->tsq_node); sk = (struct sock *)tp; - bh_lock_sock(sk); + meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk; + bh_lock_sock(meta_sk); - if (!sock_owned_by_user(sk)) { + if (!sock_owned_by_user(meta_sk)) { tcp_tsq_handler(sk); + if (tp->mpc) + tcp_tsq_handler(meta_sk); } else { + if (tp->mpc && sk->sk_state == TCP_CLOSE) + goto exit; + /* defer the work to tcp_release_cb() */ set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); + + /* For MPTCP, we set the tsq-bit on the meta, and the + * subflow as we don't know if the limitation happened + * while inside mptcp_write_xmit or during tcp_write_xmit. 
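+			 *
+			 * Both bits are consumed again in tcp_release_cb():
+			 * TCP_TSQ_DEFERRED on the meta re-runs the meta-level
+			 * tcp_tsq_handler(), and MPTCP_SUB_DEFERRED (set via
+			 * mptcp_tsq_flags()) lets it walk the deferred subflows
+			 * through mptcp_tsq_sub_deferred().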
+ */ + if (tp->mpc) { + set_bit(TCP_TSQ_DEFERRED, &tcp_sk(meta_sk)->tsq_flags); + mptcp_tsq_flags(sk); + } } - bh_unlock_sock(sk); +exit: + bh_unlock_sock(meta_sk); clear_bit(TSQ_QUEUED, &tp->tsq_flags); sk_free(sk); @@ -741,7 +778,10 @@ static void tcp_tasklet_func(unsigned long data) #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ (1UL << TCP_WRITE_TIMER_DEFERRED) | \ (1UL << TCP_DELACK_TIMER_DEFERRED) | \ - (1UL << TCP_MTU_REDUCED_DEFERRED)) + (1UL << TCP_MTU_REDUCED_DEFERRED) | \ + (1UL << MPTCP_PATH_MANAGER) | \ + (1UL << MPTCP_SUB_DEFERRED)) + /** * tcp_release_cb - tcp release_sock() callback * @sk: socket @@ -788,6 +828,13 @@ void tcp_release_cb(struct sock *sk) sk->sk_prot->mtu_reduced(sk); __sock_put(sk); } + if (flags & (1UL << MPTCP_PATH_MANAGER)) { + if (tcp_sk(sk)->mpcb->pm_ops->release_sock) + tcp_sk(sk)->mpcb->pm_ops->release_sock(sk); + __sock_put(sk); + } + if (flags & (1UL << MPTCP_SUB_DEFERRED)) + mptcp_tsq_sub_deferred(sk); } EXPORT_SYMBOL(tcp_release_cb); @@ -847,8 +894,8 @@ void tcp_wfree(struct sk_buff *skb) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, - gfp_t gfp_mask) +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; @@ -876,10 +923,28 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); - if (unlikely(skb_cloned(skb))) - skb = pskb_copy(skb, gfp_mask); - else + if (unlikely(skb_cloned(skb))) { + struct sk_buff *newskb; + if (mptcp_is_data_seq(skb)) + skb_push(skb, MPTCP_SUB_LEN_DSS_ALIGN + + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN); + + newskb = pskb_copy(skb, gfp_mask); + + if (mptcp_is_data_seq(skb)) { + skb_pull(skb, MPTCP_SUB_LEN_DSS_ALIGN + + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN); + if (newskb) + skb_pull(newskb, MPTCP_SUB_LEN_DSS_ALIGN + + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN); + } + skb = newskb; + } else { skb = skb_clone(skb, gfp_mask); + } if (unlikely(!skb)) return -ENOBUFS; } @@ -943,7 +1008,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } } - tcp_options_write((__be32 *)(th + 1), tp, &opts); + tcp_options_write((__be32 *)(th + 1), tp, &opts, skb); if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) TCP_ECN_send(sk, skb, tcp_header_size); @@ -982,7 +1047,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. */ -static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) +void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -995,13 +1060,14 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. 
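+ *
+ * With MPTCP the GSO capability must be taken from the right socket:
+ * the meta-sk asks mptcp_sk_can_gso(), which looks at its subflows,
+ * while a regular socket keeps using sk_can_gso(). Effectively:
+ *
+ *	can_gso = is_meta_sk(sk) ? mptcp_sk_can_gso(sk) : sk_can_gso(sk);
+ *
+ * and the skb stays a single segment whenever !can_gso, the payload
+ * fits into one MSS, or the checksum cannot be offloaded.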
*/ -static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, - unsigned int mss_now) +void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, + unsigned int mss_now) { /* Make sure we own this skb before messing gso_size/gso_segs */ WARN_ON_ONCE(skb_cloned(skb)); - if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { + if (skb->len <= mss_now || (is_meta_sk(sk) && !mptcp_sk_can_gso(sk)) || + (!is_meta_sk(sk) && !sk_can_gso(sk)) || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal * non-TSO case. */ @@ -1033,7 +1099,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ -static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) +void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) { struct tcp_sock *tp = tcp_sk(sk); @@ -1074,6 +1140,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, int nlen; u8 flags; + if (tcp_sk(sk)->mpc && mptcp_is_data_seq(skb)) + mptcp_fragment(sk, skb, len, mss_now, 0); + if (WARN_ON(len > skb->len)) return -EINVAL; @@ -1156,7 +1225,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, * eventually). The difference is that pulled data not copied, but * immediately discarded. */ -static void __pskb_trim_head(struct sk_buff *skb, int len) +void __pskb_trim_head(struct sk_buff *skb, int len) { int i, k, eat; @@ -1195,6 +1264,9 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { + if (tcp_sk(sk)->mpc && !is_meta_sk(sk) && mptcp_is_data_seq(skb)) + return mptcp_trim_head(sk, skb, len); + if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; @@ -1212,6 +1284,15 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (tcp_skb_pcount(skb) > 1) tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); +#ifdef CONFIG_MPTCP + /* Some data got acked - we assume that the seq-number reached the dest. + * Anyway, our MPTCP-option has been trimmed above - we lost it here. + * Only remove the SEQ if the call does not come from a meta retransmit. + */ + if (tcp_sk(sk)->mpc && !is_meta_sk(sk)) + TCP_SKB_CB(skb)->mptcp_flags &= ~MPTCPHDR_SEQ; +#endif + return 0; } @@ -1371,7 +1452,7 @@ unsigned int tcp_current_mss(struct sock *sk) } /* Congestion window validation. (RFC2861) */ -static void tcp_cwnd_validate(struct sock *sk) +void tcp_cwnd_validate(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1402,16 +1483,25 @@ static void tcp_cwnd_validate(struct sock *sk) * modulo only when the receiver window alone is the limiting factor or * when we would be allowed to send the split-due-to-Nagle skb fully. */ -static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, - unsigned int mss_now, unsigned int max_segs) +unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, + unsigned int mss_now, unsigned int max_segs) { const struct tcp_sock *tp = tcp_sk(sk); + const struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk; u32 needed, window, max_len; - window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; + if (!tp->mpc) + window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; + else + /* We need to evaluate the available space in the sending window + * at the subflow level. 
However, the subflow seq has not yet + * been set. Nevertheless we know that the caller will set it to + * write_seq. + */ + window = tcp_wnd_end(tp) - tp->write_seq; max_len = mss_now * max_segs; - if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) + if (likely(max_len <= window && skb != tcp_write_queue_tail(meta_sk))) return max_len; needed = min(skb->len, window); @@ -1425,13 +1515,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ -static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, - const struct sk_buff *skb) +unsigned int tcp_cwnd_test(const struct tcp_sock *tp, + const struct sk_buff *skb) { u32 in_flight, cwnd; /* Don't be strict about the congestion window for the final FIN. */ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && + if (skb && + ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb)) && tcp_skb_pcount(skb) == 1) return 1; @@ -1447,8 +1538,8 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, * This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, - unsigned int mss_now) +int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, + unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); @@ -1485,8 +1576,8 @@ static inline bool tcp_nagle_check(const struct tcp_sock *tp, /* Return true if the Nagle test allows this packet to be * sent now. */ -static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, - unsigned int cur_mss, int nonagle) +bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, + unsigned int cur_mss, int nonagle) { /* Nagle rule does not apply to frames, which sit in the middle of the * write_queue (they have no chances to get new data). @@ -1498,7 +1589,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf return true; /* Don't use the nagle rule for urgent data (or for the final FIN). */ - if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || + mptcp_is_data_fin(skb)) return true; if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) @@ -1508,9 +1600,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf } /* Does at least the first segment of SKB fit into the send window? */ -static bool tcp_snd_wnd_test(const struct tcp_sock *tp, - const struct sk_buff *skb, - unsigned int cur_mss) +bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, + unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -1568,6 +1659,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, int nlen = skb->len - len; u8 flags; + if (tcp_sk(sk)->mpc && mptcp_is_data_seq(skb)) + mptso_fragment(sk, skb, len, mss_now, gfp, 0); + /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) return tcp_fragment(sk, skb, len, mss_now); @@ -1613,29 +1707,39 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * * This algorithm is from John Heffner. 
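+ *
+ * Under MPTCP the deferral bookkeeping lives on the meta-socket
+ * (meta_tp->tso_deferred), so all subflows share a single deferral
+ * decision, while the send window is still evaluated per subflow.
+ * As the subflow sequence number is only assigned at transmit time,
+ * the available window is approximated by
+ *
+ *	send_win = tcp_wnd_end(tp) - tp->write_seq;
+ *
+ * which matches what the caller will later put into the skb.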
*/ -static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; int win_divisor; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb)) goto send_now; if (icsk->icsk_ca_state != TCP_CA_Open) goto send_now; /* Defer for less than two clock ticks. */ - if (tp->tso_deferred && - (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1) + if (meta_tp->tso_deferred && + (((u32)jiffies << 1) >> 1) - (meta_tp->tso_deferred >> 1) > 1) goto send_now; in_flight = tcp_packets_in_flight(tp); BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight)); - send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; + if (!tp->mpc) + send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; + else + /* We need to evaluate the available space in the sending window + * at the subflow level. However, the subflow seq has not yet + * been set. Nevertheless we know that the caller will set it to + * write_seq. + */ + send_win = tcp_wnd_end(tp) - tp->write_seq; /* From in_flight test above, we know that cwnd > in_flight. */ cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; @@ -1648,7 +1752,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ - if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) + if ((skb != tcp_write_queue_tail(meta_sk)) && (limit >= skb->len)) goto send_now; win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); @@ -1674,13 +1778,13 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) /* Ok, it looks like it is advisable to defer. * Do not rearm the timer if already set to not break TCP ACK clocking. 
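+	 *
+	 * The timestamp is stored as (jiffies << 1) | 1 so that zero always
+	 * means "not deferred", even when jiffies itself is zero; the age
+	 * test at the top of this function recovers the tick count again
+	 * with a right shift by one.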
*/ - if (!tp->tso_deferred) - tp->tso_deferred = 1 | (jiffies << 1); + if (!meta_tp->tso_deferred) + meta_tp->tso_deferred = 1 | (jiffies << 1); return true; send_now: - tp->tso_deferred = 0; + meta_tp->tso_deferred = 0; return false; } @@ -1693,7 +1797,7 @@ send_now: * 1 if a probe was sent, * -1 otherwise */ -static int tcp_mtu_probe(struct sock *sk) +int tcp_mtu_probe(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -1838,6 +1942,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int cwnd_quota; int result; + if (is_meta_sk(sk)) + return mptcp_write_xmit(sk, mss_now, nonagle, push_one, gfp); + sent_pkts = 0; if (!push_one) { @@ -2162,6 +2269,9 @@ u32 __tcp_select_window(struct sock *sk) int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); int window; + if (tp->mpc) + return __mptcp_select_window(sk); + if (mss > full_space) mss = full_space; @@ -2292,6 +2402,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; + /* Currently not supported for MPTCP - but it should be possible */ + if (tp->mpc) + return; + tcp_for_write_queue_from_safe(skb, tmp, sk) { if (!tcp_can_collapse(sk, skb)) break; @@ -2403,8 +2517,24 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || skb_headroom(skb) >= 0xFFFF)) { - struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, - GFP_ATOMIC); + struct sk_buff *nskb; + + if (mptcp_is_data_seq(skb)) + skb_push(skb, MPTCP_SUB_LEN_DSS_ALIGN + + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN); + + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); + + if (mptcp_is_data_seq(skb)) { + skb_pull(skb, MPTCP_SUB_LEN_DSS_ALIGN + + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN); + if (nskb) + skb_pull(nskb, MPTCP_SUB_LEN_DSS_ALIGN + + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN); + } return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS; } else { @@ -2629,6 +2759,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) { struct sk_buff *skb; + if (is_meta_sk(sk)) { + mptcp_send_active_reset(sk, priority); + return; + } + /* NOTE: No TCP options attached and we never retransmit this. */ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { @@ -2731,14 +2866,14 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) req->window_clamp = tcp_full_space(sk); - /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), - mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) - + (tcp_rsk(req)->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0), &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); + dst_metric(dst, RTAX_INITRWND), sk); ireq->rcv_wscale = rcv_wscale; } @@ -2774,7 +2909,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. 
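+	 *
+	 * req->rcv_wnd was sized above (unless already set by the caller)
+	 * against the payload that really fits into a segment: both the
+	 * timestamp option and, for an MP_CAPABLE request, the per-segment
+	 * DSS overhead (MPTCP_SUB_LEN_DSM_ALIGN) were subtracted from the
+	 * MSS handed to tcp_select_initial_window().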
*/ th->window = htons(min(req->rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), tp, &opts); + tcp_options_write((__be32 *)(th + 1), tp, &opts, skb); th->doff = (tcp_header_size >> 2); TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); @@ -2834,7 +2969,7 @@ void tcp_connect_init(struct sock *sk) &tp->window_clamp, sysctl_tcp_window_scaling, &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); + dst_metric(dst, RTAX_INITRWND), sk); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; @@ -2858,6 +2993,18 @@ void tcp_connect_init(struct sock *sk) inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); + +#ifdef CONFIG_MPTCP + if (sysctl_mptcp_enabled && mptcp_doit(sk)) { + if (is_master_tp(tp)) { + tp->request_mptcp = 1; + mptcp_connect_init(sk); + } else if (tp->mptcp) { + tp->mptcp->snt_isn = tp->write_seq; + tp->mptcp->init_rcv_wnd = tp->rcv_wnd; + } + } +#endif } static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) @@ -3100,6 +3247,7 @@ void tcp_send_ack(struct sock *sk) TCP_SKB_CB(buff)->when = tcp_time_stamp; tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } +EXPORT_SYMBOL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. @@ -3112,7 +3260,7 @@ void tcp_send_ack(struct sock *sk) * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * out-of-date with SND.UNA-1 to probe window. */ -static int tcp_xmit_probe_skb(struct sock *sk, int urgent) +int tcp_xmit_probe_skb(struct sock *sk, int urgent) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -3150,6 +3298,9 @@ int tcp_write_wakeup(struct sock *sk) if (sk->sk_state == TCP_CLOSE) return -1; + if (is_meta_sk(sk)) + return mptcp_write_wakeup(sk); + if ((skb = tcp_send_head(sk)) != NULL && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { int err; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4b85e6f..48a0f32 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -20,6 +20,7 @@ #include #include +#include #include int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; @@ -32,7 +33,7 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; int sysctl_tcp_orphan_retries __read_mostly; int sysctl_tcp_thin_linear_timeouts __read_mostly; -static void tcp_write_err(struct sock *sk) +void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; sk->sk_error_report(sk); @@ -124,10 +125,8 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if * syn_set flag is set. */ -static bool retransmits_timed_out(struct sock *sk, - unsigned int boundary, - unsigned int timeout, - bool syn_set) +bool retransmits_timed_out(struct sock *sk, unsigned int boundary, + unsigned int timeout, bool syn_set) { unsigned int linear_backoff_thresh, start_ts; unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; @@ -153,7 +152,7 @@ static bool retransmits_timed_out(struct sock *sk, } /* A write timeout has occurred. Process the after effects. */ -static int tcp_write_timeout(struct sock *sk) +int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int retry_until; @@ -164,6 +163,10 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? 
: sysctl_tcp_syn_retries; syn_set = true; + /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */ + if (tcp_sk(sk)->request_mptcp && + icsk->icsk_retransmits >= mptcp_sysctl_syn_retries()) + tcp_sk(sk)->request_mptcp = 0; } else { if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { /* Black hole detection */ @@ -244,18 +247,22 @@ out: static void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock *)data; + struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk; - bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) { + bh_lock_sock(meta_sk); + if (!sock_owned_by_user(meta_sk)) { tcp_delack_timer_handler(sk); } else { inet_csk(sk)->icsk_ack.blocked = 1; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) sock_hold(sk); + if (tp->mpc) + mptcp_tsq_flags(sk); } - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); sock_put(sk); } @@ -418,6 +425,9 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk, 0); + if (tp->mpc) + mptcp_reinject_data(sk, 1); + if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { /* Retransmission failed because of local congestion, * do not backoff. @@ -468,6 +478,8 @@ out_reset_timer: /* Use normal (exponential) backoff */ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } + if (tp->mpc) + mptcp_set_rto(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) __sk_dst_reset(sk); @@ -499,7 +511,10 @@ void tcp_write_timer_handler(struct sock *sk) break; case ICSK_TIME_RETRANS: icsk->icsk_pending = 0; - tcp_retransmit_timer(sk); + if (is_meta_sk(sk)) + mptcp_retransmit_timer(sk); + else + tcp_retransmit_timer(sk); break; case ICSK_TIME_PROBE0: icsk->icsk_pending = 0; @@ -514,16 +529,19 @@ out: static void tcp_write_timer(unsigned long data) { struct sock *sk = (struct sock *)data; + struct sock *meta_sk = tcp_sk(sk)->mpc ? mptcp_meta_sk(sk) : sk; - bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) { + bh_lock_sock(meta_sk); + if (!sock_owned_by_user(meta_sk)) { tcp_write_timer_handler(sk); } else { /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) sock_hold(sk); + if (tcp_sk(sk)->mpc) + mptcp_tsq_flags(sk); } - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); sock_put(sk); } @@ -548,6 +566,11 @@ void tcp_set_keepalive(struct sock *sk, int val) if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return; + if (is_meta_sk(sk)) { + mptcp_set_keepalive(sk, val); + return; + } + if (val && !sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); else if (!val) @@ -560,19 +583,38 @@ static void tcp_keepalive_timer (unsigned long data) struct sock *sk = (struct sock *) data; struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk; u32 elapsed; /* Only process if socket is not in use. */ - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { + bh_lock_sock(meta_sk); + if (sock_owned_by_user(meta_sk)) { /* Try again later. 
*/ inet_csk_reset_keepalive_timer (sk, HZ/20); goto out; } - if (sk->sk_state == TCP_LISTEN) { - tcp_synack_timer(sk); - goto out; + if (tp->send_mp_fclose) { + /* MUST do this before tcp_write_timeout, because retrans_stamp + * may have been set to 0 in another part while we are + * retransmitting MP_FASTCLOSE. Then, we would crash, because + * retransmits_timed_out accesses the meta-write-queue. + * + * We make sure that the timestamp is != 0. + */ + if (!tp->retrans_stamp) + tp->retrans_stamp = tcp_time_stamp ? : 1; + + if (tcp_write_timeout(sk)) + goto out; + + tcp_send_ack(sk); + icsk->icsk_backoff++; + icsk->icsk_retransmits++; + + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + elapsed = icsk->icsk_rto; + goto resched; } if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { @@ -588,7 +630,13 @@ static void tcp_keepalive_timer (unsigned long data) goto death; } - if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) + if (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) { + tcp_synack_timer(sk); + goto out; + } + + /* MPTCP: Keepalive timers are handled at the subflow level */ + if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE || is_meta_sk(sk)) goto out; elapsed = keepalive_time_when(tp); @@ -636,7 +684,7 @@ death: tcp_done(sk); out: - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); sock_put(sk); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5dac9fd..f2c3972 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -752,6 +752,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) kfree_rcu(ifp, rcu); } +EXPORT_SYMBOL(inet6_ifa_finish_destroy); static void ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8132b44..42f066e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -97,8 +97,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } -static int inet6_create(struct net *net, struct socket *sock, int protocol, - int kern) +int inet6_create(struct net *net, struct socket *sock, int protocol, int kern) { struct inet_sock *inet; struct ipv6_pinfo *np; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index e4311cb..e0b65e0 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -96,8 +96,8 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk, /* * request_sock (formerly open request) hash tables. 
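+ *
+ * The hash is no longer static: the MPTCP join code hashes its IPv6
+ * request socks into the meta-socket's SYN queue with the very same
+ * function, along the lines of
+ *
+ *	lopt->syn_table[inet6_synq_hash(raddr, rport, lopt->hash_rnd,
+ *					lopt->nr_table_entries)];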
*/ -static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) +u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, + const u32 rnd, const u32 synq_hsize) { u32 c; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index d703218..3cdf1bd 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -176,7 +176,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, 0, NULL); + tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL); if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) goto out; @@ -252,7 +252,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); + dst_metric(dst, RTAX_INITRWND), sk); ireq->rcv_wscale = rcv_wscale; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5c71501..1c12623 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -63,6 +63,8 @@ #include #include #include +#include +#include #include #include @@ -73,14 +75,6 @@ #include #include -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); -static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, - struct request_sock *req); - -static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); - -static const struct inet_connection_sock_af_ops ipv6_mapped; -static const struct inet_connection_sock_af_ops ipv6_specific; #ifdef CONFIG_TCP_MD5SIG static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; @@ -92,7 +86,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, } #endif -static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); const struct rt6_info *rt = (const struct rt6_info *)dst; @@ -104,7 +98,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; } -static void tcp_v6_hash(struct sock *sk) +void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { @@ -117,7 +111,7 @@ static void tcp_v6_hash(struct sock *sk) } } -static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) +__u32 tcp_v6_init_sequence(const struct sk_buff *skb) { return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32, @@ -125,7 +119,7 @@ static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) tcp_hdr(skb)->source); } -static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, +int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; @@ -340,7 +334,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, const struct ipv6hdr *hdr = (const struct ipv6hdr*)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct ipv6_pinfo *np; - struct sock *sk; + struct sock *sk, *meta_sk; int err; struct tcp_sock *tp; __u32 seq; @@ -360,8 +354,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } - bh_lock_sock(sk); - if (sock_owned_by_user(sk) && type != 
ICMPV6_PKT_TOOBIG) + tp = tcp_sk(sk); + if (tp->mpc) + meta_sk = mptcp_meta_sk(sk); + else + meta_sk = sk; + + bh_lock_sock(meta_sk); + if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG) NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) @@ -372,7 +372,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - tp = tcp_sk(sk); seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { @@ -399,11 +398,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; tp->mtu_info = ntohl(info); - if (!sock_owned_by_user(sk)) + if (!sock_owned_by_user(meta_sk)) tcp_v6_mtu_reduced(sk); - else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, + else { + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) - sock_hold(sk); + sock_hold(sk); + if (tp->mpc) + mptcp_tsq_flags(sk); + } goto out; } @@ -413,7 +416,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, switch (sk->sk_state) { struct request_sock *req, **prev; case TCP_LISTEN: - if (sock_owned_by_user(sk)) + if (sock_owned_by_user(meta_sk)) goto out; req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, @@ -438,7 +441,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, case TCP_SYN_SENT: case TCP_SYN_RECV: /* Cannot happen. It can, it SYNs are crossed. --ANK */ - if (!sock_owned_by_user(sk)) { + if (!sock_owned_by_user(meta_sk)) { sk->sk_err = err; sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ @@ -448,22 +451,22 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - if (!sock_owned_by_user(sk) && np->recverr) { + if (!sock_owned_by_user(meta_sk) && np->recverr) { sk->sk_err = err; sk->sk_error_report(sk); } else sk->sk_err_soft = err; out: - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); sock_put(sk); } -static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, - struct flowi6 *fl6, - struct request_sock *req, - u16 queue_mapping) +int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, + struct flowi6 *fl6, + struct request_sock *req, + u16 queue_mapping) { struct inet6_request_sock *treq = inet6_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -489,7 +492,7 @@ done: return err; } -static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) +int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) { struct flowi6 fl6; int res; @@ -500,7 +503,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) return res; } -static void tcp_v6_reqsk_destructor(struct request_sock *req) +void tcp_v6_reqsk_destructor(struct request_sock *req) { kfree_skb(inet6_rsk(req)->pktopts); } @@ -713,15 +716,15 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { }; #ifdef CONFIG_TCP_MD5SIG -static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .md5_lookup = tcp_v6_reqsk_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, }; #endif -static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, - u32 tsval, u32 tsecr, - struct tcp_md5sig_key *key, int rst, u8 tclass) +static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, + u32 data_ack, u32 win, u32 tsval, u32 tsecr, + struct tcp_md5sig_key *key, int rst, u8 tclass, int mptcp) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; @@ -739,7 +742,10 
@@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, if (key) tot_len += TCPOLEN_MD5SIG_ALIGNED; #endif - +#ifdef CONFIG_MPTCP + if (mptcp) + tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK; +#endif buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, GFP_ATOMIC); if (buff == NULL) @@ -777,6 +783,17 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, tcp_v6_md5_hash_hdr((__u8 *)topt, key, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, t1); + topt += 4; + } +#endif +#ifdef CONFIG_MPTCP + if (mptcp) { + /* Construction of 32-bit data_ack */ + *topt++ = htonl((TCPOPT_MPTCP << 24) | + ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) | + (0x20 << 8) | + (0x01)); + *topt++ = htonl(data_ack); } #endif @@ -813,7 +830,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, kfree_skb(buff); } -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) +void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); u32 seq = 0, ack_seq = 0; @@ -868,7 +885,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); - tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, key, 1, 0); + tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, 0, key, 1, 0, 0); #ifdef CONFIG_TCP_MD5SIG release_sk1: @@ -879,37 +896,44 @@ release_sk1: #endif } -static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr, - struct tcp_md5sig_key *key, u8 tclass) + struct tcp_md5sig_key *key, u8 tclass, int mptcp) { - tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, key, 0, tclass); + tcp_v6_send_response(skb, seq, ack, data_ack, win, tsval, tsecr, key, 0, tclass, mptcp); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + u32 data_ack = 0; + int mptcp = 0; + if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) { + data_ack = (u32)tcptw->mptcp_tw->rcv_nxt; + mptcp = 1; + } tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + data_ack, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tcp_twsk_md5_key(tcptw), - tw->tw_tclass); + tw->tw_tclass, mptcp); inet_twsk_put(tw); } -static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, - struct request_sock *req) +void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *req) { tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, - req->rcv_wnd, tcp_time_stamp, req->ts_recent, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0); + 0, req->rcv_wnd, tcp_time_stamp, req->ts_recent, + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0); } -static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) { struct request_sock *req, **prev; const struct tcphdr *th = tcp_hdr(skb); @@ -928,7 +952,13 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { + /* Don't lock again the meta-sk. It has been locked + * before mptcp_v6_do_rcv. 
+ */ + if (tcp_sk(nsk)->mpc && !is_meta_sk(sk)) + bh_lock_sock(mptcp_meta_sk(nsk)); bh_lock_sock(nsk); + return nsk; } inet_twsk_put(inet_twsk(nsk)); @@ -948,6 +978,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tmp_opt; + struct mptcp_options_received mopt; struct request_sock *req; struct inet6_request_sock *treq; struct ipv6_pinfo *np = inet6_sk(sk); @@ -960,6 +991,23 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); + tmp_opt.user_mss = tp->rx_opt.user_mss; + mptcp_init_mp_opt(&mopt); + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL); + +#ifdef CONFIG_MPTCP + /*MPTCP structures not initialized, so return error */ + if (mptcp_init_failed) + mptcp_init_mp_opt(&mopt); + + if (mopt.is_mp_join) + return mptcp_do_join_short(skb, &mopt, &tmp_opt, sock_net(sk)); + if (mopt.drop_me) + goto drop; +#endif + if (!ipv6_unicast_destination(skb)) goto drop; @@ -975,7 +1023,22 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop; } - req = inet6_reqsk_alloc(&tcp6_request_sock_ops); +#ifdef CONFIG_MPTCP + if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled) + mopt.saw_mpc = 0; + if (mopt.saw_mpc && !want_cookie) { + req = inet6_reqsk_alloc(&mptcp6_request_sock_ops); + + if (req == NULL) + goto drop; + + mptcp_rsk(req)->mpcb = NULL; + mptcp_rsk(req)->dss_csum = mopt.dss_csum; + mptcp_rsk(req)->collide_tk.pprev = NULL; + } else +#endif + req = inet6_reqsk_alloc(&tcp6_request_sock_ops); + if (req == NULL) goto drop; @@ -983,17 +1046,15 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; #endif - tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); - tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, 0, NULL); - if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); + if (mopt.saw_mpc && !want_cookie) + mptcp_reqsk_new_mptcp(req, &tmp_opt, &mopt, skb); + treq = inet6_rsk(req); treq->rmt_addr = ipv6_hdr(skb)->saddr; treq->loc_addr = ipv6_hdr(skb)->daddr; @@ -1082,9 +1143,9 @@ drop: return 0; /* don't send reset */ } -static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) +struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) { struct inet6_request_sock *treq; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); @@ -1302,7 +1363,7 @@ static __sum16 tcp_v6_checksum_init(struct sk_buff *skb) * This is because we cannot sleep with the original spinlock * held. 
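+ *
+ * A meta-socket takes this path as well: segments that were backlogged
+ * while the meta was owned by the user carry the receiving subflow in
+ * skb->sk, and mptcp_v6_do_rcv() uses that pointer to hand each segment
+ * back to the subflow it belongs to.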
*/ -static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp; @@ -1324,6 +1385,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) goto discard; #endif + if (is_meta_sk(sk)) + return mptcp_v6_do_rcv(sk, skb); + if (sk_filter(sk, skb)) goto discard; @@ -1443,7 +1507,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) { const struct tcphdr *th; const struct ipv6hdr *hdr; - struct sock *sk; + struct sock *sk, *meta_sk = NULL; int ret; struct net *net = dev_net(skb->dev); @@ -1474,18 +1538,43 @@ static int tcp_v6_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); +#ifdef CONFIG_MPTCP + TCP_SKB_CB(skb)->mptcp_flags = 0; + TCP_SKB_CB(skb)->dss_off = 0; +#endif TCP_SKB_CB(skb)->when = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); - if (!sk) - goto no_tcp_socket; process: - if (sk->sk_state == TCP_TIME_WAIT) + if (sk && sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; +#ifdef CONFIG_MPTCP + if (!sk && th->syn && !th->ack) { + int ret = mptcp_lookup_join(skb, NULL); + + if (ret < 0) { + tcp_v6_send_reset(NULL, skb); + goto discard_it; + } else if (ret > 0) { + return 0; + } + } + + /* Is there a pending request sock for this segment ? */ + if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) { + if (sk) + sock_put(sk); + return 0; + } +#endif + + if (!sk) + goto no_tcp_socket; + if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1500,11 +1589,21 @@ process: sk_mark_napi_id(sk, skb); skb->dev = NULL; - bh_lock_sock_nested(sk); + if (tcp_sk(sk)->mpc) { + meta_sk = mptcp_meta_sk(sk); + + bh_lock_sock_nested(meta_sk); + if (sock_owned_by_user(meta_sk)) + skb->sk = sk; + } else { + meta_sk = sk; + bh_lock_sock_nested(sk); + } + ret = 0; - if (!sock_owned_by_user(sk)) { + if (!sock_owned_by_user(meta_sk)) { #ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(meta_sk); if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) tp->ucopy.dma_chan = net_dma_find_channel(); if (tp->ucopy.dma_chan) @@ -1512,16 +1611,17 @@ process: else #endif { - if (!tcp_prequeue(sk, skb)) + if (!tcp_prequeue(meta_sk, skb)) ret = tcp_v6_do_rcv(sk, skb); } - } else if (unlikely(sk_add_backlog(sk, skb, - sk->sk_rcvbuf + sk->sk_sndbuf))) { - bh_unlock_sock(sk); + } else if (unlikely(sk_add_backlog(meta_sk, skb, + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { + bh_unlock_sock(meta_sk); NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); goto discard_and_relse; } - bh_unlock_sock(sk); + + bh_unlock_sock(meta_sk); sock_put(sk); return ret ? 
-1 : 0; @@ -1578,6 +1678,18 @@ do_time_wait: sk = sk2; goto process; } +#ifdef CONFIG_MPTCP + if (th->syn && !th->ack) { + int ret = mptcp_lookup_join(skb, inet_twsk(sk)); + + if (ret < 0) { + tcp_v6_send_reset(NULL, skb); + goto discard_it; + } else if (ret > 0) { + return 0; + } + } +#endif /* Fall through to ACK */ } case TCP_TW_ACK: @@ -1627,13 +1739,13 @@ static void tcp_v6_early_demux(struct sk_buff *skb) } } -static struct timewait_sock_ops tcp6_timewait_sock_ops = { +struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_unique = tcp_twsk_unique, .twsk_destructor= tcp_twsk_destructor, }; -static const struct inet_connection_sock_af_ops ipv6_specific = { +const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, @@ -1665,7 +1777,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { * TCP over IPv4 via INET6 API */ -static const struct inet_connection_sock_af_ops ipv6_mapped = { +const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1710,7 +1822,7 @@ static int tcp_v6_init_sock(struct sock *sk) return 0; } -static void tcp_v6_destroy_sock(struct sock *sk) +void tcp_v6_destroy_sock(struct sock *sk) { tcp_v4_destroy_sock(sk); inet6_destroy_sock(sk); diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig new file mode 100644 index 0000000..88a05b1 --- /dev/null +++ b/net/mptcp/Kconfig @@ -0,0 +1,58 @@ +# +# MPTCP configuration +# +config MPTCP + bool "MPTCP protocol" + depends on (IPV6=y || IPV6=n) + ---help--- + This replaces the normal TCP stack with a Multipath TCP stack, + able to use several paths at once. + +menuconfig MPTCP_PM_ADVANCED + bool "MPTCP: advanced path-manager control" + depends on MPTCP=y + ---help--- + Support for selection of different path-managers. You should choose 'Y' here, + because otherwise you will not actively create new MPTCP-subflows. + +if MPTCP_PM_ADVANCED + +config MPTCP_FULLMESH + tristate "MPTCP Full-Mesh Path-Manager" + depends on MPTCP=y + ---help--- + This path-management module will create a full-mesh among all IP-addresses. + +config MPTCP_NDIFFPORTS + tristate "MPTCP ndiff-ports" + depends on MPTCP=y + ---help--- + This path-management module will create multiple subflows between the same + pair of IP-addresses, modifying the source-port. You can set the number + of subflows via the mptcp_ndiffports-sysctl. + +choice + prompt "Default MPTCP Path-Manager" + default DEFAULT + help + Select the Path-Manager of your choice + + config DEFAULT_FULLMESH + bool "Full mesh" if MPTCP_FULLMESH=y + + config DEFAULT_NDIFFPORTS + bool "ndiff-ports" if MPTCP_NDIFFPORTS=y + + config DEFAULT_DUMMY + bool "Default" + +endchoice + +endif + +config DEFAULT_MPTCP_PM + string + default "default" if DEFAULT_DUMMY + default "fullmesh" if DEFAULT_FULLMESH + default "ndiffports" if DEFAULT_NDIFFPORTS + default "default" diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile new file mode 100644 index 0000000..b38c962 --- /dev/null +++ b/net/mptcp/Makefile @@ -0,0 +1,17 @@ +# +## Makefile for MultiPath TCP support code. 
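+# mptcp_ipv6.o is pulled in below through $(subst m,y,$(CONFIG_IPV6)), i.e.
+# whenever IPv6 support is configured; Kconfig already restricts MPTCP to
+# IPV6=y or IPV6=n.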
+# +# + +obj-$(CONFIG_MPTCP) += mptcp.o + +mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_ofo_queue.o mptcp_pm.o \ + mptcp_output.o mptcp_input.o + +obj-$(CONFIG_TCP_CONG_COUPLED) += mptcp_coupled.o +obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o +obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o +obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o + +mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o + diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c new file mode 100644 index 0000000..f738ede --- /dev/null +++ b/net/mptcp/mptcp_coupled.c @@ -0,0 +1,273 @@ +/* + * MPTCP implementation - Coupled Congestion Control + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include + +#include + +/* Scaling is done in the numerator with alpha_scale_num and in the denominator + * with alpha_scale_den. + * + * To downscale, we just need to use alpha_scale. + * + * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2) + */ +static int alpha_scale_den = 10; +static int alpha_scale_num = 32; +static int alpha_scale = 12; + +struct mptcp_ccc { + u64 alpha; + bool forced_update; +}; + +static inline int mptcp_ccc_sk_can_send(const struct sock *sk) +{ + return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt; +} + +static inline u64 mptcp_get_alpha(struct sock *meta_sk) +{ + struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk); + return mptcp_ccc->alpha; +} + +static inline void mptcp_set_alpha(struct sock *meta_sk, u64 alpha) +{ + struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk); + mptcp_ccc->alpha = alpha; +} + +static inline u64 mptcp_ccc_scale(u32 val, int scale) +{ + return (u64) val << scale; +} + +static inline bool mptcp_get_forced(struct sock *meta_sk) +{ + struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk); + return mptcp_ccc->forced_update; +} + +static inline void mptcp_set_forced(struct sock *meta_sk, bool force) +{ + struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk); + mptcp_ccc->forced_update = force; +} + +static void mptcp_ccc_recalc_alpha(struct sock *sk) +{ + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; + struct sock *sub_sk; + int best_cwnd = 0, best_rtt = 0, can_send = 0; + u64 max_numerator = 0, sum_denominator = 0, alpha = 1; + + if (!mpcb) + return; + + /* Only one subflow left - fall back to normal reno-behavior + * (set alpha to 1) */ + if (mpcb->cnt_established <= 1) + goto exit; + + /* Do regular alpha-calculation for multiple subflows */ + + /* Find the max numerator of the alpha-calculation */ + mptcp_for_each_sk(mpcb, sub_sk) { + struct tcp_sock *sub_tp = tcp_sk(sub_sk); + u64 tmp; + + if (!mptcp_ccc_sk_can_send(sub_sk)) + continue; + + can_send++; + + /* We need to look for the path, that provides the max-value. + * Integer-overflow is not possible here, because + * tmp will be in u64. 
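+ * (tp->srtt is the smoothed RTT in jiffies << 3 on this kernel; all subflows
+ * use the same scale, so the comparison between them is unaffected.)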
+ */ + tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd, + alpha_scale_num), (u64)sub_tp->srtt * sub_tp->srtt); + + if (tmp >= max_numerator) { + max_numerator = tmp; + best_cwnd = sub_tp->snd_cwnd; + best_rtt = sub_tp->srtt; + } + } + + /* No subflow is able to send - we don't care anymore */ + if (unlikely(!can_send)) + goto exit; + + /* Calculate the denominator */ + mptcp_for_each_sk(mpcb, sub_sk) { + struct tcp_sock *sub_tp = tcp_sk(sub_sk); + + if (!mptcp_ccc_sk_can_send(sub_sk)) + continue; + + sum_denominator += div_u64( + mptcp_ccc_scale(sub_tp->snd_cwnd, + alpha_scale_den) * best_rtt, + sub_tp->srtt); + } + sum_denominator *= sum_denominator; + if (unlikely(!sum_denominator)) { + pr_err("%s: sum_denominator == 0, cnt_established:%d\n", + __func__, mpcb->cnt_established); + mptcp_for_each_sk(mpcb, sub_sk) { + struct tcp_sock *sub_tp = tcp_sk(sub_sk); + pr_err("%s: pi:%d, state:%d\n, rtt:%u, cwnd: %u", + __func__, sub_tp->mptcp->path_index, + sub_sk->sk_state, sub_tp->srtt, + sub_tp->snd_cwnd); + } + } + + alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator); + + if (unlikely(!alpha)) + alpha = 1; + +exit: + mptcp_set_alpha(mptcp_meta_sk(sk), alpha); +} + +static void mptcp_ccc_init(struct sock *sk) +{ + if (tcp_sk(sk)->mpc) { + mptcp_set_forced(mptcp_meta_sk(sk), 0); + mptcp_set_alpha(mptcp_meta_sk(sk), 1); + } + /* If we do not mptcp, behave like reno: return */ +} + +static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_LOSS) + mptcp_ccc_recalc_alpha(sk); +} + +static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state) +{ + if (!tcp_sk(sk)->mpc) + return; + + mptcp_set_forced(mptcp_meta_sk(sk), 1); +} + +static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_cb *mpcb = tp->mpcb; + int snd_cwnd; + + if (!tp->mpc) { + tcp_reno_cong_avoid(sk, ack, in_flight); + return; + } + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + tcp_slow_start(tp); + mptcp_ccc_recalc_alpha(sk); + return; + } + + if (mptcp_get_forced(mptcp_meta_sk(sk))) { + mptcp_ccc_recalc_alpha(sk); + mptcp_set_forced(mptcp_meta_sk(sk), 0); + } + + if (mpcb->cnt_established > 1) { + u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk)); + + /* This may happen, if at the initialization, the mpcb + * was not yet attached to the sock, and thus + * initializing alpha failed. + */ + if (unlikely(!alpha)) + alpha = 1; + + snd_cwnd = (int) div_u64 ((u64) mptcp_ccc_scale(1, alpha_scale), + alpha); + + /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd) + * Thus, we select here the max value. 
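+ * (This is the linked-increases rule: the increase of the whole MPTCP
+ * connection is capped at that of a single TCP flow on its best path,
+ * cf. RFC 6356.)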
*/ + if (snd_cwnd < tp->snd_cwnd) + snd_cwnd = tp->snd_cwnd; + } else { + snd_cwnd = tp->snd_cwnd; + } + + if (tp->snd_cwnd_cnt >= snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) { + tp->snd_cwnd++; + mptcp_ccc_recalc_alpha(sk); + } + + tp->snd_cwnd_cnt = 0; + } else { + tp->snd_cwnd_cnt++; + } +} + +static struct tcp_congestion_ops mptcp_ccc = { + .init = mptcp_ccc_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = mptcp_ccc_cong_avoid, + .cwnd_event = mptcp_ccc_cwnd_event, + .set_state = mptcp_ccc_set_state, + .min_cwnd = tcp_reno_min_cwnd, + .owner = THIS_MODULE, + .name = "coupled", +}; + +static int __init mptcp_ccc_register(void) +{ + BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&mptcp_ccc); +} + +static void __exit mptcp_ccc_unregister(void) +{ + tcp_unregister_congestion_control(&mptcp_ccc); +} + +module_init(mptcp_ccc_register); +module_exit(mptcp_ccc_unregister); + +MODULE_AUTHOR("Christoph Paasch, Sébastien Barré"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL"); +MODULE_VERSION("0.1"); diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c new file mode 100644 index 0000000..74d068f --- /dev/null +++ b/net/mptcp/mptcp_ctrl.c @@ -0,0 +1,2295 @@ +/* + * MPTCP implementation - MPTCP-control + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *mptcp_sock_cache __read_mostly; +static struct kmem_cache *mptcp_cb_cache __read_mostly; +static struct kmem_cache *mptcp_tw_cache __read_mostly; + +int sysctl_mptcp_enabled __read_mostly = 1; +int sysctl_mptcp_checksum __read_mostly = 1; +int sysctl_mptcp_debug __read_mostly; +EXPORT_SYMBOL(sysctl_mptcp_debug); +int sysctl_mptcp_syn_retries __read_mostly = 3; + +bool mptcp_init_failed __read_mostly; + +static int proc_mptcp_path_manager(ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + char val[MPTCP_PM_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = MPTCP_PM_NAME_MAX, + }; + int ret; + + mptcp_get_default_path_manager(val); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = mptcp_set_default_path_manager(val); + return ret; +} + +static struct ctl_table mptcp_table[] = { + { + .procname = "mptcp_enabled", + .data = &sysctl_mptcp_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "mptcp_checksum", + .data = &sysctl_mptcp_checksum, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "mptcp_debug", + .data = &sysctl_mptcp_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "mptcp_syn_retries", + .data = &sysctl_mptcp_syn_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "mptcp_path_manager", + .mode = 0644, + .maxlen = MPTCP_PM_NAME_MAX, + .proc_handler = proc_mptcp_path_manager, + }, + { } +}; + +static inline u32 mptcp_hash_tk(u32 token) +{ + return token % MPTCP_HASH_SIZE; +} + +struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE]; +EXPORT_SYMBOL(tk_hashtable); + +/* This second hashtable is needed to retrieve request socks + * created as a result of a join request. While the SYN contains + * the token, the final ack does not, so we need a separate hashtable + * to retrieve the mpcb. 
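+ * Entries are protected by mptcp_reqsk_hlock below and are removed again in
+ * mptcp_reqsk_destructor() via mptcp_hash_request_remove().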
+ */ +struct list_head mptcp_reqsk_htb[MPTCP_HASH_SIZE]; +spinlock_t mptcp_reqsk_hlock; /* hashtable protection */ + +/* The following hash table is used to avoid collision of token */ +static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE]; +spinlock_t mptcp_tk_hashlock; /* hashtable protection */ + +static int mptcp_reqsk_find_tk(u32 token) +{ + u32 hash = mptcp_hash_tk(token); + struct mptcp_request_sock *mtreqsk; + const struct hlist_nulls_node *node; + + hlist_nulls_for_each_entry_rcu(mtreqsk, node, + &mptcp_reqsk_tk_htb[hash], collide_tk) { + if (token == mtreqsk->mptcp_loc_token) + return 1; + } + return 0; +} + +static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, u32 token) +{ + u32 hash = mptcp_hash_tk(token); + + hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->collide_tk, + &mptcp_reqsk_tk_htb[hash]); +} + +static void mptcp_reqsk_remove_tk(struct request_sock *reqsk) +{ + rcu_read_lock(); + spin_lock(&mptcp_tk_hashlock); + hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->collide_tk); + spin_unlock(&mptcp_tk_hashlock); + rcu_read_unlock(); +} + +void mptcp_reqsk_destructor(struct request_sock *req) +{ + if (!mptcp_rsk(req)->mpcb) { + if (in_softirq()) { + mptcp_reqsk_remove_tk(req); + } else { + rcu_read_lock_bh(); + spin_lock(&mptcp_tk_hashlock); + hlist_nulls_del_init_rcu(&mptcp_rsk(req)->collide_tk); + spin_unlock(&mptcp_tk_hashlock); + rcu_read_unlock_bh(); + } + } else { + mptcp_hash_request_remove(req); + } +} + +static void __mptcp_hash_insert(struct tcp_sock *meta_tp, u32 token) +{ + u32 hash = mptcp_hash_tk(token); + hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]); + meta_tp->inside_tk_table = 1; +} + +static int mptcp_find_token(u32 token) +{ + u32 hash = mptcp_hash_tk(token); + struct tcp_sock *meta_tp; + const struct hlist_nulls_node *node; + + hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) { + if (token == meta_tp->mptcp_loc_token) + return 1; + } + return 0; +} + +static void mptcp_set_key_reqsk(struct request_sock *req, + const struct sk_buff *skb) +{ + struct inet_request_sock *ireq = inet_rsk(req); + struct mptcp_request_sock *mtreq = mptcp_rsk(req); + + if (skb->protocol == htons(ETH_P_IP)) { + mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + ireq->loc_port, + ireq->rmt_port); +#if IS_ENABLED(CONFIG_IPV6) + } else { + mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32, + ipv6_hdr(skb)->daddr.s6_addr32, + ireq->loc_port, + ireq->rmt_port); +#endif + } + + mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL); +} + +/* New MPTCP-connection request, prepare a new token for the meta-socket that + * will be created in mptcp_check_req_master(), and store the received token. 
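+ * The local key is re-derived in a loop until the resulting token collides
+ * neither with a pending request nor with an established meta-socket
+ * (mptcp_reqsk_find_tk()/mptcp_find_token()), all under mptcp_tk_hashlock.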
+ */ +void mptcp_reqsk_new_mptcp(struct request_sock *req, + const struct tcp_options_received *rx_opt, + const struct mptcp_options_received *mopt, + const struct sk_buff *skb) +{ + struct mptcp_request_sock *mtreq = mptcp_rsk(req); + + tcp_rsk(req)->saw_mpc = 1; + + rcu_read_lock(); + spin_lock(&mptcp_tk_hashlock); + do { + mptcp_set_key_reqsk(req, skb); + } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) || + mptcp_find_token(mtreq->mptcp_loc_token)); + + mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token); + spin_unlock(&mptcp_tk_hashlock); + rcu_read_unlock(); + mtreq->mptcp_rem_key = mopt->mptcp_key; +} + +static void mptcp_set_key_sk(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_sock *isk = inet_sk(sk); + + if (sk->sk_family == AF_INET) + tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr, + isk->inet_daddr, + isk->inet_sport, + isk->inet_dport); +#if IS_ENABLED(CONFIG_IPV6) + else + tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32, + inet6_sk(sk)->daddr.s6_addr32, + isk->inet_sport, + isk->inet_dport); +#endif + + mptcp_key_sha1(tp->mptcp_loc_key, + &tp->mptcp_loc_token, NULL); +} + +void mptcp_connect_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + rcu_read_lock_bh(); + spin_lock(&mptcp_tk_hashlock); + do { + mptcp_set_key_sk(sk); + } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) || + mptcp_find_token(tp->mptcp_loc_token)); + + __mptcp_hash_insert(tp, tp->mptcp_loc_token); + spin_unlock(&mptcp_tk_hashlock); + rcu_read_unlock_bh(); +} + +/** + * This function increments the refcount of the mpcb struct. + * It is the responsibility of the caller to decrement when releasing + * the structure. + */ +struct sock *mptcp_hash_find(struct net *net, u32 token) +{ + u32 hash = mptcp_hash_tk(token); + struct tcp_sock *meta_tp; + struct sock *meta_sk = NULL; + struct hlist_nulls_node *node; + + rcu_read_lock(); + hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], + tk_table) { + meta_sk = (struct sock *)meta_tp; + if (token == meta_tp->mptcp_loc_token && + net_eq(net, sock_net(meta_sk)) && + atomic_inc_not_zero(&meta_sk->sk_refcnt)) + break; + meta_sk = NULL; + } + rcu_read_unlock(); + return meta_sk; +} + +void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) +{ + /* remove from the token hashtable */ + rcu_read_lock_bh(); + spin_lock(&mptcp_tk_hashlock); + hlist_nulls_del_init_rcu(&meta_tp->tk_table); + meta_tp->inside_tk_table = 0; + spin_unlock(&mptcp_tk_hashlock); + rcu_read_unlock_bh(); +} + +void mptcp_hash_remove(struct tcp_sock *meta_tp) +{ + rcu_read_lock(); + spin_lock(&mptcp_tk_hashlock); + hlist_nulls_del_init_rcu(&meta_tp->tk_table); + meta_tp->inside_tk_table = 0; + spin_unlock(&mptcp_tk_hashlock); + rcu_read_unlock(); +} + +static struct sock *mptcp_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return tcp_v6_syn_recv_sock(sk, skb, req, dst); + + /* sk->sk_family == AF_INET */ + if (req->rsk_ops->family == AF_INET6) + return mptcp_v6v4_syn_recv_sock(sk, skb, req, dst); +#endif + + /* sk->sk_family == AF_INET && req->rsk_ops->family == AF_INET */ + return tcp_v4_syn_recv_sock(sk, skb, req, dst); +} + +struct sock *mptcp_select_ack_sock(const struct sock *meta_sk, int copied) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct sock *sk, *subsk = NULL; + u32 max_data_seq = 0; + /* max_data_seq initialized to correct compiler-warning. 
+ * But the initialization is handled by max_data_seq_set + */ + short max_data_seq_set = 0; + u32 min_time = 0xffffffff; + + /* How do we select the subflow to send the window-update on? + * + * 1. He has to be in a state where he can send an ack and is + * operational (pf = 0). + * 2. He has to be one of those subflow who recently + * contributed to the received stream + * (this guarantees a working subflow) + * a) its latest data_seq received is after the original + * copied_seq. + * We select the one with the lowest rtt, so that the + * window-update reaches our peer the fastest. + * b) if no subflow has this kind of data_seq (e.g., very + * strange meta-level retransmissions going on), we take + * the subflow who last sent the highest data_seq. + */ + mptcp_for_each_sk(meta_tp->mpcb, sk) { + struct tcp_sock *tp = tcp_sk(sk); + + if (!mptcp_sk_can_send_ack(sk) || tp->pf) + continue; + + /* Select among those who contributed to the + * current receive-queue. + */ + if (copied && after(tp->mptcp->last_data_seq, meta_tp->copied_seq - copied)) { + if (tp->srtt < min_time) { + min_time = tp->srtt; + subsk = sk; + max_data_seq_set = 0; + } + continue; + } + + if (!subsk && !max_data_seq_set) { + max_data_seq = tp->mptcp->last_data_seq; + max_data_seq_set = 1; + subsk = sk; + } + + /* Otherwise, take the one with the highest data_seq */ + if ((!subsk || max_data_seq_set) && + after(tp->mptcp->last_data_seq, max_data_seq)) { + max_data_seq = tp->mptcp->last_data_seq; + subsk = sk; + } + } + + if (!subsk) { + mptcp_debug("%s subsk is null, copied %d, cseq %u\n", __func__, + copied, meta_tp->copied_seq); + mptcp_for_each_sk(meta_tp->mpcb, sk) { + struct tcp_sock *tp = tcp_sk(sk); + mptcp_debug("%s pi %d state %u last_dseq %u\n", + __func__, tp->mptcp->path_index, sk->sk_state, + tp->mptcp->last_data_seq); + } + } + + return subsk; +} +EXPORT_SYMBOL(mptcp_select_ack_sock); + +static void mptcp_sock_def_error_report(struct sock *sk) +{ + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; + + if (!sock_flag(sk, SOCK_DEAD)) + mptcp_sub_close(sk, 0); + + if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd || + mpcb->send_infinite_mapping) { + struct sock *meta_sk = mptcp_meta_sk(sk); + + meta_sk->sk_err = sk->sk_err; + meta_sk->sk_err_soft = sk->sk_err_soft; + + if (!sock_flag(meta_sk, SOCK_DEAD)) + meta_sk->sk_error_report(meta_sk); + + tcp_done(meta_sk); + } + + sk->sk_err = 0; + return; +} + +static void mptcp_mpcb_put(struct mptcp_cb *mpcb) +{ + if (atomic_dec_and_test(&mpcb->mpcb_refcnt)) { + mptcp_cleanup_path_manager(mpcb); + kmem_cache_free(mptcp_cb_cache, mpcb); + } +} + +static void mptcp_sock_destruct(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + inet_sock_destruct(sk); + + BUG_ON(!list_empty(&tp->mptcp->cb_list)); + + kmem_cache_free(mptcp_sock_cache, tp->mptcp); + tp->mptcp = NULL; + + if (!is_meta_sk(sk) && !tp->was_meta_sk) { + /* Taken when mpcb pointer was set */ + sock_put(mptcp_meta_sk(sk)); + mptcp_mpcb_put(tp->mpcb); + } else { + struct mptcp_cb *mpcb = tp->mpcb; + struct mptcp_tw *mptw; + + /* The mpcb is disappearing - we can make the final + * update to the rcv_nxt of the time-wait-sock and remove + * its reference to the mpcb. 
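+ * Each mptcp_tw on mpcb->tw_list drops its mpcb reference here, under
+ * tw_lock, so time-wait sockets may outlive the mpcb without touching it.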
+ */ + spin_lock_bh(&mpcb->tw_lock); + list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) { + list_del_rcu(&mptw->list); + mptw->in_list = 0; + mptcp_mpcb_put(mpcb); + rcu_assign_pointer(mptw->mpcb, NULL); + } + spin_unlock_bh(&mpcb->tw_lock); + + mptcp_mpcb_put(mpcb); + + mptcp_debug("%s destroying meta-sk\n", __func__); + } +} + +void mptcp_destroy_sock(struct sock *sk) +{ + if (is_meta_sk(sk)) { + struct sock *sk_it, *tmpsk; + + __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue); + mptcp_purge_ofo_queue(tcp_sk(sk)); + + /* We have to close all remaining subflows. Normally, they + * should all be about to get closed. But, if the kernel is + * forcing a closure (e.g., tcp_write_err), the subflows might + * not have been closed properly (as we are waiting for the + * DATA_ACK of the DATA_FIN). + */ + mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) { + /* Already did call tcp_close - waiting for graceful + * closure, or if we are retransmitting fast-close on + * the subflow. The reset (or timeout) will kill the + * subflow.. + */ + if (tcp_sk(sk_it)->closing || + tcp_sk(sk_it)->send_mp_fclose) + continue; + + /* Allow the delayed work first to prevent time-wait state */ + if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work)) + continue; + + mptcp_sub_close(sk_it, 0); + } + } else { + mptcp_del_sock(sk); + } +} + +static void mptcp_set_state(struct sock *sk) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + + /* Meta is not yet established - wake up the application */ + if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) && + sk->sk_state == TCP_ESTABLISHED) { + tcp_set_state(meta_sk, TCP_ESTABLISHED); + + if (!sock_flag(meta_sk, SOCK_DEAD)) { + meta_sk->sk_state_change(meta_sk); + sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT); + } + } + + if (sk->sk_state == TCP_ESTABLISHED) { + tcp_sk(sk)->mptcp->establish_increased = 1; + tcp_sk(sk)->mpcb->cnt_established++; + } +} + +void mptcp_set_keepalive(struct sock *sk, int val) +{ + struct sock *sk_it; + + mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) { + tcp_set_keepalive(sk_it, val); + sock_valbool_flag(sk, SOCK_KEEPOPEN, val); + } +} + +u32 mptcp_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; +u32 mptcp_key_seed = 0; + +void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn) +{ + u32 workspace[SHA_WORKSPACE_WORDS]; + u32 mptcp_hashed_key[SHA_DIGEST_WORDS]; + u8 input[64]; + int i; + + memset(workspace, 0, sizeof(workspace)); + + /* Initialize input with appropriate padding */ + memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte + * is explicitly set too */ + memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */ + input[8] = 0x80; /* Padding: First bit after message = 1 */ + input[63] = 0x40; /* Padding: Length of the message = 64 bits */ + + sha_init(mptcp_hashed_key); + sha_transform(mptcp_hashed_key, input, workspace); + + for (i = 0; i < 5; i++) + mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]); + + if (token) + *token = mptcp_hashed_key[0]; + if (idsn) + *idsn = *((u64 *)&mptcp_hashed_key[3]); +} + +void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2, + u32 *hash_out) +{ + u32 workspace[SHA_WORKSPACE_WORDS]; + u8 input[128]; /* 2 512-bit blocks */ + int i; + + memset(workspace, 0, sizeof(workspace)); + + /* Generate key xored with ipad */ + memset(input, 0x36, 64); + for (i = 0; i < 8; i++) + input[i] ^= key_1[i]; + for (i = 0; i < 8; i++) + input[i + 8] ^= key_2[i]; + + memcpy(&input[64], rand_1, 4); + memcpy(&input[68], rand_2, 4); + input[72] = 0x80; /* 
Padding: First bit after message = 1 */ + memset(&input[73], 0, 53); + + /* Padding: Length of the message = 512 + 64 bits */ + input[126] = 0x02; + input[127] = 0x40; + + sha_init(hash_out); + sha_transform(hash_out, input, workspace); + memset(workspace, 0, sizeof(workspace)); + + sha_transform(hash_out, &input[64], workspace); + memset(workspace, 0, sizeof(workspace)); + + for (i = 0; i < 5; i++) + hash_out[i] = cpu_to_be32(hash_out[i]); + + /* Prepare second part of hmac */ + memset(input, 0x5C, 64); + for (i = 0; i < 8; i++) + input[i] ^= key_1[i]; + for (i = 0; i < 8; i++) + input[i + 8] ^= key_2[i]; + + memcpy(&input[64], hash_out, 20); + input[84] = 0x80; + memset(&input[85], 0, 41); + + /* Padding: Length of the message = 512 + 160 bits */ + input[126] = 0x02; + input[127] = 0xA0; + + sha_init(hash_out); + sha_transform(hash_out, input, workspace); + memset(workspace, 0, sizeof(workspace)); + + sha_transform(hash_out, &input[64], workspace); + + for (i = 0; i < 5; i++) + hash_out[i] = cpu_to_be32(hash_out[i]); +} + +static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk) +{ + /* Socket-options handled by mptcp_inherit_sk while creating the meta-sk. + * ====== + * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT, + * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER, + * TCP_NODELAY, TCP_CORK + * + * Socket-options handled in this function here + * ====== + * SO_KEEPALIVE + * TCP_KEEP* + * TCP_DEFER_ACCEPT + * + * Socket-options on the todo-list + * ====== + * SO_BINDTODEVICE - should probably prevent creation of new subsocks + * across other devices. - what about the api-draft? + * SO_DEBUG + * SO_REUSEADDR - probably we don't care about this + * SO_DONTROUTE, SO_BROADCAST + * SO_OOBINLINE + * SO_LINGER + * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM + * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM + * SO_RXQ_OVFL + * TCP_COOKIE_TRANSACTIONS + * TCP_MAXSEG + * TCP_THIN_* - Handled by mptcp_inherit_sk, but we need to support this + * in mptcp_retransmit_timer. AND we need to check what is + * about the subsockets. + * TCP_LINGER2 + * TCP_WINDOW_CLAMP + * TCP_USER_TIMEOUT + * TCP_MD5SIG + * + * Socket-options of no concern for the meta-socket (but for the subsocket) + * ====== + * SO_PRIORITY + * SO_MARK + * TCP_CONGESTION + * TCP_SYNCNT + * TCP_QUICKACK + */ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + + /****** KEEPALIVE-handler ******/ + + /* Keepalive-timer has been started already, but it is handled at the + * subflow level. + */ + if (sock_flag(meta_sk, SOCK_KEEPOPEN)) { + inet_csk_delete_keepalive_timer(meta_sk); + inet_csk_reset_keepalive_timer(master_sk, keepalive_time_when(meta_tp)); + } + + /****** DEFER_ACCEPT-handler ******/ + + /* DEFER_ACCEPT is not of concern for new subflows - we always accept + * them + */ + inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0; +} + +static void mptcp_sub_inherit_sockopts(struct sock *meta_sk, struct sock *sub_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + /* Keepalive is handled at the subflow-level */ + if (sock_flag(meta_sk, SOCK_KEEPOPEN)) { + inet_csk_reset_keepalive_timer(sub_sk, keepalive_time_when(meta_tp)); + sock_valbool_flag(sub_sk, SOCK_KEEPOPEN, keepalive_time_when(meta_tp)); + } + + /* IP_TOS also goes to the subflow. 
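+ * sk_priority is copied along with it, and the cached route is reset so that
+ * the new marking takes effect on the next transmit.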
*/ + if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) { + inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos; + sub_sk->sk_priority = meta_sk->sk_priority; + sk_dst_reset(sub_sk); + } + + /* Inherit SO_REUSEADDR */ + sub_sk->sk_reuse = meta_sk->sk_reuse; + + /* Inherit snd/rcv-buffer locks */ + sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; +} + +int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb) +{ + /* skb-sk may be NULL if we receive a packet immediatly after the + * SYN/ACK + MP_CAPABLE. + */ + struct sock *sk = skb->sk ? skb->sk : meta_sk; + int ret = 0; + + skb->sk = NULL; + + if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { + kfree_skb(skb); + return 0; + } + + if (sk->sk_family == AF_INET) + ret = tcp_v4_do_rcv(sk, skb); +#if IS_ENABLED(CONFIG_IPV6) + else + ret = tcp_v6_do_rcv(sk, skb); +#endif + + sock_put(sk); + return ret; +} + +struct lock_class_key meta_key; +struct lock_class_key meta_slock_key; + +/* Code heavily inspired from sk_clone() */ +static int mptcp_inherit_sk(const struct sock *sk, struct sock *newsk, + int family, const gfp_t flags) +{ + struct sk_filter *filter; + struct proto *prot = newsk->sk_prot; + const struct inet_connection_sock_af_ops *af_ops = inet_csk(newsk)->icsk_af_ops; +#ifdef CONFIG_SECURITY_NETWORK + void *sptr = newsk->sk_security; +#endif + + if (sk->sk_family == AF_INET) { + memcpy(newsk, sk, offsetof(struct sock, sk_dontcopy_begin)); + memcpy(&newsk->sk_dontcopy_end, &sk->sk_dontcopy_end, + sizeof(struct tcp_sock) - offsetof(struct sock, sk_dontcopy_end)); + } else { + memcpy(newsk, sk, offsetof(struct sock, sk_dontcopy_begin)); + memcpy(&newsk->sk_dontcopy_end, &sk->sk_dontcopy_end, + sizeof(struct tcp6_sock) - offsetof(struct sock, sk_dontcopy_end)); + } + +#ifdef CONFIG_SECURITY_NETWORK + newsk->sk_security = sptr; + security_sk_clone(sk, newsk); +#endif + + /* Has been changed by sock_copy above - we may need an IPv6-socket */ + newsk->sk_family = family; + newsk->sk_prot = prot; + newsk->sk_prot_creator = prot; + inet_csk(newsk)->icsk_af_ops = af_ops; + + /* We don't yet have the mptcp-point. Thus we still need inet_sock_destruct */ + newsk->sk_destruct = inet_sock_destruct; + + /* SANITY */ + get_net(sock_net(newsk)); + sk_node_init(&newsk->sk_node); + sock_lock_init_class_and_name(newsk, "slock-AF_INET-MPTCP", + &meta_slock_key, "sk_lock-AF_INET-MPTCP", + &meta_key); + + /* Unlocks are in: + * + * 1. If we are creating the master-sk + * * on client-side in tcp_rcv_state_process, "case TCP_SYN_SENT" + * * on server-side in tcp_child_process + * 2. 
If we are creating another subsock + * * Also in tcp_child_process + */ + bh_lock_sock(newsk); + newsk->sk_backlog.head = NULL; + newsk->sk_backlog.tail = NULL; + newsk->sk_backlog.len = 0; + + atomic_set(&newsk->sk_rmem_alloc, 0); + atomic_set(&newsk->sk_wmem_alloc, 1); + atomic_set(&newsk->sk_omem_alloc, 0); + + skb_queue_head_init(&newsk->sk_receive_queue); + skb_queue_head_init(&newsk->sk_write_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&newsk->sk_async_wait_queue); +#endif + + spin_lock_init(&newsk->sk_dst_lock); + rwlock_init(&newsk->sk_callback_lock); + lockdep_set_class_and_name(&newsk->sk_callback_lock, + af_callback_keys + newsk->sk_family, + af_family_clock_key_strings[newsk->sk_family]); + newsk->sk_dst_cache = NULL; + newsk->sk_rx_dst = NULL; + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + newsk->sk_send_head = NULL; + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + + tcp_sk(newsk)->mptcp = NULL; + + sock_reset_flag(newsk, SOCK_DONE); + skb_queue_head_init(&newsk->sk_error_queue); + + filter = rcu_dereference_protected(newsk->sk_filter, 1); + if (filter != NULL) + sk_filter_charge(newsk, filter); + + if (unlikely(xfrm_sk_clone_policy(newsk))) { + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() + */ + newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); + sk_free(newsk); + newsk = NULL; + return -ENOMEM; + } + + newsk->sk_err = 0; + newsk->sk_priority = 0; + /* Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) + */ + smp_wmb(); + atomic_set(&newsk->sk_refcnt, 2); + + /* Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that + * is the same as sk->sk_prot->socks, as this field was copied + * with memcpy). + * + * This _changes_ the previous behaviour, where + * tcp_create_openreq_child always was incrementing the + * equivalent to tcp_prot->socks (inet_sock_nr), so this have + * to be taken into account in all callers. 
-acme + */ + sk_refcnt_debug_inc(newsk); + sk_set_socket(newsk, NULL); + newsk->sk_wq = NULL; + + if (newsk->sk_prot->sockets_allocated) + sk_sockets_allocated_inc(newsk); + + if (sock_flag(newsk, SOCK_TIMESTAMP) || + sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE)) + net_enable_timestamp(); + + return 0; +} + +int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, u32 window) +{ + struct mptcp_cb *mpcb; + struct sock *master_sk; + struct inet_connection_sock *master_icsk, *meta_icsk = inet_csk(meta_sk); + struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk); + struct sk_buff *skb, *tmp; + u64 idsn; + + master_sk = sk_prot_alloc(meta_sk->sk_prot, GFP_ATOMIC | __GFP_ZERO, + meta_sk->sk_family); + if (!master_sk) + return -ENOBUFS; + + master_tp = tcp_sk(master_sk); + master_icsk = inet_csk(master_sk); + + /* Need to set this here - it is needed by mptcp_inherit_sk */ + master_sk->sk_prot = meta_sk->sk_prot; + master_sk->sk_prot_creator = meta_sk->sk_prot; + master_icsk->icsk_af_ops = meta_icsk->icsk_af_ops; + + mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC); + if (!mpcb) { + sk_free(master_sk); + return -ENOBUFS; + } + + /* master_sk inherits from meta_sk */ + if (mptcp_inherit_sk(meta_sk, master_sk, meta_sk->sk_family, GFP_ATOMIC)) { + kmem_cache_free(mptcp_cb_cache, mpcb); + return -ENOBUFS; + } + +#if IS_ENABLED(CONFIG_IPV6) + if (meta_icsk->icsk_af_ops == &ipv6_mapped) { + struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk); + + inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6; + + newnp = inet6_sk(master_sk); + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; + newnp->opt = NULL; + newnp->pktoptions = NULL; + (void)xchg(&newnp->rxpmtu, NULL); + } else if (meta_sk->sk_family == AF_INET6) { + struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk); + + inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6; + + newnp = inet6_sk(master_sk); + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + newnp->hop_limit = -1; + newnp->mcast_hops = IPV6_DEFAULT_MCASTHOPS; + newnp->mc_loop = 1; + newnp->pmtudisc = IPV6_PMTUDISC_WANT; + newnp->ipv6only = sock_net(master_sk)->ipv6.sysctl.bindv6only; + } +#endif + + meta_tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, GFP_ATOMIC); + if (!meta_tp->mptcp) { + kmem_cache_free(mptcp_cb_cache, mpcb); + sk_free(master_sk); + return -ENOBUFS; + } + + INIT_LIST_HEAD(&meta_tp->mptcp->cb_list); + + /* Store the keys and generate the peer's token */ + mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key; + mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token; + + /* Generate Initial data-sequence-numbers */ + mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn); + idsn = ntohll(idsn) + 1; + mpcb->snd_high_order[0] = idsn >> 32; + mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1; + + meta_tp->write_seq = (u32)idsn; + meta_tp->snd_sml = meta_tp->write_seq; + meta_tp->snd_una = meta_tp->write_seq; + meta_tp->snd_nxt = meta_tp->write_seq; + meta_tp->pushed_seq = meta_tp->write_seq; + meta_tp->snd_up = meta_tp->write_seq; + + mpcb->mptcp_rem_key = remote_key; + mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn); + idsn = ntohll(idsn) + 1; + mpcb->rcv_high_order[0] = idsn >> 32; + mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1; + meta_tp->copied_seq = (u32) idsn; + meta_tp->rcv_nxt = (u32) idsn; + meta_tp->rcv_wup = (u32) idsn; + + meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1; + meta_tp->snd_wnd = window; + meta_tp->retrans_stamp = 0; /* 
Set in tcp_connect() */ + + meta_tp->packets_out = 0; + meta_tp->mptcp->snt_isn = meta_tp->write_seq; /* Initial data-sequence-number */ + meta_icsk->icsk_probes_out = 0; + + /* Set mptcp-pointers */ + master_tp->mpcb = mpcb; + master_tp->meta_sk = meta_sk; + meta_tp->mpcb = mpcb; + meta_tp->meta_sk = meta_sk; + mpcb->meta_sk = meta_sk; + mpcb->master_sk = master_sk; + + meta_tp->mpc = 1; + meta_tp->mptcp->attached = 0; + meta_tp->was_meta_sk = 0; + + /* Initialize the queues */ + skb_queue_head_init(&mpcb->reinject_queue); + skb_queue_head_init(&master_tp->out_of_order_queue); + tcp_prequeue_init(master_tp); + INIT_LIST_HEAD(&master_tp->tsq_node); + + master_tp->tsq_flags = 0; + + /* Copy the write-queue from the meta down to the master. + * This is necessary to get the SYN to the master-write-queue. + * No other data can be queued, before tcp_sendmsg waits for the + * connection to finish. + */ + skb_queue_walk_safe(&meta_sk->sk_write_queue, skb, tmp) { + skb_unlink(skb, &meta_sk->sk_write_queue); + skb_queue_tail(&master_sk->sk_write_queue, skb); + + master_sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(master_sk, skb->truesize); + } + + meta_sk->sk_wmem_queued = 0; + meta_sk->sk_forward_alloc = 0; + + mutex_init(&mpcb->mpcb_mutex); + + /* Init the accept_queue structure, we support a queue of 32 pending + * connections, it does not need to be huge, since we only store here + * pending subflow creations. + */ + if (reqsk_queue_alloc(&meta_icsk->icsk_accept_queue, 32, GFP_ATOMIC)) { + inet_put_port(master_sk); + kmem_cache_free(mptcp_sock_cache, meta_tp->mptcp); + kmem_cache_free(mptcp_cb_cache, mpcb); + sk_free(master_sk); + meta_tp->mpc = 0; + return -ENOMEM; + } + + /* Redefine function-pointers as the meta-sk is now fully ready */ + meta_sk->sk_backlog_rcv = mptcp_backlog_rcv; + meta_sk->sk_destruct = mptcp_sock_destruct; + mpcb->syn_recv_sock = mptcp_syn_recv_sock; + + /* Meta-level retransmit timer */ + meta_icsk->icsk_rto *= 2; /* Double of initial - rto */ + + tcp_init_xmit_timers(master_sk); + /* Has been set for sending out the SYN */ + inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS); + + if (!meta_tp->inside_tk_table) { + /* Adding the meta_tp in the token hashtable - coming from server-side */ + rcu_read_lock(); + spin_lock(&mptcp_tk_hashlock); + + __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token); + + spin_unlock(&mptcp_tk_hashlock); + rcu_read_unlock(); + } + master_tp->inside_tk_table = 0; + + /* Init time-wait stuff */ + INIT_LIST_HEAD(&mpcb->tw_list); + spin_lock_init(&mpcb->tw_lock); + + INIT_LIST_HEAD(&mpcb->callback_list); + + mptcp_mpcb_inherit_sockopts(meta_sk, master_sk); + + mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf; + mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf; + mpcb->orig_window_clamp = meta_tp->window_clamp; + + /* The meta is directly linked - set refcnt to 1 */ + atomic_set(&mpcb->mpcb_refcnt, 1); + + mptcp_init_path_manager(mpcb); + + mptcp_debug("%s: created mpcb with token %#x\n", + __func__, mpcb->mptcp_loc_token); + + return 0; +} + +struct sock *mptcp_sk_clone(const struct sock *sk, int family, + const gfp_t priority) +{ + struct sock *newsk = NULL; + + if (family == AF_INET && sk->sk_family == AF_INET) { + newsk = sk_prot_alloc(&tcp_prot, priority, family); + if (!newsk) + return NULL; + + /* Set these pointers - they are needed by mptcp_inherit_sk */ + newsk->sk_prot = &tcp_prot; + newsk->sk_prot_creator = &tcp_prot; + inet_csk(newsk)->icsk_af_ops = &ipv4_specific; + newsk->sk_family = AF_INET; + } +#if IS_ENABLED(CONFIG_IPV6) + else { + 
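+ /* Either the original socket or the requested family is IPv6. An IPv4
+  * subflow of an IPv6 meta-socket uses the mapped ops, hence the check on
+  * 'family' below.
+  */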
newsk = sk_prot_alloc(&tcpv6_prot, priority, family); + if (!newsk) + return NULL; + + newsk->sk_prot = &tcpv6_prot; + newsk->sk_prot_creator = &tcpv6_prot; + if (family == AF_INET) + inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; + else + inet_csk(newsk)->icsk_af_ops = &ipv6_specific; + newsk->sk_family = AF_INET6; + } +#endif + + if (mptcp_inherit_sk(sk, newsk, family, priority)) + return NULL; + + return newsk; +} + +void mptcp_fallback_meta_sk(struct sock *meta_sk) +{ + kfree(inet_csk(meta_sk)->icsk_accept_queue.listen_opt); + kmem_cache_free(mptcp_sock_cache, tcp_sk(meta_sk)->mptcp); + kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb); +} + +int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id, + gfp_t flags) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct tcp_sock *tp = tcp_sk(sk); + + tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags); + if (!tp->mptcp) + return -ENOMEM; + + tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb); + /* No more space for more subflows? */ + if (!tp->mptcp->path_index) { + kmem_cache_free(mptcp_sock_cache, tp->mptcp); + return -EPERM; + } + + INIT_LIST_HEAD(&tp->mptcp->cb_list); + + tp->mptcp->tp = tp; + tp->mpcb = mpcb; + tp->meta_sk = meta_sk; + tp->mpc = 1; + tp->mptcp->loc_id = loc_id; + tp->mptcp->rem_id = rem_id; + tp->mptcp->last_rbuf_opti = tcp_time_stamp; + + /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be + * included in mptcp_del_sock(), because the mpcb must remain alive + * until the last subsocket is completely destroyed. + */ + sock_hold(meta_sk); + atomic_inc(&mpcb->mpcb_refcnt); + + tp->mptcp->next = mpcb->connection_list; + mpcb->connection_list = tp; + tp->mptcp->attached = 1; + + mpcb->cnt_subflows++; + atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc), + &meta_sk->sk_rmem_alloc); + + mptcp_sub_inherit_sockopts(meta_sk, sk); + INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq); + + /* As we successfully allocated the mptcp_tcp_sock, we have to + * change the function-pointers here (for sk_destruct to work correctly) + */ + sk->sk_error_report = mptcp_sock_def_error_report; + sk->sk_data_ready = mptcp_data_ready; + sk->sk_write_space = mptcp_write_space; + sk->sk_state_change = mptcp_set_state; + sk->sk_destruct = mptcp_sock_destruct; + + if (sk->sk_family == AF_INET) + mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n", + __func__ , mpcb->mptcp_loc_token, + tp->mptcp->path_index, + &((struct inet_sock *)tp)->inet_saddr, + ntohs(((struct inet_sock *)tp)->inet_sport), + &((struct inet_sock *)tp)->inet_daddr, + ntohs(((struct inet_sock *)tp)->inet_dport), + mpcb->cnt_subflows); + else + mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n", + __func__ , mpcb->mptcp_loc_token, + tp->mptcp->path_index, &inet6_sk(sk)->saddr, + ntohs(((struct inet_sock *)tp)->inet_sport), + &inet6_sk(sk)->daddr, + ntohs(((struct inet_sock *)tp)->inet_dport), + mpcb->cnt_subflows); + + return 0; +} + +void mptcp_del_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk), *tp_prev; + struct mptcp_cb *mpcb; + + if (!tp->mptcp || !tp->mptcp->attached) + return; + + mpcb = tp->mpcb; + tp_prev = mpcb->connection_list; + + mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? 
%d\n", + __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index, + sk->sk_state, is_meta_sk(sk)); + + if (tp_prev == tp) { + mpcb->connection_list = tp->mptcp->next; + } else { + for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) { + if (tp_prev->mptcp->next == tp) { + tp_prev->mptcp->next = tp->mptcp->next; + break; + } + } + } + mpcb->cnt_subflows--; + if (tp->mptcp->establish_increased) + mpcb->cnt_established--; + + tp->mptcp->next = NULL; + tp->mptcp->attached = 0; + mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index); + + if (!skb_queue_empty(&sk->sk_write_queue)) + mptcp_reinject_data(sk, 0); + + if (is_master_tp(tp)) + mpcb->master_sk = NULL; + else if (tp->mptcp->pre_established) + sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); + + rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL); +} + +/* Updates the metasocket ULID/port data, based on the given sock. + * The argument sock must be the sock accessible to the application. + * In this function, we update the meta socket info, based on the changes + * in the application socket (bind, address allocation, ...) + */ +void mptcp_update_metasocket(struct sock *sk, struct sock *meta_sk) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + union inet_addr addr; + int id; + + /* Get the local address-id */ + if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) { + addr.ip = inet_sk(sk)->inet_saddr; + id = mpcb->pm_ops->get_local_id(AF_INET, &addr, sock_net(meta_sk)); + } else { + addr.in6 = inet6_sk(sk)->saddr; + id = mpcb->pm_ops->get_local_id(AF_INET6, &addr, sock_net(meta_sk)); + } + + if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) { + mptcp_v4_add_raddress(mpcb, + (struct in_addr *)&inet_sk(sk)->inet_daddr, + 0, 0); + if (id >= 0) + mptcp_v4_set_init_addr_bit(mpcb, inet_sk(sk)->inet_daddr, id); + } else { +#if IS_ENABLED(CONFIG_IPV6) + mptcp_v6_add_raddress(mpcb, &inet6_sk(sk)->daddr, 0, 0); + if (id >= 0) + mptcp_v6_set_init_addr_bit(mpcb, &inet6_sk(sk)->daddr, id); +#endif + } + + if (mpcb->pm_ops->new_session) + mpcb->pm_ops->new_session(meta_sk, id); + + tcp_sk(sk)->mptcp->send_mp_prio = tcp_sk(sk)->mptcp->low_prio; +} + +/* Clean up the receive buffer for full frames taken by the user, + * then send an ACK if necessary. COPIED is the number of bytes + * tcp_recvmsg has given to the user so far, it speeds up the + * calculation of whether or not we must ACK for the sake of + * a window update. + */ +void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct sock *sk; + __u32 rcv_window_now = 0; + + if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) { + rcv_window_now = tcp_receive_window(meta_tp); + + if (2 * rcv_window_now > meta_tp->window_clamp) + rcv_window_now = 0; + } + + mptcp_for_each_sk(meta_tp->mpcb, sk) { + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (!mptcp_sk_can_send_ack(sk)) + continue; + + if (!inet_csk_ack_scheduled(sk)) + goto second_part; + /* Delayed ACKs frequently hit locked sockets during bulk + * receive. + */ + if (icsk->icsk_ack.blocked || + /* Once-per-two-segments ACK was not sent by tcp_input.c */ + tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || + /* If this read emptied read buffer, we send ACK, if + * connection is not bidirectional, user drained + * receive buffer and there was a small segment + * in queue. 
+ */ + (copied > 0 && + ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || + ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && + !icsk->icsk_ack.pingpong)) && + !atomic_read(&meta_sk->sk_rmem_alloc))) { + tcp_send_ack(sk); + continue; + } + +second_part: + /* This here is the second part of tcp_cleanup_rbuf */ + if (rcv_window_now) { + __u32 new_window = __tcp_select_window(sk); + + /* Send ACK now, if this read freed lots of space + * in our buffer. Certainly, new_window is new window. + * We can advertise it now, if it is not less than + * current one. + * "Lots" means "at least twice" here. + */ + if (new_window && new_window >= 2 * rcv_window_now) + tcp_send_ack(sk); + } + } +} + +static int mptcp_sub_send_fin(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = tcp_write_queue_tail(sk); + int mss_now; + + /* Optimization, tack on the FIN if we have a queue of + * unsent frames. But be careful about outgoing SACKS + * and IP options. + */ + mss_now = tcp_current_mss(sk); + + if (tcp_send_head(sk) != NULL) { + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; + TCP_SKB_CB(skb)->end_seq++; + tp->write_seq++; + } else { + skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC); + if (!skb) + return 1; + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ + tcp_init_nondata_skb(skb, tp->write_seq, + TCPHDR_ACK | TCPHDR_FIN); + tcp_queue_skb(sk, skb); + } + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); + + return 0; +} + +void mptcp_sub_close_wq(struct work_struct *work) +{ + struct mptcp_tcp_sock *mptcp = container_of(work, struct mptcp_tcp_sock, work.work); + struct tcp_sock *tp = mptcp->tp; + struct sock *sk = (struct sock *)tp; + struct sock *meta_sk = mptcp_meta_sk(sk); + + mutex_lock(&tp->mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + if (sock_flag(sk, SOCK_DEAD)) + goto exit; + + /* We come from tcp_disconnect. We are sure that meta_sk is set */ + if (!tp->mpc) { + tp->closing = 1; + sock_rps_reset_flow(sk); + tcp_close(sk, 0); + goto exit; + } + + if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) { + tp->closing = 1; + sock_rps_reset_flow(sk); + tcp_close(sk, 0); + } else if (tcp_close_state(sk)) { + sk->sk_shutdown |= SEND_SHUTDOWN; + tcp_send_fin(sk); + } + +exit: + release_sock(meta_sk); + mutex_unlock(&tp->mpcb->mpcb_mutex); + sock_put(sk); +} + +void mptcp_sub_close(struct sock *sk, unsigned long delay) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct delayed_work *work = &tcp_sk(sk)->mptcp->work; + + /* We are already closing - e.g., call from sock_def_error_report upon + * tcp_disconnect in tcp_close. + */ + if (tp->closing) + return; + + /* Work already scheduled ? */ + if (work_pending(&work->work)) { + /* Work present - who will be first ? */ + if (jiffies + delay > work->timer.expires) + return; + + /* Try canceling - if it fails, work will be executed soon */ + if (!cancel_delayed_work(work)) + return; + sock_put(sk); + } + + if (!delay) { + unsigned char old_state = sk->sk_state; + + /* If we are in user-context we can directly do the closing + * procedure. No need to schedule a work-queue. 
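+ * The in_softirq() test below makes that distinction; otherwise the final
+ * close is deferred to mptcp_wq at the end of this function.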
+ */ + if (!in_softirq()) { + if (sock_flag(sk, SOCK_DEAD)) + return; + + if (!tp->mpc) { + tp->closing = 1; + sock_rps_reset_flow(sk); + tcp_close(sk, 0); + return; + } + + if (mptcp_meta_sk(sk)->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) { + tp->closing = 1; + sock_rps_reset_flow(sk); + tcp_close(sk, 0); + } else if (tcp_close_state(sk)) { + sk->sk_shutdown |= SEND_SHUTDOWN; + tcp_send_fin(sk); + } + + return; + } + + /* We directly send the FIN. Because it may take so a long time, + * untile the work-queue will get scheduled... + * + * If mptcp_sub_send_fin returns 1, it failed and thus we reset + * the old state so that tcp_close will finally send the fin + * in user-context. + */ + if (!sk->sk_err && old_state != TCP_CLOSE && + tcp_close_state(sk) && mptcp_sub_send_fin(sk)) { + if (old_state == TCP_ESTABLISHED) + TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); + sk->sk_state = old_state; + } + } + + sock_hold(sk); + queue_delayed_work(mptcp_wq, work, delay); +} + +void mptcp_sub_force_close(struct sock *sk) +{ + /* The below tcp_done may have freed the socket, if he is already dead. + * Thus, we are not allowed to access it afterwards. That's why + * we have to store the dead-state in this local variable. + */ + int sock_is_dead = sock_flag(sk, SOCK_DEAD); + + tcp_sk(sk)->mp_killed = 1; + + if (sk->sk_state != TCP_CLOSE) + tcp_done(sk); + + if (!sock_is_dead) + mptcp_sub_close(sk, 0); +} +EXPORT_SYMBOL(mptcp_sub_force_close); + +/* Update the mpcb send window, based on the contributions + * of each subflow + */ +void mptcp_update_sndbuf(struct mptcp_cb *mpcb) +{ + struct sock *meta_sk = mpcb->meta_sk, *sk; + int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf; + mptcp_for_each_sk(mpcb, sk) { + if (!mptcp_sk_can_send(sk)) + continue; + + new_sndbuf += sk->sk_sndbuf; + + if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) { + new_sndbuf = sysctl_tcp_wmem[2]; + break; + } + } + meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf); + + /* The subflow's call to sk_write_space in tcp_new_space ends up in + * mptcp_write_space. + * It has nothing to do with waking up the application. + * So, we do it here. + */ + if (old_sndbuf != meta_sk->sk_sndbuf) + meta_sk->sk_write_space(meta_sk); +} + +void mptcp_close(struct sock *meta_sk, long timeout) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct sock *sk_it, *tmpsk; + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct sk_buff *skb; + int data_was_unread = 0; + int state; + + mptcp_debug("%s: Close of meta_sk with tok %#x\n", + __func__, mpcb->mptcp_loc_token); + + mutex_lock(&mpcb->mpcb_mutex); + lock_sock(meta_sk); + + if (meta_tp->inside_tk_table) { + /* Detach the mpcb from the token hashtable */ + mptcp_hash_remove_bh(meta_tp); + reqsk_queue_destroy(&inet_csk(meta_sk)->icsk_accept_queue); + } + + meta_sk->sk_shutdown = SHUTDOWN_MASK; + /* We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + * reader process may not have drained the data yet! + */ + while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) { + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - + tcp_hdr(skb)->fin; + data_was_unread += len; + __kfree_skb(skb); + } + + sk_mem_reclaim(meta_sk); + + /* If socket has been already reset (e.g. in tcp_reset()) - kill it. 
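+ * Subflows that are already retransmitting an MP_FASTCLOSE are left alone;
+ * the reset (or its timeout) will tear them down.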
*/ + if (meta_sk->sk_state == TCP_CLOSE) { + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { + if (tcp_sk(sk_it)->send_mp_fclose) + continue; + mptcp_sub_close(sk_it, 0); + } + goto adjudge_to_death; + } + + if (data_was_unread) { + /* Unread data was tossed, zap the connection. */ + NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE); + tcp_set_state(meta_sk, TCP_CLOSE); + tcp_send_active_reset(meta_sk, meta_sk->sk_allocation); + } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) { + /* Check zero linger _after_ checking for unread data. */ + meta_sk->sk_prot->disconnect(meta_sk, 0); + NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); + } else if (tcp_close_state(meta_sk)) { + mptcp_send_fin(meta_sk); + } else if (meta_tp->snd_una == meta_tp->write_seq) { + /* The DATA_FIN has been sent and acknowledged + * (e.g., by sk_shutdown). Close all the other subflows + */ + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { + unsigned long delay = 0; + /* If we are the passive closer, don't trigger + * subflow-fin until the subflow has been finned + * by the peer. - thus we add a delay + */ + if (mpcb->passive_close && + sk_it->sk_state == TCP_ESTABLISHED) + delay = inet_csk(sk_it)->icsk_rto << 3; + + mptcp_sub_close(sk_it, delay); + } + } + + sk_stream_wait_close(meta_sk, timeout); + +adjudge_to_death: + state = meta_sk->sk_state; + sock_hold(meta_sk); + sock_orphan(meta_sk); + + /* socket will be freed after mptcp_close - we have to prevent + * access from the subflows. + */ + mptcp_for_each_sk(mpcb, sk_it) { + /* Similar to sock_orphan, but we don't set it DEAD, because + * the callbacks are still set and must be called. + */ + write_lock_bh(&sk_it->sk_callback_lock); + sk_set_socket(sk_it, NULL); + sk_it->sk_wq = NULL; + write_unlock_bh(&sk_it->sk_callback_lock); + } + + /* It is the last release_sock in its life. It will remove backlog. */ + release_sock(meta_sk); + + /* Now socket is owned by kernel and we acquire BH lock + * to finish close. No need to check for user refs. + */ + local_bh_disable(); + bh_lock_sock(meta_sk); + WARN_ON(sock_owned_by_user(meta_sk)); + + percpu_counter_inc(meta_sk->sk_prot->orphan_count); + + /* Have we already been destroyed by a softirq or backlog? */ + if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE) + goto out; + + /* This is a (useful) BSD violating of the RFC. There is a + * problem with TCP as specified in that the other end could + * keep a socket open forever with no application left this end. + * We use a 3 minute timeout (about the same as BSD) then kill + * our end. If they send after that then tough - BUT: long enough + * that we won't make the old 4*rto = almost no time - whoops + * reset mistake. + * + * Nope, it was not mistake. It is really desired behaviour + * f.e. on http servers, when such sockets are useless, but + * consume significant resources. Let's do it with special + * linger2 option. 
--ANK + */ + + if (meta_sk->sk_state == TCP_FIN_WAIT2) { + if (meta_tp->linger2 < 0) { + tcp_set_state(meta_sk, TCP_CLOSE); + tcp_send_active_reset(meta_sk, GFP_ATOMIC); + NET_INC_STATS_BH(sock_net(meta_sk), + LINUX_MIB_TCPABORTONLINGER); + } else { + const int tmo = tcp_fin_time(meta_sk); + + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(meta_sk, + tmo - TCP_TIMEWAIT_LEN); + } else { + tcp_time_wait(meta_sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + } + if (meta_sk->sk_state != TCP_CLOSE) { + sk_mem_reclaim(meta_sk); + if (tcp_too_many_orphans(meta_sk, 0)) { + if (net_ratelimit()) + pr_info("MPTCP: too many orphaned sockets\n"); + tcp_set_state(meta_sk, TCP_CLOSE); + tcp_send_active_reset(meta_sk, GFP_ATOMIC); + NET_INC_STATS_BH(sock_net(meta_sk), + LINUX_MIB_TCPABORTONMEMORY); + } + } + + + if (meta_sk->sk_state == TCP_CLOSE) + inet_csk_destroy_sock(meta_sk); + /* Otherwise, socket is reprieved until protocol close. */ + +out: + bh_unlock_sock(meta_sk); + local_bh_enable(); + mutex_unlock(&mpcb->mpcb_mutex); + sock_put(meta_sk); /* Taken by sock_hold */ +} + +void mptcp_disconnect(struct sock *sk) +{ + struct sock *subsk, *tmpsk; + struct tcp_sock *tp = tcp_sk(sk); + + __skb_queue_purge(&tp->mpcb->reinject_queue); + + if (tp->inside_tk_table) { + mptcp_hash_remove_bh(tp); + reqsk_queue_destroy(&inet_csk(tp->meta_sk)->icsk_accept_queue); + } + + local_bh_disable(); + mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) { + /* The socket will get removed from the subsocket-list + * and made non-mptcp by setting mpc to 0. + * + * This is necessary, because tcp_disconnect assumes + * that the connection is completely dead afterwards. + * Thus we need to do a mptcp_del_sock. Due to this call + * we have to make it non-mptcp. + * + * We have to lock the socket, because we set mpc to 0. + * An incoming packet would take the subsocket's lock + * and go on into the receive-path. + * This would be a race. + */ + + bh_lock_sock(subsk); + mptcp_del_sock(subsk); + tcp_sk(subsk)->mpc = 0; + mptcp_sub_force_close(subsk); + bh_unlock_sock(subsk); + } + local_bh_enable(); + + tp->was_meta_sk = 1; + tp->mpc = 0; +} + + +/* Returns 1 if we should enable MPTCP for that socket.
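+ * MPTCP is refused over loopback and when TCP_MD5SIG is in use, since there is not enough TCP option space for both.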
*/ +int mptcp_doit(struct sock *sk) +{ + /* Do not allow MPTCP enabling if the MPTCP initialization failed */ + if (mptcp_init_failed) + return 0; + + if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled) + return 0; + + /* Socket may already be established (e.g., called from tcp_recvmsg) */ + if (tcp_sk(sk)->mpc || tcp_sk(sk)->request_mptcp) + return 1; + + /* Don't do mptcp over loopback */ + if (sk->sk_family == AF_INET && + (ipv4_is_loopback(inet_sk(sk)->inet_daddr) || + ipv4_is_loopback(inet_sk(sk)->inet_saddr))) + return 0; + if (sk->sk_family == AF_INET6 && + (ipv6_addr_loopback(&inet6_sk(sk)->daddr) || + ipv6_addr_loopback(&inet6_sk(sk)->saddr))) + return 0; + if (mptcp_v6_is_v4_mapped(sk) && + ipv4_is_loopback(inet_sk(sk)->inet_saddr)) + return 0; + +#ifdef CONFIG_TCP_MD5SIG + /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */ + if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk)) + return 0; +#endif + + return 1; +} + +int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window) +{ + struct tcp_sock *master_tp; + struct sock *master_sk; + + if (mptcp_alloc_mpcb(meta_sk, remote_key, window)) + goto err_alloc_mpcb; + + master_sk = tcp_sk(meta_sk)->mpcb->master_sk; + master_tp = tcp_sk(master_sk); + + if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC)) + goto err_add_sock; + + if (__inet_inherit_port(meta_sk, master_sk) < 0) + goto err_add_sock; + + meta_sk->sk_prot->unhash(meta_sk); + + if (master_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(master_sk)) + __inet_hash_nolisten(master_sk, NULL); +#if IS_ENABLED(CONFIG_IPV6) + else + __inet6_hash(master_sk, NULL); +#endif + + master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd; + + return 0; + +err_add_sock: + mptcp_fallback_meta_sk(meta_sk); + + inet_csk_prepare_forced_close(master_sk); + tcp_done(master_sk); + inet_csk_prepare_forced_close(meta_sk); + tcp_done(meta_sk); + +err_alloc_mpcb: + return -ENOBUFS; +} + +int mptcp_check_req_master(struct sock *sk, struct sock *child, + struct request_sock *req, + struct request_sock **prev, + struct mptcp_options_received *mopt) +{ + struct tcp_sock *child_tp = tcp_sk(child); + struct sock *meta_sk = child; + struct mptcp_cb *mpcb; + struct mptcp_request_sock *mtreq; + + if (!tcp_rsk(req)->saw_mpc) + return 1; + + /* Just set these values to pass them to mptcp_alloc_mpcb */ + mtreq = mptcp_rsk(req); + child_tp->mptcp_loc_key = mtreq->mptcp_loc_key; + child_tp->mptcp_loc_token = mtreq->mptcp_loc_token; + + if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key, + child_tp->snd_wnd)) + return -ENOBUFS; + + child = tcp_sk(child)->mpcb->master_sk; + child_tp = tcp_sk(child); + mpcb = child_tp->mpcb; + + child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn; + child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn; + + mpcb->dss_csum = mtreq->dss_csum; + mpcb->server_side = 1; + + /* Will be moved to ESTABLISHED by tcp_rcv_state_process() */ + mptcp_update_metasocket(child, meta_sk); + + /* Needs to be done here additionally, because when accepting a + * new connection we pass by __reqsk_free and not reqsk_free. + */ + mptcp_reqsk_remove_tk(req); + + /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock.
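+ * That reference is released right below.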
*/ + sock_put(meta_sk); + + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); + inet_csk_reqsk_queue_add(sk, req, meta_sk); + + return 0; +} + +struct sock *mptcp_check_req_child(struct sock *meta_sk, struct sock *child, + struct request_sock *req, + struct request_sock **prev, + struct mptcp_options_received *mopt) +{ + struct tcp_sock *child_tp = tcp_sk(child); + struct mptcp_request_sock *mtreq = mptcp_rsk(req); + struct mptcp_cb *mpcb = mtreq->mpcb; + u8 hash_mac_check[20]; + + child_tp->inside_tk_table = 0; + + if (!mopt->join_ack) + goto teardown; + + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, + (u8 *)&mpcb->mptcp_loc_key, + (u8 *)&mtreq->mptcp_rem_nonce, + (u8 *)&mtreq->mptcp_loc_nonce, + (u32 *)hash_mac_check); + + if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20)) + goto teardown; + + /* Point it to the same struct socket and wq as the meta_sk */ + sk_set_socket(child, meta_sk->sk_socket); + child->sk_wq = meta_sk->sk_wq; + + if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) { + child_tp->mpc = 0; /* Has been inherited, but now + * child_tp->mptcp is NULL + */ + /* TODO when we support acking the third ack for new subflows, + * we should silently discard this third ack, by returning NULL. + * + * Maybe, at the retransmission we will have enough memory to + * fully add the socket to the meta-sk. + */ + goto teardown; + } + + /* The child is a clone of the meta socket, we must now reset + * some of the fields + */ + child_tp->mptcp->rcv_low_prio = mtreq->low_prio; + + /* We should allow proper increase of the snd/rcv-buffers. Thus, we + * use the original values instead of the bloated up ones from the + * clone. + */ + child->sk_sndbuf = mpcb->orig_sk_sndbuf; + child->sk_rcvbuf = mpcb->orig_sk_rcvbuf; + + child_tp->mptcp->slave_sk = 1; + child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn; + child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn; + child_tp->mptcp->init_rcv_wnd = req->rcv_wnd; + + child_tp->tsq_flags = 0; + + /* Subflows do not use the accept queue, as they + * are attached immediately to the mpcb. + */ + inet_csk_reqsk_queue_drop(meta_sk, req, prev); + return child; + +teardown: + /* Drop this request - sock creation failed. */ + inet_csk_reqsk_queue_drop(meta_sk, req, prev); + inet_csk_prepare_forced_close(child); + tcp_done(child); + return meta_sk; +} + +int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw) +{ + struct mptcp_tw *mptw; + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_cb *mpcb = tp->mpcb; + + /* Alloc MPTCP-tw-sock */ + mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC); + if (!mptw) + return -ENOBUFS; + + atomic_inc(&mpcb->mpcb_refcnt); + + tw->mptcp_tw = mptw; + mptw->loc_key = mpcb->mptcp_loc_key; + mptw->meta_tw = mpcb->in_time_wait; + if (mptw->meta_tw) { + mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp)); + if (mpcb->mptw_state != TCP_TIME_WAIT) + mptw->rcv_nxt++; + } + rcu_assign_pointer(mptw->mpcb, mpcb); + + spin_lock(&mpcb->tw_lock); + list_add_rcu(&mptw->list, &tp->mpcb->tw_list); + mptw->in_list = 1; + spin_unlock(&mpcb->tw_lock); + + return 0; +} + +void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) +{ + struct mptcp_cb *mpcb; + + rcu_read_lock(); + mpcb = rcu_dereference(tw->mptcp_tw->mpcb); + + /* If we are still holding a ref to the mpcb, we have to remove ourself + * from the list and drop the ref properly. 
+ */ + if (mpcb && atomic_inc_not_zero(&mpcb->mpcb_refcnt)) { + spin_lock(&mpcb->tw_lock); + if (tw->mptcp_tw->in_list) { + list_del_rcu(&tw->mptcp_tw->list); + tw->mptcp_tw->in_list = 0; + } + spin_unlock(&mpcb->tw_lock); + + /* Twice, because we increased it above */ + mptcp_mpcb_put(mpcb); + mptcp_mpcb_put(mpcb); + } + + rcu_read_unlock(); + + kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw); +} + +/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a + * data-fin. + */ +void mptcp_update_tw_socks(const struct tcp_sock *tp, int state) +{ + struct mptcp_tw *mptw; + + /* Used for sockets that go into tw after the meta + * (see mptcp_time_wait()) + */ + tp->mpcb->in_time_wait = 1; + tp->mpcb->mptw_state = state; + + /* Update the time-wait-sock's information */ + rcu_read_lock_bh(); + list_for_each_entry_rcu(mptw, &tp->mpcb->tw_list, list) { + mptw->meta_tw = 1; + mptw->rcv_nxt = mptcp_get_rcv_nxt_64(tp); + + /* We want to ack a DATA_FIN, but are yet in FIN_WAIT_2 - + * pretend as if the DATA_FIN has already reached us, that way + * the checks in tcp_timewait_state_process will be good as the + * DATA_FIN comes in. + */ + if (state != TCP_TIME_WAIT) + mptw->rcv_nxt++; + } + rcu_read_unlock_bh(); +} + +void mptcp_tsq_flags(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = mptcp_meta_sk(sk); + + /* It will be handled as a regular deferred-call */ + if (is_meta_sk(sk)) + return; + + if (list_empty(&tp->mptcp->cb_list)) { + list_add(&tp->mptcp->cb_list, &tp->mpcb->callback_list); + /* We need to hold it here, as the sock_hold is not assured + * by the release_sock as it is done in regular TCP. + * + * The subsocket may get inet_csk_destroy'd while it is inside + * the callback_list. + */ + sock_hold(sk); + } + + if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &tcp_sk(meta_sk)->tsq_flags)) + sock_hold(meta_sk); +} + +void mptcp_tsq_sub_deferred(struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_tcp_sock *mptcp, *tmp; + + BUG_ON(!is_meta_sk(meta_sk) && !meta_tp->was_meta_sk); + + __sock_put(meta_sk); + list_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) { + struct tcp_sock *tp = mptcp->tp; + struct sock *sk = (struct sock *)tp; + + list_del_init(&mptcp->cb_list); + sk->sk_prot->release_cb(sk); + /* Final sock_put (cfr. 
mptcp_tsq_flags */ + sock_put(sk); + } +} + +struct workqueue_struct *mptcp_wq; +EXPORT_SYMBOL(mptcp_wq); + +/* Output /proc/net/mptcp */ +static int mptcp_pm_seq_show(struct seq_file *seq, void *v) +{ + struct tcp_sock *meta_tp; + struct net *net = seq->private; + int i, n = 0; + + seq_printf(seq, " sl loc_tok rem_tok v6 " + "local_address " + "remote_address " + "st ns tx_queue rx_queue inode"); + seq_putc(seq, '\n'); + + for (i = 0; i < MPTCP_HASH_SIZE; i++) { + struct hlist_nulls_node *node; + rcu_read_lock_bh(); + hlist_nulls_for_each_entry_rcu(meta_tp, node, + &tk_hashtable[i], tk_table) { + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct sock *meta_sk = (struct sock *)meta_tp; + struct inet_sock *isk = inet_sk(meta_sk); + + if (!meta_tp->mpc || !net_eq(net, sock_net(meta_sk))) + continue; + + seq_printf(seq, "%4d: %04X %04X ", n++, + mpcb->mptcp_loc_token, + mpcb->mptcp_rem_token); + if (meta_sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(meta_sk)) { + seq_printf(seq, " 0 %08X:%04X %08X:%04X ", + isk->inet_rcv_saddr, + ntohs(isk->inet_sport), + isk->inet_daddr, + ntohs(isk->inet_dport)); +#if IS_ENABLED(CONFIG_IPV6) + } else if (meta_sk->sk_family == AF_INET6) { + struct in6_addr *src = &isk->pinet6->rcv_saddr; + struct in6_addr *dst = &isk->pinet6->daddr; + seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X", + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], + ntohs(isk->inet_sport), + dst->s6_addr32[0], dst->s6_addr32[1], + dst->s6_addr32[2], dst->s6_addr32[3], + ntohs(isk->inet_dport)); +#endif + } + seq_printf(seq, " %02X %02X %08X:%08X %lu", + meta_sk->sk_state, mpcb->cnt_subflows, + meta_tp->write_seq - meta_tp->snd_una, + max_t(int, meta_tp->rcv_nxt - + meta_tp->copied_seq, 0), + sock_i_ino(meta_sk)); + seq_putc(seq, '\n'); + } + rcu_read_unlock_bh(); + } + + return 0; +} + +static int mptcp_pm_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, mptcp_pm_seq_show); +} + +static const struct file_operations mptcp_pm_seq_fops = { + .owner = THIS_MODULE, + .open = mptcp_pm_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static int mptcp_pm_init_net(struct net *net) +{ + if (!proc_create("mptcp", S_IRUGO, net->proc_net, &mptcp_pm_seq_fops)) + return -ENOMEM; + + return 0; +} + +static void mptcp_pm_exit_net(struct net *net) +{ + remove_proc_entry("mptcp", net->proc_net); +} + +static struct pernet_operations mptcp_pm_proc_ops = { + .init = mptcp_pm_init_net, + .exit = mptcp_pm_exit_net, +}; + +/* General initialization of mptcp */ +void __init mptcp_init(void) +{ + int i; + struct ctl_table_header *mptcp_sysctl; + + mptcp_sock_cache = kmem_cache_create("mptcp_sock", + sizeof(struct mptcp_tcp_sock), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!mptcp_sock_cache) + goto mptcp_sock_cache_failed; + + mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb), + 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN, + NULL); + if (!mptcp_cb_cache) + goto mptcp_cb_cache_failed; + + mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw), + 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN, + NULL); + if (!mptcp_tw_cache) + goto mptcp_tw_cache_failed; + + get_random_bytes(mptcp_secret, sizeof(mptcp_secret)); + + mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); + if (!mptcp_wq) + goto alloc_workqueue_failed; + + for (i = 0; i < MPTCP_HASH_SIZE; i++) { + INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i); + INIT_LIST_HEAD(&mptcp_reqsk_htb[i]); 
+ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i); + } + + spin_lock_init(&mptcp_reqsk_hlock); + spin_lock_init(&mptcp_tk_hashlock); + + if (register_pernet_subsys(&mptcp_pm_proc_ops)) + goto pernet_failed; + +#if IS_ENABLED(CONFIG_IPV6) + if (mptcp_pm_v6_init()) + goto mptcp_pm_v6_failed; +#endif + if (mptcp_pm_v4_init()) + goto mptcp_pm_v4_failed; + + mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table); + if (!mptcp_sysctl) + goto register_sysctl_failed; + + if (mptcp_register_path_manager(&mptcp_pm_default)) + goto register_pm_failed; + + pr_info("MPTCP: release for 3.12 series, v0.88.11.1"); + + mptcp_init_failed = false; + + return; + +register_pm_failed: + unregister_net_sysctl_table(mptcp_sysctl); +register_sysctl_failed: + mptcp_pm_v4_undo(); +mptcp_pm_v4_failed: +#if IS_ENABLED(CONFIG_IPV6) + mptcp_pm_v6_undo(); +mptcp_pm_v6_failed: +#endif + unregister_pernet_subsys(&mptcp_pm_proc_ops); +pernet_failed: + destroy_workqueue(mptcp_wq); +alloc_workqueue_failed: + kmem_cache_destroy(mptcp_tw_cache); +mptcp_tw_cache_failed: + kmem_cache_destroy(mptcp_cb_cache); +mptcp_cb_cache_failed: + kmem_cache_destroy(mptcp_sock_cache); +mptcp_sock_cache_failed: + mptcp_init_failed = true; +} diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c new file mode 100644 index 0000000..4e83c38 --- /dev/null +++ b/net/mptcp/mptcp_fullmesh.c @@ -0,0 +1,1297 @@ +#include + +#include +#include + +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#endif + +enum { + MPTCP_EVENT_ADD = 1, + MPTCP_EVENT_DEL, + MPTCP_EVENT_MOD, +}; + +struct mptcp_loc_addr { + struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR]; + u8 loc4_bits; + u8 next_v4_index; + + struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR]; + u8 loc6_bits; + u8 next_v6_index; +}; + +struct mptcp_addr_event { + struct list_head list; + unsigned short family; + u8 code:7, + low_prio:1; + union { + struct in_addr addr4; + struct in6_addr addr6; + }u; +}; + +struct fullmesh_priv { + /* Worker struct for subflow establishment */ + struct work_struct subflow_work; + /* Delayed worker, when the routing-tables are not yet ready. */ + struct delayed_work subflow_retry_work; + + struct mptcp_cb *mpcb; + + u16 remove_addrs; /* Addresses to remove */ + u8 announced_addrs_v4; /* IPv4 Addresses we did announce */ + u8 announced_addrs_v6; /* IPv6 Addresses we did announce */ + + u8 add_addr; /* Are we sending an add_addr? */ +}; + +struct mptcp_fm_ns { + struct mptcp_loc_addr __rcu *local; + spinlock_t local_lock; /* Protecting the above pointer */ + struct list_head events; + struct delayed_work address_worker; + + struct net *net; +}; + +static struct mptcp_pm_ops full_mesh __read_mostly; + +static struct mptcp_fm_ns *fm_get_ns(struct net *net) +{ + return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH]; +} + +static void full_mesh_create_subflows(struct sock *meta_sk); + +static void retry_subflow_worker(struct work_struct *work) +{ + struct delayed_work *delayed_work = container_of(work, + struct delayed_work, + work); + struct fullmesh_priv *pm_priv = container_of(delayed_work, + struct fullmesh_priv, + subflow_retry_work); + struct mptcp_cb *mpcb = pm_priv->mpcb; + struct sock *meta_sk = mpcb->meta_sk; + struct mptcp_loc_addr *mptcp_local; + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); + int iter = 0, i; + + /* We need a local (stable) copy of the address-list. Really, it is not + * such a big deal, if the address-list is not 100% up-to-date.
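+ * The copy is taken under the RCU read-lock with kmemdup(), so the worker can later sleep and take the meta-socket lock without holding that lock.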
+ */ + rcu_read_lock_bh(); + mptcp_local = rcu_dereference_bh(fm_ns->local); + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC); + rcu_read_unlock_bh(); + + if (!mptcp_local) + return; + +next_subflow: + if (iter) { + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + + yield(); + } + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + iter++; + + if (sock_flag(meta_sk, SOCK_DEAD)) + goto exit; + + mptcp_for_each_bit_set(mpcb->rem4_bits, i) { + struct mptcp_rem4 *rem = &mpcb->remaddr4[i]; + /* Do we need to retry establishing a subflow ? */ + if (rem->retry_bitfield) { + int i = mptcp_find_free_index(~rem->retry_bitfield); + mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], rem); + rem->retry_bitfield &= ~(1 << i); + goto next_subflow; + } + } + +#if IS_ENABLED(CONFIG_IPV6) + mptcp_for_each_bit_set(mpcb->rem6_bits, i) { + struct mptcp_rem6 *rem = &mpcb->remaddr6[i]; + + /* Do we need to retry establishing a subflow ? */ + if (rem->retry_bitfield) { + int i = mptcp_find_free_index(~rem->retry_bitfield); + mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], rem); + rem->retry_bitfield &= ~(1 << i); + goto next_subflow; + } + } +#endif + +exit: + kfree(mptcp_local); + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + sock_put(meta_sk); +} + +/** + * Create all new subflows, by doing calls to mptcp_initX_subsockets + * + * This function uses a goto next_subflow, to allow releasing the lock between + * new subflows and giving other processes a chance to do some work on the + * socket and potentially finishing the communication. + **/ +static void create_subflow_worker(struct work_struct *work) +{ + struct fullmesh_priv *pm_priv = container_of(work, + struct fullmesh_priv, + subflow_work); + struct mptcp_cb *mpcb = pm_priv->mpcb; + struct sock *meta_sk = mpcb->meta_sk; + struct mptcp_loc_addr *mptcp_local; + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); + int iter = 0, retry = 0; + int i; + + /* We need a local (stable) copy of the address-list. Really, it is not + * such a big deal, if the address-list is not 100% up-to-date. + */ + rcu_read_lock_bh(); + mptcp_local = rcu_dereference_bh(fm_ns->local); + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC); + rcu_read_unlock_bh(); + + if (!mptcp_local) + return; + +next_subflow: + if (iter) { + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + + yield(); + } + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + iter++; + + if (sock_flag(meta_sk, SOCK_DEAD)) + goto exit; + + if (mpcb->master_sk && + !tcp_sk(mpcb->master_sk)->mptcp->fully_established) + goto exit; + + mptcp_for_each_bit_set(mpcb->rem4_bits, i) { + struct mptcp_rem4 *rem; + u8 remaining_bits; + + rem = &mpcb->remaddr4[i]; + remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits; + + /* Are there still combinations to handle? */ + if (remaining_bits) { + int i = mptcp_find_free_index(~remaining_bits); + /* If a route is not yet available then retry once */ + if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], + rem) == -ENETUNREACH) + retry = rem->retry_bitfield |= (1 << i); + goto next_subflow; + } + } + +#if IS_ENABLED(CONFIG_IPV6) + mptcp_for_each_bit_set(mpcb->rem6_bits, i) { + struct mptcp_rem6 *rem; + u8 remaining_bits; + + rem = &mpcb->remaddr6[i]; + remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits; + + /* Are there still combinations to handle? 
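+ * (i.e., local IPv6 addresses that do not yet have a subflow towards this remote address)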
*/ + if (remaining_bits) { + int i = mptcp_find_free_index(~remaining_bits); + /* If a route is not yet available then retry once */ + if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], + rem) == -ENETUNREACH) + retry = rem->retry_bitfield |= (1 << i); + goto next_subflow; + } + } +#endif + + if (retry && !delayed_work_pending(&pm_priv->subflow_retry_work)) { + sock_hold(meta_sk); + queue_delayed_work(mptcp_wq, &pm_priv->subflow_retry_work, + msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY)); + } + +exit: + kfree(mptcp_local); + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + sock_put(meta_sk); +} + +static void announce_remove_addr(u8 addr_id, struct sock *meta_sk) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0]; + struct sock *sk = mptcp_select_ack_sock(meta_sk, 0); + + fmp->remove_addrs |= (1 << addr_id); + + if (sk) + tcp_send_ack(sk); +} + +static void update_addr_bitfields(struct sock *meta_sk, + const struct mptcp_loc_addr *mptcp_local) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0]; + int i; + + /* The bits in announced_addrs_* always match with loc*_bits. So, a + * simply & operation unsets the correct bits, because these go from + * announced to non-announced + */ + fmp->announced_addrs_v4 &= mptcp_local->loc4_bits; + + mptcp_for_each_bit_set(mpcb->rem4_bits, i) { + mpcb->remaddr4[i].bitfield &= mptcp_local->loc4_bits; + mpcb->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits; + } + + fmp->announced_addrs_v6 &= mptcp_local->loc6_bits; + + mptcp_for_each_bit_set(mpcb->rem6_bits, i) { + mpcb->remaddr6[i].bitfield &= mptcp_local->loc6_bits; + mpcb->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits; + } +} + +static int mptcp_find_address(struct mptcp_loc_addr *mptcp_local, + struct mptcp_addr_event *event) +{ + int i; + u8 loc_bits; + bool found = false; + + if (event->family == AF_INET) + loc_bits = mptcp_local->loc4_bits; + else + loc_bits = mptcp_local->loc6_bits; + + mptcp_for_each_bit_set(loc_bits, i) { + if (event->family == AF_INET && + mptcp_local->locaddr4[i].addr.s_addr == event->u.addr4.s_addr) { + found = true; + break; + } + if (event->family == AF_INET6 && + ipv6_addr_equal(&mptcp_local->locaddr6[i].addr, + &event->u.addr6)) { + found = true; + break; + } + } + + if (!found) + return -1; + + return i; +} + +static void mptcp_address_worker(struct work_struct *work) +{ + struct delayed_work *delayed_work = container_of(work, + struct delayed_work, + work); + struct mptcp_fm_ns *fm_ns = container_of(delayed_work, + struct mptcp_fm_ns, + address_worker); + struct net *net = fm_ns->net; + struct mptcp_addr_event *event = NULL; + struct mptcp_loc_addr *mptcp_local, *old; + int i, id = -1; /* id is used in the socket-code on a delete-event */ + bool success; /* Used to indicate if we succeeded handling the event */ + +next_event: + success = false; + kfree(event); + + /* First, let's dequeue an event from our event-list */ + rcu_read_lock_bh(); + spin_lock(&fm_ns->local_lock); + + event = list_first_entry_or_null(&fm_ns->events, + struct mptcp_addr_event, list); + if (!event) { + spin_unlock(&fm_ns->local_lock); + rcu_read_unlock_bh(); + return; + } + + list_del(&event->list); + + mptcp_local = rcu_dereference_bh(fm_ns->local); + + if (event->code == MPTCP_EVENT_DEL) { + id = mptcp_find_address(mptcp_local, event); + + /* Not in the list - so we don't care */ + if (id < 0) + goto duno; + + old = 
mptcp_local; + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), + GFP_ATOMIC); + if (!mptcp_local) + goto duno; + + if (event->family == AF_INET) + mptcp_local->loc4_bits &= ~(1 << id); + else + mptcp_local->loc6_bits &= ~(1 << id); + + rcu_assign_pointer(fm_ns->local, mptcp_local); + kfree(old); + } else { + int i = mptcp_find_address(mptcp_local, event); + int j = i; + + if (j < 0) { + /* Not in the list, so we have to find an empty slot */ + if (event->family == AF_INET) + i = __mptcp_find_free_index(mptcp_local->loc4_bits, 0, + mptcp_local->next_v4_index); + if (event->family == AF_INET6) + i = __mptcp_find_free_index(mptcp_local->loc6_bits, 0, + mptcp_local->next_v6_index); + + if (i < 0) { + mptcp_debug("%s no more space\n", __func__); + goto duno; + } + + /* It might have been a MOD-event. */ + event->code = MPTCP_EVENT_ADD; + } else { + /* Let's check if anything changes */ + if (event->family == AF_INET && + event->low_prio == mptcp_local->locaddr4[i].low_prio) + goto duno; + + if (event->family == AF_INET6 && + event->low_prio == mptcp_local->locaddr6[i].low_prio) + goto duno; + } + + old = mptcp_local; + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), + GFP_ATOMIC); + if (!mptcp_local) + goto duno; + + if (event->family == AF_INET) { + mptcp_local->locaddr4[i].addr.s_addr = event->u.addr4.s_addr; + mptcp_local->locaddr4[i].id = i; + mptcp_local->locaddr4[i].low_prio = event->low_prio; + } else { + mptcp_local->locaddr6[i].addr = event->u.addr6; + mptcp_local->locaddr6[i].id = i + MPTCP_MAX_ADDR; + mptcp_local->locaddr6[i].low_prio = event->low_prio; + } + + if (j < 0) { + if (event->family == AF_INET) { + mptcp_local->loc4_bits |= (1 << i); + mptcp_local->next_v4_index = i + 1; + } else { + mptcp_local->loc6_bits |= (1 << i); + mptcp_local->next_v6_index = i + 1; + } + } + + rcu_assign_pointer(fm_ns->local, mptcp_local); + kfree(old); + } + success = true; + +duno: + spin_unlock(&fm_ns->local_lock); + rcu_read_unlock_bh(); + + if (!success) + goto next_event; + + /* Now we iterate over the MPTCP-sockets and apply the event. 
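+ * Meta-sockets currently owned by the user are skipped here; they get the MPTCP_PATH_MANAGER tsq-flag and are handled later in full_mesh_release_sock().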
*/ + for (i = 0; i < MPTCP_HASH_SIZE; i++) { + struct hlist_nulls_node *node; + struct tcp_sock *meta_tp; + + rcu_read_lock_bh(); + hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i], + tk_table) { + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct sock *meta_sk = (struct sock *)meta_tp, *sk; + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0]; + + if (sock_net(meta_sk) != net) + continue; + + if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt))) + continue; + + bh_lock_sock(meta_sk); + + if (!meta_tp->mpc || !is_meta_sk(meta_sk) || + mpcb->infinite_mapping_snd || + mpcb->infinite_mapping_rcv || + mpcb->send_infinite_mapping) + goto next; + + /* May be that the pm has changed in-between */ + if (mpcb->pm_ops != &full_mesh) + goto next; + + if (sock_owned_by_user(meta_sk)) { + if (!test_and_set_bit(MPTCP_PATH_MANAGER, + &meta_tp->tsq_flags)) + sock_hold(meta_sk); + + goto next; + } + + if (event->code == MPTCP_EVENT_ADD) { + if (event->family == AF_INET) + fmp->add_addr++; +#if IS_ENABLED(CONFIG_IPV6) + if (event->family == AF_INET6) + fmp->add_addr++; +#endif + + sk = mptcp_select_ack_sock(meta_sk, 0); + if (sk) + tcp_send_ack(sk); + + full_mesh_create_subflows(meta_sk); + } + + if (event->code == MPTCP_EVENT_DEL) { + struct sock *sk, *tmpsk; + struct mptcp_loc_addr *mptcp_local; + bool found = false; + + mptcp_local = rcu_dereference_bh(fm_ns->local); + + /* In any case, we need to update our bitfields */ + if (id >= 0) + update_addr_bitfields(meta_sk, mptcp_local); + + /* Look for the socket and remove him */ + mptcp_for_each_sk_safe(mpcb, sk, tmpsk) { + if ((event->family == AF_INET6 && + (sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(sk))) || + (event->family == AF_INET && + (sk->sk_family == AF_INET6 && + !mptcp_v6_is_v4_mapped(sk)))) + continue; + + if (event->family == AF_INET && + (sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(sk)) && + inet_sk(sk)->inet_saddr != event->u.addr4.s_addr) + continue; + + if (event->family == AF_INET6 && + sk->sk_family == AF_INET6 && + !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->u.addr6)) + continue; + + /* Reinject, so that pf = 1 and so we + * won't select this one as the + * ack-sock. + */ + mptcp_reinject_data(sk, 0); + + /* We announce the removal of this id */ + announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, meta_sk); + + mptcp_sub_force_close(sk); + found = true; + } + + if (found) + goto next; + + /* The id may have been given by the event, + * matching on a local address. And it may not + * have matched on one of the above sockets, + * because the client never created a subflow. + * So, we have to finally remove it here. 
+ */ + if (id > 0) + announce_remove_addr(id, meta_sk); + } + + if (event->code == MPTCP_EVENT_MOD) { + struct sock *sk; + + mptcp_for_each_sk(mpcb, sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (event->family == AF_INET && + (sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(sk)) && + inet_sk(sk)->inet_saddr == event->u.addr4.s_addr) { + if (event->low_prio != tp->mptcp->low_prio) { + tp->mptcp->send_mp_prio = 1; + tp->mptcp->low_prio = event->low_prio; + + tcp_send_ack(sk); + } + } + + if (event->family == AF_INET6 && + sk->sk_family == AF_INET6 && + !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->u.addr6)) { + if (event->low_prio != tp->mptcp->low_prio) { + tp->mptcp->send_mp_prio = 1; + tp->mptcp->low_prio = event->low_prio; + + tcp_send_ack(sk); + } + } + } + } +next: + bh_unlock_sock(meta_sk); + sock_put(meta_sk); + } + rcu_read_unlock_bh(); + } + goto next_event; +} + +static struct mptcp_addr_event *lookup_similar_event(struct net *net, + struct mptcp_addr_event *event) +{ + struct mptcp_addr_event *eventq; + struct mptcp_fm_ns *fm_ns = fm_get_ns(net); + + list_for_each_entry(eventq, &fm_ns->events, list) { + if (eventq->family != event->family) + continue; + if (event->family == AF_INET) { + if (eventq->u.addr4.s_addr == event->u.addr4.s_addr) + return eventq; + } else { + if (ipv6_addr_equal(&eventq->u.addr6, &event->u.addr6)) + return eventq; + } + } + return NULL; +} + +/* We already hold the net-namespace MPTCP-lock */ +static void add_pm_event(struct net *net, struct mptcp_addr_event *event) +{ + struct mptcp_addr_event *eventq = lookup_similar_event(net, event); + struct mptcp_fm_ns *fm_ns = fm_get_ns(net); + + if (eventq) { + switch (event->code) { + case MPTCP_EVENT_DEL: + list_del(&eventq->list); + kfree(eventq); + break; + case MPTCP_EVENT_ADD: + eventq->low_prio = event->low_prio; + eventq->code = MPTCP_EVENT_ADD; + return; + case MPTCP_EVENT_MOD: + eventq->low_prio = event->low_prio; + return; + } + } + + /* OK, we have to add the new address to the wait queue */ + eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC); + if (!eventq) + return; + + list_add_tail(&eventq->list, &fm_ns->events); + + /* Create work-queue */ + if (!delayed_work_pending(&fm_ns->address_worker)) + queue_delayed_work(mptcp_wq, &fm_ns->address_worker, + msecs_to_jiffies(500)); +} + +static void addr4_event_handler(struct in_ifaddr *ifa, unsigned long event, + struct net *net) +{ + struct net_device *netdev = ifa->ifa_dev->dev; + struct mptcp_fm_ns *fm_ns = fm_get_ns(net); + struct mptcp_addr_event mpevent; + + if (ifa->ifa_scope > RT_SCOPE_LINK || + ipv4_is_loopback(ifa->ifa_local)) + return; + + spin_lock_bh(&fm_ns->local_lock); + + mpevent.family = AF_INET; + mpevent.u.addr4.s_addr = ifa->ifa_local; + mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 
1 : 0; + + if (event == NETDEV_DOWN || !netif_running(netdev) || + (netdev->flags & IFF_NOMULTIPATH)) + mpevent.code = MPTCP_EVENT_DEL; + else if (event == NETDEV_UP) + mpevent.code = MPTCP_EVENT_ADD; + else if (event == NETDEV_CHANGE) + mpevent.code = MPTCP_EVENT_MOD; + + add_pm_event(net, &mpevent); + + spin_unlock_bh(&fm_ns->local_lock); + return; +} + +/* React on IPv4-addr add/rem-events */ +static int mptcp_pm_inetaddr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct net *net = dev_net(ifa->ifa_dev->dev); + + addr4_event_handler(ifa, event, net); + + return NOTIFY_DONE; +} + +static struct notifier_block mptcp_pm_inetaddr_notifier = { + .notifier_call = mptcp_pm_inetaddr_event, +}; + +#if IS_ENABLED(CONFIG_IPV6) + +/* IPV6-related address/interface watchers */ +struct mptcp_dad_data { + struct timer_list timer; + struct inet6_ifaddr *ifa; +}; + +static void dad_callback(unsigned long arg); +static int inet6_addr_event(struct notifier_block *this, + unsigned long event, void *ptr); + +static int ipv6_is_in_dad_state(struct inet6_ifaddr *ifa) +{ + return ((ifa->flags & IFA_F_TENTATIVE) && + ifa->state == INET6_IFADDR_STATE_DAD); +} + +static void dad_init_timer(struct mptcp_dad_data *data, + struct inet6_ifaddr *ifa) +{ + data->ifa = ifa; + data->timer.data = (unsigned long)data; + data->timer.function = dad_callback; + if (ifa->idev->cnf.rtr_solicit_delay) + data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay; + else + data->timer.expires = jiffies + (HZ/10); +} + +static void dad_callback(unsigned long arg) +{ + struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg; + + if (ipv6_is_in_dad_state(data->ifa)) { + dad_init_timer(data, data->ifa); + add_timer(&data->timer); + } else { + inet6_addr_event(NULL, NETDEV_UP, data->ifa); + in6_ifa_put(data->ifa); + kfree(data); + } +} + +static inline void dad_setup_timer(struct inet6_ifaddr *ifa) +{ + struct mptcp_dad_data *data; + + data = kmalloc(sizeof(*data), GFP_ATOMIC); + + if (!data) + return; + + init_timer(&data->timer); + dad_init_timer(data, ifa); + add_timer(&data->timer); + in6_ifa_hold(ifa); +} + +static void addr6_event_handler(struct inet6_ifaddr *ifa, unsigned long event, + struct net *net) +{ + struct net_device *netdev = ifa->idev->dev; + int addr_type = ipv6_addr_type(&ifa->addr); + struct mptcp_fm_ns *fm_ns = fm_get_ns(net); + struct mptcp_addr_event mpevent; + + if (ifa->scope > RT_SCOPE_LINK || + addr_type == IPV6_ADDR_ANY || + (addr_type & IPV6_ADDR_LOOPBACK) || + (addr_type & IPV6_ADDR_LINKLOCAL)) + return; + + spin_lock_bh(&fm_ns->local_lock); + + mpevent.family = AF_INET6; + mpevent.u.addr6 = ifa->addr; + mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 
1 : 0; + + if (event == NETDEV_DOWN ||!netif_running(netdev) || + (netdev->flags & IFF_NOMULTIPATH)) + mpevent.code = MPTCP_EVENT_DEL; + else if (event == NETDEV_UP) + mpevent.code = MPTCP_EVENT_ADD; + else if (event == NETDEV_CHANGE) + mpevent.code = MPTCP_EVENT_MOD; + + add_pm_event(net, &mpevent); + + spin_unlock_bh(&fm_ns->local_lock); + return; +} + +/* React on IPv6-addr add/rem-events */ +static int inet6_addr_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr; + struct net *net = dev_net(ifa6->idev->dev); + + if (ipv6_is_in_dad_state(ifa6)) + dad_setup_timer(ifa6); + else + addr6_event_handler(ifa6, event, net); + + return NOTIFY_DONE; +} + +static struct notifier_block inet6_addr_notifier = { + .notifier_call = inet6_addr_event, +}; + +#endif + +/* React on ifup/down-events */ +static int netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct in_device *in_dev; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev; +#endif + + if (!(event == NETDEV_UP || event == NETDEV_DOWN || + event == NETDEV_CHANGE)) + return NOTIFY_DONE; + + rcu_read_lock(); + in_dev = __in_dev_get_rtnl(dev); + + if (in_dev) { + for_ifa(in_dev) { + mptcp_pm_inetaddr_event(NULL, event, ifa); + } endfor_ifa(in_dev); + } + +#if IS_ENABLED(CONFIG_IPV6) + in6_dev = __in6_dev_get(dev); + + if (in6_dev) { + struct inet6_ifaddr *ifa6; + list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) + inet6_addr_event(NULL, event, ifa6); + } +#endif + + rcu_read_unlock(); + return NOTIFY_DONE; +} + +static struct notifier_block mptcp_pm_netdev_notifier = { + .notifier_call = netdev_event, +}; + +static void full_mesh_new_session(struct sock *meta_sk, int id) +{ + struct mptcp_loc_addr *mptcp_local; + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0]; + struct net *net = sock_net(meta_sk); + struct mptcp_fm_ns *fm_ns = fm_get_ns(net); + struct sock *sk; + int i; + + if (id == -1) { + mptcp_fallback_default(mpcb); + return; + } + + /* Initialize workqueue-struct */ + INIT_WORK(&fmp->subflow_work, create_subflow_worker); + INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker); + fmp->mpcb = mpcb; + + sk = mptcp_select_ack_sock(meta_sk, 0); + + rcu_read_lock(); + mptcp_local = rcu_dereference(fm_ns->local); + + /* Look for the address among the local addresses */ + mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { + __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr; + + /* We do not need to announce the initial subflow's address again */ + if ((meta_sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(meta_sk)) && + inet_sk(meta_sk)->inet_saddr == ifa_address) + continue; + + fmp->add_addr++; + + if (sk) + tcp_send_ack(sk); + } + +#if IS_ENABLED(CONFIG_IPV6) + mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { + struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr; + + /* We do not need to announce the initial subflow's address again */ + if (meta_sk->sk_family == AF_INET6 && + ipv6_addr_equal(&inet6_sk(meta_sk)->saddr, ifa6)) + continue; + + fmp->add_addr++; + + if (sk) + tcp_send_ack(sk); + } +#endif + + rcu_read_unlock(); + + if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk)) + fmp->announced_addrs_v4 |= (1 << id); + else + fmp->announced_addrs_v6 |= (1 << (id - MPTCP_MAX_ADDR)); +} + +static void full_mesh_create_subflows(struct sock *meta_sk) 
+{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct fullmesh_priv *pm_priv = (struct fullmesh_priv *)&mpcb->mptcp_pm[0]; + + if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv || + mpcb->send_infinite_mapping || + mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) + return; + + /* The master may not yet be fully established (address added through + * mptcp_update_metasocket). Then, we should not attempt to create new + * subflows. + */ + if (mpcb->master_sk && + !tcp_sk(mpcb->master_sk)->mptcp->fully_established) + return; + + if (!work_pending(&pm_priv->subflow_work)) { + sock_hold(meta_sk); + queue_work(mptcp_wq, &pm_priv->subflow_work); + } +} + +/* Called upon release_sock, if the socket was owned by the user during + * a path-management event. + */ +static void full_mesh_release_sock(struct sock *meta_sk) +{ + struct mptcp_loc_addr *mptcp_local; + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0]; + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); + struct sock *sk, *tmpsk; + int i; + + rcu_read_lock(); + mptcp_local = rcu_dereference(fm_ns->local); + + /* First, detect modifications or additions */ + mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { + struct in_addr ifa = mptcp_local->locaddr4[i].addr; + bool found = false; + + mptcp_for_each_sk(mpcb, sk) { + struct tcp_sock *tp = tcp_sk(sk); + + if (sk->sk_family == AF_INET6 && + !mptcp_v6_is_v4_mapped(sk)) + continue; + + if (inet_sk(sk)->inet_saddr != ifa.s_addr) + continue; + + found = true; + + if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) { + tp->mptcp->send_mp_prio = 1; + tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio; + + tcp_send_ack(sk); + } + } + + if (!found) { + fmp->add_addr++; + + sk = mptcp_select_ack_sock(meta_sk, 0); + if (sk) + tcp_send_ack(sk); + full_mesh_create_subflows(meta_sk); + } + } + +#if IS_ENABLED(CONFIG_IPV6) + mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { + struct in6_addr ifa = mptcp_local->locaddr6[i].addr; + bool found = false; + + mptcp_for_each_sk(mpcb, sk) { + struct tcp_sock *tp = tcp_sk(sk); + + if (sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(sk)) + continue; + + if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa)) + continue; + + found = true; + + if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) { + tp->mptcp->send_mp_prio = 1; + tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio; + + tcp_send_ack(sk); + } + } + + if (!found) { + fmp->add_addr++; + + sk = mptcp_select_ack_sock(meta_sk, 0); + if (sk) + tcp_send_ack(sk); + full_mesh_create_subflows(meta_sk); + } + } +#endif + + /* Now, detect address-removals */ + mptcp_for_each_sk_safe(mpcb, sk, tmpsk) { + bool shall_remove = true; + + if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) { + mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { + if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) { + shall_remove = false; + break; + } + } + } else { + mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { + if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) { + shall_remove = false; + break; + } + } + } + + if (shall_remove) { + /* Reinject, so that pf = 1 and so we + * won't select this one as the + * ack-sock. + */ + mptcp_reinject_data(sk, 0); + + announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, + meta_sk); + + mptcp_sub_force_close(sk); + } + } + + /* Just call it optimistically. 
It actually cannot do any harm */ + update_addr_bitfields(meta_sk, mptcp_local); + + rcu_read_unlock(); +} + +static int full_mesh_get_local_id(sa_family_t family, union inet_addr *addr, + struct net *net) +{ + struct mptcp_loc_addr *mptcp_local; + struct mptcp_fm_ns *fm_ns = fm_get_ns(net); + int id = -1, i; + + /* Handle the backup-flows */ + rcu_read_lock(); + mptcp_local = rcu_dereference(fm_ns->local); + + if (family == AF_INET) { + mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { + if (addr->in.s_addr == mptcp_local->locaddr4[i].addr.s_addr) { + id = mptcp_local->locaddr4[i].id; + break; + } + } + } else { + mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { + if (ipv6_addr_equal(&addr->in6, &mptcp_local->locaddr6[i].addr)) { + id = mptcp_local->locaddr6[i].id; + break; + } + } + } + rcu_read_unlock(); + + return id; +} + +static void full_mesh_addr_signal(struct sock *sk, unsigned *size, + struct tcp_out_options *opts, + struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_cb *mpcb = tp->mpcb; + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0]; + struct mptcp_loc_addr *mptcp_local; + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk)); + int remove_addr_len; + u8 unannouncedv4, unannouncedv6; + + if (likely(!fmp->add_addr)) + goto remove_addr; + + rcu_read_lock(); + mptcp_local = rcu_dereference(fm_ns->local); + + /* IPv4 */ + unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits; + if (unannouncedv4 && + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) { + int ind = mptcp_find_free_index(~unannouncedv4); + + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_ADD_ADDR; + opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].id; + opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr; + opts->add_addr_v4 = 1; + + if (skb) { + fmp->announced_addrs_v4 |= (1 << ind); + fmp->add_addr--; + } + *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN; + } + + /* IPv6 */ + unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits; + if (unannouncedv6 && + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) { + int ind = mptcp_find_free_index(~unannouncedv6); + + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_ADD_ADDR; + opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].id; + opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr; + opts->add_addr_v6 = 1; + + if (skb) { + fmp->announced_addrs_v6 |= (1 << ind); + fmp->add_addr--; + } + *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN; + } + + rcu_read_unlock(); + + if (!unannouncedv4 && !unannouncedv6 && skb) { + fmp->add_addr--; + } + +remove_addr: + if (likely(!fmp->remove_addrs)) + return; + + remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs); + if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len) + return; + + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_REMOVE_ADDR; + opts->remove_addrs = fmp->remove_addrs; + *size += remove_addr_len; + if (skb) + fmp->remove_addrs = 0; +} + +static int mptcp_fm_init_net(struct net *net) +{ + struct mptcp_loc_addr *mptcp_local; + struct mptcp_fm_ns *fm_ns; + + fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL); + if (!fm_ns) + return -ENOBUFS; + + mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL); + if (!mptcp_local) { + kfree(fm_ns); + return -ENOBUFS; + } + + mptcp_local->next_v4_index = 1; + + rcu_assign_pointer(fm_ns->local, mptcp_local); + INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker); + INIT_LIST_HEAD(&fm_ns->events); + 
spin_lock_init(&fm_ns->local_lock); + fm_ns->net = net; + net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns; + + return 0; +} + +static void mptcp_fm_exit_net(struct net *net) +{ + struct mptcp_addr_event *eventq, *tmp; + struct mptcp_fm_ns *fm_ns; + struct mptcp_loc_addr *mptcp_local; + + fm_ns = fm_get_ns(net); + cancel_delayed_work_sync(&fm_ns->address_worker); + + rcu_read_lock_bh(); + + mptcp_local = rcu_dereference_bh(fm_ns->local); + kfree(mptcp_local); + + spin_lock(&fm_ns->local_lock); + list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) { + list_del(&eventq->list); + kfree(eventq); + } + spin_unlock(&fm_ns->local_lock); + + rcu_read_unlock_bh(); + + kfree(fm_ns); +} + +static struct pernet_operations full_mesh_net_ops = { + .init = mptcp_fm_init_net, + .exit = mptcp_fm_exit_net, +}; + +static struct mptcp_pm_ops full_mesh __read_mostly = { + .new_session = full_mesh_new_session, + .release_sock = full_mesh_release_sock, + .fully_established = full_mesh_create_subflows, + .new_remote_address = full_mesh_create_subflows, + .get_local_id = full_mesh_get_local_id, + .addr_signal = full_mesh_addr_signal, + .name = "fullmesh", + .owner = THIS_MODULE, +}; + +/* General initialization of MPTCP_PM */ +static int __init full_mesh_register(void) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE); + + ret = register_pernet_subsys(&full_mesh_net_ops); + if (ret) + goto out; + + ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); + if (ret) + goto err_reg_inetaddr; + ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier); + if (ret) + goto err_reg_netdev; + +#if IS_ENABLED(CONFIG_IPV6) + ret = register_inet6addr_notifier(&inet6_addr_notifier); + if (ret) + goto err_reg_inet6addr; +#endif + + ret = mptcp_register_path_manager(&full_mesh); + if (ret) + goto err_reg_pm; + +out: + return ret; + + +err_reg_pm: +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&inet6_addr_notifier); +err_reg_inet6addr: +#endif + unregister_netdevice_notifier(&mptcp_pm_netdev_notifier); +err_reg_netdev: + unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); +err_reg_inetaddr: + unregister_pernet_subsys(&full_mesh_net_ops); + goto out; +} + +static void full_mesh_unregister(void) +{ +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&inet6_addr_notifier); +#endif + unregister_netdevice_notifier(&mptcp_pm_netdev_notifier); + unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); + unregister_pernet_subsys(&full_mesh_net_ops); + mptcp_unregister_path_manager(&full_mesh); +} + +module_init(full_mesh_register); +module_exit(full_mesh_unregister); + +MODULE_AUTHOR("Christoph Paasch"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Full-Mesh MPTCP"); +MODULE_VERSION("0.88"); diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c new file mode 100644 index 0000000..08c375b --- /dev/null +++ b/net/mptcp/mptcp_input.c @@ -0,0 +1,2200 @@ +/* + * MPTCP implementation - Sending side + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 
of the License, or (at your option) any later version. + */ + +#include + +#include +#include +#include + +#include + +/* is seq1 < seq2 ? */ +static inline int before64(const u64 seq1, const u64 seq2) +{ + return (s64)(seq1 - seq2) < 0; +} + +/* is seq1 > seq2 ? */ +#define after64(seq1, seq2) before64(seq2, seq1) + +static inline void mptcp_become_fully_estab(struct sock *sk) +{ + tcp_sk(sk)->mptcp->fully_established = 1; + + if (is_master_tp(tcp_sk(sk)) && + tcp_sk(sk)->mpcb->pm_ops->fully_established) + tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk)); +} + +/* Similar to tcp_tso_acked without any memory accounting */ +static inline int mptcp_tso_acked_reinject(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 packets_acked, len; + + BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)); + + packets_acked = tcp_skb_pcount(skb); + + if (skb_unclone(skb, GFP_ATOMIC)) + return 0; + + len = tp->snd_una - TCP_SKB_CB(skb)->seq; + __pskb_trim_head(skb, len); + + TCP_SKB_CB(skb)->seq += len; + skb->ip_summed = CHECKSUM_PARTIAL; + skb->truesize -= len; + + /* Any change of skb->len requires recalculation of tso factor. */ + if (tcp_skb_pcount(skb) > 1) + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); + packets_acked -= tcp_skb_pcount(skb); + + if (packets_acked) { + BUG_ON(tcp_skb_pcount(skb) == 0); + BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); + } + + return packets_acked; +} + +/** + * Cleans the meta-socket retransmission queue and the reinject-queue. + * @sk must be the metasocket. + */ +static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una) +{ + struct sk_buff *skb, *tmp; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + bool acked = false; + u32 acked_pcount; + + while ((skb = tcp_write_queue_head(meta_sk)) && + skb != tcp_send_head(meta_sk)) { + bool fully_acked = true; + + if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { + if (tcp_skb_pcount(skb) == 1 || + !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) + break; + + acked_pcount = tcp_tso_acked(meta_sk, skb); + if (!acked_pcount) + break; + + fully_acked = false; + } else { + acked_pcount = tcp_skb_pcount(skb); + } + + acked = true; + meta_tp->packets_out -= acked_pcount; + meta_tp->retrans_stamp = 0; + + if (!fully_acked) + break; + + tcp_unlink_write_queue(skb, meta_sk); + + if (mptcp_is_data_fin(skb)) { + struct sock *sk_it; + + /* DATA_FIN has been acknowledged - now we can close + * the subflows + */ + mptcp_for_each_sk(mpcb, sk_it) { + unsigned long delay = 0; + + /* If we are the passive closer, don't trigger + * subflow-fin until the subflow has been finned + * by the peer - thus we add a delay. 
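+ * (the delay is eight times the subflow's RTO)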
+ */ + if (mpcb->passive_close && + sk_it->sk_state == TCP_ESTABLISHED) + delay = inet_csk(sk_it)->icsk_rto << 3; + + mptcp_sub_close(sk_it, delay); + } + } + sk_wmem_free_skb(meta_sk, skb); + } + /* Remove acknowledged data from the reinject queue */ + skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) { + if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { + if (tcp_skb_pcount(skb) == 1 || + !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) + break; + + mptcp_tso_acked_reinject(meta_sk, skb); + break; + } + + __skb_unlink(skb, &mpcb->reinject_queue); + __kfree_skb(skb); + } + + if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una))) + meta_tp->snd_up = meta_tp->snd_una; + + if (acked) { + tcp_rearm_rto(meta_sk); + /* Normally this is done in tcp_try_undo_loss - but MPTCP + * does not call this function. + */ + inet_csk(meta_sk)->icsk_retransmits = 0; + } +} + +/* Inspired by tcp_rcv_state_process */ +static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk, + const struct sk_buff *skb, u32 data_seq, + u16 data_len) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); + struct tcphdr *th = tcp_hdr(skb); + + /* State-machine handling if FIN has been enqueued and he has + * been acked (snd_una == write_seq) - it's important that this + * here is after sk_wmem_free_skb because otherwise + * sk_forward_alloc is wrong upon inet_csk_destroy_sock() + */ + switch (meta_sk->sk_state) { + case TCP_FIN_WAIT1: + if (meta_tp->snd_una == meta_tp->write_seq) { + struct dst_entry *dst = __sk_dst_get(meta_sk); + + tcp_set_state(meta_sk, TCP_FIN_WAIT2); + meta_sk->sk_shutdown |= SEND_SHUTDOWN; + + dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); + + if (!sock_flag(meta_sk, SOCK_DEAD)) { + /* Wake up lingering close() */ + meta_sk->sk_state_change(meta_sk); + } else { + int tmo; + + if (meta_tp->linger2 < 0 || + (data_len && + after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0), + meta_tp->rcv_nxt))) { + mptcp_send_active_reset(meta_sk, GFP_ATOMIC); + tcp_done(meta_sk); + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); + return 1; + } + + tmo = tcp_fin_time(meta_sk); + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN); + } else if (mptcp_is_data_fin2(skb, tp) || + sock_owned_by_user(meta_sk)) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + inet_csk_reset_keepalive_timer(meta_sk, tmo); + } else { + tcp_time_wait(meta_sk, TCP_FIN_WAIT2, tmo); + } + } + } + break; + case TCP_CLOSING: + case TCP_LAST_ACK: + if (meta_tp->snd_una == meta_tp->write_seq) { + tcp_done(meta_sk); + return 1; + } + break; + } + + /* step 7: process the segment text */ + switch (meta_sk->sk_state) { + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + /* RFC 793 says to queue data in these states, + * RFC 1122 says we MUST send a reset. + * BSD 4.4 also does reset. + */ + if (meta_sk->sk_shutdown & RCV_SHUTDOWN) { + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) && + !mptcp_is_data_fin2(skb, tp)) { + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); + mptcp_send_active_reset(meta_sk, GFP_ATOMIC); + tcp_reset(meta_sk); + return 1; + } + } + break; + } + + return 0; +} + +/** + * @return: + * i) 1: Everything's fine. 
+ * ii) -1: A reset has been sent on the subflow - csum-failure + * iii) 0: csum-failure but no reset sent, because it's the last subflow. + * Last packet should not be destroyed by the caller because it has + * been done here. + */ +static int mptcp_verif_dss_csum(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *tmp, *tmp1, *last = NULL; + __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */ + int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0; + int iter = 0; + + skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) { + unsigned int csum_len; + + if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq)) + /* Mapping ends in the middle of the packet - + * csum only these bytes + */ + csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq; + else + csum_len = tmp->len; + + offset = 0; + if (overflowed) { + char first_word[4]; + first_word[0] = 0; + first_word[1] = 0; + first_word[2] = 0; + first_word[3] = *(tmp->data); + csum_tcp = csum_partial(first_word, 4, csum_tcp); + offset = 1; + csum_len--; + overflowed = 0; + } + + csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp); + + /* Was it on an odd-length? Then we have to merge the next byte + * correctly (see above) + */ + if (csum_len != (csum_len & (~1))) + overflowed = 1; + + if (mptcp_is_data_seq(tmp) && !dss_csum_added) { + __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32)); + + /* If a 64-bit dss is present, we increase the offset + * by 4 bytes, as the high-order 64-bits will be added + * in the final csum_partial-call. + */ + u32 offset = skb_transport_offset(tmp) + + TCP_SKB_CB(tmp)->dss_off; + if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET) + offset += 4; + + csum_tcp = skb_checksum(tmp, offset, + MPTCP_SUB_LEN_SEQ_CSUM, + csum_tcp); + + csum_tcp = csum_partial(&data_seq, + sizeof(data_seq), csum_tcp); + + dss_csum_added = 1; /* Just do it once */ + } + last = tmp; + iter++; + + if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) && + !before(TCP_SKB_CB(tmp1)->seq, + tp->mptcp->map_subseq + tp->mptcp->map_data_len)) + break; + } + + /* Now, checksum must be 0 */ + if (unlikely(csum_fold(csum_tcp))) { + pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n", + __func__, csum_fold(csum_tcp), + TCP_SKB_CB(last)->seq, dss_csum_added, overflowed, + iter); + + tp->mptcp->send_mp_fail = 1; + + /* map_data_seq is the data-seq number of the + * mapping we are currently checking + */ + tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq; + + if (tp->mpcb->cnt_subflows > 1) { + mptcp_send_reset(sk); + ans = -1; + } else { + tp->mpcb->send_infinite_mapping = 1; + + /* Need to purge the rcv-queue as it's no more valid */ + while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + tp->copied_seq = TCP_SKB_CB(tmp)->end_seq; + kfree_skb(tmp); + } + + ans = 0; + } + } + + return ans; +} + +static inline void mptcp_prepare_skb(struct sk_buff *skb, struct sk_buff *next, + struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + u32 inc = 0; + + /* If skb is the end of this mapping (end is always at mapping-boundary + * thanks to the splitting/trimming), then we need to increase + * data-end-seq by 1 if this here is a data-fin. + * + * We need to do -1 because end_seq includes the subflow-FIN. + */ + if (tp->mptcp->map_data_fin && + (tcb->end_seq - (tcp_hdr(skb)->fin ? 
1 : 0)) == + (tp->mptcp->map_subseq + tp->mptcp->map_data_len)) { + inc = 1; + + /* We manually set the fin-flag if it is a data-fin. For easy + * processing in tcp_recvmsg. + */ + tcp_hdr(skb)->fin = 1; + } else { + /* We may have a subflow-fin with data but without data-fin */ + tcp_hdr(skb)->fin = 0; + } + + /* Adapt data-seq's to the packet itself. We kinda transform the + * dss-mapping to a per-packet granularity. This is necessary to + * correctly handle overlapping mappings coming from different + * subflows. Otherwise it would be a complete mess. + */ + tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq; + tcb->end_seq = tcb->seq + skb->len + inc; + +} + +/** + * @return: 1 if the segment has been eaten and can be suppressed, + * otherwise 0. + */ +static inline int mptcp_direct_copy(struct sk_buff *skb, struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len); + int eaten = 0; + + __set_current_state(TASK_RUNNING); + + local_bh_enable(); + if (!skb_copy_datagram_iovec(skb, 0, meta_tp->ucopy.iov, chunk)) { + meta_tp->ucopy.len -= chunk; + meta_tp->copied_seq += chunk; + eaten = (chunk == skb->len); + tcp_rcv_space_adjust(meta_sk); + } + local_bh_disable(); + return eaten; +} + +static inline void mptcp_reset_mapping(struct tcp_sock *tp) +{ + tp->mptcp->map_data_len = 0; + tp->mptcp->map_data_seq = 0; + tp->mptcp->map_subseq = 0; + tp->mptcp->map_data_fin = 0; + tp->mptcp->mapping_present = 0; +} + +/* The DSS-mapping received on the sk only covers the second half of the skb + * (cut at seq). We trim the head from the skb. + * Data will be freed upon kfree(). + * + * Inspired by tcp_trim_head(). + */ +static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq) +{ + int len = seq - TCP_SKB_CB(skb)->seq; + u32 new_seq = TCP_SKB_CB(skb)->seq + len; + + if (len < skb_headlen(skb)) + __skb_pull(skb, len); + else + __pskb_trim_head(skb, len - skb_headlen(skb)); + + TCP_SKB_CB(skb)->seq = new_seq; + + skb->truesize -= len; + atomic_sub(len, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, len); +} + +/* The DSS-mapping received on the sk only covers the first half of the skb + * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue + * as further packets may resolve the mapping of the second half of data. + * + * Inspired by tcp_fragment(). + */ +static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq) +{ + struct sk_buff *buff; + int nsize; + int nlen, len; + + len = seq - TCP_SKB_CB(skb)->seq; + nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len; + if (nsize < 0) + nsize = 0; + + /* Get a new skb... force flag on. */ + buff = alloc_skb(nsize, GFP_ATOMIC); + if (buff == NULL) + return -ENOMEM; + + skb_reserve(buff, tcp_sk(sk)->tcp_header_len); + skb_reset_transport_header(buff); + + tcp_hdr(buff)->fin = tcp_hdr(skb)->fin; + tcp_hdr(skb)->fin = 0; + + /* We absolutly need to call skb_set_owner_r before refreshing the + * truesize of buff, otherwise the moved data will account twice. + */ + skb_set_owner_r(buff, sk); + nlen = skb->len - len - nsize; + buff->truesize += nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. 
*/ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + skb_split(skb, buff, len); + + __skb_queue_after(&sk->sk_receive_queue, skb, buff); + + return 0; +} + +/* @return: 0 everything is fine. Just continue processing + * 1 subflow is broken stop everything + * -1 this packet was broken - continue with the next one. + */ +static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */ + if (!skb->len && tcp_hdr(skb)->fin && !mptcp_is_data_fin(skb) && + !tp->mpcb->infinite_mapping_rcv) { + /* Remove a pure subflow-fin from the queue and increase + * copied_seq. + */ + tp->copied_seq = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_receive_queue); + __kfree_skb(skb); + return -1; + } + + /* If we are not yet fully established and do not know the mapping for + * this segment, this path has to fallback to infinite or be torn down. + */ + if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) && + !tp->mptcp->mapping_present && !tp->mpcb->infinite_mapping_rcv) { + pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n", + __func__, tp->mpcb->mptcp_loc_token, + tp->mptcp->path_index, __builtin_return_address(0), + TCP_SKB_CB(skb)->seq); + + if (!is_master_tp(tp)) { + mptcp_send_reset(sk); + return 1; + } + + tp->mpcb->infinite_mapping_snd = 1; + tp->mpcb->infinite_mapping_rcv = 1; + tp->mptcp->fully_established = 1; + } + + /* Receiver-side becomes fully established when a whole rcv-window has + * been received without the need to fallback due to the previous + * condition. */ + if (!tp->mptcp->fully_established) { + tp->mptcp->init_rcv_wnd -= skb->len; + if (tp->mptcp->init_rcv_wnd < 0) + mptcp_become_fully_estab(sk); + } + + return 0; +} + +/* @return: 0 everything is fine. Just continue processing + * 1 subflow is broken stop everything + * -1 this packet was broken - continue with the next one. + */ +static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); + struct mptcp_cb *mpcb = tp->mpcb; + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + u32 *ptr; + u32 data_seq, sub_seq, data_len, tcp_end_seq; + + /* If we are in infinite-mapping-mode, the subflow is guaranteed to be + * in-order at the data-level. Thus data-seq-numbers can be inferred + * from what is expected at the data-level. + */ + if (mpcb->infinite_mapping_rcv) { + tp->mptcp->map_data_seq = mptcp_get_rcv_nxt_64(meta_tp); + tp->mptcp->map_subseq = tcb->seq; + tp->mptcp->map_data_len = skb->len; + tp->mptcp->map_data_fin = tcp_hdr(skb)->fin; + tp->mptcp->mapping_present = 1; + return 0; + } + + /* No mapping here? Exit - it is either already set or still on its way */ + if (!mptcp_is_data_seq(skb)) { + /* Too many packets without a mapping - this subflow is broken */ + if (!tp->mptcp->mapping_present && + tp->rcv_nxt - tp->copied_seq > 65536) { + mptcp_send_reset(sk); + return 1; + } + + return 0; + } + + ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb); + ptr++; + sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn; + ptr++; + data_len = get_unaligned_be16(ptr); + + /* If it's an empty skb with DATA_FIN, sub_seq must get fixed. + * The draft sets it to 0, but we really would like to have the + * real value, to have an easy handling afterwards here in this + * function. 
+ */ + if (mptcp_is_data_fin(skb) && skb->len == 0) + sub_seq = TCP_SKB_CB(skb)->seq; + + /* If there is already a mapping - we check if it maps with the current + * one. If not - we reset. + */ + if (tp->mptcp->mapping_present && + (data_seq != (u32)tp->mptcp->map_data_seq || + sub_seq != tp->mptcp->map_subseq || + data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin || + mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) { + /* Mapping in packet is different from what we want */ + pr_err("%s Mappings do not match!\n", __func__); + pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n", + __func__, data_seq, (u32)tp->mptcp->map_data_seq, + sub_seq, tp->mptcp->map_subseq, data_len, + tp->mptcp->map_data_len, mptcp_is_data_fin(skb), + tp->mptcp->map_data_fin); + mptcp_send_reset(sk); + return 1; + } + + /* If the previous check was good, the current mapping is valid and we exit. */ + if (tp->mptcp->mapping_present) + return 0; + + /* Mapping not yet set on this subflow - we set it here! */ + + if (!data_len) { + mpcb->infinite_mapping_rcv = 1; + tp->mptcp->fully_established = 1; + /* We need to repeat mp_fail's until the sender felt + * back to infinite-mapping - here we stop repeating it. + */ + tp->mptcp->send_mp_fail = 0; + + /* We have to fixup data_len - it must be the same as skb->len */ + data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0); + sub_seq = tcb->seq; + + /* TODO kill all other subflows than this one */ + /* data_seq and so on are set correctly */ + + /* At this point, the meta-ofo-queue has to be emptied, + * as the following data is guaranteed to be in-order at + * the data and subflow-level + */ + mptcp_purge_ofo_queue(meta_tp); + } + + /* We are sending mp-fail's and thus are in fallback mode. + * Ignore packets which do not announce the fallback and still + * want to provide a mapping. + */ + if (tp->mptcp->send_mp_fail) { + tp->copied_seq = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_receive_queue); + __kfree_skb(skb); + return -1; + } + + /* FIN increased the mapping-length by 1 */ + if (mptcp_is_data_fin(skb)) + data_len--; + + /* Subflow-sequences of packet must be + * (at least partially) be part of the DSS-mapping's + * subflow-sequence-space. + * + * Basically the mapping is not valid, if either of the + * following conditions is true: + * + * 1. It's not a data_fin and + * MPTCP-sub_seq >= TCP-end_seq + * + * 2. It's a data_fin and TCP-end_seq > TCP-seq and + * MPTCP-sub_seq >= TCP-end_seq + * + * The previous two can be merged into: + * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq + * Because if it's not a data-fin, TCP-end_seq > TCP-seq + * + * 3. It's a data_fin and skb->len == 0 and + * MPTCP-sub_seq > TCP-end_seq + * + * 4. It's not a data_fin and TCP-end_seq > TCP-seq and + * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq + * + * 5. MPTCP-sub_seq is prior to what we already copied (copied_seq) + */ + + /* subflow-fin is not part of the mapping - ignore it here ! */ + tcp_end_seq = tcb->end_seq - tcp_hdr(skb)->fin; + if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) || + (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) || + (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq)) || + before(sub_seq, tp->copied_seq)) { + /* Subflow-sequences of packet is different from what is in the + * packet's dss-mapping. 
The peer is misbehaving - reset + */ + pr_err("%s Packet's mapping does not map to the DSS sub_seq %u " + "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u" + "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb), + skb->len, data_len, tp->copied_seq); + mptcp_send_reset(sk); + return 1; + } + + /* Does the DSS had 64-bit seqnum's ? */ + if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) { + /* Wrapped around? */ + if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) { + tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq); + } else { + /* Else, access the default high-order bits */ + tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq); + } + } else { + tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq); + + if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) { + /* We make sure that the data_seq is invalid. + * It will be dropped later. + */ + tp->mptcp->map_data_seq += 0xFFFFFFFF; + tp->mptcp->map_data_seq += 0xFFFFFFFF; + } + } + + tp->mptcp->map_data_len = data_len; + tp->mptcp->map_subseq = sub_seq; + tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0; + tp->mptcp->mapping_present = 1; + + return 0; +} + +/* Similar to tcp_sequence(...) */ +static inline int mptcp_sequence(const struct tcp_sock *meta_tp, + u64 data_seq, u64 end_data_seq) +{ + struct mptcp_cb *mpcb = meta_tp->mpcb; + u64 rcv_wup64; + + /* Wrap-around? */ + if (meta_tp->rcv_wup > meta_tp->rcv_nxt) { + rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) | + meta_tp->rcv_wup; + } else { + rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, + meta_tp->rcv_wup); + } + + return !before64(end_data_seq, rcv_wup64) && + !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp)); +} + +/* @return: 0 everything is fine. Just continue processing + * -1 this packet was broken - continue with the next one. + */ +static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *tmp, *tmp1; + u32 tcp_end_seq; + + if (!tp->mptcp->mapping_present) + return 0; + + /* either, the new skb gave us the mapping and the first segment + * in the sub-rcv-queue has to be trimmed ... + */ + tmp = skb_peek(&sk->sk_receive_queue); + if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) && + after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq)) + mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq); + + /* ... or the new skb (tail) has to be split at the end. */ + tcp_end_seq = TCP_SKB_CB(skb)->end_seq - (tcp_hdr(skb)->fin ? 1 : 0); + if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) { + u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len; + if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */ + /* TODO : maybe handle this here better. + * We now just force meta-retransmission. + */ + tp->copied_seq = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_receive_queue); + __kfree_skb(skb); + return -1; + } + } + + /* Now, remove old sk_buff's from the receive-queue. + * This may happen if the mapping has been lost for these segments and + * the next mapping has already been received. 
+ */ + if (tp->mptcp->mapping_present && + before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) { + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { + if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq)) + break; + + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; + __skb_unlink(tmp1, &sk->sk_receive_queue); + + /* Impossible that we could free skb here, because his + * mapping is known to be valid from previous checks + */ + __kfree_skb(tmp1); + } + } + + return 0; +} + +/* @return: 0 everything is fine. Just continue processing + * 1 subflow is broken stop everything + * -1 this mapping has been put in the meta-receive-queue + * -2 this mapping has been eaten by the application + */ +static int mptcp_queue_skb(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); + struct sock *meta_sk = mptcp_meta_sk(sk); + struct mptcp_cb *mpcb = tp->mpcb; + struct sk_buff *tmp, *tmp1; + u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp); + bool data_queued = false; + + /* Have we not yet received the full mapping? */ + if (!tp->mptcp->mapping_present || + before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) + return 0; + + /* Is this an overlapping mapping? rcv_nxt >= end_data_seq + * OR + * This mapping is out of window + */ + if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) || + !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq, + tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) { + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { + __skb_unlink(tmp1, &sk->sk_receive_queue); + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; + __kfree_skb(tmp1); + + if (!skb_queue_empty(&sk->sk_receive_queue) && + !before(TCP_SKB_CB(tmp)->seq, + tp->mptcp->map_subseq + tp->mptcp->map_data_len)) + break; + } + + mptcp_reset_mapping(tp); + + return -1; + } + + /* Record it, because we want to send our data_fin on the same path */ + if (tp->mptcp->map_data_fin) { + mpcb->dfin_path_index = tp->mptcp->path_index; + mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN); + } + + /* Verify the checksum */ + if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) { + int ret = mptcp_verif_dss_csum(sk); + + if (ret <= 0) { + mptcp_reset_mapping(tp); + return 1; + } + } + + if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) { + /* Seg's have to go to the meta-ofo-queue */ + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; + mptcp_prepare_skb(tmp1, tmp, sk); + __skb_unlink(tmp1, &sk->sk_receive_queue); + /* MUST be done here, because fragstolen may be true later. + * Then, kfree_skb_partial will not account the memory. + */ + skb_orphan(tmp1); + + if (!mpcb->in_time_wait) /* In time-wait, do not receive data */ + mptcp_add_meta_ofo_queue(meta_sk, tmp1, sk); + else + __kfree_skb(tmp1); + + if (!skb_queue_empty(&sk->sk_receive_queue) && + !before(TCP_SKB_CB(tmp)->seq, + tp->mptcp->map_subseq + tp->mptcp->map_data_len)) + break; + + } + } else { + /* Ready for the meta-rcv-queue */ + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { + int eaten = 0; + int copied_early = 0; + bool fragstolen = false; + u32 old_rcv_nxt = meta_tp->rcv_nxt; + + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; + mptcp_prepare_skb(tmp1, tmp, sk); + __skb_unlink(tmp1, &sk->sk_receive_queue); + /* MUST be done here, because fragstolen may be true. + * Then, kfree_skb_partial will not account the memory. 
+ */ + skb_orphan(tmp1); + + /* This segment has already been received */ + if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) { + __kfree_skb(tmp1); + goto next; + } + +#ifdef CONFIG_NET_DMA + if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt && + meta_tp->ucopy.task == current && + meta_tp->copied_seq == meta_tp->rcv_nxt && + tmp1->len <= meta_tp->ucopy.len && + sock_owned_by_user(meta_sk) && + tcp_dma_try_early_copy(meta_sk, tmp1, 0)) { + copied_early = 1; + eaten = 1; + } +#endif + + /* Is direct copy possible ? */ + if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt && + meta_tp->ucopy.task == current && + meta_tp->copied_seq == meta_tp->rcv_nxt && + meta_tp->ucopy.len && sock_owned_by_user(meta_sk) && + !copied_early) + eaten = mptcp_direct_copy(tmp1, meta_sk); + + if (mpcb->in_time_wait) /* In time-wait, do not receive data */ + eaten = 1; + + if (!eaten) + eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen); + + meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq; + mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt); + + if (copied_early) + tcp_cleanup_rbuf(meta_sk, tmp1->len); + + if (tcp_hdr(tmp1)->fin && !mpcb->in_time_wait) + mptcp_fin(meta_sk); + + /* Check if this fills a gap in the ofo queue */ + if (!skb_queue_empty(&meta_tp->out_of_order_queue)) + mptcp_ofo_queue(meta_sk); + +#ifdef CONFIG_NET_DMA + if (copied_early) + __skb_queue_tail(&meta_sk->sk_async_wait_queue, + tmp1); + else +#endif + if (eaten) + kfree_skb_partial(tmp1, fragstolen); + + data_queued = true; +next: + if (!skb_queue_empty(&sk->sk_receive_queue) && + !before(TCP_SKB_CB(tmp)->seq, + tp->mptcp->map_subseq + tp->mptcp->map_data_len)) + break; + } + } + + inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp; + tp->mptcp->last_data_seq = tp->mptcp->map_data_seq; + mptcp_reset_mapping(tp); + + return data_queued ? -1 : -2; +} + +void mptcp_data_ready(struct sock *sk, int bytes) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + struct sk_buff *skb, *tmp; + int queued = 0; + + /* If the meta is already closed, there is no point in pushing data */ + if (meta_sk->sk_state == TCP_CLOSE && !tcp_sk(sk)->mpcb->in_time_wait) { + skb_queue_purge(&sk->sk_receive_queue); + tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt; + goto exit; + } + +restart: + /* Iterate over all segments, detect their mapping (if we don't have + * one yet), validate them and push everything one level higher. 
+ */ + skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { + int ret; + /* Pre-validation - e.g., early fallback */ + ret = mptcp_prevalidate_skb(sk, skb); + if (ret < 0) + goto restart; + else if (ret > 0) + break; + + /* Set the current mapping */ + ret = mptcp_detect_mapping(sk, skb); + if (ret < 0) + goto restart; + else if (ret > 0) + break; + + /* Validation */ + if (mptcp_validate_mapping(sk, skb) < 0) + goto restart; + + /* Push a level higher */ + ret = mptcp_queue_skb(sk); + if (ret < 0) { + if (ret == -1) + queued = ret; + goto restart; + } else if (ret == 0) { + continue; + } else { /* ret == 1 */ + break; + } + } + +exit: + if (tcp_sk(sk)->close_it) { + tcp_send_ack(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + } + + if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD)) + meta_sk->sk_data_ready(meta_sk, 0); +} + + +int mptcp_check_req(struct sk_buff *skb, struct net *net) +{ + struct tcphdr *th = tcp_hdr(skb); + struct sock *meta_sk = NULL; + + /* MPTCP structures not initialized */ + if (mptcp_init_failed) + return 0; + + if (skb->protocol == htons(ETH_P_IP)) + meta_sk = mptcp_v4_search_req(th->source, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, net); +#if IS_ENABLED(CONFIG_IPV6) + else /* IPv6 */ + meta_sk = mptcp_v6_search_req(th->source, &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, net); +#endif /* CONFIG_IPV6 */ + + if (!meta_sk) + return 0; + + TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN; + + bh_lock_sock_nested(meta_sk); + if (sock_owned_by_user(meta_sk)) { + skb->sk = meta_sk; + if (unlikely(sk_add_backlog(meta_sk, skb, + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { + bh_unlock_sock(meta_sk); + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + sock_put(meta_sk); /* Taken by mptcp_search_req */ + kfree_skb(skb); + return 1; + } + } else if (skb->protocol == htons(ETH_P_IP)) { + tcp_v4_do_rcv(meta_sk, skb); +#if IS_ENABLED(CONFIG_IPV6) + } else { /* IPv6 */ + tcp_v6_do_rcv(meta_sk, skb); +#endif /* CONFIG_IPV6 */ + } + bh_unlock_sock(meta_sk); + sock_put(meta_sk); /* Taken by mptcp_vX_search_req */ + return 1; +} + +struct mp_join *mptcp_find_join(struct sk_buff *skb) +{ + struct tcphdr *th = tcp_hdr(skb); + unsigned char *ptr; + int length = (th->doff * 4) - sizeof(struct tcphdr); + + /* Jump through the options to check whether JOIN is there */ + ptr = (unsigned char *)(th + 1); + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return NULL; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return NULL; + if (opsize > length) + return NULL; /* don't parse partial options */ + if (opcode == TCPOPT_MPTCP && + ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) { + return (struct mp_join *)(ptr - 2); + } + ptr += opsize - 2; + length -= opsize; + } + } + return NULL; +} + +int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw) +{ + struct mptcp_cb *mpcb; + struct sock *meta_sk; + u32 token; + struct mp_join *join_opt = mptcp_find_join(skb); + if (!join_opt) + return 0; + + /* MPTCP structures were not initialized, so return error */ + if (mptcp_init_failed) + return -1; + + token = join_opt->u.syn.token; + meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token); + if (!meta_sk) { + mptcp_debug("%s:mpcb not found:%x\n", __func__, token); + return -1; + } + + mpcb = tcp_sk(meta_sk)->mpcb; + if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) { + /* We are in fallback-mode on the 
reception-side - + * no new subflows! + */ + sock_put(meta_sk); /* Taken by mptcp_hash_find */ + return -1; + } + + /* Coming from time-wait-sock processing in tcp_v4_rcv. + * We have to deschedule it before continuing, because otherwise + * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req. + */ + if (tw) { + inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_put(tw); + } + + TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN; + /* OK, this is a new syn/join, let's create a new open request and + * send syn+ack + */ + bh_lock_sock_nested(meta_sk); + if (sock_owned_by_user(meta_sk)) { + skb->sk = meta_sk; + if (unlikely(sk_add_backlog(meta_sk, skb, + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { + bh_unlock_sock(meta_sk); + NET_INC_STATS_BH(sock_net(meta_sk), + LINUX_MIB_TCPBACKLOGDROP); + sock_put(meta_sk); /* Taken by mptcp_hash_find */ + kfree_skb(skb); + return 1; + } + } else if (skb->protocol == htons(ETH_P_IP)) { + tcp_v4_do_rcv(meta_sk, skb); +#if IS_ENABLED(CONFIG_IPV6) + } else { + tcp_v6_do_rcv(meta_sk, skb); +#endif /* CONFIG_IPV6 */ + } + bh_unlock_sock(meta_sk); + sock_put(meta_sk); /* Taken by mptcp_hash_find */ + return 1; +} + +int mptcp_do_join_short(struct sk_buff *skb, struct mptcp_options_received *mopt, + struct tcp_options_received *tmp_opt, struct net *net) +{ + struct sock *meta_sk; + u32 token; + + token = mopt->mptcp_rem_token; + meta_sk = mptcp_hash_find(net, token); + if (!meta_sk) { + mptcp_debug("%s:mpcb not found:%x\n", __func__, token); + return -1; + } + + TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN; + + /* OK, this is a new syn/join, let's create a new open request and + * send syn+ack + */ + bh_lock_sock(meta_sk); + + /* This check is also done in mptcp_vX_do_rcv. But, there we cannot + * call tcp_vX_send_reset, because we hold already two socket-locks. + * (the listener and the meta from above) + * + * And the send-reset will try to take yet another one (ip_send_reply). + * Thus, we propagate the reset up to tcp_rcv_state_process. + */ + if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv || + tcp_sk(meta_sk)->mpcb->send_infinite_mapping || + meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) { + bh_unlock_sock(meta_sk); + sock_put(meta_sk); /* Taken by mptcp_hash_find */ + return -1; + } + + if (sock_owned_by_user(meta_sk)) { + skb->sk = meta_sk; + if (unlikely(sk_add_backlog(meta_sk, skb, + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + else + /* Must make sure that upper layers won't free the + * skb if it is added to the backlog-queue. + */ + skb_get(skb); + } else { + /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as + * the skb will finally be freed by tcp_v4_do_rcv (where we are + * coming from) + */ + skb_get(skb); + if (skb->protocol == htons(ETH_P_IP)) { + tcp_v4_do_rcv(meta_sk, skb); +#if IS_ENABLED(CONFIG_IPV6) + } else { /* IPv6 */ + tcp_v6_do_rcv(meta_sk, skb); +#endif /* CONFIG_IPV6 */ + } + } + + bh_unlock_sock(meta_sk); + sock_put(meta_sk); /* Taken by mptcp_hash_find */ + return 0; +} + +/** + * Equivalent of tcp_fin() for MPTCP + * Can be called only when the FIN is validly part + * of the data seqnum space. Not before when we get holes. 
+ */ +void mptcp_fin(struct sock *meta_sk) +{ + struct sock *sk = NULL, *sk_it; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + + mptcp_for_each_sk(mpcb, sk_it) { + if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) { + sk = sk_it; + break; + } + } + + if (!sk || sk->sk_state == TCP_CLOSE) + sk = mptcp_select_ack_sock(meta_sk, 0); + + inet_csk_schedule_ack(sk); + + meta_sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(meta_sk, SOCK_DONE); + + switch (meta_sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + /* Move to CLOSE_WAIT */ + tcp_set_state(meta_sk, TCP_CLOSE_WAIT); + inet_csk(sk)->icsk_ack.pingpong = 1; + break; + + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* Received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; + + case TCP_FIN_WAIT1: + /* This case occurs when a simultaneous close + * happens, we must ack the received FIN and + * enter the CLOSING state. + */ + tcp_send_ack(sk); + tcp_set_state(meta_sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* Received a FIN -- send ACK and enter TIME_WAIT. */ + tcp_send_ack(sk); + tcp_time_wait(meta_sk, TCP_TIME_WAIT, 0); + break; + default: + /* Only TCP_LISTEN and TCP_CLOSE are left, in these + * cases we should never reach this piece of code. + */ + pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__, + meta_sk->sk_state); + break; + } + + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ + mptcp_purge_ofo_queue(meta_tp); + sk_mem_reclaim(meta_sk); + + if (!sock_flag(meta_sk, SOCK_DEAD)) { + meta_sk->sk_state_change(meta_sk); + + /* Do not send POLL_HUP for half duplex close. */ + if (meta_sk->sk_shutdown == SHUTDOWN_MASK || + meta_sk->sk_state == TCP_CLOSE) + sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN); + } + + return; +} + +static void mptcp_xmit_retransmit_queue(struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct sk_buff *skb; + + if (!meta_tp->packets_out) + return; + + tcp_for_write_queue(skb, meta_sk) { + if (skb == tcp_send_head(meta_sk)) + break; + + if (mptcp_retransmit_skb(meta_sk, skb)) + return; + + if (skb == tcp_write_queue_head(meta_sk)) + inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, + inet_csk(meta_sk)->icsk_rto, + TCP_RTO_MAX); + } +} + +/* Handle the DATA_ACK */ +static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + u32 prior_snd_una = meta_tp->snd_una; + int prior_packets; + u32 nwin, data_ack, data_seq; + u16 data_len = 0; + + /* A valid packet came in - subflow is operational again */ + tp->pf = 0; + + /* Even if there is no data-ack, we stop retransmitting. + * Except if this is a SYN/ACK. Then it is just a retransmission + */ + if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) { + tp->mptcp->pre_established = 0; + sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); + } + + /* If we are in infinite mapping mode, rx_opt.data_ack has been + * set by mptcp_clean_rtx_infinite. 
+ */ + if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd) + goto exit; + + data_ack = tp->mptcp->rx_opt.data_ack; + + if (unlikely(!tp->mptcp->fully_established) && + (data_ack != meta_tp->mptcp->snt_isn || + tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)) + /* As soon as data has been data-acked, + * or a subflow-data-ack (not acking syn - thus snt_isn + 1) + * includes a data-ack, we are fully established + */ + mptcp_become_fully_estab(sk); + + /* Get the data_seq */ + if (mptcp_is_data_seq(skb)) { + data_seq = tp->mptcp->rx_opt.data_seq; + data_len = tp->mptcp->rx_opt.data_len; + } else { + data_seq = meta_tp->snd_wl1; + } + + /* If the ack is older than previous acks + * then we can probably ignore it. + */ + if (before(data_ack, prior_snd_una)) + goto exit; + + /* If the ack includes data we haven't sent yet, discard + * this segment (RFC793 Section 3.9). + */ + if (after(data_ack, meta_tp->snd_nxt)) + goto exit; + + /*** Now, update the window - inspired by tcp_ack_update_window ***/ + nwin = ntohs(tcp_hdr(skb)->window); + + if (likely(!tcp_hdr(skb)->syn)) + nwin <<= tp->rx_opt.snd_wscale; + + if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) { + tcp_update_wl(meta_tp, data_seq); + + /* Draft v09, Section 3.3.5: + * [...] It should only update its local receive window values + * when the largest sequence number allowed (i.e. DATA_ACK + + * receive window) increases. [...] + */ + if (meta_tp->snd_wnd != nwin && + !before(data_ack + nwin, tcp_wnd_end(meta_tp))) { + meta_tp->snd_wnd = nwin; + + if (nwin > meta_tp->max_window) + meta_tp->max_window = nwin; + } + } + /*** Done, update the window ***/ + + /* We passed data and got it acked, remove any soft error + * log. Something worked... + */ + sk->sk_err_soft = 0; + inet_csk(meta_sk)->icsk_probes_out = 0; + meta_tp->rcv_tstamp = tcp_time_stamp; + prior_packets = meta_tp->packets_out; + if (!prior_packets) + goto no_queue; + + meta_tp->snd_una = data_ack; + + mptcp_clean_rtx_queue(meta_sk, prior_snd_una); + + /* We are in loss-state, and something got acked, retransmit the whole + * queue now! + */ + if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss && + after(data_ack, prior_snd_una)) { + mptcp_xmit_retransmit_queue(meta_sk); + inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open; + } + + /* Simplified version of tcp_new_space, because the snd-buffer + * is handled by all the subflows. + */ + if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) { + sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK); + if (meta_sk->sk_socket && + test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) + meta_sk->sk_write_space(meta_sk); + } + + if (meta_sk->sk_state != TCP_ESTABLISHED && + mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len)) + return; + +exit: + mptcp_push_pending_frames(meta_sk); + + return; + +no_queue: + if (tcp_send_head(meta_sk)) + tcp_ack_probe(meta_sk); + + mptcp_push_pending_frames(meta_sk); + + return; +} + +void mptcp_clean_rtx_infinite(struct sk_buff *skb, struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk)); + + if (!tp->mpcb->infinite_mapping_snd) + return; + + /* The difference between both write_seq's represents the offset between + * data-sequence and subflow-sequence. As we are infinite, this must + * match. + * + * Thus, from this difference we can infer the meta snd_una. 
+ */ + tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt + + tp->snd_una; + + mptcp_data_ack(sk, skb); +} + +/**** static functions used by mptcp_parse_options */ + +static inline int mptcp_rem_raddress(struct mptcp_cb *mpcb, u8 rem_id) +{ + if (mptcp_v4_rem_raddress(mpcb, rem_id) < 0) { +#if IS_ENABLED(CONFIG_IPV6) + if (mptcp_v6_rem_raddress(mpcb, rem_id) < 0) + return -1; +#else + return -1; +#endif /* CONFIG_IPV6 */ + } + return 0; +} + +static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id) +{ + struct sock *sk_it, *tmpsk; + + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { + if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) { + mptcp_reinject_data(sk_it, 0); + sk_it->sk_err = ECONNRESET; + if (tcp_need_reset(sk_it->sk_state)) + tcp_send_active_reset(sk_it, GFP_ATOMIC); + mptcp_sub_force_close(sk_it); + } + } +} + +void mptcp_parse_options(const uint8_t *ptr, int opsize, + struct tcp_options_received *opt_rx, + struct mptcp_options_received *mopt, + const struct sk_buff *skb) +{ + struct mptcp_option *mp_opt = (struct mptcp_option *)ptr; + + /* If the socket is mp-capable we would have a mopt. */ + if (!mopt) + return; + + switch (mp_opt->sub) { + case MPTCP_SUB_CAPABLE: + { + struct mp_capable *mpcapable = (struct mp_capable *)ptr; + + if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN && + opsize != MPTCP_SUB_LEN_CAPABLE_ACK) { + mptcp_debug("%s: mp_capable: bad option size %d\n", + __func__, opsize); + break; + } + + if (!sysctl_mptcp_enabled) + break; + + /* We only support MPTCP version 0 */ + if (mpcapable->ver != 0) + break; + + /* MPTCP-RFC 6824: + * "If receiving a message with the 'B' flag set to 1, and this + * is not understood, then this SYN MUST be silently ignored; + */ + if (mpcapable->b) { + mopt->drop_me = 1; + break; + } + + /* MPTCP-RFC 6824: + * "An implementation that only supports this method MUST set + * bit "H" to 1, and bits "C" through "G" to 0." + */ + if (!mpcapable->h) + break; + + mopt->saw_mpc = 1; + mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a; + + if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN) + mopt->mptcp_key = mpcapable->sender_key; + + break; + } + case MPTCP_SUB_JOIN: + { + struct mp_join *mpjoin = (struct mp_join *)ptr; + + if (opsize != MPTCP_SUB_LEN_JOIN_SYN && + opsize != MPTCP_SUB_LEN_JOIN_SYNACK && + opsize != MPTCP_SUB_LEN_JOIN_ACK) { + mptcp_debug("%s: mp_join: bad option size %d\n", + __func__, opsize); + break; + } + + /* saw_mpc must be set, because in tcp_check_req we assume that + * it is set to support falling back to reg. TCP if a rexmitted + * SYN has no MP_CAPABLE or MP_JOIN + */ + switch (opsize) { + case MPTCP_SUB_LEN_JOIN_SYN: + mopt->is_mp_join = 1; + mopt->saw_mpc = 1; + mopt->low_prio = mpjoin->b; + mopt->rem_id = mpjoin->addr_id; + mopt->mptcp_rem_token = mpjoin->u.syn.token; + mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce; + break; + case MPTCP_SUB_LEN_JOIN_SYNACK: + mopt->saw_mpc = 1; + mopt->low_prio = mpjoin->b; + mopt->rem_id = mpjoin->addr_id; + mopt->mptcp_recv_tmac = mpjoin->u.synack.mac; + mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce; + break; + case MPTCP_SUB_LEN_JOIN_ACK: + mopt->saw_mpc = 1; + mopt->join_ack = 1; + memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20); + break; + } + break; + } + case MPTCP_SUB_DSS: + { + struct mp_dss *mdss = (struct mp_dss *)ptr; + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + /* We check opsize for the csum and non-csum case. 
We do this, + * because the draft says that the csum SHOULD be ignored if + * it has not been negotiated in the MP_CAPABLE but still is + * present in the data. + * + * It will get ignored later in mptcp_queue_skb. + */ + if (opsize != mptcp_sub_len_dss(mdss, 0) && + opsize != mptcp_sub_len_dss(mdss, 1)) { + mptcp_debug("%s: mp_dss: bad option size %d\n", + __func__, opsize); + break; + } + + ptr += 4; + + if (mdss->A) { + tcb->mptcp_flags |= MPTCPHDR_ACK; + + if (mdss->a) { + mopt->data_ack = (u32) get_unaligned_be64(ptr); + ptr += MPTCP_SUB_LEN_ACK_64; + } else { + mopt->data_ack = get_unaligned_be32(ptr); + ptr += MPTCP_SUB_LEN_ACK; + } + } + + tcb->dss_off = (ptr - skb_transport_header(skb)); + + if (mdss->M) { + if (mdss->m) { + u64 data_seq64 = get_unaligned_be64(ptr); + + tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET; + mopt->data_seq = (u32) data_seq64; + + ptr += 12; /* 64-bit dseq + subseq */ + } else { + mopt->data_seq = get_unaligned_be32(ptr); + ptr += 8; /* 32-bit dseq + subseq */ + } + mopt->data_len = get_unaligned_be16(ptr); + + tcb->mptcp_flags |= MPTCPHDR_SEQ; + + /* Is a check-sum present? */ + if (opsize == mptcp_sub_len_dss(mdss, 1)) + tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM; + + /* DATA_FIN only possible with DSS-mapping */ + if (mdss->F) + tcb->mptcp_flags |= MPTCPHDR_FIN; + } + + break; + } + case MPTCP_SUB_ADD_ADDR: + { +#if IS_ENABLED(CONFIG_IPV6) + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; + + if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 && + opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) || + (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 && + opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) { +#else + if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 && + opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) { +#endif /* CONFIG_IPV6 */ + mptcp_debug("%s: mp_add_addr: bad option size %d\n", + __func__, opsize); + break; + } + + /* We have to manually parse the options if we got two of them. 
*/ + if (mopt->saw_add_addr) { + mopt->more_add_addr = 1; + break; + } + mopt->saw_add_addr = 1; + mopt->add_addr_ptr = ptr; + break; + } + case MPTCP_SUB_REMOVE_ADDR: + if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) { + mptcp_debug("%s: mp_remove_addr: bad option size %d\n", + __func__, opsize); + break; + } + + if (mopt->saw_rem_addr) { + mopt->more_rem_addr = 1; + break; + } + mopt->saw_rem_addr = 1; + mopt->rem_addr_ptr = ptr; + break; + case MPTCP_SUB_PRIO: + { + struct mp_prio *mpprio = (struct mp_prio *)ptr; + + if (opsize != MPTCP_SUB_LEN_PRIO && + opsize != MPTCP_SUB_LEN_PRIO_ADDR) { + mptcp_debug("%s: mp_prio: bad option size %d\n", + __func__, opsize); + break; + } + + mopt->saw_low_prio = 1; + mopt->low_prio = mpprio->b; + + if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) { + mopt->saw_low_prio = 2; + mopt->prio_addr_id = mpprio->addr_id; + } + break; + } + case MPTCP_SUB_FAIL: + if (opsize != MPTCP_SUB_LEN_FAIL) { + mptcp_debug("%s: mp_fail: bad option size %d\n", + __func__, opsize); + break; + } + mopt->mp_fail = 1; + break; + case MPTCP_SUB_FCLOSE: + if (opsize != MPTCP_SUB_LEN_FCLOSE) { + mptcp_debug("%s: mp_fclose: bad option size %d\n", + __func__, opsize); + break; + } + + mopt->mp_fclose = 1; + mopt->mptcp_key = ((struct mp_fclose *)ptr)->key; + + break; + default: + mptcp_debug("%s: Received unkown subtype: %d\n", + __func__, mp_opt->sub); + break; + } +} + +int mptcp_check_rtt(const struct tcp_sock *tp, int time) +{ + struct mptcp_cb *mpcb = tp->mpcb; + struct sock *sk; + u32 rtt_max = 0; + + /* In MPTCP, we take the max delay across all flows, + * in order to take into account meta-reordering buffers. + */ + mptcp_for_each_sk(mpcb, sk) { + if (!mptcp_sk_can_recv(sk)) + continue; + + if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt) + rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt; + } + if (time < (rtt_max >> 3) || !rtt_max) + return 1; + + return 0; +} + +static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk) +{ + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; + + if (mpadd->ipver == 4) { + __be16 port = 0; + if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2) + port = mpadd->u.v4.port; + + mptcp_v4_add_raddress(tcp_sk(sk)->mpcb, &mpadd->u.v4.addr, port, + mpadd->addr_id); +#if IS_ENABLED(CONFIG_IPV6) + } else if (mpadd->ipver == 6) { + __be16 port = 0; + if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2) + port = mpadd->u.v6.port; + + mptcp_v6_add_raddress(tcp_sk(sk)->mpcb, &mpadd->u.v6.addr, port, + mpadd->addr_id); +#endif /* CONFIG_IPV6 */ + } +} + +static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk) +{ + struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; + int i; + u8 rem_id; + + for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) { + rem_id = (&mprem->addrs_id)[i]; + if (!mptcp_rem_raddress(tcp_sk(sk)->mpcb, rem_id)) + mptcp_send_reset_rem_id(tcp_sk(sk)->mpcb, rem_id); + } +} + +static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk) +{ + struct tcphdr *th = tcp_hdr(skb); + unsigned char *ptr; + int length = (th->doff * 4) - sizeof(struct tcphdr); + + /* Jump through the options to check whether ADD_ADDR is there */ + ptr = (unsigned char *)(th + 1); + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) + return; + if (opsize > length) + return; /* don't parse partial options */ + if (opcode == TCPOPT_MPTCP && + ((struct mptcp_option *)ptr)->sub == 
MPTCP_SUB_ADD_ADDR) { +#if IS_ENABLED(CONFIG_IPV6) + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; + if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 && + opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) || + (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 && + opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) +#else + if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 && + opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) +#endif /* CONFIG_IPV6 */ + goto cont; + + mptcp_handle_add_addr(ptr, sk); + } + if (opcode == TCPOPT_MPTCP && + ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) { + if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) + goto cont; + + mptcp_handle_rem_addr(ptr, sk); + } +cont: + ptr += opsize - 2; + length -= opsize; + } + } + return; +} + +static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th) +{ + struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp; + struct sock *meta_sk = mptcp_meta_sk(sk); + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; + + if (unlikely(mptcp->rx_opt.mp_fail)) { + mptcp->rx_opt.mp_fail = 0; + + if (!th->rst && !mpcb->infinite_mapping_snd) { + struct sock *sk_it; + + mpcb->send_infinite_mapping = 1; + /* We resend everything that has not been acknowledged */ + meta_sk->sk_send_head = tcp_write_queue_head(meta_sk); + + /* We artificially restart the whole send-queue. Thus, + * it is as if no packets are in flight + */ + tcp_sk(meta_sk)->packets_out = 0; + + /* If the snd_nxt already wrapped around, we have to + * undo the wrapping, as we are restarting from snd_una + * on. + */ + if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) { + mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2; + mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1; + } + tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una; + + /* Trigger a sending on the meta. */ + mptcp_push_pending_frames(meta_sk); + + mptcp_for_each_sk(mpcb, sk_it) { + if (sk != sk_it) + mptcp_sub_force_close(sk_it); + } + } + + return 0; + } + + if (unlikely(mptcp->rx_opt.mp_fclose)) { + struct sock *sk_it, *tmpsk; + + mptcp->rx_opt.mp_fclose = 0; + if (mptcp->rx_opt.mptcp_key != mpcb->mptcp_loc_key) + return 0; + + if (tcp_need_reset(sk->sk_state)) + tcp_send_active_reset(sk, GFP_ATOMIC); + + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) + mptcp_sub_force_close(sk_it); + + tcp_reset(meta_sk); + + return 1; + } + + return 0; +} + +static inline void mptcp_path_array_check(struct sock *meta_sk) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + + if (unlikely(mpcb->list_rcvd)) { + mpcb->list_rcvd = 0; + if (mpcb->pm_ops->new_remote_address) + mpcb->pm_ops->new_remote_address(meta_sk); + } +} + +int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_options_received *mopt = &tp->mptcp->rx_opt; + + if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd) + return 0; + + if (mptcp_mp_fail_rcvd(sk, th)) + return 1; + + /* RFC 6824, Section 3.3: + * If a checksum is not present when its use has been negotiated, the + * receiver MUST close the subflow with a RST as it is considered broken. + */ + if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum && + !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) { + if (tcp_need_reset(sk->sk_state)) + tcp_send_active_reset(sk, GFP_ATOMIC); + + mptcp_sub_force_close(sk); + return 1; + } + + /* We have to acknowledge retransmissions of the third + * ack. 
+ */ + if (mopt->join_ack) { + tcp_send_delayed_ack(sk); + mopt->join_ack = 0; + } + + if (mopt->saw_add_addr || mopt->saw_rem_addr) { + if (mopt->more_add_addr || mopt->more_rem_addr) { + mptcp_parse_addropt(skb, sk); + } else { + if (mopt->saw_add_addr) + mptcp_handle_add_addr(mopt->add_addr_ptr, sk); + if (mopt->saw_rem_addr) + mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk); + } + + mopt->more_add_addr = 0; + mopt->saw_add_addr = 0; + mopt->more_rem_addr = 0; + mopt->saw_rem_addr = 0; + } + if (mopt->saw_low_prio) { + if (mopt->saw_low_prio == 1) { + tp->mptcp->rcv_low_prio = mopt->low_prio; + } else { + struct sock *sk_it; + mptcp_for_each_sk(tp->mpcb, sk_it) { + struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp; + if (mptcp->rem_id == mopt->prio_addr_id) + mptcp->rcv_low_prio = mopt->low_prio; + } + } + mopt->saw_low_prio = 0; + } + + mptcp_data_ack(sk, skb); + + mptcp_path_array_check(mptcp_meta_sk(sk)); + /* Socket may have been mp_killed by a REMOVE_ADDR */ + if (tp->mp_killed) + return 1; + + return 0; +} + +/* The skptr is needed, because if we become MPTCP-capable, we have to switch + * from meta-socket to master-socket. + * + * @return: 1 - we want to reset this connection + * 2 - we want to discard the received syn/ack + * 0 - everything is fine - continue + */ +int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr, + struct sk_buff *skb, + struct mptcp_options_received *mopt) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->mpc) { + u8 hash_mac_check[20]; + struct mptcp_cb *mpcb = tp->mpcb; + + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, + (u8 *)&mpcb->mptcp_loc_key, + (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce, + (u8 *)&tp->mptcp->mptcp_loc_nonce, + (u32 *)hash_mac_check); + if (memcmp(hash_mac_check, + (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) { + mptcp_sub_force_close(sk); + return 1; + } + + /* Set this flag in order to postpone data sending + * until the 4th ack arrives. + */ + tp->mptcp->pre_established = 1; + tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio; + + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key, + (u8 *)&mpcb->mptcp_rem_key, + (u8 *)&tp->mptcp->mptcp_loc_nonce, + (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce, + (u32 *)&tp->mptcp->sender_mac[0]); + + } else if (mopt->saw_mpc) { + if (mptcp_create_master_sk(sk, mopt->mptcp_key, + ntohs(tcp_hdr(skb)->window))) + return 2; + + sk = tcp_sk(sk)->mpcb->master_sk; + *skptr = sk; + tp = tcp_sk(sk); + + /* snd_nxt - 1, because it has been incremented + * by tcp_connect for the SYN + */ + tp->mptcp->snt_isn = tp->snd_nxt - 1; + tp->mpcb->dss_csum = mopt->dss_csum; + tp->mptcp->include_mpc = 1; + + sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket); + sk->sk_wq = mptcp_meta_sk(sk)->sk_wq; + + mptcp_update_metasocket(sk, mptcp_meta_sk(sk)); + + /* hold in mptcp_inherit_sk due to initialization to 2 */ + sock_put(sk); + } else { + tp->request_mptcp = 0; + + if (tp->inside_tk_table) + mptcp_hash_remove(tp); + } + + if (tp->mpc) + tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq; + + return 0; +} + +bool mptcp_should_expand_sndbuf(struct sock *meta_sk) +{ + struct sock *sk_it; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + int cnt_backups = 0; + int backup_available = 0; + + /* We circumvent this check in tcp_check_space, because we want to + * always call sk_write_space. So, we reproduce the check here. + */ + if (!meta_sk->sk_socket || + !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) + return false; + + /* If the user specified a specific send buffer setting, do + * not modify it. 
+ */ + if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK) + return false; + + /* If we are under global TCP memory pressure, do not expand. */ + if (sk_under_memory_pressure(meta_sk)) + return false; + + /* If we are under soft global TCP memory pressure, do not expand. */ + if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0)) + return false; + + + /* For MPTCP we look for a subsocket that could send data. + * If we found one, then we update the send-buffer. + */ + mptcp_for_each_sk(meta_tp->mpcb, sk_it) { + struct tcp_sock *tp_it = tcp_sk(sk_it); + + if (!mptcp_sk_can_send(sk_it)) + continue; + + /* Backup-flows have to be counted - if there is no other + * subflow we take the backup-flow into account. */ + if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) { + cnt_backups++; + } + + if (tp_it->packets_out < tp_it->snd_cwnd) { + if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) { + backup_available = 1; + continue; + } + return true; + } + } + + /* Backup-flow is available for sending - update send-buffer */ + if (meta_tp->mpcb->cnt_established == cnt_backups && backup_available) + return true; + return false; +} diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c new file mode 100644 index 0000000..d3e2780 --- /dev/null +++ b/net/mptcp/mptcp_ipv4.c @@ -0,0 +1,593 @@ +/* + * MPTCP implementation - IPv4-specific functions + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, + u32 seq) +{ + u32 hash[MD5_DIGEST_WORDS]; + + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = ((__force u16)sport << 16) + (__force u16)dport; + hash[3] = seq; + + md5_transform(hash, mptcp_secret); + + return hash[0]; +} + +u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) +{ + u32 hash[MD5_DIGEST_WORDS]; + + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = ((__force u16)sport << 16) + (__force u16)dport; + hash[3] = mptcp_key_seed++; + + md5_transform(hash, mptcp_secret); + + return *((u64 *)hash); +} + + +static void mptcp_v4_reqsk_destructor(struct request_sock *req) +{ + mptcp_reqsk_destructor(req); + + tcp_v4_reqsk_destructor(req); +} + +/* Similar to tcp_request_sock_ops */ +struct request_sock_ops mptcp_request_sock_ops __read_mostly = { + .family = PF_INET, + .obj_size = sizeof(struct mptcp_request_sock), + .rtx_syn_ack = tcp_v4_rtx_synack, + .send_ack = tcp_v4_reqsk_send_ack, + .destructor = mptcp_v4_reqsk_destructor, + .send_reset = tcp_v4_send_reset, + .syn_ack_timeout = tcp_syn_ack_timeout, +}; + +static void mptcp_v4_reqsk_queue_hash_add(struct sock *meta_sk, + struct request_sock *req, + unsigned long timeout) +{ + const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, + inet_rsk(req)->rmt_port, + 0, MPTCP_HASH_SIZE); + + inet_csk_reqsk_queue_hash_add(meta_sk, req, timeout); + + spin_lock(&mptcp_reqsk_hlock); + list_add(&mptcp_rsk(req)->collide_tuple, &mptcp_reqsk_htb[h]); + spin_unlock(&mptcp_reqsk_hlock); +} + +/* Similar to tcp_v4_conn_request */ +static void mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct tcp_options_received tmp_opt; + struct mptcp_options_received mopt; + struct request_sock *req; + struct inet_request_sock *ireq; + struct mptcp_request_sock *mtreq; + struct dst_entry *dst = NULL; + u8 mptcp_hash_mac[20]; + __be32 saddr = ip_hdr(skb)->saddr; + __be32 daddr = ip_hdr(skb)->daddr; + __u32 isn = TCP_SKB_CB(skb)->when; + int want_cookie = 0; + union inet_addr addr; + + tcp_clear_options(&tmp_opt); + mptcp_init_mp_opt(&mopt); + tmp_opt.mss_clamp = TCP_MSS_DEFAULT; + tmp_opt.user_mss = tcp_sk(meta_sk)->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL); + + req = inet_reqsk_alloc(&mptcp_request_sock_ops); + if (!req) + return; + + mtreq = mptcp_rsk(req); + mtreq->mpcb = mpcb; + INIT_LIST_HEAD(&mtreq->collide_tuple); + +#ifdef CONFIG_TCP_MD5SIG + tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; +#endif + + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + tcp_openreq_init(req, &tmp_opt, skb); + + ireq = inet_rsk(req); + ireq->loc_addr = daddr; + ireq->rmt_addr = saddr; + ireq->no_srccheck = inet_sk(meta_sk)->transparent; + ireq->opt = tcp_v4_save_options(skb); + + if (security_inet_conn_request(meta_sk, skb, req)) + goto drop_and_free; + + if (!want_cookie || tmp_opt.tstamp_ok) + TCP_ECN_create_request(req, skb, sock_net(meta_sk)); + + if (!isn) { + struct flowi4 fl4; + + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. 
+ * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tmp_opt.saw_tstamp && + tcp_death_row.sysctl_tw_recycle && + (dst = inet_csk_route_req(meta_sk, &fl4, req)) != NULL && + fl4.daddr == saddr) { + if (!tcp_peer_is_proven(req, dst, true)) { + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_PAWSPASSIVEREJECTED); + goto drop_and_release; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(meta_sk) < + (sysctl_max_syn_backlog >> 2)) && + !tcp_peer_is_proven(req, dst, false)) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. + */ + LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), + &saddr, ntohs(tcp_hdr(skb)->source)); + goto drop_and_release; + } + + isn = tcp_v4_init_sequence(skb); + } + tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_rsk(req)->listener = NULL; + + mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce; + mtreq->mptcp_rem_key = mpcb->mptcp_rem_key; + mtreq->mptcp_loc_key = mpcb->mptcp_loc_key; + mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(saddr, daddr, + tcp_hdr(skb)->source, + tcp_hdr(skb)->dest, isn); + mptcp_hmac_sha1((u8 *)&mtreq->mptcp_loc_key, + (u8 *)&mtreq->mptcp_rem_key, + (u8 *)&mtreq->mptcp_loc_nonce, + (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac); + mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac; + + addr.ip = ireq->loc_addr; + mtreq->loc_id = mpcb->pm_ops->get_local_id(AF_INET, &addr, sock_net(meta_sk)); + if (mtreq->loc_id == -1) /* Address not part of the allowed ones */ + goto drop_and_release; + mtreq->rem_id = mopt.rem_id; + mtreq->low_prio = mopt.low_prio; + tcp_rsk(req)->saw_mpc = 1; + + if (tcp_v4_send_synack(meta_sk, dst, req, skb_get_queue_mapping(skb))) + goto drop_and_free; + + /* Adding to request queue in metasocket */ + mptcp_v4_reqsk_queue_hash_add(meta_sk, req, TCP_TIMEOUT_INIT); + + return; + +drop_and_release: + dst_release(dst); +drop_and_free: + reqsk_free(req); + return; +} + +int mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id) +{ + int i; + + for (i = 0; i < MPTCP_MAX_ADDR; i++) { + if (!((1 << i) & mpcb->rem4_bits)) + continue; + + if (mpcb->remaddr4[i].id == id) { + /* remove address from bitfield */ + mpcb->rem4_bits &= ~(1 << i); + + return 0; + } + } + + return -1; +} + +/* Based on function tcp_v4_conn_request (tcp_ipv4.c) + * Returns -1 if there is no space anymore to store an additional + * address + */ +int mptcp_v4_add_raddress(struct mptcp_cb *mpcb, const struct in_addr *addr, + __be16 port, u8 id) +{ + int i; + struct mptcp_rem4 *rem4; + + mptcp_for_each_bit_set(mpcb->rem4_bits, i) { + rem4 = &mpcb->remaddr4[i]; + + /* Address is already in the list --- continue */ + if (rem4->id == id && + rem4->addr.s_addr == addr->s_addr && rem4->port == port) + return 0; + + /* This may be the case, when the peer is behind a NAT. He is + * trying to JOIN, thus sending the JOIN with a certain ID. + * However the src_addr of the IP-packet has been changed. We + * update the addr in the list, because this is the address as + * OUR BOX sees it. 
+ */ + if (rem4->id == id && rem4->addr.s_addr != addr->s_addr) { + /* update the address */ + mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n", + __func__, &rem4->addr.s_addr, + &addr->s_addr, id); + rem4->addr.s_addr = addr->s_addr; + rem4->port = port; + mpcb->list_rcvd = 1; + return 0; + } + } + + i = mptcp_find_free_index(mpcb->rem4_bits); + /* Do we have already the maximum number of local/remote addresses? */ + if (i < 0) { + mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n", + __func__, MPTCP_MAX_ADDR, &addr->s_addr); + return -1; + } + + rem4 = &mpcb->remaddr4[i]; + + /* Address is not known yet, store it */ + rem4->addr.s_addr = addr->s_addr; + rem4->port = port; + rem4->bitfield = 0; + rem4->retry_bitfield = 0; + rem4->id = id; + mpcb->list_rcvd = 1; + mpcb->rem4_bits |= (1 << i); + + return 0; +} + +/* Sets the bitfield of the remote-address field + * local address is not set as it will disappear with the global address-list + */ +void mptcp_v4_set_init_addr_bit(struct mptcp_cb *mpcb, __be32 daddr, u8 id) +{ + int i; + + mptcp_for_each_bit_set(mpcb->rem4_bits, i) { + if (mpcb->remaddr4[i].addr.s_addr == daddr) { + /* It's the initial flow - thus local index == 0 */ + mpcb->remaddr4[i].bitfield |= (1 << id); + return; + } + } +} + +/* We only process join requests here. (either the SYN or the final ACK) */ +int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sock *child, *rsk = NULL; + int ret; + + if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) { + struct tcphdr *th = tcp_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); + struct sock *sk; + + sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo, + iph->saddr, th->source, iph->daddr, + th->dest, inet_iif(skb)); + + if (!sk) { + kfree_skb(skb); + return 0; + } + if (is_meta_sk(sk)) { + WARN("%s Did not find a sub-sk - did found the meta!\n", __func__); + kfree_skb(skb); + sock_put(sk); + return 0; + } + + if (sk->sk_state == TCP_TIME_WAIT) { + inet_twsk_put(inet_twsk(sk)); + kfree_skb(skb); + return 0; + } + + ret = tcp_v4_do_rcv(sk, skb); + sock_put(sk); + + return ret; + } + TCP_SKB_CB(skb)->mptcp_flags = 0; + + /* Has been removed from the tk-table. Thus, no new subflows. + * + * Check for close-state is necessary, because we may have been closed + * without passing by mptcp_close(). + * + * When falling back, no new subflows are allowed either. + */ + if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table || + mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) + goto reset_and_discard; + + child = tcp_v4_hnd_req(meta_sk, skb); + + if (!child) + goto discard; + + if (child != meta_sk) { + sock_rps_save_rxhash(child, skb); + /* We don't call tcp_child_process here, because we hold + * already the meta-sk-lock and are sure that it is not owned + * by the user. + */ + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len); + bh_unlock_sock(child); + sock_put(child); + if (ret) { + rsk = child; + goto reset_and_discard; + } + } else { + if (tcp_hdr(skb)->syn) { + struct mp_join *join_opt = mptcp_find_join(skb); + /* Currently we make two calls to mptcp_find_join(). This + * can probably be optimized. 
+ */ + if (mptcp_v4_add_raddress(mpcb, + (struct in_addr *)&ip_hdr(skb)->saddr, + 0, + join_opt->addr_id) < 0) + goto reset_and_discard; + mpcb->list_rcvd = 0; + + mptcp_v4_join_request(meta_sk, skb); + goto discard; + } + goto reset_and_discard; + } + return 0; + +reset_and_discard: + tcp_v4_send_reset(rsk, skb); +discard: + kfree_skb(skb); + return 0; +} + +/* After this, the ref count of the meta_sk associated with the request_sock + * is incremented. Thus it is the responsibility of the caller + * to call sock_put() when the reference is not needed anymore. + */ +struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr, + const __be32 laddr, const struct net *net) +{ + struct mptcp_request_sock *mtreq; + struct sock *meta_sk = NULL; + + spin_lock(&mptcp_reqsk_hlock); + list_for_each_entry(mtreq, + &mptcp_reqsk_htb[inet_synq_hash(raddr, rport, 0, + MPTCP_HASH_SIZE)], + collide_tuple) { + struct inet_request_sock *ireq = inet_rsk(rev_mptcp_rsk(mtreq)); + meta_sk = mtreq->mpcb->meta_sk; + + if (ireq->rmt_port == rport && + ireq->rmt_addr == raddr && + ireq->loc_addr == laddr && + rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET && + net_eq(net, sock_net(meta_sk))) + break; + meta_sk = NULL; + } + + if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt))) + meta_sk = NULL; + spin_unlock(&mptcp_reqsk_hlock); + + return meta_sk; +} + +/* Create a new IPv4 subflow. + * + * We are in user-context and meta-sock-lock is hold. + */ +int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc, + struct mptcp_rem4 *rem) +{ + struct tcp_sock *tp; + struct sock *sk; + struct sockaddr_in loc_in, rem_in; + struct socket sock; + int ulid_size = 0, ret; + + /* Don't try again - even if it fails */ + rem->bitfield |= (1 << loc->id); + + /** First, create and prepare the new socket */ + + sock.type = meta_sk->sk_socket->type; + sock.state = SS_UNCONNECTED; + sock.wq = meta_sk->sk_socket->wq; + sock.file = meta_sk->sk_socket->file; + sock.ops = NULL; + + ret = inet_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1); + if (unlikely(ret < 0)) { + mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret); + return ret; + } + + sk = sock.sk; + tp = tcp_sk(sk); + + /* All subsockets need the MPTCP-lock-class */ + lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP"); + lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0); + + if (mptcp_add_sock(meta_sk, sk, loc->id, rem->id, GFP_KERNEL)) + goto error; + + tp->mptcp->slave_sk = 1; + tp->mptcp->low_prio = loc->low_prio; + + /* Initializing the timer for an MPTCP subflow */ + setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk); + + /** Then, connect the socket to the peer */ + + ulid_size = sizeof(struct sockaddr_in); + loc_in.sin_family = AF_INET; + rem_in.sin_family = AF_INET; + loc_in.sin_port = 0; + if (rem->port) + rem_in.sin_port = rem->port; + else + rem_in.sin_port = inet_sk(meta_sk)->inet_dport; + loc_in.sin_addr = loc->addr; + rem_in.sin_addr = rem->addr; + + ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, ulid_size); + if (ret < 0) { + mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n", + __func__, ret); + goto error; + } + + mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d\n", + __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, + tp->mptcp->path_index, &loc_in.sin_addr, + ntohs(loc_in.sin_port), &rem_in.sin_addr, + ntohs(rem_in.sin_port)); + + ret = sock.ops->connect(&sock, (struct 
sockaddr *)&rem_in, + ulid_size, O_NONBLOCK); + if (ret < 0 && ret != -EINPROGRESS) { + mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n", + __func__, ret); + goto error; + } + + sk_set_socket(sk, meta_sk->sk_socket); + sk->sk_wq = meta_sk->sk_wq; + + return 0; + +error: + /* May happen if mptcp_add_sock fails first */ + if (!tp->mpc) { + tcp_close(sk, 0); + } else { + local_bh_disable(); + mptcp_sub_force_close(sk); + local_bh_enable(); + } + return ret; +} +EXPORT_SYMBOL(mptcp_init4_subsockets); + +/* General initialization of IPv4 for MPTCP */ +int mptcp_pm_v4_init(void) +{ + int ret = 0; + struct request_sock_ops *ops = &mptcp_request_sock_ops; + + ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP"); + if (ops->slab_name == NULL) { + ret = -ENOMEM; + goto out; + } + + ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0, + SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN, + NULL); + + if (ops->slab == NULL) { + ret = -ENOMEM; + goto err_reqsk_create; + } + +out: + return ret; + +err_reqsk_create: + kfree(ops->slab_name); + ops->slab_name = NULL; + goto out; +} + +void mptcp_pm_v4_undo(void) +{ + kmem_cache_destroy(mptcp_request_sock_ops.slab); + kfree(mptcp_request_sock_ops.slab_name); +} + + diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c new file mode 100644 index 0000000..6f8effb --- /dev/null +++ b/net/mptcp/mptcp_ipv6.c @@ -0,0 +1,813 @@ +/* + * MPTCP implementation - IPv6-specific functions + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer: + * Jaakko Korkeaniemi + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int mptcp_v6v4_send_synack(struct sock *meta_sk, struct request_sock *req, + u16 queue_mapping); + +__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport, u32 seq) +{ + u32 secret[MD5_MESSAGE_BYTES / 4]; + u32 hash[MD5_DIGEST_WORDS]; + u32 i; + + memcpy(hash, saddr, 16); + for (i = 0; i < 4; i++) + secret[i] = mptcp_secret[i] + (__force u32)daddr[i]; + secret[4] = mptcp_secret[4] + + (((__force u16)sport << 16) + (__force u16)dport); + secret[5] = seq; + for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++) + secret[i] = mptcp_secret[i]; + + md5_transform(hash, secret); + + return hash[0]; +} + +u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport) +{ + u32 secret[MD5_MESSAGE_BYTES / 4]; + u32 hash[MD5_DIGEST_WORDS]; + u32 i; + + memcpy(hash, saddr, 16); + for (i = 0; i < 4; i++) + secret[i] = mptcp_secret[i] + (__force u32)daddr[i]; + secret[4] = mptcp_secret[4] + + (((__force u16)sport << 16) + (__force u16)dport); + secret[5] = mptcp_key_seed++; + for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) + secret[i] = mptcp_secret[i]; + + md5_transform(hash, secret); + + return *((u64 *)hash); +} + +static void mptcp_v6_reqsk_destructor(struct request_sock *req) +{ + mptcp_reqsk_destructor(req); + + tcp_v6_reqsk_destructor(req); +} + +/* Similar to tcp_v6_rtx_synack */ +static int mptcp_v6_rtx_synack(struct sock *meta_sk, struct request_sock *req) +{ + if (meta_sk->sk_family == AF_INET6) + return tcp_v6_rtx_synack(meta_sk, req); + + TCP_INC_STATS_BH(sock_net(meta_sk), TCP_MIB_RETRANSSEGS); + return mptcp_v6v4_send_synack(meta_sk, req, 0); +} + +/* Similar to tcp6_request_sock_ops */ +struct request_sock_ops mptcp6_request_sock_ops __read_mostly = { + .family = AF_INET6, + .obj_size = sizeof(struct mptcp6_request_sock), + .rtx_syn_ack = mptcp_v6_rtx_synack, + .send_ack = tcp_v6_reqsk_send_ack, + .destructor = mptcp_v6_reqsk_destructor, + .send_reset = tcp_v6_send_reset, + .syn_ack_timeout = tcp_syn_ack_timeout, +}; + +static void mptcp_v6_reqsk_queue_hash_add(struct sock *meta_sk, + struct request_sock *req, + unsigned long timeout) +{ + const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr, + inet_rsk(req)->rmt_port, + 0, MPTCP_HASH_SIZE); + + inet6_csk_reqsk_queue_hash_add(meta_sk, req, timeout); + + spin_lock(&mptcp_reqsk_hlock); + list_add(&mptcp_rsk(req)->collide_tuple, &mptcp_reqsk_htb[h]); + spin_unlock(&mptcp_reqsk_hlock); +} + +/* Similar to tcp_v6_send_synack + * + * The meta-socket is IPv4, but a new subsocket is IPv6 + */ +static int mptcp_v6v4_send_synack(struct sock *meta_sk, struct request_sock *req, + u16 queue_mapping) +{ + struct inet6_request_sock *treq = inet6_rsk(req); + struct sk_buff *skb; + struct flowi6 fl6; + struct dst_entry *dst; + int err = -ENOMEM; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.daddr = treq->rmt_addr; + fl6.saddr = treq->loc_addr; + fl6.flowlabel = 0; + fl6.flowi6_oif = treq->iif; + fl6.flowi6_mark = meta_sk->sk_mark; + fl6.fl6_dport = inet_rsk(req)->rmt_port; + fl6.fl6_sport = inet_rsk(req)->loc_port; + security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(meta_sk, &fl6, NULL, false); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + return err; + } + skb = tcp_make_synack(meta_sk, dst, req, NULL); + + if (skb) { + __tcp_v6_send_check(skb, &treq->loc_addr, 
&treq->rmt_addr); + + fl6.daddr = treq->rmt_addr; + skb_set_queue_mapping(skb, queue_mapping); + err = ip6_xmit(meta_sk, skb, &fl6, NULL, 0); + err = net_xmit_eval(err); + } + + return err; +} + +/* Similar to tcp_v6_syn_recv_sock + * + * The meta-socket is IPv4, but a new subsocket is IPv6 + */ +struct sock *mptcp_v6v4_syn_recv_sock(struct sock *meta_sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet6_request_sock *treq; + struct ipv6_pinfo *newnp; + struct tcp6_sock *newtcp6sk; + struct inet_sock *newinet; + struct tcp_sock *newtp; + struct sock *newsk; + + treq = inet6_rsk(req); + + if (sk_acceptq_is_full(meta_sk)) + goto out_overflow; + + if (!dst) { + /* This code is similar to inet6_csk_route_req, but as we + * don't have a np-pointer in the meta, we have to do it + * manually. + */ + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.daddr = treq->rmt_addr; + fl6.saddr = treq->loc_addr; + fl6.flowi6_oif = treq->iif; + fl6.flowi6_mark = meta_sk->sk_mark; + fl6.fl6_dport = inet_rsk(req)->rmt_port; + fl6.fl6_sport = inet_rsk(req)->loc_port; + security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(meta_sk, &fl6, NULL, false); + if (IS_ERR(dst)) + goto out; + } + + newsk = tcp_create_openreq_child(meta_sk, req, skb); + if (newsk == NULL) + goto out_nonewsk; + + /* Diff to tcp_v6_syn_recv_sock: Must do this prior to __ip6_dst_store, + * as it tries to access the pinet6-pointer. + */ + newtcp6sk = (struct tcp6_sock *)newsk; + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks + * count here, tcp_create_openreq_child now does this for us, see the + * comment in that function for the gory details. -acme + */ + + newsk->sk_gso_type = SKB_GSO_TCPV6; + __ip6_dst_store(newsk, dst, NULL, NULL); + inet6_sk_rx_dst_set(newsk, skb); + + newtp = tcp_sk(newsk); + newinet = inet_sk(newsk); + newnp = inet6_sk(newsk); + + newnp->daddr = treq->rmt_addr; + newnp->saddr = treq->loc_addr; + newnp->rcv_saddr = treq->loc_addr; + newsk->sk_bound_dev_if = treq->iif; + + /* Now IPv6 options... + + First: no IPv4 options. 
+ */ + newinet->inet_opt = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; + newnp->rxopt.all = 0; + + /* Clone pktoptions received with SYN */ + newnp->pktoptions = NULL; + if (treq->pktopts != NULL) { + newnp->pktoptions = skb_clone(treq->pktopts, + sk_gfp_atomic(meta_sk, GFP_ATOMIC)); + consume_skb(treq->pktopts); + treq->pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; + newnp->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb)); + + /* Initialization copied from inet6_create - normally this should have + * been handled by the memcpy as in tcp_v6_syn_recv_sock + */ + newnp->hop_limit = -1; + newnp->mc_loop = 1; + newnp->pmtudisc = IPV6_PMTUDISC_WANT; + (void)xchg(&newnp->rxpmtu, NULL); + + inet_csk(newsk)->icsk_ext_hdr_len = 0; + + tcp_mtup_init(newsk); + tcp_sync_mss(newsk, dst_mtu(dst)); + newtp->advmss = dst_metric_advmss(dst); + if (tcp_sk(meta_sk)->rx_opt.user_mss && + tcp_sk(meta_sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(meta_sk)->rx_opt.user_mss; + + tcp_initialize_rcv_mss(newsk); + + newinet->inet_daddr = LOOPBACK4_IPV6; + newinet->inet_saddr = LOOPBACK4_IPV6; + newinet->inet_rcv_saddr = LOOPBACK4_IPV6; + + if (__inet_inherit_port(meta_sk, newsk) < 0) { + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); + goto out; + } + __inet6_hash(newsk, NULL); + + return newsk; + +out_overflow: + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_LISTENOVERFLOWS); +out_nonewsk: + dst_release(dst); +out: + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_LISTENDROPS); + return NULL; +} + +/* Similar to tcp_v6_conn_request */ +static void mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct tcp_options_received tmp_opt; + struct mptcp_options_received mopt; + struct ipv6_pinfo *np = inet6_sk(meta_sk); + struct request_sock *req; + struct inet6_request_sock *treq; + struct mptcp_request_sock *mtreq; + u8 mptcp_hash_mac[20]; + __u32 isn = TCP_SKB_CB(skb)->when; + struct dst_entry *dst = NULL; + struct flowi6 fl6; + int want_cookie = 0; + union inet_addr addr; + + tcp_clear_options(&tmp_opt); + mptcp_init_mp_opt(&mopt); + tmp_opt.mss_clamp = TCP_MSS_DEFAULT; + tmp_opt.user_mss = tcp_sk(meta_sk)->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL); + + req = inet6_reqsk_alloc(&mptcp6_request_sock_ops); + if (!req) + return; + + mtreq = mptcp_rsk(req); + mtreq->mpcb = mpcb; + INIT_LIST_HEAD(&mtreq->collide_tuple); + +#ifdef CONFIG_TCP_MD5SIG + tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; +#endif + + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + tcp_openreq_init(req, &tmp_opt, skb); + + treq = inet6_rsk(req); + treq->rmt_addr = ipv6_hdr(skb)->saddr; + treq->loc_addr = ipv6_hdr(skb)->daddr; + + if (!want_cookie || tmp_opt.tstamp_ok) + TCP_ECN_create_request(req, skb, sock_net(meta_sk)); + + treq->iif = meta_sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!meta_sk->sk_bound_dev_if && + ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) + treq->iif = inet6_iif(skb); + + if (!isn) { + if (meta_sk->sk_family == AF_INET6 && + (ipv6_opt_accepted(meta_sk, skb) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)) { + atomic_inc(&skb->users); + treq->pktopts = skb; + } + + /* VJ's idea. 
We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tmp_opt.saw_tstamp && + tcp_death_row.sysctl_tw_recycle && + (dst = inet6_csk_route_req(meta_sk, &fl6, req)) != NULL) { + if (!tcp_peer_is_proven(req, dst, true)) { + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_PAWSPASSIVEREJECTED); + goto drop_and_release; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(meta_sk) < + (sysctl_max_syn_backlog >> 2)) && + !tcp_peer_is_proven(req, dst, false)) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. + */ + LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n", + &treq->rmt_addr, ntohs(tcp_hdr(skb)->source)); + goto drop_and_release; + } + + isn = tcp_v6_init_sequence(skb); + } + + tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_rsk(req)->listener = NULL; + + mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce; + mtreq->mptcp_rem_key = mpcb->mptcp_rem_key; + mtreq->mptcp_loc_key = mpcb->mptcp_loc_key; + mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source, isn); + mptcp_hmac_sha1((u8 *)&mtreq->mptcp_loc_key, + (u8 *)&mtreq->mptcp_rem_key, + (u8 *)&mtreq->mptcp_loc_nonce, + (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac); + mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac; + + addr.in6 = treq->loc_addr; + mtreq->loc_id = mpcb->pm_ops->get_local_id(AF_INET6, &addr, sock_net(meta_sk)); + if (mtreq->loc_id == -1) /* Address not part of the allowed ones */ + goto drop_and_release; + mtreq->rem_id = mopt.rem_id; + mtreq->low_prio = mopt.low_prio; + tcp_rsk(req)->saw_mpc = 1; + + if (meta_sk->sk_family == AF_INET6) { + if (tcp_v6_send_synack(meta_sk, dst, &fl6, req, + skb_get_queue_mapping(skb))) + goto drop_and_free; + } else { + if (mptcp_v6v4_send_synack(meta_sk, req, skb_get_queue_mapping(skb))) + goto drop_and_free; + } + + /* Adding to request queue in metasocket */ + mptcp_v6_reqsk_queue_hash_add(meta_sk, req, TCP_TIMEOUT_INIT); + + return; + +drop_and_release: + dst_release(dst); +drop_and_free: + reqsk_free(req); + return; +} + +int mptcp_v6_rem_raddress(struct mptcp_cb *mpcb, u8 id) +{ + int i; + + for (i = 0; i < MPTCP_MAX_ADDR; i++) { + if (!((1 << i) & mpcb->rem6_bits)) + continue; + + if (mpcb->remaddr6[i].id == id) { + /* remove address from bitfield */ + mpcb->rem6_bits &= ~(1 << i); + + return 0; + } + } + + return -1; +} + +/* Returns -1 if there is no space anymore to store an additional + * address + */ +int mptcp_v6_add_raddress(struct mptcp_cb *mpcb, const struct in6_addr *addr, + __be16 port, u8 id) +{ + int i; + struct mptcp_rem6 *rem6; + + mptcp_for_each_bit_set(mpcb->rem6_bits, i) { + rem6 = &mpcb->remaddr6[i]; + + /* Address is already in the list --- continue */ + if (rem6->id == id && + ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port) + return 0; + + /* This may be the case, when the peer is behind a NAT. 
He is + * trying to JOIN, thus sending the JOIN with a certain ID. + * However the src_addr of the IP-packet has been changed. We + * update the addr in the list, because this is the address as + * OUR BOX sees it. + */ + if (rem6->id == id) { + /* update the address */ + mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n", + __func__, &rem6->addr, addr, id); + rem6->addr = *addr; + rem6->port = port; + mpcb->list_rcvd = 1; + return 0; + } + } + + i = mptcp_find_free_index(mpcb->rem6_bits); + /* Do we have already the maximum number of local/remote addresses? */ + if (i < 0) { + mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n", + __func__, MPTCP_MAX_ADDR, addr); + return -1; + } + + rem6 = &mpcb->remaddr6[i]; + + /* Address is not known yet, store it */ + rem6->addr = *addr; + rem6->port = port; + rem6->bitfield = 0; + rem6->retry_bitfield = 0; + rem6->id = id; + mpcb->list_rcvd = 1; + mpcb->rem6_bits |= (1 << i); + + return 0; +} + +/* Sets the bitfield of the remote-address field + * local address is not set as it will disappear with the global address-list + */ +void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb, + const struct in6_addr *daddr, u8 id) +{ + int i; + mptcp_for_each_bit_set(mpcb->rem6_bits, i) { + if (ipv6_addr_equal(&mpcb->remaddr6[i].addr, daddr)) { + /* It's the initial flow - thus local index == 0 */ + mpcb->remaddr6[i].bitfield |= (1 << (id - MPTCP_MAX_ADDR)); + return; + } + } +} + +int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sock *child, *rsk = NULL; + int ret; + + if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) { + struct tcphdr *th = tcp_hdr(skb); + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct sock *sk; + + sk = __inet6_lookup_established(sock_net(meta_sk), + &tcp_hashinfo, + &ip6h->saddr, th->source, + &ip6h->daddr, ntohs(th->dest), + inet6_iif(skb)); + + if (!sk) { + kfree_skb(skb); + return 0; + } + if (is_meta_sk(sk)) { + WARN("%s Did not find a sub-sk!\n", __func__); + kfree_skb(skb); + sock_put(sk); + return 0; + } + + if (sk->sk_state == TCP_TIME_WAIT) { + inet_twsk_put(inet_twsk(sk)); + kfree_skb(skb); + return 0; + } + + ret = tcp_v6_do_rcv(sk, skb); + sock_put(sk); + + return ret; + } + TCP_SKB_CB(skb)->mptcp_flags = 0; + + /* Has been removed from the tk-table. Thus, no new subflows. + * + * Check for close-state is necessary, because we may have been closed + * without passing by mptcp_close(). + * + * When falling back, no new subflows are allowed either. + */ + if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table || + mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) + goto reset_and_discard; + + child = tcp_v6_hnd_req(meta_sk, skb); + + if (!child) + goto discard; + + if (child != meta_sk) { + sock_rps_save_rxhash(child, skb); + /* We don't call tcp_child_process here, because we hold + * already the meta-sk-lock and are sure that it is not owned + * by the user. + */ + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len); + bh_unlock_sock(child); + sock_put(child); + if (ret) { + rsk = child; + goto reset_and_discard; + } + } else { + if (tcp_hdr(skb)->syn) { + struct mp_join *join_opt = mptcp_find_join(skb); + /* Currently we make two calls to mptcp_find_join(). This + * can probably be optimized. 
*/ + if (mptcp_v6_add_raddress(mpcb, + (struct in6_addr *)&ipv6_hdr(skb)->saddr, + 0, + join_opt->addr_id) < 0) + goto reset_and_discard; + mpcb->list_rcvd = 0; + + mptcp_v6_join_request(meta_sk, skb); + goto discard; + } + goto reset_and_discard; + } + return 0; + +reset_and_discard: + tcp_v6_send_reset(rsk, skb); +discard: + kfree_skb(skb); + return 0; +} + +/* After this, the ref count of the meta_sk associated with the request_sock + * is incremented. Thus it is the responsibility of the caller + * to call sock_put() when the reference is not needed anymore. + */ +struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr, + const struct in6_addr *laddr, const struct net *net) +{ + struct mptcp_request_sock *mtreq; + struct sock *meta_sk = NULL; + + spin_lock(&mptcp_reqsk_hlock); + list_for_each_entry(mtreq, + &mptcp_reqsk_htb[inet6_synq_hash(raddr, rport, 0, + MPTCP_HASH_SIZE)], + collide_tuple) { + struct inet6_request_sock *treq = inet6_rsk(rev_mptcp_rsk(mtreq)); + meta_sk = mtreq->mpcb->meta_sk; + + if (inet_rsk(rev_mptcp_rsk(mtreq))->rmt_port == rport && + rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET6 && + ipv6_addr_equal(&treq->rmt_addr, raddr) && + ipv6_addr_equal(&treq->loc_addr, laddr) && + net_eq(net, sock_net(meta_sk))) + break; + meta_sk = NULL; + } + + if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt))) + meta_sk = NULL; + spin_unlock(&mptcp_reqsk_hlock); + + return meta_sk; +} + +/* Create a new IPv6 subflow. + * + * We are in user-context and meta-sock-lock is hold. + */ +int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc, + struct mptcp_rem6 *rem) +{ + struct tcp_sock *tp; + struct sock *sk; + struct sockaddr_in6 loc_in, rem_in; + struct socket sock; + int ulid_size = 0, ret; + + /* Don't try again - even if it fails. + * There is a special case as the IPv6 address of the initial subflow + * has an id = 0. The other ones have id's in the range [8, 16[. 
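+	 * The shift below therefore uses (loc->id - MPTCP_MAX_ADDR) as the
+	 * bit position inside rem->bitfield.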
+ */ + rem->bitfield |= (1 << (loc->id - MPTCP_MAX_ADDR)); + + /** First, create and prepare the new socket */ + + sock.type = meta_sk->sk_socket->type; + sock.state = SS_UNCONNECTED; + sock.wq = meta_sk->sk_socket->wq; + sock.file = meta_sk->sk_socket->file; + sock.ops = NULL; + + ret = inet6_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1); + if (unlikely(ret < 0)) { + mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret); + return ret; + } + + sk = sock.sk; + tp = tcp_sk(sk); + + /* All subsockets need the MPTCP-lock-class */ + lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP"); + lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0); + + if (mptcp_add_sock(meta_sk, sk, loc->id, rem->id, GFP_KERNEL)) + goto error; + + tp->mptcp->slave_sk = 1; + tp->mptcp->low_prio = loc->low_prio; + + /* Initializing the timer for an MPTCP subflow */ + setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk); + + /** Then, connect the socket to the peer */ + + ulid_size = sizeof(struct sockaddr_in6); + loc_in.sin6_family = AF_INET6; + rem_in.sin6_family = AF_INET6; + loc_in.sin6_port = 0; + if (rem->port) + rem_in.sin6_port = rem->port; + else + rem_in.sin6_port = inet_sk(meta_sk)->inet_dport; + loc_in.sin6_addr = loc->addr; + rem_in.sin6_addr = rem->addr; + + ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, ulid_size); + if (ret < 0) { + mptcp_debug("%s: MPTCP subsocket bind()failed, error %d\n", + __func__, ret); + goto error; + } + + mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d\n", + __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, + tp->mptcp->path_index, &loc_in.sin6_addr, + ntohs(loc_in.sin6_port), &rem_in.sin6_addr, + ntohs(rem_in.sin6_port)); + + ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in, + ulid_size, O_NONBLOCK); + if (ret < 0 && ret != -EINPROGRESS) { + mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n", + __func__, ret); + goto error; + } + + sk_set_socket(sk, meta_sk->sk_socket); + sk->sk_wq = meta_sk->sk_wq; + + return 0; + +error: + /* May happen if mptcp_add_sock fails first */ + if (!tp->mpc) { + tcp_close(sk, 0); + } else { + local_bh_disable(); + mptcp_sub_force_close(sk); + local_bh_enable(); + } + return ret; +} +EXPORT_SYMBOL(mptcp_init6_subsockets); + +int mptcp_pm_v6_init(void) +{ + int ret = 0; + struct request_sock_ops *ops = &mptcp6_request_sock_ops; + + ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6"); + if (ops->slab_name == NULL) { + ret = -ENOMEM; + goto out; + } + + ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0, + SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN, + NULL); + + if (ops->slab == NULL) { + ret = -ENOMEM; + goto err_reqsk_create; + } + +out: + return ret; + +err_reqsk_create: + kfree(ops->slab_name); + ops->slab_name = NULL; + goto out; +} + +void mptcp_pm_v6_undo(void) +{ + kmem_cache_destroy(mptcp6_request_sock_ops.slab); + kfree(mptcp6_request_sock_ops.slab_name); +} diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c new file mode 100644 index 0000000..7775f4e --- /dev/null +++ b/net/mptcp/mptcp_ndiffports.c @@ -0,0 +1,170 @@ +#include + +#include +#include + +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif + +struct ndiffports_priv { + /* Worker struct for subflow establishment */ + struct work_struct subflow_work; + + struct mptcp_cb *mpcb; +}; + +static int sysctl_mptcp_ndiffports __read_mostly = 2; + +/** + * Create all new subflows, by doing calls to 
mptcp_initX_subsockets + * + * This function uses a goto next_subflow, to allow releasing the lock between + * new subflows and giving other processes a chance to do some work on the + * socket and potentially finishing the communication. + **/ +static void create_subflow_worker(struct work_struct *work) +{ + struct ndiffports_priv *pm_priv = container_of(work, + struct ndiffports_priv, + subflow_work); + struct mptcp_cb *mpcb = pm_priv->mpcb; + struct sock *meta_sk = mpcb->meta_sk; + int iter = 0; + +next_subflow: + if (iter) { + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + + yield(); + } + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + iter++; + + if (sock_flag(meta_sk, SOCK_DEAD)) + goto exit; + + if (mpcb->master_sk && + !tcp_sk(mpcb->master_sk)->mptcp->fully_established) + goto exit; + + if (sysctl_mptcp_ndiffports > iter && + sysctl_mptcp_ndiffports > mpcb->cnt_subflows) { + if (meta_sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(meta_sk)) { + struct mptcp_loc4 loc; + + loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr; + loc.id = 0; + loc.low_prio = 0; + + mptcp_init4_subsockets(meta_sk, &loc, &mpcb->remaddr4[0]); + } else { +#if IS_ENABLED(CONFIG_IPV6) + struct mptcp_loc6 loc; + + loc.addr = inet6_sk(meta_sk)->saddr; + loc.id = 0; + loc.low_prio = 0; + + mptcp_init6_subsockets(meta_sk, &loc, &mpcb->remaddr6[0]); +#endif + } + goto next_subflow; + } + +exit: + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + sock_put(meta_sk); +} + +static void ndiffports_new_session(struct sock *meta_sk, int id) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0]; + + /* Initialize workqueue-struct */ + INIT_WORK(&fmp->subflow_work, create_subflow_worker); + fmp->mpcb = mpcb; +} + +static void ndiffports_create_subflows(struct sock *meta_sk) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0]; + + if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv || + mpcb->send_infinite_mapping || + mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) + return; + + if (!work_pending(&pm_priv->subflow_work)) { + sock_hold(meta_sk); + queue_work(mptcp_wq, &pm_priv->subflow_work); + } +} + +static int ndiffports_get_local_id(sa_family_t family, union inet_addr *addr, + struct net *net) +{ + return 0; +} + +static struct mptcp_pm_ops ndiffports __read_mostly = { + .new_session = ndiffports_new_session, + .fully_established = ndiffports_create_subflows, + .get_local_id = ndiffports_get_local_id, + .name = "ndiffports", + .owner = THIS_MODULE, +}; + +static struct ctl_table ndiff_table[] = { + { + .procname = "mptcp_ndiffports", + .data = &sysctl_mptcp_ndiffports, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { } +}; + +struct ctl_table_header *mptcp_sysctl; + +/* General initialization of MPTCP_PM */ +static int __init ndiffports_register(void) +{ + BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE); + + mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", ndiff_table); + if (!mptcp_sysctl) + goto exit; + + if (mptcp_register_path_manager(&ndiffports)) + goto pm_failed; + + return 0; + +pm_failed: + unregister_net_sysctl_table(mptcp_sysctl); +exit: + return -1; +} + +static void ndiffports_unregister(void) +{ + mptcp_unregister_path_manager(&ndiffports); + unregister_net_sysctl_table(mptcp_sysctl); +} + +module_init(ndiffports_register); 
+module_exit(ndiffports_unregister); + +MODULE_AUTHOR("Christoph Paasch"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("NDIFF-PORTS MPTCP"); +MODULE_VERSION("0.88"); diff --git a/net/mptcp/mptcp_ofo_queue.c b/net/mptcp/mptcp_ofo_queue.c new file mode 100644 index 0000000..e182855 --- /dev/null +++ b/net/mptcp/mptcp_ofo_queue.c @@ -0,0 +1,278 @@ +/* + * MPTCP implementation - Fast algorithm for MPTCP meta-reordering + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb, + const struct sk_buff *skb) +{ + struct tcp_sock *tp; + + mptcp_for_each_tp(mpcb, tp) { + if (tp->mptcp->shortcut_ofoqueue == skb) { + tp->mptcp->shortcut_ofoqueue = NULL; + return; + } + } +} + +/* Does 'skb' fits after 'here' in the queue 'head' ? + * If yes, we queue it and return 1 + */ +static int mptcp_ofo_queue_after(struct sk_buff_head *head, + struct sk_buff *skb, struct sk_buff *here, + struct tcp_sock *tp) +{ + struct sock *meta_sk = tp->meta_sk; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + u32 seq = TCP_SKB_CB(skb)->seq; + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + /* We want to queue skb after here, thus seq >= end_seq */ + if (before(seq, TCP_SKB_CB(here)->end_seq)) + return 0; + + if (seq == TCP_SKB_CB(here)->end_seq) { + bool fragstolen = false; + + if (!tcp_try_coalesce(meta_sk, here, skb, &fragstolen)) { + __skb_queue_after(&meta_tp->out_of_order_queue, here, skb); + return 1; + } else { + kfree_skb_partial(skb, fragstolen); + return -1; + } + } + + /* If here is the last one, we can always queue it */ + if (skb_queue_is_last(head, here)) { + __skb_queue_after(head, here, skb); + return 1; + } else { + struct sk_buff *skb1 = skb_queue_next(head, here); + /* It's not the last one, but does it fits between 'here' and + * the one after 'here' ? Thus, does end_seq <= after_here->seq + */ + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) { + __skb_queue_after(head, here, skb); + return 1; + } + } + + return 0; +} + +static void try_shortcut(struct sk_buff *shortcut, struct sk_buff *skb, + struct sk_buff_head *head, struct tcp_sock *tp) +{ + struct sock *meta_sk = tp->meta_sk; + struct tcp_sock *tp_it, *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct sk_buff *skb1, *best_shortcut = NULL; + u32 seq = TCP_SKB_CB(skb)->seq; + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + u32 distance = 0xffffffff; + + /* First, check the tp's shortcut */ + if (!shortcut) { + if (skb_queue_empty(head)) { + __skb_queue_head(head, skb); + goto end; + } + } else { + int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp); + /* Does the tp's shortcut is a hit? If yes, we insert. */ + + if (ret) { + skb = (ret > 0) ? skb : NULL; + goto end; + } + } + + /* Check the shortcuts of the other subsockets. */ + mptcp_for_each_tp(mpcb, tp_it) { + shortcut = tp_it->mptcp->shortcut_ofoqueue; + /* Can we queue it here? If yes, do so! 
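+		 * (mptcp_ofo_queue_after() returns 1 if the skb was queued
+		 * after 'shortcut', -1 if it was coalesced into it, and 0 if
+		 * it does not fit there.)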
*/ + if (shortcut) { + int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp); + + if (ret) { + skb = (ret > 0) ? skb : NULL; + goto end; + } + } + + /* Could not queue it, check if we are close. + * We are looking for a shortcut, close enough to seq to + * set skb1 prematurely and thus improve the subsequent lookup, + * which tries to find a skb1 so that skb1->seq <= seq. + * + * So, here we only take shortcuts, whose shortcut->seq > seq, + * and minimize the distance between shortcut->seq and seq and + * set best_shortcut to this one with the minimal distance. + * + * That way, the subsequent while-loop is shortest. + */ + if (shortcut && after(TCP_SKB_CB(shortcut)->seq, seq)) { + /* Are we closer than the current best shortcut? */ + if ((u32)(TCP_SKB_CB(shortcut)->seq - seq) < distance) { + distance = (u32)(TCP_SKB_CB(shortcut)->seq - seq); + best_shortcut = shortcut; + } + } + } + + if (best_shortcut) + skb1 = best_shortcut; + else + skb1 = skb_peek_tail(head); + + if (seq == TCP_SKB_CB(skb1)->end_seq) { + bool fragstolen = false; + + if (!tcp_try_coalesce(meta_sk, skb1, skb, &fragstolen)) { + __skb_queue_after(&meta_tp->out_of_order_queue, skb1, skb); + } else { + kfree_skb_partial(skb, fragstolen); + skb = NULL; + } + + goto end; + } + + /* Find the insertion point, starting from best_shortcut if available. + * + * Inspired from tcp_data_queue_ofo. + */ + while (1) { + /* skb1->seq <= seq */ + if (!after(TCP_SKB_CB(skb1)->seq, seq)) + break; + if (skb_queue_is_first(head, skb1)) { + skb1 = NULL; + break; + } + skb1 = skb_queue_prev(head, skb1); + } + + /* Do skb overlap to previous one? */ + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. */ + __kfree_skb(skb); + skb = NULL; + goto end; + } + if (seq == TCP_SKB_CB(skb1)->seq) { + if (skb_queue_is_first(head, skb1)) + skb1 = NULL; + else + skb1 = skb_queue_prev(head, skb1); + } + } + if (!skb1) + __skb_queue_head(head, skb); + else + __skb_queue_after(head, skb1, skb); + + /* And clean segments covered by new one as whole. */ + while (!skb_queue_is_last(head, skb)) { + skb1 = skb_queue_next(head, skb); + + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) + break; + + __skb_unlink(skb1, head); + mptcp_remove_shortcuts(mpcb, skb1); + __kfree_skb(skb1); + } + +end: + if (skb) { + skb_set_owner_r(skb, meta_sk); + tp->mptcp->shortcut_ofoqueue = skb; + } + + return; +} + +/** + * @sk: the subflow that received this skb. 
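+ * @meta_sk: the meta-socket whose out-of-order queue receives the skb.
+ * @skb: the out-of-order segment to queue at the meta-level.
+ *
+ * Inserts @skb into the meta-level out-of-order queue, using the
+ * subflow's shortcut_ofoqueue pointer as a hint for the insertion point.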
+ */ +void mptcp_add_meta_ofo_queue(struct sock *meta_sk, struct sk_buff *skb, + struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + try_shortcut(tp->mptcp->shortcut_ofoqueue, skb, + &tcp_sk(meta_sk)->out_of_order_queue, tp); +} + +void mptcp_ofo_queue(struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct sk_buff *skb; + + while ((skb = skb_peek(&meta_tp->out_of_order_queue)) != NULL) { + u32 old_rcv_nxt = meta_tp->rcv_nxt; + if (after(TCP_SKB_CB(skb)->seq, meta_tp->rcv_nxt)) + break; + + if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->rcv_nxt)) { + __skb_unlink(skb, &meta_tp->out_of_order_queue); + mptcp_remove_shortcuts(meta_tp->mpcb, skb); + __kfree_skb(skb); + continue; + } + + __skb_unlink(skb, &meta_tp->out_of_order_queue); + mptcp_remove_shortcuts(meta_tp->mpcb, skb); + + __skb_queue_tail(&meta_sk->sk_receive_queue, skb); + meta_tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt); + + if (tcp_hdr(skb)->fin) + mptcp_fin(meta_sk); + } +} + +void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) +{ + struct sk_buff_head *head = &meta_tp->out_of_order_queue; + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(head, skb, tmp) { + __skb_unlink(skb, head); + mptcp_remove_shortcuts(meta_tp->mpcb, skb); + kfree_skb(skb); + } +} diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c new file mode 100644 index 0000000..7c628e0 --- /dev/null +++ b/net/mptcp/mptcp_olia.c @@ -0,0 +1,314 @@ +/* + * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL: + * + * Algorithm design: + * Ramin Khalili + * Nicolas Gast + * Jean-Yves Le Boudec + * + * Implementation: + * Ramin Khalili + * + * Ported to the official MPTCP-kernel: + * Christoph Paasch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + + +#include +#include + +#include + +static int scale = 10; + +struct mptcp_olia { + u32 mptcp_loss1; + u32 mptcp_loss2; + u32 mptcp_loss3; + int epsilon_num; + u32 epsilon_den; + int mptcp_snd_cwnd_cnt; +}; + +static inline int mptcp_olia_sk_can_send(const struct sock *sk) +{ + return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt; +} + +static inline u64 mptcp_olia_scale(u64 val, int scale) +{ + return (u64) val << scale; +} + +/* take care of artificially inflate (see RFC5681) + * of cwnd during fast-retransmit phase + */ +static u32 mptcp_get_crt_cwnd(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_state == TCP_CA_Recovery) + return tcp_sk(sk)->snd_ssthresh; + else + return tcp_sk(sk)->snd_cwnd; +} + +/* return the dominator of the first term of the increasing term */ +static u64 mptcp_get_rate(struct mptcp_cb *mpcb , u32 path_rtt) +{ + struct sock *sk; + u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */ + + mptcp_for_each_sk(mpcb, sk) { + struct tcp_sock *tp = tcp_sk(sk); + u64 scaled_num; + u32 tmp_cwnd; + + if (!mptcp_olia_sk_can_send(sk)) + continue; + + tmp_cwnd = mptcp_get_crt_cwnd(sk); + scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt; + rate += div_u64(scaled_num , tp->srtt); + } + rate *= rate; + return rate; +} + +/* find the maximum cwnd, used to find set M */ +static u32 mptcp_get_max_cwnd(struct mptcp_cb *mpcb) +{ + struct sock *sk; + u32 best_cwnd = 0; + + mptcp_for_each_sk(mpcb, sk) { + u32 tmp_cwnd; + + if (!mptcp_olia_sk_can_send(sk)) + continue; + + tmp_cwnd = mptcp_get_crt_cwnd(sk); + if (tmp_cwnd > best_cwnd) + best_cwnd = tmp_cwnd; + } + return best_cwnd; +} + +static void mptcp_get_epsilon(struct mptcp_cb *mpcb) +{ + struct mptcp_olia *ca; + struct tcp_sock *tp; + struct sock *sk; + u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1; + u32 max_cwnd = 1, best_cwnd = 1, tmp_cwnd; + u8 M = 0, B_not_M = 0; + + /* TODO - integrate this in the following loop - we just want to iterate once */ + + max_cwnd = mptcp_get_max_cwnd(mpcb); + + /* find the best path */ + mptcp_for_each_sk(mpcb, sk) { + tp = tcp_sk(sk); + ca = inet_csk_ca(sk); + + if (!mptcp_olia_sk_can_send(sk)) + continue; + + tmp_rtt = tp->srtt * tp->srtt; + /* TODO - check here and rename variables */ + tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, + ca->mptcp_loss2 - ca->mptcp_loss1); + + tmp_cwnd = mptcp_get_crt_cwnd(sk); + if (tmp_int * best_rtt >= best_int * tmp_rtt) { + best_rtt = tmp_rtt; + best_int = tmp_int; + best_cwnd = tmp_cwnd; + } + } + + /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */ + /* find the size of M and B_not_M */ + mptcp_for_each_sk(mpcb, sk) { + tp = tcp_sk(sk); + ca = inet_csk_ca(sk); + + if (!mptcp_olia_sk_can_send(sk)) + continue; + + tmp_cwnd = mptcp_get_crt_cwnd(sk); + if (tmp_cwnd == max_cwnd) { + M++; + } else { + tmp_rtt = tp->srtt * tp->srtt; + tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, + ca->mptcp_loss2 - ca->mptcp_loss1); + + if (tmp_int * best_rtt == best_int * tmp_rtt) + B_not_M++; + } + } + + /* check if the path is in M or B_not_M and set the value of epsilon accordingly */ + mptcp_for_each_sk(mpcb, sk) { + tp = tcp_sk(sk); + ca = inet_csk_ca(sk); + + if (!mptcp_olia_sk_can_send(sk)) + continue; + + if (B_not_M == 0) { + ca->epsilon_num = 0; + ca->epsilon_den = 1; + } else { + tmp_rtt = tp->srtt * tp->srtt; + tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, + ca->mptcp_loss2 - ca->mptcp_loss1); + tmp_cwnd = mptcp_get_crt_cwnd(sk); + + if 
(tmp_cwnd < max_cwnd && + tmp_int * best_rtt == best_int * tmp_rtt){ + ca->epsilon_num = 1; + ca->epsilon_den = mpcb->cnt_established * B_not_M; + } else if (tmp_cwnd == max_cwnd) { + ca->epsilon_num = -1; + ca->epsilon_den = mpcb->cnt_established * M; + } else { + ca->epsilon_num = 0; + ca->epsilon_den = 1; + } + } + } + +} + +/* setting the initial values */ +static void mptcp_olia_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_olia *ca = inet_csk_ca(sk); + + if (tp->mpc) { + ca->mptcp_loss1 = tp->snd_una; + ca->mptcp_loss2 = tp->snd_una; + ca->mptcp_loss3 = tp->snd_una; + ca->mptcp_snd_cwnd_cnt = 0; + ca->epsilon_num = 0; + ca->epsilon_den = 1; + } +} + +/* updating inter-loss distance and ssthresh */ +static void mptcp_olia_set_state(struct sock *sk, u8 new_state) +{ + if (!tcp_sk(sk)->mpc) + return; + + if (new_state == TCP_CA_Loss || + new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) { + struct mptcp_olia *ca = inet_csk_ca(sk); + + if (ca->mptcp_loss3 != ca->mptcp_loss2 && + !inet_csk(sk)->icsk_retransmits) { + ca->mptcp_loss1 = ca->mptcp_loss2; + ca->mptcp_loss2 = ca->mptcp_loss3; + } + } + +} + +/* main algorithm */ +static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_olia *ca = inet_csk_ca(sk); + struct mptcp_cb *mpcb = tp->mpcb; + + u64 inc_num, inc_den, rate, cwnd_scaled; + + if (!tp->mpc) { + tcp_reno_cong_avoid(sk, ack, in_flight); + return; + } + + ca->mptcp_loss3 = tp->snd_una; + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + /* slow start if it is in the safe area */ + if (tp->snd_cwnd <= tp->snd_ssthresh) { + tcp_slow_start(tp); + return; + } + + mptcp_get_epsilon(mpcb); + rate = mptcp_get_rate(mpcb, tp->srtt); + cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale); + inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? 
: 1; + + /* calculate the increasing term, scaling is used to reduce the rounding effect */ + if (ca->epsilon_num == -1) { + if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) { + inc_num = rate - ca->epsilon_den * + cwnd_scaled * cwnd_scaled; + ca->mptcp_snd_cwnd_cnt -= div64_u64( + mptcp_olia_scale(inc_num , scale) , inc_den); + } else { + inc_num = ca->epsilon_den * + cwnd_scaled * cwnd_scaled - rate; + ca->mptcp_snd_cwnd_cnt += div64_u64( + mptcp_olia_scale(inc_num , scale) , inc_den); + } + } else { + inc_num = ca->epsilon_num * rate + + ca->epsilon_den * cwnd_scaled * cwnd_scaled; + ca->mptcp_snd_cwnd_cnt += div64_u64( + mptcp_olia_scale(inc_num , scale) , inc_den); + } + + + if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + ca->mptcp_snd_cwnd_cnt = 0; + } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) { + tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1); + ca->mptcp_snd_cwnd_cnt = 0; + } +} + +static struct tcp_congestion_ops mptcp_olia = { + .init = mptcp_olia_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = mptcp_olia_cong_avoid, + .set_state = mptcp_olia_set_state, + .min_cwnd = tcp_reno_min_cwnd, + .owner = THIS_MODULE, + .name = "olia", +}; + +static int __init mptcp_olia_register(void) +{ + BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&mptcp_olia); +} + +static void __exit mptcp_olia_unregister(void) +{ + tcp_unregister_congestion_control(&mptcp_olia); +} + +module_init(mptcp_olia_register); +module_exit(mptcp_olia_unregister); + +MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL"); +MODULE_VERSION("0.1"); diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c new file mode 100644 index 0000000..0333c57 --- /dev/null +++ b/net/mptcp/mptcp_output.c @@ -0,0 +1,2271 @@ +/* + * MPTCP implementation - Sending side + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include +#include +#include +#include + +static inline int mptcp_pi_to_flag(int pi) +{ + return 1 << (pi - 1); +} + +static inline int mptcp_sub_len_remove_addr(u16 bitfield) +{ + unsigned int c; + for (c = 0; bitfield; c++) + bitfield &= bitfield - 1; + return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1; +} + +int mptcp_sub_len_remove_addr_align(u16 bitfield) +{ + return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4); +} +EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align); + +/* If the sub-socket sk available to send the skb? */ +static int mptcp_is_available(struct sock *sk, struct sk_buff *skb, + unsigned int *mss, bool wndtest) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int mss_now; + + /* Set of states for which we are allowed to send data */ + if (!mptcp_sk_can_send(sk)) + return 0; + + /* We do not send data on this subflow unless it is + * fully established, i.e. 
the 4th ack has been received. + */ + if (tp->mptcp->pre_established) + return 0; + + if (tp->pf || + (tp->mpcb->noneligible & mptcp_pi_to_flag(tp->mptcp->path_index))) + return 0; + + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { + /* If SACK is disabled, and we got a loss, TCP does not exit + * the loss-state until something above high_seq has been acked. + * (see tcp_try_undo_recovery) + * + * high_seq is the snd_nxt at the moment of the RTO. As soon + * as we have an RTO, we won't push data on the subflow. + * Thus, snd_una can never go beyond high_seq. + */ + if (!tcp_is_reno(tp)) + return 0; + else if (tp->snd_una != tp->high_seq) + return 0; + } + + if (!tp->mptcp->fully_established) { + /* Make sure that we send in-order data */ + if (skb && tp->mptcp->second_packet && + tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq) + return 0; + } + + if (!tcp_cwnd_test(tp, skb)) + return 0; + + mss_now = tcp_current_mss(sk); + /* Don't send on this subflow if we bypass the allowed send-window at + * the per-subflow level. Similar to tcp_snd_wnd_test, but manually + * calculated end_seq (because here at this point end_seq is still at + * the meta-level). + */ + if (skb && wndtest && + after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp))) + return 0; + + if (mss) + *mss = mss_now; + + return 1; +} + +/* Are we not allowed to reinject this skb on tp? */ +static int mptcp_dont_reinject_skb(struct tcp_sock *tp, struct sk_buff *skb) +{ + /* If the skb has already been enqueued in this sk, try to find + * another one. + */ + return skb && + /* Has the skb already been enqueued into this subsocket? */ + mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask; +} + +/* This is the scheduler. This function decides on which flow to send + * a given MSS. If all subflows are found to be busy, NULL is returned + * The flow is selected based on the shortest RTT. + * If all paths have full cong windows, we simply return NULL. + * + * Additionally, this function is aware of the backup-subflows. + */ +static struct sock *get_available_subflow(struct sock *meta_sk, + struct sk_buff *skb, + unsigned int *mss_now, + bool wndtest) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL; + unsigned int mss = 0, mss_lowprio = 0, mss_backup = 0; + u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff; + int cnt_backups = 0; + + /* if there is only one subflow, bypass the scheduling function */ + if (mpcb->cnt_subflows == 1) { + bestsk = (struct sock *)mpcb->connection_list; + if (!mptcp_is_available(bestsk, skb, mss_now, wndtest)) + bestsk = NULL; + return bestsk; + } + + /* Answer data_fin on same subflow!!! 
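+	 * That is, return the subflow whose path_index matches
+	 * mpcb->dfin_path_index, provided it is available.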
*/ + if (meta_sk->sk_shutdown & RCV_SHUTDOWN && + skb && mptcp_is_data_fin(skb)) { + mptcp_for_each_sk(mpcb, sk) { + if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && + mptcp_is_available(sk, skb, mss_now, wndtest)) + return sk; + } + } + + /* First, find the best subflow */ + mptcp_for_each_sk(mpcb, sk) { + struct tcp_sock *tp = tcp_sk(sk); + int this_mss; + + if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) + cnt_backups++; + + if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) && + tp->srtt < lowprio_min_time_to_peer) { + + if (!mptcp_is_available(sk, skb, &this_mss, wndtest)) + continue; + + if (mptcp_dont_reinject_skb(tp, skb)) { + mss_backup = this_mss; + backupsk = sk; + continue; + } + + lowprio_min_time_to_peer = tp->srtt; + lowpriosk = sk; + mss_lowprio = this_mss; + } else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) && + tp->srtt < min_time_to_peer) { + if (!mptcp_is_available(sk, skb, &this_mss, wndtest)) + continue; + + if (mptcp_dont_reinject_skb(tp, skb)) { + mss_backup = this_mss; + backupsk = sk; + continue; + } + + min_time_to_peer = tp->srtt; + bestsk = sk; + mss = this_mss; + } + } + + if (mpcb->cnt_established == cnt_backups && lowpriosk) { + mss = mss_lowprio; + sk = lowpriosk; + } else if (bestsk) { + sk = bestsk; + } else if (backupsk){ + /* It has been sent on all subflows once - let's give it a + * chance again by restarting its pathmask. + */ + if (skb) + TCP_SKB_CB(skb)->path_mask = 0; + mss = mss_backup; + sk = backupsk; + } + + if (mss_now) + *mss_now = mss; + + return sk; +} + +static struct mp_dss *mptcp_skb_find_dss(const struct sk_buff *skb) +{ + if (!mptcp_is_data_seq(skb)) + return NULL; + + return (struct mp_dss *)(skb->data - (MPTCP_SUB_LEN_DSS_ALIGN + + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN)); +} + +/* get the data-seq and end-data-seq and store them again in the + * tcp_skb_cb + */ +static int mptcp_reconstruct_mapping(struct sk_buff *skb, struct sk_buff *orig_skb) +{ + struct mp_dss *mpdss = mptcp_skb_find_dss(orig_skb); + u32 *p32; + u16 *p16; + + if (!mpdss || !mpdss->M) + return 1; + + /* Move the pointer to the data-seq */ + p32 = (u32 *)mpdss; + p32++; + if (mpdss->A) { + p32++; + if (mpdss->a) + p32++; + } + + TCP_SKB_CB(skb)->seq = ntohl(*p32); + + /* Get the data_len to calculate the end_data_seq */ + p32++; + p32++; + p16 = (u16 *)p32; + TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq; + + return 0; +} + +/* Similar to __pskb_copy and sk_stream_alloc_skb. */ +static struct sk_buff *mptcp_pskb_copy(struct sk_buff *skb) +{ + struct sk_buff *n; + /* The TCP header must be at least 32-bit aligned. 
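+	 * Hence the linear part of the copy is rounded up to a multiple of
+	 * four bytes below.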
*/ + int size = ALIGN(skb_headlen(skb), 4); + + n = alloc_skb_fclone(size + MAX_TCP_HEADER, GFP_ATOMIC); + if (!n) + return NULL; + + /* Set the data pointer */ + skb_reserve(n, MAX_TCP_HEADER); + /* Set the tail pointer and length */ + skb_put(n, skb_headlen(skb)); + /* Copy the bytes */ + skb_copy_from_linear_data(skb, n->data, n->len); + + n->truesize += skb->data_len; + n->data_len = skb->data_len; + n->len = skb->len; + + if (skb_shinfo(skb)->nr_frags) { + int i; + + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + if (skb_copy_ubufs(skb, GFP_ATOMIC)) { + kfree_skb(n); + n = NULL; + goto out; + } + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; + skb_frag_ref(skb, i); + } + skb_shinfo(n)->nr_frags = i; + } + + if (skb_has_frag_list(skb)) { + skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; + skb_clone_fraglist(n); + } + + copy_skb_header(n, skb); +out: + return n; +} + +/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are + * coming from the meta-retransmit-timer + */ +static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk, + struct sock *sk, int clone_it) +{ + struct sk_buff *skb, *skb1; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + u32 seq, end_seq; + + if (clone_it) { + /* pskb_copy is necessary here, because the TCP/IP-headers + * will be changed when it's going to be reinjected on another + * subflow. + */ + skb = mptcp_pskb_copy(orig_skb); + } else { + __skb_unlink(orig_skb, &sk->sk_write_queue); + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + sk->sk_wmem_queued -= orig_skb->truesize; + sk_mem_uncharge(sk, orig_skb->truesize); + skb = orig_skb; + } + if (unlikely(!skb)) + return; + + if (sk && mptcp_reconstruct_mapping(skb, orig_skb)) { + __kfree_skb(skb); + return; + } + + skb->sk = meta_sk; + + /* If it reached already the destination, we don't have to reinject it */ + if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { + __kfree_skb(skb); + return; + } + + /* Only reinject segments that are fully covered by the mapping */ + if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) != + TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { + u32 seq = TCP_SKB_CB(skb)->seq; + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + __kfree_skb(skb); + + /* Ok, now we have to look for the full mapping in the meta + * send-queue :S + */ + tcp_for_write_queue(skb, meta_sk) { + /* Not yet at the mapping? */ + if (before(TCP_SKB_CB(skb)->seq, seq)) + continue; + /* We have passed by the mapping */ + if (after(TCP_SKB_CB(skb)->end_seq, end_seq)) + return; + + __mptcp_reinject_data(skb, meta_sk, NULL, 1); + } + return; + } + + /* If it's empty, just add */ + if (skb_queue_empty(&mpcb->reinject_queue)) { + skb_queue_head(&mpcb->reinject_queue, skb); + return; + } + + /* Find place to insert skb - or even we can 'drop' it, as the + * data is already covered by other skb's in the reinject-queue. + * + * This is inspired by code from tcp_data_queue. + */ + + skb1 = skb_peek_tail(&mpcb->reinject_queue); + seq = TCP_SKB_CB(skb)->seq; + while (1) { + if (!after(TCP_SKB_CB(skb1)->seq, seq)) + break; + if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) { + skb1 = NULL; + break; + } + skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); + } + + /* Do skb overlap to previous one? */ + end_seq = TCP_SKB_CB(skb)->end_seq; + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. 
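+			 * The skb is fully covered by data already sitting in
+			 * the reinject-queue.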
Don't reinject */ + __kfree_skb(skb); + return; + } + if (seq == TCP_SKB_CB(skb1)->seq) { + if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) + skb1 = NULL; + else + skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); + } + } + if (!skb1) + __skb_queue_head(&mpcb->reinject_queue, skb); + else + __skb_queue_after(&mpcb->reinject_queue, skb1, skb); + + /* And clean segments covered by new one as whole. */ + while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) { + skb1 = skb_queue_next(&mpcb->reinject_queue, skb); + + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) + break; + + __skb_unlink(skb1, &mpcb->reinject_queue); + __kfree_skb(skb1); + } + return; +} + +/* Inserts data into the reinject queue */ +void mptcp_reinject_data(struct sock *sk, int clone_it) +{ + struct sk_buff *skb_it, *tmp; + struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = tp->meta_sk; + + /* It has already been closed - there is really no point in reinjecting */ + if (meta_sk->sk_state == TCP_CLOSE) + return; + + skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) { + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it); + /* Subflow syn's and fin's are not reinjected. + * + * As well as empty subflow-fins with a data-fin. + * They are reinjected below (without the subflow-fin-flag) + */ + if (tcb->tcp_flags & TCPHDR_SYN || + (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) || + (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len)) + continue; + + __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it); + } + + skb_it = tcp_write_queue_tail(meta_sk); + /* If sk has sent the empty data-fin, we have to reinject it too. */ + if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 && + TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) { + __mptcp_reinject_data(skb_it, meta_sk, NULL, 1); + } + + mptcp_push_pending_frames(meta_sk); + + tp->pf = 1; +} +EXPORT_SYMBOL(mptcp_reinject_data); + +static void mptcp_combine_dfin(struct sk_buff *skb, struct sock *meta_sk, + struct sock *subsk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct sock *sk_it; + int all_empty = 1, all_acked; + + /* In infinite mapping we always try to combine */ + if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) { + subsk->sk_shutdown |= SEND_SHUTDOWN; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; + return; + } + + /* Don't combine, if they didn't combine - otherwise we end up in + * TIME_WAIT, even if our app is smart enough to avoid it + */ + if (meta_sk->sk_shutdown & RCV_SHUTDOWN) { + if (!mpcb->dfin_combined) + return; + } + + /* If no other subflow has data to send, we can combine */ + mptcp_for_each_sk(mpcb, sk_it) { + if (!mptcp_sk_can_send(sk_it)) + continue; + + if (!tcp_write_queue_empty(sk_it)) + all_empty = 0; + } + + /* If all data has been DATA_ACKed, we can combine. + * -1, because the data_fin consumed one byte + */ + all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1)); + + if ((all_empty || all_acked) && tcp_close_state(subsk)) { + subsk->sk_shutdown |= SEND_SHUTDOWN; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; + } +} + +static struct sk_buff *mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, + int reinject) +{ + __be32 *ptr; + __u16 data_len; + struct mp_dss *mdss; + struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = mptcp_meta_sk(sk); + struct mptcp_cb *mpcb = tp->mpcb; + struct tcp_skb_cb *tcb; + struct sk_buff *subskb = NULL; + + if (!reinject) + TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ? 
+ MPTCPHDR_SEQ64_INDEX : 0); + + subskb = mptcp_pskb_copy(skb); + if (!subskb) + return NULL; + + TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index); + + if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0); + subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE; + } + + /* The subskb is going in the subflow send-queue. Its path-mask + * is not needed anymore and MUST be set to 0, as the path-mask + * is a union with inet_skb_param. + */ + tcb = TCP_SKB_CB(subskb); + tcb->path_mask = 0; + + if (mptcp_is_data_fin(subskb)) + mptcp_combine_dfin(subskb, meta_sk, sk); + + if (tp->mpcb->infinite_mapping_snd) + goto no_data_seq; + + if (tp->mpcb->send_infinite_mapping && + !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) { + tp->mptcp->fully_established = 1; + tp->mpcb->infinite_mapping_snd = 1; + tp->mptcp->infinite_cutoff_seq = tp->write_seq; + tcb->mptcp_flags |= MPTCPHDR_INF; + data_len = 0; + } else { + data_len = tcb->end_seq - tcb->seq; + } + + /**** Write MPTCP DSS-option to the packet. ****/ + ptr = (__be32 *)(subskb->data - (MPTCP_SUB_LEN_DSS_ALIGN + + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN)); + + /* Then we start writing it from the start */ + mdss = (struct mp_dss *)ptr; + + mdss->kind = TCPOPT_MPTCP; + mdss->sub = MPTCP_SUB_DSS; + mdss->rsv1 = 0; + mdss->rsv2 = 0; + mdss->F = (mptcp_is_data_fin(subskb) ? 1 : 0); + mdss->m = 0; + mdss->M = 1; + mdss->a = 0; + mdss->A = 1; + mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum); + + ptr++; + ptr++; /* data_ack will be set in mptcp_options_write */ + *ptr++ = htonl(tcb->seq); /* data_seq */ + + /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */ + if (mptcp_is_data_fin(subskb) && subskb->len == 0) + *ptr++ = 0; /* subseq */ + else + *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */ + + if (tp->mpcb->dss_csum && data_len) { + __be16 *p16 = (__be16 *)ptr; + __be32 hdseq = mptcp_get_highorder_sndbits(subskb, tp->mpcb); + __wsum csum; + *ptr = htonl(((data_len) << 16) | + (TCPOPT_EOL << 8) | + (TCPOPT_EOL)); + + csum = csum_partial(ptr - 2, 12, subskb->csum); + p16++; + *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum)); + } else { + *ptr++ = htonl(((data_len) << 16) | + (TCPOPT_NOP << 8) | + (TCPOPT_NOP)); + } + +no_data_seq: + tcb->seq = tp->write_seq; + tcb->sacked = 0; /* reset the sacked field: from the point of view + * of this subflow, we are sending a brand new + * segment */ + /* Take into account seg len */ + tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0); + tcb->end_seq = tp->write_seq; + + /* If it's a non-payload DATA_FIN (also no subflow-fin), the + * segment is not part of the subflow but on a meta-only-level + */ + if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) { + tcp_add_write_queue_tail(sk, subskb); + sk->sk_wmem_queued += subskb->truesize; + sk_mem_charge(sk, subskb->truesize); + } + + return subskb; +} + +static void mptcp_sub_event_new_data_sent(struct sock *sk, + struct sk_buff *subskb, + struct sk_buff *skb) +{ + /* If it's a non-payload DATA_FIN (also no subflow-fin), the + * segment is not part of the subflow but on a meta-only-level + * + * We free it, because it has been queued nowhere. 
+ */ + if (!mptcp_is_data_fin(subskb) || + (TCP_SKB_CB(subskb)->end_seq != TCP_SKB_CB(subskb)->seq)) { + tcp_event_new_data_sent(sk, subskb); + tcp_sk(sk)->mptcp->second_packet = 1; + tcp_sk(sk)->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq; + } else { + kfree_skb(subskb); + } +} + +/* Handle the packets and sockets after a tcp_transmit_skb failed */ +static void mptcp_transmit_skb_failed(struct sock *sk, struct sk_buff *skb, + struct sk_buff *subskb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_cb *mpcb = tp->mpcb; + + /* No work to do if we are in infinite mapping mode + * There is only one subflow left and we cannot send this segment on + * another subflow. + */ + if (mpcb->infinite_mapping_snd) + return; + + TCP_SKB_CB(skb)->path_mask &= ~mptcp_pi_to_flag(tp->mptcp->path_index); + + if (TCP_SKB_CB(subskb)->tcp_flags & TCPHDR_FIN) { + /* If it is a subflow-fin we must leave it on the + * subflow-send-queue, so that the probe-timer + * can retransmit it. + */ + if (!tp->packets_out && !inet_csk(sk)->icsk_pending) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + } else if (mptcp_is_data_fin(subskb) && + TCP_SKB_CB(subskb)->end_seq == TCP_SKB_CB(subskb)->seq) { + /* An empty data-fin has not been enqueued on the subflow + * and thus we free it. + */ + + kfree_skb(subskb); + } else { + /* In all other cases we remove it from the sub-queue. + * Other subflows may send it, or the probe-timer will + * handle it. + */ + tcp_advance_send_head(sk, subskb); + + /* tcp_add_write_queue_tail initialized highest_sack. We have + * to reset it, if necessary. + */ + if (tp->highest_sack == subskb) + tp->highest_sack = NULL; + + tcp_unlink_write_queue(subskb, sk); + tp->write_seq -= subskb->len; + sk_wmem_free_skb(sk, subskb); + } +} + +/* Function to create two new TCP segments. Shrinks the given segment + * to the specified size and appends a new segment with the rest of the + * packet to the list. This won't be called frequently, I hope. + * Remember, these are still headerless SKBs at this point. + */ +int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, + unsigned int mss_now, int reinject) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; + int nsize, old_factor; + int nlen; + u8 flags; + int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN; + char dss[MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN]; + + if (WARN_ON(len > skb->len)) + return -EINVAL; + + /* DSS-option must be recovered afterwards. */ + if (!is_meta_sk(sk)) + memcpy(dss, skb->data - dsslen, dsslen); + + nsize = skb_headlen(skb) - len; + if (nsize < 0) + nsize = 0; + + if (skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + /* Recover dss-option */ + if (!is_meta_sk(sk)) + memcpy(skb->data - dsslen, dss, dsslen); + } + + /* Get a new skb... force flag on. */ + buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); + if (buff == NULL) + return -ENOMEM; /* We'll just try again later. */ + + /* See below - if reinject == 1, the buff will be added to the reinject- + * queue, which is currently not part of the memory-accounting. + */ + if (reinject != 1) { + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); + } + nlen = skb->len - len - nsize; + buff->truesize += nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. 
*/ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->tcp_flags; + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); + TCP_SKB_CB(buff)->tcp_flags = flags; + TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; + + flags = TCP_SKB_CB(skb)->mptcp_flags; + TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN); + TCP_SKB_CB(buff)->mptcp_flags = flags; + + if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { + /* Copy and checksum data tail into the new buffer. */ + buff->csum = csum_partial_copy_nocheck(skb->data + len, + skb_put(buff, nsize), + nsize, 0); + + skb_trim(skb, len); + + skb->csum = csum_block_sub(skb->csum, buff->csum, len); + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb_split(skb, buff, len); + } + + /* We lost the dss-option when creating buff - put it back! */ + if (!is_meta_sk(sk)) + memcpy(buff->data - dsslen, dss, dsslen); + + buff->ip_summed = skb->ip_summed; + + /* Looks stupid, but our code really uses when of + * skbs, which it never sent before. --ANK + */ + TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + buff->tstamp = skb->tstamp; + + old_factor = tcp_skb_pcount(skb); + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(sk, buff, mss_now); + + /* If this packet has been sent out already, we must + * adjust the various packet counters. + */ + if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq) && reinject != 1) { + int diff = old_factor - tcp_skb_pcount(skb) - + tcp_skb_pcount(buff); + + if (diff) + tcp_adjust_pcount(sk, skb, diff); + } + + /* Link BUFF into the send queue. */ + skb_header_release(buff); + if (reinject == 1) + __skb_queue_after(&tcp_sk(sk)->mpcb->reinject_queue, skb, buff); + else + tcp_insert_write_queue_after(skb, buff, sk); + + return 0; +} + +int mptso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, + unsigned int mss_now, gfp_t gfp, int reinject) +{ + struct sk_buff *buff; + int nlen = skb->len - len, old_factor; + u8 flags; + int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN; + + /* All of a TSO frame must be composed of paged data. */ + if (skb->len != skb->data_len) + return mptcp_fragment(sk, skb, len, mss_now, reinject); + + buff = sk_stream_alloc_skb(sk, 0, gfp); + if (unlikely(buff == NULL)) + return -ENOMEM; + + /* See below - if reinject == 1, the buff will be added to the reinject- + * queue, which is currently not part of the memory-accounting. + */ + if (reinject != 1) { + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); + } + buff->truesize += nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->tcp_flags; + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); + TCP_SKB_CB(buff)->tcp_flags = flags; + + flags = TCP_SKB_CB(skb)->mptcp_flags; + TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN); + TCP_SKB_CB(buff)->mptcp_flags = flags; + + /* This packet was never sent out yet, so no SACK bits. 
*/ + TCP_SKB_CB(buff)->sacked = 0; + + buff->ip_summed = CHECKSUM_PARTIAL; + skb->ip_summed = CHECKSUM_PARTIAL; + skb_split(skb, buff, len); + + /* We lost the dss-option when creating buff - put it back! */ + if (!is_meta_sk(sk)) + memcpy(buff->data - dsslen, skb->data - dsslen, dsslen); + + old_factor = tcp_skb_pcount(skb); + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(sk, buff, mss_now); + + /* If this packet has been sent out already, we must + * adjust the various packet counters. + */ + if (!before(tcp_sk(sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq) && reinject != 1) { + int diff = old_factor - tcp_skb_pcount(skb) - + tcp_skb_pcount(buff); + + if (diff) + tcp_adjust_pcount(sk, skb, diff); + } + + /* Link BUFF into the send queue. */ + skb_header_release(buff); + if (reinject == 1) + __skb_queue_after(&tcp_sk(sk)->mpcb->reinject_queue, skb, buff); + else + tcp_insert_write_queue_after(skb, buff, sk); + + return 0; +} + +/* Inspired by tcp_write_wakeup */ +int mptcp_write_wakeup(struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct sk_buff *skb, *subskb; + + skb = tcp_send_head(meta_sk); + if (skb && + before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) { + int err; + unsigned int mss; + unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq; + struct sock *subsk = get_available_subflow(meta_sk, skb, &mss, + false); + struct tcp_sock *subtp; + if (!subsk) + return -1; + subtp = tcp_sk(subsk); + + seg_size = min(tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq, + tcp_wnd_end(subtp) - subtp->write_seq); + + if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; + + /* We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) + */ + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; + if (mptcp_fragment(meta_sk, skb, seg_size, mss, 0)) + return -1; + } else if (!tcp_skb_pcount(skb)) { + tcp_set_skb_tso_segs(meta_sk, skb, mss); + } + + subskb = mptcp_skb_entail(subsk, skb, 0); + if (!subskb) + return -1; + + TCP_SKB_CB(subskb)->tcp_flags |= TCPHDR_PSH; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + TCP_SKB_CB(subskb)->when = tcp_time_stamp; + err = tcp_transmit_skb(subsk, subskb, 1, GFP_ATOMIC); + if (unlikely(err)) { + mptcp_transmit_skb_failed(subsk, skb, subskb); + return err; + } + + mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq); + tcp_event_new_data_sent(meta_sk, skb); + mptcp_sub_event_new_data_sent(subsk, subskb, skb); + + return 0; + } else { + struct sock *sk_it; + int ans = 0; + + if (between(meta_tp->snd_up, meta_tp->snd_una + 1, + meta_tp->snd_una + 0xFFFF)) { + mptcp_for_each_sk(meta_tp->mpcb, sk_it) { + if (mptcp_sk_can_send_ack(sk_it)) + tcp_xmit_probe_skb(sk_it, 1); + } + } + + /* At least one of the tcp_xmit_probe_skb's has to succeed */ + mptcp_for_each_sk(meta_tp->mpcb, sk_it) { + int ret; + + if (!mptcp_sk_can_send_ack(sk_it)) + continue; + + ret = tcp_xmit_probe_skb(sk_it, 0); + if (unlikely(ret > 0)) + ans = ret; + } + return ans; + } +} + +static void mptcp_find_and_set_pathmask(struct sock *meta_sk, struct sk_buff *skb) +{ + struct sk_buff *skb_it; + + skb_it = tcp_write_queue_head(meta_sk); + + tcp_for_write_queue_from(skb_it, meta_sk) { + if (skb_it == tcp_send_head(meta_sk)) + break; + + if 
(TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) { + TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask; + break; + } + } +} + +static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal) +{ + struct sock *meta_sk; + struct tcp_sock *tp = tcp_sk(sk), *tp_it; + struct sk_buff *skb_head; + + if (tp->mpcb->cnt_subflows == 1) + return NULL; + + meta_sk = mptcp_meta_sk(sk); + skb_head = tcp_write_queue_head(meta_sk); + + if (!skb_head || skb_head == tcp_send_head(meta_sk)) + return NULL; + + /* If penalization is optional (coming from mptcp_next_segment() and + * We are not send-buffer-limited we do not penalize. The retransmission + * is just an optimization to fix the idle-time due to the delay before + * we wake up the application. + */ + if (!penal && sk_stream_memory_free(meta_sk)) + goto retrans; + + /* Half the cwnd of the slow flow */ + if (tcp_time_stamp - tp->mptcp->last_rbuf_opti >= tp->srtt >> 3) { + mptcp_for_each_tp(tp->mpcb, tp_it) { + if (tp_it != tp && + TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { + if (tp->srtt < tp_it->srtt && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) { + tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U); + if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH) + tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U); + + tp->mptcp->last_rbuf_opti = tcp_time_stamp; + } + break; + } + } + } + +retrans: + + /* Segment not yet injected into this path? Take it!!! */ + if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) { + bool do_retrans = false; + mptcp_for_each_tp(tp->mpcb, tp_it) { + if (tp_it != tp && + TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { + if (tp_it->snd_cwnd <= 4) { + do_retrans = true; + break; + } + + if (4 * tp->srtt >= tp_it->srtt) { + do_retrans = false; + break; + } else { + do_retrans = true; + } + } + } + + if (do_retrans) + return skb_head; + } + return NULL; +} + +int mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp; + struct sock *subsk; + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct sk_buff *skb; + unsigned int tso_segs, old_factor, sent_pkts; + int cwnd_quota; + int result; + int reinject = 0; + + sent_pkts = 0; + + /* Currently mtu-probing is not done in MPTCP */ + if (!push_one && 0) { + /* Do MTU probing. */ + result = tcp_mtu_probe(meta_sk); + if (!result) + return 0; + else if (result > 0) + sent_pkts = 1; + } + + while ((skb = mptcp_next_segment(meta_sk, &reinject))) { + unsigned int limit; + struct sk_buff *subskb = NULL; + u32 noneligible = mpcb->noneligible; + + if (reinject == 1) { + if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { + /* Segment already reached the peer, take the next one */ + __skb_unlink(skb, &mpcb->reinject_queue); + __kfree_skb(skb); + continue; + } + + /* Reinjection and it is coming from a subflow? We need + * to find out the path-mask from the meta-write-queue + * to properly select a subflow. + */ + if (!TCP_SKB_CB(skb)->path_mask) + mptcp_find_and_set_pathmask(meta_sk, skb); + } + +subflow: + subsk = get_available_subflow(meta_sk, skb, &mss_now, true); + if (!subsk) + break; + subtp = tcp_sk(subsk); + + /* Since all subsocks are locked before calling the scheduler, + * the tcp_send_head should not change. + */ + BUG_ON(!reinject && tcp_send_head(meta_sk) != skb); +retry: + /* If the segment was cloned (e.g. 
a meta retransmission), + * the header must be expanded/copied so that there is no + * corruption of TSO information. + */ + if (skb_unclone(skb, GFP_ATOMIC)) + break; + + old_factor = tcp_skb_pcount(skb); + tcp_set_skb_tso_segs(meta_sk, skb, mss_now); + tso_segs = tcp_skb_pcount(skb); + + if (reinject == -1) { + /* The packet has already once been sent, so if we + * change the pcount here we have to adjust packets_out + * in the meta-sk + */ + int diff = old_factor - tso_segs; + + if (diff) + tcp_adjust_pcount(meta_sk, skb, diff); + } + + cwnd_quota = tcp_cwnd_test(subtp, skb); + if (!cwnd_quota) { + /* May happen due to two cases: + * + * - if at the first selection we circumvented + * the test due to a DATA_FIN (and got rejected at + * tcp_snd_wnd_test), but the reinjected segment is not + * a DATA_FIN. + * - if we take a DATA_FIN with data, but + * tcp_set_skb_tso_segs() increases the number of + * tso_segs to something > 1. Then, cwnd_test might + * reject it. + */ + mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index); + continue; + } + + if (!reinject && unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now))) { + skb = mptcp_rcv_buf_optimization(subsk, 1); + if (skb) { + reinject = -1; + goto retry; + } + break; + } + + if (tso_segs == 1) { + if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now, + (tcp_skb_is_last(meta_sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) + break; + } else { + /* Do not try to defer the transmission of a reinjected + * segment. Send it directly. + * If it is not possible to send the TSO segment on the + * best subflow right now try to look for another subflow. + * If there is no subflow available defer the segment to avoid + * the call to mptso_fragment. + */ + if (!push_one && !reinject && tcp_tso_should_defer(subsk, skb)) { + mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index); + goto subflow; + } + } + + limit = mss_now; + if (tso_segs > 1 && !tcp_urg_mode(meta_tp)) + limit = tcp_mss_split_point(subsk, skb, mss_now, + min_t(unsigned int, + cwnd_quota, + subsk->sk_gso_max_segs)); + + if (skb->len > limit && + unlikely(mptso_fragment(meta_sk, skb, limit, mss_now, gfp, reinject))) + break; + + subskb = mptcp_skb_entail(subsk, skb, reinject); + if (!subskb) + break; + + mpcb->noneligible = noneligible; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + TCP_SKB_CB(subskb)->when = tcp_time_stamp; + if (unlikely(tcp_transmit_skb(subsk, subskb, 1, gfp))) { + mptcp_transmit_skb_failed(subsk, skb, subskb); + mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index); + continue; + } + + if (!reinject) { + mptcp_check_sndseq_wrap(meta_tp, + TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq); + tcp_event_new_data_sent(meta_sk, skb); + } + + tcp_minshall_update(meta_tp, mss_now, skb); + sent_pkts += tcp_skb_pcount(skb); + tcp_sk(subsk)->mptcp->sent_pkts += tcp_skb_pcount(skb); + + mptcp_sub_event_new_data_sent(subsk, subskb, skb); + + if (reinject > 0) { + __skb_unlink(skb, &mpcb->reinject_queue); + kfree_skb(skb); + } + + if (push_one) + break; + } + + mpcb->noneligible = 0; + + if (likely(sent_pkts)) { + mptcp_for_each_sk(mpcb, subsk) { + subtp = tcp_sk(subsk); + if (subtp->mptcp->sent_pkts) { + if (tcp_in_cwnd_reduction(subsk)) + subtp->prr_out += subtp->mptcp->sent_pkts; + tcp_cwnd_validate(subsk); + subtp->mptcp->sent_pkts = 0; + } + } + return 0; + } + + return !meta_tp->packets_out && tcp_send_head(meta_sk); +} + +void mptcp_write_space(struct sock *sk) +{ + mptcp_push_pending_frames(mptcp_meta_sk(sk)); +} + +u32 __mptcp_select_window(struct sock 
*sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); + int mss, free_space, full_space, window; + + /* MSS for the peer's data. Previous versions used mss_clamp + * here. I don't know if the value based on our guesses + * of peer's MSS is better for the performance. It's more correct + * but may be worse for the performance because of rcv_mss + * fluctuations. --SAW 1998/11/1 + */ + mss = icsk->icsk_ack.rcv_mss; + free_space = tcp_space(sk); + full_space = min_t(int, meta_tp->window_clamp, + tcp_full_space(sk)); + + if (mss > full_space) + mss = full_space; + + if (free_space < (full_space >> 1)) { + icsk->icsk_ack.quick = 0; + + if (tcp_memory_pressure) + /* TODO this has to be adapted when we support different + * MSS's among the subflows. + */ + meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh, + 4U * meta_tp->advmss); + + if (free_space < mss) + return 0; + } + + if (free_space > meta_tp->rcv_ssthresh) + free_space = meta_tp->rcv_ssthresh; + + /* Don't do rounding if we are using window scaling, since the + * scaled window will not line up with the MSS boundary anyway. + */ + window = meta_tp->rcv_wnd; + if (tp->rx_opt.rcv_wscale) { + window = free_space; + + /* Advertise enough space so that it won't get scaled away. + * Important case: prevent zero window announcement if + * 1<<rcv_wscale > mss. + */ + if (((window >> tp->rx_opt.rcv_wscale) << tp-> + rx_opt.rcv_wscale) != window) + window = (((window >> tp->rx_opt.rcv_wscale) + 1) + << tp->rx_opt.rcv_wscale); + } else { + /* Get the largest window that is a nice multiple of mss. + * Window clamp already applied above. + * If our current window offering is within 1 mss of the + * free space we just keep it. This prevents the divide + * and multiply from happening most of the time. + * We also don't do any window rounding when the free space + * is too small.
+ */ + if (window <= free_space - mss || window > free_space) + window = (free_space / mss) * mss; + else if (mss == full_space && + free_space > window + (full_space >> 1)) + window = free_space; + } + + return window; +} + +static void mptcp_set_nonce(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + + if (sk->sk_family == AF_INET) + tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + inet->inet_dport, + tp->write_seq); +#if IS_ENABLED(CONFIG_IPV6) + else + tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(inet6_sk(sk)->saddr.s6_addr32, + inet6_sk(sk)->daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport, + tp->write_seq); +#endif + + tp->mptcp->nonce_set = 1; +} + +void mptcp_syn_options(struct sock *sk, struct tcp_out_options *opts, + unsigned *remaining) +{ + struct tcp_sock *tp = tcp_sk(sk); + + opts->options |= OPTION_MPTCP; + if (is_master_tp(tp)) { + opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN; + *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; + opts->mp_capable.sender_key = tp->mptcp_loc_key; + opts->dss_csum = !!sysctl_mptcp_checksum; + } else { + struct mptcp_cb *mpcb = tp->mpcb; + + opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN; + *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN; + opts->mp_join_syns.token = mpcb->mptcp_rem_token; + opts->addr_id = tp->mptcp->loc_id; + + if (!tp->mptcp->nonce_set) + mptcp_set_nonce(sk); + + opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce; + } +} + +void mptcp_synack_options(struct request_sock *req, + struct tcp_out_options *opts, unsigned *remaining) +{ + struct mptcp_request_sock *mtreq; + mtreq = mptcp_rsk(req); + + opts->options |= OPTION_MPTCP; + /* MPCB not yet set - thus it's a new MPTCP-session */ + if (!mtreq->mpcb) { + opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK; + opts->mp_capable.sender_key = mtreq->mptcp_loc_key; + opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum; + *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; + } else { + opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK; + opts->mp_join_syns.sender_truncated_mac = + mtreq->mptcp_hash_tmac; + opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce; + opts->addr_id = mtreq->loc_id; + *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN; + } +} + +void mptcp_established_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, unsigned *size) +{ + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); + struct mptcp_cb *mpcb = tp->mpcb; + struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; + + /* In fallback mp_fail-mode, we have to repeat it until the fallback + * has been done by the sender + */ + if (unlikely(tp->mptcp->send_mp_fail)) { + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_MP_FAIL; + opts->data_ack = (__u32)(mpcb->csum_cutoff_seq >> 32); + opts->data_seq = (__u32)mpcb->csum_cutoff_seq; + *size += MPTCP_SUB_LEN_FAIL; + return; + } + + if (unlikely(tp->send_mp_fclose)) { + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_MP_FCLOSE; + opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; + *size += MPTCP_SUB_LEN_FCLOSE_ALIGN; + return; + } + + /* 1. If we are the sender of the infinite-mapping, we need the + * MPTCPHDR_INF-flag, because a retransmission of the + * infinite-announcment still needs the mptcp-option. 
+ * + * We need infinite_cutoff_seq, because retransmissions from before + * the infinite-cutoff-moment still need the MPTCP-signalling to stay + * consistent. + * + * 2. If we are the receiver of the infinite-mapping, we always skip + * mptcp-options, because acknowledgments from before the + * infinite-mapping point have already been sent out. + * + * I know, the whole infinite-mapping stuff is ugly... + * + * TODO: Handle wrapped data-sequence numbers + * (even if it's very unlikely) + */ + if (unlikely(mpcb->infinite_mapping_snd) && + tp->mptcp->fully_established && + ((mpcb->send_infinite_mapping && tcb && + !(tcb->mptcp_flags & MPTCPHDR_INF) && + !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) || + !mpcb->send_infinite_mapping)) + return; + + if (unlikely(tp->mptcp->include_mpc)) { + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_MP_CAPABLE | + OPTION_TYPE_ACK; + *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN; + opts->mp_capable.sender_key = mpcb->mptcp_loc_key; + opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; + opts->dss_csum = mpcb->dss_csum; + + if (skb) + tp->mptcp->include_mpc = 0; + } + if (unlikely(tp->mptcp->pre_established)) { + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK; + *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN; + } + + if (!tp->mptcp_add_addr_ack && !tp->mptcp->include_mpc && + !tp->mptcp->pre_established) { + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_DATA_ACK; + /* If !skb, we come from tcp_current_mss and thus we always + * assume that the DSS-option will be set for the data-packet. + */ + if (skb && !mptcp_is_data_seq(skb)) { + opts->data_ack = meta_tp->rcv_nxt; + + *size += MPTCP_SUB_LEN_ACK_ALIGN; + } else { + opts->data_ack = meta_tp->rcv_nxt; + + /* Doesn't matter, if csum included or not. 
It will be + * either 10 or 12, and thus aligned = 12 + */ + *size += MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN; + } + + *size += MPTCP_SUB_LEN_DSS_ALIGN; + } + + if (mpcb->pm_ops->addr_signal) + mpcb->pm_ops->addr_signal(sk, size, opts, skb); + + if (unlikely(tp->mptcp->send_mp_prio) && + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) { + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_MP_PRIO; + if (skb) + tp->mptcp->send_mp_prio = 0; + *size += MPTCP_SUB_LEN_PRIO_ALIGN; + } + + return; +} + +void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, + struct tcp_out_options *opts, + struct sk_buff *skb) +{ + if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) { + struct mp_capable *mpc = (struct mp_capable *)ptr; + + mpc->kind = TCPOPT_MPTCP; + + if ((OPTION_TYPE_SYN & opts->mptcp_options) || + (OPTION_TYPE_SYNACK & opts->mptcp_options)) { + mpc->sender_key = opts->mp_capable.sender_key; + mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN; + ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2; + } else if (OPTION_TYPE_ACK & opts->mptcp_options) { + mpc->sender_key = opts->mp_capable.sender_key; + mpc->receiver_key = opts->mp_capable.receiver_key; + mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK; + ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2; + } + + mpc->sub = MPTCP_SUB_CAPABLE; + mpc->ver = 0; + mpc->a = opts->dss_csum; + mpc->b = 0; + mpc->rsv = 0; + mpc->h = 1; + } + + if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) { + struct mp_join *mpj = (struct mp_join *)ptr; + + mpj->kind = TCPOPT_MPTCP; + mpj->sub = MPTCP_SUB_JOIN; + mpj->rsv = 0; + mpj->addr_id = opts->addr_id; + + if (OPTION_TYPE_SYN & opts->mptcp_options) { + mpj->len = MPTCP_SUB_LEN_JOIN_SYN; + mpj->u.syn.token = opts->mp_join_syns.token; + mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce; + mpj->b = tp->mptcp->low_prio; + ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2; + } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) { + mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK; + mpj->u.synack.mac = + opts->mp_join_syns.sender_truncated_mac; + mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce; + mpj->b = tp->mptcp->low_prio; + ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2; + } else if (OPTION_TYPE_ACK & opts->mptcp_options) { + mpj->len = MPTCP_SUB_LEN_JOIN_ACK; + memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20); + ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2; + } + } + if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) { + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; + + mpadd->kind = TCPOPT_MPTCP; + if (opts->add_addr_v4) { + mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4; + mpadd->sub = MPTCP_SUB_ADD_ADDR; + mpadd->ipver = 4; + mpadd->addr_id = opts->add_addr4.addr_id; + mpadd->u.v4.addr = opts->add_addr4.addr; + ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2; + } else if (opts->add_addr_v6) { + mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6; + mpadd->sub = MPTCP_SUB_ADD_ADDR; + mpadd->ipver = 6; + mpadd->addr_id = opts->add_addr6.addr_id; + memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr, + sizeof(mpadd->u.v6.addr)); + ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2; + } + } + if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) { + struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; + u8 *addrs_id; + int id, len, len_align; + + len = mptcp_sub_len_remove_addr(opts->remove_addrs); + len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs); + + mprem->kind = TCPOPT_MPTCP; + mprem->len = len; + mprem->sub = MPTCP_SUB_REMOVE_ADDR; + mprem->rsv = 0; + addrs_id = &mprem->addrs_id; + + 
mptcp_for_each_bit_set(opts->remove_addrs, id) + *(addrs_id++) = id; + + /* Fill the rest with NOP's */ + if (len_align > len) { + int i; + for (i = 0; i < len_align - len; i++) + *(addrs_id++) = TCPOPT_NOP; + } + + ptr += len_align >> 2; + } + if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) { + struct mp_fail *mpfail = (struct mp_fail *)ptr; + + mpfail->kind = TCPOPT_MPTCP; + mpfail->len = MPTCP_SUB_LEN_FAIL; + mpfail->sub = MPTCP_SUB_FAIL; + mpfail->rsv1 = 0; + mpfail->rsv2 = 0; + mpfail->data_seq = htonll(((u64)opts->data_ack << 32) | opts->data_seq); + + ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2; + } + if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) { + struct mp_fclose *mpfclose = (struct mp_fclose *)ptr; + + mpfclose->kind = TCPOPT_MPTCP; + mpfclose->len = MPTCP_SUB_LEN_FCLOSE; + mpfclose->sub = MPTCP_SUB_FCLOSE; + mpfclose->rsv1 = 0; + mpfclose->rsv2 = 0; + mpfclose->key = opts->mp_capable.receiver_key; + + ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2; + } + + if (OPTION_DATA_ACK & opts->mptcp_options) { + if (!mptcp_is_data_seq(skb)) { + struct mp_dss *mdss = (struct mp_dss *)ptr; + + mdss->kind = TCPOPT_MPTCP; + mdss->sub = MPTCP_SUB_DSS; + mdss->rsv1 = 0; + mdss->rsv2 = 0; + mdss->F = 0; + mdss->m = 0; + mdss->M = 0; + mdss->a = 0; + mdss->A = 1; + mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum); + + ptr++; + *ptr++ = htonl(opts->data_ack); + } else { + /**** Just update the data_ack ****/ + + /* Get pointer to data_ack-field. MPTCP is always at + * the end of the TCP-options. + */ + /* TODO if we allow sending 64-bit dseq's we have to change "16" */ + __be32 *dack = (__be32 *)(skb->data + (tcp_hdr(skb)->doff << 2) - 16); + + *dack = htonl(opts->data_ack); + } + } + if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) { + struct mp_prio *mpprio = (struct mp_prio *)ptr; + + mpprio->kind = TCPOPT_MPTCP; + mpprio->len = MPTCP_SUB_LEN_PRIO; + mpprio->sub = MPTCP_SUB_PRIO; + mpprio->rsv = 0; + mpprio->b = tp->mptcp->low_prio; + mpprio->addr_id = TCPOPT_NOP; + + ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2; + } +} + +/* Returns the next segment to be sent from the mptcp meta-queue. + * (chooses the reinject queue if any segment is waiting in it, otherwise, + * chooses the normal write queue). + * Sets *@reinject to 1 if the returned segment comes from the + * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, + * and sets it to -1 if it is a meta-level retransmission to optimize the + * receive-buffer. 
+ */ +struct sk_buff *mptcp_next_segment(struct sock *meta_sk, int *reinject) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sk_buff *skb = NULL; + if (reinject) + *reinject = 0; + + /* If we are in fallback-mode, just take from the meta-send-queue */ + if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) + return tcp_send_head(meta_sk); + + skb = skb_peek(&mpcb->reinject_queue); + + if (skb) { + if (reinject) + *reinject = 1; + } else { + skb = tcp_send_head(meta_sk); + + if (!skb && meta_sk->sk_socket && + test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) && + sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) { + struct sock *subsk = get_available_subflow(meta_sk, NULL, + NULL, true); + if (!subsk) + return NULL; + + skb = mptcp_rcv_buf_optimization(subsk, 0); + if (skb && reinject) + *reinject = -1; + } + } + return skb; +} + +/* Sends the datafin */ +void mptcp_send_fin(struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct sk_buff *skb = tcp_write_queue_tail(meta_sk); + int mss_now; + + if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) + meta_tp->mpcb->passive_close = 1; + + /* Optimization, tack on the FIN if we have a queue of + * unsent frames. But be careful about outgoing SACKS + * and IP options. + */ + mss_now = mptcp_current_mss(meta_sk); + + if (tcp_send_head(meta_sk) != NULL) { + TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN; + TCP_SKB_CB(skb)->end_seq++; + meta_tp->write_seq++; + } else { + /* Socket is locked, keep trying until memory is available. */ + for (;;) { + skb = alloc_skb_fclone(MAX_TCP_HEADER, + meta_sk->sk_allocation); + if (skb) + break; + yield(); + } + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + + tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK); + TCP_SKB_CB(skb)->end_seq++; + TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN | MPTCPHDR_SEQ; + tcp_queue_skb(meta_sk, skb); + } + __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF); +} + +void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct sock *sk = NULL, *sk_it = NULL, *tmpsk; + + if (!mpcb->cnt_subflows) + return; + + WARN_ON(meta_tp->send_mp_fclose); + + /* First - select a socket */ + sk = mptcp_select_ack_sock(meta_sk, 0); + + /* May happen if no subflow is in an appropriate state */ + if (!sk) + return; + + /* We are in infinite mode - just send a reset */ + if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) { + sk->sk_err = ECONNRESET; + if (tcp_need_reset(sk->sk_state)) + tcp_send_active_reset(sk, priority); + mptcp_sub_force_close(sk); + return; + } + + + tcp_sk(sk)->send_mp_fclose = 1; + /** Reset all other subflows */ + + /* tcp_done must be handled with bh disabled */ + if (!in_serving_softirq()) + local_bh_disable(); + + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { + if (tcp_sk(sk_it)->send_mp_fclose) + continue; + + sk_it->sk_err = ECONNRESET; + if (tcp_need_reset(sk_it->sk_state)) + tcp_send_active_reset(sk_it, GFP_ATOMIC); + mptcp_sub_force_close(sk_it); + } + + if (!in_serving_softirq()) + local_bh_enable(); + + tcp_send_ack(sk); + inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto); + + meta_tp->send_mp_fclose = 1; +} + +static void mptcp_ack_retransmit_timer(struct sock *sk) +{ + struct sk_buff *skb; + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + if 
(inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) + goto out; /* Routing failure or similar */ + + if (!tp->retrans_stamp) + tp->retrans_stamp = tcp_time_stamp ? : 1; + + if (tcp_write_timeout(sk)) { + tp->mptcp->pre_established = 0; + sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); + tcp_send_active_reset(sk, GFP_ATOMIC); + goto out; + } + + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (skb == NULL) { + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, + jiffies + icsk->icsk_rto); + return; + } + + /* Reserve space for headers and prepare control bits */ + skb_reserve(skb, MAX_TCP_HEADER); + tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK); + + TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) { + /* Retransmission failed because of local congestion, + * do not backoff. + */ + if (!icsk->icsk_retransmits) + icsk->icsk_retransmits = 1; + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, + jiffies + icsk->icsk_rto); + return; + } + + + icsk->icsk_retransmits++; + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, + jiffies + icsk->icsk_rto); + if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) { + __sk_dst_reset(sk); + } + +out:; +} + +void mptcp_ack_handler(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct sock *meta_sk = mptcp_meta_sk(sk); + + bh_lock_sock(meta_sk); + if (sock_owned_by_user(meta_sk)) { + /* Try again later */ + sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer, + jiffies + (HZ / 20)); + goto out_unlock; + } + + if (sk->sk_state == TCP_CLOSE) + goto out_unlock; + + mptcp_ack_retransmit_timer(sk); + + sk_mem_reclaim(sk); + +out_unlock: + bh_unlock_sock(meta_sk); + sock_put(sk); +} + +/* Similar to tcp_retransmit_skb + * + * The diff is that we handle the retransmission-stats (retrans_stamp) at the + * meta-level. + */ +int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct sock *subsk; + struct sk_buff *subskb; + unsigned int limit, tso_segs, mss_now; + int err = -1, oldpcount; + + /* Do not sent more than we queued. 1/4 is reserved for possible + * copying overhead: fragmentation, tunneling, mangling etc. + * + * This is a meta-retransmission thus we check on the meta-socket. + */ + if (atomic_read(&meta_sk->sk_wmem_alloc) > + min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) { + return -EAGAIN; + } + + /* We need to make sure that the retransmitted segment can be sent on a + * subflow right now. If it is too big, it needs to be fragmented. + */ + subsk = get_available_subflow(meta_sk, skb, &mss_now, true); + if (!subsk) { + /* We want to increase icsk_retransmits, thus return 0, so that + * mptcp_retransmit_timer enters the desired branch. + */ + err = 0; + goto failed; + } + + /* If the segment was cloned (e.g. a meta retransmission), the header + * must be expanded/copied so that there is no corruption of TSO + * information. + */ + if (skb_unclone(skb, GFP_ATOMIC)) { + err = ENOMEM; + goto failed; + } + + oldpcount = tcp_skb_pcount(skb); + tcp_set_skb_tso_segs(meta_sk, skb, mss_now); + tso_segs = tcp_skb_pcount(skb); + BUG_ON(!tso_segs); + + /* The MSS might have changed and so the number of segments. We + * need to account for this change. 
+ */ + if (unlikely(oldpcount != tso_segs)) + tcp_adjust_pcount(meta_sk, skb, oldpcount - tso_segs); + + limit = mss_now; + if (tso_segs > 1 && !tcp_urg_mode(meta_tp)) + limit = tcp_mss_split_point(subsk, skb, mss_now, + min_t(unsigned int, + tcp_cwnd_test(tcp_sk(subsk), skb), + subsk->sk_gso_max_segs)); + + if (skb->len > limit && + unlikely(mptso_fragment(meta_sk, skb, limit, mss_now, + GFP_ATOMIC, 0))) + goto failed; + + subskb = mptcp_skb_entail(subsk, skb, -1); + if (!subskb) + goto failed; + + TCP_SKB_CB(skb)->when = tcp_time_stamp; + TCP_SKB_CB(subskb)->when = tcp_time_stamp; + err = tcp_transmit_skb(subsk, subskb, 1, GFP_ATOMIC); + if (!err) { + /* Update global TCP statistics. */ + TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS); + + /* Diff to tcp_retransmit_skb */ + + /* Save stamp of the first retransmit. */ + if (!meta_tp->retrans_stamp) + meta_tp->retrans_stamp = TCP_SKB_CB(subskb)->when; + mptcp_sub_event_new_data_sent(subsk, subskb, skb); + } else { + mptcp_transmit_skb_failed(subsk, skb, subskb); + } + +failed: + return err; +} + +/* Similar to tcp_retransmit_timer + * + * The diff is that we have to handle retransmissions of the FAST_CLOSE-message + * and that we don't have an srtt estimation at the meta-level. + */ +void mptcp_retransmit_timer(struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); + int err; + + /* In fallback, retransmission is handled at the subflow-level */ + if (!meta_tp->packets_out || mpcb->infinite_mapping_snd || + mpcb->send_infinite_mapping) + return; + + WARN_ON(tcp_write_queue_empty(meta_sk)); + + if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) && + !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { + /* Receiver dastardly shrinks window. Our retransmits + * become zero probes, but we should not timeout this + * connection. If the socket is an orphan, time it out, + * we cannot allow such beasts to hang infinitely. + */ + struct inet_sock *meta_inet = inet_sk(meta_sk); + if (meta_sk->sk_family == AF_INET) { + LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", + &meta_inet->inet_daddr, + ntohs(meta_inet->inet_dport), + meta_inet->inet_num, meta_tp->snd_una, + meta_tp->snd_nxt); + } +#if IS_ENABLED(CONFIG_IPV6) + else if (meta_sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(meta_sk); + LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", + &np->daddr, ntohs(meta_inet->inet_dport), + meta_inet->inet_num, meta_tp->snd_una, + meta_tp->snd_nxt); + } +#endif + if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) { + tcp_write_err(meta_sk); + return; + } + + mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk)); + goto out_reset_timer; + } + + if (tcp_write_timeout(meta_sk)) + return; + + if (meta_icsk->icsk_retransmits == 0) + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS); + + meta_icsk->icsk_ca_state = TCP_CA_Loss; + + err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk)); + if (err > 0) { + /* Retransmission failed because of local congestion, + * do not backoff. + */ + if (!meta_icsk->icsk_retransmits) + meta_icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, + min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); + return; + } + + /* Increase the timeout each time we retransmit. 
Note that + * we do not increase the rtt estimate. rto is initialized + * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests + * that doubling rto each time is the least we can get away with. + * In KA9Q, Karn uses this for the first few times, and then + * goes to quadratic. netBSD doubles, but only goes up to *64, + * and clamps at 1 to 64 sec afterwards. Note that 120 sec is + * defined in the protocol as the maximum possible RTT. I guess + * we'll have to use something other than TCP to talk to the + * University of Mars. + * + * PAWS allows us longer timeouts and large windows, so once + * implemented ftp to mars will work nicely. We will have to fix + * the 120 second clamps though! + */ + meta_icsk->icsk_backoff++; + meta_icsk->icsk_retransmits++; + +out_reset_timer: + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this + * might be increased if the stream oscillates between thin and thick, + * thus the old value might already be too high compared to the value + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without + * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating + * exponential backoff behaviour to avoid continue hammering + * linear-timeout retransmissions into a black hole + */ + if (meta_sk->sk_state == TCP_ESTABLISHED && + (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && + tcp_stream_is_thin(meta_tp) && + meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { + meta_icsk->icsk_backoff = 0; + /* We cannot do the same as in tcp_write_timer because the + * srtt is not set here. + */ + mptcp_set_rto(meta_sk); + } else { + /* Use normal (exponential) backoff */ + meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX); + } + inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX); + + return; +} + +/* Modify values to an mptcp-level for the initial window of new subflows */ +void mptcp_select_initial_window(int *__space, __u32 *window_clamp, + const struct sock *sk) +{ + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; + + *window_clamp = mpcb->orig_window_clamp; + *__space = tcp_win_from_space(mpcb->orig_sk_rcvbuf); +} + +unsigned int mptcp_current_mss(struct sock *meta_sk) +{ + unsigned int mss = 0; + struct sock *sk; + + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { + int this_mss; + + if (!mptcp_sk_can_send(sk)) + continue; + + this_mss = tcp_current_mss(sk); + if (this_mss > mss) + mss = this_mss; + } + + /* If no subflow is available, we take a default-mss from the + * meta-socket. + */ + return !mss ? tcp_current_mss(meta_sk) : mss; +} + +int mptcp_select_size(const struct sock *meta_sk, bool sg) +{ + int mss = 0; /* We look for the smallest MSS */ + struct sock *sk; + + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { + int this_mss; + + if (!mptcp_sk_can_send(sk)) + continue; + + this_mss = tcp_sk(sk)->mss_cache; + if (this_mss > mss) + mss = this_mss; + } + + if (sg) { + if (mptcp_sk_can_gso(meta_sk)) { + mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); + } else { + int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + + if (mss >= pgbreak && + mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) + mss = pgbreak; + } + } + + return !mss ? 
tcp_sk(meta_sk)->mss_cache : mss; +} + +int mptcp_check_snd_buf(const struct tcp_sock *tp) +{ + struct sock *sk; + u32 rtt_max = tp->srtt; + u64 bw_est; + + if (!tp->srtt) + return tp->reordering + 1; + + mptcp_for_each_sk(tp->mpcb, sk) { + if (!mptcp_sk_can_send(sk)) + continue; + + if (rtt_max < tcp_sk(sk)->srtt) + rtt_max = tcp_sk(sk)->srtt; + } + + bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16, + (u64)tp->srtt); + + return max_t(unsigned int, (u32)(bw_est >> 16), + tp->reordering + 1); + +} + +unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, u32 mss_now, + int large_allowed) +{ + struct sock *sk; + u32 xmit_size_goal = 0; + + if (large_allowed && mptcp_sk_can_gso(meta_sk)) { + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) { + int this_size_goal; + + if (!mptcp_sk_can_send(sk)) + continue; + + this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1); + if (this_size_goal > xmit_size_goal) + xmit_size_goal = this_size_goal; + } + } + + return max(xmit_size_goal, mss_now); +} + +/* Similar to tcp_trim_head - but we correctly copy the DSS-option */ +int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) +{ + int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN; + char dss[dsslen]; + + /* DSS-option must be recovered afterwards. */ + memcpy(dss, skb->data - dsslen, dsslen); + + if (skb_cloned(skb)) { + /* pskb_expand_head will delete our DSS-option. We have to copy + * it back if pskb_expand_head succeeds. + */ + + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + + memcpy(skb->data - dsslen, dss, dsslen); + } + + __pskb_trim_head(skb, len); + + /* Put the DSS-option back in our header */ + memcpy(skb->data - dsslen, dss, dsslen); + + TCP_SKB_CB(skb)->seq += len; + skb->ip_summed = CHECKSUM_PARTIAL; + + skb->truesize -= len; + sk->sk_wmem_queued -= len; + sk_mem_uncharge(sk, len); + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + + /* Any change of skb->len requires recalculation of tso factor. */ + if (tcp_skb_pcount(skb) > 1) + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); + + return 0; +} diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c new file mode 100644 index 0000000..9dc309d --- /dev/null +++ b/net/mptcp/mptcp_pm.c @@ -0,0 +1,169 @@ +/* + * MPTCP implementation - MPTCP-subflow-management + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + + +#include <linux/module.h> +#include <net/mptcp.h> + +static DEFINE_SPINLOCK(mptcp_pm_list_lock); +static LIST_HEAD(mptcp_pm_list); + +static int mptcp_default_id(sa_family_t family, union inet_addr *addr, + struct net *net) +{ + return 0; +} + +struct mptcp_pm_ops mptcp_pm_default = { + .get_local_id = mptcp_default_id, + .name = "default", + .owner = THIS_MODULE, +}; + +static struct mptcp_pm_ops *mptcp_pm_find(const char *name) +{ + struct mptcp_pm_ops *e; + + list_for_each_entry_rcu(e, &mptcp_pm_list, list) { + if (strcmp(e->name, name) == 0) + return e; + } + + return NULL; +} + +int mptcp_register_path_manager(struct mptcp_pm_ops *pm) +{ + int ret = 0; + + if (!pm->get_local_id) + return -EINVAL; + + spin_lock(&mptcp_pm_list_lock); + if (mptcp_pm_find(pm->name)) { + pr_notice("%s already registered\n", pm->name); + ret = -EEXIST; + } else { + list_add_tail_rcu(&pm->list, &mptcp_pm_list); + pr_info("%s registered\n", pm->name); + } + spin_unlock(&mptcp_pm_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(mptcp_register_path_manager); + +void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm) +{ + spin_lock(&mptcp_pm_list_lock); + list_del_rcu(&pm->list); + spin_unlock(&mptcp_pm_list_lock); +} +EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager); + +void mptcp_get_default_path_manager(char *name) +{ + struct mptcp_pm_ops *pm; + + BUG_ON(list_empty(&mptcp_pm_list)); + + rcu_read_lock(); + pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list); + strncpy(name, pm->name, MPTCP_PM_NAME_MAX); + rcu_read_unlock(); +} + +int mptcp_set_default_path_manager(const char *name) +{ + struct mptcp_pm_ops *pm; + int ret = -ENOENT; + + spin_lock(&mptcp_pm_list_lock); + pm = mptcp_pm_find(name); +#ifdef CONFIG_MODULES + if (!pm && capable(CAP_NET_ADMIN)) { + spin_unlock(&mptcp_pm_list_lock); + + request_module("mptcp_%s", name); + spin_lock(&mptcp_pm_list_lock); + pm = mptcp_pm_find(name); + } +#endif + + if (pm) { + list_move(&pm->list, &mptcp_pm_list); + ret = 0; + } else { + pr_info("%s is not available\n", name); + } + spin_unlock(&mptcp_pm_list_lock); + + return ret; +} + +void mptcp_init_path_manager(struct mptcp_cb *mpcb) +{ + struct mptcp_pm_ops *pm; + + rcu_read_lock(); + list_for_each_entry_rcu(pm, &mptcp_pm_list, list) { + if (try_module_get(pm->owner)) { + mpcb->pm_ops = pm; + break; + } + } + rcu_read_unlock(); +} + +/* Manage refcounts on socket close. */ +void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb) +{ + module_put(mpcb->pm_ops->owner); +} + +/* Fallback to the default path-manager. */ +void mptcp_fallback_default(struct mptcp_cb *mpcb) +{ + struct mptcp_pm_ops *pm; + + mptcp_cleanup_path_manager(mpcb); + pm = mptcp_pm_find("default"); + + /* Cannot fail - it's the default module */ + try_module_get(pm->owner); + mpcb->pm_ops = pm; +} +EXPORT_SYMBOL_GPL(mptcp_fallback_default); + +/* Set default value from kernel configuration at bootup */ +static int __init mptcp_path_manager_default(void) +{ + return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM); +} +late_initcall(mptcp_path_manager_default);
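
For illustration only (not part of the patch): because mptcp_register_path_manager() and mptcp_unregister_path_manager() are exported above, an out-of-tree path manager could hook into this interface roughly as sketched below. All "example_" identifiers are hypothetical, and the two #include lines merely assume the same headers mptcp_pm.c itself needs (linux/module.h for THIS_MODULE and the module macros, net/mptcp.h for struct mptcp_pm_ops).

/* Illustrative sketch of a minimal MPTCP path manager module built on the
 * registration API from net/mptcp/mptcp_pm.c above. Not part of the patch.
 */
#include <linux/module.h>
#include <net/mptcp.h>

/* Advertise only address-id 0, mirroring the built-in "default" manager. */
static int example_get_local_id(sa_family_t family, union inet_addr *addr,
				struct net *net)
{
	return 0;
}

static struct mptcp_pm_ops example_pm = {
	.get_local_id	= example_get_local_id,
	.name		= "example",
	.owner		= THIS_MODULE,
};

static int __init example_pm_register(void)
{
	/* Returns -EINVAL if get_local_id is missing, -EEXIST if a manager
	 * with this name is already on mptcp_pm_list.
	 */
	return mptcp_register_path_manager(&example_pm);
}

static void __exit example_pm_unregister(void)
{
	mptcp_unregister_path_manager(&example_pm);
}

module_init(example_pm_register);
module_exit(example_pm_unregister);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Example MPTCP path manager (illustration only)");

Note that mptcp_set_default_path_manager() autoloads missing managers via request_module("mptcp_%s", name), so such a module would have to be built as mptcp_example.ko for on-demand loading of the "example" manager to work.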