diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 3c617d620b6f..bedc76c6e691 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -763,6 +763,18 @@ tcp_challenge_ack_limit - INTEGER
 	in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
 	Default: 100
 
+MPTCP variables:
+
+mptcp_enabled - INTEGER
+	Enable or disable Multipath TCP for new connections.
+	Possible values are:
+
+	0: Multipath TCP is disabled on all newly created TCP sockets.
+	1: Multipath TCP is enabled by default on all new TCP sockets. Note that
+	   existing sockets in LISTEN-state will still use regular TCP.
+	2: Multipath TCP is enabled only upon request of the application,
+	   through the socket option MPTCP_ENABLED.
+
 UDP variables:
 
 udp_l3mdev_accept - BOOLEAN
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 6c1a093b164e..1a7fd8425cda 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -3783,7 +3783,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
 	 */
 	memset(&tmp_opt, 0, sizeof(tmp_opt));
 	tcp_clear_options(&tmp_opt);
-	tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL);
+	tcp_parse_options(&init_net, skb, &tmp_opt, NULL, 0, NULL, NULL);
 
 	req = __skb_push(skb, sizeof(*req));
 	memset(req, 0, sizeof(*req));
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f97734f34746..2f0290dc647e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -697,7 +697,7 @@ struct sk_buff {
 	 * want to keep them across layers you have to do a skb_clone()
 	 * first. This is owned by whoever has the skb queued ATM.
 	 */
-	char			cb[48] __aligned(8);
+	char			cb[80] __aligned(8);
 
 	union {
 		struct {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1192f1e76015..29267da5fe9f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -58,7 +58,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 /* TCP Fast Open */
 #define TCP_FASTOPEN_COOKIE_MIN	4	/* Min Fast Open Cookie size in bytes */
 #define TCP_FASTOPEN_COOKIE_MAX	16	/* Max Fast Open Cookie size in bytes */
-#define TCP_FASTOPEN_COOKIE_SIZE 8	/* the size employed by this impl. */
+#define TCP_FASTOPEN_COOKIE_SIZE 4	/* the size employed by this impl. */
 
 /* TCP Fast Open Cookie as stored in memory */
 struct tcp_fastopen_cookie {
@@ -83,6 +83,56 @@ struct tcp_sack_block {
 	u32	end_seq;
 };
 
+struct tcp_out_options {
+	u16	options;		/* bit field of OPTION_* */
+	u16	mss;			/* 0 to disable */
+	u8	ws;			/* window scale, 0 to disable */
+	u8	num_sack_blocks;	/* number of SACK blocks to include */
+	u8	hash_size;		/* bytes in hash_location */
+	__u8	*hash_location;		/* temporary pointer, overloaded */
+	__u32	tsval, tsecr;		/* need to include OPTION_TS */
+	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
+#ifdef CONFIG_MPTCP
+	u16	mptcp_options;		/* bit field of MPTCP related OPTION_* */
+	u8	dss_csum:1,		/* dss-checksum required?
*/ + add_addr_v4:1, + add_addr_v6:1, + mptcp_ver:4; + + union { + struct { + __u64 sender_key; /* sender's key for mptcp */ + __u64 receiver_key; /* receiver's key for mptcp */ + } mp_capable; + + struct { + __u64 sender_truncated_mac; + __u32 sender_nonce; + /* random number of the sender */ + __u32 token; /* token for mptcp */ + u8 low_prio:1; + } mp_join_syns; + }; + + struct { + __u64 trunc_mac; + struct in_addr addr; + u16 port; + u8 addr_id; + } add_addr4; + + struct { + __u64 trunc_mac; + struct in6_addr addr; + u16 port; + u8 addr_id; + } add_addr6; + + u16 remove_addrs; /* list of address id */ + u8 addr_id; /* address id (mp_join or add_address) */ +#endif /* CONFIG_MPTCP */ +}; + /*These are used to set the sack_ok field in struct tcp_options_received */ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ @@ -106,6 +156,9 @@ struct tcp_options_received { u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ }; +struct mptcp_cb; +struct mptcp_tcp_sock; + static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; @@ -144,6 +197,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) return (struct tcp_request_sock *)req; } +struct tcp_md5sig_key; + struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; @@ -300,6 +355,7 @@ struct tcp_sock { u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_wnd; /* Current receiver window */ + u32 rcv_right_edge; /* Highest announced right edge */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ @@ -400,6 +456,44 @@ struct tcp_sock { */ struct request_sock *fastopen_rsk; u32 *saved_syn; + + /* MPTCP/TCP-specific callbacks */ + const struct tcp_sock_ops *ops; + + struct mptcp_cb *mpcb; + struct sock *meta_sk; + /* We keep these flags even if CONFIG_MPTCP is not checked, because + * it allows checking MPTCP capability just by checking the mpc flag, + * rather than adding ifdefs everywhere. + */ + u32 mpc:1, /* Other end is multipath capable */ + inside_tk_table:1, /* Is the tcp_sock inside the token-table? */ + send_mp_fclose:1, + request_mptcp:1, /* Did we send out an MP_CAPABLE? + * (this speeds up mptcp_doit() in tcp_recvmsg) + */ + pf:1, /* Potentially Failed state: when this flag is set, we + * stop using the subflow + */ + mp_killed:1, /* Killed with a tcp_done in mptcp? */ + is_master_sk:1, + close_it:1, /* Must close socket in mptcp_data_ready? 
*/ + closing:1, + mptcp_ver:4, + mptcp_sched_setsockopt:1, + mptcp_pm_setsockopt:1, + record_master_info:1, + tcp_disconnect:1; + struct mptcp_tcp_sock *mptcp; +#ifdef CONFIG_MPTCP +#define MPTCP_SCHED_NAME_MAX 16 +#define MPTCP_PM_NAME_MAX 16 + struct hlist_nulls_node tk_table; + u32 mptcp_loc_token; + u64 mptcp_loc_key; + char mptcp_sched_name[MPTCP_SCHED_NAME_MAX]; + char mptcp_pm_name[MPTCP_PM_NAME_MAX]; +#endif /* CONFIG_MPTCP */ }; enum tsq_enum { @@ -411,6 +505,8 @@ enum tsq_enum { TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call * tcp_v{4|6}_mtu_reduced() */ + MPTCP_PATH_MANAGER_DEFERRED, /* MPTCP deferred creation of new subflows */ + MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */ }; enum tsq_flags { @@ -420,6 +516,8 @@ enum tsq_flags { TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED), TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED), TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED), + TCPF_PATH_MANAGER_DEFERRED = (1UL << MPTCP_PATH_MANAGER_DEFERRED), + TCPF_SUB_DEFERRED = (1UL << MPTCP_SUB_DEFERRED), }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -442,6 +540,7 @@ struct tcp_timewait_sock { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif + struct mptcp_tw *mptcp_tw; }; static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 3ca969cbd161..ccabf004c0e9 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -2,6 +2,8 @@ #ifndef _INET_COMMON_H #define _INET_COMMON_H +#include + extern const struct proto_ops inet_stream_ops; extern const struct proto_ops inet_dgram_ops; @@ -14,6 +16,8 @@ struct sockaddr; struct socket; +int inet_create(struct net *net, struct socket *sock, int protocol, int kern); +int inet6_create(struct net *net, struct socket *sock, int protocol, int kern); int inet_release(struct socket *sock); int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index da8a582ab032..bd073a6af697 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -29,6 +29,7 @@ struct inet_bind_bucket; struct tcp_congestion_ops; +struct tcp_options_received; /* * Pointers to address related TCP functions diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a80fd0ac4563..436b40d4c464 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -83,7 +83,7 @@ struct inet_request_sock { #define ireq_state req.__req_common.skc_state #define ireq_family req.__req_common.skc_family - u16 snd_wscale : 4, + u32 snd_wscale : 4, rcv_wscale : 4, tstamp_ok : 1, sack_ok : 1, @@ -91,6 +91,8 @@ struct inet_request_sock { ecn_ok : 1, acked : 1, no_srccheck: 1, + mptcp_rqsk : 1, + saw_mpc : 1, smc_ok : 1; u32 ir_mark; union { diff --git a/include/net/mptcp.h b/include/net/mptcp.h new file mode 100644 index 000000000000..ccb239f28587 --- /dev/null +++ b/include/net/mptcp.h @@ -0,0 +1,1466 @@ +/* + * MPTCP implementation + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; 
you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _MPTCP_H +#define _MPTCP_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if defined(__LITTLE_ENDIAN_BITFIELD) + #define ntohll(x) be64_to_cpu(x) + #define htonll(x) cpu_to_be64(x) +#elif defined(__BIG_ENDIAN_BITFIELD) + #define ntohll(x) (x) + #define htonll(x) (x) +#endif + +struct mptcp_loc4 { + u8 loc4_id; + u8 low_prio:1; + int if_idx; + struct in_addr addr; +}; + +struct mptcp_rem4 { + u8 rem4_id; + __be16 port; + struct in_addr addr; +}; + +struct mptcp_loc6 { + u8 loc6_id; + u8 low_prio:1; + int if_idx; + struct in6_addr addr; +}; + +struct mptcp_rem6 { + u8 rem6_id; + __be16 port; + struct in6_addr addr; +}; + +struct mptcp_request_sock { + struct tcp_request_sock req; + struct hlist_nulls_node hash_entry; + + union { + struct { + /* Only on initial subflows */ + u64 mptcp_loc_key; + u64 mptcp_rem_key; + u32 mptcp_loc_token; + }; + + struct { + /* Only on additional subflows */ + u32 mptcp_rem_nonce; + u32 mptcp_loc_nonce; + u64 mptcp_hash_tmac; + }; + }; + + u8 loc_id; + u8 rem_id; /* Address-id in the MP_JOIN */ + u8 dss_csum:1, + is_sub:1, /* Is this a new subflow? */ + low_prio:1, /* Interface set to low-prio? */ + rcv_low_prio:1, + mptcp_ver:4; +}; + +struct mptcp_options_received { + u16 saw_mpc:1, + dss_csum:1, + drop_me:1, + + is_mp_join:1, + join_ack:1, + + saw_low_prio:2, /* 0x1 - low-prio set for this subflow + * 0x2 - low-prio set for another subflow + */ + low_prio:1, + + saw_add_addr:2, /* Saw at least one add_addr option: + * 0x1: IPv4 - 0x2: IPv6 + */ + more_add_addr:1, /* Saw one more add-addr. */ + + saw_rem_addr:1, /* Saw at least one rem_addr option */ + more_rem_addr:1, /* Saw one more rem-addr. */ + + mp_fail:1, + mp_fclose:1; + u8 rem_id; /* Address-id in the MP_JOIN */ + u8 prio_addr_id; /* Address-id in the MP_PRIO */ + + const unsigned char *add_addr_ptr; /* Pointer to add-address option */ + const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */ + + u32 data_ack; + u32 data_seq; + u16 data_len; + + u8 mptcp_ver; /* MPTCP version */ + + /* Key inside the option (from mp_capable or fast_close) */ + u64 mptcp_sender_key; + u64 mptcp_receiver_key; + + u32 mptcp_rem_token; /* Remote token */ + + u32 mptcp_recv_nonce; + u64 mptcp_recv_tmac; + u8 mptcp_recv_mac[20]; +}; + +struct mptcp_tcp_sock { + struct hlist_node node; + struct hlist_node cb_list; + struct mptcp_options_received rx_opt; + + /* Those three fields record the current mapping */ + u64 map_data_seq; + u32 map_subseq; + u16 map_data_len; + u16 slave_sk:1, + fully_established:1, + second_packet:1, + attached:1, + send_mp_fail:1, + include_mpc:1, + mapping_present:1, + map_data_fin:1, + low_prio:1, /* use this socket as backup */ + rcv_low_prio:1, /* Peer sent low-prio option to us */ + send_mp_prio:1, /* Trigger to send mp_prio on this socket */ + pre_established:1; /* State between sending 3rd ACK and + * receiving the fourth ack of new subflows. 
+ */ + + /* isn: needed to translate abs to relative subflow seqnums */ + u32 snt_isn; + u32 rcv_isn; + u8 path_index; + u8 loc_id; + u8 rem_id; + u8 sk_err; + +#define MPTCP_SCHED_SIZE 16 + u8 mptcp_sched[MPTCP_SCHED_SIZE] __aligned(8); + + int init_rcv_wnd; + u32 infinite_cutoff_seq; + struct delayed_work work; + u32 mptcp_loc_nonce; + struct tcp_sock *tp; + u32 last_end_data_seq; + + /* MP_JOIN subflow: timer for retransmitting the 3rd ack */ + struct timer_list mptcp_ack_timer; + + /* HMAC of the third ack */ + char sender_mac[20]; +}; + +struct mptcp_tw { + struct list_head list; + u64 loc_key; + u64 rcv_nxt; + struct mptcp_cb __rcu *mpcb; + u8 meta_tw:1, + in_list:1; +}; + +#define MPTCP_PM_NAME_MAX 16 +struct mptcp_pm_ops { + struct list_head list; + + /* Signal the creation of a new MPTCP-session. */ + void (*new_session)(const struct sock *meta_sk); + void (*release_sock)(struct sock *meta_sk); + void (*fully_established)(struct sock *meta_sk); + void (*close_session)(struct sock *meta_sk); + void (*new_remote_address)(struct sock *meta_sk); + int (*get_local_id)(const struct sock *meta_sk, sa_family_t family, + union inet_addr *addr, bool *low_prio); + void (*addr_signal)(struct sock *sk, unsigned *size, + struct tcp_out_options *opts, struct sk_buff *skb); + void (*add_raddr)(struct mptcp_cb *mpcb, const union inet_addr *addr, + sa_family_t family, __be16 port, u8 id); + void (*rem_raddr)(struct mptcp_cb *mpcb, u8 rem_id); + void (*init_subsocket_v4)(struct sock *sk, struct in_addr addr); + void (*init_subsocket_v6)(struct sock *sk, struct in6_addr addr); + void (*established_subflow)(struct sock *sk); + void (*delete_subflow)(struct sock *sk); + void (*prio_changed)(struct sock *sk, int low_prio); + + char name[MPTCP_PM_NAME_MAX]; + struct module *owner; +}; + +#define MPTCP_SCHED_NAME_MAX 16 +struct mptcp_sched_ops { + struct list_head list; + + struct sock * (*get_subflow)(struct sock *meta_sk, + struct sk_buff *skb, + bool zero_wnd_test); + struct sk_buff * (*next_segment)(struct sock *meta_sk, + int *reinject, + struct sock **subsk, + unsigned int *limit); + void (*init)(struct sock *sk); + void (*release)(struct sock *sk); + + char name[MPTCP_SCHED_NAME_MAX]; + struct module *owner; +}; + +struct mptcp_cb { + /* list of sockets in this multipath connection */ + struct hlist_head conn_list; + /* list of sockets that need a call to release_cb */ + struct hlist_head callback_list; + + /* Lock used for protecting the different rcu-lists of mptcp_cb */ + spinlock_t mpcb_list_lock; + + /* High-order bits of 64-bit sequence numbers */ + u32 snd_high_order[2]; + u32 rcv_high_order[2]; + + u16 send_infinite_mapping:1, + in_time_wait:1, + list_rcvd:1, /* XXX TO REMOVE */ + addr_signal:1, /* Path-manager wants us to call addr_signal */ + dss_csum:1, + server_side:1, + infinite_mapping_rcv:1, + infinite_mapping_snd:1, + infinite_send_una_ahead:1, /* While falling back, the snd_una + *on meta is ahead of the subflow. + */ + dfin_combined:1, /* Was the DFIN combined with subflow-fin? */ + passive_close:1, + snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */ + rcv_hiseq_index:1, /* Index in rcv_high_order of rcv_nxt */ + tcp_ca_explicit_set:1; /* was meta CC set by app? */ + +#define MPTCP_SCHED_DATA_SIZE 8 + u8 mptcp_sched[MPTCP_SCHED_DATA_SIZE] __aligned(8); + const struct mptcp_sched_ops *sched_ops; + + struct sk_buff_head reinject_queue; + /* First cache-line boundary is here minus 8 bytes. 
But from the + * reinject-queue only the next and prev pointers are regularly + * accessed. Thus, the whole data-path is on a single cache-line. + */ + + u64 csum_cutoff_seq; + u64 infinite_rcv_seq; + + /***** Start of fields, used for connection closure */ + unsigned char mptw_state; + u8 dfin_path_index; + + struct list_head tw_list; + + /***** Start of fields, used for subflow establishment and closure */ + refcount_t mpcb_refcnt; + + /* Mutex needed, because otherwise mptcp_close will complain that the + * socket is owned by the user. + * E.g., mptcp_sub_close_wq is taking the meta-lock. + */ + struct mutex mpcb_mutex; + + /***** Start of fields, used for subflow establishment */ + struct sock *meta_sk; + + /* Master socket, also part of the conn_list, this + * socket is the one that the application sees. + */ + struct sock *master_sk; + + __u64 mptcp_loc_key; + __u64 mptcp_rem_key; + __u32 mptcp_loc_token; + __u32 mptcp_rem_token; + +#define MPTCP_PM_SIZE 608 + u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8); + const struct mptcp_pm_ops *pm_ops; + + unsigned long path_index_bits; + + __u8 mptcp_ver; + + /* Original snd/rcvbuf of the initial subflow. + * Used for the new subflows on the server-side to allow correct + * autotuning + */ + int orig_sk_rcvbuf; + int orig_sk_sndbuf; + u32 orig_window_clamp; + + struct tcp_info *master_info; +}; + +#define MPTCP_VERSION_0 0 +#define MPTCP_VERSION_1 1 + +#define MPTCP_SUB_CAPABLE 0 +#define MPTCP_SUB_LEN_CAPABLE_SYN 12 +#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12 +#define MPTCP_SUB_LEN_CAPABLE_ACK 20 +#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20 + +#define MPTCP_SUB_JOIN 1 +#define MPTCP_SUB_LEN_JOIN_SYN 12 +#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12 +#define MPTCP_SUB_LEN_JOIN_SYNACK 16 +#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16 +#define MPTCP_SUB_LEN_JOIN_ACK 24 +#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24 + +#define MPTCP_SUB_DSS 2 +#define MPTCP_SUB_LEN_DSS 4 +#define MPTCP_SUB_LEN_DSS_ALIGN 4 + +/* Lengths for seq and ack are the ones without the generic MPTCP-option header, + * as they are part of the DSS-option. + * To get the total length, just add the different options together. + */ +#define MPTCP_SUB_LEN_SEQ 10 +#define MPTCP_SUB_LEN_SEQ_CSUM 12 +#define MPTCP_SUB_LEN_SEQ_ALIGN 12 + +#define MPTCP_SUB_LEN_SEQ_64 14 +#define MPTCP_SUB_LEN_SEQ_CSUM_64 16 +#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16 + +#define MPTCP_SUB_LEN_ACK 4 +#define MPTCP_SUB_LEN_ACK_ALIGN 4 + +#define MPTCP_SUB_LEN_ACK_64 8 +#define MPTCP_SUB_LEN_ACK_64_ALIGN 8 + +/* This is the "default" option-length we will send out most often. + * MPTCP DSS-header + * 32-bit data sequence number + * 32-bit data ack + * + * It is necessary to calculate the effective MSS we will be using when + * sending data. 
+ */
+#define MPTCP_SUB_LEN_DSM_ALIGN	(MPTCP_SUB_LEN_DSS_ALIGN +	\
+				 MPTCP_SUB_LEN_SEQ_ALIGN +	\
+				 MPTCP_SUB_LEN_ACK_ALIGN)
+
+#define MPTCP_SUB_ADD_ADDR	3
+#define MPTCP_SUB_LEN_ADD_ADDR4	8
+#define MPTCP_SUB_LEN_ADD_ADDR4_VER1	16
+#define MPTCP_SUB_LEN_ADD_ADDR6	20
+#define MPTCP_SUB_LEN_ADD_ADDR6_VER1	28
+#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN	8
+#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1	16
+#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN	20
+#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1	28
+
+#define MPTCP_SUB_REMOVE_ADDR	4
+#define MPTCP_SUB_LEN_REMOVE_ADDR	4
+
+#define MPTCP_SUB_PRIO		5
+#define MPTCP_SUB_LEN_PRIO	3
+#define MPTCP_SUB_LEN_PRIO_ADDR	4
+#define MPTCP_SUB_LEN_PRIO_ALIGN	4
+
+#define MPTCP_SUB_FAIL		6
+#define MPTCP_SUB_LEN_FAIL	12
+#define MPTCP_SUB_LEN_FAIL_ALIGN	12
+
+#define MPTCP_SUB_FCLOSE	7
+#define MPTCP_SUB_LEN_FCLOSE	12
+#define MPTCP_SUB_LEN_FCLOSE_ALIGN	12
+
+
+#define OPTION_MPTCP		(1 << 5)
+
+/* Max number of fastclose retransmissions */
+#define MPTCP_FASTCLOSE_RETRIES	3
+
+#ifdef CONFIG_MPTCP
+
+/* Used for checking if the mptcp initialization has been successful */
+extern bool mptcp_init_failed;
+
+/* MPTCP options */
+#define OPTION_TYPE_SYN		(1 << 0)
+#define OPTION_TYPE_SYNACK	(1 << 1)
+#define OPTION_TYPE_ACK		(1 << 2)
+#define OPTION_MP_CAPABLE	(1 << 3)
+#define OPTION_DATA_ACK		(1 << 4)
+#define OPTION_ADD_ADDR		(1 << 5)
+#define OPTION_MP_JOIN		(1 << 6)
+#define OPTION_MP_FAIL		(1 << 7)
+#define OPTION_MP_FCLOSE	(1 << 8)
+#define OPTION_REMOVE_ADDR	(1 << 9)
+#define OPTION_MP_PRIO		(1 << 10)
+
+/* MPTCP flags: both TX and RX */
+#define MPTCPHDR_SEQ		0x01 /* DSS.M option is present */
+#define MPTCPHDR_FIN		0x02 /* DSS.F option is present */
+#define MPTCPHDR_SEQ64_INDEX	0x04 /* index of seq in mpcb->snd_high_order */
+/* MPTCP flags: RX only */
+#define MPTCPHDR_ACK		0x08
+#define MPTCPHDR_SEQ64_SET	0x10 /* Did we receive a 64-bit seq number? */
+#define MPTCPHDR_SEQ64_OFO	0x20 /* Is it not in our circular array? */
+#define MPTCPHDR_DSS_CSUM	0x40
+/* MPTCP flags: TX only */
+#define MPTCPHDR_INF		0x08
+#define MPTCP_REINJECT		0x10 /* Did we reinject this segment?
*/ + +struct mptcp_option { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 ver:4, + sub:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 sub:4, + ver:4; +#else +#error "Adjust your defines" +#endif +}; + +struct mp_capable { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 ver:4, + sub:4; + __u8 h:1, + rsv:5, + b:1, + a:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 sub:4, + ver:4; + __u8 a:1, + b:1, + rsv:5, + h:1; +#else +#error "Adjust your defines" +#endif + __u64 sender_key; + __u64 receiver_key; +} __attribute__((__packed__)); + +struct mp_join { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 b:1, + rsv:3, + sub:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 sub:4, + rsv:3, + b:1; +#else +#error "Adjust your defines" +#endif + __u8 addr_id; + union { + struct { + u32 token; + u32 nonce; + } syn; + struct { + __u64 mac; + u32 nonce; + } synack; + struct { + __u8 mac[20]; + } ack; + } u; +} __attribute__((__packed__)); + +struct mp_dss { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 rsv1:4, + sub:4, + A:1, + a:1, + M:1, + m:1, + F:1, + rsv2:3; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 sub:4, + rsv1:4, + rsv2:3, + F:1, + m:1, + M:1, + a:1, + A:1; +#else +#error "Adjust your defines" +#endif +}; + +struct mp_add_addr { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 ipver:4, + sub:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 sub:4, + ipver:4; +#else +#error "Adjust your defines" +#endif + __u8 addr_id; + union { + struct { + struct in_addr addr; + __be16 port; + __u8 mac[8]; + } v4; + struct { + struct in6_addr addr; + __be16 port; + __u8 mac[8]; + } v6; + } u; +} __attribute__((__packed__)); + +struct mp_remove_addr { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 rsv:4, + sub:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 sub:4, + rsv:4; +#else +#error "Adjust your defines" +#endif + /* list of addr_id */ + __u8 addrs_id; +}; + +struct mp_fail { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 rsv1:4, + sub:4, + rsv2:8; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 sub:4, + rsv1:4, + rsv2:8; +#else +#error "Adjust your defines" +#endif + __be64 data_seq; +} __attribute__((__packed__)); + +struct mp_fclose { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 rsv1:4, + sub:4, + rsv2:8; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 sub:4, + rsv1:4, + rsv2:8; +#else +#error "Adjust your defines" +#endif + __u64 key; +} __attribute__((__packed__)); + +struct mp_prio { + __u8 kind; + __u8 len; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 b:1, + rsv:3, + sub:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 sub:4, + rsv:3, + b:1; +#else +#error "Adjust your defines" +#endif + __u8 addr_id; +} __attribute__((__packed__)); + +static inline int mptcp_sub_len_dss(const struct mp_dss *m, const int csum) +{ + return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2); +} + +#define MPTCP_SYSCTL 1 + +extern int sysctl_mptcp_enabled; +extern int sysctl_mptcp_version; +extern int sysctl_mptcp_checksum; +extern int sysctl_mptcp_debug; +extern int sysctl_mptcp_syn_retries; + +extern struct workqueue_struct *mptcp_wq; + +#define mptcp_debug(fmt, args...) 
\
+	do {							\
+		if (unlikely(sysctl_mptcp_debug))		\
+			pr_err(fmt, ##args);			\
+	} while (0)
+
+static inline struct sock *mptcp_to_sock(const struct mptcp_tcp_sock *mptcp)
+{
+	return (struct sock *)mptcp->tp;
+}
+
+#define mptcp_for_each_sub(__mpcb, __mptcp)				\
+	hlist_for_each_entry_rcu(__mptcp, &((__mpcb)->conn_list), node)
+
+/* Must be called with the appropriate lock held */
+#define mptcp_for_each_sub_safe(__mpcb, __mptcp, __tmp)			\
+	hlist_for_each_entry_safe(__mptcp, __tmp, &((__mpcb)->conn_list), node)
+
+/* Iterates over all bits set to 1 in a bitset */
+#define mptcp_for_each_bit_set(b, i)					\
+	for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1)
+
+#define mptcp_for_each_bit_unset(b, i)					\
+	mptcp_for_each_bit_set(~b, i)
+
+#define MPTCP_INC_STATS(net, field)	SNMP_INC_STATS((net)->mptcp.mptcp_statistics, field)
+
+enum
+{
+	MPTCP_MIB_NUM = 0,
+	MPTCP_MIB_MPCAPABLEPASSIVE,	/* Received SYN with MP_CAPABLE */
+	MPTCP_MIB_MPCAPABLEACTIVE,	/* Sent SYN with MP_CAPABLE */
+	MPTCP_MIB_MPCAPABLEACTIVEACK,	/* Received SYN/ACK with MP_CAPABLE */
+	MPTCP_MIB_MPCAPABLEPASSIVEACK,	/* Received third ACK with MP_CAPABLE */
+	MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
+	MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
+	MPTCP_MIB_MPCAPABLERETRANSFALLBACK,/* Client-side stopped sending MP_CAPABLE after too many SYN-retransmissions */
+	MPTCP_MIB_CSUMENABLED,		/* Created MPTCP-connection with DSS-checksum enabled */
+	MPTCP_MIB_RETRANSSEGS,		/* Segments retransmitted at the MPTCP-level */
+	MPTCP_MIB_MPFAILRX,		/* Received an MP_FAIL */
+	MPTCP_MIB_CSUMFAIL,		/* Received segment with invalid checksum */
+	MPTCP_MIB_FASTCLOSERX,		/* Received a FAST_CLOSE */
+	MPTCP_MIB_FASTCLOSETX,		/* Sent a FAST_CLOSE */
+	MPTCP_MIB_FBACKSUB,		/* Fallback upon ack without data-ack on new subflow */
+	MPTCP_MIB_FBACKINIT,		/* Fallback upon ack without data-ack on initial subflow */
+	MPTCP_MIB_FBDATASUB,		/* Fallback upon data without DSS at the beginning on new subflow */
+	MPTCP_MIB_FBDATAINIT,		/* Fallback upon data without DSS at the beginning on initial subflow */
+	MPTCP_MIB_REMADDRSUB,		/* Remove subflow due to REMOVE_ADDR */
+	MPTCP_MIB_JOINNOTOKEN,		/* Received MP_JOIN but the token was not found */
+	MPTCP_MIB_JOINFALLBACK,		/* Received MP_JOIN on session that has fallen back to reg.
TCP */ + MPTCP_MIB_JOINSYNTX, /* Sent a SYN + MP_JOIN */ + MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */ + MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */ + MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ + MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ + MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ + MPTCP_MIB_JOINACKFAIL, /* Third ACK on new subflow did not contain an MP_JOIN */ + MPTCP_MIB_JOINACKRTO, /* Retransmission timer for third ACK + MP_JOIN timed out */ + MPTCP_MIB_JOINACKRXMIT, /* Retransmitted an ACK + MP_JOIN */ + MPTCP_MIB_NODSSWINDOW, /* Received too many packets without a DSS-option */ + MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ + MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ + MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */ + MPTCP_MIB_DSSTRIMHEAD, /* Trimmed segment at the head (coalescing middlebox) */ + MPTCP_MIB_DSSSPLITTAIL, /* Trimmed segment at the tail (coalescing middlebox) */ + MPTCP_MIB_PURGEOLD, /* Removed old skb from the rcv-queue due to missing DSS-mapping */ + MPTCP_MIB_ADDADDRRX, /* Received an ADD_ADDR */ + MPTCP_MIB_ADDADDRTX, /* Sent an ADD_ADDR */ + MPTCP_MIB_REMADDRRX, /* Received a REMOVE_ADDR */ + MPTCP_MIB_REMADDRTX, /* Sent a REMOVE_ADDR */ + __MPTCP_MIB_MAX +}; + +#define MPTCP_MIB_MAX __MPTCP_MIB_MAX +struct mptcp_mib { + unsigned long mibs[MPTCP_MIB_MAX]; +}; + +extern struct lock_class_key meta_key; +extern char *meta_key_name; +extern struct lock_class_key meta_slock_key; +extern char *meta_slock_key_name; + +extern siphash_key_t mptcp_secret; + +/* This is needed to ensure that two subsequent key/nonce-generation result in + * different keys/nonces if the IPs and ports are the same. + */ +extern u32 mptcp_seed; + +#define MPTCP_HASH_SIZE 1024 + +extern struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE]; + +/* Request-sockets can be hashed in the tk_htb for collision-detection or in + * the regular htb for join-connections. We need to define different NULLS + * values so that we can correctly detect a request-socket that has been + * recycled. See also c25eb3bfb9729. 
+ */ +#define MPTCP_REQSK_NULLS_BASE (1U << 29) + + +void mptcp_data_ready(struct sock *sk); +void mptcp_write_space(struct sock *sk); + +void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb, + struct sock *sk); +void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied); +int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id, + gfp_t flags); +void mptcp_del_sock(struct sock *sk); +void mptcp_update_metasocket(const struct sock *meta_sk); +void mptcp_reinject_data(struct sock *orig_sk, int clone_it); +void mptcp_update_sndbuf(const struct tcp_sock *tp); +void mptcp_send_fin(struct sock *meta_sk); +void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority); +bool mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp); +void tcp_parse_mptcp_options(const struct sk_buff *skb, + struct mptcp_options_received *mopt); +bool mptcp_handle_ack_in_infinite(struct sock *sk, const struct sk_buff *skb, + int flag); +void mptcp_parse_options(const uint8_t *ptr, int opsize, + struct mptcp_options_received *mopt, + const struct sk_buff *skb, + struct tcp_sock *tp); +void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts, + unsigned *remaining); +void mptcp_synack_options(struct request_sock *req, + struct tcp_out_options *opts, + unsigned *remaining); +void mptcp_established_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, unsigned *size); +void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, + const struct tcp_out_options *opts, + struct sk_buff *skb); +void mptcp_close(struct sock *meta_sk, long timeout); +bool mptcp_doit(struct sock *sk); +int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, + __u8 mptcp_ver, u32 window); +int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req); +int mptcp_check_req_master(struct sock *sk, struct sock *child, + struct request_sock *req, const struct sk_buff *skb, + int drop, u32 tsoff); +struct sock *mptcp_check_req_child(struct sock *meta_sk, + struct sock *child, + struct request_sock *req, + struct sk_buff *skb, + const struct mptcp_options_received *mopt); +u32 __mptcp_select_window(struct sock *sk); +void mptcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, + __u32 *rcv_wnd, __u32 *window_clamp, + int wscale_ok, __u8 *rcv_wscale, + __u32 init_rcv_wnd); +unsigned int mptcp_current_mss(struct sock *meta_sk); +int mptcp_select_size(const struct sock *meta_sk, bool first_skb, bool zc); +void mptcp_hmac_sha1(const u8 *key_1, const u8 *key_2, u32 *hash_out, + int arg_num, ...); +void mptcp_fin(struct sock *meta_sk); +void mptcp_meta_retransmit_timer(struct sock *meta_sk); +void mptcp_sub_retransmit_timer(struct sock *sk); +int mptcp_write_wakeup(struct sock *meta_sk, int mib); +void mptcp_sub_close_wq(struct work_struct *work); +void mptcp_sub_close(struct sock *sk, unsigned long delay); +struct sock *mptcp_select_ack_sock(const struct sock *meta_sk); +void mptcp_prepare_for_backlog(struct sock *sk, struct sk_buff *skb); +int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb); +void mptcp_ack_handler(struct timer_list *t); +bool mptcp_check_rtt(const struct tcp_sock *tp, int time); +int mptcp_check_snd_buf(const struct tcp_sock *tp); +bool mptcp_handle_options(struct sock *sk, const struct tcphdr *th, + const struct sk_buff *skb); +void __init mptcp_init(void); +void mptcp_destroy_sock(struct sock *sk); +int mptcp_rcv_synsent_state_process(struct sock *sk, 
struct sock **skptr, + const struct sk_buff *skb, + const struct mptcp_options_received *mopt); +unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now, + int large_allowed); +int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw); +void mptcp_twsk_destructor(struct tcp_timewait_sock *tw); +void mptcp_time_wait(struct sock *sk, int state, int timeo); +void mptcp_disconnect(struct sock *meta_sk); +bool mptcp_should_expand_sndbuf(const struct sock *sk); +int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb); +void mptcp_tsq_flags(struct sock *sk); +void mptcp_tsq_sub_deferred(struct sock *meta_sk); +struct mp_join *mptcp_find_join(const struct sk_buff *skb); +void mptcp_hash_remove_bh(struct tcp_sock *meta_tp); +struct sock *mptcp_hash_find(const struct net *net, const u32 token); +int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw); +int mptcp_do_join_short(struct sk_buff *skb, + const struct mptcp_options_received *mopt, + struct net *net); +void mptcp_reqsk_destructor(struct request_sock *req); +void mptcp_connect_init(struct sock *sk); +void mptcp_sub_force_close(struct sock *sk); +int mptcp_sub_len_remove_addr_align(u16 bitfield); +void mptcp_join_reqsk_init(const struct mptcp_cb *mpcb, + const struct request_sock *req, + struct sk_buff *skb); +void mptcp_reqsk_init(struct request_sock *req, const struct sock *sk, + const struct sk_buff *skb, bool want_cookie); +int mptcp_conn_request(struct sock *sk, struct sk_buff *skb); +void mptcp_enable_sock(struct sock *sk); +void mptcp_disable_sock(struct sock *sk); +void mptcp_disable_static_key(void); +void mptcp_cookies_reqsk_init(struct request_sock *req, + struct mptcp_options_received *mopt, + struct sk_buff *skb); +void mptcp_mpcb_put(struct mptcp_cb *mpcb); +int mptcp_finish_handshake(struct sock *child, struct sk_buff *skb); +int mptcp_get_info(const struct sock *meta_sk, char __user *optval, int optlen); +void mptcp_clear_sk(struct sock *sk, int size); + +/* MPTCP-path-manager registration/initialization functions */ +int mptcp_register_path_manager(struct mptcp_pm_ops *pm); +void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm); +void mptcp_init_path_manager(struct mptcp_cb *mpcb); +void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb); +void mptcp_fallback_default(struct mptcp_cb *mpcb); +void mptcp_get_default_path_manager(char *name); +int mptcp_set_scheduler(struct sock *sk, const char *name); +int mptcp_set_path_manager(struct sock *sk, const char *name); +int mptcp_set_default_path_manager(const char *name); +extern struct mptcp_pm_ops mptcp_pm_default; + +/* MPTCP-scheduler registration/initialization functions */ +int mptcp_register_scheduler(struct mptcp_sched_ops *sched); +void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); +void mptcp_init_scheduler(struct mptcp_cb *mpcb); +void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb); +void mptcp_get_default_scheduler(char *name); +int mptcp_set_default_scheduler(const char *name); +bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb, + bool zero_wnd_test); +bool mptcp_is_def_unavailable(struct sock *sk); +bool subflow_is_active(const struct tcp_sock *tp); +bool subflow_is_backup(const struct tcp_sock *tp); +struct sock *get_available_subflow(struct sock *meta_sk, struct sk_buff *skb, + bool zero_wnd_test); +extern struct mptcp_sched_ops mptcp_sched_default; + +/* Initializes function-pointers and MPTCP-flags */ +static inline void mptcp_init_tcp_sock(struct sock *sk) +{ + if 
(!mptcp_init_failed && sysctl_mptcp_enabled == MPTCP_SYSCTL) + mptcp_enable_sock(sk); +} + +static inline int mptcp_pi_to_flag(int pi) +{ + return 1 << (pi - 1); +} + +static inline +struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req) +{ + return (struct mptcp_request_sock *)req; +} + +static inline +struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req) +{ + return (struct request_sock *)req; +} + +static inline bool mptcp_can_sendpage(struct sock *sk) +{ + struct mptcp_tcp_sock *mptcp; + + if (tcp_sk(sk)->mpcb->dss_csum) + return false; + + mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + if (!(sk_it->sk_route_caps & NETIF_F_SG)) + return false; + } + + return true; +} + +static inline void mptcp_push_pending_frames(struct sock *meta_sk) +{ + /* We check packets out and send-head here. TCP only checks the + * send-head. But, MPTCP also checks packets_out, as this is an + * indication that we might want to do opportunistic reinjection. + */ + if (tcp_sk(meta_sk)->packets_out || tcp_send_head(meta_sk)) { + struct tcp_sock *tp = tcp_sk(meta_sk); + + /* We don't care about the MSS, because it will be set in + * mptcp_write_xmit. + */ + __tcp_push_pending_frames(meta_sk, 0, tp->nonagle); + } +} + +static inline void mptcp_send_reset(struct sock *sk) +{ + if (tcp_need_reset(sk->sk_state)) + tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC); + mptcp_sub_force_close(sk); +} + +static inline void mptcp_sub_force_close_all(struct mptcp_cb *mpcb, + struct sock *except) +{ + struct mptcp_tcp_sock *mptcp; + struct hlist_node *tmp; + + mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + if (sk_it != except) + mptcp_send_reset(sk_it); + } +} + +static inline bool mptcp_is_data_seq(const struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ; +} + +static inline bool mptcp_is_data_fin(const struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN; +} + +/* Is it a data-fin while in infinite mapping mode? + * In infinite mode, a subflow-fin is in fact a data-fin. + */ +static inline bool mptcp_is_data_fin2(const struct sk_buff *skb, + const struct tcp_sock *tp) +{ + return mptcp_is_data_fin(skb) || + (tp->mpcb->infinite_mapping_rcv && + (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)); +} + +static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb) +{ + u64 data_seq_high = (u32)(data_seq >> 32); + + if (mpcb->rcv_high_order[0] == data_seq_high) + return 0; + else if (mpcb->rcv_high_order[1] == data_seq_high) + return MPTCPHDR_SEQ64_INDEX; + else + return MPTCPHDR_SEQ64_OFO; +} + +/* Sets the data_seq and returns pointer to the in-skb field of the data_seq. + * If the packet has a 64-bit dseq, the pointer points to the last 32 bits. 
+ */ +static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb, + u32 *data_seq, + struct mptcp_cb *mpcb) +{ + __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off); + + if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) { + u64 data_seq64 = get_unaligned_be64(ptr); + + if (mpcb) + TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb); + + *data_seq = (u32)data_seq64; + ptr++; + } else { + *data_seq = get_unaligned_be32(ptr); + } + + return ptr; +} + +static inline struct sock *mptcp_meta_sk(const struct sock *sk) +{ + return tcp_sk(sk)->meta_sk; +} + +static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp) +{ + return tcp_sk(tp->meta_sk); +} + +static inline int is_meta_tp(const struct tcp_sock *tp) +{ + return tp->mpcb && mptcp_meta_tp(tp) == tp; +} + +static inline int is_meta_sk(const struct sock *sk) +{ + return sk->sk_state != TCP_NEW_SYN_RECV && + sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP && + mptcp(tcp_sk(sk)) && mptcp_meta_sk(sk) == sk; +} + +static inline int is_master_tp(const struct tcp_sock *tp) +{ + return !mptcp(tp) || (!tp->mptcp->slave_sk && !is_meta_tp(tp)); +} + +static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt) +{ + mopt->saw_mpc = 0; + mopt->dss_csum = 0; + mopt->drop_me = 0; + + mopt->is_mp_join = 0; + mopt->join_ack = 0; + + mopt->saw_low_prio = 0; + mopt->low_prio = 0; + + mopt->saw_add_addr = 0; + mopt->more_add_addr = 0; + + mopt->saw_rem_addr = 0; + mopt->more_rem_addr = 0; + + mopt->mp_fail = 0; + mopt->mp_fclose = 0; +} + +static inline void mptcp_reset_mopt(struct tcp_sock *tp) +{ + struct mptcp_options_received *mopt = &tp->mptcp->rx_opt; + + mopt->saw_low_prio = 0; + mopt->saw_add_addr = 0; + mopt->more_add_addr = 0; + mopt->saw_rem_addr = 0; + mopt->more_rem_addr = 0; + mopt->join_ack = 0; + mopt->mp_fail = 0; + mopt->mp_fclose = 0; +} + +static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb, + const struct mptcp_cb *mpcb) +{ + return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags & + MPTCPHDR_SEQ64_INDEX) ? 1 : 0]); +} + +static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index, + u32 data_seq_32) +{ + return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32; +} + +static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp) +{ + struct mptcp_cb *mpcb = meta_tp->mpcb; + return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, + meta_tp->rcv_nxt); +} + +static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc) +{ + if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) { + struct mptcp_cb *mpcb = meta_tp->mpcb; + mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1; + mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2; + } +} + +static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp, + u32 old_rcv_nxt) +{ + if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) { + struct mptcp_cb *mpcb = meta_tp->mpcb; + mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2; + mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 
0 : 1; + } +} + +static inline int mptcp_sk_can_send(const struct sock *sk) +{ + return tcp_passive_fastopen(sk) || + ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + !tcp_sk(sk)->mptcp->pre_established); +} + +static inline int mptcp_sk_can_recv(const struct sock *sk) +{ + return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2); +} + +static inline int mptcp_sk_can_send_ack(const struct sock *sk) +{ + return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | + TCPF_CLOSE | TCPF_LISTEN)) && + !tcp_sk(sk)->mptcp->pre_established; +} + +static inline bool mptcp_can_sg(const struct sock *meta_sk) +{ + struct mptcp_tcp_sock *mptcp; + + if (tcp_sk(meta_sk)->mpcb->dss_csum) + return false; + + mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + + if (!mptcp_sk_can_send(sk)) + continue; + if (!(sk->sk_route_caps & NETIF_F_SG)) + return false; + } + return true; +} + +static inline void mptcp_set_rto(struct sock *sk) +{ + struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk)); + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_tcp_sock *mptcp; + __u32 max_rto = 0; + + /* We are in recovery-phase on the MPTCP-level. Do not update the + * RTO, because this would kill exponential backoff. + */ + if (micsk->icsk_retransmits) + return; + + mptcp_for_each_sub(tp->mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + if ((mptcp_sk_can_send(sk_it) || sk_it->sk_state == TCP_SYN_RECV) && + inet_csk(sk_it)->icsk_retransmits == 0 && + inet_csk(sk_it)->icsk_backoff == 0 && + inet_csk(sk_it)->icsk_rto > max_rto) + max_rto = inet_csk(sk_it)->icsk_rto; + } + if (max_rto) { + micsk->icsk_rto = max_rto << 1; + + /* A successfull rto-measurement - reset backoff counter */ + micsk->icsk_backoff = 0; + } +} + +static inline void mptcp_sub_close_passive(struct sock *sk) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk); + + /* Only close, if the app did a send-shutdown (passive close), and we + * received the data-ack of the data-fin. + */ + if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq) + mptcp_sub_close(sk, 0); +} + +/* Returns true if all subflows were closed */ +static inline bool mptcp_fallback_close(struct mptcp_cb *mpcb, + struct sock *except) +{ + /* It can happen that the meta is already closed. In that case, don't + * keep the subflow alive - close everything! + */ + if (mpcb->meta_sk->sk_state == TCP_CLOSE) + except = NULL; + + mptcp_sub_force_close_all(mpcb, except); + + if (mpcb->pm_ops->close_session) + mpcb->pm_ops->close_session(mptcp_meta_sk(except)); + + return !except; +} + +static inline bool mptcp_v6_is_v4_mapped(const struct sock *sk) +{ + return sk->sk_family == AF_INET6 && + ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED; +} + +/* We are in or are becoming to be in infinite mapping mode */ +static inline bool mptcp_in_infinite_mapping_weak(const struct mptcp_cb *mpcb) +{ + return mpcb->infinite_mapping_rcv || + mpcb->infinite_mapping_snd || + mpcb->send_infinite_mapping; +} + +static inline bool mptcp_can_new_subflow(const struct sock *meta_sk) +{ + /* Has been removed from the tk-table. Thus, no new subflows. + * + * Check for close-state is necessary, because we may have been closed + * without passing by mptcp_close(). + * + * When falling back, no new subflows are allowed either. 
+ */ + return meta_sk->sk_state != TCP_CLOSE && + tcp_sk(meta_sk)->inside_tk_table && + !tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv && + !tcp_sk(meta_sk)->mpcb->send_infinite_mapping; +} + +static inline int mptcp_subflow_count(const struct mptcp_cb *mpcb) +{ + struct mptcp_tcp_sock *mptcp; + int i = 0; + + mptcp_for_each_sub(mpcb, mptcp) + i++; + + return i; +} + +/* TCP and MPTCP mpc flag-depending functions */ +u16 mptcp_select_window(struct sock *sk); +void mptcp_tcp_set_rto(struct sock *sk); + +#else /* CONFIG_MPTCP */ +#define mptcp_debug(fmt, args...) \ + do { \ + } while (0) + +static inline struct sock *mptcp_to_sock(const struct mptcp_tcp_sock *mptcp) +{ + return NULL; +} + +#define mptcp_for_each_sub(__mpcb, __mptcp) \ + if (0) + +#define MPTCP_INC_STATS(net, field) \ + do { \ + } while(0) + +static inline bool mptcp_is_data_fin(const struct sk_buff *skb) +{ + return false; +} +static inline bool mptcp_is_data_seq(const struct sk_buff *skb) +{ + return false; +} +static inline struct sock *mptcp_meta_sk(const struct sock *sk) +{ + return NULL; +} +static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp) +{ + return NULL; +} +static inline int is_meta_sk(const struct sock *sk) +{ + return 0; +} +static inline int is_master_tp(const struct tcp_sock *tp) +{ + return 0; +} +static inline void mptcp_del_sock(const struct sock *sk) {} +static inline void mptcp_update_metasocket(const struct sock *meta_sk) {} +static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {} +static inline void mptcp_update_sndbuf(const struct tcp_sock *tp) {} +static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {} +static inline void mptcp_set_rto(const struct sock *sk) {} +static inline void mptcp_send_fin(const struct sock *meta_sk) {} +static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize, + struct mptcp_options_received *mopt, + const struct sk_buff *skb, + const struct tcp_sock *tp) {} +static inline void mptcp_syn_options(const struct sock *sk, + struct tcp_out_options *opts, + unsigned *remaining) {} +static inline void mptcp_synack_options(struct request_sock *req, + struct tcp_out_options *opts, + unsigned *remaining) {} + +static inline void mptcp_established_options(struct sock *sk, + struct sk_buff *skb, + struct tcp_out_options *opts, + unsigned *size) {} +static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, + const struct tcp_out_options *opts, + struct sk_buff *skb) {} +static inline void mptcp_close(struct sock *meta_sk, long timeout) {} +static inline bool mptcp_doit(struct sock *sk) +{ + return false; +} +static inline int mptcp_check_req_fastopen(struct sock *child, + struct request_sock *req) +{ + return 1; +} +static inline int mptcp_check_req_master(const struct sock *sk, + const struct sock *child, + const struct request_sock *req, + const struct sk_buff *skb, + int drop, + u32 tsoff) +{ + return 1; +} +static inline struct sock *mptcp_check_req_child(const struct sock *meta_sk, + const struct sock *child, + const struct request_sock *req, + struct sk_buff *skb, + const struct mptcp_options_received *mopt) +{ + return NULL; +} +static inline unsigned int mptcp_current_mss(struct sock *meta_sk) +{ + return 0; +} +static inline void mptcp_sub_close_passive(struct sock *sk) {} +static inline bool mptcp_handle_ack_in_infinite(const struct sock *sk, + const struct sk_buff *skb, + int flag) +{ + return false; +} +static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {} 
+static inline void mptcp_prepare_for_backlog(struct sock *sk, struct sk_buff *skb) {} +static inline bool mptcp_check_rtt(const struct tcp_sock *tp, int time) +{ + return false; +} +static inline int mptcp_check_snd_buf(const struct tcp_sock *tp) +{ + return 0; +} +static inline void mptcp_push_pending_frames(struct sock *meta_sk) {} +static inline void mptcp_send_reset(const struct sock *sk) {} +static inline void mptcp_sub_force_close_all(struct mptcp_cb *mpcb, + struct sock *except) {} +static inline bool mptcp_handle_options(struct sock *sk, + const struct tcphdr *th, + struct sk_buff *skb) +{ + return false; +} +static inline void mptcp_reset_mopt(struct tcp_sock *tp) {} +static inline void __init mptcp_init(void) {} +static inline bool mptcp_can_sg(const struct sock *meta_sk) +{ + return false; +} +static inline unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, + u32 mss_now, int large_allowed) +{ + return 0; +} +static inline void mptcp_destroy_sock(struct sock *sk) {} +static inline int mptcp_rcv_synsent_state_process(struct sock *sk, + struct sock **skptr, + struct sk_buff *skb, + const struct mptcp_options_received *mopt) +{ + return 0; +} +static inline bool mptcp_can_sendpage(struct sock *sk) +{ + return false; +} +static inline int mptcp_init_tw_sock(struct sock *sk, + struct tcp_timewait_sock *tw) +{ + return 0; +} +static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {} +static inline void mptcp_disconnect(struct sock *meta_sk) {} +static inline void mptcp_tsq_flags(struct sock *sk) {} +static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {} +static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {} +static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb, + const struct sk_buff *skb) {} +static inline void mptcp_init_tcp_sock(struct sock *sk) {} +static inline void mptcp_disable_static_key(void) {} +static inline void mptcp_cookies_reqsk_init(struct request_sock *req, + struct mptcp_options_received *mopt, + struct sk_buff *skb) {} +static inline void mptcp_mpcb_put(struct mptcp_cb *mpcb) {} +static inline void mptcp_fin(struct sock *meta_sk) {} +static inline bool mptcp_in_infinite_mapping_weak(const struct mptcp_cb *mpcb) +{ + return false; +} +static inline bool mptcp_can_new_subflow(const struct sock *meta_sk) +{ + return false; +} + +#endif /* CONFIG_MPTCP */ + +#endif /* _MPTCP_H */ diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h new file mode 100644 index 000000000000..c58d42b11f6a --- /dev/null +++ b/include/net/mptcp_v4.h @@ -0,0 +1,76 @@ +/* + * MPTCP implementation + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef MPTCP_V4_H_ +#define MPTCP_V4_H_ + + +#include +#include +#include +#include +#include + +extern struct request_sock_ops mptcp_request_sock_ops; +extern const struct inet_connection_sock_af_ops mptcp_v4_specific; +extern struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops; +extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops; + +#ifdef CONFIG_MPTCP + +int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb); +struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr, + const __be32 laddr, const struct net *net); +int __mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc, + __be16 sport, struct mptcp_rem4 *rem, + struct sock **subsk); +int mptcp_pm_v4_init(void); +void mptcp_pm_v4_undo(void); +u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport); +u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, + u32 seed); + +static inline int mptcp_init4_subsockets(struct sock *meta_sk, + const struct mptcp_loc4 *loc, + struct mptcp_rem4 *rem) +{ + return __mptcp_init4_subsockets(meta_sk, loc, 0, rem, NULL); +} + +#else + +static inline int mptcp_v4_do_rcv(const struct sock *meta_sk, + const struct sk_buff *skb) +{ + return 0; +} + +#endif /* CONFIG_MPTCP */ + +#endif /* MPTCP_V4_H_ */ diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h new file mode 100644 index 000000000000..93e8c87c2eb1 --- /dev/null +++ b/include/net/mptcp_v6.h @@ -0,0 +1,77 @@ +/* + * MPTCP implementation + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Jaakko Korkeaniemi + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _MPTCP_V6_H +#define _MPTCP_V6_H + +#include +#include + +#include + + +#ifdef CONFIG_MPTCP +extern const struct inet_connection_sock_af_ops mptcp_v6_mapped; +extern const struct inet_connection_sock_af_ops mptcp_v6_specific; +extern struct request_sock_ops mptcp6_request_sock_ops; +extern struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops; +extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops; + +int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb); +struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr, + const struct in6_addr *laddr, const struct net *net); +int __mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc, + __be16 sport, struct mptcp_rem6 *rem, + struct sock **subsk); +int mptcp_pm_v6_init(void); +void mptcp_pm_v6_undo(void); +__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport); +u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport, u32 seed); + +static inline int mptcp_init6_subsockets(struct sock *meta_sk, + const struct mptcp_loc6 *loc, + struct mptcp_rem6 *rem) +{ + return __mptcp_init6_subsockets(meta_sk, loc, 0, rem, NULL); +} + +#else /* CONFIG_MPTCP */ + +#define mptcp_v6_mapped ipv6_mapped + +static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb) +{ + return 0; +} + +#endif /* CONFIG_MPTCP */ + +#endif /* _MPTCP_V6_H */ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index bc88ac6c2e1d..a9e0c39013d7 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -110,6 +111,9 @@ struct net { #if IS_ENABLED(CONFIG_IPV6) struct netns_ipv6 ipv6; #endif +#if IS_ENABLED(CONFIG_MPTCP) + struct netns_mptcp mptcp; +#endif #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) struct netns_ieee802154_lowpan ieee802154_lowpan; #endif diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h new file mode 100644 index 000000000000..6680f3bbcfc8 --- /dev/null +++ b/include/net/netns/mptcp.h @@ -0,0 +1,52 @@ +/* + * MPTCP implementation - MPTCP namespace + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef __NETNS_MPTCP_H__ +#define __NETNS_MPTCP_H__ + +#include + +enum { + MPTCP_PM_FULLMESH = 0, + MPTCP_PM_MAX +}; + +struct mptcp_mib; + +struct netns_mptcp { + DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics); + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_net_mptcp; +#endif + + void *path_managers[MPTCP_PM_MAX]; +}; + +#endif /* __NETNS_MPTCP_H__ */ diff --git a/include/net/snmp.h b/include/net/snmp.h index c9228ad7ee91..0d397491cb9f 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -91,7 +91,6 @@ struct icmpv6msg_mib_device { atomic_long_t mibs[ICMP6MSG_MIB_MAX]; }; - /* TCP */ #define TCP_MIB_MAX __TCP_MIB_MAX struct tcp_mib { diff --git a/include/net/sock.h b/include/net/sock.h index 75677050c82e..ce2f31dab948 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -817,6 +817,7 @@ enum sock_flags { SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ SOCK_TXTIME, + SOCK_MPTCP, /* MPTCP set on this socket */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -1124,6 +1125,7 @@ struct proto { void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); + void (*clear_sk)(struct sock *sk, int size); /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS diff --git a/include/net/tcp.h b/include/net/tcp.h index 3f0d654984cf..0c4e4438caaf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -185,6 +185,7 @@ #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ +#define TCPOPT_MPTCP 30 #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP @@ -241,6 +242,31 @@ */ #define TFO_SERVER_WO_SOCKOPT1 0x400 +/* Flags from tcp_input.c for tcp_ack */ +#define FLAG_DATA 0x01 /* Incoming frame contained data. */ +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK. 
*/ +#define FLAG_ECE 0x40 /* ECE in this ACK */ +#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ +#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ +#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ +#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ +#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ +#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ +#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ +#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ + +#define MPTCP_FLAG_DATA_ACKED 0x20000 + +#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) +#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) +#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; @@ -313,6 +339,96 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift) #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) +/**** START - Exports needed for MPTCP ****/ +extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; +extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; + +struct mptcp_options_received; + +void tcp_cleanup_rbuf(struct sock *sk, int copied); +int tcp_close_state(struct sock *sk); +void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, + const struct sk_buff *skb); +int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib); +void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb); +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask); +u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now); +unsigned int tcp_mss_split_point(const struct sock *sk, + const struct sk_buff *skb, + unsigned int mss_now, + unsigned int max_segs, + int nonagle); +bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, + unsigned int cur_mss, int nonagle); +bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, + unsigned int cur_mss); +unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb); +int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now); +int __pskb_trim_head(struct sk_buff *skb, int len); +void tcp_queue_skb(struct sock *sk, struct sk_buff *skb); +void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags); +void tcp_reset(struct sock *sk); +bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, + const u32 ack_seq, const u32 nwin); +bool tcp_urg_mode(const struct tcp_sock *tp); +void tcp_ack_probe(struct sock *sk); +void tcp_rearm_rto(struct sock *sk); +int tcp_write_timeout(struct sock *sk); +bool retransmits_timed_out(struct sock *sk, + unsigned int boundary, + unsigned int timeout); +void tcp_write_err(struct sock *sk); +void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr); +void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb); +void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now); + +void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, + struct 
request_sock *req); +void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb); +struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb); +void tcp_v4_reqsk_destructor(struct request_sock *req); + +void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, + struct request_sock *req); +void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); +struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb); +int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); +int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +void tcp_v6_destroy_sock(struct sock *sk); +void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); +void tcp_v6_hash(struct sock *sk); +struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb); +struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req); +void tcp_v6_reqsk_destructor(struct request_sock *req); + +unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, + int large_allowed); +u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb); + +void skb_clone_fraglist(struct sk_buff *skb); + +void inet_twsk_free(struct inet_timewait_sock *tw); +int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb); +/* These states need RST on ABORT according to RFC793 */ +static inline bool tcp_need_reset(int state) +{ + return (1 << state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | + TCPF_FIN_WAIT2 | TCPF_SYN_RECV); +} + +int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, + bool *fragstolen); +void tcp_ofo_queue(struct sock *sk); +void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb); +int linear_payload_sz(bool first_skb); +/**** END - Exports needed for MPTCP ****/ + void tcp_tasklet_init(void); void tcp_v4_err(struct sk_buff *skb, u32); @@ -412,7 +528,9 @@ int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, - int estab, struct tcp_fastopen_cookie *foc); + struct mptcp_options_received *mopt_rx, + int estab, struct tcp_fastopen_cookie *foc, + struct tcp_sock *tp); const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); /* @@ -421,6 +539,7 @@ void tcp_parse_options(const struct net *net, const struct sk_buff *skb, void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); +void tcp_v6_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, @@ -538,7 +657,8 @@ static inline u32 tcp_cookie_time(void) u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); -__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); +__u32 cookie_v4_init_sequence(struct request_sock *req, const struct sock *sk, + const struct sk_buff *skb, __u16 *mss); u64 cookie_init_timestamp(struct request_sock *req); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); @@ -552,7 +672,8 @@ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); -__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 
*mss); +__u32 cookie_v6_init_sequence(struct request_sock *req, const struct sock *sk, + const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ @@ -588,10 +709,16 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb); +u16 tcp_select_window(struct sock *sk); +bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp); + /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk); +void tcp_set_rto(struct sock *sk); +bool tcp_should_expand_sndbuf(const struct sock *sk); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); @@ -635,7 +762,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) } /* tcp.c */ -void tcp_get_info(struct sock *, struct tcp_info *); +void tcp_get_info(struct sock *, struct tcp_info *, bool no_lock); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, @@ -713,7 +840,7 @@ static inline u32 tcp_min_rtt(const struct tcp_sock *tp) * Rcv_nxt can be after the window if our peer push more data * than the offered window. */ -static inline u32 tcp_receive_window(const struct tcp_sock *tp) +static inline u32 tcp_receive_window_now(const struct tcp_sock *tp) { s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; @@ -722,6 +849,32 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp) return (u32) win; } +/* right edge only moves forward, even if window shrinks due + * to mptcp meta + */ +static inline void tcp_update_rcv_right_edge(struct tcp_sock *tp) +{ + if (after(tp->rcv_wup + tp->rcv_wnd, tp->rcv_right_edge)) + tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd; +} + +/* Compute receive window which will never shrink. The way MPTCP handles + * the receive window can cause the effective right edge to shrink, + * causing valid segments to become out of window. + * This function should be used when checking if a segment is valid for + * the max right edge announced. + */ +static inline u32 tcp_receive_window_no_shrink(const struct tcp_sock *tp) +{ + s32 win = tp->rcv_right_edge - tp->rcv_nxt; + + win = max_t(s32, win, tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt); + + if (unlikely(win < 0)) + win = 0; + return (u32) win; +} + /* Choose a new window, without checks for shrinking, and without * scaling applied to the result. The caller does these things * if necessary. This is a "raw" window selection. @@ -823,6 +976,12 @@ struct tcp_skb_cb { u16 tcp_gso_size; }; }; + +#ifdef CONFIG_MPTCP + __u8 mptcp_flags; /* flags for the MPTCP layer */ + __u8 dss_off; /* Number of 4-byte words until + * seq-number */ +#endif __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK. 
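A side note on the receive-window helpers introduced above: tcp_receive_window_now() gives the instantaneous window, while tcp_receive_window_no_shrink() is bounded below by the highest right edge ever announced, which tcp_update_rcv_right_edge() only ever moves forward. The short standalone sketch below is illustrative only — plain user-space C with hand-picked example values, not kernel code — and models the fields involved to show why MPTCP's segment-validity checks must use the non-shrinking edge.

/* Illustration only, not kernel code: models rcv_nxt/rcv_wup/rcv_wnd and the
 * separately tracked rcv_right_edge with example values.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t rcv_nxt = 1000, rcv_wup = 1000, rcv_wnd = 600;
	uint32_t rcv_right_edge = rcv_wup + rcv_wnd;	/* announced edge: 1600 */

	/* Meta-level accounting shrinks the window we would announce next. */
	rcv_wnd = 300;

	/* tcp_update_rcv_right_edge(): only move the edge forward. */
	if ((int32_t)(rcv_wup + rcv_wnd - rcv_right_edge) > 0)
		rcv_right_edge = rcv_wup + rcv_wnd;	/* not taken: edge stays 1600 */

	printf("window now:       %u\n", (unsigned)(rcv_wup + rcv_wnd - rcv_nxt)); /* 300 */
	printf("window no-shrink: %u\n", (unsigned)(rcv_right_edge - rcv_nxt));    /* 600 */

	/* A segment ending at sequence 1500 is outside the instantaneous window
	 * but inside the edge the peer was already told about, so it must not
	 * be treated as out-of-window.
	 */
	return 0;
}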
*/ @@ -841,6 +1000,14 @@ struct tcp_skb_cb { has_rxtstamp:1, /* SKB has a RX timestamp */ unused:5; __u32 ack_seq; /* Sequence number ACK'd */ + +#ifdef CONFIG_MPTCP + union { /* For MPTCP outgoing frames */ + __u32 path_mask; /* paths that tried to send this skb */ + __u32 dss[6]; /* DSS options */ + }; +#endif + union { struct { /* There is space for up to 24 bytes */ @@ -1067,6 +1234,8 @@ struct tcp_congestion_ops { int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit, bool cap_net_admin); +int __tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool reinit, bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); @@ -1361,6 +1530,19 @@ static inline int tcp_win_from_space(const struct sock *sk, int space) space - (space>>tcp_adv_win_scale); } +#ifdef CONFIG_MPTCP +extern struct static_key mptcp_static_key; +static inline bool mptcp(const struct tcp_sock *tp) +{ + return static_key_false(&mptcp_static_key) && tp->mpc; +} +#else +static inline bool mptcp(const struct tcp_sock *tp) +{ + return 0; +} +#endif + /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { @@ -1911,6 +2093,31 @@ struct tcp_sock_af_ops { #endif }; +/* TCP/MPTCP-specific functions */ +struct tcp_sock_ops { + u32 (*__select_window)(struct sock *sk); + u16 (*select_window)(struct sock *sk); + void (*select_initial_window)(const struct sock *sk, int __space, + __u32 mss, __u32 *rcv_wnd, + __u32 *window_clamp, int wscale_ok, + __u8 *rcv_wscale, __u32 init_rcv_wnd); + int (*select_size)(const struct sock *sk, bool first_skb, bool zc); + void (*init_buffer_space)(struct sock *sk); + void (*set_rto)(struct sock *sk); + bool (*should_expand_sndbuf)(const struct sock *sk); + void (*send_fin)(struct sock *sk); + bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp); + void (*send_active_reset)(struct sock *sk, gfp_t priority); + int (*write_wakeup)(struct sock *sk, int mib); + void (*retransmit_timer)(struct sock *sk); + void (*time_wait)(struct sock *sk, int state, int timeo); + void (*cleanup_rbuf)(struct sock *sk, int copied); + int (*set_cong_ctrl)(struct sock *sk, const char *name, bool load, + bool reinit, bool cap_net_admin); +}; +extern const struct tcp_sock_ops tcp_specific; + struct tcp_request_sock_ops { u16 mss_clamp; #ifdef CONFIG_TCP_MD5SIG @@ -1921,12 +2128,13 @@ struct tcp_request_sock_ops { const struct sock *sk, const struct sk_buff *skb); #endif - void (*init_req)(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb); + int (*init_req)(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb, + bool want_cookie); #ifdef CONFIG_SYN_COOKIES - __u32 (*cookie_init_seq)(const struct sk_buff *skb, - __u16 *mss); + __u32 (*cookie_init_seq)(struct request_sock *req, const struct sock *sk, + const struct sk_buff *skb, __u16 *mss); #endif struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, const struct request_sock *req); @@ -1940,15 +2148,17 @@ struct tcp_request_sock_ops { #ifdef CONFIG_SYN_COOKIES static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, + struct request_sock *req, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { tcp_synq_overflow(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - 
return ops->cookie_init_seq(skb, mss); + return ops->cookie_init_seq(req, sk, skb, mss); } #else static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, + struct request_sock *req, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h index 2875e169d744..962a9ddce381 100644 --- a/include/net/tcp_states.h +++ b/include/net/tcp_states.h @@ -26,6 +26,7 @@ enum { TCP_LISTEN, TCP_CLOSING, /* Now a valid state */ TCP_NEW_SYN_RECV, + TCP_RST_WAIT, TCP_MAX_STATES /* Leave at the end! */ }; @@ -47,6 +48,7 @@ enum { TCPF_LISTEN = (1 << TCP_LISTEN), TCPF_CLOSING = (1 << TCP_CLOSING), TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV), + TCPF_RST_WAIT = (1 << TCP_RST_WAIT), }; #endif /* _LINUX_TCP_STATES_H */ diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index a8f6020f1196..5e70b086fdfb 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -58,6 +58,8 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, /* address family specific functions */ extern const struct inet_connection_sock_af_ops ipv4_specific; +extern const struct inet_connection_sock_af_ops ipv6_mapped; +extern const struct inet_connection_sock_af_ops ipv6_specific; void inet6_destroy_sock(struct sock *sk); diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index ac55b328d61b..6c0c923582c8 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ @@ -178,6 +179,13 @@ TP_ARGS(sk) ); +DEFINE_EVENT(tcp_event_sk_skb, mptcp_retransmit, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + + TP_ARGS(sk, skb) +); + TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), @@ -245,6 +253,7 @@ __field(__u32, srtt) __field(__u32, rcv_wnd) __field(__u64, sock_cookie) + __field(__u8, mptcp) ), TP_fast_assign( @@ -271,13 +280,15 @@ __entry->ssthresh = tcp_current_ssthresh(sk); __entry->srtt = tp->srtt_us >> 3; __entry->sock_cookie = sock_gen_cookie(sk); + __entry->mptcp = mptcp(tp) ? tp->mptcp->path_index : 0; ), - TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", + TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx mptcp=%d", __entry->saddr, __entry->daddr, __entry->mark, __entry->data_len, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, - __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie) + __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie, + __entry->mptcp) ); #endif /* _TRACE_TCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8481fc7676c0..081f587f110f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2688,6 +2688,7 @@ enum { BPF_TCP_LISTEN, BPF_TCP_CLOSING, /* Now a valid state */ BPF_TCP_NEW_SYN_RECV, + BPF_TCP_RST_WAIT, BPF_TCP_MAX_STATES /* Leave at the end! 
*/ }; diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index 7fea0fd7d6f5..7255e08393db 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -132,6 +132,9 @@ enum net_device_flags { #define IFF_ECHO IFF_ECHO #endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */ +#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */ +#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */ + #define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\ IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT) diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h new file mode 100644 index 000000000000..f268e9805fe1 --- /dev/null +++ b/include/uapi/linux/mptcp.h @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Netlink API for Multipath TCP + * + * Author: Gregory Detal + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _LINUX_MPTCP_H +#define _LINUX_MPTCP_H + +#define MPTCP_GENL_NAME "mptcp" +#define MPTCP_GENL_EV_GRP_NAME "mptcp_events" +#define MPTCP_GENL_CMD_GRP_NAME "mptcp_commands" +#define MPTCP_GENL_VER 0x1 + +/* + * ATTR types defined for MPTCP + */ +enum { + MPTCP_ATTR_UNSPEC = 0, + + MPTCP_ATTR_TOKEN, /* u32 */ + MPTCP_ATTR_FAMILY, /* u16 */ + MPTCP_ATTR_LOC_ID, /* u8 */ + MPTCP_ATTR_REM_ID, /* u8 */ + MPTCP_ATTR_SADDR4, /* u32 */ + MPTCP_ATTR_SADDR6, /* struct in6_addr */ + MPTCP_ATTR_DADDR4, /* u32 */ + MPTCP_ATTR_DADDR6, /* struct in6_addr */ + MPTCP_ATTR_SPORT, /* u16 */ + MPTCP_ATTR_DPORT, /* u16 */ + MPTCP_ATTR_BACKUP, /* u8 */ + MPTCP_ATTR_ERROR, /* u8 */ + MPTCP_ATTR_FLAGS, /* u16 */ + MPTCP_ATTR_TIMEOUT, /* u32 */ + MPTCP_ATTR_IF_IDX, /* s32 */ + + __MPTCP_ATTR_AFTER_LAST +}; + +#define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1) + +/* + * Events generated by MPTCP: + * - MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport + * A new connection has been created. It is the good time to allocate + * memory and send ADD_ADDR if needed. Depending on the traffic-patterns + * it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. + * + * - MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport + * A connection is established (can start new subflows). + * + * - MPTCP_EVENT_CLOSED: token + * A connection has stopped. + * + * - MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] + * A new address has been announced by the peer. + * + * - MPTCP_EVENT_REMOVED: token, rem_id + * An address has been lost by the peer. + * + * - MPTCP_EVENT_SUB_ESTABLISHED: token, family, loc_id, rem_id, + * saddr4 | saddr6, daddr4 | daddr6, sport, + * dport, backup, if_idx [, error] + * A new subflow has been established. 'error' should not be set. + * + * - MPTCP_EVENT_SUB_CLOSED: token, family, loc_id, rem_id, saddr4 | saddr6, + * daddr4 | daddr6, sport, dport, backup, if_idx + * [, error] + * A subflow has been closed. An error (copy of sk_err) could be set if an + * error has been detected for this subflow. + * + * - MPTCP_EVENT_SUB_PRIORITY: token, family, loc_id, rem_id, saddr4 | saddr6, + * daddr4 | daddr6, sport, dport, backup, if_idx + * [, error] + * The priority of a subflow has changed. 'error' should not be set. 
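For completeness, here is a sketch of how a user-space path-manager daemon could consume the events documented above. It is illustrative only and not part of this patch: it assumes libnl-3/libnl-genl-3 is available, that the patched uapi header is installed as <linux/mptcp.h>, and that each event carries its ID in the genlmsghdr cmd field (the usual generic-netlink convention). Attribute handling is deliberately minimal.

/* Build (assumption): cc mptcp-ev.c $(pkg-config --cflags --libs libnl-genl-3.0) */
#include <stdio.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/mptcp.h>	/* MPTCP_GENL_*, MPTCP_EVENT_*, MPTCP_ATTR_* from this patch */

static int mptcp_event_cb(struct nl_msg *msg, void *arg)
{
	struct genlmsghdr *ghdr = genlmsg_hdr(nlmsg_hdr(msg));
	struct nlattr *attrs[MPTCP_ATTR_MAX + 1];

	/* The attributes are flat (no nesting), so a plain parse is enough. */
	if (genlmsg_parse(nlmsg_hdr(msg), 0, attrs, MPTCP_ATTR_MAX, NULL) < 0)
		return NL_SKIP;

	switch (ghdr->cmd) {
	case MPTCP_EVENT_CREATED:
	case MPTCP_EVENT_ESTABLISHED:
		if (attrs[MPTCP_ATTR_TOKEN])
			printf("connection, token=%u\n",
			       nla_get_u32(attrs[MPTCP_ATTR_TOKEN]));
		break;
	case MPTCP_EVENT_ANNOUNCED:
		/* Peer announced an address; a path manager would typically
		 * reply with MPTCP_CMD_SUB_CREATE on the command group.
		 */
		break;
	default:
		break;
	}
	return NL_OK;
}

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	int grp;

	if (!sk || genl_connect(sk) < 0)
		return 1;

	/* Events are unsolicited multicasts: no sequence numbers to check. */
	nl_socket_disable_seq_check(sk);
	nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM, mptcp_event_cb, NULL);

	grp = genl_ctrl_resolve_grp(sk, MPTCP_GENL_NAME, MPTCP_GENL_EV_GRP_NAME);
	if (grp < 0 || nl_socket_add_membership(sk, grp) < 0)
		return 1;

	for (;;)
		nl_recvmsgs_default(sk);
}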
+ * + * Commands for MPTCP: + * - MPTCP_CMD_ANNOUNCE: token, loc_id, family, saddr4 | saddr6 [, sport] + * Announce a new address to the peer. + * + * - MPTCP_CMD_REMOVE: token, loc_id + * Announce that an address has been lost to the peer. + * + * - MPTCP_CMD_SUB_CREATE: token, family, loc_id, rem_id, daddr4 | daddr6, + * dport [, saddr4 | saddr6, sport, backup, if_idx] + * Create a new subflow. + * + * - MPTCP_CMD_SUB_DESTROY: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport + * Close a subflow. + * + * - MPTCP_CMD_SUB_PRIORITY: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, backup + * Change the priority of a subflow. + * + * - MPTCP_CMD_SET_FILTER: flags + * Set the filter on events. Set MPTCPF_* flags to only receive specific + * events. Default is to receive all events. + * + * - MPTCP_CMD_EXIST: token + * Check if this token is linked to an existing socket. + */ +enum { + MPTCP_CMD_UNSPEC = 0, + + MPTCP_EVENT_CREATED, + MPTCP_EVENT_ESTABLISHED, + MPTCP_EVENT_CLOSED, + + MPTCP_CMD_ANNOUNCE, + MPTCP_CMD_REMOVE, + MPTCP_EVENT_ANNOUNCED, + MPTCP_EVENT_REMOVED, + + MPTCP_CMD_SUB_CREATE, + MPTCP_CMD_SUB_DESTROY, + MPTCP_EVENT_SUB_ESTABLISHED, + MPTCP_EVENT_SUB_CLOSED, + + MPTCP_CMD_SUB_PRIORITY, + MPTCP_EVENT_SUB_PRIORITY, + + MPTCP_CMD_SET_FILTER, + + MPTCP_CMD_EXIST, + + __MPTCP_CMD_AFTER_LAST +}; + +#define MPTCP_CMD_MAX (__MPTCP_CMD_AFTER_LAST - 1) + +enum { + MPTCPF_EVENT_CREATED = (1 << 1), + MPTCPF_EVENT_ESTABLISHED = (1 << 2), + MPTCPF_EVENT_CLOSED = (1 << 3), + MPTCPF_EVENT_ANNOUNCED = (1 << 4), + MPTCPF_EVENT_REMOVED = (1 << 5), + MPTCPF_EVENT_SUB_ESTABLISHED = (1 << 6), + MPTCPF_EVENT_SUB_CLOSED = (1 << 7), + MPTCPF_EVENT_SUB_PRIORITY = (1 << 8), +}; + +#endif /* _LINUX_MPTCP_H */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index e02d31986ff9..b7782ef035c4 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -18,9 +18,15 @@ #ifndef _UAPI_LINUX_TCP_H #define _UAPI_LINUX_TCP_H -#include +#ifndef __KERNEL__ +#include +#endif + #include +#include +#include #include +#include struct tcphdr { __be16 source; @@ -131,6 +137,13 @@ enum { #define TCP_REPAIR_OFF 0 #define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ +#define MPTCP_ENABLED 42 +#define MPTCP_SCHEDULER 43 +#define MPTCP_PATH_MANAGER 44 +#define MPTCP_INFO 45 + +#define MPTCP_INFO_FLAG_SAVE_MASTER 0x01 + struct tcp_repair_opt { __u32 opt_code; __u32 opt_val; @@ -268,6 +281,53 @@ enum { TCP_NLA_REORD_SEEN, /* reordering events seen */ }; +struct mptcp_meta_info { + __u8 mptcpi_state; + __u8 mptcpi_retransmits; + __u8 mptcpi_probes; + __u8 mptcpi_backoff; + + __u32 mptcpi_rto; + __u32 mptcpi_unacked; + + /* Times. 
*/ + __u32 mptcpi_last_data_sent; + __u32 mptcpi_last_data_recv; + __u32 mptcpi_last_ack_recv; + + __u32 mptcpi_total_retrans; + + __u64 mptcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ + __u64 mptcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ +}; + +struct mptcp_sub_info { + union { + struct sockaddr src; + struct sockaddr_in src_v4; + struct sockaddr_in6 src_v6; + }; + + union { + struct sockaddr dst; + struct sockaddr_in dst_v4; + struct sockaddr_in6 dst_v6; + }; +}; + +struct mptcp_info { + __u32 tcp_info_len; /* Length of each struct tcp_info in subflows pointer */ + __u32 sub_len; /* Total length of memory pointed to by subflows pointer */ + __u32 meta_len; /* Length of memory pointed to by meta_info */ + __u32 sub_info_len; /* Length of each struct mptcp_sub_info in subflow_info pointer */ + __u32 total_sub_info_len; /* Total length of memory pointed to by subflow_info */ + + struct mptcp_meta_info *meta_info; + struct tcp_info *initial; + struct tcp_info *subflows; /* Pointer to array of tcp_info structs */ + struct mptcp_sub_info *subflow_info; +}; + /* for TCP_MD5SIG socket option */ #define TCP_MD5SIG_MAXKEYLEN 80 diff --git a/net/Kconfig b/net/Kconfig index 228dfa382eec..274282e9b742 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -89,6 +89,7 @@ if INET source "net/ipv4/Kconfig" source "net/ipv6/Kconfig" source "net/netlabel/Kconfig" +source "net/mptcp/Kconfig" endif # if INET diff --git a/net/Makefile b/net/Makefile index 449fc0b221f8..08683343642e 100644 --- a/net/Makefile +++ b/net/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX_SCM) += unix/ obj-$(CONFIG_NET) += ipv6/ +obj-$(CONFIG_MPTCP) += mptcp/ obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/core/dev.c b/net/core/dev.c index 42f6ff8b9703..fa5bc51915e8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7663,7 +7663,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | - IFF_AUTOMEDIA)) | + IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) | (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | IFF_ALLMULTI)); diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 419af6dfe29f..0f939748ec97 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -48,3 +48,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); + +EXPORT_TRACEPOINT_SYMBOL_GPL(mptcp_retransmit); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c623c129d0ab..40246b24b69d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -545,7 +545,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb) skb_drop_list(&skb_shinfo(skb)->frag_list); } -static void skb_clone_fraglist(struct sk_buff *skb) +void skb_clone_fraglist(struct sk_buff *skb) { struct sk_buff *list; diff --git a/net/core/sock.c b/net/core/sock.c index 41a77027a549..12a73f0bfa64 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -140,6 +140,11 @@ #include +#ifdef CONFIG_MPTCP +#include +#include +#endif + #include #include @@ -1015,7 +1020,8 @@ int sock_setsockopt(struct socket *sock, int level, int optname, case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { - if (sk->sk_protocol != IPPROTO_TCP) + if (sk->sk_protocol != IPPROTO_TCP || + sock_flag(sk, SOCK_MPTCP)) ret = -ENOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = 
-ENOTSUPP; @@ -1429,6 +1435,23 @@ int sock_getsockopt(struct socket *sock, int level, int optname, */ static inline void sock_lock_init(struct sock *sk) { +#ifdef CONFIG_MPTCP + /* Reclassify the lock-class for subflows */ + if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) + if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->is_master_sk) { + sock_lock_init_class_and_name(sk, meta_slock_key_name, + &meta_slock_key, + meta_key_name, + &meta_key); + + /* We don't yet have the mptcp-point. + * Thus we still need inet_sock_destruct + */ + sk->sk_destruct = inet_sock_destruct; + return; + } +#endif + if (sk->sk_kern_sock) sock_lock_init_class_and_name( sk, @@ -1477,8 +1500,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; - if (priority & __GFP_ZERO) - sk_prot_clear_nulls(sk, prot->obj_size); + if (priority & __GFP_ZERO) { + if (prot->clear_sk) + prot->clear_sk(sk, prot->obj_size); + else + sk_prot_clear_nulls(sk, prot->obj_size); + } } else sk = kmalloc(prot->obj_size, priority); @@ -1708,6 +1735,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); + sock_reset_flag(newsk, SOCK_MPTCP); /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 2e12f848203a..de7e3bfece4f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -681,6 +681,51 @@ config TCP_CONG_BBR bufferbloat, policers, or AQM schemes that do not provide a delay signal. It requires the fq ("Fair Queue") pacing packet scheduler. +config TCP_CONG_LIA + tristate "MPTCP Linked Increase" + depends on MPTCP + default n + ---help--- + MultiPath TCP Linked Increase Congestion Control + To enable it, just put 'lia' in tcp_congestion_control + +config TCP_CONG_OLIA + tristate "MPTCP Opportunistic Linked Increase" + depends on MPTCP + default n + ---help--- + MultiPath TCP Opportunistic Linked Increase Congestion Control + To enable it, just put 'olia' in tcp_congestion_control + +config TCP_CONG_WVEGAS + tristate "MPTCP WVEGAS CONGESTION CONTROL" + depends on MPTCP + default n + ---help--- + wVegas congestion control for MPTCP + To enable it, just put 'wvegas' in tcp_congestion_control + +config TCP_CONG_BALIA + tristate "MPTCP BALIA CONGESTION CONTROL" + depends on MPTCP + default n + ---help--- + Multipath TCP Balanced Linked Adaptation Congestion Control + To enable it, just put 'balia' in tcp_congestion_control + +config TCP_CONG_MCTCPDESYNC + tristate "DESYNCHRONIZED MCTCP CONGESTION CONTROL (EXPERIMENTAL)" + depends on MPTCP + default n + ---help--- + Desynchronized MultiChannel TCP Congestion Control. This is experimental + code that only supports single path and must have set mptcp_ndiffports + larger than one. 
+ To enable it, just put 'mctcpdesync' in tcp_congestion_control + For further details see: + http://ieeexplore.ieee.org/abstract/document/6911722/ + https://doi.org/10.1016/j.comcom.2015.07.010 + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -718,6 +763,21 @@ choice config DEFAULT_BBR bool "BBR" if TCP_CONG_BBR=y + config DEFAULT_LIA + bool "Lia" if TCP_CONG_LIA=y + + config DEFAULT_OLIA + bool "Olia" if TCP_CONG_OLIA=y + + config DEFAULT_WVEGAS + bool "Wvegas" if TCP_CONG_WVEGAS=y + + config DEFAULT_BALIA + bool "Balia" if TCP_CONG_BALIA=y + + config DEFAULT_MCTCPDESYNC + bool "Mctcpdesync (EXPERIMENTAL)" if TCP_CONG_MCTCPDESYNC=y + config DEFAULT_RENO bool "Reno" endchoice @@ -738,6 +798,10 @@ config DEFAULT_TCP_CONG default "vegas" if DEFAULT_VEGAS default "westwood" if DEFAULT_WESTWOOD default "veno" if DEFAULT_VENO + default "lia" if DEFAULT_LIA + default "olia" if DEFAULT_OLIA + default "wvegas" if DEFAULT_WVEGAS + default "balia" if DEFAULT_BALIA default "reno" if DEFAULT_RENO default "dctcp" if DEFAULT_DCTCP default "cdg" if DEFAULT_CDG diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d8c22246629a..705b67cd4f2a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -104,6 +104,7 @@ #include #include #include +#include #include #include #include @@ -150,6 +151,9 @@ void inet_sock_destruct(struct sock *sk) return; } + if (sock_flag(sk, SOCK_MPTCP)) + mptcp_disable_static_key(); + WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(sk->sk_wmem_queued); @@ -244,8 +248,7 @@ int inet_listen(struct socket *sock, int backlog) * Create an inet socket. */ -static int inet_create(struct net *net, struct socket *sock, int protocol, - int kern) +int inet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct inet_protosw *answer; @@ -739,6 +742,24 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags, lock_sock(sk2); sock_rps_record_flow(sk2); + + if (sk2->sk_protocol == IPPROTO_TCP && mptcp(tcp_sk(sk2))) { + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(tcp_sk(sk2)->mpcb, mptcp) { + sock_rps_record_flow(mptcp_to_sock(mptcp)); + } + + if (tcp_sk(sk2)->mpcb->master_sk) { + struct sock *sk_it = tcp_sk(sk2)->mpcb->master_sk; + + write_lock_bh(&sk_it->sk_callback_lock); + sk_it->sk_wq = newsock->wq; + sk_it->sk_socket = newsock; + write_unlock_bh(&sk_it->sk_callback_lock); + } + } + WARN_ON(!((1 << sk2->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_CLOSE_WAIT | TCPF_CLOSE))); @@ -1962,6 +1983,9 @@ static int __init inet_init(void) if (init_ipv4_mibs()) panic("%s: Cannot init ipv4 mibs\n", __func__); + /* We must initialize MPTCP before TCP. */ + mptcp_init(); + /* Setup TCP slab cache for open requests. */ tcp_init(); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 439a55d1aa99..d13c3f9c99de 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -728,7 +729,10 @@ static void reqsk_timer_handler(struct timer_list *t) int max_retries, thresh; u8 defer_accept; - if (inet_sk_state_load(sk_listener) != TCP_LISTEN) + if (!is_meta_sk(sk_listener) && inet_sk_state_load(sk_listener) != TCP_LISTEN) + goto drop; + + if (is_meta_sk(sk_listener) && !mptcp_can_new_subflow(sk_listener)) goto drop; max_retries = icsk->icsk_syn_retries ? 
: net->ipv4.sysctl_tcp_synack_retries; @@ -821,7 +825,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, const struct request_sock *req, const gfp_t priority) { - struct sock *newsk = sk_clone_lock(sk, priority); + struct sock *newsk; + + newsk = sk_clone_lock(sk, priority); if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); @@ -1021,7 +1027,14 @@ void inet_csk_listen_stop(struct sock *sk) */ while ((req = reqsk_queue_remove(queue, sk)) != NULL) { struct sock *child = req->sk; + bool mutex_taken = false; + struct mptcp_cb *mpcb = tcp_sk(child)->mpcb; + if (is_meta_sk(child)) { + WARN_ON(refcount_inc_not_zero(&mpcb->mpcb_refcnt) == 0); + mutex_lock(&mpcb->mpcb_mutex); + mutex_taken = true; + } local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); @@ -1031,6 +1044,10 @@ void inet_csk_listen_stop(struct sock *sk) reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); + if (mutex_taken) { + mutex_unlock(&mpcb->mpcb_mutex); + mptcp_mpcb_put(mpcb); + } sock_put(child); cond_resched(); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 82f341e84fae..3244ac035500 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -44,6 +44,8 @@ #endif #include +#include + #include #include @@ -655,7 +657,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, break; old = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); - if (inet->is_icsk) { + if (inet->is_icsk && !is_meta_sk(sk)) { struct inet_connection_sock *icsk = inet_csk(sk); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET || @@ -749,6 +751,20 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->tos = val; sk->sk_priority = rt_tos2priority(val); sk_dst_reset(sk); + /* Update TOS on mptcp subflow */ + if (is_meta_sk(sk)) { + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + if (inet_sk(sk_it)->tos != inet_sk(sk)->tos) { + inet_sk(sk_it)->tos = inet_sk(sk)->tos; + sk_it->sk_priority = sk->sk_priority; + sk_dst_reset(sk_it); + } + } + } } break; case IP_TTL: diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 1a06850ef3cc..bdf82d471b93 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -179,7 +181,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); -__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) +__u32 cookie_v4_init_sequence(struct request_sock *req, const struct sock *sk, + const struct sk_buff *skb, __u16 *mssp) { const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); @@ -209,9 +212,27 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; bool own_req; +#ifdef CONFIG_MPTCP + int ret; +#endif child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, NULL, &own_req); + +#ifdef CONFIG_MPTCP + if (!child) + goto listen_overflow; + + ret = mptcp_check_req_master(sk, child, req, skb, 0, tsoff); + if (ret < 0) + return NULL; + + if (!ret) + return tcp_sk(child)->mpcb->master_sk; + +listen_overflow: +#endif + if (child) { refcount_set(&req->rsk_refcnt, 1); tcp_sk(child)->tsoffset = tsoff; @@ -289,6 +310,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = 
&TCP_SKB_CB(skb)->header.h4.opt; struct tcp_options_received tcp_opt; + struct mptcp_options_received mopt; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct tcp_sock *tp = tcp_sk(sk); @@ -318,7 +340,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL); + mptcp_init_mp_opt(&mopt); + tcp_parse_options(sock_net(sk), skb, &tcp_opt, &mopt, 0, NULL, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { tsoff = secure_tcp_ts_off(sock_net(sk), @@ -331,7 +354,12 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */ +#ifdef CONFIG_MPTCP + if (mopt.saw_mpc) + req = inet_reqsk_alloc(&mptcp_request_sock_ops, sk, false); /* for safety */ + else +#endif + req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */ if (!req) goto out; @@ -351,6 +379,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->sack_ok = tcp_opt.sack_ok; ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; + ireq->mptcp_rqsk = 0; + ireq->saw_mpc = 0; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; treq->snt_synack = 0; treq->tfo_listener = false; @@ -359,6 +389,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->ir_iif = inet_request_bound_dev_if(sk, skb); + if (mopt.saw_mpc) + mptcp_cookies_reqsk_init(req, &mopt, skb); + /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ @@ -397,10 +430,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; - tcp_select_initial_window(sk, full_space, req->mss, - &req->rsk_rcv_wnd, &req->rsk_window_clamp, - ireq->wscale_ok, &rcv_wscale, - dst_metric(&rt->dst, RTAX_INITRWND)); + tp->ops->select_initial_window(sk, full_space, req->mss, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, + ireq->wscale_ok, &rcv_wscale, + dst_metric(&rt->dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4dce1b418acc..075b128f46a7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -274,6 +274,7 @@ #include #include +#include #include #include #include @@ -399,6 +400,26 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) return rate64; } +static int select_size(const struct sock *sk, bool first_skb, bool zc); + +const struct tcp_sock_ops tcp_specific = { + .__select_window = __tcp_select_window, + .select_window = tcp_select_window, + .select_initial_window = tcp_select_initial_window, + .select_size = select_size, + .init_buffer_space = tcp_init_buffer_space, + .set_rto = tcp_set_rto, + .should_expand_sndbuf = tcp_should_expand_sndbuf, + .send_fin = tcp_send_fin, + .write_xmit = tcp_write_xmit, + .send_active_reset = tcp_send_active_reset, + .write_wakeup = tcp_write_wakeup, + .retransmit_timer = tcp_retransmit_timer, + .time_wait = tcp_time_wait, + .cleanup_rbuf = tcp_cleanup_rbuf, + .set_cong_ctrl = __tcp_set_congestion_control, +}; + /* Address-family independent initialization for a tcp_sock. 
* * NOTE: A lot of things set to zero explicitly by call to @@ -452,6 +473,11 @@ void tcp_init_sock(struct sock *sk) sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; + tp->ops = &tcp_specific; + + /* Initialize MPTCP-specific stuff and function-pointers */ + mptcp_init_tcp_sock(sk); + sk_sockets_allocated_inc(sk); sk->sk_route_forced_caps = NETIF_F_GSO; } @@ -466,7 +492,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tcp_init_metrics(sk); tcp_call_bpf(sk, bpf_op, 0, NULL); tcp_init_congestion_control(sk); - tcp_init_buffer_space(sk); + tcp_sk(sk)->ops->init_buffer_space(sk); } static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) @@ -495,7 +521,7 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, return true; if (tcp_rmem_pressure(sk)) return true; - if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss) + if (tcp_receive_window_now(tp) <= inet_csk(sk)->icsk_ack.rcv_mss) return true; } if (sk->sk_prot->stream_memory_read) @@ -796,6 +822,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, int ret; sock_rps_record_flow(sk); + /* * We can't seek on a socket input */ @@ -806,6 +833,16 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, lock_sock(sk); +#ifdef CONFIG_MPTCP + if (mptcp(tcp_sk(sk))) { + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) { + sock_rps_record_flow(mptcp_to_sock(mptcp)); + } + } +#endif + timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); while (tss.len) { ret = __tcp_splice_read(sk, &tss); @@ -909,8 +946,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, return NULL; } -static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, - int large_allowed) +unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); u32 new_size_goal, size_goal; @@ -938,8 +974,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; - mss_now = tcp_current_mss(sk); - *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + if (mptcp(tcp_sk(sk))) { + mss_now = mptcp_current_mss(sk); + *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + } else { + mss_now = tcp_current_mss(sk); + *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + } return mss_now; } @@ -974,12 +1015,34 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, * is fully established. */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && - !tcp_passive_fastopen(sk)) { + !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ? + tp->mpcb->master_sk : sk)) { err = sk_stream_wait_connect(sk, &timeo); if (err != 0) goto out_err; } + if (mptcp(tp)) { + struct mptcp_tcp_sock *mptcp; + + /* We must check this with socket-lock hold because we iterate + * over the subflows. 
+ */ + if (!mptcp_can_sendpage(sk)) { + ssize_t ret; + + release_sock(sk); + ret = sock_no_sendpage(sk->sk_socket, page, offset, + size, flags); + lock_sock(sk); + return ret; + } + + mptcp_for_each_sub(tp->mpcb, mptcp) { + sock_rps_record_flow(mptcp_to_sock(mptcp)); + } + } + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1098,7 +1161,8 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags) { - if (!(sk->sk_route_caps & NETIF_F_SG)) + /* If MPTCP is enabled, we check it later after establishment */ + if (!mptcp(tcp_sk(sk)) && !(sk->sk_route_caps & NETIF_F_SG)) return sock_no_sendpage_locked(sk, page, offset, size, flags); tcp_rate_check_app_limited(sk); /* is sending application-limited? */ @@ -1130,14 +1194,14 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, * This also speeds up tso_fragment(), since it wont fallback * to tcp_fragment(). */ -static int linear_payload_sz(bool first_skb) +int linear_payload_sz(bool first_skb) { if (first_skb) return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); return 0; } -static int select_size(bool first_skb, bool zc) +static int select_size(const struct sock *sk, bool first_skb, bool zc) { if (zc) return 0; @@ -1247,12 +1311,21 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) * is fully established. */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && - !tcp_passive_fastopen(sk)) { + !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ? + tp->mpcb->master_sk : sk)) { err = sk_stream_wait_connect(sk, &timeo); if (err != 0) goto do_error; } + if (mptcp(tp)) { + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(tp->mpcb, mptcp) { + sock_rps_record_flow(mptcp_to_sock(mptcp)); + } + } + if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); @@ -1308,7 +1381,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); - linear = select_size(first_skb, zc); + linear = tp->ops->select_size(sk, first_skb, zc); skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation, first_skb); if (!skb) @@ -1546,7 +1619,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. */ -static void tcp_cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; @@ -1585,11 +1658,11 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) * in states, where we will not receive more. It is useless. */ if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { - __u32 rcv_window_now = tcp_receive_window(tp); + __u32 rcv_window_now = tcp_receive_window_now(tp); /* Optimize, __tcp_select_window() is not cheap. */ if (2*rcv_window_now <= tp->window_clamp) { - __u32 new_window = __tcp_select_window(sk); + __u32 new_window = tp->ops->__select_window(sk); /* Send ACK now, if this read freed lots of space * in our buffer. Certainly, new_window is new window. @@ -1705,7 +1778,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, /* Clean up data we have read: This will do ACK frames. 
*/ if (copied > 0) { tcp_recv_skb(sk, seq, &offset); - tcp_cleanup_rbuf(sk, copied); + tp->ops->cleanup_rbuf(sk, copied); } return copied; } @@ -1963,6 +2036,16 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, lock_sock(sk); +#ifdef CONFIG_MPTCP + if (mptcp(tp)) { + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(tp->mpcb, mptcp) { + sock_rps_record_flow(mptcp_to_sock(mptcp)); + } + } +#endif + err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; @@ -2081,7 +2164,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, } } - tcp_cleanup_rbuf(sk, copied); + tp->ops->cleanup_rbuf(sk, copied); if (copied >= target) { /* Do not sleep, just process backlog. */ @@ -2174,7 +2257,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, */ /* Clean up data we have read: This will do ACK frames. */ - tcp_cleanup_rbuf(sk, copied); + tp->ops->cleanup_rbuf(sk, copied); release_sock(sk); @@ -2284,9 +2367,10 @@ void tcp_set_state(struct sock *sk, int state) [TCP_LISTEN] = TCP_CLOSE, [TCP_CLOSING] = TCP_CLOSING, [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ + [TCP_RST_WAIT] = TCP_CLOSE, }; -static int tcp_close_state(struct sock *sk) +int tcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; @@ -2316,7 +2400,7 @@ void tcp_shutdown(struct sock *sk, int how) TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { /* Clear out any half completed packets. FIN if needed. */ if (tcp_close_state(sk)) - tcp_send_fin(sk); + tcp_sk(sk)->ops->send_fin(sk); } } EXPORT_SYMBOL(tcp_shutdown); @@ -2341,6 +2425,17 @@ void tcp_close(struct sock *sk, long timeout) int data_was_unread = 0; int state; + if (is_meta_sk(sk)) { + /* TODO: Currently forcing timeout to 0 because + * sk_stream_wait_close will complain during lockdep because + * of the mpcb_mutex (circular lock dependency through + * inet_csk_listen_stop()). + * We should find a way to get rid of the mpcb_mutex. + */ + mptcp_close(sk, 0); + return; + } + lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; @@ -2385,7 +2480,7 @@ void tcp_close(struct sock *sk, long timeout) /* Unread data was tossed, zap the connection. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, sk->sk_allocation); + tcp_sk(sk)->ops->send_active_reset(sk, sk->sk_allocation); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. 
*/ sk->sk_prot->disconnect(sk, 0); @@ -2459,7 +2554,7 @@ void tcp_close(struct sock *sk, long timeout) struct tcp_sock *tp = tcp_sk(sk); if (tp->linger2 < 0) { tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); + tp->ops->send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { @@ -2469,7 +2564,8 @@ void tcp_close(struct sock *sk, long timeout) inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else { - tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + tcp_sk(sk)->ops->time_wait(sk, TCP_FIN_WAIT2, + tmo); goto out; } } @@ -2478,7 +2574,7 @@ void tcp_close(struct sock *sk, long timeout) sk_mem_reclaim(sk); if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { @@ -2507,15 +2603,6 @@ void tcp_close(struct sock *sk, long timeout) } EXPORT_SYMBOL(tcp_close); -/* These states need RST on ABORT according to RFC793 */ - -static inline bool tcp_need_reset(int state) -{ - return (1 << state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | - TCPF_FIN_WAIT2 | TCPF_SYN_RECV); -} - static void tcp_rtx_queue_purge(struct sock *sk) { struct rb_node *p = rb_first(&sk->tcp_rtx_queue); @@ -2537,6 +2624,10 @@ void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; + if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk) && + !tcp_rtx_and_write_queues_empty(sk)) + mptcp_reinject_data(sk, 0); + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { tcp_skb_tsorted_anchor_cleanup(skb); @@ -2572,7 +2663,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ - tcp_send_active_reset(sk, gfp_any()); + tp->ops->send_active_reset(sk, gfp_any()); sk->sk_err = ECONNRESET; } else if (old_state == TCP_SYN_SENT) sk->sk_err = ECONNRESET; @@ -2590,6 +2681,14 @@ int tcp_disconnect(struct sock *sk, int flags) if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); + if (is_meta_sk(sk)) { + mptcp_disconnect(sk); + } else { + tp->request_mptcp = 0; + if (tp->inside_tk_table) + mptcp_hash_remove_bh(tp); + } + sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; @@ -2656,7 +2755,7 @@ int tcp_disconnect(struct sock *sk, int flags) static inline bool tcp_can_repair_sock(const struct sock *sk) { return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && - (sk->sk_state != TCP_LISTEN); + (sk->sk_state != TCP_LISTEN) && !sock_flag(sk, SOCK_MPTCP); } static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len) @@ -2687,6 +2786,7 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l tp->rcv_wnd = opt.rcv_wnd; tp->rcv_wup = opt.rcv_wup; + tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd; return 0; } @@ -2802,6 +2902,61 @@ static int do_tcp_setsockopt(struct sock *sk, int level, return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key)); } +#ifdef CONFIG_MPTCP + case MPTCP_SCHEDULER: { + char name[MPTCP_SCHED_NAME_MAX]; + + if (optlen < 1) + return -EINVAL; + + /* Cannot be used if MPTCP is not used or we already have + * established an MPTCP-connection. 
+ */ + if (mptcp_init_failed || !sysctl_mptcp_enabled || + sk->sk_state != TCP_CLOSE) + return -EPERM; + + val = strncpy_from_user(name, optval, + min_t(long, MPTCP_SCHED_NAME_MAX - 1, + optlen)); + + if (val < 0) + return -EFAULT; + name[val] = 0; + + lock_sock(sk); + err = mptcp_set_scheduler(sk, name); + release_sock(sk); + return err; + } + + case MPTCP_PATH_MANAGER: { + char name[MPTCP_PM_NAME_MAX]; + + if (optlen < 1) + return -EINVAL; + + /* Cannot be used if MPTCP is not used or we already have + * established an MPTCP-connection. + */ + if (mptcp_init_failed || !sysctl_mptcp_enabled || + sk->sk_state != TCP_CLOSE) + return -EPERM; + + val = strncpy_from_user(name, optval, + min_t(long, MPTCP_PM_NAME_MAX - 1, + optlen)); + + if (val < 0) + return -EFAULT; + name[val] = 0; + + lock_sock(sk); + err = mptcp_set_path_manager(sk, name); + release_sock(sk); + return err; + } +#endif default: /* fallthru */ break; @@ -2991,6 +3146,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_DEFER_ACCEPT: + /* An established MPTCP-connection (mptcp(tp) only returns true + * if the socket is established) should not use DEFER on new + * subflows. + */ + if (mptcp(tp)) + break; /* Translate value in seconds to number of retransmits */ icsk->icsk_accept_queue.rskq_defer_accept = secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, @@ -3018,7 +3179,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - tcp_cleanup_rbuf(sk, 1); + tp->ops->cleanup_rbuf(sk, 1); if (!(val & 1)) icsk->icsk_ack.pingpong = 1; } @@ -3028,7 +3189,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: - err = tp->af_specific->md5_parse(sk, optname, optval, optlen); + if (!sock_flag(sk, SOCK_MPTCP)) + err = tp->af_specific->md5_parse(sk, optname, optval, optlen); + else + err = -EINVAL; break; #endif case TCP_USER_TIMEOUT: @@ -3084,6 +3248,33 @@ static int do_tcp_setsockopt(struct sock *sk, int level, tp->notsent_lowat = val; sk->sk_write_space(sk); break; +#ifdef CONFIG_MPTCP + case MPTCP_ENABLED: + if (mptcp_init_failed || !sysctl_mptcp_enabled || + sk->sk_state != TCP_CLOSE || + sock_flag(sk, SOCK_ZEROCOPY) +#ifdef CONFIG_TCP_MD5SIG + || tp->md5sig_info +#endif + ) { + err = -EPERM; + break; + } + + if (val) + mptcp_enable_sock(sk); + else + mptcp_disable_sock(sk); + break; + case MPTCP_INFO: + if (mptcp_init_failed || !sysctl_mptcp_enabled) { + err = -EPERM; + break; + } + + tp->record_master_info = !!(val & MPTCP_INFO_FLAG_SAVE_MASTER); + break; +#endif case TCP_INQ: if (val > 1 || val < 0) err = -EINVAL; @@ -3143,7 +3334,7 @@ static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, } /* Return information about state of tcp endpoint in API format. 
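As a usage illustration for the per-socket knobs handled above (not part of the patch): MPTCP_ENABLED and MPTCP_SCHEDULER are both refused once the socket has left TCP_CLOSE, so they must be applied before connect(). The sketch assumes the patched uapi <linux/tcp.h> is installed; the fallback defines simply mirror the values from the uapi hunk earlier in this series, and the destination address is a placeholder.

/* Illustration only: opt a socket into MPTCP, read back the scheduler name,
 * then check after connect() whether MP_CAPABLE was actually negotiated.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef MPTCP_ENABLED
#define MPTCP_ENABLED	42	/* from the patched include/uapi/linux/tcp.h */
#define MPTCP_SCHEDULER	43
#endif

int main(void)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port = htons(80),
	};
	char sched[64] = "";
	socklen_t len;
	int one = 1, val = 0;
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* placeholder peer */

	/* Only accepted while the socket is still in TCP_CLOSE. */
	if (setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &one, sizeof(one)) < 0)
		perror("MPTCP_ENABLED");

	len = sizeof(sched);
	if (getsockopt(fd, IPPROTO_TCP, MPTCP_SCHEDULER, sched, &len) == 0)
		printf("scheduler: %s\n", sched);

	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");

	/* After the handshake this reflects mptcp(tp), i.e. whether the peer
	 * accepted MP_CAPABLE, not merely whether we asked for it.
	 */
	len = sizeof(val);
	if (getsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &val, &len) == 0)
		printf("mptcp negotiated: %d\n", val);

	close(fd);
	return 0;
}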
*/ -void tcp_get_info(struct sock *sk, struct tcp_info *info) +void tcp_get_info(struct sock *sk, struct tcp_info *info, bool no_lock) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3180,7 +3371,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) return; } - slow = lock_sock_fast(sk); + if (!no_lock) + slow = lock_sock_fast(sk); info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; @@ -3254,7 +3446,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_bytes_retrans = tp->bytes_retrans; info->tcpi_dsack_dups = tp->dsack_dups; info->tcpi_reord_seen = tp->reord_seen; - unlock_sock_fast(sk, slow); + + if (!no_lock) + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3399,7 +3593,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, if (get_user(len, optlen)) return -EFAULT; - tcp_get_info(sk, &info); + tcp_get_info(sk, &info, false); len = min_t(unsigned int, len, sizeof(info)); if (put_user(len, optlen)) @@ -3590,6 +3784,87 @@ static int do_tcp_getsockopt(struct sock *sk, int level, } return 0; } +#ifdef CONFIG_MPTCP + case MPTCP_SCHEDULER: + if (get_user(len, optlen)) + return -EFAULT; + len = min_t(unsigned int, len, MPTCP_SCHED_NAME_MAX); + if (put_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + if (mptcp(tcp_sk(sk))) { + struct mptcp_cb *mpcb = tcp_sk(mptcp_meta_sk(sk))->mpcb; + + if (copy_to_user(optval, mpcb->sched_ops->name, len)) { + release_sock(sk); + return -EFAULT; + } + } else { + if (copy_to_user(optval, tcp_sk(sk)->mptcp_sched_name, + len)) { + release_sock(sk); + return -EFAULT; + } + } + release_sock(sk); + return 0; + + case MPTCP_PATH_MANAGER: + if (get_user(len, optlen)) + return -EFAULT; + len = min_t(unsigned int, len, MPTCP_PM_NAME_MAX); + if (put_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + if (mptcp(tcp_sk(sk))) { + struct mptcp_cb *mpcb = tcp_sk(mptcp_meta_sk(sk))->mpcb; + + if (copy_to_user(optval, mpcb->pm_ops->name, len)) { + release_sock(sk); + return -EFAULT; + } + } else { + if (copy_to_user(optval, tcp_sk(sk)->mptcp_pm_name, + len)) { + release_sock(sk); + return -EFAULT; + } + } + release_sock(sk); + return 0; + + case MPTCP_ENABLED: + if (sk->sk_state != TCP_SYN_SENT) + val = mptcp(tp) ? 1 : 0; + else + val = sock_flag(sk, SOCK_MPTCP) ? 1 : 0; + break; + case MPTCP_INFO: + { + int ret; + + if (!mptcp(tp)) + return -EINVAL; + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(struct mptcp_info)); + + lock_sock(sk); + ret = mptcp_get_info(sk, optval, len); + release_sock(sk); + + if (ret) + return ret; + + if (put_user(len, optlen)) + return -EFAULT; + return 0; + } +#endif #ifdef CONFIG_MMU case TCP_ZEROCOPY_RECEIVE: { struct tcp_zerocopy_receive zc; @@ -3786,7 +4061,9 @@ void tcp_done(struct sock *sk) if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + WARN_ON(sk->sk_state == TCP_CLOSE); tcp_set_state(sk, TCP_CLOSE); + tcp_clear_xmit_timers(sk); if (req) reqsk_fastopen_remove(sk, req, false); @@ -3802,6 +4079,8 @@ void tcp_done(struct sock *sk) int tcp_abort(struct sock *sk, int err) { + struct sock *meta_sk = mptcp(tcp_sk(sk)) ? 
mptcp_meta_sk(sk) : sk; + if (!sk_fullsock(sk)) { if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); @@ -3815,7 +4094,7 @@ int tcp_abort(struct sock *sk, int err) } /* Don't race with userspace socket closes such as tcp_close. */ - lock_sock(sk); + lock_sock(meta_sk); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); @@ -3824,7 +4103,7 @@ int tcp_abort(struct sock *sk, int err) /* Don't race with BH socket closes such as inet_csk_listen_stop. */ local_bh_disable(); - bh_lock_sock(sk); + bh_lock_sock(meta_sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_err = err; @@ -3832,14 +4111,14 @@ int tcp_abort(struct sock *sk, int err) smp_wmb(); sk->sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); } - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); local_bh_enable(); tcp_write_queue_purge(sk); - release_sock(sk); + release_sock(meta_sk); return 0; } EXPORT_SYMBOL_GPL(tcp_abort); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 533f8d84d2f7..8e5a0c3e5e5e 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -336,13 +336,19 @@ int tcp_set_allowed_congestion_control(char *val) return ret; } +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool reinit, bool cap_net_admin) +{ + return tcp_sk(sk)->ops->set_cong_ctrl(sk, name, load, reinit, cap_net_admin); +} + /* Change congestion control for socket. If load is false, then it is the * responsibility of the caller to call tcp_init_congestion_control or * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin) +int __tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool reinit, bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 2a46f9f81ba0..0af91d13b374 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -35,7 +35,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una; } if (info) - tcp_get_info(sk, info); + tcp_get_info(sk, info, false); } #ifdef CONFIG_TCP_MD5SIG diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 119d2c2f3b04..d3b1f0decb58 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -9,6 +9,7 @@ #include #include #include +#include void tcp_fastopen_init_key_once(struct net *net) { @@ -218,8 +219,9 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, { struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; - struct sock *child; + struct sock *child, *meta_sk; bool own_req; + int ret; req->num_retrans = 0; req->num_timeout = 0; @@ -258,15 +260,27 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); - /* Now finish processing the fastopen child socket. 
*/ - tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_fastopen_add_skb(child, skb); tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; tp->rcv_wup = tp->rcv_nxt; + tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd; + + meta_sk = child; + ret = mptcp_check_req_fastopen(meta_sk, req); + if (ret < 0) + return NULL; + + if (ret == 0) { + child = tcp_sk(meta_sk)->mpcb->master_sk; + tp = tcp_sk(child); + } + + /* Now finish processing the fastopen child socket. */ + tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); + /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 757e1f60e00d..b8a123fadde6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -76,35 +76,15 @@ #include #include #include +#include +#include +#include #include #include #include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -#define FLAG_DATA 0x01 /* Incoming frame contained data. */ -#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ -#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ -#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ -#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ -#define FLAG_DATA_SACKED 0x20 /* New SACK. */ -#define FLAG_ECE 0x40 /* ECE in this ACK */ -#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ -#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ -#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ -#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ -#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ -#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ -#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ -#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ -#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ -#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ - -#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) -#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) -#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK) -#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) - #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) @@ -343,8 +323,12 @@ static void tcp_sndbuf_expand(struct sock *sk) per_mss = roundup_pow_of_two(per_mss) + SKB_DATA_ALIGN(sizeof(struct sk_buff)); - nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); - nr_segs = max_t(u32, nr_segs, tp->reordering + 1); + if (mptcp(tp)) { + nr_segs = mptcp_check_snd_buf(tp); + } else { + nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); + nr_segs = max_t(u32, nr_segs, tp->reordering + 1); + } /* Fast Recovery (RFC 5681 3.2) : * Cubic needs 1.7 factor, rounded to 2 to include @@ -353,8 +337,16 @@ static void tcp_sndbuf_expand(struct sock *sk) sndmem = ca_ops->sndbuf_expand ? 
ca_ops->sndbuf_expand(sk) : 2; sndmem *= nr_segs * per_mss; - if (sk->sk_sndbuf < sndmem) + /* MPTCP: after this sndmem is the new contribution of the + * current subflow to the aggregated sndbuf */ + if (sk->sk_sndbuf < sndmem) { + int old_sndbuf = sk->sk_sndbuf; sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); + /* MPTCP: ok, the subflow sndbuf has grown, reflect + * this in the aggregate buffer.*/ + if (mptcp(tp) && old_sndbuf != sk->sk_sndbuf) + mptcp_update_sndbuf(tp); + } } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -403,9 +395,14 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); int room; - room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh; + if (is_meta_sk(sk)) + return; + + room = min_t(int, meta_tp->window_clamp, tcp_space(meta_sk)) - meta_tp->rcv_ssthresh; /* Check #1 */ if (room > 0 && !tcp_under_memory_pressure(sk)) { @@ -415,13 +412,13 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) * will fit to rcvbuf in future. */ if (tcp_win_from_space(sk, skb->truesize) <= skb->len) - incr = 2 * tp->advmss; + incr = 2 * meta_tp->advmss; else - incr = __tcp_grow_window(sk, skb); + incr = __tcp_grow_window(meta_sk, skb); if (incr) { incr = max_t(int, incr, 2 * skb->len); - tp->rcv_ssthresh += min(room, incr); + meta_tp->rcv_ssthresh += min(room, incr); inet_csk(sk)->icsk_ack.quick |= 1; } } @@ -604,7 +601,10 @@ void tcp_rcv_space_adjust(struct sock *sk) tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); - if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) + if (mptcp(tp)) { + if (mptcp_check_rtt(tp, time)) + return; + } else if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; /* Number of bytes copied to user in last RTT */ @@ -823,7 +823,7 @@ static void tcp_update_pacing_rate(struct sock *sk) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static void tcp_set_rto(struct sock *sk) +void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) @@ -1395,6 +1395,13 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, int len; int in_sack; + /* For MPTCP we cannot shift skb-data and remove one skb from the + * send-queue, because this will make us loose the DSS-option (which + * is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing. + */ + if (mptcp(tp)) + goto fallback; + /* Normally R but no L won't result in plain S */ if (!dup_sack && (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) @@ -2949,7 +2956,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, */ tcp_update_rtt_min(sk, ca_rtt_us, flag); tcp_rtt_estimator(sk, seq_rtt_us); - tcp_set_rto(sk); + tp->ops->set_rto(sk); /* RFC6298: only reset backoff on valid RTT measurement. */ inet_csk(sk)->icsk_backoff = 0; @@ -3017,7 +3024,7 @@ static void tcp_set_xmit_timer(struct sock *sk) } /* If we get here, the whole TSO packet has not been acked. 
*/ -static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) +u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); u32 packets_acked; @@ -3143,6 +3150,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, */ if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { flag |= FLAG_DATA_ACKED; + if (mptcp(tp) && mptcp_is_data_seq(skb)) + flag |= MPTCP_FLAG_DATA_ACKED; } else { flag |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; @@ -3262,7 +3271,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, return flag; } -static void tcp_ack_probe(struct sock *sk) +void tcp_ack_probe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *head = tcp_send_head(sk); @@ -3334,9 +3343,8 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ -static inline bool tcp_may_update_window(const struct tcp_sock *tp, - const u32 ack, const u32 ack_seq, - const u32 nwin) +bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, + const u32 ack_seq, const u32 nwin) { return after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || @@ -3574,7 +3582,7 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) } /* This routine deals with incoming acks, but not outgoing ones. */ -static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3696,6 +3704,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + if (mptcp(tp)) { + if (mptcp_handle_ack_in_infinite(sk, skb, flag)) { + pr_debug("%s resetting flow\n", __func__); + mptcp_send_reset(sk); + goto invalid_ack; + } + } + if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3796,8 +3812,10 @@ static void smc_parse_options(const struct tcphdr *th, */ void tcp_parse_options(const struct net *net, const struct sk_buff *skb, - struct tcp_options_received *opt_rx, int estab, - struct tcp_fastopen_cookie *foc) + struct tcp_options_received *opt_rx, + struct mptcp_options_received *mopt, + int estab, struct tcp_fastopen_cookie *foc, + struct tcp_sock *tp) { const unsigned char *ptr; const struct tcphdr *th = tcp_hdr(skb); @@ -3881,6 +3899,10 @@ void tcp_parse_options(const struct net *net, */ break; #endif + case TCPOPT_MPTCP: + mptcp_parse_options(ptr - 2, opsize, mopt, skb, tp); + break; + case TCPOPT_FASTOPEN: tcp_parse_fastopen_option( opsize - TCPOLEN_FASTOPEN_BASE, @@ -3948,7 +3970,9 @@ static bool tcp_fast_parse_options(const struct net *net, return true; } - tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); + tcp_parse_options(net, skb, &tp->rx_opt, + mptcp(tp) ? &tp->mptcp->rx_opt : NULL, 1, NULL, tp); + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; @@ -4058,7 +4082,7 @@ static inline bool tcp_paws_discard(const struct sock *sk, static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) { return !before(end_seq, tp->rcv_wup) && - !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); + !after(seq, tp->rcv_nxt + tcp_receive_window_no_shrink(tp)); } /* When we get a reset we do this. 
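The tcp_parse_options() change above threads two extra arguments through every caller: a struct mptcp_options_received to be filled in and the tcp_sock the options belong to. Pure-TCP call sites (cxgb4, fastopen, request and timewait handling elsewhere in this patch) simply pass NULL for both. Side by side:

/* before */
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
		       struct tcp_options_received *opt_rx, int estab,
		       struct tcp_fastopen_cookie *foc);

/* after: MPTCP options land in *mopt (may be NULL), and tp gives the
 * parser access to the subflow's state (may also be NULL). */
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
		       struct tcp_options_received *opt_rx,
		       struct mptcp_options_received *mopt,
		       int estab, struct tcp_fastopen_cookie *foc,
		       struct tcp_sock *tp);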
*/ @@ -4107,6 +4131,11 @@ void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + if (is_meta_sk(sk)) { + mptcp_fin(sk); + return; + } + inet_csk_schedule_ack(sk); sk->sk_shutdown |= RCV_SHUTDOWN; @@ -4117,6 +4146,10 @@ void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); + + if (mptcp(tp)) + mptcp_sub_close_passive(sk); + inet_csk(sk)->icsk_ack.pingpong = 1; break; @@ -4139,9 +4172,16 @@ void tcp_fin(struct sock *sk) tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: + if (mptcp(tp)) { + /* The socket will get closed by mptcp_data_ready. + * We first have to process all data-sequences. + */ + tp->close_it = 1; + break; + } /* Received a FIN -- send ACK and enter TIME_WAIT. */ tcp_send_ack(sk); - tcp_time_wait(sk, TCP_TIME_WAIT, 0); + tp->ops->time_wait(sk, TCP_TIME_WAIT, 0); break; default: /* Only TCP_LISTEN and TCP_CLOSE are left, in these @@ -4163,6 +4203,10 @@ void tcp_fin(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); + /* Don't wake up MPTCP-subflows */ + if (mptcp(tp)) + return; + /* Do not send POLL_HUP for half duplex close. */ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) @@ -4365,6 +4409,9 @@ static bool tcp_try_coalesce(struct sock *sk, *fragstolen = false; + if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk)) + return false; + /* Its possible this segment overlaps with prior segment in queue */ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; @@ -4419,7 +4466,7 @@ static void tcp_drop(struct sock *sk, struct sk_buff *skb) /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ -static void tcp_ofo_queue(struct sock *sk) +void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; @@ -4442,7 +4489,14 @@ static void tcp_ofo_queue(struct sock *sk) p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); - if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { + /* In case of MPTCP, the segment may be empty if it's a + * non-data DATA_FIN. (see beginning of tcp_data_queue) + * + * But this only holds true for subflows, not for the + * meta-socket. + */ + if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) && + (is_meta_sk(sk) || !mptcp(tp) || TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); tcp_drop(sk, skb); continue; @@ -4476,6 +4530,9 @@ static void tcp_ofo_queue(struct sock *sk) static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size) { + if (mptcp(tcp_sk(sk))) + sk = mptcp_meta_sk(sk); + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || !sk_rmem_schedule(sk, skb, size)) { @@ -4490,7 +4547,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, return 0; } -static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) +void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct rb_node **p, *parent; @@ -4563,7 +4620,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) continue; } if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { - if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) && + (is_meta_sk(sk) || !mptcp(tp) || end_seq != seq)) { /* All the bits are present. Drop. 
*/ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); @@ -4610,6 +4668,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq); break; } + /* MPTCP allows non-data data-fin to be in the ofo-queue */ + if (mptcp(tp) && !is_meta_sk(sk) && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq) { + skb = skb1; + continue; + } rb_erase(&skb1->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); @@ -4621,7 +4684,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tp->ooo_last_skb = skb; add_sack: - if (tcp_is_sack(tp)) + if (tcp_is_sack(tp) && seq != end_seq) tcp_sack_new_ofo_skb(sk, seq, end_seq); end: if (skb) { @@ -4635,8 +4698,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) } } -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, - bool *fragstolen) +int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, + bool *fragstolen) { int eaten; struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); @@ -4712,7 +4775,8 @@ void tcp_data_ready(struct sock *sk) if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) && !sock_flag(sk, SOCK_DONE) && - tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss) + tcp_receive_window_now(tp) > inet_csk(sk)->icsk_ack.rcv_mss && + !mptcp(tp)) return; sk->sk_data_ready(sk); @@ -4724,10 +4788,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) bool fragstolen; int eaten; - if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { + /* If no data is present, but a data_fin is in the options, we still + * have to call mptcp_queue_skb later on. */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && + !(mptcp(tp) && mptcp_is_data_fin(skb))) { __kfree_skb(skb); return; } + skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); @@ -4738,7 +4806,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * Out of sequence packets to the out_of_order_queue. */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - if (tcp_receive_window(tp) == 0) { + if (tcp_receive_window_no_shrink(tp) == 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } @@ -4754,7 +4822,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); - if (skb->len) + if (skb->len || mptcp_is_data_fin(skb)) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); @@ -4776,7 +4844,11 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten > 0) kfree_skb_partial(skb, fragstolen); - if (!sock_flag(sk, SOCK_DEAD)) + if (!sock_flag(sk, SOCK_DEAD) || mptcp(tp)) + /* MPTCP: we always have to call data_ready, because + * we may be about to receive a data-fin, which still + * must get queued. + */ tcp_data_ready(sk); return; } @@ -4795,7 +4867,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } /* Out of window. F.e. zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) + if (!before(TCP_SKB_CB(skb)->seq, + tp->rcv_nxt + tcp_receive_window_no_shrink(tp))) goto out_of_window; if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { @@ -4809,7 +4882,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. 
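tcp_data_queue() above now distinguishes two views of the receive window: tcp_receive_window_now(), the window currently advertised (used for the data_ready heuristic and in TIME-WAIT), and tcp_receive_window_no_shrink(), which is based on rcv_right_edge, the furthest right edge announced so far, so that in-window checks on a subflow never treat the window as having shrunk. The helpers themselves are added elsewhere in the MPTCP tree; a plausible reading of the call sites is roughly:

/* Sketch only, not the MPTCP tree's exact definitions. */
static inline u32 tcp_receive_window_now(const struct tcp_sock *tp)
{
	s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;

	return win < 0 ? 0 : (u32)win;
}

static inline u32 tcp_receive_window_no_shrink(const struct tcp_sock *tp)
{
	s32 win = tp->rcv_right_edge - tp->rcv_nxt;
	s32 now = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;

	/* Never report less than what is currently advertised. */
	if (win < now)
		win = now;
	return win < 0 ? 0 : (u32)win;
}

static inline void tcp_update_rcv_right_edge(struct tcp_sock *tp)
{
	if (after(tp->rcv_wup + tp->rcv_wnd, tp->rcv_right_edge))
		tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
}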
*/ - if (!tcp_receive_window(tp)) { + if (!tcp_receive_window_no_shrink(tp)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } @@ -5124,7 +5197,7 @@ static int tcp_prune_queue(struct sock *sk) return -1; } -static bool tcp_should_expand_sndbuf(const struct sock *sk) +bool tcp_should_expand_sndbuf(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -5159,7 +5232,7 @@ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_should_expand_sndbuf(sk)) { + if (tp->ops->should_expand_sndbuf(sk)) { tcp_sndbuf_expand(sk); tp->snd_cwnd_stamp = tcp_jiffies32; } @@ -5173,10 +5246,11 @@ static void tcp_check_space(struct sock *sk) sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); /* pairs with tcp_poll() */ smp_mb(); - if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + if (mptcp(tcp_sk(sk)) || + (sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))) { tcp_new_space(sk); - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + if (sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } } @@ -5195,6 +5269,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) { struct tcp_sock *tp = tcp_sk(sk); unsigned long rtt, delay; + struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && @@ -5203,8 +5279,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) * If application uses SO_RCVLOWAT, we want send ack now if * we have not received enough bytes to satisfy the condition. */ - (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || - __tcp_select_window(sk) >= tp->rcv_wnd)) || + (meta_tp->rcv_nxt - meta_tp->copied_seq < meta_sk->sk_rcvlowat || + tp->ops->__select_window(sk) >= tp->rcv_wnd)) || /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ @@ -5339,6 +5415,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t { struct tcp_sock *tp = tcp_sk(sk); + /* MPTCP urgent data is not yet supported */ + if (mptcp(tp)) + return; + /* Check if we get a new urgent pointer - normally not. */ if (th->urg) tcp_check_urg(sk, th); @@ -5481,9 +5561,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, goto discard; } + /* If valid: post process the received MPTCP options. */ + if (mptcp(tp) && mptcp_handle_options(sk, th, skb)) + goto discard; + return true; discard: + if (mptcp(tp)) + mptcp_reset_mopt(tp); tcp_drop(sk, skb); return false; } @@ -5540,6 +5626,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tp->rx_opt.saw_tstamp = 0; + /* MPTCP: force slowpath. */ + if (mptcp(tp)) + goto slow_path; + /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made * 'S' will always be tp->tcp_header_len >> 2 @@ -5725,17 +5815,24 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *data = tp->syn_data ? 
tcp_rtx_queue_head(sk) : NULL; + struct sk_buff *data = NULL; u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; + if (tp->syn_data) { + if (mptcp(tp)) + data = tcp_write_queue_head(mptcp_meta_sk(sk)); + else + data = tcp_rtx_queue_head(sk); + } + if (mss == tp->rx_opt.user_mss) { struct tcp_options_received opt; /* Get original SYNACK MSS value if user MSS sets mss_clamp */ tcp_clear_options(&opt); opt.user_mss = opt.mss_clamp = 0; - tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL); + tcp_parse_options(sock_net(sk), synack, &opt, NULL, 0, NULL, NULL); mss = opt.mss_clamp; } @@ -5759,7 +5856,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); - if (data) { /* Retransmit unacked data in SYN */ + /* In mptcp case, we do not rely on "retransmit", but instead on + * "transmit", because if fastopen data is not acked, the retransmission + * becomes the first MPTCP data (see mptcp_rcv_synsent_fastopen). + */ + if (data && !mptcp(tp)) { /* Retransmit unacked data in SYN */ skb_rbtree_walk_from(data) { if (__tcp_retransmit_skb(sk, data, 1)) break; @@ -5799,9 +5900,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; + struct mptcp_options_received mopt; bool fastopen_fail; - tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc); + mptcp_init_mp_opt(&mopt); + + tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, + mptcp(tp) ? &tp->mptcp->rx_opt : &mopt, 0, &foc, tp); if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; @@ -5861,11 +5966,41 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_ack(sk, skb, FLAG_SLOWPATH); + if (tp->request_mptcp || mptcp(tp)) { + int ret; + + rcu_read_lock(); + local_bh_disable(); + ret = mptcp_rcv_synsent_state_process(sk, &sk, + skb, &mopt); + local_bh_enable(); + rcu_read_unlock(); + + /* May have changed if we support MPTCP */ + tp = tcp_sk(sk); + icsk = inet_csk(sk); + + if (ret == 1) + goto reset_and_undo; + if (ret == 2) + goto discard; + } + + if (mptcp(tp) && !is_master_tp(tp)) { + /* Timer for repeating the ACK until an answer + * arrives. Used only when establishing an additional + * subflow inside of an MPTCP connection. + */ + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, + jiffies + icsk->icsk_rto); + } + /* Ok.. it's good. Set up sequence numbers and * move to established. */ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + tcp_update_rcv_right_edge(tp); /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. @@ -5887,6 +6022,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } + if (mptcp(tp)) { + tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; + tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; + } + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -5910,9 +6050,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, } if (fastopen_fail) return -1; - if (sk->sk_write_pending || + /* With MPTCP we cannot send data on the third ack due to the + * lack of option-space to combine with an MP_CAPABLE. 
+ */ + if (!mptcp(tp) && (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || - icsk->icsk_ack.pingpong) { + icsk->icsk_ack.pingpong)) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * @@ -5951,6 +6094,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_paws_reject(&tp->rx_opt, 0)) goto discard_and_undo; + /* TODO - check this here for MPTCP */ if (th->syn) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. @@ -5967,9 +6111,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } + if (mptcp(tp)) { + tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; + tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; + } + WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + tcp_update_rcv_right_edge(tp); /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. @@ -6025,6 +6175,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, */ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + __releases(&sk->sk_lock.slock) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -6067,6 +6218,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->rx_opt.saw_tstamp = 0; tcp_mstamp_refresh(tp); queued = tcp_rcv_synsent_state_process(sk, skb, th); + if (is_meta_sk(sk)) { + sk = tcp_sk(sk)->mpcb->master_sk; + tp = tcp_sk(sk); + + /* Need to call it here, because it will announce new + * addresses, which can only be done after the third ack + * of the 3-way handshake. + */ + mptcp_update_metasocket(tp->meta_sk); + } if (queued >= 0) return queued; @@ -6149,6 +6310,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + if (mptcp(tp)) + tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN; if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); @@ -6158,6 +6321,30 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_initialize_rcv_mss(sk); tcp_fast_path_on(tp); + + /* Send an ACK when establishing a new MPTCP subflow, i.e. + * using an MP_JOIN subtype. + */ + if (mptcp(tp)) { + if (is_master_tp(tp)) { + mptcp_update_metasocket(mptcp_meta_sk(sk)); + } else { + struct sock *meta_sk = mptcp_meta_sk(sk); + + tcp_send_ack(sk); + + /* Update RTO as it might be worse/better */ + mptcp_set_rto(sk); + + /* If the new RTO would fire earlier, pull it in! */ + if (tcp_sk(meta_sk)->packets_out && + icsk->icsk_timeout > inet_csk(meta_sk)->icsk_rto + jiffies) { + tcp_rearm_rto(meta_sk); + } + + mptcp_push_pending_frames(mptcp_meta_sk(sk)); + } + } break; case TCP_FIN_WAIT1: { @@ -6205,7 +6392,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); - } else if (th->fin || sock_owned_by_user(sk)) { + } else if (th->fin || mptcp_is_data_fin(skb) || + sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing * and not so rare event. 
We still can lose it now, @@ -6214,7 +6402,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) */ inet_csk_reset_keepalive_timer(sk, tmo); } else { - tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo); goto discard; } break; @@ -6222,7 +6410,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_CLOSING: if (tp->snd_una == tp->write_seq) { - tcp_time_wait(sk, TCP_TIME_WAIT, 0); + tp->ops->time_wait(sk, TCP_TIME_WAIT, 0); goto discard; } break; @@ -6234,6 +6422,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) goto discard; } break; + case TCP_CLOSE: + if (tp->mp_killed) + goto discard; } /* step 6: check the URG bit */ @@ -6255,7 +6446,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) */ if (sk->sk_shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) && + !mptcp(tp)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); tcp_reset(sk); return 1; @@ -6352,6 +6544,8 @@ static void tcp_openreq_init(struct request_sock *req, ireq->wscale_ok = rx_opt->wscale_ok; ireq->acked = 0; ireq->ecn_ok = 0; + ireq->mptcp_rqsk = 0; + ireq->saw_mpc = 0; ireq->ir_rmt_port = tcp_hdr(skb)->source; ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); @@ -6449,12 +6643,17 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is * evidently real one. + * + * MPTCP: new subflows cannot be established in a stateless manner. */ - if ((net->ipv4.sysctl_tcp_syncookies == 2 || + if (((!is_meta_sk(sk) && net->ipv4.sysctl_tcp_syncookies == 2) || inet_csk_reqsk_queue_is_full(sk)) && !isn) { want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); if (!want_cookie) goto drop; + + if (is_meta_sk(sk)) + goto drop; } if (sk_acceptq_is_full(sk)) { @@ -6472,8 +6671,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, - want_cookie ? NULL : &foc); + tcp_parse_options(sock_net(sk), skb, &tmp_opt, NULL, 0, + want_cookie ? 
NULL : &foc, NULL); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); @@ -6488,7 +6687,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); - af_ops->init_req(req, sk, skb); + if (af_ops->init_req(req, sk, skb, want_cookie)) + goto drop_and_free; if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; @@ -6524,7 +6724,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_ecn_create_request(req, skb, sk, dst); if (want_cookie) { - isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + isn = cookie_init_sequence(af_ops, req, sk, skb, &req->mss); req->cookie_ts = tmp_opt.tstamp_ok; if (!tmp_opt.tstamp_ok) inet_rsk(req)->ecn_ok = 0; @@ -6539,18 +6739,26 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); } if (fastopen_sk) { + struct sock *meta_sk = fastopen_sk; + + if (mptcp(tcp_sk(fastopen_sk))) + meta_sk = mptcp_meta_sk(fastopen_sk); af_ops->send_synack(fastopen_sk, dst, &fl, req, &foc, TCP_SYNACK_FASTOPEN); /* Add the child socket directly into the accept queue */ - if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { + if (!inet_csk_reqsk_queue_add(sk, req, meta_sk)) { reqsk_fastopen_remove(fastopen_sk, req, false); bh_unlock_sock(fastopen_sk); + if (meta_sk != fastopen_sk) + bh_unlock_sock(meta_sk); sock_put(fastopen_sk); reqsk_put(req); goto drop; } sk->sk_data_ready(sk); bh_unlock_sock(fastopen_sk); + if (meta_sk != fastopen_sk) + bh_unlock_sock(meta_sk); sock_put(fastopen_sk); } else { tcp_rsk(req)->tfo_listener = false; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index de4edfbc9e46..581e21e90a39 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -67,6 +67,8 @@ #include #include #include +#include +#include #include #include #include @@ -435,7 +437,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) struct inet_sock *inet; const int type = icmp_hdr(icmp_skb)->type; const int code = icmp_hdr(icmp_skb)->code; - struct sock *sk; + struct sock *sk, *meta_sk; struct sk_buff *skb; struct request_sock *fastopen; u32 seq, snd_una; @@ -464,13 +466,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))); - bh_lock_sock(sk); + tp = tcp_sk(sk); + if (mptcp(tp)) + meta_sk = mptcp_meta_sk(sk); + else + meta_sk = sk; + + bh_lock_sock(meta_sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. * We do take care of PMTU discovery (RFC1191) special case : * we can receive locally generated ICMP messages while socket is held. */ - if (sock_owned_by_user(sk)) { + if (sock_owned_by_user(meta_sk)) { if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); } @@ -483,7 +491,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } icsk = inet_csk(sk); - tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = tp->fastopen_rsk; snd_una = fastopen ? 
tcp_rsk(fastopen)->snt_isn : tp->snd_una; @@ -517,11 +524,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; WRITE_ONCE(tp->mtu_info, info); - if (!sock_owned_by_user(sk)) { + if (!sock_owned_by_user(meta_sk)) { tcp_v4_mtu_reduced(sk); } else { if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); + if (mptcp(tp)) + mptcp_tsq_flags(sk); } goto out; } @@ -535,7 +544,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) !icsk->icsk_backoff || fastopen) break; - if (sock_owned_by_user(sk)) + if (sock_owned_by_user(meta_sk)) break; skb = tcp_rtx_queue_head(sk); @@ -558,7 +567,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } else { /* RTO revert clocked out retransmission. * Will retransmit now */ - tcp_retransmit_timer(sk); + tcp_sk(sk)->ops->retransmit_timer(sk); } break; @@ -578,7 +587,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (fastopen && !fastopen->sk) break; - if (!sock_owned_by_user(sk)) { + if (!sock_owned_by_user(meta_sk)) { sk->sk_err = err; sk->sk_error_report(sk); @@ -607,7 +616,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) */ inet = inet_sk(sk); - if (!sock_owned_by_user(sk) && inet->recverr) { + if (!sock_owned_by_user(meta_sk) && inet->recverr) { sk->sk_err = err; sk->sk_error_report(sk); } else { /* Only an error on timeout */ @@ -615,7 +624,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } out: - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); sock_put(sk); } @@ -650,7 +659,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) * Exception: precedence violation. We do not implement it in any case. */ -static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) +void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -796,10 +805,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) */ static void tcp_v4_send_ack(const struct sock *sk, - struct sk_buff *skb, u32 seq, u32 ack, + struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, - int reply_flags, u8 tos) + int reply_flags, u8 tos, int mptcp) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -808,6 +817,10 @@ static void tcp_v4_send_ack(const struct sock *sk, #ifdef CONFIG_TCP_MD5SIG + (TCPOLEN_MD5SIG_ALIGNED >> 2) #endif +#ifdef CONFIG_MPTCP + + ((MPTCP_SUB_LEN_DSS >> 2) + + (MPTCP_SUB_LEN_ACK >> 2)) +#endif ]; } rep; struct net *net = sock_net(sk); @@ -853,6 +866,21 @@ static void tcp_v4_send_ack(const struct sock *sk, ip_hdr(skb)->daddr, &rep.th); } #endif +#ifdef CONFIG_MPTCP + if (mptcp) { + int offset = (tsecr) ? 
3 : 0; + /* Construction of 32-bit data_ack */ + rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) | + ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) | + (0x20 << 8) | + (0x01)); + rep.opt[offset] = htonl(data_ack); + + arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK; + rep.th.doff = arg.iov[0].iov_len / 4; + } +#endif /* CONFIG_MPTCP */ + arg.flags = reply_flags; arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ @@ -881,28 +909,36 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + u32 data_ack = 0; + int mptcp = 0; + + if (tcptw->mptcp_tw) { + data_ack = (u32)tcptw->mptcp_tw->rcv_nxt; + mptcp = 1; + } tcp_v4_send_ack(sk, skb, - tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, data_ack, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, - tw->tw_tos + tw->tw_tos, mptcp ); inet_twsk_put(tw); } -static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *req) +void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, + struct request_sock *req) { /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : + u32 seq = (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) ? + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt; /* RFC 7323 2.3 @@ -911,7 +947,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, * Rcv.Wind.Shift bits: */ tcp_v4_send_ack(sk, skb, seq, - tcp_rsk(req)->rcv_nxt, + tcp_rsk(req)->rcv_nxt, 0, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, @@ -919,7 +955,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, - ip_hdr(skb)->tos); + ip_hdr(skb)->tos, 0); } /* @@ -927,11 +963,11 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, - struct flowi *fl, - struct request_sock *req, - struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) +int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, + struct flowi *fl, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -961,7 +997,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, /* * IPv4 request_sock destructor. 
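The extra word that tcp_v4_send_ack() writes for MPTCP packs the whole option header into one 32-bit field: the option kind, the option length, the DSS subtype in the upper nibble of the third byte (hence 0x20) and flag 0x01 announcing that a 32-bit data_ack follows in the next word. Assuming the usual values (kind 30 per RFC 6824, and a 4-byte DSS base plus a 4-byte data_ack, i.e. length 8; the real macros come from the MPTCP headers, not this hunk), the bytes on the wire look like this:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	/* Assumed values, see the note above. */
	const uint8_t  kind	= 30;		/* TCPOPT_MPTCP */
	const uint8_t  len	= 4 + 4;	/* DSS base + 32-bit data_ack */
	const uint32_t data_ack	= 0x11223344;	/* example meta-level rcv_nxt */
	uint32_t words[2];
	unsigned int i;

	words[0] = htonl((uint32_t)kind << 24 | (uint32_t)len << 16 |
			 0x20 << 8 |	/* subtype DSS (2) in the upper nibble */
			 0x01);		/* flag A: data_ack present */
	words[1] = htonl(data_ack);

	for (i = 0; i < sizeof(words); i++)
		printf("%02x ", ((uint8_t *)words)[i]);
	printf("\n");			/* 1e 08 20 01 11 22 33 44 */
	return 0;
}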
*/ -static void tcp_v4_reqsk_destructor(struct request_sock *req) +void tcp_v4_reqsk_destructor(struct request_sock *req) { kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); } @@ -1343,9 +1379,10 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, return false; } -static void tcp_v4_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static int tcp_v4_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb, + bool want_cookie) { struct inet_request_sock *ireq = inet_rsk(req); struct net *net = sock_net(sk_listener); @@ -1353,6 +1390,8 @@ static void tcp_v4_init_req(struct request_sock *req, sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); + + return 0; } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, @@ -1372,7 +1411,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v4_md5_lookup, @@ -1509,7 +1548,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); -static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) +struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); @@ -1532,6 +1571,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { struct sock *rsk; + if (is_meta_sk(sk)) + return mptcp_v4_do_rcv(sk, skb); + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst = sk->sk_rx_dst; @@ -1683,6 +1725,10 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); +#ifdef CONFIG_MPTCP + TCP_SKB_CB(skb)->mptcp_flags = 0; + TCP_SKB_CB(skb)->dss_off = 0; +#endif TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); @@ -1701,8 +1747,8 @@ int tcp_v4_rcv(struct sk_buff *skb) int sdif = inet_sdif(skb); const struct iphdr *iph; const struct tcphdr *th; + struct sock *sk, *meta_sk = NULL; bool refcounted; - struct sock *sk; int ret; if (skb->pkt_type != PACKET_HOST) @@ -1756,7 +1802,11 @@ int tcp_v4_rcv(struct sk_buff *skb) reqsk_put(req); goto csum_error; } - if (unlikely(sk->sk_state != TCP_LISTEN)) { + if (unlikely(sk->sk_state != TCP_LISTEN && !is_meta_sk(sk))) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + if (unlikely(is_meta_sk(sk) && !mptcp_can_new_subflow(sk))) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } @@ -1765,6 +1815,7 @@ int tcp_v4_rcv(struct sk_buff *skb) */ sock_hold(sk); refcounted = true; + nsk = NULL; if (!tcp_filter(sk, skb)) { th = (const struct tcphdr *)skb->data; @@ -1825,15 +1876,24 @@ int tcp_v4_rcv(struct sk_buff *skb) sk_incoming_cpu_update(sk); - bh_lock_sock_nested(sk); + if (mptcp(tcp_sk(sk))) { + meta_sk = mptcp_meta_sk(sk); + + bh_lock_sock_nested(meta_sk); + if (sock_owned_by_user(meta_sk)) + mptcp_prepare_for_backlog(sk, skb); + } else { + meta_sk = sk; + bh_lock_sock_nested(sk); + } 
tcp_segs_in(tcp_sk(sk), skb); ret = 0; - if (!sock_owned_by_user(sk)) { + if (!sock_owned_by_user(meta_sk)) { ret = tcp_v4_do_rcv(sk, skb); - } else if (tcp_add_backlog(sk, skb)) { + } else if (tcp_add_backlog(meta_sk, skb)) { goto discard_and_relse; } - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); put_and_return: if (refcounted) @@ -1847,6 +1907,19 @@ int tcp_v4_rcv(struct sk_buff *skb) tcp_v4_fill_cb(skb, iph, th); +#ifdef CONFIG_MPTCP + if (!sk && th->syn && !th->ack) { + int ret = mptcp_lookup_join(skb, NULL); + + if (ret < 0) { + tcp_v4_send_reset(NULL, skb); + goto discard_it; + } else if (ret > 0) { + return 0; + } + } +#endif + if (tcp_checksum_complete(skb)) { csum_error: __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); @@ -1895,6 +1968,18 @@ int tcp_v4_rcv(struct sk_buff *skb) refcounted = false; goto process; } +#ifdef CONFIG_MPTCP + if (th->syn && !th->ack) { + int ret = mptcp_lookup_join(skb, inet_twsk(sk)); + + if (ret < 0) { + tcp_v4_send_reset(NULL, skb); + goto discard_it; + } else if (ret > 0) { + return 0; + } + } +#endif } /* to ACK */ /* fall through */ @@ -1964,7 +2049,12 @@ static int tcp_v4_init_sock(struct sock *sk) tcp_init_sock(sk); - icsk->icsk_af_ops = &ipv4_specific; +#ifdef CONFIG_MPTCP + if (sock_flag(sk, SOCK_MPTCP)) + icsk->icsk_af_ops = &mptcp_v4_specific; + else +#endif + icsk->icsk_af_ops = &ipv4_specific; #ifdef CONFIG_TCP_MD5SIG tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; @@ -1983,6 +2073,11 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_cleanup_congestion_control(sk); + if (mptcp(tp)) + mptcp_destroy_sock(sk); + if (tp->inside_tk_table) + mptcp_hash_remove_bh(tp); + tcp_cleanup_ulp(sk); /* Cleanup up the write buffer. */ @@ -2488,6 +2583,11 @@ struct proto tcp_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), +#ifdef CONFIG_MPTCP + .useroffset = offsetof(struct tcp_sock, mptcp_sched_name), + .usersize = sizeof_field(struct tcp_sock, mptcp_sched_name) + + sizeof_field(struct tcp_sock, mptcp_pm_name), +#endif .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, @@ -2498,6 +2598,9 @@ struct proto tcp_prot = { .compat_getsockopt = compat_tcp_getsockopt, #endif .diag_destroy = tcp_abort, +#ifdef CONFIG_MPTCP + .clear_sk = mptcp_clear_sk, +#endif }; EXPORT_SYMBOL(tcp_prot); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a20b393b4501..984fcf2015db 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -18,11 +18,13 @@ * Jorge Cwik, */ +#include #include #include #include #include #include +#include #include #include #include @@ -94,10 +96,14 @@ enum tcp_tw_status struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; + struct mptcp_options_received mopt; tmp_opt.saw_tstamp = 0; - if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); + if (th->doff > (sizeof(*th) >> 2) && + (tcptw->tw_ts_recent_stamp || tcptw->mptcp_tw)) { + mptcp_init_mp_opt(&mopt); + + tcp_parse_options(twsk_net(tw), skb, &tmp_opt, &mopt, 0, NULL, NULL); if (tmp_opt.saw_tstamp) { if (tmp_opt.rcv_tsecr) @@ -106,6 +112,11 @@ enum tcp_tw_status tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } + + if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) { + if (mopt.mptcp_sender_key == tcptw->mptcp_tw->loc_key) + 
return TCP_TW_RST; + } } if (tw->tw_substate == TCP_FIN_WAIT2) { @@ -129,6 +140,16 @@ enum tcp_tw_status if (!th->ack || !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { + /* If mptcp_is_data_fin() returns true, we are sure that + * mopt has been initialized - otherwise it would not + * be a DATA_FIN. + */ + if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw && + mptcp_is_data_fin(skb) && + TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && + mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt) + return TCP_TW_ACK; + inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -268,12 +289,25 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; - tcptw->tw_rcv_wnd = tcp_receive_window(tp); + /* no need to keep track of the right-most right edge + * when in time wait, can directly use the currently + * advertised window. + */ + tcptw->tw_rcv_wnd = tcp_receive_window_now(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tcptw->tw_last_oow_ack_time = 0; + if (mptcp(tp)) { + if (mptcp_init_tw_sock(sk, tcptw)) { + inet_twsk_free(tw); + goto exit; + } + } else { + tcptw->mptcp_tw = NULL; + } + #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -330,6 +364,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); } +exit: tcp_update_metrics(sk); tcp_done(sk); } @@ -337,9 +372,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) void tcp_twsk_destructor(struct sock *sk) { -#ifdef CONFIG_TCP_MD5SIG struct tcp_timewait_sock *twsk = tcp_twsk(sk); + if (twsk->mptcp_tw) + mptcp_twsk_destructor(twsk); +#ifdef CONFIG_TCP_MD5SIG if (twsk->tw_md5_key) kfree_rcu(twsk->tw_md5_key, rcu); #endif @@ -378,8 +415,9 @@ void tcp_openreq_init_rwin(struct request_sock *req, full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(sk_listener, full_space, - mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + tp->ops->select_initial_window(sk_listener, full_space, + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) - + (ireq->saw_mpc ? 
MPTCP_SUB_LEN_DSM_ALIGN : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, @@ -477,6 +515,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; + newtp->out_of_order_queue = RB_ROOT; + newsk->tcp_rtx_queue = RB_ROOT; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); @@ -528,6 +568,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; + newtp->rcv_right_edge = newtp->rcv_wnd + newtp->rcv_wup; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; @@ -547,6 +588,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } + if (ireq->saw_mpc) + newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN; newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ @@ -566,6 +609,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rack.last_delivered = 0; newtp->rack.reo_wnd_persist = 0; newtp->rack.dsack_seen = 0; + newtp->inside_tk_table = 0; __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); @@ -589,15 +633,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool fastopen, bool *req_stolen) { struct tcp_options_received tmp_opt; + struct mptcp_options_received mopt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; bool own_req; + bool meta_locked = false; tmp_opt.saw_tstamp = 0; + + mptcp_init_mp_opt(&mopt); + if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); + tcp_parse_options(sock_net(sk), skb, &tmp_opt, &mopt, 0, NULL, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; @@ -638,7 +687,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery. + * + * Fall back to TCP if MP_CAPABLE is not set. */ + + if (inet_rsk(req)->saw_mpc && !mopt.saw_mpc) + inet_rsk(req)->saw_mpc = false; + + if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time) && @@ -786,17 +842,40 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. 
*/ + if (is_meta_sk(sk)) { + bh_lock_sock_nested(sk); + meta_locked = true; + } child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, req, &own_req); if (!child) goto listen_overflow; + if (own_req && !is_meta_sk(sk)) { + int ret = mptcp_check_req_master(sk, child, req, skb, 1, 0); + if (ret < 0) + goto listen_overflow; + + /* MPTCP-supported */ + if (!ret) + return tcp_sk(child)->mpcb->master_sk; + } else if (own_req) { + return mptcp_check_req_child(sk, child, req, skb, &mopt); + } + + if (meta_locked) + bh_unlock_sock(sk); + sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); *req_stolen = !own_req; + return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: + if (meta_locked) + bh_unlock_sock(sk); + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; @@ -842,12 +921,13 @@ int tcp_child_process(struct sock *parent, struct sock *child, { int ret = 0; int state = child->sk_state; + struct sock *meta_sk = mptcp(tcp_sk(child)) ? mptcp_meta_sk(child) : child; /* record NAPI ID of child */ sk_mark_napi_id(child, skb); tcp_segs_in(tcp_sk(child), skb); - if (!sock_owned_by_user(child)) { + if (!sock_owned_by_user(meta_sk)) { ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) @@ -857,10 +937,14 @@ int tcp_child_process(struct sock *parent, struct sock *child, * in main socket hash table and lock on listening * socket does not protect us more. */ - __sk_add_backlog(child, skb); + if (mptcp(tcp_sk(child))) + mptcp_prepare_for_backlog(child, skb); + __sk_add_backlog(meta_sk, skb); } bh_unlock_sock(child); + if (mptcp(tcp_sk(child))) + bh_unlock_sock(meta_sk); sock_put(child); return ret; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c97c027a8d77..b9b821fd7f91 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -36,6 +36,12 @@ #define pr_fmt(fmt) "TCP: " fmt +#include +#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif +#include #include #include @@ -45,11 +51,8 @@ #include -static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - int push_one, gfp_t gfp); - /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) +void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -242,12 +245,16 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, * value can be stuffed directly into th->window for an outgoing * frame. */ -static u16 tcp_select_window(struct sock *sk) +u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 old_win = tp->rcv_wnd; - u32 cur_win = tcp_receive_window(tp); - u32 new_win = __tcp_select_window(sk); + /* The window must never shrink at the meta-level. At the subflow we + * have to allow this. Otherwise we may announce a window too large + * for the current meta-level sk_rcvbuf. + */ + u32 cur_win = tcp_receive_window_now(mptcp(tp) ? 
tcp_sk(mptcp_meta_sk(sk)) : tp); + u32 new_win = tp->ops->__select_window(sk); /* Never shrink the offered window */ if (new_win < cur_win) { @@ -263,8 +270,10 @@ static u16 tcp_select_window(struct sock *sk) LINUX_MIB_TCPWANTZEROWINDOWADV); new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } + tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; + tcp_update_rcv_right_edge(tp); /* Make sure we do not exceed the maximum possible * scaled window. @@ -375,7 +384,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ -static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { skb->ip_summed = CHECKSUM_PARTIAL; @@ -390,7 +399,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->end_seq = seq; } -static inline bool tcp_urg_mode(const struct tcp_sock *tp) +bool tcp_urg_mode(const struct tcp_sock *tp) { return tp->snd_una != tp->snd_up; } @@ -401,6 +410,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) #define OPTION_SMC (1 << 9) +/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */ static void smc_options_write(__be32 *ptr, u16 *options) { @@ -417,17 +427,6 @@ static void smc_options_write(__be32 *ptr, u16 *options) #endif } -struct tcp_out_options { - u16 options; /* bit field of OPTION_* */ - u16 mss; /* 0 to disable */ - u8 ws; /* window scale, 0 to disable */ - u8 num_sack_blocks; /* number of SACK blocks to include */ - u8 hash_size; /* bytes in hash_location */ - __u8 *hash_location; /* temporary pointer, overloaded */ - __u32 tsval, tsecr; /* need to include OPTION_TS */ - struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ -}; - /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -442,7 +441,7 @@ struct tcp_out_options { * (but it may well be that other scenarios fail similarly). 
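Note that tcp_receive_window_now() and tcp_update_rcv_right_edge() are only used, not defined, in this hunk. A hedged sketch of what the rcv_right_edge bookkeeping implies, inferred from the call sites here and not copied from the patch:

/* Assumed helpers, for illustration only: remember the largest right edge
 * ever announced, and derive the usable receive window from it so that an
 * already-advertised window can never appear to shrink.
 */
static inline void tcp_update_rcv_right_edge(struct tcp_sock *tp)
{
	if (after(tp->rcv_wup + tp->rcv_wnd, tp->rcv_right_edge))
		tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
}

static inline u32 tcp_receive_window_now(const struct tcp_sock *tp)
{
	s32 win = tp->rcv_right_edge - tp->rcv_nxt;

	return win < 0 ? 0 : (u32)win;
}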
*/ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, - struct tcp_out_options *opts) + struct tcp_out_options *opts, struct sk_buff *skb) { u16 options = opts->options; /* mungable copy */ @@ -536,6 +535,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } smc_options_write(ptr, &options); + + if (unlikely(OPTION_MPTCP & opts->options)) + mptcp_options_write(ptr, tp, opts, skb); } static void smc_set_option(const struct tcp_sock *tp, @@ -621,6 +623,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (unlikely(!(OPTION_TS & opts->options))) remaining -= TCPOLEN_SACKPERM_ALIGNED; } + if (tp->request_mptcp || mptcp(tp)) + mptcp_syn_options(sk, opts, &remaining); if (fastopen && fastopen->cookie.len >= 0) { u32 need = fastopen->cookie.len; @@ -704,6 +708,9 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + if (ireq->saw_mpc) + mptcp_synack_options(req, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -737,6 +744,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } + if (mptcp(tp)) + mptcp_established_options(sk, skb, opts, &size); eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { @@ -787,19 +796,31 @@ static void tcp_tsq_write(struct sock *sk) tcp_xmit_retransmit_queue(sk); } - tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, - 0, GFP_ATOMIC); + tcp_sk(sk)->ops->write_xmit(sk, tcp_current_mss(sk), + tcp_sk(sk)->nonagle, 0, GFP_ATOMIC); } } static void tcp_tsq_handler(struct sock *sk) { - bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) + struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; + + bh_lock_sock(meta_sk); + if (!sock_owned_by_user(meta_sk)) { tcp_tsq_write(sk); - else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) - sock_hold(sk); - bh_unlock_sock(sk); + + if (mptcp(tp)) + tcp_tsq_write(meta_sk); + } else { + if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) + sock_hold(sk); + + if ((mptcp(tp)) && (sk->sk_state != TCP_CLOSE)) + mptcp_tsq_flags(sk); + } + + bh_unlock_sock(meta_sk); } /* * One tasklet per cpu tries to send more skbs. 
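The rewritten tcp_tsq_handler() above is one instance of a pattern this patch applies to several timer paths: lock the meta-socket, run the handler if the user does not own it, otherwise defer the work to tcp_release_cb(). Shown in isolation as a hedged illustration; the function name and do_subflow_work() below are placeholders, not code from the patch:

/* Illustrative only -- not part of the patch. do_subflow_work() stands in
 * for the real handler (TSQ, delayed ACK, retransmit, ...).
 */
static void mptcp_deferred_work_example(struct sock *sk)
{
	struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;

	bh_lock_sock(meta_sk);
	if (!sock_owned_by_user(meta_sk)) {
		/* Nobody owns the meta-socket: do the work directly. */
		do_subflow_work(sk);
	} else {
		/* Defer to tcp_release_cb(), holding a reference on the
		 * subflow until the deferred work has run.
		 */
		if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
			sock_hold(sk);
		if (mptcp(tcp_sk(sk)))
			mptcp_tsq_flags(sk);
	}
	bh_unlock_sock(meta_sk);
}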
@@ -836,7 +857,9 @@ static void tcp_tasklet_func(unsigned long data) #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ TCPF_WRITE_TIMER_DEFERRED | \ TCPF_DELACK_TIMER_DEFERRED | \ - TCPF_MTU_REDUCED_DEFERRED) + TCPF_MTU_REDUCED_DEFERRED | \ + TCPF_PATH_MANAGER_DEFERRED |\ + TCPF_SUB_DEFERRED) /** * tcp_release_cb - tcp release_sock() callback * @sk: socket @@ -859,6 +882,9 @@ void tcp_release_cb(struct sock *sk) if (flags & TCPF_TSQ_DEFERRED) { tcp_tsq_write(sk); __sock_put(sk); + + if (mptcp(tcp_sk(sk))) + tcp_tsq_write(mptcp_meta_sk(sk)); } /* Here begins the tricky part : * We are called from release_sock() with : @@ -883,6 +909,13 @@ void tcp_release_cb(struct sock *sk) inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } + if (flags & TCPF_PATH_MANAGER_DEFERRED) { + if (tcp_sk(sk)->mpcb->pm_ops->release_sock) + tcp_sk(sk)->mpcb->pm_ops->release_sock(sk); + __sock_put(sk); + } + if (flags & TCPF_SUB_DEFERRED) + mptcp_tsq_sub_deferred(sk); } EXPORT_SYMBOL(tcp_release_cb); @@ -1003,7 +1036,7 @@ static bool tcp_pacing_check(const struct sock *sk) hrtimer_is_queued(&tcp_sk(sk)->pacing_timer); } -static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) +void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) { skb->skb_mstamp = tp->tcp_mstamp; list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); @@ -1115,10 +1148,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } } - tcp_options_write((__be32 *)(th + 1), tp, &opts); + tcp_options_write((__be32 *)(th + 1), tp, &opts, skb); skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { - th->window = htons(tcp_select_window(sk)); + th->window = htons(tp->ops->select_window(sk)); tcp_ecn_send(sk, skb, th, tcp_header_size); } else { /* RFC1323: The window in SYN & SYN/ACK segments @@ -1176,8 +1209,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, return err; } -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, - gfp_t gfp_mask) +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask) { return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, tcp_sk(sk)->rcv_nxt); @@ -1188,7 +1221,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. */ -static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) +void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -1201,7 +1234,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) +void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { if (skb->len <= mss_now) { /* Avoid the costly divide in the normal @@ -1218,7 +1251,7 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ -static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) +void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) { struct tcp_sock *tp = tcp_sk(sk); @@ -1386,7 +1419,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, /* This is similar to __pskb_pull_tail(). 
The difference is that pulled * data is not copied, but immediately discarded. */ -static int __pskb_trim_head(struct sk_buff *skb, int len) +int __pskb_trim_head(struct sk_buff *skb, int len) { struct skb_shared_info *shinfo; int i, k, eat; @@ -1609,6 +1642,7 @@ unsigned int tcp_current_mss(struct sock *sk) return mss_now; } +EXPORT_SYMBOL(tcp_current_mss); /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. * As additional protections, we do not touch cwnd in retransmission phases, @@ -1668,8 +1702,11 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * 2) not cwnd limited (this else condition) * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) + * 5) For MPTCP subflows, the scheduler determines + * sndbuf limited. */ if (tcp_write_queue_empty(sk) && sk->sk_socket && + !(mptcp(tcp_sk(sk)) && !is_meta_sk(sk)) && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); @@ -1691,8 +1728,8 @@ static bool tcp_minshall_check(const struct tcp_sock *tp) * But we can avoid doing the divide again given we already have * skb_pcount = skb->len / mss_now */ -static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, - const struct sk_buff *skb) +void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, + const struct sk_buff *skb) { if (skb->len < tcp_skb_pcount(skb) * mss_now) tp->snd_sml = TCP_SKB_CB(skb)->end_seq; @@ -1737,7 +1774,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. */ -static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) +u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; u32 min_tso, tso_segs; @@ -1751,11 +1788,11 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) } /* Returns the portion of skb which can be sent right away */ -static unsigned int tcp_mss_split_point(const struct sock *sk, - const struct sk_buff *skb, - unsigned int mss_now, - unsigned int max_segs, - int nonagle) +unsigned int tcp_mss_split_point(const struct sock *sk, + const struct sk_buff *skb, + unsigned int mss_now, + unsigned int max_segs, + int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); u32 partial, needed, window, max_len; @@ -1785,13 +1822,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ -static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, - const struct sk_buff *skb) +unsigned int tcp_cwnd_test(const struct tcp_sock *tp, + const struct sk_buff *skb) { u32 in_flight, cwnd, halfcwnd; /* Don't be strict about the congestion window for the final FIN. */ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && + if (skb && + (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1) return 1; @@ -1806,12 +1844,13 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, halfcwnd = max(cwnd >> 1, 1U); return min(halfcwnd, cwnd - in_flight); } +EXPORT_SYMBOL(tcp_cwnd_test); /* Initialize TSO state of a skb. * This must be invoked the first time we consider transmitting * SKB onto the wire. 
*/ -static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) +int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); @@ -1826,8 +1865,8 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) /* Return true if the Nagle test allows this packet to be * sent now. */ -static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, - unsigned int cur_mss, int nonagle) +bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, + unsigned int cur_mss, int nonagle) { /* Nagle rule does not apply to frames, which sit in the middle of the * write_queue (they have no chances to get new data). @@ -1839,7 +1878,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf return true; /* Don't use the nagle rule for urgent data (or for the final FIN). */ - if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || + mptcp_is_data_fin(skb)) return true; if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) @@ -1849,9 +1889,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf } /* Does at least the first segment of SKB fit into the send window? */ -static bool tcp_snd_wnd_test(const struct tcp_sock *tp, - const struct sk_buff *skb, - unsigned int cur_mss) +bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, + unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -1860,6 +1899,7 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, return !after(end_seq, tcp_wnd_end(tp)); } +EXPORT_SYMBOL(tcp_snd_wnd_test); /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like @@ -2012,7 +2052,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, } /* If this packet won't get more data, do not wait. */ - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb)) goto send_now; return true; @@ -2313,7 +2353,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) * Returns true, if no segments are in flight and we have queued segments, * but cannot send anything now because of SWS or another problem. */ -static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); @@ -2327,7 +2367,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sent_pkts = 0; tcp_mstamp_refresh(tp); - if (!push_one) { + + /* pmtu not yet supported with MPTCP. Should be possible, by early + * exiting the loop inside tcp_mtu_probe, making sure that only one + * single DSS-mapping gets probed. + */ + if (!push_one && !mptcp(tp)) { /* Do MTU probing. 
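Several of the helpers above lose their static qualifier, and some gain EXPORT_SYMBOL, so that the MPTCP scheduler can reuse the regular TCP send tests. A hedged sketch of how a scheduler module might combine two of them; the function below is illustrative only and not part of the patch:

/* Illustrative only: can this subflow send skb right now, as far as the
 * congestion window and the send window are concerned?
 */
static bool mptcp_example_subflow_can_send(const struct sock *sk,
					   const struct sk_buff *skb,
					   unsigned int mss_now)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_cwnd_test(tp, skb))
		return false;

	return tcp_snd_wnd_test(tp, skb, mss_now);
}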
*/ result = tcp_mtu_probe(sk); if (!result) { @@ -2522,7 +2567,7 @@ void tcp_send_loss_probe(struct sock *sk) skb = tcp_send_head(sk); if (skb && tcp_snd_wnd_test(tp, skb, mss)) { pcount = tp->packets_out; - tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + tp->ops->write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); if (tp->packets_out > pcount) goto probe_sent; goto rearm_timer; @@ -2584,8 +2629,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, if (unlikely(sk->sk_state == TCP_CLOSE)) return; - if (tcp_write_xmit(sk, cur_mss, nonagle, 0, - sk_gfp_mask(sk, GFP_ATOMIC))) + if (tcp_sk(sk)->ops->write_xmit(sk, cur_mss, nonagle, 0, + sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -2598,7 +2643,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) BUG_ON(!skb || skb->len < mss_now); - tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); + tcp_sk(sk)->ops->write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, + sk->sk_allocation); } /* This function returns the amount that we can raise the @@ -2820,6 +2866,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; + /* Currently not supported for MPTCP - but it should be possible */ + if (mptcp(tp)) + return; + skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; @@ -3289,7 +3339,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rsk_rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), NULL, &opts); + tcp_options_write((__be32 *)(th + 1), NULL, &opts, skb); th->doff = (tcp_header_size >> 2); __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); @@ -3370,13 +3420,13 @@ static void tcp_connect_init(struct sock *sk) if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); - tcp_select_initial_window(sk, tcp_full_space(sk), - tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), - &tp->rcv_wnd, - &tp->window_clamp, - sock_net(sk)->ipv4.sysctl_tcp_window_scaling, - &rcv_wscale, - rcv_wnd); + tp->ops->select_initial_window(sk, tcp_full_space(sk), + tp->advmss - (tp->rx_opt.ts_recent_stamp ? 
tp->tcp_header_len - sizeof(struct tcphdr) : 0), + &tp->rcv_wnd, + &tp->window_clamp, + sock_net(sk)->ipv4.sysctl_tcp_window_scaling, + &rcv_wscale, + rcv_wnd); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; @@ -3396,11 +3446,43 @@ static void tcp_connect_init(struct sock *sk) else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; + /* force set rcv_right_edge here at start of connection */ + tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd; WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); + +#ifdef CONFIG_MPTCP + if (sock_flag(sk, SOCK_MPTCP) && mptcp_doit(sk)) { + if (is_master_tp(tp)) { + tp->request_mptcp = 1; + mptcp_connect_init(sk); + } else if (tp->mptcp) { + struct inet_sock *inet = inet_sk(sk); + + tp->mptcp->snt_isn = tp->write_seq; + tp->mptcp->init_rcv_wnd = tp->rcv_wnd; + + /* Set nonce for new subflows */ + if (sk->sk_family == AF_INET) + tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce( + inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + inet->inet_dport); +#if IS_ENABLED(CONFIG_IPV6) + else + tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce( + inet6_sk(sk)->saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport); +#endif + } + } +#endif } static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) @@ -3663,6 +3745,7 @@ void tcp_send_ack(struct sock *sk) { __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); } +EXPORT_SYMBOL_GPL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. @@ -3675,7 +3758,7 @@ void tcp_send_ack(struct sock *sk) * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * out-of-date with SND.UNA-1 to probe window. */ -static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) +int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -3762,7 +3845,7 @@ void tcp_send_probe0(struct sock *sk) unsigned long probe_max; int err; - err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); + err = tp->ops->write_wakeup(sk, LINUX_MIB_TCPWINPROBE); if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d071ed6b8b9a..0b1329220b84 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -20,6 +20,7 @@ #include #include +#include #include static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) @@ -46,7 +47,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) * Returns: Nothing (void) */ -static void tcp_write_err(struct sock *sk) +void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; sk->sk_error_report(sk); @@ -102,7 +103,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) (!tp->snd_wnd && !tp->packets_out)) do_reset = true; if (do_reset) - tcp_send_active_reset(sk, GFP_ATOMIC); + tp->ops->send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; @@ -187,9 +188,9 @@ static unsigned int tcp_model_timeout(struct sock *sk, * after "boundary" unsuccessful, exponentially backed-off * retransmissions with an initial RTO of TCP_RTO_MIN. 
*/ -static bool retransmits_timed_out(struct sock *sk, - unsigned int boundary, - unsigned int timeout) +bool retransmits_timed_out(struct sock *sk, + unsigned int boundary, + unsigned int timeout) { unsigned int start_ts; @@ -209,7 +210,7 @@ static bool retransmits_timed_out(struct sock *sk, } /* A write timeout has occurred. Process the after effects. */ -static int tcp_write_timeout(struct sock *sk) +int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -224,6 +225,17 @@ static int tcp_write_timeout(struct sock *sk) sk_rethink_txhash(sk); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; + +#ifdef CONFIG_MPTCP + /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */ + if (tcp_sk(sk)->request_mptcp && + icsk->icsk_retransmits >= sysctl_mptcp_syn_retries) { + tcp_sk(sk)->request_mptcp = 0; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLERETRANSFALLBACK); + } +#endif /* CONFIG_MPTCP */ + expired = icsk->icsk_retransmits >= retry_until; } else { if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { @@ -319,18 +331,22 @@ static void tcp_delack_timer(struct timer_list *t) struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_delack_timer); struct sock *sk = &icsk->icsk_inet.sk; + struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; - bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) { + bh_lock_sock(meta_sk); + if (!sock_owned_by_user(meta_sk)) { tcp_delack_timer_handler(sk); } else { icsk->icsk_ack.blocked = 1; - __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + __NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); + if (mptcp(tp)) + mptcp_tsq_flags(sk); } - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); sock_put(sk); } @@ -375,6 +391,10 @@ static void tcp_probe_timer(struct sock *sk) if (icsk->icsk_probes_out >= max_probes) { abort: tcp_write_err(sk); + if (is_meta_sk(sk) && + mptcp_in_infinite_mapping_weak(tp->mpcb)) { + mptcp_sub_force_close_all(tp->mpcb, NULL); + } } else { /* Only send another probe if we didn't close things up. */ tcp_send_probe0(sk); @@ -590,7 +610,7 @@ void tcp_write_timer_handler(struct sock *sk) break; case ICSK_TIME_RETRANS: icsk->icsk_pending = 0; - tcp_retransmit_timer(sk); + tcp_sk(sk)->ops->retransmit_timer(sk); break; case ICSK_TIME_PROBE0: icsk->icsk_pending = 0; @@ -607,16 +627,19 @@ static void tcp_write_timer(struct timer_list *t) struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; + struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk; - bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) { + bh_lock_sock(meta_sk); + if (!sock_owned_by_user(meta_sk)) { tcp_write_timer_handler(sk); } else { /* delegate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); + if (mptcp(tcp_sk(sk))) + mptcp_tsq_flags(sk); } - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); sock_put(sk); } @@ -646,11 +669,12 @@ static void tcp_keepalive_timer (struct timer_list *t) struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk; u32 elapsed; /* Only process if socket is not in use. 
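The tp->ops-> indirections used in tcp_output.c and tcp_timer.c above (and in the time_wait call just below) imply a per-socket operations table that plain TCP and MPTCP fill differently. A plausible shape, reconstructed purely from the call sites in this patch; the struct layout and the tcp_specific instance are assumptions, not taken from the patch:

/* Assumed shape only -- inferred from the tp->ops call sites; the real
 * definition lives in the MPTCP headers.
 */
struct tcp_sock_ops {
	u32 (*__select_window)(struct sock *sk);
	u16 (*select_window)(struct sock *sk);
	void (*select_initial_window)(const struct sock *sk, int __space,
				      __u32 mss, __u32 *rcv_wnd,
				      __u32 *window_clamp, int wscale_ok,
				      __u8 *rcv_wscale, __u32 init_rcv_wnd);
	int (*write_wakeup)(struct sock *sk, int mib);
	bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp);
	void (*send_active_reset)(struct sock *sk, gfp_t priority);
	void (*retransmit_timer)(struct sock *sk);
	void (*time_wait)(struct sock *sk, int state, int timeo);
};

/* Plain TCP would then simply point at the existing implementations: */
static const struct tcp_sock_ops tcp_specific = {
	.__select_window	= __tcp_select_window,
	.select_window		= tcp_select_window,
	.select_initial_window	= tcp_select_initial_window,
	.write_wakeup		= tcp_write_wakeup,
	.write_xmit		= tcp_write_xmit,
	.send_active_reset	= tcp_send_active_reset,
	.retransmit_timer	= tcp_retransmit_timer,
	.time_wait		= tcp_time_wait,
};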
*/ - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { + bh_lock_sock(meta_sk); + if (sock_owned_by_user(meta_sk)) { /* Try again later. */ inet_csk_reset_keepalive_timer (sk, HZ/20); goto out; @@ -662,16 +686,31 @@ static void tcp_keepalive_timer (struct timer_list *t) } tcp_mstamp_refresh(tp); + + if (tp->send_mp_fclose && sk->sk_state == TCP_RST_WAIT) { + if (icsk->icsk_retransmits >= MPTCP_FASTCLOSE_RETRIES) { + tcp_write_err(sk); + goto out; + } + + tcp_send_ack(sk); + icsk->icsk_retransmits++; + + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + elapsed = icsk->icsk_rto; + goto resched; + } + if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; if (tmo > 0) { - tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } } - tcp_send_active_reset(sk, GFP_ATOMIC); + tp->ops->send_active_reset(sk, GFP_ATOMIC); goto death; } @@ -696,11 +735,11 @@ static void tcp_keepalive_timer (struct timer_list *t) icsk->icsk_probes_out > 0) || (icsk->icsk_user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { - tcp_send_active_reset(sk, GFP_ATOMIC); + tp->ops->send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; } - if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { + if (tp->ops->write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { icsk->icsk_probes_out++; elapsed = keepalive_intvl_when(tp); } else { @@ -724,7 +763,7 @@ static void tcp_keepalive_timer (struct timer_list *t) tcp_done(sk); out: - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); sock_put(sk); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9d8b791f63ef..e0197baabd03 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -917,6 +917,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) kfree_rcu(ifp, rcu); } +EXPORT_SYMBOL(inet6_ifa_finish_destroy); static void ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5c2351deedc8..a6a95a018de8 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -107,8 +107,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } -static int inet6_create(struct net *net, struct socket *sock, int protocol, - int kern) +int inet6_create(struct net *net, struct socket *sock, int protocol, int kern) { struct inet_sock *inet; struct ipv6_pinfo *np; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 4e1da6cb9ed7..be915ab75562 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include #include #include #include @@ -223,7 +225,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inuse_add(net, &tcp_prot, 1); local_bh_enable(); sk->sk_prot = &tcp_prot; - icsk->icsk_af_ops = &ipv4_specific; +#ifdef CONFIG_MPTCP + if (sock_flag(sk, SOCK_MPTCP)) + icsk->icsk_af_ops = &mptcp_v4_specific; + else +#endif + icsk->icsk_af_ops = &ipv4_specific; sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index ec61b67a92be..13e0790bfc71 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #define COOKIEBITS 24 /* Upper bits store count */ @@ -111,7 +113,8 @@ u32 
__cookie_v6_init_sequence(const struct ipv6hdr *iph, } EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); -__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp) +__u32 cookie_v6_init_sequence(struct request_sock *req, const struct sock *sk, + const struct sk_buff *skb, __u16 *mssp) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); @@ -133,6 +136,7 @@ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tcp_opt; + struct mptcp_options_received mopt; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct ipv6_pinfo *np = inet6_sk(sk); @@ -162,7 +166,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL); + mptcp_init_mp_opt(&mopt); + tcp_parse_options(sock_net(sk), skb, &tcp_opt, &mopt, 0, NULL, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { tsoff = secure_tcpv6_ts_off(sock_net(sk), @@ -175,14 +180,27 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false); +#ifdef CONFIG_MPTCP + if (mopt.saw_mpc) + req = inet_reqsk_alloc(&mptcp6_request_sock_ops, sk, false); + else +#endif + req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false); if (!req) goto out; ireq = inet_rsk(req); + ireq->mptcp_rqsk = 0; + ireq->saw_mpc = 0; treq = tcp_rsk(req); treq->tfo_listener = false; + /* Must be done before anything else, as it initializes + * hash_entry of the MPTCP request-sock. + */ + if (mopt.saw_mpc) + mptcp_cookies_reqsk_init(req, &mopt, skb); + if (security_inet_conn_request(sk, skb, req)) goto out_free; @@ -252,10 +270,10 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; - tcp_select_initial_window(sk, full_space, req->mss, - &req->rsk_rcv_wnd, &req->rsk_window_clamp, - ireq->wscale_ok, &rcv_wscale, - dst_metric(dst, RTAX_INITRWND)); + tp->ops->select_initial_window(sk, full_space, req->mss, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, + ireq->wscale_ok, &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e8d206725cb7..b83dad9b34e6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -61,6 +61,8 @@ #include #include #include +#include +#include #include #include @@ -70,15 +72,6 @@ #include #include - -static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); -static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *req); - -static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); - -static const struct inet_connection_sock_af_ops ipv6_mapped; -static const struct inet_connection_sock_af_ops ipv6_specific; #ifdef CONFIG_TCP_MD5SIG static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; @@ -90,7 +83,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, } #endif -static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = 
skb_dst(skb); @@ -132,7 +125,7 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr); } -static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, +int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; @@ -229,7 +222,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - icsk->icsk_af_ops = &ipv6_mapped; +#ifdef CONFIG_MPTCP + if (sock_flag(sk, SOCK_MPTCP)) + icsk->icsk_af_ops = &mptcp_v6_mapped; + else +#endif + icsk->icsk_af_ops = &ipv6_mapped; sk->sk_backlog_rcv = tcp_v4_do_rcv; #ifdef CONFIG_TCP_MD5SIG tp->af_specific = &tcp_sock_ipv6_mapped_specific; @@ -239,7 +237,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) { icsk->icsk_ext_hdr_len = exthdrlen; - icsk->icsk_af_ops = &ipv6_specific; +#ifdef CONFIG_MPTCP + if (sock_flag(sk, SOCK_MPTCP)) + icsk->icsk_af_ops = &mptcp_v6_specific; + else +#endif + icsk->icsk_af_ops = &ipv6_specific; sk->sk_backlog_rcv = tcp_v6_do_rcv; #ifdef CONFIG_TCP_MD5SIG tp->af_specific = &tcp_sock_ipv6_specific; @@ -333,7 +336,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, return err; } -static void tcp_v6_mtu_reduced(struct sock *sk) +void tcp_v6_mtu_reduced(struct sock *sk) { struct dst_entry *dst; u32 mtu; @@ -369,7 +372,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct ipv6_pinfo *np; struct tcp_sock *tp; __u32 seq, snd_una; - struct sock *sk; + struct sock *sk, *meta_sk; bool fatal; int err; @@ -393,8 +396,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sk->sk_state == TCP_NEW_SYN_RECV) return tcp_req_err(sk, seq, fatal); - bh_lock_sock(sk); - if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) + tp = tcp_sk(sk); + if (mptcp(tp)) + meta_sk = mptcp_meta_sk(sk); + else + meta_sk = sk; + + bh_lock_sock(meta_sk); + if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) @@ -405,7 +414,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = tp->fastopen_rsk; snd_una = fastopen ? 
tcp_rsk(fastopen)->snt_isn : tp->snd_una; @@ -445,11 +453,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, WRITE_ONCE(tp->mtu_info, mtu); - if (!sock_owned_by_user(sk)) + if (!sock_owned_by_user(meta_sk)) { tcp_v6_mtu_reduced(sk); - else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, - &sk->sk_tsq_flags)) - sock_hold(sk); + } else { + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, + &sk->sk_tsq_flags)) + sock_hold(sk); + if (mptcp(tp)) + mptcp_tsq_flags(sk); + } goto out; } @@ -464,7 +476,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (fastopen && !fastopen->sk) break; - if (!sock_owned_by_user(sk)) { + if (!sock_owned_by_user(meta_sk)) { sk->sk_err = err; sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ @@ -474,14 +486,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - if (!sock_owned_by_user(sk) && np->recverr) { + if (!sock_owned_by_user(meta_sk) && np->recverr) { sk->sk_err = err; sk->sk_error_report(sk); } else sk->sk_err_soft = err; out: - bh_unlock_sock(sk); + bh_unlock_sock(meta_sk); sock_put(sk); } @@ -528,8 +540,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, return err; } - -static void tcp_v6_reqsk_destructor(struct request_sock *req) +void tcp_v6_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->ipv6_opt); kfree_skb(inet_rsk(req)->pktopts); @@ -747,9 +758,10 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, return false; } -static void tcp_v6_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static int tcp_v6_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb, + bool want_cookie) { struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = inet6_sk(sk_listener); @@ -770,6 +782,8 @@ static void tcp_v6_init_req(struct request_sock *req, refcount_inc(&skb->users); ireq->pktopts = skb; } + + return 0; } static struct dst_entry *tcp_v6_route_req(const struct sock *sk, @@ -789,7 +803,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG @@ -807,9 +821,9 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { }; static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, - u32 ack, u32 win, u32 tsval, u32 tsecr, + u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int rst, - u8 tclass, __be32 label) + u8 tclass, __be32 label, int mptcp) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; @@ -828,7 +842,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (key) tot_len += TCPOLEN_MD5SIG_ALIGNED; #endif - +#ifdef CONFIG_MPTCP + if (mptcp) + tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK; +#endif buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, GFP_ATOMIC); if (!buff) @@ -866,6 +883,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 tcp_v6_md5_hash_hdr((__u8 *)topt, key, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, t1); + topt += 4; + } +#endif +#ifdef CONFIG_MPTCP + if (mptcp) { + /* Construction of 32-bit data_ack */ + 
*topt++ = htonl((TCPOPT_MPTCP << 24) | + ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) | + (0x20 << 8) | + (0x01)); + *topt++ = htonl(data_ack); } #endif @@ -915,7 +943,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 kfree_skb(buff); } -static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) +void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); u32 seq = 0, ack_seq = 0; @@ -983,7 +1011,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) trace_tcp_send_reset(sk, skb); } - tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, 0, oif, key, 1, 0, 0, 0); #ifdef CONFIG_TCP_MD5SIG out: @@ -992,30 +1020,37 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) } static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, - u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, + u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, u8 tclass, - __be32 label) + __be32 label, int mptcp) { - tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0, - tclass, label); + tcp_v6_send_response(sk, skb, seq, ack, data_ack, win, tsval, tsecr, oif, + key, 0, tclass, label, mptcp); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + u32 data_ack = 0; + int mptcp = 0; + if (tcptw->mptcp_tw) { + data_ack = (u32)tcptw->mptcp_tw->rcv_nxt; + mptcp = 1; + } tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + data_ack, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel)); + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), mptcp); inet_twsk_put(tw); } -static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *req) +void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, + struct request_sock *req) { /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. @@ -1025,18 +1060,18 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, * exception of segments, MUST be right-shifted by * Rcv.Wind.Shift bits: */ - tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? + tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) ? 
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, + tcp_rsk(req)->rcv_nxt, 0, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), - 0, 0); + 0, 0, 0); } -static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) +struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); @@ -1047,7 +1082,7 @@ static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) return sk; } -static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) +int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); @@ -1078,11 +1113,11 @@ static void tcp_v6_restore_cb(struct sk_buff *skb) sizeof(struct inet6_skb_parm)); } -static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req) +struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) { struct inet_request_sock *ireq; struct ipv6_pinfo *newnp; @@ -1119,7 +1154,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newnp->saddr = newsk->sk_v6_rcv_saddr; - inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; +#ifdef CONFIG_MPTCP + /* We must check on the request-socket because the listener + * socket's flag may have been changed halfway through. + */ + if (!inet_rsk(req)->saw_mpc) + inet_csk(newsk)->icsk_af_ops = &mptcp_v6_mapped; + else +#endif + inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; newsk->sk_backlog_rcv = tcp_v4_do_rcv; #ifdef CONFIG_TCP_MD5SIG newtp->af_specific = &tcp_sock_ipv6_mapped_specific; @@ -1166,6 +1209,14 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (!newsk) goto out_nonewsk; +#ifdef CONFIG_MPTCP + /* If the meta_sk is v6-mapped we can end up here with the wrong af_ops. + * Just make sure that this subflow is v6. + */ + if (is_meta_sk(sk)) + inet_csk(newsk)->icsk_af_ops = &mptcp_v6_specific; +#endif + /* * No need to charge this sock to the relevant IPv6 refcnt debug socks * count here, tcp_create_openreq_child now does this for us, see the @@ -1294,7 +1345,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * * This is because we cannot sleep with the original spinlock * held. */ -static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp; @@ -1311,6 +1362,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); + if (is_meta_sk(sk)) + return mptcp_v6_do_rcv(sk, skb); + /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. 
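For readers decoding the DSS words built in tcp_v6_send_response() above, an annotated restatement of the two stores; the field meanings follow RFC 6824, so verify the constants against include/net/mptcp.h:

/* Annotated restatement, not additional code in the patch: */
*topt++ = htonl((TCPOPT_MPTCP << 24) |		/* option kind 30 (MPTCP)       */
		((MPTCP_SUB_LEN_DSS +
		  MPTCP_SUB_LEN_ACK) << 16) |	/* option length: DSS + ack     */
		(0x20 << 8) |			/* subtype DSS (0x2 << 4)       */
		(0x01));			/* flag 'A': 32-bit data ACK    */
*topt++ = htonl(data_ack);			/* the data-level ACK itself    */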
@@ -1438,6 +1492,10 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); +#ifdef CONFIG_MPTCP + TCP_SKB_CB(skb)->mptcp_flags = 0; + TCP_SKB_CB(skb)->dss_off = 0; +#endif TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); @@ -1451,8 +1509,8 @@ static int tcp_v6_rcv(struct sk_buff *skb) int sdif = inet6_sdif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; + struct sock *sk, *meta_sk = NULL; bool refcounted; - struct sock *sk; int ret; struct net *net = dev_net(skb->dev); @@ -1506,12 +1564,17 @@ static int tcp_v6_rcv(struct sk_buff *skb) reqsk_put(req); goto csum_error; } - if (unlikely(sk->sk_state != TCP_LISTEN)) { + if (unlikely(sk->sk_state != TCP_LISTEN && !is_meta_sk(sk))) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + if (unlikely(is_meta_sk(sk) && !mptcp_can_new_subflow(sk))) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } sock_hold(sk); refcounted = true; + nsk = NULL; if (!tcp_filter(sk, skb)) { th = (const struct tcphdr *)skb->data; @@ -1544,6 +1607,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) return 0; } } + if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1570,15 +1634,25 @@ static int tcp_v6_rcv(struct sk_buff *skb) sk_incoming_cpu_update(sk); - bh_lock_sock_nested(sk); + if (mptcp(tcp_sk(sk))) { + meta_sk = mptcp_meta_sk(sk); + + bh_lock_sock_nested(meta_sk); + if (sock_owned_by_user(meta_sk)) + mptcp_prepare_for_backlog(sk, skb); + } else { + meta_sk = sk; + bh_lock_sock_nested(sk); + } tcp_segs_in(tcp_sk(sk), skb); ret = 0; - if (!sock_owned_by_user(sk)) { + if (!sock_owned_by_user(meta_sk)) { ret = tcp_v6_do_rcv(sk, skb); - } else if (tcp_add_backlog(sk, skb)) { + } else if (tcp_add_backlog(meta_sk, skb)) { goto discard_and_relse; } - bh_unlock_sock(sk); + + bh_unlock_sock(meta_sk); put_and_return: if (refcounted) @@ -1591,6 +1665,19 @@ static int tcp_v6_rcv(struct sk_buff *skb) tcp_v6_fill_cb(skb, hdr, th); +#ifdef CONFIG_MPTCP + if (!sk && th->syn && !th->ack) { + int ret = mptcp_lookup_join(skb, NULL); + + if (ret < 0) { + tcp_v6_send_reset(NULL, skb); + goto discard_it; + } else if (ret > 0) { + return 0; + } + } +#endif + if (tcp_checksum_complete(skb)) { csum_error: __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); @@ -1643,6 +1730,18 @@ static int tcp_v6_rcv(struct sk_buff *skb) refcounted = false; goto process; } +#ifdef CONFIG_MPTCP + if (th->syn && !th->ack) { + int ret = mptcp_lookup_join(skb, inet_twsk(sk)); + + if (ret < 0) { + tcp_v6_send_reset(NULL, skb); + goto discard_it; + } else if (ret > 0) { + return 0; + } + } +#endif } /* to ACK */ /* fall through */ @@ -1697,13 +1796,13 @@ static void tcp_v6_early_demux(struct sk_buff *skb) } } -static struct timewait_sock_ops tcp6_timewait_sock_ops = { +struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_unique = tcp_twsk_unique, .twsk_destructor = tcp_twsk_destructor, }; -static const struct inet_connection_sock_af_ops ipv6_specific = { +const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, @@ -1734,7 +1833,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb) /* * TCP over IPv4 via 
INET6 API */ -static const struct inet_connection_sock_af_ops ipv6_mapped = { +const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1770,7 +1869,12 @@ static int tcp_v6_init_sock(struct sock *sk) tcp_init_sock(sk); - icsk->icsk_af_ops = &ipv6_specific; +#ifdef CONFIG_MPTCP + if (sock_flag(sk, SOCK_MPTCP)) + icsk->icsk_af_ops = &mptcp_v6_specific; + else +#endif + icsk->icsk_af_ops = &ipv6_specific; #ifdef CONFIG_TCP_MD5SIG tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; @@ -1779,7 +1883,7 @@ static int tcp_v6_init_sock(struct sock *sk) return 0; } -static void tcp_v6_destroy_sock(struct sock *sk) +void tcp_v6_destroy_sock(struct sock *sk) { tcp_v4_destroy_sock(sk); inet6_destroy_sock(sk); @@ -2002,6 +2106,11 @@ struct proto tcpv6_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), +#ifdef CONFIG_MPTCP + .useroffset = offsetof(struct tcp_sock, mptcp_sched_name), + .usersize = sizeof_field(struct tcp_sock, mptcp_sched_name) + + sizeof_field(struct tcp_sock, mptcp_pm_name), +#endif .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, @@ -2012,6 +2121,9 @@ struct proto tcpv6_prot = { .compat_getsockopt = compat_tcp_getsockopt, #endif .diag_destroy = tcp_abort, +#ifdef CONFIG_MPTCP + .clear_sk = mptcp_clear_sk, +#endif }; /* thinking of making this const? Don't. diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig new file mode 100644 index 000000000000..37f3af3db2a6 --- /dev/null +++ b/net/mptcp/Kconfig @@ -0,0 +1,146 @@ +# +# MPTCP configuration +# +config MPTCP + bool "MPTCP protocol" + depends on (IPV6=y || IPV6=n) + ---help--- + This replaces the normal TCP stack with a Multipath TCP stack, + able to use several paths at once. + +menuconfig MPTCP_PM_ADVANCED + bool "MPTCP: advanced path-manager control" + depends on MPTCP=y + ---help--- + Support for selection of different path-managers. You should choose 'Y' here, + because otherwise you will not actively create new MPTCP-subflows. + +if MPTCP_PM_ADVANCED + +config MPTCP_FULLMESH + tristate "MPTCP Full-Mesh Path-Manager" + depends on MPTCP=y + ---help--- + This path-management module will create a full-mesh among all IP-addresses. + +config MPTCP_NDIFFPORTS + tristate "MPTCP ndiff-ports" + depends on MPTCP=y + ---help--- + This path-management module will create multiple subflows between the same + pair of IP-addresses, modifying the source-port. You can set the number + of subflows via the mptcp_ndiffports-sysctl. + +config MPTCP_BINDER + tristate "MPTCP Binder" + depends on (MPTCP=y) + ---help--- + This path-management module works like ndiffports, and adds the sysctl + option to set the gateway (and/or path to) per each additional subflow + via Loose Source Routing (IPv4 only). + +config MPTCP_NETLINK + tristate "MPTCP Netlink Path-Manager" + depends on MPTCP=y + ---help--- + This path-management module is controlled over a Netlink interface. A userspace + module can therefore control the establishment of new subflows and the policy + to apply over those new subflows for every connection. 
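As a usage illustration only (not part of the patch), a .config fragment enabling the MPTCP core with the path-manager modules described above might look like the following; which options to enable is of course a local choice:

CONFIG_MPTCP=y
CONFIG_MPTCP_PM_ADVANCED=y
CONFIG_MPTCP_FULLMESH=y
CONFIG_MPTCP_NDIFFPORTS=m
CONFIG_MPTCP_BINDER=m
CONFIG_MPTCP_NETLINK=m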
+
+choice
+	prompt "Default MPTCP Path-Manager"
+	default DEFAULT_DUMMY
+	help
+	  Select the Path-Manager of your choice.
+
+	config DEFAULT_FULLMESH
+		bool "Full mesh" if MPTCP_FULLMESH=y
+
+	config DEFAULT_NDIFFPORTS
+		bool "ndiff-ports" if MPTCP_NDIFFPORTS=y
+
+	config DEFAULT_BINDER
+		bool "binder" if MPTCP_BINDER=y
+
+	config DEFAULT_NETLINK
+		bool "Netlink" if MPTCP_NETLINK=y
+
+	config DEFAULT_DUMMY
+		bool "Default"
+
+endchoice
+
+endif
+
+config DEFAULT_MPTCP_PM
+	string
+	default "default" if DEFAULT_DUMMY
+	default "fullmesh" if DEFAULT_FULLMESH
+	default "ndiffports" if DEFAULT_NDIFFPORTS
+	default "binder" if DEFAULT_BINDER
+	default "default"
+
+menuconfig MPTCP_SCHED_ADVANCED
+	bool "MPTCP: advanced scheduler control"
+	depends on MPTCP=y
+	---help---
+	  Support for selection of different schedulers. You should choose 'Y' here,
+	  if you want to use a scheduler other than the default one.
+
+if MPTCP_SCHED_ADVANCED
+
+config MPTCP_BLEST
+	tristate "MPTCP BLEST"
+	depends on MPTCP=y
+	---help---
+	  This is an experimental BLocking ESTimation-based (BLEST) scheduler.
+
+config MPTCP_ROUNDROBIN
+	tristate "MPTCP Round-Robin"
+	depends on (MPTCP=y)
+	---help---
+	  This is a very simple round-robin scheduler. It probably has poor
+	  performance, but it might be interesting for researchers.
+
+config MPTCP_REDUNDANT
+	tristate "MPTCP Redundant"
+	depends on (MPTCP=y)
+	---help---
+	  This scheduler sends all packets redundantly over all subflows to decrease
+	  latency and jitter, at the cost of lower throughput.
+
+choice
+	prompt "Default MPTCP Scheduler"
+	default DEFAULT_SCHEDULER
+	help
+	  Select the Scheduler of your choice.
+
+	config DEFAULT_SCHEDULER
+		bool "Default"
+		---help---
+		  This is the default scheduler, sending first on the subflow
+		  with the lowest RTT.
+
+	config DEFAULT_ROUNDROBIN
+		bool "Round-Robin" if MPTCP_ROUNDROBIN=y
+		---help---
+		  This is the round-robin scheduler, sending on the subflows
+		  in a round-robin fashion.
+
+	config DEFAULT_REDUNDANT
+		bool "Redundant" if MPTCP_REDUNDANT=y
+		---help---
+		  This is the redundant scheduler, sending packets redundantly over
+		  all the subflows.
+
+endchoice
+endif
+
+config DEFAULT_MPTCP_SCHED
+	string
+	depends on (MPTCP=y)
+	default "default" if DEFAULT_SCHEDULER
+	default "roundrobin" if DEFAULT_ROUNDROBIN
+	default "redundant" if DEFAULT_REDUNDANT
+	default "default"
+
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
new file mode 100644
index 000000000000..82a2d4d945ae
--- /dev/null
+++ b/net/mptcp/Makefile
@@ -0,0 +1,24 @@
+#
+## Makefile for MultiPath TCP support code.
+#
+#
+
+obj-$(CONFIG_MPTCP) += mptcp.o
+
+mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_pm.o \
+	   mptcp_output.o mptcp_input.o mptcp_sched.o
+
+obj-$(CONFIG_TCP_CONG_LIA) += mptcp_coupled.o
+obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
+obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o
+obj-$(CONFIG_TCP_CONG_BALIA) += mptcp_balia.o
+obj-$(CONFIG_TCP_CONG_MCTCPDESYNC) += mctcp_desync.o
+obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o
+obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o
+obj-$(CONFIG_MPTCP_BINDER) += mptcp_binder.o
+obj-$(CONFIG_MPTCP_NETLINK) += mptcp_netlink.o
+obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o
+obj-$(CONFIG_MPTCP_REDUNDANT) += mptcp_redundant.o
+obj-$(CONFIG_MPTCP_BLEST) += mptcp_blest.o
+
+mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o
diff --git a/net/mptcp/mctcp_desync.c b/net/mptcp/mctcp_desync.c
new file mode 100644
index 000000000000..f6bf9251d59b
--- /dev/null
+++ b/net/mptcp/mctcp_desync.c
@@ -0,0 +1,193 @@
+/*
+ * Desynchronized Multi-Channel TCP Congestion Control Algorithm
+ *
+ * Implementation based on the publications "DMCTCP: Desynchronized Multi-Channel
+ * TCP for high speed access networks with tiny buffers", 23rd International
+ * Conference on Computer Communication and Networks (ICCCN), 2014, and
+ * "Exploring parallelism and desynchronization of TCP over high speed networks
+ * with tiny buffers", Computer Communications (Elsevier), 2015.
+ *
+ * http://ieeexplore.ieee.org/abstract/document/6911722/
+ * https://doi.org/10.1016/j.comcom.2015.07.010
+ *
+ * This prototype is for research purposes and is currently experimental code
+ * that only supports a single path. Future support of multi-channel over
+ * multi-path requires channel grouping.
+ *
+ * Initial Design and Implementation:
+ * Cheng Cui
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */ +#include +#include +#include + +enum { + MASTER_CHANNEL = 1, + INI_MIN_CWND = 2, +}; + +/* private congestion control structure: + * off_tstamp: the last backoff timestamp for loss synchronization event + * off_subfid: the subflow which was backoff on off_tstamp + */ +struct mctcp_desync { + u64 off_tstamp; + u8 off_subfid; +}; + +static inline int mctcp_cc_sk_can_send(const struct sock *sk) +{ + return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us; +} + +static void mctcp_desync_init(struct sock *sk) +{ + if (mptcp(tcp_sk(sk))) { + struct mctcp_desync *ca = inet_csk_ca(mptcp_meta_sk(sk)); + ca->off_tstamp = 0; + ca->off_subfid = 0; + } + /* If we do not mptcp, behave like reno: return */ +} + +static void mctcp_desync_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!mptcp(tp)) { + tcp_reno_cong_avoid(sk, ack, acked); + return; + } else if (!tcp_is_cwnd_limited(sk)) { + return; + } else { + const struct mctcp_desync *ca = inet_csk_ca(mptcp_meta_sk(sk)); + const u8 subfid = tp->mptcp->path_index; + + /* current aggregated cwnd */ + u32 agg_cwnd = 0; + u32 min_cwnd = 0xffffffff; + u8 min_cwnd_subfid = 0; + + /* In "safe" area, increase */ + if (tcp_in_slow_start(tp)) { + if (ca->off_subfid) { + /* passed initial phase, allow slow start */ + tcp_slow_start(tp, acked); + } else if (MASTER_CHANNEL == tp->mptcp->path_index) { + /* master channel is normal slow start in + * initial phase */ + tcp_slow_start(tp, acked); + } else { + /* secondary channels increase slowly until + * the initial phase passed + */ + tp->snd_ssthresh = tp->snd_cwnd = INI_MIN_CWND; + } + return; + } else { + /* In dangerous area, increase slowly and linearly. */ + const struct mptcp_tcp_sock *mptcp; + + /* get total cwnd and the subflow that has min cwnd */ + mptcp_for_each_sub(tp->mpcb, mptcp) { + const struct sock *sub_sk = mptcp_to_sock(mptcp); + + if (mctcp_cc_sk_can_send(sub_sk)) { + const struct tcp_sock *sub_tp = + tcp_sk(sub_sk); + agg_cwnd += sub_tp->snd_cwnd; + if(min_cwnd > sub_tp->snd_cwnd) { + min_cwnd = sub_tp->snd_cwnd; + min_cwnd_subfid = + sub_tp->mptcp->path_index; + } + } + } + /* the smallest subflow grows faster than others */ + if (subfid == min_cwnd_subfid) { + tcp_cong_avoid_ai(tp, min_cwnd, acked); + } else { + tcp_cong_avoid_ai(tp, agg_cwnd - min_cwnd, + acked); + } + } + } +} + +static u32 mctcp_desync_ssthresh(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!mptcp(tp)) { + return max(tp->snd_cwnd >> 1U, 2U); + } else { + struct mctcp_desync *ca = inet_csk_ca(mptcp_meta_sk(sk)); + const u8 subfid = tp->mptcp->path_index; + const struct mptcp_tcp_sock *mptcp; + u32 max_cwnd = 0; + u8 max_cwnd_subfid = 0; + + /* Find the subflow that has the max cwnd. */ + mptcp_for_each_sub(tp->mpcb, mptcp) { + const struct sock *sub_sk = mptcp_to_sock(mptcp); + + if (mctcp_cc_sk_can_send(sub_sk)) { + const struct tcp_sock *sub_tp = tcp_sk(sub_sk); + if (max_cwnd < sub_tp->snd_cwnd) { + max_cwnd = sub_tp->snd_cwnd; + max_cwnd_subfid = + sub_tp->mptcp->path_index; + } + } + } + /* Use high resolution clock. 
*/ + if (subfid == max_cwnd_subfid) { + u64 now = tcp_clock_us(); + u32 delta = tcp_stamp_us_delta(now, ca->off_tstamp); + + if (delta < (tp->srtt_us >> 3)) { + /* desynchronize */ + return tp->snd_cwnd; + } else { + ca->off_tstamp = now; + ca->off_subfid = subfid; + return max(max_cwnd >> 1U, 2U); + } + } else { + return tp->snd_cwnd; + } + } +} + +static struct tcp_congestion_ops mctcp_desync = { + .init = mctcp_desync_init, + .ssthresh = mctcp_desync_ssthresh, + .undo_cwnd = tcp_reno_undo_cwnd, + .cong_avoid = mctcp_desync_cong_avoid, + .owner = THIS_MODULE, + .name = "mctcpdesync", +}; + +static int __init mctcp_desync_register(void) +{ + BUILD_BUG_ON(sizeof(struct mctcp_desync) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&mctcp_desync); +} + +static void __exit mctcp_desync_unregister(void) +{ + tcp_unregister_congestion_control(&mctcp_desync); +} + +module_init(mctcp_desync_register); +module_exit(mctcp_desync_unregister); + +MODULE_AUTHOR("Cheng Cui"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MCTCP: DESYNCHRONIZED MULTICHANNEL TCP CONGESTION CONTROL"); +MODULE_VERSION("1.0"); diff --git a/net/mptcp/mptcp_balia.c b/net/mptcp/mptcp_balia.c new file mode 100644 index 000000000000..179b53dea020 --- /dev/null +++ b/net/mptcp/mptcp_balia.c @@ -0,0 +1,261 @@ +/* + * MPTCP implementation - Balia Congestion Control + * (Balanced Linked Adaptation Algorithm) + * + * Analysis, Design and Implementation: + * Qiuyu Peng + * Anwar Walid + * Jaehyun Hwang + * Steven H. Low + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include + +/* The variable 'rate' (i.e., x_r) will be scaled + * e.g., from B/s to KB/s, MB/s, or GB/s + * if max_rate > 2^rate_scale_limit + */ + +static int rate_scale_limit = 25; +static int alpha_scale = 10; +static int scale_num = 5; + +struct mptcp_balia { + u64 ai; + u64 md; + bool forced_update; +}; + +static inline int mptcp_balia_sk_can_send(const struct sock *sk) +{ + return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us; +} + +static inline u64 mptcp_get_ai(const struct sock *meta_sk) +{ + return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->ai; +} + +static inline void mptcp_set_ai(const struct sock *meta_sk, u64 ai) +{ + ((struct mptcp_balia *)inet_csk_ca(meta_sk))->ai = ai; +} + +static inline u64 mptcp_get_md(const struct sock *meta_sk) +{ + return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->md; +} + +static inline void mptcp_set_md(const struct sock *meta_sk, u64 md) +{ + ((struct mptcp_balia *)inet_csk_ca(meta_sk))->md = md; +} + +static inline u64 mptcp_balia_scale(u64 val, int scale) +{ + return (u64) val << scale; +} + +static inline bool mptcp_get_forced(const struct sock *meta_sk) +{ + return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->forced_update; +} + +static inline void mptcp_set_forced(const struct sock *meta_sk, bool force) +{ + ((struct mptcp_balia *)inet_csk_ca(meta_sk))->forced_update = force; +} + +static void mptcp_balia_recalc_ai(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct mptcp_cb *mpcb = tp->mpcb; + struct mptcp_tcp_sock *mptcp; + u64 max_rate = 0, rate = 0, sum_rate = 0; + u64 alpha, ai = tp->snd_cwnd, md = (tp->snd_cwnd >> 1); + int num_scale_down = 0; + + if (!mpcb) + return; + + /* Find max_rate first */ + mptcp_for_each_sub(mpcb, mptcp) { 
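+ /* Per-subflow rate x_r = mss * cwnd / srtt, in bytes/s; srtt_us is
+ * stored <<3, hence the <<3 applied to USEC_PER_SEC.
+ */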
+ const struct sock *sub_sk = mptcp_to_sock(mptcp); + struct tcp_sock *sub_tp = tcp_sk(sub_sk); + u64 tmp; + + if (!mptcp_balia_sk_can_send(sub_sk)) + continue; + + tmp = div_u64((u64)tp->mss_cache * sub_tp->snd_cwnd + * (USEC_PER_SEC << 3), sub_tp->srtt_us); + sum_rate += tmp; + + if (tp == sub_tp) + rate = tmp; + + if (tmp >= max_rate) + max_rate = tmp; + } + + /* At least, the current subflow should be able to send */ + if (unlikely(!rate)) + goto exit; + + alpha = div64_u64(max_rate, rate); + + /* Scale down max_rate if it is too high (e.g., >2^25) */ + while (max_rate > mptcp_balia_scale(1, rate_scale_limit)) { + max_rate >>= scale_num; + num_scale_down++; + } + + if (num_scale_down) { + sum_rate = 0; + mptcp_for_each_sub(mpcb, mptcp) { + const struct sock *sub_sk = mptcp_to_sock(mptcp); + struct tcp_sock *sub_tp = tcp_sk(sub_sk); + u64 tmp; + + if (!mptcp_balia_sk_can_send(sub_sk)) + continue; + + tmp = div_u64((u64)tp->mss_cache * sub_tp->snd_cwnd + * (USEC_PER_SEC << 3), sub_tp->srtt_us); + tmp >>= (scale_num * num_scale_down); + + sum_rate += tmp; + } + rate >>= (scale_num * num_scale_down); + } + + /* (sum_rate)^2 * 10 * w_r + * ai = ------------------------------------ + * (x_r + max_rate) * (4x_r + max_rate) + */ + sum_rate *= sum_rate; + + ai = div64_u64(sum_rate * 10, rate + max_rate); + ai = div64_u64(ai * tp->snd_cwnd, (rate << 2) + max_rate); + + if (unlikely(!ai)) + ai = tp->snd_cwnd; + + md = ((tp->snd_cwnd >> 1) * min(mptcp_balia_scale(alpha, alpha_scale), + mptcp_balia_scale(3, alpha_scale) >> 1)) + >> alpha_scale; + +exit: + mptcp_set_ai(sk, ai); + mptcp_set_md(sk, md); +} + +static void mptcp_balia_init(struct sock *sk) +{ + if (mptcp(tcp_sk(sk))) { + mptcp_set_forced(sk, 0); + mptcp_set_ai(sk, 0); + mptcp_set_md(sk, 0); + } +} + +static void mptcp_balia_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_COMPLETE_CWR || event == CA_EVENT_LOSS) + mptcp_balia_recalc_ai(sk); +} + +static void mptcp_balia_set_state(struct sock *sk, u8 ca_state) +{ + if (!mptcp(tcp_sk(sk))) + return; + + mptcp_set_forced(sk, 1); +} + +static void mptcp_balia_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + int snd_cwnd; + + if (!mptcp(tp)) { + tcp_reno_cong_avoid(sk, ack, acked); + return; + } + + if (!tcp_is_cwnd_limited(sk)) + return; + + if (tcp_in_slow_start(tp)) { + /* In "safe" area, increase. 
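+ * tcp_slow_start() grows cwnd by at most the number of newly acked
+ * segments; the increase factor 'ai' is then recomputed so congestion
+ * avoidance continues from fresh per-subflow rate estimates.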
*/ + tcp_slow_start(tp, acked); + mptcp_balia_recalc_ai(sk); + return; + } + + if (mptcp_get_forced(mptcp_meta_sk(sk))) { + mptcp_balia_recalc_ai(sk); + mptcp_set_forced(sk, 0); + } + + snd_cwnd = (int)mptcp_get_ai(sk); + + if (tp->snd_cwnd_cnt >= snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) { + tp->snd_cwnd++; + mptcp_balia_recalc_ai(sk); + } + + tp->snd_cwnd_cnt = 0; + } else { + tp->snd_cwnd_cnt++; + } +} + +static u32 mptcp_balia_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + if (unlikely(!mptcp(tp))) + return tcp_reno_ssthresh(sk); + else + return max((u32)(tp->snd_cwnd - mptcp_get_md(sk)), 1U); +} + +static struct tcp_congestion_ops mptcp_balia = { + .init = mptcp_balia_init, + .ssthresh = mptcp_balia_ssthresh, + .cong_avoid = mptcp_balia_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, + .cwnd_event = mptcp_balia_cwnd_event, + .set_state = mptcp_balia_set_state, + .owner = THIS_MODULE, + .name = "balia", +}; + +static int __init mptcp_balia_register(void) +{ + BUILD_BUG_ON(sizeof(struct mptcp_balia) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&mptcp_balia); +} + +static void __exit mptcp_balia_unregister(void) +{ + tcp_unregister_congestion_control(&mptcp_balia); +} + +module_init(mptcp_balia_register); +module_exit(mptcp_balia_unregister); + +MODULE_AUTHOR("Jaehyun Hwang, Anwar Walid, Qiuyu Peng, Steven H. Low"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MPTCP BALIA CONGESTION CONTROL ALGORITHM"); +MODULE_VERSION("0.1"); diff --git a/net/mptcp/mptcp_binder.c b/net/mptcp/mptcp_binder.c new file mode 100644 index 000000000000..7f34a8d00274 --- /dev/null +++ b/net/mptcp/mptcp_binder.c @@ -0,0 +1,494 @@ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MPTCP_GW_MAX_LISTS 10 +#define MPTCP_GW_LIST_MAX_LEN 6 +#define MPTCP_GW_SYSCTL_MAX_LEN (15 * MPTCP_GW_LIST_MAX_LEN * \ + MPTCP_GW_MAX_LISTS) + +struct mptcp_gw_list { + struct in_addr list[MPTCP_GW_MAX_LISTS][MPTCP_GW_LIST_MAX_LEN]; + u8 len[MPTCP_GW_MAX_LISTS]; +}; + +struct binder_priv { + /* Worker struct for subflow establishment */ + struct work_struct subflow_work; + + struct mptcp_cb *mpcb; + + /* Prevent multiple sub-sockets concurrently iterating over sockets */ + spinlock_t *flow_lock; +}; + +static struct mptcp_gw_list *mptcp_gws; +static rwlock_t mptcp_gws_lock; + +static int mptcp_binder_ndiffports __read_mostly = 1; + +static char sysctl_mptcp_binder_gateways[MPTCP_GW_SYSCTL_MAX_LEN] __read_mostly; + +static int mptcp_get_avail_list_ipv4(struct sock *sk) +{ + int i, j, list_taken, opt_ret, opt_len; + unsigned char *opt_ptr, *opt_end_ptr, opt[MAX_IPOPTLEN]; + + for (i = 0; i < MPTCP_GW_MAX_LISTS; ++i) { + struct mptcp_tcp_sock *mptcp; + + if (mptcp_gws->len[i] == 0) + goto error; + + mptcp_debug("mptcp_get_avail_list_ipv4: List %i\n", i); + list_taken = 0; + + /* Loop through all sub-sockets in this connection */ + mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) { + sk = mptcp_to_sock(mptcp); + + mptcp_debug("mptcp_get_avail_list_ipv4: Next sock\n"); + + /* Reset length and options buffer, then retrieve + * from socket + */ + opt_len = MAX_IPOPTLEN; + memset(opt, 0, MAX_IPOPTLEN); + opt_ret = ip_getsockopt(sk, IPPROTO_IP, + IP_OPTIONS, (char __user *)opt, (int __user *)&opt_len); + if (opt_ret < 0) { + mptcp_debug("%s: MPTCP subsocket getsockopt() IP_OPTIONS failed, error %d\n", + __func__, opt_ret); + goto error; + } + + /* If socket has no options, it has no stake in this list */ + if 
(opt_len <= 0) + continue; + + /* Iterate options buffer */ + for (opt_ptr = &opt[0]; opt_ptr < &opt[opt_len]; opt_ptr++) { + if (*opt_ptr == IPOPT_LSRR) { + mptcp_debug("mptcp_get_avail_list_ipv4: LSRR options found\n"); + goto sock_lsrr; + } + } + continue; + +sock_lsrr: + /* Pointer to the 2nd to last address */ + opt_end_ptr = opt_ptr+(*(opt_ptr+1))-4; + + /* Addresses start 3 bytes after type offset */ + opt_ptr += 3; + j = 0; + + /* Different length lists cannot be the same */ + if ((opt_end_ptr-opt_ptr)/4 != mptcp_gws->len[i]) + continue; + + /* Iterate if we are still inside options list + * and sysctl list + */ + while (opt_ptr < opt_end_ptr && j < mptcp_gws->len[i]) { + /* If there is a different address, this list must + * not be set on this socket + */ + if (memcmp(&mptcp_gws->list[i][j], opt_ptr, 4)) + break; + + /* Jump 4 bytes to next address */ + opt_ptr += 4; + j++; + } + + /* Reached the end without a differing address, lists + * are therefore identical. + */ + if (j == mptcp_gws->len[i]) { + mptcp_debug("mptcp_get_avail_list_ipv4: List already used\n"); + list_taken = 1; + break; + } + } + + /* Free list found if not taken by a socket */ + if (!list_taken) { + mptcp_debug("mptcp_get_avail_list_ipv4: List free\n"); + break; + } + } + + if (i >= MPTCP_GW_MAX_LISTS) + goto error; + + return i; +error: + return -1; +} + +/* The list of addresses is parsed each time a new connection is opened, + * to make sure it's up to date. In case of error, all the lists are + * marked as unavailable and the subflow's fingerprint is set to 0. + */ +static void mptcp_v4_add_lsrr(struct sock *sk, struct in_addr addr) +{ + int i, j, ret; + unsigned char opt[MAX_IPOPTLEN] = {0}; + struct tcp_sock *tp = tcp_sk(sk); + struct binder_priv *fmp = (struct binder_priv *)&tp->mpcb->mptcp_pm[0]; + + /* Read lock: multiple sockets can read LSRR addresses at the same + * time, but writes are done in mutual exclusion. + * Spin lock: must search for free list for one socket at a time, or + * multiple sockets could take the same list. + */ + read_lock(&mptcp_gws_lock); + spin_lock(fmp->flow_lock); + + i = mptcp_get_avail_list_ipv4(sk); + + /* Execution enters here only if a free path is found. + */ + if (i >= 0) { + opt[0] = IPOPT_NOP; + opt[1] = IPOPT_LSRR; + opt[2] = sizeof(mptcp_gws->list[i][0].s_addr) * + (mptcp_gws->len[i] + 1) + 3; + opt[3] = IPOPT_MINOFF; + for (j = 0; j < mptcp_gws->len[i]; ++j) + memcpy(opt + 4 + + (j * sizeof(mptcp_gws->list[i][0].s_addr)), + &mptcp_gws->list[i][j].s_addr, + sizeof(mptcp_gws->list[i][0].s_addr)); + /* Final destination must be part of IP_OPTIONS parameter. */ + memcpy(opt + 4 + (j * sizeof(addr.s_addr)), &addr.s_addr, + sizeof(addr.s_addr)); + + /* setsockopt must be inside the lock, otherwise another + * subflow could fail to see that we have taken a list. + */ + ret = ip_setsockopt(sk, IPPROTO_IP, IP_OPTIONS, (char __user *)opt, + 4 + sizeof(mptcp_gws->list[i][0].s_addr) * (mptcp_gws->len[i] + 1)); + + if (ret < 0) { + mptcp_debug("%s: MPTCP subsock setsockopt() IP_OPTIONS failed, error %d\n", + __func__, ret); + } + } + + spin_unlock(fmp->flow_lock); + read_unlock(&mptcp_gws_lock); + + return; +} + +/* Parses gateways string for a list of paths to different + * gateways, and stores them for use with the Loose Source Routing (LSRR) + * socket option. Each list must have "," separated addresses, and the lists + * themselves must be separated by "-". Returns -1 in case one or more of the + * addresses is not a valid ipv4/6 address. 
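+ *
+ * Example (placeholder addresses): writing "10.0.0.1,10.0.0.2-192.168.1.1"
+ * defines two lists; subflows bound to the first list are source-routed via
+ * 10.0.0.1 and then 10.0.0.2, those bound to the second via 192.168.1.1.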
+ */ +static int mptcp_parse_gateway_ipv4(char *gateways) +{ + int i, j, k, ret; + char *tmp_string = NULL; + struct in_addr tmp_addr; + + tmp_string = kzalloc(16, GFP_KERNEL); + if (tmp_string == NULL) + return -ENOMEM; + + write_lock(&mptcp_gws_lock); + + memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list)); + + /* A TMP string is used since inet_pton needs a null terminated string + * but we do not want to modify the sysctl for obvious reasons. + * i will iterate over the SYSCTL string, j will iterate over the + * temporary string where each IP is copied into, k will iterate over + * the IPs in each list. + */ + for (i = j = k = 0; + i < MPTCP_GW_SYSCTL_MAX_LEN && k < MPTCP_GW_MAX_LISTS; + ++i) { + if (gateways[i] == '-' || gateways[i] == ',' || gateways[i] == '\0') { + /* If the temp IP is empty and the current list is + * empty, we are done. + */ + if (j == 0 && mptcp_gws->len[k] == 0) + break; + + /* Terminate the temp IP string, then if it is + * non-empty parse the IP and copy it. + */ + tmp_string[j] = '\0'; + if (j > 0) { + mptcp_debug("mptcp_parse_gateway_list tmp: %s i: %d\n", tmp_string, i); + + ret = in4_pton(tmp_string, strlen(tmp_string), + (u8 *)&tmp_addr.s_addr, '\0', + NULL); + + if (ret) { + mptcp_debug("mptcp_parse_gateway_list ret: %d s_addr: %pI4\n", + ret, + &tmp_addr.s_addr); + memcpy(&mptcp_gws->list[k][mptcp_gws->len[k]].s_addr, + &tmp_addr.s_addr, + sizeof(tmp_addr.s_addr)); + mptcp_gws->len[k]++; + j = 0; + tmp_string[j] = '\0'; + /* Since we can't impose a limit to + * what the user can input, make sure + * there are not too many IPs in the + * SYSCTL string. + */ + if (mptcp_gws->len[k] > MPTCP_GW_LIST_MAX_LEN) { + mptcp_debug("mptcp_parse_gateway_list too many members in list %i: max %i\n", + k, + MPTCP_GW_LIST_MAX_LEN); + goto error; + } + } else { + goto error; + } + } + + if (gateways[i] == '-' || gateways[i] == '\0') + ++k; + } else { + tmp_string[j] = gateways[i]; + ++j; + } + } + + /* Number of flows is number of gateway lists plus master flow */ + mptcp_binder_ndiffports = k+1; + + write_unlock(&mptcp_gws_lock); + kfree(tmp_string); + + return 0; + +error: + memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list)); + memset(gateways, 0, sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN); + write_unlock(&mptcp_gws_lock); + kfree(tmp_string); + return -1; +} + +/** + * Create all new subflows, by doing calls to mptcp_initX_subsockets + * + * This function uses a goto next_subflow, to allow releasing the lock between + * new subflows and giving other processes a chance to do some work on the + * socket and potentially finishing the communication. 
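+ *
+ * Subflows keep being created until mptcp_binder_ndiffports of them exist
+ * (one per parsed gateway list plus the master subflow) or the meta-socket
+ * goes away.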
+ **/ +static void create_subflow_worker(struct work_struct *work) +{ + const struct binder_priv *pm_priv = container_of(work, + struct binder_priv, + subflow_work); + struct mptcp_cb *mpcb = pm_priv->mpcb; + struct sock *meta_sk = mpcb->meta_sk; + int iter = 0; + +next_subflow: + if (iter) { + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + + cond_resched(); + } + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + if (!mptcp(tcp_sk(meta_sk))) + goto exit; + + iter++; + + if (sock_flag(meta_sk, SOCK_DEAD)) + goto exit; + + if (mpcb->master_sk && + !tcp_sk(mpcb->master_sk)->mptcp->fully_established) + goto exit; + + if (mptcp_binder_ndiffports > iter && + mptcp_binder_ndiffports > mptcp_subflow_count(mpcb)) { + struct mptcp_loc4 loc; + struct mptcp_rem4 rem; + + loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr; + loc.loc4_id = 0; + loc.low_prio = 0; + + rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr; + rem.port = inet_sk(meta_sk)->inet_dport; + rem.rem4_id = 0; /* Default 0 */ + + mptcp_init4_subsockets(meta_sk, &loc, &rem); + + goto next_subflow; + } + +exit: + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + mptcp_mpcb_put(mpcb); + sock_put(meta_sk); +} + +static void binder_new_session(const struct sock *meta_sk) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct binder_priv *fmp = (struct binder_priv *)&mpcb->mptcp_pm[0]; + static DEFINE_SPINLOCK(flow_lock); + +#if IS_ENABLED(CONFIG_IPV6) + if (meta_sk->sk_family == AF_INET6 && + !mptcp_v6_is_v4_mapped(meta_sk)) { + mptcp_fallback_default(mpcb); + return; + } +#endif + + /* Initialize workqueue-struct */ + INIT_WORK(&fmp->subflow_work, create_subflow_worker); + fmp->mpcb = mpcb; + + fmp->flow_lock = &flow_lock; +} + +static void binder_create_subflows(struct sock *meta_sk) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct binder_priv *pm_priv = (struct binder_priv *)&mpcb->mptcp_pm[0]; + + if (mptcp_in_infinite_mapping_weak(mpcb) || + mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) + return; + + if (!work_pending(&pm_priv->subflow_work)) { + sock_hold(meta_sk); + refcount_inc(&mpcb->mpcb_refcnt); + queue_work(mptcp_wq, &pm_priv->subflow_work); + } +} + +static int binder_get_local_id(const struct sock *meta_sk, sa_family_t family, + union inet_addr *addr, bool *low_prio) +{ + return 0; +} + +/* Callback functions, executed when syctl mptcp.mptcp_gateways is updated. + * Inspired from proc_tcp_congestion_control(). 
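+ *
+ * A gateway string that fails to parse clears the in-kernel lists but
+ * leaves the previously stored sysctl value untouched.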
+ */ +static int proc_mptcp_gateways(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + struct ctl_table tbl = { + .maxlen = MPTCP_GW_SYSCTL_MAX_LEN, + }; + + if (write) { + tbl.data = kzalloc(MPTCP_GW_SYSCTL_MAX_LEN, GFP_KERNEL); + if (tbl.data == NULL) + return -ENOMEM; + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (ret == 0) { + ret = mptcp_parse_gateway_ipv4(tbl.data); + memcpy(ctl->data, tbl.data, MPTCP_GW_SYSCTL_MAX_LEN); + } + kfree(tbl.data); + } else { + ret = proc_dostring(ctl, write, buffer, lenp, ppos); + } + + + return ret; +} + +static struct mptcp_pm_ops binder __read_mostly = { + .new_session = binder_new_session, + .fully_established = binder_create_subflows, + .get_local_id = binder_get_local_id, + .init_subsocket_v4 = mptcp_v4_add_lsrr, + .name = "binder", + .owner = THIS_MODULE, +}; + +static struct ctl_table binder_table[] = { + { + .procname = "mptcp_binder_gateways", + .data = &sysctl_mptcp_binder_gateways, + .maxlen = sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN, + .mode = 0644, + .proc_handler = &proc_mptcp_gateways + }, + { } +}; + +static struct ctl_table_header *mptcp_sysctl_binder; + +/* General initialization of MPTCP_PM */ +static int __init binder_register(void) +{ + mptcp_gws = kzalloc(sizeof(*mptcp_gws), GFP_KERNEL); + if (!mptcp_gws) + return -ENOMEM; + + rwlock_init(&mptcp_gws_lock); + + BUILD_BUG_ON(sizeof(struct binder_priv) > MPTCP_PM_SIZE); + + mptcp_sysctl_binder = register_net_sysctl(&init_net, "net/mptcp", + binder_table); + if (!mptcp_sysctl_binder) + goto sysctl_fail; + + if (mptcp_register_path_manager(&binder)) + goto pm_failed; + + return 0; + +pm_failed: + unregister_net_sysctl_table(mptcp_sysctl_binder); +sysctl_fail: + kfree(mptcp_gws); + + return -1; +} + +static void binder_unregister(void) +{ + mptcp_unregister_path_manager(&binder); + unregister_net_sysctl_table(mptcp_sysctl_binder); + kfree(mptcp_gws); +} + +module_init(binder_register); +module_exit(binder_unregister); + +MODULE_AUTHOR("Luca Boccassi, Duncan Eastoe, Christoph Paasch (ndiffports)"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("BINDER MPTCP"); +MODULE_VERSION("0.1"); diff --git a/net/mptcp/mptcp_blest.c b/net/mptcp/mptcp_blest.c new file mode 100644 index 000000000000..40905a0d1fe5 --- /dev/null +++ b/net/mptcp/mptcp_blest.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP Scheduler to reduce HoL-blocking and spurious retransmissions. + * + * Algorithm Design: + * Simone Ferlin + * Ozgu Alay + * Olivier Mehani + * Roksana Boreli + * + * Initial Implementation: + * Simone Ferlin + * + * Additional Authors: + * Daniel Weber + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include + +static unsigned char lambda __read_mostly = 12; +module_param(lambda, byte, 0644); +MODULE_PARM_DESC(lambda, "Divided by 10 for scaling factor of fast flow rate estimation"); + +static unsigned char max_lambda __read_mostly = 13; +module_param(max_lambda, byte, 0644); +MODULE_PARM_DESC(max_lambda, "Divided by 10 for maximum scaling factor of fast flow rate estimation"); + +static unsigned char min_lambda __read_mostly = 10; +module_param(min_lambda, byte, 0644); +MODULE_PARM_DESC(min_lambda, "Divided by 10 for minimum scaling factor of fast flow rate estimation"); + +static unsigned char dyn_lambda_good = 10; /* 1% */ +module_param(dyn_lambda_good, byte, 0644); +MODULE_PARM_DESC(dyn_lambda_good, "Decrease of lambda in positive case."); + +static unsigned char dyn_lambda_bad = 40; /* 4% */ +module_param(dyn_lambda_bad, byte, 0644); +MODULE_PARM_DESC(dyn_lambda_bad, "Increase of lambda in negative case."); + +struct blestsched_priv { + u32 last_rbuf_opti; + u32 min_srtt_us; + u32 max_srtt_us; +}; + +struct blestsched_cb { + bool retrans_flag; + s16 lambda_1000; /* values range from min_lambda * 100 to max_lambda * 100 */ + u32 last_lambda_update; +}; + +static struct blestsched_priv *blestsched_get_priv(const struct tcp_sock *tp) +{ + return (struct blestsched_priv *)&tp->mptcp->mptcp_sched[0]; +} + +static struct blestsched_cb *blestsched_get_cb(const struct tcp_sock *tp) +{ + return (struct blestsched_cb *)&tp->mpcb->mptcp_sched[0]; +} + +static void blestsched_update_lambda(struct sock *meta_sk, struct sock *sk) +{ + struct blestsched_cb *blest_cb = blestsched_get_cb(tcp_sk(meta_sk)); + struct blestsched_priv *blest_p = blestsched_get_priv(tcp_sk(sk)); + + if (tcp_jiffies32 - blest_cb->last_lambda_update < usecs_to_jiffies(blest_p->min_srtt_us >> 3)) + return; + + /* if there have been retransmissions of packets of the slow flow + * during the slow flows last RTT => increase lambda + * otherwise decrease + */ + if (blest_cb->retrans_flag) { + /* need to slow down on the slow flow */ + blest_cb->lambda_1000 += dyn_lambda_bad; + } else { + /* use the slow flow more */ + blest_cb->lambda_1000 -= dyn_lambda_good; + } + blest_cb->retrans_flag = false; + + /* cap lambda_1000 to its value range */ + blest_cb->lambda_1000 = min_t(s16, blest_cb->lambda_1000, max_lambda * 100); + blest_cb->lambda_1000 = max_t(s16, blest_cb->lambda_1000, min_lambda * 100); + + blest_cb->last_lambda_update = tcp_jiffies32; +} + +/* how many bytes will sk send during the rtt of another, slower flow? */ +static u32 blestsched_estimate_bytes(struct sock *sk, u32 time_8) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct blestsched_priv *blest_p = blestsched_get_priv(tp); + struct blestsched_cb *blest_cb = blestsched_get_cb(mptcp_meta_tp(tp)); + u32 avg_rtt, num_rtts, ca_cwnd, packets; + + avg_rtt = (blest_p->min_srtt_us + blest_p->max_srtt_us) / 2; + if (avg_rtt == 0) + num_rtts = 1; /* sanity */ + else + num_rtts = (time_8 / avg_rtt) + 1; /* round up */ + + /* during num_rtts, how many bytes will be sent on the flow? 
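+ * (e.g. cwnd = 10 and num_rtts = 3 in initial slow start give
+ * 10 * (2^3 - 1) = 70 packets; congestion avoidance instead sums the
+ * arithmetic series cwnd + (cwnd + 1) + ...);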
+ * assumes for simplification that Reno is applied as congestion-control + */ + if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) { + /* we are in initial slow start */ + if (num_rtts > 16) + num_rtts = 16; /* cap for sanity */ + packets = tp->snd_cwnd * ((1 << num_rtts) - 1); /* cwnd + 2*cwnd + 4*cwnd */ + } else { + ca_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh + 1); /* assume we jump to CA already */ + packets = (ca_cwnd + (num_rtts - 1) / 2) * num_rtts; + } + + return div_u64(((u64)packets) * tp->mss_cache * blest_cb->lambda_1000, 1000); +} + +static u32 blestsched_estimate_linger_time(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct blestsched_priv *blest_p = blestsched_get_priv(tp); + u32 estimate, slope, inflight, cwnd; + + inflight = tcp_packets_in_flight(tp) + 1; /* take into account the new one */ + cwnd = tp->snd_cwnd; + + if (inflight >= cwnd) { + estimate = blest_p->max_srtt_us; + } else { + slope = blest_p->max_srtt_us - blest_p->min_srtt_us; + if (cwnd == 0) + cwnd = 1; /* sanity */ + estimate = blest_p->min_srtt_us + (slope * inflight) / cwnd; + } + + return (tp->srtt_us > estimate) ? tp->srtt_us : estimate; +} + +/* This is the BLEST scheduler. This function decides on which flow to send + * a given MSS. If all subflows are found to be busy or the currently best + * subflow is estimated to possibly cause HoL-blocking, NULL is returned. + */ +struct sock *blest_get_available_subflow(struct sock *meta_sk, struct sk_buff *skb, + bool zero_wnd_test) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sock *bestsk, *minsk = NULL; + struct tcp_sock *meta_tp, *besttp; + struct mptcp_tcp_sock *mptcp; + struct blestsched_priv *blest_p; + u32 min_srtt = U32_MAX; + + /* Answer data_fin on same subflow!!! */ + if (meta_sk->sk_shutdown & RCV_SHUTDOWN && + skb && mptcp_is_data_fin(skb)) { + mptcp_for_each_sub(mpcb, mptcp) { + bestsk = mptcp_to_sock(mptcp); + + if (tcp_sk(bestsk)->mptcp->path_index == mpcb->dfin_path_index && + mptcp_is_available(bestsk, skb, zero_wnd_test)) + return bestsk; + } + } + + /* First, find the overall best subflow */ + mptcp_for_each_sub(mpcb, mptcp) { + bestsk = mptcp_to_sock(mptcp); + besttp = tcp_sk(bestsk); + blest_p = blestsched_get_priv(besttp); + + /* Set of states for which we are allowed to send data */ + if (!mptcp_sk_can_send(bestsk)) + continue; + + /* We do not send data on this subflow unless it is + * fully established, i.e. the 4th ack has been received. + */ + if (besttp->mptcp->pre_established) + continue; + + blest_p->min_srtt_us = min(blest_p->min_srtt_us, besttp->srtt_us); + blest_p->max_srtt_us = max(blest_p->max_srtt_us, besttp->srtt_us); + + /* record minimal rtt */ + if (besttp->srtt_us < min_srtt) { + min_srtt = besttp->srtt_us; + minsk = bestsk; + } + } + + /* find the current best subflow according to the default scheduler */ + bestsk = get_available_subflow(meta_sk, skb, zero_wnd_test); + + /* if we decided to use a slower flow, we have the option of not using it at all */ + if (bestsk && minsk && bestsk != minsk) { + u32 slow_linger_time, fast_bytes, slow_inflight_bytes, slow_bytes, avail_space; + u32 buffered_bytes = 0; + + meta_tp = tcp_sk(meta_sk); + besttp = tcp_sk(bestsk); + + blestsched_update_lambda(meta_sk, bestsk); + + /* if we send this SKB now, it will be acked in besttp->srtt seconds + * during this time: how many bytes will we send on the fast flow? 
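+ * If that amount no longer fits into the meta send window once the slow
+ * flow's in-flight bytes are accounted for, sending here would hold back
+ * the fast flow, so NULL is returned instead.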
+ */ + slow_linger_time = blestsched_estimate_linger_time(bestsk); + fast_bytes = blestsched_estimate_bytes(minsk, slow_linger_time); + + if (skb) + buffered_bytes = skb->len; + + /* is the required space available in the mptcp meta send window? + * we assume that all bytes inflight on the slow path will be acked in besttp->srtt seconds + * (just like the SKB if it was sent now) -> that means that those inflight bytes will + * keep occupying space in the meta window until then + */ + slow_inflight_bytes = besttp->write_seq - besttp->snd_una; + slow_bytes = buffered_bytes + slow_inflight_bytes; // bytes of this SKB plus those in flight already + + avail_space = (slow_bytes < meta_tp->snd_wnd) ? (meta_tp->snd_wnd - slow_bytes) : 0; + + if (fast_bytes > avail_space) { + /* sending this SKB on the slow flow means + * we wouldn't be able to send all the data we'd like to send on the fast flow + * so don't do that + */ + return NULL; + } + } + + return bestsk; +} + +/* copy from mptcp_sched.c: mptcp_rcv_buf_optimization */ +static struct sk_buff *mptcp_blest_rcv_buf_optimization(struct sock *sk, int penal) +{ + struct sock *meta_sk; + const struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_tcp_sock *mptcp; + struct sk_buff *skb_head; + struct blestsched_priv *blest_p = blestsched_get_priv(tp); + struct blestsched_cb *blest_cb; + + meta_sk = mptcp_meta_sk(sk); + skb_head = tcp_rtx_queue_head(meta_sk); + + if (!skb_head) + return NULL; + + /* If penalization is optional (coming from mptcp_next_segment() and + * We are not send-buffer-limited we do not penalize. The retransmission + * is just an optimization to fix the idle-time due to the delay before + * we wake up the application. + */ + if (!penal && sk_stream_memory_free(meta_sk)) + goto retrans; + + /* Record the occurrence of a retransmission to update the lambda value */ + blest_cb = blestsched_get_cb(tcp_sk(meta_sk)); + blest_cb->retrans_flag = true; + + /* Only penalize again after an RTT has elapsed */ + if (tcp_jiffies32 - blest_p->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3)) + goto retrans; + + /* Half the cwnd of the slow flows */ + mptcp_for_each_sub(tp->mpcb, mptcp) { + struct tcp_sock *tp_it = mptcp->tp; + + if (tp_it != tp && + TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { + if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) { + u32 prior_cwnd = tp_it->snd_cwnd; + + tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U); + + /* If in slow start, do not reduce the ssthresh */ + if (prior_cwnd >= tp_it->snd_ssthresh) + tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U); + + blest_p->last_rbuf_opti = tcp_jiffies32; + } + } + } + +retrans: + + /* Segment not yet injected into this path? Take it!!! */ + if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) { + bool do_retrans = false; + mptcp_for_each_sub(tp->mpcb, mptcp) { + struct tcp_sock *tp_it = mptcp->tp; + + if (tp_it != tp && + TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { + if (tp_it->snd_cwnd <= 4) { + do_retrans = true; + break; + } + + if (4 * tp->srtt_us >= tp_it->srtt_us) { + do_retrans = false; + break; + } else { + do_retrans = true; + } + } + } + + if (do_retrans && mptcp_is_available(sk, skb_head, false)) { + trace_mptcp_retransmit(sk, skb_head); + return skb_head; + } + } + return NULL; +} + +/* copy from mptcp_sched.c: __mptcp_next_segment */ +/* Returns the next segment to be sent from the mptcp meta-queue. 
+ * (chooses the reinject queue if any segment is waiting in it, otherwise, + * chooses the normal write queue). + * Sets *@reinject to 1 if the returned segment comes from the + * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, + * and sets it to -1 if it is a meta-level retransmission to optimize the + * receive-buffer. + */ +static struct sk_buff *__mptcp_blest_next_segment(struct sock *meta_sk, int *reinject) +{ + const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sk_buff *skb = NULL; + + *reinject = 0; + + /* If we are in fallback-mode, just take from the meta-send-queue */ + if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) + return tcp_send_head(meta_sk); + + skb = skb_peek(&mpcb->reinject_queue); + + if (skb) { + *reinject = 1; + } else { + skb = tcp_send_head(meta_sk); + + if (!skb && meta_sk->sk_socket && + test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) && + sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) { + struct sock *subsk = blest_get_available_subflow(meta_sk, NULL, + false); + if (!subsk) + return NULL; + + skb = mptcp_blest_rcv_buf_optimization(subsk, 0); + if (skb) + *reinject = -1; + } + } + return skb; +} + +/* copy from mptcp_sched.c: mptcp_next_segment */ +static struct sk_buff *mptcp_blest_next_segment(struct sock *meta_sk, + int *reinject, + struct sock **subsk, + unsigned int *limit) +{ + struct sk_buff *skb = __mptcp_blest_next_segment(meta_sk, reinject); + unsigned int mss_now; + struct tcp_sock *subtp; + u16 gso_max_segs; + u32 max_len, max_segs, window, needed; + + /* As we set it, we have to reset it as well. */ + *limit = 0; + + if (!skb) + return NULL; + + *subsk = blest_get_available_subflow(meta_sk, skb, false); + if (!*subsk) + return NULL; + + subtp = tcp_sk(*subsk); + mss_now = tcp_current_mss(*subsk); + + if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) { + skb = mptcp_blest_rcv_buf_optimization(*subsk, 1); + if (skb) + *reinject = -1; + else + return NULL; + } + + /* No splitting required, as we will only send one single segment */ + if (skb->len <= mss_now) + return skb; + + /* The following is similar to tcp_mss_split_point, but + * we do not care about nagle, because we will anyways + * use TCP_NAGLE_PUSH, which overrides this. + * + * So, we first limit according to the cwnd/gso-size and then according + * to the subflow's window. 
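+ * Whichever bound applies is reported back through *limit so the caller
+ * can split the skb before it is queued on the chosen subflow.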
+ */ + + gso_max_segs = (*subsk)->sk_gso_max_segs; + if (!gso_max_segs) /* No gso supported on the subflow's NIC */ + gso_max_segs = 1; + max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs); + if (!max_segs) + return NULL; + + max_len = mss_now * max_segs; + window = tcp_wnd_end(subtp) - subtp->write_seq; + + needed = min(skb->len, window); + if (max_len <= skb->len) + /* Take max_win, which is actually the cwnd/gso-size */ + *limit = max_len; + else + /* Or, take the window */ + *limit = needed; + + return skb; +} + +static void blestsched_init(struct sock *sk) +{ + struct blestsched_priv *blest_p = blestsched_get_priv(tcp_sk(sk)); + struct blestsched_cb *blest_cb = blestsched_get_cb(tcp_sk(mptcp_meta_sk(sk))); + + blest_p->last_rbuf_opti = tcp_jiffies32; + blest_p->min_srtt_us = U32_MAX; + blest_p->max_srtt_us = 0; + + if (!blest_cb->lambda_1000) { + blest_cb->lambda_1000 = lambda * 100; + blest_cb->last_lambda_update = tcp_jiffies32; + } +} + +static struct mptcp_sched_ops mptcp_sched_blest = { + .get_subflow = blest_get_available_subflow, + .next_segment = mptcp_blest_next_segment, + .init = blestsched_init, + .name = "blest", + .owner = THIS_MODULE, +}; + +static int __init blest_register(void) +{ + BUILD_BUG_ON(sizeof(struct blestsched_priv) > MPTCP_SCHED_SIZE); + BUILD_BUG_ON(sizeof(struct blestsched_cb) > MPTCP_SCHED_DATA_SIZE); + + if (mptcp_register_scheduler(&mptcp_sched_blest)) + return -1; + + return 0; +} + +static void blest_unregister(void) +{ + mptcp_unregister_scheduler(&mptcp_sched_blest); +} + +module_init(blest_register); +module_exit(blest_unregister); + +MODULE_AUTHOR("Simone Ferlin, Daniel Weber"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("BLEST scheduler for MPTCP, based on default minimum RTT scheduler"); +MODULE_VERSION("0.95"); diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c new file mode 100644 index 000000000000..9eb7628053f6 --- /dev/null +++ b/net/mptcp/mptcp_coupled.c @@ -0,0 +1,262 @@ +/* + * MPTCP implementation - Linked Increase congestion control Algorithm (LIA) + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include + +#include + +/* Scaling is done in the numerator with alpha_scale_num and in the denominator + * with alpha_scale_den. + * + * To downscale, we just need to use alpha_scale. 
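+ *
+ * The constants below are shift counts: the numerator is scaled by 2^32
+ * and the squared denominator by (2^10)^2, so alpha carries a net scale
+ * of 2^12 = 2^(32 - 2 * 10).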
+ * + * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2) + */ +static int alpha_scale_den = 10; +static int alpha_scale_num = 32; +static int alpha_scale = 12; + +struct mptcp_ccc { + u64 alpha; + bool forced_update; +}; + +static inline int mptcp_ccc_sk_can_send(const struct sock *sk) +{ + return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us; +} + +static inline u64 mptcp_get_alpha(const struct sock *meta_sk) +{ + return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha; +} + +static inline void mptcp_set_alpha(const struct sock *meta_sk, u64 alpha) +{ + ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha = alpha; +} + +static inline u64 mptcp_ccc_scale(u32 val, int scale) +{ + return (u64) val << scale; +} + +static inline bool mptcp_get_forced(const struct sock *meta_sk) +{ + return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update; +} + +static inline void mptcp_set_forced(const struct sock *meta_sk, bool force) +{ + ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update = force; +} + +static void mptcp_ccc_recalc_alpha(const struct sock *sk) +{ + const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; + const struct mptcp_tcp_sock *mptcp; + int best_cwnd = 0, best_rtt = 0, can_send = 0; + u64 max_numerator = 0, sum_denominator = 0, alpha = 1; + + if (!mpcb) + return; + + /* Do regular alpha-calculation for multiple subflows */ + + /* Find the max numerator of the alpha-calculation */ + mptcp_for_each_sub(mpcb, mptcp) { + const struct sock *sub_sk = mptcp_to_sock(mptcp); + struct tcp_sock *sub_tp = tcp_sk(sub_sk); + u64 tmp; + + if (!mptcp_ccc_sk_can_send(sub_sk)) + continue; + + can_send++; + + /* We need to look for the path, that provides the max-value. + * Integer-overflow is not possible here, because + * tmp will be in u64. + */ + tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd, + alpha_scale_num), (u64)sub_tp->srtt_us * sub_tp->srtt_us); + + if (tmp >= max_numerator) { + max_numerator = tmp; + best_cwnd = sub_tp->snd_cwnd; + best_rtt = sub_tp->srtt_us; + } + } + + /* No subflow is able to send - we don't care anymore */ + if (unlikely(!can_send)) + goto exit; + + /* Calculate the denominator */ + mptcp_for_each_sub(mpcb, mptcp) { + const struct sock *sub_sk = mptcp_to_sock(mptcp); + struct tcp_sock *sub_tp = tcp_sk(sub_sk); + + if (!mptcp_ccc_sk_can_send(sub_sk)) + continue; + + sum_denominator += div_u64( + mptcp_ccc_scale(sub_tp->snd_cwnd, + alpha_scale_den) * best_rtt, + sub_tp->srtt_us); + } + sum_denominator *= sum_denominator; + if (unlikely(!sum_denominator)) { + pr_err("%s: sum_denominator == 0\n", __func__); + mptcp_for_each_sub(mpcb, mptcp) { + const struct sock *sub_sk = mptcp_to_sock(mptcp); + struct tcp_sock *sub_tp = tcp_sk(sub_sk); + pr_err("%s: pi:%d, state:%d\n, rtt:%u, cwnd: %u", + __func__, sub_tp->mptcp->path_index, + sub_sk->sk_state, sub_tp->srtt_us, + sub_tp->snd_cwnd); + } + } + + alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator); + + if (unlikely(!alpha)) + alpha = 1; + +exit: + mptcp_set_alpha(mptcp_meta_sk(sk), alpha); +} + +static void mptcp_ccc_init(struct sock *sk) +{ + if (mptcp(tcp_sk(sk))) { + mptcp_set_forced(mptcp_meta_sk(sk), 0); + mptcp_set_alpha(mptcp_meta_sk(sk), 1); + } + /* If we do not mptcp, behave like reno: return */ +} + +static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_LOSS) + mptcp_ccc_recalc_alpha(sk); +} + +static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state) +{ + if (!mptcp(tcp_sk(sk))) + return; + + 
mptcp_set_forced(mptcp_meta_sk(sk), 1); +} + +static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + int snd_cwnd; + u64 alpha; + + if (!mptcp(tp)) { + tcp_reno_cong_avoid(sk, ack, acked); + return; + } + + if (!tcp_is_cwnd_limited(sk)) + return; + + if (tcp_in_slow_start(tp)) { + /* In "safe" area, increase. */ + tcp_slow_start(tp, acked); + mptcp_ccc_recalc_alpha(sk); + return; + } + + if (mptcp_get_forced(mptcp_meta_sk(sk))) { + mptcp_ccc_recalc_alpha(sk); + mptcp_set_forced(mptcp_meta_sk(sk), 0); + } + + alpha = mptcp_get_alpha(mptcp_meta_sk(sk)); + + /* This may happen, if at the initialization, the mpcb + * was not yet attached to the sock, and thus + * initializing alpha failed. + */ + if (unlikely(!alpha)) + alpha = 1; + + snd_cwnd = (int)div_u64((u64)mptcp_ccc_scale(1, alpha_scale), alpha); + + /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd) + * Thus, we select here the max value. + */ + if (snd_cwnd < tp->snd_cwnd) + snd_cwnd = tp->snd_cwnd; + + if (tp->snd_cwnd_cnt >= snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) { + tp->snd_cwnd++; + mptcp_ccc_recalc_alpha(sk); + } + + tp->snd_cwnd_cnt = 0; + } else { + tp->snd_cwnd_cnt++; + } +} + +static struct tcp_congestion_ops mptcp_ccc = { + .init = mptcp_ccc_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = mptcp_ccc_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, + .cwnd_event = mptcp_ccc_cwnd_event, + .set_state = mptcp_ccc_set_state, + .owner = THIS_MODULE, + .name = "lia", +}; + +static int __init mptcp_ccc_register(void) +{ + BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&mptcp_ccc); +} + +static void __exit mptcp_ccc_unregister(void) +{ + tcp_unregister_congestion_control(&mptcp_ccc); +} + +module_init(mptcp_ccc_register); +module_exit(mptcp_ccc_unregister); + +MODULE_AUTHOR("Christoph Paasch, Sébastien Barré"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MPTCP LINKED INCREASE CONGESTION CONTROL ALGORITHM"); +MODULE_VERSION("0.1"); diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c new file mode 100644 index 000000000000..f3de9da4a8ff --- /dev/null +++ b/net/mptcp/mptcp_ctrl.c @@ -0,0 +1,3184 @@ +/* + * MPTCP implementation - MPTCP-control + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#endif +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *mptcp_sock_cache __read_mostly; +static struct kmem_cache *mptcp_cb_cache __read_mostly; +static struct kmem_cache *mptcp_tw_cache __read_mostly; + +int sysctl_mptcp_enabled __read_mostly = 1; +int sysctl_mptcp_version __read_mostly = 0; +static int min_mptcp_version; +static int max_mptcp_version = 1; +int sysctl_mptcp_checksum __read_mostly = 1; +int sysctl_mptcp_debug __read_mostly; +EXPORT_SYMBOL(sysctl_mptcp_debug); +int sysctl_mptcp_syn_retries __read_mostly = 3; + +bool mptcp_init_failed __read_mostly; + +struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(mptcp_static_key); + +static void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn); + +static int proc_mptcp_path_manager(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + char val[MPTCP_PM_NAME_MAX]; + struct ctl_table tbl = { + .data = val, + .maxlen = MPTCP_PM_NAME_MAX, + }; + int ret; + + mptcp_get_default_path_manager(val); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = mptcp_set_default_path_manager(val); + return ret; +} + +static int proc_mptcp_scheduler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + char val[MPTCP_SCHED_NAME_MAX]; + struct ctl_table tbl = { + .data = val, + .maxlen = MPTCP_SCHED_NAME_MAX, + }; + int ret; + + mptcp_get_default_scheduler(val); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = mptcp_set_default_scheduler(val); + return ret; +} + +static struct ctl_table mptcp_table[] = { + { + .procname = "mptcp_enabled", + .data = &sysctl_mptcp_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "mptcp_version", + .data = &sysctl_mptcp_version, + .mode = 0644, + .maxlen = sizeof(int), + .proc_handler = &proc_dointvec_minmax, + .extra1 = &min_mptcp_version, + .extra2 = &max_mptcp_version, + }, + { + .procname = "mptcp_checksum", + .data = &sysctl_mptcp_checksum, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "mptcp_debug", + .data = &sysctl_mptcp_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "mptcp_syn_retries", + .data = &sysctl_mptcp_syn_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "mptcp_path_manager", + .mode = 0644, + .maxlen = MPTCP_PM_NAME_MAX, + .proc_handler = proc_mptcp_path_manager, + }, + { + .procname = "mptcp_scheduler", + .mode = 0644, + .maxlen = MPTCP_SCHED_NAME_MAX, + .proc_handler = proc_mptcp_scheduler, + }, + { } +}; + +static inline u32 mptcp_hash_tk(u32 token) +{ + return token % MPTCP_HASH_SIZE; +} + +struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE]; +EXPORT_SYMBOL(tk_hashtable); + +/* The following hash table is used to avoid collision of token */ +static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE]; + +/* Lock, protecting the two hash-tables that hold the token. 
Namely, + * mptcp_reqsk_tk_htb and tk_hashtable + */ +static spinlock_t mptcp_tk_hashlock; + +static bool mptcp_reqsk_find_tk(const u32 token) +{ + const u32 hash = mptcp_hash_tk(token); + const struct mptcp_request_sock *mtreqsk; + const struct hlist_nulls_node *node; + +begin: + hlist_nulls_for_each_entry_rcu(mtreqsk, node, + &mptcp_reqsk_tk_htb[hash], hash_entry) { + if (token == mtreqsk->mptcp_loc_token) + return true; + } + /* A request-socket is destroyed by RCU. So, it might have been recycled + * and put into another hash-table list. So, after the lookup we may + * end up in a different list. So, we may need to restart. + * + * See also the comment in __inet_lookup_established. + */ + if (get_nulls_value(node) != hash) + goto begin; + return false; +} + +static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, const u32 token) +{ + u32 hash = mptcp_hash_tk(token); + + hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->hash_entry, + &mptcp_reqsk_tk_htb[hash]); +} + +static void mptcp_reqsk_remove_tk(const struct request_sock *reqsk) +{ + rcu_read_lock(); + local_bh_disable(); + spin_lock(&mptcp_tk_hashlock); + hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->hash_entry); + spin_unlock(&mptcp_tk_hashlock); + local_bh_enable(); + rcu_read_unlock(); +} + +void mptcp_reqsk_destructor(struct request_sock *req) +{ + if (!mptcp_rsk(req)->is_sub) + mptcp_reqsk_remove_tk(req); +} + +static void __mptcp_hash_insert(struct tcp_sock *meta_tp, const u32 token) +{ + u32 hash = mptcp_hash_tk(token); + hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]); + meta_tp->inside_tk_table = 1; +} + +static bool mptcp_find_token(u32 token) +{ + const u32 hash = mptcp_hash_tk(token); + const struct tcp_sock *meta_tp; + const struct hlist_nulls_node *node; + +begin: + hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) { + if (token == meta_tp->mptcp_loc_token) + return true; + } + /* A TCP-socket is destroyed by RCU. So, it might have been recycled + * and put into another hash-table list. So, after the lookup we may + * end up in a different list. So, we may need to restart. + * + * See also the comment in __inet_lookup_established. + */ + if (get_nulls_value(node) != hash) + goto begin; + return false; +} + +static void mptcp_set_key_reqsk(struct request_sock *req, + const struct sk_buff *skb, + u32 seed) +{ + const struct inet_request_sock *ireq = inet_rsk(req); + struct mptcp_request_sock *mtreq = mptcp_rsk(req); + + if (skb->protocol == htons(ETH_P_IP)) { + mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + htons(ireq->ir_num), + ireq->ir_rmt_port, + seed); +#if IS_ENABLED(CONFIG_IPV6) + } else { + mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32, + ipv6_hdr(skb)->daddr.s6_addr32, + htons(ireq->ir_num), + ireq->ir_rmt_port, + seed); +#endif + } + + mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL); +} + +/* New MPTCP-connection request, prepare a new token for the meta-socket that + * will be created in mptcp_check_req_master(), and store the received token. 
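+ *
+ * The local key is re-derived with a fresh seed until the resulting token
+ * collides neither with a pending request nor with an established
+ * meta-socket; both tables are checked under mptcp_tk_hashlock.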
+ */ +static void mptcp_reqsk_new_mptcp(struct request_sock *req, + const struct sock *sk, + const struct mptcp_options_received *mopt, + const struct sk_buff *skb) +{ + struct mptcp_request_sock *mtreq = mptcp_rsk(req); + const struct tcp_sock *tp = tcp_sk(sk); + + inet_rsk(req)->saw_mpc = 1; + + /* MPTCP version agreement */ + if (mopt->mptcp_ver >= tp->mptcp_ver) + mtreq->mptcp_ver = tp->mptcp_ver; + else + mtreq->mptcp_ver = mopt->mptcp_ver; + + rcu_read_lock(); + local_bh_disable(); + spin_lock(&mptcp_tk_hashlock); + do { + mptcp_set_key_reqsk(req, skb, mptcp_seed++); + } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) || + mptcp_find_token(mtreq->mptcp_loc_token)); + mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token); + spin_unlock(&mptcp_tk_hashlock); + local_bh_enable(); + rcu_read_unlock(); + mtreq->mptcp_rem_key = mopt->mptcp_sender_key; +} + +static int mptcp_reqsk_new_cookie(struct request_sock *req, + const struct sock *sk, + const struct mptcp_options_received *mopt, + const struct sk_buff *skb) +{ + struct mptcp_request_sock *mtreq = mptcp_rsk(req); + + /* MPTCP version agreement */ + if (mopt->mptcp_ver >= tcp_sk(sk)->mptcp_ver) + mtreq->mptcp_ver = tcp_sk(sk)->mptcp_ver; + else + mtreq->mptcp_ver = mopt->mptcp_ver; + + rcu_read_lock(); + local_bh_disable(); + spin_lock(&mptcp_tk_hashlock); + + mptcp_set_key_reqsk(req, skb, tcp_rsk(req)->snt_isn); + + if (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) || + mptcp_find_token(mtreq->mptcp_loc_token)) { + spin_unlock(&mptcp_tk_hashlock); + local_bh_enable(); + rcu_read_unlock(); + return false; + } + + inet_rsk(req)->saw_mpc = 1; + + spin_unlock(&mptcp_tk_hashlock); + local_bh_enable(); + rcu_read_unlock(); + + mtreq->mptcp_rem_key = mopt->mptcp_sender_key; + + return true; +} + +static void mptcp_set_key_sk(const struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_sock *isk = inet_sk(sk); + + if (sk->sk_family == AF_INET) + tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr, + isk->inet_daddr, + isk->inet_sport, + isk->inet_dport, + mptcp_seed++); +#if IS_ENABLED(CONFIG_IPV6) + else + tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, + isk->inet_sport, + isk->inet_dport, + mptcp_seed++); +#endif + + mptcp_key_sha1(tp->mptcp_loc_key, + &tp->mptcp_loc_token, NULL); +} + +#ifdef CONFIG_JUMP_LABEL +static atomic_t mptcp_needed_deferred; +static atomic_t mptcp_wanted; + +static void mptcp_clear(struct work_struct *work) +{ + int deferred = atomic_xchg(&mptcp_needed_deferred, 0); + int wanted; + + wanted = atomic_add_return(deferred, &mptcp_wanted); + if (wanted > 0) + static_key_enable(&mptcp_static_key); + else + static_key_disable(&mptcp_static_key); +} + +static DECLARE_WORK(mptcp_work, mptcp_clear); +#endif + +static void mptcp_enable_static_key_bh(void) +{ +#ifdef CONFIG_JUMP_LABEL + int wanted; + + while (1) { + wanted = atomic_read(&mptcp_wanted); + if (wanted <= 0) + break; + if (atomic_cmpxchg(&mptcp_wanted, wanted, wanted + 1) == wanted) + return; + } + atomic_inc(&mptcp_needed_deferred); + schedule_work(&mptcp_work); +#else + static_key_slow_inc(&mptcp_static_key); +#endif +} + +static void mptcp_enable_static_key(void) +{ +#ifdef CONFIG_JUMP_LABEL + atomic_inc(&mptcp_wanted); + static_key_enable(&mptcp_static_key); +#else + static_key_slow_inc(&mptcp_static_key); +#endif +} + +void mptcp_disable_static_key(void) +{ +#ifdef CONFIG_JUMP_LABEL + int wanted; + + while (1) { + wanted = atomic_read(&mptcp_wanted); + if (wanted <= 1) + break; + if 
(atomic_cmpxchg(&mptcp_wanted, wanted, wanted - 1) == wanted) + return; + } + atomic_dec(&mptcp_needed_deferred); + schedule_work(&mptcp_work); +#else + static_key_slow_dec(&mptcp_static_key); +#endif +} + +void mptcp_enable_sock(struct sock *sk) +{ + if (!sock_flag(sk, SOCK_MPTCP)) { + sock_set_flag(sk, SOCK_MPTCP); + tcp_sk(sk)->mptcp_ver = sysctl_mptcp_version; + + /* Necessary here, because MPTCP can be enabled/disabled through + * a setsockopt. + */ + if (sk->sk_family == AF_INET) + inet_csk(sk)->icsk_af_ops = &mptcp_v4_specific; +#if IS_ENABLED(CONFIG_IPV6) + else if (mptcp_v6_is_v4_mapped(sk)) + inet_csk(sk)->icsk_af_ops = &mptcp_v6_mapped; + else + inet_csk(sk)->icsk_af_ops = &mptcp_v6_specific; +#endif + + mptcp_enable_static_key(); + } +} + +void mptcp_disable_sock(struct sock *sk) +{ + if (sock_flag(sk, SOCK_MPTCP)) { + sock_reset_flag(sk, SOCK_MPTCP); + + /* Necessary here, because MPTCP can be enabled/disabled through + * a setsockopt. + */ + if (sk->sk_family == AF_INET) + inet_csk(sk)->icsk_af_ops = &ipv4_specific; +#if IS_ENABLED(CONFIG_IPV6) + else if (mptcp_v6_is_v4_mapped(sk)) + inet_csk(sk)->icsk_af_ops = &ipv6_mapped; + else + inet_csk(sk)->icsk_af_ops = &ipv6_specific; +#endif + + mptcp_disable_static_key(); + } +} + +void mptcp_connect_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + rcu_read_lock(); + local_bh_disable(); + spin_lock(&mptcp_tk_hashlock); + do { + mptcp_set_key_sk(sk); + } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) || + mptcp_find_token(tp->mptcp_loc_token)); + + __mptcp_hash_insert(tp, tp->mptcp_loc_token); + spin_unlock(&mptcp_tk_hashlock); + local_bh_enable(); + rcu_read_unlock(); + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); +} + +/** + * This function increments the refcount of the mpcb struct. + * It is the responsibility of the caller to decrement when releasing + * the structure. + */ +struct sock *mptcp_hash_find(const struct net *net, const u32 token) +{ + const u32 hash = mptcp_hash_tk(token); + const struct tcp_sock *meta_tp; + struct sock *meta_sk = NULL; + const struct hlist_nulls_node *node; + + rcu_read_lock(); + local_bh_disable(); +begin: + hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], + tk_table) { + meta_sk = (struct sock *)meta_tp; + if (token == meta_tp->mptcp_loc_token && + net_eq(net, sock_net(meta_sk))) { + if (unlikely(!refcount_inc_not_zero(&meta_sk->sk_refcnt))) + goto out; + if (unlikely(token != meta_tp->mptcp_loc_token || + !net_eq(net, sock_net(meta_sk)))) { + sock_gen_put(meta_sk); + goto begin; + } + goto found; + } + } + /* A TCP-socket is destroyed by RCU. So, it might have been recycled + * and put into another hash-table list. So, after the lookup we may + * end up in a different list. So, we may need to restart. + * + * See also the comment in __inet_lookup_established. 
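+ *
+ * The nulls value terminating each chain encodes the chain's bucket, which
+ * is how ending up on a foreign chain is detected below.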
+ */ + if (get_nulls_value(node) != hash) + goto begin; +out: + meta_sk = NULL; +found: + local_bh_enable(); + rcu_read_unlock(); + return meta_sk; +} +EXPORT_SYMBOL_GPL(mptcp_hash_find); + +void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) +{ + /* remove from the token hashtable */ + rcu_read_lock(); + local_bh_disable(); + spin_lock(&mptcp_tk_hashlock); + hlist_nulls_del_init_rcu(&meta_tp->tk_table); + meta_tp->inside_tk_table = 0; + spin_unlock(&mptcp_tk_hashlock); + local_bh_enable(); + rcu_read_unlock(); +} + +struct sock *mptcp_select_ack_sock(const struct sock *meta_sk) +{ + const struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct sock *rttsk = NULL, *lastsk = NULL; + u32 min_time = 0, last_active = 0; + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(meta_tp->mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + struct tcp_sock *tp = tcp_sk(sk); + u32 elapsed; + + if (!mptcp_sk_can_send_ack(sk) || tp->pf) + continue; + + elapsed = keepalive_time_elapsed(tp); + + /* We take the one with the lowest RTT within a reasonable + * (meta-RTO)-timeframe + */ + if (elapsed < inet_csk(meta_sk)->icsk_rto) { + if (!min_time || tp->srtt_us < min_time) { + min_time = tp->srtt_us; + rttsk = sk; + } + continue; + } + + /* Otherwise, we just take the most recent active */ + if (!rttsk && (!last_active || elapsed < last_active)) { + last_active = elapsed; + lastsk = sk; + } + } + + if (rttsk) + return rttsk; + + return lastsk; +} +EXPORT_SYMBOL(mptcp_select_ack_sock); + +static void mptcp_sock_def_error_report(struct sock *sk) +{ + const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; + struct tcp_sock *tp = tcp_sk(sk); + + if (!sock_flag(sk, SOCK_DEAD)) { + if (tp->send_mp_fclose && sk->sk_err == ETIMEDOUT) { + /* Called by the keep alive timer (tcp_write_timeout), + * when the limit of fastclose retransmissions has been + * reached. Send a TCP RST to clear the status of any + * stateful firewall (typically conntrack) which are + * not aware of mptcp and cannot understand the + * fastclose option. + */ + tp->ops->send_active_reset(sk, GFP_ATOMIC); + } + } + + /* record this info that can be used by PM after the sf close */ + tp->mptcp->sk_err = sk->sk_err; + + if (!tp->tcp_disconnect && mptcp_in_infinite_mapping_weak(mpcb)) { + struct sock *meta_sk = mptcp_meta_sk(sk); + + meta_sk->sk_err = sk->sk_err; + meta_sk->sk_err_soft = sk->sk_err_soft; + + if (!sock_flag(meta_sk, SOCK_DEAD)) + meta_sk->sk_error_report(meta_sk); + + WARN(meta_sk->sk_state == TCP_CLOSE, + "Meta already closed i_rcv %u i_snd %u send_i %u flags %#lx\n", + mpcb->infinite_mapping_rcv, mpcb->infinite_mapping_snd, + mpcb->send_infinite_mapping, meta_sk->sk_flags); + + if (meta_sk->sk_state != TCP_CLOSE) + tcp_done(meta_sk); + } + + sk->sk_err = 0; + return; +} + +void mptcp_mpcb_put(struct mptcp_cb *mpcb) +{ + if (refcount_dec_and_test(&mpcb->mpcb_refcnt)) { + mptcp_cleanup_path_manager(mpcb); + mptcp_cleanup_scheduler(mpcb); + kfree(mpcb->master_info); + kmem_cache_free(mptcp_cb_cache, mpcb); + } +} +EXPORT_SYMBOL(mptcp_mpcb_put); + +static void mptcp_mpcb_cleanup(struct mptcp_cb *mpcb) +{ + struct mptcp_tw *mptw; + + /* The mpcb is disappearing - we can make the final + * update to the rcv_nxt of the time-wait-sock and remove + * its reference to the mpcb. 
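+ *
+ * Each mptcp_tw on tw_list holds a reference on the mpcb, dropped here as
+ * the entry is unlinked; the trailing mptcp_mpcb_put() then drops the
+ * reference held on behalf of the meta-socket itself.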
+ */ + spin_lock_bh(&mpcb->mpcb_list_lock); + list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) { + list_del_rcu(&mptw->list); + mptw->in_list = 0; + mptcp_mpcb_put(mpcb); + rcu_assign_pointer(mptw->mpcb, NULL); + } + spin_unlock_bh(&mpcb->mpcb_list_lock); + + mptcp_mpcb_put(mpcb); +} + +static void mptcp_sock_destruct(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!is_meta_sk(sk)) { + BUG_ON(!hlist_unhashed(&tp->mptcp->cb_list)); + + kmem_cache_free(mptcp_sock_cache, tp->mptcp); + tp->mptcp = NULL; + + /* Taken when mpcb pointer was set */ + sock_put(mptcp_meta_sk(sk)); + mptcp_mpcb_put(tp->mpcb); + } else { + mptcp_debug("%s destroying meta-sk token %#x\n", __func__, + tcp_sk(sk)->mpcb->mptcp_loc_token); + + mptcp_mpcb_cleanup(tp->mpcb); + } + + WARN_ON(!static_key_false(&mptcp_static_key)); + + /* Must be called here, because this will decrement the jump-label. */ + inet_sock_destruct(sk); +} + +void mptcp_destroy_sock(struct sock *sk) +{ + if (is_meta_sk(sk)) { + struct mptcp_tcp_sock *mptcp; + struct hlist_node *tmp; + + __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue); + + /* We have to close all remaining subflows. Normally, they + * should all be about to get closed. But, if the kernel is + * forcing a closure (e.g., tcp_write_err), the subflows might + * not have been closed properly (as we are waiting for the + * DATA_ACK of the DATA_FIN). + */ + mptcp_for_each_sub_safe(tcp_sk(sk)->mpcb, mptcp, tmp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + /* Already did call tcp_close - waiting for graceful + * closure, or if we are retransmitting fast-close on + * the subflow. The reset (or timeout) will kill the + * subflow.. + */ + if (tcp_sk(sk_it)->closing || + tcp_sk(sk_it)->send_mp_fclose) + continue; + + /* Allow the delayed work first to prevent time-wait state */ + if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work)) + continue; + + mptcp_sub_close(sk_it, 0); + } + } else { + mptcp_del_sock(sk); + } +} + +static void mptcp_set_state(struct sock *sk) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + + /* Meta is not yet established - wake up the application */ + if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) && + sk->sk_state == TCP_ESTABLISHED) { + tcp_set_state(meta_sk, TCP_ESTABLISHED); + + if (!sock_flag(meta_sk, SOCK_DEAD)) { + meta_sk->sk_state_change(meta_sk); + sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT); + } + + tcp_sk(meta_sk)->lsndtime = tcp_jiffies32; + } + + if (sk->sk_state == TCP_CLOSE) { + if (!sock_flag(sk, SOCK_DEAD)) + mptcp_sub_close(sk, 0); + } +} + +static int mptcp_set_congestion_control(struct sock *meta_sk, const char *name, + bool load, bool reinit, bool cap_net_admin) +{ + struct mptcp_tcp_sock *mptcp; + int err, result = 0; + + result = __tcp_set_congestion_control(meta_sk, name, load, reinit, cap_net_admin); + + tcp_sk(meta_sk)->mpcb->tcp_ca_explicit_set = true; + + mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + err = __tcp_set_congestion_control(sk_it, name, load, reinit, cap_net_admin); + if (err) + result = err; + } + return result; +} + +static void mptcp_assign_congestion_control(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *meta_icsk = inet_csk(mptcp_meta_sk(sk)); + const struct tcp_congestion_ops *ca = meta_icsk->icsk_ca_ops; + + /* Congestion control is the same as meta. Thus, it has been + * try_module_get'd by tcp_assign_congestion_control. 
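
mptcp_set_congestion_control() above applies the requested algorithm to the meta-socket first and then fans it out to every subflow, remembering the last subflow error while still attempting the remaining ones. The same fan-out pattern, reduced to a sketch (the set_cc() callback and struct flow are hypothetical stand-ins, not the kernel API):

#include <stddef.h>

struct flow;

/* Apply a setting to the meta flow and fan it out to all subflows. The meta
 * result is kept unless a subflow fails, in which case the last subflow
 * error is reported instead - mirroring the helper above.
 */
static int set_cc_all(struct flow *meta, struct flow **subs, size_t n,
		      const char *name,
		      int (*set_cc)(struct flow *f, const char *name))
{
	int err, result = set_cc(meta, name);
	size_t i;

	for (i = 0; i < n; i++) {
		err = set_cc(subs[i], name);
		if (err)
			result = err;
	}
	return result;
}
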
+ * Congestion control on meta was not explicitly configured by + * application, leave default or route based. + */ + if (icsk->icsk_ca_ops == ca || + !tcp_sk(mptcp_meta_sk(sk))->mpcb->tcp_ca_explicit_set) + return; + + /* Use the same congestion control as set on the meta-sk */ + if (!try_module_get(ca->owner)) { + /* This should never happen. The congestion control is linked + * to the meta-socket (through tcp_assign_congestion_control) + * who "holds" the refcnt on the module. + */ + WARN(1, "Could not get the congestion control!"); + return; + } + module_put(icsk->icsk_ca_ops->owner); + icsk->icsk_ca_ops = ca; + + /* Clear out private data before diag gets it and + * the ca has not been initialized. + */ + if (ca->get_info) + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + + return; +} + +siphash_key_t mptcp_secret __read_mostly; +u32 mptcp_seed = 0; + +static void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn) +{ + u32 workspace[SHA_WORKSPACE_WORDS]; + u32 mptcp_hashed_key[SHA_DIGEST_WORDS]; + u8 input[64]; + int i; + + memset(workspace, 0, sizeof(workspace)); + + /* Initialize input with appropriate padding */ + memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte + * is explicitly set too + */ + memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */ + input[8] = 0x80; /* Padding: First bit after message = 1 */ + input[63] = 0x40; /* Padding: Length of the message = 64 bits */ + + sha_init(mptcp_hashed_key); + sha_transform(mptcp_hashed_key, input, workspace); + + for (i = 0; i < 5; i++) + mptcp_hashed_key[i] = (__force u32)cpu_to_be32(mptcp_hashed_key[i]); + + if (token) + *token = mptcp_hashed_key[0]; + if (idsn) + *idsn = ntohll(*((__be64 *)&mptcp_hashed_key[3])); +} + +void mptcp_hmac_sha1(const u8 *key_1, const u8 *key_2, u32 *hash_out, + int arg_num, ...) 
+{ + u32 workspace[SHA_WORKSPACE_WORDS]; + u8 input[128]; /* 2 512-bit blocks */ + int i; + int index; + int length; + u8 *msg; + va_list list; + + memset(workspace, 0, sizeof(workspace)); + + /* Generate key xored with ipad */ + memset(input, 0x36, 64); + for (i = 0; i < 8; i++) + input[i] ^= key_1[i]; + for (i = 0; i < 8; i++) + input[i + 8] ^= key_2[i]; + + va_start(list, arg_num); + index = 64; + for (i = 0; i < arg_num; i++) { + length = va_arg(list, int); + msg = va_arg(list, u8 *); + BUG_ON(index + length > 125); /* Message is too long */ + memcpy(&input[index], msg, length); + index += length; + } + va_end(list); + + input[index] = 0x80; /* Padding: First bit after message = 1 */ + memset(&input[index + 1], 0, (126 - index)); + + /* Padding: Length of the message = 512 + message length (bits) */ + input[126] = 0x02; + input[127] = ((index - 64) * 8); /* Message length (bits) */ + + sha_init(hash_out); + sha_transform(hash_out, input, workspace); + memset(workspace, 0, sizeof(workspace)); + + sha_transform(hash_out, &input[64], workspace); + memset(workspace, 0, sizeof(workspace)); + + for (i = 0; i < 5; i++) + hash_out[i] = (__force u32)cpu_to_be32(hash_out[i]); + + /* Prepare second part of hmac */ + memset(input, 0x5C, 64); + for (i = 0; i < 8; i++) + input[i] ^= key_1[i]; + for (i = 0; i < 8; i++) + input[i + 8] ^= key_2[i]; + + memcpy(&input[64], hash_out, 20); + input[84] = 0x80; + memset(&input[85], 0, 41); + + /* Padding: Length of the message = 512 + 160 bits */ + input[126] = 0x02; + input[127] = 0xA0; + + sha_init(hash_out); + sha_transform(hash_out, input, workspace); + memset(workspace, 0, sizeof(workspace)); + + sha_transform(hash_out, &input[64], workspace); + + for (i = 0; i < 5; i++) + hash_out[i] = (__force u32)cpu_to_be32(hash_out[i]); +} +EXPORT_SYMBOL(mptcp_hmac_sha1); + +static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk) +{ + /* Socket-options handled by sk_clone_lock while creating the meta-sk. + * ====== + * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT, + * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER, + * TCP_NODELAY, TCP_CORK + * + * Socket-options handled in this function here + * ====== + * TCP_DEFER_ACCEPT + * SO_KEEPALIVE + * + * Socket-options on the todo-list + * ====== + * SO_BINDTODEVICE - should probably prevent creation of new subsocks + * across other devices. - what about the api-draft? + * SO_DEBUG + * SO_REUSEADDR - probably we don't care about this + * SO_DONTROUTE, SO_BROADCAST + * SO_OOBINLINE + * SO_LINGER + * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM + * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM + * SO_RXQ_OVFL + * TCP_COOKIE_TRANSACTIONS + * TCP_MAXSEG + * TCP_THIN_* - Handled by sk_clone_lock, but we need to support this + * in mptcp_meta_retransmit_timer. AND we need to check + * what is about the subsockets. 
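
mptcp_key_sha1() above follows the RFC 6824 key derivation: the 32-bit token is the most significant word of SHA-1(key) and the initial data sequence number (IDSN) is the least significant 64 bits of the same digest. A userspace sketch of that split, assuming some sha1() helper that fills a 20-byte digest and hashing the key as it sits in memory, like the kernel code does (the helper and names are hypothetical, not part of this patch):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper: writes the 20-byte SHA-1 digest of (data, len). */
void sha1(const void *data, size_t len, uint8_t digest[20]);

/* RFC 6824: token = first 32 bits of SHA-1(key),
 *           IDSN  = last 64 bits of SHA-1(key).
 */
static void key_to_token_idsn(uint64_t key, uint32_t *token, uint64_t *idsn)
{
	uint8_t digest[20];
	int i;

	sha1(&key, sizeof(key), digest);

	*token = ((uint32_t)digest[0] << 24) | (digest[1] << 16) |
		 (digest[2] << 8) | digest[3];

	*idsn = 0;
	for (i = 12; i < 20; i++)
		*idsn = (*idsn << 8) | digest[i];
}
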
+ * TCP_LINGER2 + * TCP_WINDOW_CLAMP + * TCP_USER_TIMEOUT + * TCP_MD5SIG + * + * Socket-options of no concern for the meta-socket (but for the subsocket) + * ====== + * SO_PRIORITY + * SO_MARK + * TCP_CONGESTION + * TCP_SYNCNT + * TCP_QUICKACK + */ + + /* DEFER_ACCEPT should not be set on the meta, as we want to accept new subflows directly */ + inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0; + + /* Keepalives are handled entirely at the MPTCP-layer */ + if (sock_flag(meta_sk, SOCK_KEEPOPEN)) { + inet_csk_reset_keepalive_timer(meta_sk, + keepalive_time_when(tcp_sk(meta_sk))); + sock_reset_flag(master_sk, SOCK_KEEPOPEN); + inet_csk_delete_keepalive_timer(master_sk); + } + + /* Do not propagate subflow-errors up to the MPTCP-layer */ + inet_sk(master_sk)->recverr = 0; +} + +/* Called without holding lock on meta_sk */ +static void mptcp_sub_inherit_sockopts(const struct sock *meta_sk, struct sock *sub_sk) +{ + __u8 meta_tos; + + /* IP_TOS also goes to the subflow. */ + meta_tos = READ_ONCE(inet_sk(meta_sk)->tos); + if (inet_sk(sub_sk)->tos != meta_tos) { + inet_sk(sub_sk)->tos = meta_tos; + sub_sk->sk_priority = meta_sk->sk_priority; + sk_dst_reset(sub_sk); + } + + /* Inherit SO_REUSEADDR */ + sub_sk->sk_reuse = meta_sk->sk_reuse; + + /* Inherit SO_MARK: can be used for routing or filtering */ + sub_sk->sk_mark = meta_sk->sk_mark; + + /* Inherit snd/rcv-buffer locks */ + sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + + /* Nagle/Cork is forced off on the subflows. It is handled at the meta-layer */ + tcp_sk(sub_sk)->nonagle = TCP_NAGLE_OFF|TCP_NAGLE_PUSH; + + /* Keepalives are handled entirely at the MPTCP-layer */ + if (sock_flag(sub_sk, SOCK_KEEPOPEN)) { + sock_reset_flag(sub_sk, SOCK_KEEPOPEN); + inet_csk_delete_keepalive_timer(sub_sk); + } + + /* Do not propagate subflow-errors up to the MPTCP-layer */ + inet_sk(sub_sk)->recverr = 0; +} + +void mptcp_prepare_for_backlog(struct sock *sk, struct sk_buff *skb) +{ + /* In case of success (in mptcp_backlog_rcv) and error (in kfree_skb) of + * sk_add_backlog, we will decrement the sk refcount. + */ + sock_hold(sk); + skb->sk = sk; + skb->destructor = sock_efree; +} + +int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb) +{ + /* skb-sk may be NULL if we receive a packet immediatly after the + * SYN/ACK + MP_CAPABLE. + */ + struct sock *sk = skb->sk ? skb->sk : meta_sk; + int ret = 0; + + if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) { + kfree_skb(skb); + return 0; + } + + /* Decrement sk refcnt when calling the skb destructor. + * Refcnt is incremented and skb destructor is set in tcp_v{4,6}_rcv via + * mptcp_prepare_for_backlog() here above. + */ + skb_orphan(skb); + + if (sk->sk_family == AF_INET) + ret = tcp_v4_do_rcv(sk, skb); +#if IS_ENABLED(CONFIG_IPV6) + else + ret = tcp_v6_do_rcv(sk, skb); +#endif + + sock_put(sk); + return ret; +} + +static void mptcp_init_buffer_space(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = mptcp_meta_sk(sk); + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + int space; + + tcp_init_buffer_space(sk); + + if (is_master_tp(tp)) { + meta_tp->rcvq_space.space = meta_tp->rcv_wnd; + tcp_mstamp_refresh(meta_tp); + meta_tp->rcvq_space.time = meta_tp->tcp_mstamp; + meta_tp->rcvq_space.seq = meta_tp->copied_seq; + + /* If there is only one subflow, we just use regular TCP + * autotuning. 
User-locks are handled already by + * tcp_init_buffer_space + */ + meta_tp->window_clamp = tp->window_clamp; + meta_tp->rcv_ssthresh = tp->rcv_ssthresh; + meta_sk->sk_rcvbuf = sk->sk_rcvbuf; + meta_sk->sk_sndbuf = sk->sk_sndbuf; + + return; + } + + if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK) + goto snd_buf; + + /* Adding a new subflow to the rcv-buffer space. We make a simple + * addition, to give some space to allow traffic on the new subflow. + * Autotuning will increase it further later on. + */ + space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, + sock_net(meta_sk)->ipv4.sysctl_tcp_rmem[2]); + if (space > meta_sk->sk_rcvbuf) { + meta_tp->window_clamp += tp->window_clamp; + meta_tp->rcv_ssthresh += tp->rcv_ssthresh; + meta_sk->sk_rcvbuf = space; + } + +snd_buf: + if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK) + return; + + /* Adding a new subflow to the send-buffer space. We make a simple + * addition, to give some space to allow traffic on the new subflow. + * Autotuning will increase it further later on. + */ + space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, + sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2]); + if (space > meta_sk->sk_sndbuf) { + meta_sk->sk_sndbuf = space; + meta_sk->sk_write_space(meta_sk); + } +} + +struct lock_class_key meta_key; +char *meta_key_name = "sk_lock-AF_INET-MPTCP"; +struct lock_class_key meta_slock_key; +char *meta_slock_key_name = "slock-AF_INET-MPTCP"; + +static const struct tcp_sock_ops mptcp_meta_specific = { + .__select_window = __mptcp_select_window, + .select_window = mptcp_select_window, + .select_initial_window = mptcp_select_initial_window, + .select_size = mptcp_select_size, + .init_buffer_space = mptcp_init_buffer_space, + .set_rto = mptcp_tcp_set_rto, + .should_expand_sndbuf = mptcp_should_expand_sndbuf, + .send_fin = mptcp_send_fin, + .write_xmit = mptcp_write_xmit, + .send_active_reset = mptcp_send_active_reset, + .write_wakeup = mptcp_write_wakeup, + .retransmit_timer = mptcp_meta_retransmit_timer, + .time_wait = mptcp_time_wait, + .cleanup_rbuf = mptcp_cleanup_rbuf, + .set_cong_ctrl = mptcp_set_congestion_control, +}; + +static const struct tcp_sock_ops mptcp_sub_specific = { + .__select_window = __mptcp_select_window, + .select_window = mptcp_select_window, + .select_initial_window = mptcp_select_initial_window, + .select_size = mptcp_select_size, + .init_buffer_space = mptcp_init_buffer_space, + .set_rto = mptcp_tcp_set_rto, + .should_expand_sndbuf = mptcp_should_expand_sndbuf, + .send_fin = tcp_send_fin, + .write_xmit = tcp_write_xmit, + .send_active_reset = tcp_send_active_reset, + .write_wakeup = tcp_write_wakeup, + .retransmit_timer = mptcp_sub_retransmit_timer, + .time_wait = tcp_time_wait, + .cleanup_rbuf = tcp_cleanup_rbuf, + .set_cong_ctrl = __tcp_set_congestion_control, +}; + +static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, + __u8 mptcp_ver, u32 window) +{ + struct mptcp_cb *mpcb; + struct sock *master_sk; + struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); + struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk); + u64 snd_idsn, rcv_idsn; + + dst_release(meta_sk->sk_rx_dst); + meta_sk->sk_rx_dst = NULL; + /* This flag is set to announce sock_lock_init to + * reclassify the lock-class of the master socket. 
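
The two tcp_sock_ops tables above are what let one tcp_sock body behave either as the MPTCP meta-socket or as a plain subflow: call sites that used to invoke a TCP helper directly now go through tp->ops, and switching personality is a single pointer assignment. A minimal illustration of that indirection (illustrative types, not the kernel structures):

#include <stdio.h>

struct conn;

struct conn_ops {
	void (*send_fin)(struct conn *c);
	void (*retransmit_timer)(struct conn *c);
};

struct conn {
	const struct conn_ops *ops;   /* meta or subflow personality */
};

static void meta_send_fin(struct conn *c) { puts("DATA_FIN on meta"); }
static void meta_rtx(struct conn *c)      { puts("meta-level retransmit"); }
static void sub_send_fin(struct conn *c)  { puts("subflow FIN"); }
static void sub_rtx(struct conn *c)       { puts("subflow retransmit"); }

static const struct conn_ops meta_ops = { meta_send_fin, meta_rtx };
static const struct conn_ops sub_ops  = { sub_send_fin,  sub_rtx  };

int main(void)
{
	struct conn meta = { &meta_ops }, sub = { &sub_ops };

	/* Callers never need to know which personality they hold. */
	meta.ops->send_fin(&meta);
	sub.ops->retransmit_timer(&sub);
	return 0;
}
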
+ */ + meta_tp->is_master_sk = 1; + master_sk = sk_clone_lock(meta_sk, GFP_ATOMIC | __GFP_ZERO); + meta_tp->is_master_sk = 0; + if (!master_sk) { + net_err_ratelimited("%s Could not allocate master_sk on meta %p\n", + __func__, meta_sk); + goto err_alloc_master; + } + + /* Same as in inet_csk_clone_lock - need to init to 0 */ + memset(&inet_csk(master_sk)->icsk_accept_queue, 0, + sizeof(inet_csk(master_sk)->icsk_accept_queue)); + + /* icsk_bind_hash inherited from the meta, but it will be properly set + * in mptcp_create_master_sk. Same operation is done in + * inet_csk_clone_lock. + */ + inet_csk(master_sk)->icsk_bind_hash = NULL; + + master_tp = tcp_sk(master_sk); + master_tp->inside_tk_table = 0; + + master_tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, GFP_ATOMIC); + if (!master_tp->mptcp) { + net_err_ratelimited("%s Could not allocate mptcp_tcp_sock on meta %p\n", + __func__, meta_sk); + goto err_alloc_mptcp; + } + + mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC); + if (!mpcb) { + net_err_ratelimited("%s Could not allocate mpcb on meta %p\n", + __func__, meta_sk); + goto err_alloc_mpcb; + } + + if (__inet_inherit_port(meta_sk, master_sk) < 0) { + net_err_ratelimited("%s Could not inherit port on meta %p\n", + __func__, meta_sk); + goto err_inherit_port; + } + + /* Store the mptcp version agreed on initial handshake */ + mpcb->mptcp_ver = mptcp_ver; + + /* Store the keys and generate the peer's token */ + mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key; + mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token; + + /* Generate Initial data-sequence-numbers */ + mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &snd_idsn); + snd_idsn++; + mpcb->snd_high_order[0] = snd_idsn >> 32; + mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1; + + mpcb->mptcp_rem_key = remote_key; + mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &rcv_idsn); + rcv_idsn++; + mpcb->rcv_high_order[0] = rcv_idsn >> 32; + mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1; + + mpcb->meta_sk = meta_sk; + mpcb->master_sk = master_sk; + + skb_queue_head_init(&mpcb->reinject_queue); + mutex_init(&mpcb->mpcb_mutex); + + /* Init time-wait stuff */ + INIT_LIST_HEAD(&mpcb->tw_list); + + INIT_HLIST_HEAD(&mpcb->callback_list); + INIT_HLIST_HEAD(&mpcb->conn_list); + spin_lock_init(&mpcb->mpcb_list_lock); + + mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf; + mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf; + mpcb->orig_window_clamp = meta_tp->window_clamp; + + /* The meta is directly linked - set refcnt to 1 */ + refcount_set(&mpcb->mpcb_refcnt, 1); + + if (!meta_tp->inside_tk_table) { + /* Adding the meta_tp in the token hashtable - coming from server-side */ + rcu_read_lock(); + local_bh_disable(); + spin_lock(&mptcp_tk_hashlock); + + /* With lockless listeners, we might process two ACKs at the + * same time. With TCP, inet_csk_complete_hashdance takes care + * of this. But, for MPTCP this would be too late if we add + * this MPTCP-socket in the token table (new subflows might + * come in and match on this socket here. + * So, we need to check if someone else already added the token + * and revert in that case. The other guy won the race... 
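
The IDSN handling above is a 64/32-bit split: the data-sequence space is 64 bits wide, but the meta-socket reuses the normal 32-bit TCP sequence fields, so after the IDSN is bumped by one (much like a SYN consuming a sequence number) only its low word lands in write_seq/rcv_nxt, while the upper word is kept in the snd/rcv_high_order pair for later DSS reconstruction. Roughly, for the send side (a sketch of the assignments only; the exact wrap bookkeeping is not spelled out here):

#include <stdint.h>

struct dseq_state {
	uint32_t seq32;          /* what the 32-bit TCP fields carry */
	uint32_t high_order[2];  /* upper word, two entries around a wrap */
};

/* Mirror of the send-side assignment in mptcp_alloc_mpcb(): keep the current
 * upper word plus a second entry offset by one, used once the low 32 bits
 * wrap around.
 */
static void idsn_split_snd(uint64_t idsn, struct dseq_state *st)
{
	st->seq32 = (uint32_t)idsn;
	st->high_order[0] = (uint32_t)(idsn >> 32);
	st->high_order[1] = st->high_order[0] - 1;
}
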
+ */ + if (mptcp_find_token(mpcb->mptcp_loc_token)) { + spin_unlock(&mptcp_tk_hashlock); + local_bh_enable(); + rcu_read_unlock(); + + goto err_insert_token; + } + __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token); + + spin_unlock(&mptcp_tk_hashlock); + local_bh_enable(); + rcu_read_unlock(); + } + +#if IS_ENABLED(CONFIG_IPV6) + if (meta_icsk->icsk_af_ops == &mptcp_v6_mapped) { + struct tcp6_sock *master_tp6 = (struct tcp6_sock *)master_sk; + struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk); + + inet_sk(master_sk)->pinet6 = &master_tp6->inet6; + + newnp = inet6_sk(master_sk); + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; + newnp->pktoptions = NULL; + newnp->opt = NULL; + + newnp->rxopt.all = 0; + newnp->repflow = 0; + np->rxopt.all = 0; + np->repflow = 0; + } else if (meta_sk->sk_family == AF_INET6) { + struct tcp6_sock *master_tp6 = (struct tcp6_sock *)master_sk; + struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk); + struct ipv6_txoptions *opt; + + inet_sk(master_sk)->pinet6 = &master_tp6->inet6; + + /* The following heavily inspired from tcp_v6_syn_recv_sock() */ + newnp = inet6_sk(master_sk); + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; + newnp->pktoptions = NULL; + newnp->opt = NULL; + + newnp->rxopt.all = 0; + newnp->repflow = 0; + np->rxopt.all = 0; + np->repflow = 0; + + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(master_sk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } + inet_csk(master_sk)->icsk_ext_hdr_len = 0; + if (opt) + inet_csk(master_sk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; + } +#endif + + meta_tp->mptcp = NULL; + + meta_tp->write_seq = (u32)snd_idsn; + meta_tp->snd_sml = meta_tp->write_seq; + meta_tp->snd_una = meta_tp->write_seq; + meta_tp->snd_nxt = meta_tp->write_seq; + meta_tp->pushed_seq = meta_tp->write_seq; + meta_tp->snd_up = meta_tp->write_seq; + + meta_tp->copied_seq = (u32)rcv_idsn; + meta_tp->rcv_nxt = (u32)rcv_idsn; + meta_tp->rcv_wup = (u32)rcv_idsn; + meta_tp->rcv_right_edge = meta_tp->rcv_wup + meta_tp->rcv_wnd; + + meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1; + meta_tp->snd_wnd = window; + meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */ + + meta_tp->packets_out = 0; + meta_icsk->icsk_probes_out = 0; + + rcu_assign_pointer(inet_sk(meta_sk)->inet_opt, NULL); + + /* Set mptcp-pointers */ + master_tp->mpcb = mpcb; + master_tp->meta_sk = meta_sk; + meta_tp->mpcb = mpcb; + meta_tp->meta_sk = meta_sk; + + /* Initialize the queues */ + master_tp->out_of_order_queue = RB_ROOT; + master_sk->tcp_rtx_queue = RB_ROOT; + INIT_LIST_HEAD(&master_tp->tsq_node); + INIT_LIST_HEAD(&master_tp->tsorted_sent_queue); + + master_tp->fastopen_req = NULL; + + master_sk->sk_tsq_flags = 0; + + /* Init the accept_queue structure, we support a queue of 32 pending + * connections, it does not need to be huge, since we only store here + * pending subflow creations. 
+ */ + reqsk_queue_alloc(&meta_icsk->icsk_accept_queue); + meta_sk->sk_max_ack_backlog = 32; + meta_sk->sk_ack_backlog = 0; + + if (!sock_flag(meta_sk, SOCK_MPTCP)) { + mptcp_enable_static_key_bh(); + sock_set_flag(meta_sk, SOCK_MPTCP); + } + + /* Redefine function-pointers as the meta-sk is now fully ready */ + meta_tp->mpc = 1; + meta_tp->ops = &mptcp_meta_specific; + + meta_sk->sk_backlog_rcv = mptcp_backlog_rcv; + meta_sk->sk_destruct = mptcp_sock_destruct; + + /* Meta-level retransmit timer */ + meta_icsk->icsk_rto *= 2; /* Double of initial - rto */ + + tcp_init_xmit_timers(master_sk); + /* Has been set for sending out the SYN */ + inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS); + + mptcp_mpcb_inherit_sockopts(meta_sk, master_sk); + + mptcp_init_path_manager(mpcb); + mptcp_init_scheduler(mpcb); + + if (!try_module_get(inet_csk(master_sk)->icsk_ca_ops->owner)) + tcp_assign_congestion_control(master_sk); + + master_tp->saved_syn = NULL; + + mptcp_debug("%s: created mpcb with token %#x\n", + __func__, mpcb->mptcp_loc_token); + + return 0; + +err_insert_token: + kmem_cache_free(mptcp_cb_cache, mpcb); + + kmem_cache_free(mptcp_sock_cache, master_tp->mptcp); + master_tp->mptcp = NULL; + + inet_csk_prepare_forced_close(master_sk); + tcp_done(master_sk); + return -EINVAL; + +err_inherit_port: + kmem_cache_free(mptcp_cb_cache, mpcb); + +err_alloc_mpcb: + kmem_cache_free(mptcp_sock_cache, master_tp->mptcp); + master_tp->mptcp = NULL; + +err_alloc_mptcp: + inet_sk(master_sk)->inet_opt = NULL; + master_sk->sk_state = TCP_CLOSE; + sock_orphan(master_sk); + bh_unlock_sock(master_sk); + sk_free(master_sk); + +err_alloc_master: + return -ENOBUFS; +} + +/* Called without holding lock on mpcb */ +static u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb) +{ + int i; + + /* Start at 1, because 0 is reserved for the meta-sk */ + for (i = 1; i < sizeof(mpcb->path_index_bits) * 8; i++) { + if (!test_and_set_bit(i, &mpcb->path_index_bits)) + break; + } + + if (i == sizeof(mpcb->path_index_bits) * 8) + return 0; + return i; +} + +/* May be called without holding the meta-level lock */ +int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id, + gfp_t flags) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct tcp_sock *tp = tcp_sk(sk); + + /* Could have been allocated by mptcp_alloc_mpcb */ + if (!tp->mptcp) { + tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags); + + if (!tp->mptcp) + return -ENOMEM; + } + + tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb); + /* No more space for more subflows? */ + if (!tp->mptcp->path_index) { + WARN_ON(is_master_tp(tp)); + + kmem_cache_free(mptcp_sock_cache, tp->mptcp); + return -EPERM; + } + + INIT_HLIST_NODE(&tp->mptcp->cb_list); + + tp->mptcp->tp = tp; + tp->mpcb = mpcb; + tp->meta_sk = meta_sk; + + if (!sock_flag(sk, SOCK_MPTCP)) { + mptcp_enable_static_key_bh(); + sock_set_flag(sk, SOCK_MPTCP); + } + + tp->mpc = 1; + tp->ops = &mptcp_sub_specific; + + tp->mptcp->loc_id = loc_id; + tp->mptcp->rem_id = rem_id; + if (mpcb->sched_ops->init) + mpcb->sched_ops->init(sk); + + /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be + * included in mptcp_del_sock(), because the mpcb must remain alive + * until the last subsocket is completely destroyed. 
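
mptcp_set_new_pathindex() above hands out subflow path indices from a bitmap, with bit 0 reserved for the meta-socket and a zero return meaning no index is left. The same allocation in plain C (illustrative and single-threaded; the kernel relies on test_and_set_bit for atomicity):

#include <stdint.h>

#define PATH_INDEX_BITS 32

/* Return a free path index in [1, PATH_INDEX_BITS), or 0 if none is left.
 * Index 0 stays reserved for the meta-socket.
 */
static unsigned int alloc_path_index(uint32_t *bitmap)
{
	unsigned int i;

	for (i = 1; i < PATH_INDEX_BITS; i++) {
		if (!(*bitmap & (UINT32_C(1) << i))) {
			*bitmap |= UINT32_C(1) << i;
			return i;
		}
	}
	return 0;
}

/* Releasing an index is the single bit-clear done in mptcp_del_sock(). */
static void free_path_index(uint32_t *bitmap, unsigned int idx)
{
	*bitmap &= ~(UINT32_C(1) << idx);
}
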
+ */ + sock_hold(meta_sk); + refcount_inc(&mpcb->mpcb_refcnt); + + spin_lock_bh(&mpcb->mpcb_list_lock); + hlist_add_head_rcu(&tp->mptcp->node, &mpcb->conn_list); + spin_unlock_bh(&mpcb->mpcb_list_lock); + + tp->mptcp->attached = 1; + + mptcp_sub_inherit_sockopts(meta_sk, sk); + INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq); + + /* Properly inherit CC from the meta-socket */ + mptcp_assign_congestion_control(sk); + + /* As we successfully allocated the mptcp_tcp_sock, we have to + * change the function-pointers here (for sk_destruct to work correctly) + */ + sk->sk_error_report = mptcp_sock_def_error_report; + sk->sk_data_ready = mptcp_data_ready; + sk->sk_write_space = mptcp_write_space; + sk->sk_state_change = mptcp_set_state; + sk->sk_destruct = mptcp_sock_destruct; + + if (sk->sk_family == AF_INET) + mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d\n", + __func__ , mpcb->mptcp_loc_token, + tp->mptcp->path_index, + &((struct inet_sock *)tp)->inet_saddr, + ntohs(((struct inet_sock *)tp)->inet_sport), + &((struct inet_sock *)tp)->inet_daddr, + ntohs(((struct inet_sock *)tp)->inet_dport)); +#if IS_ENABLED(CONFIG_IPV6) + else + mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d\n", + __func__ , mpcb->mptcp_loc_token, + tp->mptcp->path_index, &inet6_sk(sk)->saddr, + ntohs(((struct inet_sock *)tp)->inet_sport), + &sk->sk_v6_daddr, + ntohs(((struct inet_sock *)tp)->inet_dport)); +#endif + + return 0; +} + +void mptcp_del_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_cb *mpcb; + + if (!tp->mptcp || !tp->mptcp->attached) + return; + + mpcb = tp->mpcb; + + if (mpcb->sched_ops->release) + mpcb->sched_ops->release(sk); + + if (mpcb->pm_ops->delete_subflow) + mpcb->pm_ops->delete_subflow(sk); + + mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n", + __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index, + sk->sk_state, is_meta_sk(sk)); + + spin_lock_bh(&mpcb->mpcb_list_lock); + hlist_del_init_rcu(&tp->mptcp->node); + spin_unlock_bh(&mpcb->mpcb_list_lock); + + tp->mptcp->attached = 0; + mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index); + + if (!tcp_write_queue_empty(sk) || !tcp_rtx_queue_empty(sk)) + mptcp_reinject_data(sk, 0); + + if (is_master_tp(tp)) { + struct sock *meta_sk = mptcp_meta_sk(sk); + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + + if (meta_tp->record_master_info && + !sock_flag(meta_sk, SOCK_DEAD)) { + mpcb->master_info = kmalloc(sizeof(*mpcb->master_info), + GFP_ATOMIC); + + if (mpcb->master_info) + tcp_get_info(sk, mpcb->master_info, true); + } + + mpcb->master_sk = NULL; + } else if (tp->mptcp->pre_established) { + sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); + } +} + +/* Updates the MPTCP-session based on path-manager information (e.g., addresses, + * low-prio flows,...). + */ +void mptcp_update_metasocket(const struct sock *meta_sk) +{ + if (tcp_sk(meta_sk)->mpcb->pm_ops->new_session) + tcp_sk(meta_sk)->mpcb->pm_ops->new_session(meta_sk); +} + +/* Clean up the receive buffer for full frames taken by the user, + * then send an ACK if necessary. COPIED is the number of bytes + * tcp_recvmsg has given to the user so far, it speeds up the + * calculation of whether or not we must ACK for the sake of + * a window update. 
+ * (inspired from tcp_cleanup_rbuf()) + */ +void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + bool recheck_rcv_window = false; + struct mptcp_tcp_sock *mptcp; + __u32 rcv_window_now = 0; + + if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) { + rcv_window_now = tcp_receive_window_now(meta_tp); + + /* Optimize, __mptcp_select_window() is not cheap. */ + if (2 * rcv_window_now <= meta_tp->window_clamp) + recheck_rcv_window = true; + } + + mptcp_for_each_sub(meta_tp->mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (!mptcp_sk_can_send_ack(sk)) + continue; + + if (!inet_csk_ack_scheduled(sk)) + goto second_part; + /* Delayed ACKs frequently hit locked sockets during bulk + * receive. + */ + if (icsk->icsk_ack.blocked || + /* Once-per-two-segments ACK was not sent by tcp_input.c */ + tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || + /* If this read emptied read buffer, we send ACK, if + * connection is not bidirectional, user drained + * receive buffer and there was a small segment + * in queue. + */ + (copied > 0 && + ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || + ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && + !icsk->icsk_ack.pingpong)) && + !atomic_read(&meta_sk->sk_rmem_alloc))) { + tcp_send_ack(sk); + continue; + } + +second_part: + /* This here is the second part of tcp_cleanup_rbuf */ + if (recheck_rcv_window) { + __u32 new_window = tp->ops->__select_window(sk); + + /* Send ACK now, if this read freed lots of space + * in our buffer. Certainly, new_window is new window. + * We can advertise it now, if it is not less than + * current one. + * "Lots" means "at least twice" here. + */ + if (new_window && new_window >= 2 * rcv_window_now) + tcp_send_ack(sk); + } + } +} + +static int mptcp_sub_send_fin(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = tcp_write_queue_tail(sk); + int mss_now; + + /* Optimization, tack on the FIN if we have a queue of + * unsent frames. But be careful about outgoing SACKS + * and IP options. + */ + mss_now = tcp_current_mss(sk); + + if (tcp_send_head(sk) != NULL) { + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; + TCP_SKB_CB(skb)->end_seq++; + tp->write_seq++; + } else { + skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC); + if (!skb) + return 1; + + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); + skb_reserve(skb, MAX_TCP_HEADER); + /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). 
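
mptcp_cleanup_rbuf() above keeps both "factor of two" heuristics of tcp_cleanup_rbuf(): the currently advertised window and the clamp come from the meta-socket, and a window recomputation is only attempted once that window has shrunk to half of window_clamp or less, while the candidate new window is selected and the ACK sent per subflow, and only when the new window is at least twice the old one. Boiled down (a sketch with made-up parameter names):

#include <stdbool.h>
#include <stdint.h>

/* Is it worth recomputing the receive window at all? */
static bool worth_reselecting_window(uint32_t rcv_window_now,
				     uint32_t window_clamp)
{
	return 2 * rcv_window_now <= window_clamp;
}

/* Given a freshly selected window, is an ACK-only window update useful?
 * "Lots of new space" means at least twice the previously advertised window.
 */
static bool send_window_update(uint32_t new_window, uint32_t rcv_window_now)
{
	return new_window && new_window >= 2 * rcv_window_now;
}
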
 */
+		tcp_init_nondata_skb(skb, tp->write_seq,
+				     TCPHDR_ACK | TCPHDR_FIN);
+		tcp_queue_skb(sk, skb);
+	}
+	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
+
+	return 0;
+}
+
+static void mptcp_sub_close_doit(struct sock *sk)
+{
+	struct sock *meta_sk = mptcp_meta_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (sock_flag(sk, SOCK_DEAD))
+		return;
+
+	if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) {
+		tp->closing = 1;
+		tcp_close(sk, 0);
+	} else if (tcp_close_state(sk)) {
+		sk->sk_shutdown |= SEND_SHUTDOWN;
+		tcp_send_fin(sk);
+	}
+}
+
+void mptcp_sub_close_wq(struct work_struct *work)
+{
+	struct tcp_sock *tp = container_of(work, struct mptcp_tcp_sock, work.work)->tp;
+	struct sock *sk = (struct sock *)tp;
+	struct mptcp_cb *mpcb = tp->mpcb;
+	struct sock *meta_sk = mptcp_meta_sk(sk);
+
+	mutex_lock(&mpcb->mpcb_mutex);
+	lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
+
+	mptcp_sub_close_doit(sk);
+
+	release_sock(meta_sk);
+	mutex_unlock(&mpcb->mpcb_mutex);
+	mptcp_mpcb_put(mpcb);
+	sock_put(sk);
+}
+
+void mptcp_sub_close(struct sock *sk, unsigned long delay)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct delayed_work *work = &tcp_sk(sk)->mptcp->work;
+
+	/* We are already closing - e.g., call from sock_def_error_report upon
+	 * tcp_disconnect in tcp_close.
+	 */
+	if (tp->closing)
+		return;
+
+	/* Work already scheduled? */
+	if (work_pending(&work->work)) {
+		/* Work present - who will be first? */
+		if (jiffies + delay > work->timer.expires)
+			return;
+
+		/* Try canceling - if it fails, work will be executed soon */
+		if (!cancel_delayed_work(work))
+			return;
+		sock_put(sk);
+		mptcp_mpcb_put(tp->mpcb);
+	}
+
+	if (!delay) {
+		unsigned char old_state = sk->sk_state;
+
+		/* We send the FIN directly, because it may take quite a long
+		 * time until the work-queue gets scheduled...
+		 *
+		 * If mptcp_sub_send_fin returns 1, it failed and thus we reset
+		 * the old state so that tcp_close will finally send the fin
+		 * in user-context.
+		 */
+		if (!sk->sk_err && old_state != TCP_CLOSE &&
+		    tcp_close_state(sk) && mptcp_sub_send_fin(sk)) {
+			if (old_state == TCP_ESTABLISHED)
+				TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+			sk->sk_state = old_state;
+		}
+	}
+
+	sock_hold(sk);
+	refcount_inc(&tp->mpcb->mpcb_refcnt);
+	queue_delayed_work(mptcp_wq, work, delay);
+}
+
+void mptcp_sub_force_close(struct sock *sk)
+{
+	/* The tcp_done() below may have freed the socket if it is already
+	 * dead. Thus, we are not allowed to access it afterwards. That's why
+	 * we have to store the dead-state in this local variable.
+ */ + int sock_is_dead = sock_flag(sk, SOCK_DEAD); + + tcp_sk(sk)->mp_killed = 1; + + if (sk->sk_state != TCP_CLOSE) + tcp_done(sk); + + if (!sock_is_dead) + mptcp_sub_close(sk, 0); +} +EXPORT_SYMBOL(mptcp_sub_force_close); + +/* Update the mpcb send window, based on the contributions + * of each subflow + */ +void mptcp_update_sndbuf(const struct tcp_sock *tp) +{ + struct sock *meta_sk = tp->meta_sk; + int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf; + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(tp->mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + + if (!mptcp_sk_can_send(sk)) + continue; + + new_sndbuf += sk->sk_sndbuf; + + if (new_sndbuf > sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2] || + new_sndbuf < 0) { + new_sndbuf = sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2]; + break; + } + } + meta_sk->sk_sndbuf = max(min(new_sndbuf, + sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2]), + meta_sk->sk_sndbuf); + + /* The subflow's call to sk_write_space in tcp_new_space ends up in + * mptcp_write_space. + * It has nothing to do with waking up the application. + * So, we do it here. + */ + if (old_sndbuf != meta_sk->sk_sndbuf) + meta_sk->sk_write_space(meta_sk); +} + +/* Similar to: tcp_close */ +void mptcp_close(struct sock *meta_sk, long timeout) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct mptcp_tcp_sock *mptcp; + struct sk_buff *skb; + int data_was_unread = 0; + int state; + + mptcp_debug("%s: Close of meta_sk with tok %#x state %u\n", + __func__, mpcb->mptcp_loc_token, meta_sk->sk_state); + + WARN_ON(refcount_inc_not_zero(&mpcb->mpcb_refcnt) == 0); + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + if (meta_tp->inside_tk_table) + /* Detach the mpcb from the token hashtable */ + mptcp_hash_remove_bh(meta_tp); + + meta_sk->sk_shutdown = SHUTDOWN_MASK; + /* We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + * reader process may not have drained the data yet! + */ + while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) { + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + len--; + data_was_unread += len; + __kfree_skb(skb); + } + + sk_mem_reclaim(meta_sk); + + /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ + if (meta_sk->sk_state == TCP_CLOSE) { + struct mptcp_tcp_sock *mptcp; + struct hlist_node *tmp; + + mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + if (tcp_sk(sk_it)->send_mp_fclose) + continue; + + mptcp_sub_close(sk_it, 0); + } + goto adjudge_to_death; + } + + if (data_was_unread) { + /* Unread data was tossed, zap the connection. */ + NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE); + tcp_set_state(meta_sk, TCP_CLOSE); + tcp_sk(meta_sk)->ops->send_active_reset(meta_sk, + meta_sk->sk_allocation); + } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) { + /* Check zero linger _after_ checking for unread data. */ + meta_sk->sk_prot->disconnect(meta_sk, 0); + NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); + } else if (tcp_close_state(meta_sk)) { + mptcp_send_fin(meta_sk); + } else if (meta_tp->snd_una == meta_tp->write_seq) { + struct mptcp_tcp_sock *mptcp; + struct hlist_node *tmp; + + /* The DATA_FIN has been sent and acknowledged + * (e.g., by sk_shutdown). 
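
mptcp_update_sndbuf() above couples the meta send buffer to the subflows: it sums sk_sndbuf over the subflows that can currently send, clamps the sum to tcp_wmem[2] (also guarding against integer overflow), never lets the meta buffer shrink, and wakes the writer only when the value actually grew. As arithmetic (a sketch; wmem_max stands in for sysctl_tcp_wmem[2]):

#include <stddef.h>
#include <stdint.h>

/* Aggregate subflow send buffers into the meta send buffer. Returns the new
 * meta sndbuf; the caller wakes the writer if it grew.
 */
static int32_t coupled_sndbuf(const int32_t *sub_sndbuf, size_t n,
			      int32_t meta_sndbuf, int32_t wmem_max)
{
	int64_t sum = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		sum += sub_sndbuf[i];
		if (sum > wmem_max) {	/* clamp, as the kernel loop does */
			sum = wmem_max;
			break;
		}
	}

	/* Never shrink below what the meta already advertises to the app. */
	return sum > meta_sndbuf ? (int32_t)sum : meta_sndbuf;
}
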
Close all the other subflows + */ + mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + unsigned long delay = 0; + /* If we are the passive closer, don't trigger + * subflow-fin until the subflow has been finned + * by the peer. - thus we add a delay + */ + if (mpcb->passive_close && + sk_it->sk_state == TCP_ESTABLISHED) + delay = inet_csk(sk_it)->icsk_rto << 3; + + mptcp_sub_close(sk_it, delay); + } + } + + sk_stream_wait_close(meta_sk, timeout); + +adjudge_to_death: + state = meta_sk->sk_state; + sock_hold(meta_sk); + sock_orphan(meta_sk); + + /* socket will be freed after mptcp_close - we have to prevent + * access from the subflows. + */ + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + /* Similar to sock_orphan, but we don't set it DEAD, because + * the callbacks are still set and must be called. + */ + write_lock_bh(&sk_it->sk_callback_lock); + sk_set_socket(sk_it, NULL); + sk_it->sk_wq = NULL; + write_unlock_bh(&sk_it->sk_callback_lock); + } + + if (mpcb->pm_ops->close_session) + mpcb->pm_ops->close_session(meta_sk); + + /* It is the last release_sock in its life. It will remove backlog. */ + release_sock(meta_sk); + + /* Now socket is owned by kernel and we acquire BH lock + * to finish close. No need to check for user refs. + */ + local_bh_disable(); + bh_lock_sock(meta_sk); + WARN_ON(sock_owned_by_user(meta_sk)); + + percpu_counter_inc(meta_sk->sk_prot->orphan_count); + + /* Have we already been destroyed by a softirq or backlog? */ + if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE) + goto out; + + /* This is a (useful) BSD violating of the RFC. There is a + * problem with TCP as specified in that the other end could + * keep a socket open forever with no application left this end. + * We use a 3 minute timeout (about the same as BSD) then kill + * our end. If they send after that then tough - BUT: long enough + * that we won't make the old 4*rto = almost no time - whoops + * reset mistake. + * + * Nope, it was not mistake. It is really desired behaviour + * f.e. on http servers, when such sockets are useless, but + * consume significant resources. Let's do it with special + * linger2 option. --ANK + */ + + if (meta_sk->sk_state == TCP_FIN_WAIT2) { + if (meta_tp->linger2 < 0) { + tcp_set_state(meta_sk, TCP_CLOSE); + meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC); + __NET_INC_STATS(sock_net(meta_sk), + LINUX_MIB_TCPABORTONLINGER); + } else { + const int tmo = tcp_fin_time(meta_sk); + + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(meta_sk, + tmo - TCP_TIMEWAIT_LEN); + } else { + meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, + tmo); + goto out; + } + } + } + if (meta_sk->sk_state != TCP_CLOSE) { + sk_mem_reclaim(meta_sk); + if (tcp_check_oom(meta_sk, 0)) { + if (net_ratelimit()) + pr_info("MPTCP: out of memory: force closing socket\n"); + tcp_set_state(meta_sk, TCP_CLOSE); + meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC); + __NET_INC_STATS(sock_net(meta_sk), + LINUX_MIB_TCPABORTONMEMORY); + } + } + + + if (meta_sk->sk_state == TCP_CLOSE) + inet_csk_destroy_sock(meta_sk); + /* Otherwise, socket is reprieved until protocol close. 
*/ + +out: + bh_unlock_sock(meta_sk); + local_bh_enable(); + mutex_unlock(&mpcb->mpcb_mutex); + mptcp_mpcb_put(mpcb); + sock_put(meta_sk); /* Taken by sock_hold */ +} + +void mptcp_disconnect(struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_tcp_sock *mptcp; + struct hlist_node *tmp; + + __skb_queue_purge(&meta_tp->mpcb->reinject_queue); + + if (meta_tp->inside_tk_table) + mptcp_hash_remove_bh(meta_tp); + + local_bh_disable(); + mptcp_for_each_sub_safe(meta_tp->mpcb, mptcp, tmp) { + struct sock *subsk = mptcp_to_sock(mptcp); + + if (spin_is_locked(&subsk->sk_lock.slock)) + bh_unlock_sock(subsk); + + tcp_sk(subsk)->tcp_disconnect = 1; + + meta_sk->sk_prot->disconnect(subsk, O_NONBLOCK); + + sock_orphan(subsk); + + percpu_counter_inc(meta_sk->sk_prot->orphan_count); + + inet_csk_destroy_sock(subsk); + } + local_bh_enable(); + + mptcp_mpcb_cleanup(meta_tp->mpcb); + meta_tp->meta_sk = NULL; + + meta_tp->send_mp_fclose = 0; + meta_tp->mpc = 0; + meta_tp->ops = &tcp_specific; +#if IS_ENABLED(CONFIG_IPV6) + if (meta_sk->sk_family == AF_INET6) + meta_sk->sk_backlog_rcv = tcp_v6_do_rcv; + else + meta_sk->sk_backlog_rcv = tcp_v4_do_rcv; +#else + meta_sk->sk_backlog_rcv = tcp_v4_do_rcv; +#endif + meta_sk->sk_destruct = inet_sock_destruct; +} + + +/* Returns True if we should enable MPTCP for that socket. */ +bool mptcp_doit(struct sock *sk) +{ + const struct dst_entry *dst = __sk_dst_get(sk); + + /* Don't do mptcp over loopback */ + if (sk->sk_family == AF_INET && + (ipv4_is_loopback(inet_sk(sk)->inet_daddr) || + ipv4_is_loopback(inet_sk(sk)->inet_saddr))) + return false; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && + (ipv6_addr_loopback(&sk->sk_v6_daddr) || + ipv6_addr_loopback(&inet6_sk(sk)->saddr))) + return false; +#endif + if (mptcp_v6_is_v4_mapped(sk) && + ipv4_is_loopback(inet_sk(sk)->inet_saddr)) + return false; + +#ifdef CONFIG_TCP_MD5SIG + /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */ + if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk)) + return false; +#endif + + if (dst->dev && (dst->dev->flags & IFF_NOMULTIPATH)) + return false; + + return true; +} + +int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, + __u8 mptcp_ver, u32 window) +{ + struct tcp_sock *master_tp; + struct sock *master_sk; + + if (mptcp_alloc_mpcb(meta_sk, remote_key, mptcp_ver, window)) + goto err_alloc_mpcb; + + master_sk = tcp_sk(meta_sk)->mpcb->master_sk; + master_tp = tcp_sk(master_sk); + + if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC)) { + WARN_ON(1); + return -EINVAL; + } + + meta_sk->sk_prot->unhash(meta_sk); + inet_ehash_nolisten(master_sk, NULL); + + master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd; + + return 0; + +err_alloc_mpcb: + return -ENOBUFS; +} + +static int __mptcp_check_req_master(struct sock *child, + struct request_sock *req) +{ + struct tcp_sock *child_tp = tcp_sk(child); + struct sock *meta_sk = child; + struct mptcp_cb *mpcb; + struct mptcp_request_sock *mtreq; + + /* Never contained an MP_CAPABLE */ + if (!inet_rsk(req)->mptcp_rqsk) + return 1; + + if (!inet_rsk(req)->saw_mpc) { + /* Fallback to regular TCP, because we saw one SYN without + * MP_CAPABLE. In tcp_check_req we continue the regular path. + * But, the socket has been added to the reqsk_tk_htb, so we + * must still remove it. 
+ */ + MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); + mptcp_reqsk_remove_tk(req); + return 1; + } + + MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); + + /* Just set this values to pass them to mptcp_alloc_mpcb */ + mtreq = mptcp_rsk(req); + child_tp->mptcp_loc_key = mtreq->mptcp_loc_key; + child_tp->mptcp_loc_token = mtreq->mptcp_loc_token; + + if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key, + mtreq->mptcp_ver, child_tp->snd_wnd)) { + inet_csk_prepare_forced_close(meta_sk); + tcp_done(meta_sk); + + return -ENOBUFS; + } + + child = tcp_sk(child)->mpcb->master_sk; + child_tp = tcp_sk(child); + mpcb = child_tp->mpcb; + + child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn; + child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn; + + mpcb->dss_csum = mtreq->dss_csum; + mpcb->server_side = 1; + + /* Needs to be done here additionally, because when accepting a + * new connection we pass by __reqsk_free and not reqsk_free. + */ + mptcp_reqsk_remove_tk(req); + + /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */ + sock_put(meta_sk); + + return 0; +} + +int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req) +{ + struct sock *meta_sk = child, *master_sk; + struct sk_buff *skb; + u32 new_mapping; + int ret; + + ret = __mptcp_check_req_master(child, req); + if (ret) + return ret; + + master_sk = tcp_sk(meta_sk)->mpcb->master_sk; + + /* We need to rewind copied_seq as it is set to IDSN + 1 and as we have + * pre-MPTCP data in the receive queue. + */ + tcp_sk(meta_sk)->copied_seq -= tcp_sk(master_sk)->rcv_nxt - + tcp_rsk(req)->rcv_isn - 1; + + /* Map subflow sequence number to data sequence numbers. We need to map + * these data to [IDSN - len - 1, IDSN[. + */ + new_mapping = tcp_sk(meta_sk)->copied_seq - tcp_rsk(req)->rcv_isn - 1; + + /* There should be only one skb: the SYN + data. */ + skb_queue_walk(&meta_sk->sk_receive_queue, skb) { + TCP_SKB_CB(skb)->seq += new_mapping; + TCP_SKB_CB(skb)->end_seq += new_mapping; + } + + /* With fastopen we change the semantics of the relative subflow + * sequence numbers to deal with middleboxes that could add/remove + * multiple bytes in the SYN. We chose to start counting at rcv_nxt - 1 + * instead of the regular TCP ISN. + */ + tcp_sk(master_sk)->mptcp->rcv_isn = tcp_sk(master_sk)->rcv_nxt - 1; + + /* We need to update copied_seq of the master_sk to account for the + * already moved data to the meta receive queue. + */ + tcp_sk(master_sk)->copied_seq = tcp_sk(master_sk)->rcv_nxt; + + /* Handled by the master_sk */ + tcp_sk(meta_sk)->fastopen_rsk = NULL; + + return 0; +} + +int mptcp_check_req_master(struct sock *sk, struct sock *child, + struct request_sock *req, const struct sk_buff *skb, + int drop, u32 tsoff) +{ + struct sock *meta_sk = child; + int ret; + + ret = __mptcp_check_req_master(child, req); + if (ret) + return ret; + child = tcp_sk(child)->mpcb->master_sk; + + sock_rps_save_rxhash(child, skb); + + /* drop indicates that we come from tcp_check_req and thus need to + * handle the request-socket fully. + */ + if (drop) { + tcp_synack_rtt_meas(child, req); + + inet_csk_reqsk_queue_drop(sk, req); + reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); + if (!inet_csk_reqsk_queue_add(sk, req, meta_sk)) { + bh_unlock_sock(meta_sk); + /* No sock_put() of the meta needed. The reference has + * already been dropped in __mptcp_check_req_master(). 
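
The Fast Open path above has to retrofit data that arrived before any MPTCP state existed: copied_seq is rewound by the number of bytes carried on the SYN, and every skb already queued on the meta-socket is shifted from subflow sequence space into data-sequence space so that those bytes land in [IDSN - len - 1, IDSN). The arithmetic, isolated into a sketch with simplified names (unsigned 32-bit wrap-around is intentional, as in the kernel):

#include <stdint.h>

struct tfo_remap {
	uint32_t meta_copied_seq;  /* rewound meta-level copied_seq */
	uint32_t offset;           /* add to each queued skb's seq/end_seq */
};

/* Map the SYN-carried data into the meta data-sequence space: subflow bytes
 * start at rcv_isn + 1; after the rewind they occupy [IDSN - len - 1, IDSN)
 * at the meta level.
 */
static struct tfo_remap tfo_remap_syn_data(uint32_t meta_copied_seq,
					   uint32_t sub_rcv_nxt,
					   uint32_t sub_rcv_isn)
{
	struct tfo_remap r;
	uint32_t syn_data_len = sub_rcv_nxt - sub_rcv_isn - 1;

	r.meta_copied_seq = meta_copied_seq - syn_data_len;
	r.offset = r.meta_copied_seq - sub_rcv_isn - 1;
	return r;
}
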
+ */ + sock_put(child); + return -1; + } + } else { + /* Thus, we come from syn-cookies */ + refcount_set(&req->rsk_refcnt, 1); + tcp_sk(meta_sk)->tsoffset = tsoff; + if (!inet_csk_reqsk_queue_add(sk, req, meta_sk)) { + bh_unlock_sock(meta_sk); + /* No sock_put() of the meta needed. The reference has + * already been dropped in __mptcp_check_req_master(). + */ + sock_put(child); + reqsk_put(req); + return -1; + } + } + + return 0; +} + +/* May be called without holding the meta-level lock */ +struct sock *mptcp_check_req_child(struct sock *meta_sk, + struct sock *child, + struct request_sock *req, + struct sk_buff *skb, + const struct mptcp_options_received *mopt) +{ + const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct mptcp_request_sock *mtreq = mptcp_rsk(req); + struct tcp_sock *child_tp = tcp_sk(child); + u8 hash_mac_check[20]; + + if (!mopt->join_ack) { + MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKFAIL); + goto teardown; + } + + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, + (u8 *)&mpcb->mptcp_loc_key, + (u32 *)hash_mac_check, 2, + 4, (u8 *)&mtreq->mptcp_rem_nonce, + 4, (u8 *)&mtreq->mptcp_loc_nonce); + + if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20)) { + MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKMAC); + goto teardown; + } + + /* Point it to the same struct socket and wq as the meta_sk */ + sk_set_socket(child, meta_sk->sk_socket); + child->sk_wq = meta_sk->sk_wq; + + if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) { + /* Has been inherited, but now child_tp->mptcp is NULL */ + child_tp->mpc = 0; + child_tp->ops = &tcp_specific; + + /* TODO when we support acking the third ack for new subflows, + * we should silently discard this third ack, by returning NULL. + * + * Maybe, at the retransmission we will have enough memory to + * fully add the socket to the meta-sk. + */ + goto teardown; + } + + /* The child is a clone of the meta socket, we must now reset + * some of the fields + */ + child_tp->mptcp->rcv_low_prio = mtreq->rcv_low_prio; + + /* We should allow proper increase of the snd/rcv-buffers. Thus, we + * use the original values instead of the bloated up ones from the + * clone. + */ + child->sk_sndbuf = mpcb->orig_sk_sndbuf; + child->sk_rcvbuf = mpcb->orig_sk_rcvbuf; + + child_tp->mptcp->slave_sk = 1; + child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn; + child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn; + child_tp->mptcp->init_rcv_wnd = req->rsk_rcv_wnd; + + child->sk_tsq_flags = 0; + + sock_rps_save_rxhash(child, skb); + tcp_synack_rtt_meas(child, req); + + if (mpcb->pm_ops->established_subflow) + mpcb->pm_ops->established_subflow(child); + + /* Subflows do not use the accept queue, as they + * are attached immediately to the mpcb. + */ + inet_csk_reqsk_queue_drop(meta_sk, req); + reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req); + + /* The refcnt is initialized to 2, because regular TCP will put him + * in the socket's listener queue. However, we do not have a listener-queue. + * So, we need to make sure that this request-sock indeed gets destroyed. + */ + reqsk_put(req); + + MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKRX); + return child; + +teardown: + req->rsk_ops->send_reset(meta_sk, skb); + + /* Drop this request - sock creation failed. 
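
The third-ACK check above is the RFC 6824 MP_JOIN authentication: the receiver recomputes HMAC-SHA1 over the two exchanged nonces, keyed with remote-key||local-key, and compares all 160 bits against what the peer sent; a mismatch tears the request down and bumps the JOINACKMAC counter. In outline (hmac_sha1() here is a stand-in helper, not the kernel function, and the keys/nonces are used with the byte order they have in memory, as in the code above):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Stand-in: HMAC-SHA1 of msg under a 16-byte key, 20-byte digest out. */
void hmac_sha1(const uint8_t key[16], const uint8_t *msg, size_t msg_len,
	       uint8_t digest[20]);

/* Validate an MP_JOIN third ACK: HMAC over (remote nonce || local nonce),
 * keyed with (remote key || local key), must match the peer's full MAC.
 */
static bool mp_join_ack_valid(uint64_t rem_key, uint64_t loc_key,
			      uint32_t rem_nonce, uint32_t loc_nonce,
			      const uint8_t peer_mac[20])
{
	uint8_t key[16], msg[8], mac[20];

	memcpy(key, &rem_key, 8);
	memcpy(key + 8, &loc_key, 8);
	memcpy(msg, &rem_nonce, 4);
	memcpy(msg + 4, &loc_nonce, 4);

	hmac_sha1(key, msg, sizeof(msg), mac);
	return memcmp(mac, peer_mac, 20) == 0;
}
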
*/ + inet_csk_reqsk_queue_drop(meta_sk, req); + reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req); + inet_csk_prepare_forced_close(child); + tcp_done(child); + bh_unlock_sock(meta_sk); + return meta_sk; +} + +int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw) +{ + struct mptcp_tw *mptw; + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_cb *mpcb = tp->mpcb; + + /* A subsocket in tw can only receive data. So, if we are in + * infinite-receive, then we should not reply with a data-ack or act + * upon general MPTCP-signaling. We prevent this by simply not creating + * the mptcp_tw_sock. + */ + if (mpcb->infinite_mapping_rcv) { + tw->mptcp_tw = NULL; + return 0; + } + + /* Alloc MPTCP-tw-sock */ + mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC); + if (!mptw) { + tw->mptcp_tw = NULL; + return -ENOBUFS; + } + + refcount_inc(&mpcb->mpcb_refcnt); + + tw->mptcp_tw = mptw; + mptw->loc_key = mpcb->mptcp_loc_key; + mptw->meta_tw = mpcb->in_time_wait; + mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp)); + if (mptw->meta_tw && mpcb->mptw_state != TCP_TIME_WAIT) + mptw->rcv_nxt++; + rcu_assign_pointer(mptw->mpcb, mpcb); + + spin_lock_bh(&mpcb->mpcb_list_lock); + list_add_rcu(&mptw->list, &tp->mpcb->tw_list); + mptw->in_list = 1; + spin_unlock_bh(&mpcb->mpcb_list_lock); + + return 0; +} + +void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) +{ + struct mptcp_cb *mpcb; + + rcu_read_lock(); + local_bh_disable(); + mpcb = rcu_dereference(tw->mptcp_tw->mpcb); + + /* If we are still holding a ref to the mpcb, we have to remove ourself + * from the list and drop the ref properly. + */ + if (mpcb && refcount_inc_not_zero(&mpcb->mpcb_refcnt)) { + spin_lock(&mpcb->mpcb_list_lock); + if (tw->mptcp_tw->in_list) { + list_del_rcu(&tw->mptcp_tw->list); + tw->mptcp_tw->in_list = 0; + /* Put, because we added it to the list */ + mptcp_mpcb_put(mpcb); + } + spin_unlock(&mpcb->mpcb_list_lock); + + /* Second time, because we increased it above */ + mptcp_mpcb_put(mpcb); + } + + local_bh_enable(); + rcu_read_unlock(); + + kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw); +} + +/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a + * data-fin. + */ +void mptcp_time_wait(struct sock *meta_sk, int state, int timeo) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_tw *mptw; + + if (mptcp_in_infinite_mapping_weak(meta_tp->mpcb)) { + struct mptcp_tcp_sock *mptcp; + struct hlist_node *tmp; + + mptcp_for_each_sub_safe(meta_tp->mpcb, mptcp, tmp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + if (sk_it->sk_state == TCP_CLOSE) + continue; + + tcp_sk(sk_it)->ops->time_wait(sk_it, state, timeo); + } + } + + /* Used for sockets that go into tw after the meta + * (see mptcp_init_tw_sock()) + */ + meta_tp->mpcb->in_time_wait = 1; + meta_tp->mpcb->mptw_state = state; + + /* Update the time-wait-sock's information */ + rcu_read_lock(); + local_bh_disable(); + list_for_each_entry_rcu(mptw, &meta_tp->mpcb->tw_list, list) { + mptw->meta_tw = 1; + mptw->rcv_nxt = mptcp_get_rcv_nxt_64(meta_tp); + + /* We want to ack a DATA_FIN, but are yet in FIN_WAIT_2 - + * pretend as if the DATA_FIN has already reached us, that way + * the checks in tcp_timewait_state_process will be good as the + * DATA_FIN comes in. 
+ */ + if (state != TCP_TIME_WAIT) + mptw->rcv_nxt++; + } + local_bh_enable(); + rcu_read_unlock(); + + if (meta_sk->sk_state != TCP_CLOSE) + tcp_done(meta_sk); +} + +void mptcp_tsq_flags(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sock *meta_sk = mptcp_meta_sk(sk); + + /* It will be handled as a regular deferred-call */ + if (is_meta_sk(sk)) + return; + + if (hlist_unhashed(&tp->mptcp->cb_list)) { + hlist_add_head(&tp->mptcp->cb_list, &tp->mpcb->callback_list); + /* We need to hold it here, as the sock_hold is not assured + * by the release_sock as it is done in regular TCP. + * + * The subsocket may get inet_csk_destroy'd while it is inside + * the callback_list. + */ + sock_hold(sk); + } + + if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &meta_sk->sk_tsq_flags)) + sock_hold(meta_sk); +} + +void mptcp_tsq_sub_deferred(struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_tcp_sock *mptcp; + struct hlist_node *tmp; + + __sock_put(meta_sk); + hlist_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) { + struct tcp_sock *tp = mptcp->tp; + struct sock *sk = (struct sock *)tp; + + hlist_del_init(&mptcp->cb_list); + sk->sk_prot->release_cb(sk); + /* Final sock_put (cfr. mptcp_tsq_flags) */ + sock_put(sk); + } +} + +/* May be called without holding the meta-level lock */ +void mptcp_join_reqsk_init(const struct mptcp_cb *mpcb, + const struct request_sock *req, + struct sk_buff *skb) +{ + struct mptcp_request_sock *mtreq = mptcp_rsk(req); + struct mptcp_options_received mopt; + u8 mptcp_hash_mac[20]; + + mptcp_init_mp_opt(&mopt); + tcp_parse_mptcp_options(skb, &mopt); + + mtreq->is_sub = 1; + inet_rsk(req)->mptcp_rqsk = 1; + + mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce; + + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key, + (u8 *)&mpcb->mptcp_rem_key, + (u32 *)mptcp_hash_mac, 2, + 4, (u8 *)&mtreq->mptcp_loc_nonce, + 4, (u8 *)&mtreq->mptcp_rem_nonce); + mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac; + + mtreq->rem_id = mopt.rem_id; + mtreq->rcv_low_prio = mopt.low_prio; + inet_rsk(req)->saw_mpc = 1; + + MPTCP_INC_STATS(sock_net(mpcb->meta_sk), MPTCP_MIB_JOINSYNRX); +} + +void mptcp_reqsk_init(struct request_sock *req, const struct sock *sk, + const struct sk_buff *skb, bool want_cookie) +{ + struct mptcp_options_received mopt; + struct mptcp_request_sock *mtreq = mptcp_rsk(req); + + mptcp_init_mp_opt(&mopt); + tcp_parse_mptcp_options(skb, &mopt); + + mtreq->dss_csum = mopt.dss_csum; + + if (want_cookie) { + if (!mptcp_reqsk_new_cookie(req, sk, &mopt, skb)) + /* No key available - back to regular TCP */ + inet_rsk(req)->mptcp_rqsk = 0; + return; + } + + mptcp_reqsk_new_mptcp(req, sk, &mopt, skb); +} + +void mptcp_cookies_reqsk_init(struct request_sock *req, + struct mptcp_options_received *mopt, + struct sk_buff *skb) +{ + struct mptcp_request_sock *mtreq = mptcp_rsk(req); + + /* Absolutely need to always initialize this. 
*/ + mtreq->hash_entry.pprev = NULL; + + mtreq->mptcp_rem_key = mopt->mptcp_sender_key; + mtreq->mptcp_loc_key = mopt->mptcp_receiver_key; + + /* Generate the token */ + mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL); + + rcu_read_lock(); + local_bh_disable(); + spin_lock(&mptcp_tk_hashlock); + + /* Check, if the key is still free */ + if (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) || + mptcp_find_token(mtreq->mptcp_loc_token)) + goto out; + + inet_rsk(req)->saw_mpc = 1; + mtreq->is_sub = 0; + inet_rsk(req)->mptcp_rqsk = 1; + mtreq->dss_csum = mopt->dss_csum; + +out: + spin_unlock(&mptcp_tk_hashlock); + local_bh_enable(); + rcu_read_unlock(); +} + +int mptcp_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_options_received mopt; + + mptcp_init_mp_opt(&mopt); + tcp_parse_mptcp_options(skb, &mopt); + + if (mopt.is_mp_join) + return mptcp_do_join_short(skb, &mopt, sock_net(sk)); + if (mopt.drop_me) + goto drop; + + if (!sock_flag(sk, SOCK_MPTCP)) + mopt.saw_mpc = 0; + + if (skb->protocol == htons(ETH_P_IP)) { + if (mopt.saw_mpc) { + if (skb_rtable(skb)->rt_flags & + (RTCF_BROADCAST | RTCF_MULTICAST)) + goto drop; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVE); + return tcp_conn_request(&mptcp_request_sock_ops, + &mptcp_request_sock_ipv4_ops, + sk, skb); + } + + return tcp_v4_conn_request(sk, skb); +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (mopt.saw_mpc) { + if (!ipv6_unicast_destination(skb)) + goto drop; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVE); + return tcp_conn_request(&mptcp6_request_sock_ops, + &mptcp_request_sock_ipv6_ops, + sk, skb); + } + + return tcp_v6_conn_request(sk, skb); +#endif + } +drop: + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); + return 0; +} + +int mptcp_finish_handshake(struct sock *child, struct sk_buff *skb) + __releases(&child->sk_lock.slock) +{ + int ret; + + /* We don't call tcp_child_process here, because we hold + * already the meta-sk-lock and are sure that it is not owned + * by the user. 
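
mptcp_conn_request() above is essentially a dispatcher: parse the MPTCP options out of the SYN, short-circuit MP_JOIN handling, drop when the option parser says so, fall back to plain TCP when the listener is not MPTCP-enabled or no MP_CAPABLE was seen, and otherwise hand the SYN to tcp_conn_request() with the MPTCP request-sock ops. The decision tree, stripped of the IPv4/IPv6 and broadcast/multicast details (illustrative enum, not kernel code):

#include <stdbool.h>

enum syn_verdict { SYN_DROP, SYN_JOIN, SYN_PLAIN_TCP, SYN_MP_CAPABLE };

struct syn_opts {
	bool is_mp_join;   /* SYN carried MP_JOIN */
	bool drop_me;      /* option parser decided to drop */
	bool saw_mpc;      /* SYN carried MP_CAPABLE */
};

static enum syn_verdict classify_syn(const struct syn_opts *opt,
				     bool listener_mptcp_enabled)
{
	if (opt->is_mp_join)
		return SYN_JOIN;
	if (opt->drop_me)
		return SYN_DROP;
	if (!listener_mptcp_enabled || !opt->saw_mpc)
		return SYN_PLAIN_TCP;
	return SYN_MP_CAPABLE;	/* tcp_conn_request() with MPTCP rsk ops */
}
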
+ */ + tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); + ret = tcp_rcv_state_process(child, skb); + bh_unlock_sock(child); + sock_put(child); + + return ret; +} + +static void __mptcp_get_info(const struct sock *meta_sk, + struct mptcp_meta_info *info) +{ + const struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); + const struct tcp_sock *meta_tp = tcp_sk(meta_sk); + u32 now = tcp_jiffies32; + + memset(info, 0, sizeof(*info)); + + info->mptcpi_state = meta_sk->sk_state; + info->mptcpi_retransmits = meta_icsk->icsk_retransmits; + info->mptcpi_probes = meta_icsk->icsk_probes_out; + info->mptcpi_backoff = meta_icsk->icsk_backoff; + + info->mptcpi_rto = jiffies_to_usecs(meta_icsk->icsk_rto); + + info->mptcpi_unacked = meta_tp->packets_out; + + info->mptcpi_last_data_sent = jiffies_to_msecs(now - meta_tp->lsndtime); + info->mptcpi_last_data_recv = jiffies_to_msecs(now - meta_icsk->icsk_ack.lrcvtime); + info->mptcpi_last_ack_recv = jiffies_to_msecs(now - meta_tp->rcv_tstamp); + + info->mptcpi_total_retrans = meta_tp->total_retrans; + + info->mptcpi_bytes_acked = meta_tp->bytes_acked; + info->mptcpi_bytes_received = meta_tp->bytes_received; +} + +static void mptcp_get_sub_info(struct sock *sk, struct mptcp_sub_info *info) +{ + struct inet_sock *inet = inet_sk(sk); + + memset(info, 0, sizeof(*info)); + + if (sk->sk_family == AF_INET) { + info->src_v4.sin_family = AF_INET; + info->src_v4.sin_port = inet->inet_sport; + + info->src_v4.sin_addr.s_addr = inet->inet_rcv_saddr; + if (!info->src_v4.sin_addr.s_addr) + info->src_v4.sin_addr.s_addr = inet->inet_saddr; + + info->dst_v4.sin_family = AF_INET; + info->dst_v4.sin_port = inet->inet_dport; + info->dst_v4.sin_addr.s_addr = inet->inet_daddr; +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct ipv6_pinfo *np = inet6_sk(sk); + + info->src_v6.sin6_family = AF_INET6; + info->src_v6.sin6_port = inet->inet_sport; + + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + info->src_v6.sin6_addr = np->saddr; + else + info->src_v6.sin6_addr = sk->sk_v6_rcv_saddr; + + info->dst_v6.sin6_family = AF_INET6; + info->dst_v6.sin6_port = inet->inet_dport; + info->dst_v6.sin6_addr = sk->sk_v6_daddr; +#endif + } +} + +int mptcp_get_info(const struct sock *meta_sk, char __user *optval, int optlen) +{ + const struct tcp_sock *meta_tp = tcp_sk(meta_sk); + + struct mptcp_meta_info meta_info; + struct mptcp_info m_info; + + unsigned int info_len; + + /* Check again with the lock held */ + if (!mptcp(meta_tp)) + return -EINVAL; + + if (copy_from_user(&m_info, optval, optlen)) + return -EFAULT; + + if (m_info.meta_info) { + unsigned int len; + + __mptcp_get_info(meta_sk, &meta_info); + + /* Need to set this, if user thinks that tcp_info is bigger than ours */ + len = min_t(unsigned int, m_info.meta_len, sizeof(meta_info)); + m_info.meta_len = len; + + if (copy_to_user((void __user *)m_info.meta_info, &meta_info, len)) + return -EFAULT; + } + + /* Need to set this, if user thinks that tcp_info is bigger than ours */ + info_len = min_t(unsigned int, m_info.tcp_info_len, sizeof(struct tcp_info)); + m_info.tcp_info_len = info_len; + + if (m_info.initial) { + struct mptcp_cb *mpcb = meta_tp->mpcb; + + if (mpcb->master_sk) { + struct tcp_info info; + + tcp_get_info(mpcb->master_sk, &info, true); + if (copy_to_user((void __user *)m_info.initial, &info, info_len)) + return -EFAULT; + } else if (meta_tp->record_master_info && mpcb->master_info) { + if (copy_to_user((void __user *)m_info.initial, mpcb->master_info, info_len)) + return -EFAULT; + } else { + return 
meta_tp->record_master_info ? -ENOMEM : -EINVAL; + } + } + + if (m_info.subflows) { + unsigned int len, sub_len = 0; + struct mptcp_tcp_sock *mptcp; + char __user *ptr; + + ptr = (char __user *)m_info.subflows; + len = m_info.sub_len; + + mptcp_for_each_sub(meta_tp->mpcb, mptcp) { + struct tcp_info t_info; + unsigned int tmp_len; + + tcp_get_info(mptcp_to_sock(mptcp), &t_info, true); + + tmp_len = min_t(unsigned int, len, info_len); + len -= tmp_len; + + if (copy_to_user(ptr, &t_info, tmp_len)) + return -EFAULT; + + ptr += tmp_len; + sub_len += tmp_len; + + if (len == 0) + break; + } + + m_info.sub_len = sub_len; + } + + if (m_info.subflow_info) { + unsigned int len, sub_info_len, total_sub_info_len = 0; + struct mptcp_tcp_sock *mptcp; + char __user *ptr; + + ptr = (char __user *)m_info.subflow_info; + len = m_info.total_sub_info_len; + + sub_info_len = min_t(unsigned int, m_info.sub_info_len, + sizeof(struct mptcp_sub_info)); + m_info.sub_info_len = sub_info_len; + + mptcp_for_each_sub(meta_tp->mpcb, mptcp) { + struct mptcp_sub_info m_sub_info; + unsigned int tmp_len; + + mptcp_get_sub_info(mptcp_to_sock(mptcp), &m_sub_info); + + tmp_len = min_t(unsigned int, len, sub_info_len); + len -= tmp_len; + + if (copy_to_user(ptr, &m_sub_info, tmp_len)) + return -EFAULT; + + ptr += tmp_len; + total_sub_info_len += tmp_len; + + if (len == 0) + break; + } + + m_info.total_sub_info_len = total_sub_info_len; + } + + if (copy_to_user(optval, &m_info, optlen)) + return -EFAULT; + + return 0; +} + +void mptcp_clear_sk(struct sock *sk, int size) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* we do not want to clear tk_table field, because of RCU lookups */ + sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table.next)); + + size -= offsetof(struct tcp_sock, tk_table.pprev); + memset((char *)&tp->tk_table.pprev, 0, size); +} + +static const struct snmp_mib mptcp_snmp_list[] = { + SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE), + SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE), + SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK), + SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), + SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), + SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), + SNMP_MIB_ITEM("MPCapableRetransFallback", MPTCP_MIB_MPCAPABLERETRANSFALLBACK), + SNMP_MIB_ITEM("MPTCPCsumEnabled", MPTCP_MIB_CSUMENABLED), + SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), + SNMP_MIB_ITEM("MPFailRX", MPTCP_MIB_MPFAILRX), + SNMP_MIB_ITEM("MPCsumFail", MPTCP_MIB_CSUMFAIL), + SNMP_MIB_ITEM("MPFastcloseRX", MPTCP_MIB_FASTCLOSERX), + SNMP_MIB_ITEM("MPFastcloseTX", MPTCP_MIB_FASTCLOSETX), + SNMP_MIB_ITEM("MPFallbackAckSub", MPTCP_MIB_FBACKSUB), + SNMP_MIB_ITEM("MPFallbackAckInit", MPTCP_MIB_FBACKINIT), + SNMP_MIB_ITEM("MPFallbackDataSub", MPTCP_MIB_FBDATASUB), + SNMP_MIB_ITEM("MPFallbackDataInit", MPTCP_MIB_FBDATAINIT), + SNMP_MIB_ITEM("MPRemoveAddrSubDelete", MPTCP_MIB_REMADDRSUB), + SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), + SNMP_MIB_ITEM("MPJoinAlreadyFallenback", MPTCP_MIB_JOINFALLBACK), + SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX), + SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), + SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), + SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), + SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), + SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), + SNMP_MIB_ITEM("MPJoinAckMissing", 
MPTCP_MIB_JOINACKFAIL), + SNMP_MIB_ITEM("MPJoinAckRTO", MPTCP_MIB_JOINACKRTO), + SNMP_MIB_ITEM("MPJoinAckRexmit", MPTCP_MIB_JOINACKRXMIT), + SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW), + SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), + SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), + SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), + SNMP_MIB_ITEM("DSSTrimHead", MPTCP_MIB_DSSTRIMHEAD), + SNMP_MIB_ITEM("DSSSplitTail", MPTCP_MIB_DSSSPLITTAIL), + SNMP_MIB_ITEM("DSSPurgeOldSubSegs", MPTCP_MIB_PURGEOLD), + SNMP_MIB_ITEM("AddAddrRx", MPTCP_MIB_ADDADDRRX), + SNMP_MIB_ITEM("AddAddrTx", MPTCP_MIB_ADDADDRTX), + SNMP_MIB_ITEM("RemAddrRx", MPTCP_MIB_REMADDRRX), + SNMP_MIB_ITEM("RemAddrTx", MPTCP_MIB_REMADDRTX), + SNMP_MIB_SENTINEL +}; + +struct workqueue_struct *mptcp_wq; +EXPORT_SYMBOL(mptcp_wq); + +/* Output /proc/net/mptcp */ +static int mptcp_pm_seq_show(struct seq_file *seq, void *v) +{ + struct tcp_sock *meta_tp; + const struct net *net = seq->private; + int i, n = 0; + + seq_printf(seq, " sl loc_tok rem_tok v6 local_address remote_address st ns tx_queue rx_queue inode"); + seq_putc(seq, '\n'); + + for (i = 0; i < MPTCP_HASH_SIZE; i++) { + struct hlist_nulls_node *node; + rcu_read_lock(); + local_bh_disable(); + hlist_nulls_for_each_entry_rcu(meta_tp, node, + &tk_hashtable[i], tk_table) { + struct sock *meta_sk = (struct sock *)meta_tp; + struct inet_sock *isk = inet_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + + if (!mptcp(meta_tp) || !net_eq(net, sock_net(meta_sk))) + continue; + + if (!mpcb) + continue; + + if (capable(CAP_NET_ADMIN)) { + seq_printf(seq, "%4d: %04X %04X ", n++, + mpcb->mptcp_loc_token, + mpcb->mptcp_rem_token); + } else { + seq_printf(seq, "%4d: %04X %04X ", n++, -1, -1); + } + if (meta_sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(meta_sk)) { + seq_printf(seq, " 0 %08X:%04X %08X:%04X ", + isk->inet_rcv_saddr, + ntohs(isk->inet_sport), + isk->inet_daddr, + ntohs(isk->inet_dport)); +#if IS_ENABLED(CONFIG_IPV6) + } else if (meta_sk->sk_family == AF_INET6) { + struct in6_addr *src = &meta_sk->sk_v6_rcv_saddr; + struct in6_addr *dst = &meta_sk->sk_v6_daddr; + seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X", + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], + ntohs(isk->inet_sport), + dst->s6_addr32[0], dst->s6_addr32[1], + dst->s6_addr32[2], dst->s6_addr32[3], + ntohs(isk->inet_dport)); +#endif + } + + seq_printf(seq, " %02X %02X %08X:%08X %lu", + meta_sk->sk_state, mptcp_subflow_count(mpcb), + meta_tp->write_seq - meta_tp->snd_una, + max_t(int, meta_tp->rcv_nxt - + meta_tp->copied_seq, 0), + sock_i_ino(meta_sk)); + seq_putc(seq, '\n'); + } + + local_bh_enable(); + rcu_read_unlock(); + } + + return 0; +} + +static int mptcp_snmp_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = seq->private; + int i; + + for (i = 0; mptcp_snmp_list[i].name != NULL; i++) + seq_printf(seq, "%-32s\t%ld\n", mptcp_snmp_list[i].name, + snmp_fold_field(net->mptcp.mptcp_statistics, + mptcp_snmp_list[i].entry)); + + return 0; +} + +static int mptcp_pm_init_net(struct net *net) +{ + net->mptcp.mptcp_statistics = alloc_percpu(struct mptcp_mib); + if (!net->mptcp.mptcp_statistics) + goto out_mptcp_mibs; + +#ifdef CONFIG_PROC_FS + net->mptcp.proc_net_mptcp = proc_net_mkdir(net, "mptcp_net", net->proc_net); + if (!net->mptcp.proc_net_mptcp) + goto out_proc_net_mptcp; + if (!proc_create_net_single("mptcp", S_IRUGO, net->mptcp.proc_net_mptcp, + mptcp_pm_seq_show, NULL)) + goto out_mptcp_net_mptcp; + 
if (!proc_create_net_single("snmp", S_IRUGO, net->mptcp.proc_net_mptcp, + mptcp_snmp_seq_show, NULL)) + goto out_mptcp_net_snmp; +#endif + + return 0; + +#ifdef CONFIG_PROC_FS +out_mptcp_net_snmp: + remove_proc_entry("mptcp", net->mptcp.proc_net_mptcp); +out_mptcp_net_mptcp: + remove_proc_subtree("mptcp_net", net->proc_net); + net->mptcp.proc_net_mptcp = NULL; +out_proc_net_mptcp: + free_percpu(net->mptcp.mptcp_statistics); +#endif +out_mptcp_mibs: + return -ENOMEM; +} + +static void mptcp_pm_exit_net(struct net *net) +{ + remove_proc_entry("snmp", net->mptcp.proc_net_mptcp); + remove_proc_entry("mptcp", net->mptcp.proc_net_mptcp); + remove_proc_subtree("mptcp_net", net->proc_net); + free_percpu(net->mptcp.mptcp_statistics); +} + +static struct pernet_operations mptcp_pm_proc_ops = { + .init = mptcp_pm_init_net, + .exit = mptcp_pm_exit_net, +}; + +/* General initialization of mptcp */ +void __init mptcp_init(void) +{ + int i; + struct ctl_table_header *mptcp_sysctl; + + mptcp_sock_cache = kmem_cache_create("mptcp_sock", + sizeof(struct mptcp_tcp_sock), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!mptcp_sock_cache) + goto mptcp_sock_cache_failed; + + mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb), + 0, SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN, + NULL); + if (!mptcp_cb_cache) + goto mptcp_cb_cache_failed; + + mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw), + 0, SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN, + NULL); + if (!mptcp_tw_cache) + goto mptcp_tw_cache_failed; + + get_random_bytes(&mptcp_secret, sizeof(mptcp_secret)); + + mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); + if (!mptcp_wq) + goto alloc_workqueue_failed; + + for (i = 0; i < MPTCP_HASH_SIZE; i++) { + INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i); + INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i); + } + + spin_lock_init(&mptcp_tk_hashlock); + + if (register_pernet_subsys(&mptcp_pm_proc_ops)) + goto pernet_failed; + +#if IS_ENABLED(CONFIG_IPV6) + if (mptcp_pm_v6_init()) + goto mptcp_pm_v6_failed; +#endif + if (mptcp_pm_v4_init()) + goto mptcp_pm_v4_failed; + + mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table); + if (!mptcp_sysctl) + goto register_sysctl_failed; + + if (mptcp_register_path_manager(&mptcp_pm_default)) + goto register_pm_failed; + + if (mptcp_register_scheduler(&mptcp_sched_default)) + goto register_sched_failed; + + pr_info("MPTCP: Stable release v0.95.2"); + + mptcp_init_failed = false; + + return; + +register_sched_failed: + mptcp_unregister_path_manager(&mptcp_pm_default); +register_pm_failed: + unregister_net_sysctl_table(mptcp_sysctl); +register_sysctl_failed: + mptcp_pm_v4_undo(); +mptcp_pm_v4_failed: +#if IS_ENABLED(CONFIG_IPV6) + mptcp_pm_v6_undo(); +mptcp_pm_v6_failed: +#endif + unregister_pernet_subsys(&mptcp_pm_proc_ops); +pernet_failed: + destroy_workqueue(mptcp_wq); +alloc_workqueue_failed: + kmem_cache_destroy(mptcp_tw_cache); +mptcp_tw_cache_failed: + kmem_cache_destroy(mptcp_cb_cache); +mptcp_cb_cache_failed: + kmem_cache_destroy(mptcp_sock_cache); +mptcp_sock_cache_failed: + mptcp_init_failed = true; +} diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c new file mode 100644 index 000000000000..a3f1d8689cdd --- /dev/null +++ b/net/mptcp/mptcp_fullmesh.c @@ -0,0 +1,1941 @@ +#include +#include + +#include +#include + +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#endif + +enum { + MPTCP_EVENT_ADD = 1, + MPTCP_EVENT_DEL, + MPTCP_EVENT_MOD, +}; + +#define MPTCP_SUBFLOW_RETRY_DELAY 1000 + +/* Max 
number of local or remote addresses we can store. + * When changing, see the bitfield below in fullmesh_rem4/6. + */ +#define MPTCP_MAX_ADDR 8 + +struct fullmesh_rem4 { + u8 rem4_id; + u8 bitfield; + u8 retry_bitfield; + __be16 port; + struct in_addr addr; +}; + +struct fullmesh_rem6 { + u8 rem6_id; + u8 bitfield; + u8 retry_bitfield; + __be16 port; + struct in6_addr addr; +}; + +struct mptcp_loc_addr { + struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR]; + u8 loc4_bits; + u8 next_v4_index; + + struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR]; + u8 loc6_bits; + u8 next_v6_index; + struct rcu_head rcu; +}; + +struct mptcp_addr_event { + struct list_head list; + unsigned short family; + u8 code:7, + low_prio:1; + int if_idx; + union inet_addr addr; +}; + +struct fullmesh_priv { + /* Worker struct for subflow establishment */ + struct work_struct subflow_work; + /* Delayed worker, when the routing-tables are not yet ready. */ + struct delayed_work subflow_retry_work; + + /* Remote addresses */ + struct fullmesh_rem4 remaddr4[MPTCP_MAX_ADDR]; + struct fullmesh_rem6 remaddr6[MPTCP_MAX_ADDR]; + + struct mptcp_cb *mpcb; + + u16 remove_addrs; /* Addresses to remove */ + u8 announced_addrs_v4; /* IPv4 Addresses we did announce */ + u8 announced_addrs_v6; /* IPv6 Addresses we did announce */ + + u8 add_addr; /* Are we sending an add_addr? */ + + u8 rem4_bits; + u8 rem6_bits; + + /* Have we established the additional subflows for primary pair? */ + u8 first_pair:1; +}; + +struct mptcp_fm_ns { + struct mptcp_loc_addr __rcu *local; + spinlock_t local_lock; /* Protecting the above pointer */ + struct list_head events; + struct delayed_work address_worker; + + struct net *net; +}; + +static int num_subflows __read_mostly = 1; +module_param(num_subflows, int, 0644); +MODULE_PARM_DESC(num_subflows, "choose the number of subflows per pair of IP addresses of MPTCP connection"); + +static int create_on_err __read_mostly; +module_param(create_on_err, int, 0644); +MODULE_PARM_DESC(create_on_err, "recreate the subflow upon a timeout"); + +static struct mptcp_pm_ops full_mesh __read_mostly; + +static void full_mesh_create_subflows(struct sock *meta_sk); + +static struct mptcp_fm_ns *fm_get_ns(const struct net *net) +{ + return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH]; +} + +static struct fullmesh_priv *fullmesh_get_priv(const struct mptcp_cb *mpcb) +{ + return (struct fullmesh_priv *)&mpcb->mptcp_pm[0]; +} + +/* Find the first free index in the bitfield */ +static int __mptcp_find_free_index(u8 bitfield, u8 base) +{ + int i; + + /* There are anyways no free bits... */ + if (bitfield == 0xff) + goto exit; + + i = ffs(~(bitfield >> base)) - 1; + if (i < 0) + goto exit; + + /* No free bits when starting at base, try from 0 on */ + if (i + base >= sizeof(bitfield) * 8) + return __mptcp_find_free_index(bitfield, 0); + + return i + base; +exit: + return -1; +} + +static int mptcp_find_free_index(u8 bitfield) +{ + return __mptcp_find_free_index(bitfield, 0); +} + +static void mptcp_addv4_raddr(struct mptcp_cb *mpcb, + const struct in_addr *addr, + __be16 port, u8 id) +{ + int i; + struct fullmesh_rem4 *rem4; + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + + mptcp_for_each_bit_set(fmp->rem4_bits, i) { + rem4 = &fmp->remaddr4[i]; + + /* Address is already in the list --- continue */ + if (rem4->rem4_id == id && + rem4->addr.s_addr == addr->s_addr && rem4->port == port) + return; + + /* This may be the case, when the peer is behind a NAT. 
He is + * trying to JOIN, thus sending the JOIN with a certain ID. + * However the src_addr of the IP-packet has been changed. We + * update the addr in the list, because this is the address as + * OUR BOX sees it. + */ + if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) { + /* update the address */ + mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n", + __func__, &rem4->addr.s_addr, + &addr->s_addr, id); + rem4->addr.s_addr = addr->s_addr; + rem4->port = port; + mpcb->list_rcvd = 1; + return; + } + } + + i = mptcp_find_free_index(fmp->rem4_bits); + /* Do we have already the maximum number of local/remote addresses? */ + if (i < 0) { + mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n", + __func__, MPTCP_MAX_ADDR, &addr->s_addr); + return; + } + + rem4 = &fmp->remaddr4[i]; + + /* Address is not known yet, store it */ + rem4->addr.s_addr = addr->s_addr; + rem4->port = port; + rem4->bitfield = 0; + rem4->retry_bitfield = 0; + rem4->rem4_id = id; + mpcb->list_rcvd = 1; + fmp->rem4_bits |= (1 << i); + + return; +} + +static void mptcp_addv6_raddr(struct mptcp_cb *mpcb, + const struct in6_addr *addr, + __be16 port, u8 id) +{ + int i; + struct fullmesh_rem6 *rem6; + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + + mptcp_for_each_bit_set(fmp->rem6_bits, i) { + rem6 = &fmp->remaddr6[i]; + + /* Address is already in the list --- continue */ + if (rem6->rem6_id == id && + ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port) + return; + + /* This may be the case, when the peer is behind a NAT. He is + * trying to JOIN, thus sending the JOIN with a certain ID. + * However the src_addr of the IP-packet has been changed. We + * update the addr in the list, because this is the address as + * OUR BOX sees it. + */ + if (rem6->rem6_id == id) { + /* update the address */ + mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n", + __func__, &rem6->addr, addr, id); + rem6->addr = *addr; + rem6->port = port; + mpcb->list_rcvd = 1; + return; + } + } + + i = mptcp_find_free_index(fmp->rem6_bits); + /* Do we have already the maximum number of local/remote addresses? 
*/ + if (i < 0) { + mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n", + __func__, MPTCP_MAX_ADDR, addr); + return; + } + + rem6 = &fmp->remaddr6[i]; + + /* Address is not known yet, store it */ + rem6->addr = *addr; + rem6->port = port; + rem6->bitfield = 0; + rem6->retry_bitfield = 0; + rem6->rem6_id = id; + mpcb->list_rcvd = 1; + fmp->rem6_bits |= (1 << i); + + return; +} + +static void mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id) +{ + int i; + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + + mptcp_for_each_bit_set(fmp->rem4_bits, i) { + if (fmp->remaddr4[i].rem4_id == id) { + /* remove address from bitfield */ + fmp->rem4_bits &= ~(1 << i); + + break; + } + } +} + +static void mptcp_v6_rem_raddress(const struct mptcp_cb *mpcb, u8 id) +{ + int i; + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + + mptcp_for_each_bit_set(fmp->rem6_bits, i) { + if (fmp->remaddr6[i].rem6_id == id) { + /* remove address from bitfield */ + fmp->rem6_bits &= ~(1 << i); + + break; + } + } +} + +/* Sets the bitfield of the remote-address field */ +static void mptcp_v4_set_init_addr_bit(const struct mptcp_cb *mpcb, + const struct in_addr *addr, u8 index) +{ + int i; + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + + mptcp_for_each_bit_set(fmp->rem4_bits, i) { + if (fmp->remaddr4[i].addr.s_addr == addr->s_addr) { + fmp->remaddr4[i].bitfield |= (1 << index); + return; + } + } +} + +/* Sets the bitfield of the remote-address field */ +static void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb, + const struct in6_addr *addr, u8 index) +{ + int i; + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + + mptcp_for_each_bit_set(fmp->rem6_bits, i) { + if (ipv6_addr_equal(&fmp->remaddr6[i].addr, addr)) { + fmp->remaddr6[i].bitfield |= (1 << index); + return; + } + } +} + +static void mptcp_set_init_addr_bit(struct mptcp_cb *mpcb, + const union inet_addr *addr, + sa_family_t family, u8 id) +{ + if (family == AF_INET) + mptcp_v4_set_init_addr_bit(mpcb, &addr->in, id); + else + mptcp_v6_set_init_addr_bit(mpcb, &addr->in6, id); +} + +static void mptcp_v4_subflows(struct sock *meta_sk, + const struct mptcp_loc4 *loc, + struct mptcp_rem4 *rem) +{ + int i; + + for (i = 1; i < num_subflows; i++) + mptcp_init4_subsockets(meta_sk, loc, rem); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void mptcp_v6_subflows(struct sock *meta_sk, + const struct mptcp_loc6 *loc, + struct mptcp_rem6 *rem) +{ + int i; + + for (i = 1; i < num_subflows; i++) + mptcp_init6_subsockets(meta_sk, loc, rem); +} +#endif + +static void retry_subflow_worker(struct work_struct *work) +{ + struct delayed_work *delayed_work = container_of(work, + struct delayed_work, + work); + struct fullmesh_priv *fmp = container_of(delayed_work, + struct fullmesh_priv, + subflow_retry_work); + struct mptcp_cb *mpcb = fmp->mpcb; + struct sock *meta_sk = mpcb->meta_sk; + struct mptcp_loc_addr *mptcp_local; + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); + int iter = 0, i; + + /* We need a local (stable) copy of the address-list. Really, it is not + * such a big deal, if the address-list is not 100% up-to-date. 
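+ * The copy also lets us leave the RCU-BH read-side section before
+ * taking mpcb_mutex and the meta-level socket lock below, both of
+ * which may sleep.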
+ */ + rcu_read_lock_bh(); + mptcp_local = rcu_dereference_bh(fm_ns->local); + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC); + rcu_read_unlock_bh(); + + if (!mptcp_local) + return; + +next_subflow: + if (iter) { + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + + cond_resched(); + } + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + if (!mptcp(tcp_sk(meta_sk))) + goto exit; + + iter++; + + if (sock_flag(meta_sk, SOCK_DEAD)) + goto exit; + + mptcp_for_each_bit_set(fmp->rem4_bits, i) { + struct fullmesh_rem4 *rem = &fmp->remaddr4[i]; + /* Do we need to retry establishing a subflow ? */ + if (rem->retry_bitfield) { + int i = mptcp_find_free_index(~rem->retry_bitfield); + struct mptcp_rem4 rem4; + + rem->bitfield |= (1 << i); + rem->retry_bitfield &= ~(1 << i); + + rem4.addr = rem->addr; + rem4.port = rem->port; + rem4.rem4_id = rem->rem4_id; + + mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], &rem4); + mptcp_v4_subflows(meta_sk, + &mptcp_local->locaddr4[i], + &rem4); + goto next_subflow; + } + } + +#if IS_ENABLED(CONFIG_IPV6) + mptcp_for_each_bit_set(fmp->rem6_bits, i) { + struct fullmesh_rem6 *rem = &fmp->remaddr6[i]; + + /* Do we need to retry establishing a subflow ? */ + if (rem->retry_bitfield) { + int i = mptcp_find_free_index(~rem->retry_bitfield); + struct mptcp_rem6 rem6; + + rem->bitfield |= (1 << i); + rem->retry_bitfield &= ~(1 << i); + + rem6.addr = rem->addr; + rem6.port = rem->port; + rem6.rem6_id = rem->rem6_id; + + mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], &rem6); + mptcp_v6_subflows(meta_sk, + &mptcp_local->locaddr6[i], + &rem6); + goto next_subflow; + } + } +#endif + +exit: + kfree(mptcp_local); + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + mptcp_mpcb_put(mpcb); + sock_put(meta_sk); +} + +/** + * Create all new subflows, by doing calls to mptcp_initX_subsockets + * + * This function uses a goto next_subflow, to allow releasing the lock between + * new subflows and giving other processes a chance to do some work on the + * socket and potentially finishing the communication. + **/ +static void create_subflow_worker(struct work_struct *work) +{ + struct fullmesh_priv *fmp = container_of(work, struct fullmesh_priv, + subflow_work); + struct mptcp_cb *mpcb = fmp->mpcb; + struct sock *meta_sk = mpcb->meta_sk; + struct mptcp_loc_addr *mptcp_local; + const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); + int iter = 0, retry = 0; + int i; + + /* We need a local (stable) copy of the address-list. Really, it is not + * such a big deal, if the address-list is not 100% up-to-date. 
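+ * (The snapshot is duplicated with kmemdup(GFP_ATOMIC) because we
+ * are still inside the RCU-BH read-side critical section at that
+ * point.)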
+ */ + rcu_read_lock_bh(); + mptcp_local = rcu_dereference_bh(fm_ns->local); + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC); + rcu_read_unlock_bh(); + + if (!mptcp_local) + return; + +next_subflow: + if (iter) { + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + + cond_resched(); + } + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + if (sock_flag(meta_sk, SOCK_DEAD) || !mptcp(tcp_sk(meta_sk))) + goto exit; + + if (mpcb->master_sk && + !tcp_sk(mpcb->master_sk)->mptcp->fully_established) + goto exit; + + /* Create the additional subflows for the first pair */ + if (fmp->first_pair == 0 && mpcb->master_sk) { + struct mptcp_loc4 loc; + struct mptcp_rem4 rem; + + loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr; + loc.loc4_id = 0; + loc.low_prio = 0; + loc.if_idx = mpcb->master_sk->sk_bound_dev_if; + + rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr; + rem.port = inet_sk(meta_sk)->inet_dport; + rem.rem4_id = 0; /* Default 0 */ + + mptcp_v4_subflows(meta_sk, &loc, &rem); + + fmp->first_pair = 1; + } + iter++; + + mptcp_for_each_bit_set(fmp->rem4_bits, i) { + struct fullmesh_rem4 *rem; + u8 remaining_bits; + + rem = &fmp->remaddr4[i]; + remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits; + + /* Are there still combinations to handle? */ + if (remaining_bits) { + int i = mptcp_find_free_index(~remaining_bits); + struct mptcp_rem4 rem4; + + rem->bitfield |= (1 << i); + + rem4.addr = rem->addr; + rem4.port = rem->port; + rem4.rem4_id = rem->rem4_id; + + /* If a route is not yet available then retry once */ + if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], + &rem4) == -ENETUNREACH) + retry = rem->retry_bitfield |= (1 << i); + else + mptcp_v4_subflows(meta_sk, + &mptcp_local->locaddr4[i], + &rem4); + goto next_subflow; + } + } + +#if IS_ENABLED(CONFIG_IPV6) + if (fmp->first_pair == 0 && mpcb->master_sk) { + struct mptcp_loc6 loc; + struct mptcp_rem6 rem; + + loc.addr = inet6_sk(meta_sk)->saddr; + loc.loc6_id = 0; + loc.low_prio = 0; + loc.if_idx = mpcb->master_sk->sk_bound_dev_if; + + rem.addr = meta_sk->sk_v6_daddr; + rem.port = inet_sk(meta_sk)->inet_dport; + rem.rem6_id = 0; /* Default 0 */ + + mptcp_v6_subflows(meta_sk, &loc, &rem); + + fmp->first_pair = 1; + } + mptcp_for_each_bit_set(fmp->rem6_bits, i) { + struct fullmesh_rem6 *rem; + u8 remaining_bits; + + rem = &fmp->remaddr6[i]; + remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits; + + /* Are there still combinations to handle? 
*/ + if (remaining_bits) { + int i = mptcp_find_free_index(~remaining_bits); + struct mptcp_rem6 rem6; + + rem->bitfield |= (1 << i); + + rem6.addr = rem->addr; + rem6.port = rem->port; + rem6.rem6_id = rem->rem6_id; + + /* If a route is not yet available then retry once */ + if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], + &rem6) == -ENETUNREACH) + retry = rem->retry_bitfield |= (1 << i); + else + mptcp_v6_subflows(meta_sk, + &mptcp_local->locaddr6[i], + &rem6); + goto next_subflow; + } + } +#endif + + if (retry && !delayed_work_pending(&fmp->subflow_retry_work)) { + sock_hold(meta_sk); + refcount_inc(&mpcb->mpcb_refcnt); + queue_delayed_work(mptcp_wq, &fmp->subflow_retry_work, + msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY)); + } + +exit: + kfree(mptcp_local); + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + mptcp_mpcb_put(mpcb); + sock_put(meta_sk); +} + +static void announce_remove_addr(u8 addr_id, struct sock *meta_sk) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + struct sock *sk = mptcp_select_ack_sock(meta_sk); + + fmp->remove_addrs |= (1 << addr_id); + mpcb->addr_signal = 1; + + if (sk) + tcp_send_ack(sk); +} + +static void update_addr_bitfields(struct sock *meta_sk, + const struct mptcp_loc_addr *mptcp_local) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + int i; + + /* The bits in announced_addrs_* always match with loc*_bits. So, a + * simple & operation unsets the correct bits, because these go from + * announced to non-announced + */ + fmp->announced_addrs_v4 &= mptcp_local->loc4_bits; + + mptcp_for_each_bit_set(fmp->rem4_bits, i) { + fmp->remaddr4[i].bitfield &= mptcp_local->loc4_bits; + fmp->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits; + } + + fmp->announced_addrs_v6 &= mptcp_local->loc6_bits; + + mptcp_for_each_bit_set(fmp->rem6_bits, i) { + fmp->remaddr6[i].bitfield &= mptcp_local->loc6_bits; + fmp->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits; + } +} + +static int mptcp_find_address(const struct mptcp_loc_addr *mptcp_local, + sa_family_t family, const union inet_addr *addr, + int if_idx) +{ + int i; + u8 loc_bits; + bool found = false; + + if (family == AF_INET) + loc_bits = mptcp_local->loc4_bits; + else + loc_bits = mptcp_local->loc6_bits; + + mptcp_for_each_bit_set(loc_bits, i) { + if (family == AF_INET && + (!if_idx || mptcp_local->locaddr4[i].if_idx == if_idx) && + mptcp_local->locaddr4[i].addr.s_addr == addr->in.s_addr) { + found = true; + break; + } + if (family == AF_INET6 && + (!if_idx || mptcp_local->locaddr6[i].if_idx == if_idx) && + ipv6_addr_equal(&mptcp_local->locaddr6[i].addr, + &addr->in6)) { + found = true; + break; + } + } + + if (!found) + return -1; + + return i; +} + +static int mptcp_find_address_transp(const struct mptcp_loc_addr *mptcp_local, + sa_family_t family, int if_idx) +{ + bool found = false; + u8 loc_bits; + int i; + + if (family == AF_INET) + loc_bits = mptcp_local->loc4_bits; + else + loc_bits = mptcp_local->loc6_bits; + + mptcp_for_each_bit_set(loc_bits, i) { + if (family == AF_INET && + (!if_idx || mptcp_local->locaddr4[i].if_idx == if_idx)) { + found = true; + break; + } + if (family == AF_INET6 && + (!if_idx || mptcp_local->locaddr6[i].if_idx == if_idx)) { + found = true; + break; + } + } + + if (!found) + return -1; + + return i; +} + +static void mptcp_address_worker(struct work_struct *work) +{ + const struct delayed_work *delayed_work = container_of(work, + struct 
delayed_work, + work); + struct mptcp_fm_ns *fm_ns = container_of(delayed_work, + struct mptcp_fm_ns, + address_worker); + struct net *net = fm_ns->net; + struct mptcp_addr_event *event = NULL; + struct mptcp_loc_addr *mptcp_local, *old; + int i, id = -1; /* id is used in the socket-code on a delete-event */ + bool success; /* Used to indicate if we succeeded handling the event */ + +next_event: + success = false; + kfree(event); + + /* First, let's dequeue an event from our event-list */ + rcu_read_lock_bh(); + spin_lock(&fm_ns->local_lock); + + event = list_first_entry_or_null(&fm_ns->events, + struct mptcp_addr_event, list); + if (!event) { + spin_unlock(&fm_ns->local_lock); + rcu_read_unlock_bh(); + return; + } + + list_del(&event->list); + + mptcp_local = rcu_dereference_bh(fm_ns->local); + + if (event->code == MPTCP_EVENT_DEL) { + id = mptcp_find_address(mptcp_local, event->family, + &event->addr, event->if_idx); + + /* Not in the list - so we don't care */ + if (id < 0) { + mptcp_debug("%s could not find id\n", __func__); + goto duno; + } + + old = mptcp_local; + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), + GFP_ATOMIC); + if (!mptcp_local) + goto duno; + + if (event->family == AF_INET) + mptcp_local->loc4_bits &= ~(1 << id); + else + mptcp_local->loc6_bits &= ~(1 << id); + + rcu_assign_pointer(fm_ns->local, mptcp_local); + kfree_rcu(old, rcu); + } else { + int i = mptcp_find_address(mptcp_local, event->family, + &event->addr, event->if_idx); + int j = i; + + if (j < 0) { + /* Not in the list, so we have to find an empty slot */ + if (event->family == AF_INET) + i = __mptcp_find_free_index(mptcp_local->loc4_bits, + mptcp_local->next_v4_index); + if (event->family == AF_INET6) + i = __mptcp_find_free_index(mptcp_local->loc6_bits, + mptcp_local->next_v6_index); + + if (i < 0) { + mptcp_debug("%s no more space\n", __func__); + goto duno; + } + + /* It might have been a MOD-event. 
*/ + event->code = MPTCP_EVENT_ADD; + } else { + /* Let's check if anything changes */ + if (event->family == AF_INET && + event->low_prio == mptcp_local->locaddr4[i].low_prio) + goto duno; + + if (event->family == AF_INET6 && + event->low_prio == mptcp_local->locaddr6[i].low_prio) + goto duno; + } + + old = mptcp_local; + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), + GFP_ATOMIC); + if (!mptcp_local) + goto duno; + + if (event->family == AF_INET) { + mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr; + mptcp_local->locaddr4[i].loc4_id = i + 1; + mptcp_local->locaddr4[i].low_prio = event->low_prio; + mptcp_local->locaddr4[i].if_idx = event->if_idx; + + mptcp_debug("%s updated IP %pI4 on ifidx %u prio %u id %u\n", + __func__, &event->addr.in.s_addr, + event->if_idx, event->low_prio, i + 1); + } else { + mptcp_local->locaddr6[i].addr = event->addr.in6; + mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR; + mptcp_local->locaddr6[i].low_prio = event->low_prio; + mptcp_local->locaddr6[i].if_idx = event->if_idx; + + mptcp_debug("%s updated IP %pI6 on ifidx %u prio %u id %u\n", + __func__, &event->addr.in6, + event->if_idx, event->low_prio, i + MPTCP_MAX_ADDR); + } + + if (j < 0) { + if (event->family == AF_INET) { + mptcp_local->loc4_bits |= (1 << i); + mptcp_local->next_v4_index = i + 1; + } else { + mptcp_local->loc6_bits |= (1 << i); + mptcp_local->next_v6_index = i + 1; + } + } + + rcu_assign_pointer(fm_ns->local, mptcp_local); + kfree_rcu(old, rcu); + } + success = true; + +duno: + spin_unlock(&fm_ns->local_lock); + rcu_read_unlock_bh(); + + if (!success) + goto next_event; + + /* Now we iterate over the MPTCP-sockets and apply the event. */ + for (i = 0; i < MPTCP_HASH_SIZE; i++) { + const struct hlist_nulls_node *node; + struct tcp_sock *meta_tp; + + rcu_read_lock_bh(); + hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i], + tk_table) { + struct sock *meta_sk = (struct sock *)meta_tp, *sk; + bool meta_v4 = meta_sk->sk_family == AF_INET; + struct mptcp_cb *mpcb; + + if (sock_net(meta_sk) != net) + continue; + + if (meta_v4) { + /* skip IPv6 events if meta is IPv4 */ + if (event->family == AF_INET6) + continue; + } else if (event->family == AF_INET && meta_sk->sk_ipv6only) { + /* skip IPv4 events if IPV6_V6ONLY is set */ + continue; + } + + if (unlikely(!refcount_inc_not_zero(&meta_sk->sk_refcnt))) + continue; + + bh_lock_sock(meta_sk); + + mpcb = meta_tp->mpcb; + if (!mpcb) + goto next; + + if (!mptcp(meta_tp) || !is_meta_sk(meta_sk) || + mptcp_in_infinite_mapping_weak(mpcb)) + goto next; + + /* May be that the pm has changed in-between */ + if (mpcb->pm_ops != &full_mesh) + goto next; + + if (sock_owned_by_user(meta_sk)) { + if (!test_and_set_bit(MPTCP_PATH_MANAGER_DEFERRED, + &meta_sk->sk_tsq_flags)) + sock_hold(meta_sk); + + goto next; + } + + if (event->code == MPTCP_EVENT_ADD) { + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + + fmp->add_addr++; + mpcb->addr_signal = 1; + + sk = mptcp_select_ack_sock(meta_sk); + if (sk) + tcp_send_ack(sk); + + full_mesh_create_subflows(meta_sk); + } + + if (event->code == MPTCP_EVENT_DEL) { + struct mptcp_tcp_sock *mptcp; + struct mptcp_loc_addr *mptcp_local; + struct hlist_node *tmp; + bool found = false; + + mptcp_local = rcu_dereference_bh(fm_ns->local); + + /* In any case, we need to update our bitfields */ + if (id >= 0) + update_addr_bitfields(meta_sk, mptcp_local); + + /* Look for the socket and remove him */ + mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { + struct sock *sk = mptcp_to_sock(mptcp); + 
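+ /* Only subflows that use the same family as the removed address
+ * and are bound to that source address are affected; skip all
+ * others.
+ */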
+ if ((event->family == AF_INET6 && + (sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(sk))) || + (event->family == AF_INET && + (sk->sk_family == AF_INET6 && + !mptcp_v6_is_v4_mapped(sk)))) + continue; + + if (event->family == AF_INET && + (sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(sk)) && + inet_sk(sk)->inet_saddr != event->addr.in.s_addr) + continue; + + if (event->family == AF_INET6 && + sk->sk_family == AF_INET6 && + !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) + continue; + + /* Reinject, so that pf = 1 and so we + * won't select this one as the + * ack-sock. + */ + mptcp_reinject_data(sk, 0); + + /* We announce the removal of this id */ + announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, meta_sk); + + mptcp_sub_force_close(sk); + found = true; + } + + if (found) + goto next; + + /* The id may have been given by the event, + * matching on a local address. And it may not + * have matched on one of the above sockets, + * because the client never created a subflow. + * So, we have to finally remove it here. + */ + if (id >= 0) { + u8 loc_id = id + + (event->family == AF_INET ? 1 : MPTCP_MAX_ADDR); + announce_remove_addr(loc_id, meta_sk); + } + } + + if (event->code == MPTCP_EVENT_MOD) { + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + struct tcp_sock *tp = tcp_sk(sk); + if (event->family == AF_INET && + (sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(sk)) && + inet_sk(sk)->inet_saddr == event->addr.in.s_addr) { + if (event->low_prio != tp->mptcp->low_prio) { + tp->mptcp->send_mp_prio = 1; + tp->mptcp->low_prio = event->low_prio; + + tcp_send_ack(sk); + } + } + + if (event->family == AF_INET6 && + sk->sk_family == AF_INET6 && + !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) { + if (event->low_prio != tp->mptcp->low_prio) { + tp->mptcp->send_mp_prio = 1; + tp->mptcp->low_prio = event->low_prio; + + tcp_send_ack(sk); + } + } + } + } +next: + bh_unlock_sock(meta_sk); + sock_put(meta_sk); + } + rcu_read_unlock_bh(); + } + goto next_event; +} + +static struct mptcp_addr_event *lookup_similar_event(const struct net *net, + const struct mptcp_addr_event *event) +{ + struct mptcp_addr_event *eventq; + struct mptcp_fm_ns *fm_ns = fm_get_ns(net); + + list_for_each_entry(eventq, &fm_ns->events, list) { + if (eventq->family != event->family) + continue; + if (eventq->if_idx != event->if_idx) + continue; + if (event->family == AF_INET) { + if (eventq->addr.in.s_addr == event->addr.in.s_addr) + return eventq; + } else { + if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6)) + return eventq; + } + } + return NULL; +} + +/* We already hold the net-namespace MPTCP-lock */ +static void add_pm_event(struct net *net, const struct mptcp_addr_event *event) +{ + struct mptcp_addr_event *eventq = lookup_similar_event(net, event); + struct mptcp_fm_ns *fm_ns = fm_get_ns(net); + + if (eventq) { + switch (event->code) { + case MPTCP_EVENT_DEL: + mptcp_debug("%s del old_code %u\n", __func__, eventq->code); + list_del(&eventq->list); + kfree(eventq); + break; + case MPTCP_EVENT_ADD: + mptcp_debug("%s add old_code %u\n", __func__, eventq->code); + eventq->low_prio = event->low_prio; + eventq->code = MPTCP_EVENT_ADD; + return; + case MPTCP_EVENT_MOD: + mptcp_debug("%s mod old_code %u\n", __func__, eventq->code); + eventq->low_prio = event->low_prio; + eventq->code = MPTCP_EVENT_MOD; + return; + } + } + + /* OK, we have to add the new address to the wait queue */ + eventq = kmemdup(event, sizeof(struct 
mptcp_addr_event), GFP_ATOMIC); + if (!eventq) + return; + + list_add_tail(&eventq->list, &fm_ns->events); + + /* Create work-queue */ + if (!delayed_work_pending(&fm_ns->address_worker)) + queue_delayed_work(mptcp_wq, &fm_ns->address_worker, + msecs_to_jiffies(500)); +} + +static void addr4_event_handler(const struct in_ifaddr *ifa, unsigned long event, + struct net *net) +{ + const struct net_device *netdev = ifa->ifa_dev->dev; + struct mptcp_fm_ns *fm_ns = fm_get_ns(net); + struct mptcp_addr_event mpevent; + + if (ifa->ifa_scope > RT_SCOPE_LINK || + ipv4_is_loopback(ifa->ifa_local)) + return; + + spin_lock_bh(&fm_ns->local_lock); + + mpevent.family = AF_INET; + mpevent.addr.in.s_addr = ifa->ifa_local; + mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0; + mpevent.if_idx = netdev->ifindex; + + if (event == NETDEV_DOWN || !netif_running(netdev) || + (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP)) + mpevent.code = MPTCP_EVENT_DEL; + else if (event == NETDEV_UP) + mpevent.code = MPTCP_EVENT_ADD; + else if (event == NETDEV_CHANGE) + mpevent.code = MPTCP_EVENT_MOD; + + mptcp_debug("%s created event for %pI4, code %u prio %u idx %u\n", __func__, + &ifa->ifa_local, mpevent.code, mpevent.low_prio, mpevent.if_idx); + add_pm_event(net, &mpevent); + + spin_unlock_bh(&fm_ns->local_lock); + return; +} + +/* React on IPv4-addr add/rem-events */ +static int mptcp_pm_inetaddr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + const struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct net *net = dev_net(ifa->ifa_dev->dev); + + if (!(event == NETDEV_UP || event == NETDEV_DOWN || + event == NETDEV_CHANGE)) + return NOTIFY_DONE; + + addr4_event_handler(ifa, event, net); + + return NOTIFY_DONE; +} + +static struct notifier_block mptcp_pm_inetaddr_notifier = { + .notifier_call = mptcp_pm_inetaddr_event, +}; + +#if IS_ENABLED(CONFIG_IPV6) + +static int inet6_addr_event(struct notifier_block *this, unsigned long event, + void *ptr); + +static void addr6_event_handler(const struct inet6_ifaddr *ifa, unsigned long event, + struct net *net) +{ + const struct net_device *netdev = ifa->idev->dev; + int addr_type = ipv6_addr_type(&ifa->addr); + struct mptcp_fm_ns *fm_ns = fm_get_ns(net); + struct mptcp_addr_event mpevent; + + if (ifa->scope > RT_SCOPE_LINK || + addr_type == IPV6_ADDR_ANY || + (addr_type & IPV6_ADDR_LOOPBACK) || + (addr_type & IPV6_ADDR_LINKLOCAL)) + return; + + spin_lock_bh(&fm_ns->local_lock); + + mpevent.family = AF_INET6; + mpevent.addr.in6 = ifa->addr; + mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 
1 : 0; + mpevent.if_idx = netdev->ifindex; + + if (event == NETDEV_DOWN || !netif_running(netdev) || + (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP)) + mpevent.code = MPTCP_EVENT_DEL; + else if (event == NETDEV_UP) + mpevent.code = MPTCP_EVENT_ADD; + else if (event == NETDEV_CHANGE) + mpevent.code = MPTCP_EVENT_MOD; + + mptcp_debug("%s created event for %pI6, code %u prio %u idx %u\n", __func__, + &ifa->addr, mpevent.code, mpevent.low_prio, mpevent.if_idx); + add_pm_event(net, &mpevent); + + spin_unlock_bh(&fm_ns->local_lock); + return; +} + +/* React on IPv6-addr add/rem-events */ +static int inet6_addr_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr; + struct net *net = dev_net(ifa6->idev->dev); + + if (!(event == NETDEV_UP || event == NETDEV_DOWN || + event == NETDEV_CHANGE)) + return NOTIFY_DONE; + + addr6_event_handler(ifa6, event, net); + + return NOTIFY_DONE; +} + +static struct notifier_block inet6_addr_notifier = { + .notifier_call = inet6_addr_event, +}; + +#endif + +/* React on ifup/down-events */ +static int netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct in_device *in_dev; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev; +#endif + + if (!(event == NETDEV_UP || event == NETDEV_DOWN || + event == NETDEV_CHANGE)) + return NOTIFY_DONE; + + rcu_read_lock(); + in_dev = __in_dev_get_rtnl(dev); + + if (in_dev) { + for_ifa(in_dev) { + mptcp_pm_inetaddr_event(NULL, event, ifa); + } endfor_ifa(in_dev); + } + +#if IS_ENABLED(CONFIG_IPV6) + in6_dev = __in6_dev_get(dev); + + if (in6_dev) { + struct inet6_ifaddr *ifa6; + list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) + inet6_addr_event(NULL, event, ifa6); + } +#endif + + rcu_read_unlock(); + return NOTIFY_DONE; +} + +static struct notifier_block mptcp_pm_netdev_notifier = { + .notifier_call = netdev_event, +}; + +static void full_mesh_add_raddr(struct mptcp_cb *mpcb, + const union inet_addr *addr, + sa_family_t family, __be16 port, u8 id) +{ + if (family == AF_INET) + mptcp_addv4_raddr(mpcb, &addr->in, port, id); + else + mptcp_addv6_raddr(mpcb, &addr->in6, port, id); +} + +static void full_mesh_new_session(const struct sock *meta_sk) +{ + struct mptcp_loc_addr *mptcp_local; + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); + struct tcp_sock *master_tp = tcp_sk(mpcb->master_sk); + int i, index, if_idx = 0; + union inet_addr saddr, daddr; + sa_family_t family = AF_INET; + bool meta_v4 = meta_sk->sk_family == AF_INET; + + /* Init local variables necessary for the rest */ + if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk)) { + saddr.ip = inet_sk(meta_sk)->inet_saddr; + daddr.ip = inet_sk(meta_sk)->inet_daddr; + if_idx = mpcb->master_sk->sk_bound_dev_if; + family = AF_INET; +#if IS_ENABLED(CONFIG_IPV6) + } else { + saddr.in6 = inet6_sk(meta_sk)->saddr; + daddr.in6 = meta_sk->sk_v6_daddr; + if_idx = mpcb->master_sk->sk_bound_dev_if; + family = AF_INET6; +#endif + } + + if (inet_sk(meta_sk)->transparent) + if_idx = inet_sk(meta_sk)->rx_dst_ifindex; + + rcu_read_lock_bh(); + mptcp_local = rcu_dereference(fm_ns->local); + + if (inet_sk(meta_sk)->transparent) + index = mptcp_find_address_transp(mptcp_local, family, if_idx); + else + index = mptcp_find_address(mptcp_local, family, &saddr, 
if_idx); + if (index < 0) + goto fallback; + + if (family == AF_INET) + master_tp->mptcp->low_prio = mptcp_local->locaddr4[index].low_prio; + else + master_tp->mptcp->low_prio = mptcp_local->locaddr6[index].low_prio; + master_tp->mptcp->send_mp_prio = master_tp->mptcp->low_prio; + + full_mesh_add_raddr(mpcb, &daddr, family, 0, 0); + mptcp_set_init_addr_bit(mpcb, &daddr, family, index); + + /* Initialize workqueue-struct */ + INIT_WORK(&fmp->subflow_work, create_subflow_worker); + INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker); + fmp->mpcb = mpcb; + + if (!meta_v4 && meta_sk->sk_ipv6only) + goto skip_ipv4; + + /* Look for the address among the local addresses */ + mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { + __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr; + + /* We do not need to announce the initial subflow's address again */ + if (family == AF_INET && + (!if_idx || mptcp_local->locaddr4[i].if_idx == if_idx) && + saddr.ip == ifa_address) + continue; + + fmp->add_addr++; + mpcb->addr_signal = 1; + } + +skip_ipv4: +#if IS_ENABLED(CONFIG_IPV6) + /* skip IPv6 addresses if meta-socket is IPv4 */ + if (meta_v4) + goto skip_ipv6; + + mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { + const struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr; + + /* We do not need to announce the initial subflow's address again */ + if (family == AF_INET6 && + (!if_idx || mptcp_local->locaddr6[i].if_idx == if_idx) && + ipv6_addr_equal(&saddr.in6, ifa6)) + continue; + + fmp->add_addr++; + mpcb->addr_signal = 1; + } + +skip_ipv6: +#endif + + rcu_read_unlock_bh(); + + if (family == AF_INET) + fmp->announced_addrs_v4 |= (1 << index); + else + fmp->announced_addrs_v6 |= (1 << index); + + for (i = fmp->add_addr; i && fmp->add_addr; i--) + tcp_send_ack(mpcb->master_sk); + + if (master_tp->mptcp->send_mp_prio) + tcp_send_ack(mpcb->master_sk); + + return; + +fallback: + rcu_read_unlock_bh(); + mptcp_fallback_default(mpcb); + return; +} + +static void full_mesh_create_subflows(struct sock *meta_sk) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + + if (mptcp_in_infinite_mapping_weak(mpcb) || + mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) + return; + + if (mpcb->master_sk && + !tcp_sk(mpcb->master_sk)->mptcp->fully_established) + return; + + if (!work_pending(&fmp->subflow_work)) { + sock_hold(meta_sk); + refcount_inc(&mpcb->mpcb_refcnt); + queue_work(mptcp_wq, &fmp->subflow_work); + } +} + +/* Called upon release_sock, if the socket was owned by the user during + * a path-management event. 
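+ * Re-sync the subflow set with the current local address list:
+ * announce added or re-prioritized addresses and close subflows
+ * whose local address has gone away.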
+ */ +static void full_mesh_release_sock(struct sock *meta_sk) +{ + struct mptcp_loc_addr *mptcp_local; + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); + bool meta_v4 = meta_sk->sk_family == AF_INET; + struct mptcp_tcp_sock *mptcp; + struct hlist_node *tmp; + int i; + + rcu_read_lock_bh(); + mptcp_local = rcu_dereference(fm_ns->local); + + if (!meta_v4 && meta_sk->sk_ipv6only) + goto skip_ipv4; + + /* First, detect modifications or additions */ + mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { + struct in_addr ifa = mptcp_local->locaddr4[i].addr; + bool found = false; + + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + struct tcp_sock *tp = tcp_sk(sk); + + if (sk->sk_family == AF_INET6 && + !mptcp_v6_is_v4_mapped(sk)) + continue; + + if (inet_sk(sk)->inet_saddr != ifa.s_addr) + continue; + + found = true; + + if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) { + tp->mptcp->send_mp_prio = 1; + tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio; + + tcp_send_ack(sk); + } + } + + if (!found) { + struct sock *sk; + + fmp->add_addr++; + mpcb->addr_signal = 1; + + sk = mptcp_select_ack_sock(meta_sk); + if (sk) + tcp_send_ack(sk); + full_mesh_create_subflows(meta_sk); + } + } + +skip_ipv4: +#if IS_ENABLED(CONFIG_IPV6) + /* skip IPv6 addresses if meta-socket is IPv4 */ + if (meta_v4) + goto removal; + + mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { + struct in6_addr ifa = mptcp_local->locaddr6[i].addr; + bool found = false; + + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + struct tcp_sock *tp = tcp_sk(sk); + + if (sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(sk)) + continue; + + if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa)) + continue; + + found = true; + + if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) { + tp->mptcp->send_mp_prio = 1; + tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio; + + tcp_send_ack(sk); + } + } + + if (!found) { + struct sock *sk; + + fmp->add_addr++; + mpcb->addr_signal = 1; + + sk = mptcp_select_ack_sock(meta_sk); + if (sk) + tcp_send_ack(sk); + full_mesh_create_subflows(meta_sk); + } + } + +removal: +#endif + + /* Now, detect address-removals */ + mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { + struct sock *sk = mptcp_to_sock(mptcp); + bool shall_remove = true; + + if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) { + mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { + if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) { + shall_remove = false; + break; + } + } + } else { + mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { + if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) { + shall_remove = false; + break; + } + } + } + + if (shall_remove) { + /* Reinject, so that pf = 1 and so we + * won't select this one as the + * ack-sock. + */ + mptcp_reinject_data(sk, 0); + + announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, + meta_sk); + + mptcp_sub_force_close(sk); + } + } + + /* Just call it optimistically. 
It actually cannot do any harm */ + update_addr_bitfields(meta_sk, mptcp_local); + + rcu_read_unlock_bh(); +} + +static int full_mesh_get_local_id(const struct sock *meta_sk, + sa_family_t family, union inet_addr *addr, + bool *low_prio) +{ + struct mptcp_loc_addr *mptcp_local; + const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk)); + int index, id = -1; + + /* Handle the backup-flows */ + rcu_read_lock_bh(); + mptcp_local = rcu_dereference(fm_ns->local); + + index = mptcp_find_address(mptcp_local, family, addr, 0); + + if (index != -1) { + if (family == AF_INET) { + id = mptcp_local->locaddr4[index].loc4_id; + *low_prio = mptcp_local->locaddr4[index].low_prio; + } else { + id = mptcp_local->locaddr6[index].loc6_id; + *low_prio = mptcp_local->locaddr6[index].low_prio; + } + } + + + rcu_read_unlock_bh(); + + return id; +} + +static void full_mesh_addr_signal(struct sock *sk, unsigned *size, + struct tcp_out_options *opts, + struct sk_buff *skb) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_cb *mpcb = tp->mpcb; + struct sock *meta_sk = mpcb->meta_sk; + struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb); + struct mptcp_loc_addr *mptcp_local; + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk)); + int remove_addr_len; + u8 unannouncedv4 = 0, unannouncedv6 = 0; + bool meta_v4 = meta_sk->sk_family == AF_INET; + + mpcb->addr_signal = 0; + + if (likely(!fmp->add_addr)) + goto remove_addr; + + rcu_read_lock_bh(); + mptcp_local = rcu_dereference(fm_ns->local); + + if (!meta_v4 && meta_sk->sk_ipv6only) + goto skip_ipv4; + + /* IPv4 */ + unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits; + if (unannouncedv4 && + ((mpcb->mptcp_ver == MPTCP_VERSION_0 && + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) || + (mpcb->mptcp_ver >= MPTCP_VERSION_1 && + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1))) { + int ind = mptcp_find_free_index(~unannouncedv4); + + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_ADD_ADDR; + opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id; + opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr; + opts->add_addr_v4 = 1; + if (mpcb->mptcp_ver >= MPTCP_VERSION_1) { + u8 mptcp_hash_mac[20]; + u8 no_key[8]; + + *(u64 *)no_key = 0; + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key, + (u8 *)no_key, + (u32 *)mptcp_hash_mac, 2, + 1, (u8 *)&mptcp_local->locaddr4[ind].loc4_id, + 4, (u8 *)&opts->add_addr4.addr.s_addr); + opts->add_addr4.trunc_mac = *(u64 *)mptcp_hash_mac; + } + + if (skb) { + fmp->announced_addrs_v4 |= (1 << ind); + fmp->add_addr--; + } + + if (mpcb->mptcp_ver < MPTCP_VERSION_1) + *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN; + if (mpcb->mptcp_ver >= MPTCP_VERSION_1) + *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1; + + goto skip_ipv6; + } + + if (meta_v4) + goto skip_ipv6; +skip_ipv4: + /* IPv6 */ + unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits; + if (unannouncedv6 && + ((mpcb->mptcp_ver == MPTCP_VERSION_0 && + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) || + (mpcb->mptcp_ver >= MPTCP_VERSION_1 && + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1))) { + int ind = mptcp_find_free_index(~unannouncedv6); + + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_ADD_ADDR; + opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id; + opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr; + opts->add_addr_v6 = 1; + if (mpcb->mptcp_ver >= MPTCP_VERSION_1) { + u8 mptcp_hash_mac[20]; + u8 no_key[8]; + + *(u64 *)no_key = 
0; + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key, + (u8 *)no_key, + (u32 *)mptcp_hash_mac, 2, + 1, (u8 *)&mptcp_local->locaddr6[ind].loc6_id, + 16, (u8 *)&opts->add_addr6.addr.s6_addr); + opts->add_addr6.trunc_mac = *(u64 *)mptcp_hash_mac; + } + + if (skb) { + fmp->announced_addrs_v6 |= (1 << ind); + fmp->add_addr--; + } + if (mpcb->mptcp_ver < MPTCP_VERSION_1) + *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN; + if (mpcb->mptcp_ver >= MPTCP_VERSION_1) + *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1; + } + +skip_ipv6: + rcu_read_unlock_bh(); + + if (!unannouncedv4 && !unannouncedv6 && skb) + fmp->add_addr--; + +remove_addr: + if (likely(!fmp->remove_addrs)) + goto exit; + + remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs); + if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len) + goto exit; + + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_REMOVE_ADDR; + opts->remove_addrs = fmp->remove_addrs; + *size += remove_addr_len; + if (skb) + fmp->remove_addrs = 0; + +exit: + mpcb->addr_signal = !!(fmp->add_addr || fmp->remove_addrs); +} + +static void full_mesh_rem_raddr(struct mptcp_cb *mpcb, u8 rem_id) +{ + mptcp_v4_rem_raddress(mpcb, rem_id); + mptcp_v6_rem_raddress(mpcb, rem_id); +} + +static void full_mesh_delete_subflow(struct sock *sk) +{ + struct fullmesh_priv *fmp = fullmesh_get_priv(tcp_sk(sk)->mpcb); + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk)); + struct sock *meta_sk = mptcp_meta_sk(sk); + struct mptcp_loc_addr *mptcp_local; + int index, i; + + if (!create_on_err) + return; + + if (!mptcp_can_new_subflow(meta_sk)) + return; + + rcu_read_lock_bh(); + mptcp_local = rcu_dereference_bh(fm_ns->local); + + if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) { + union inet_addr saddr; + + saddr.ip = inet_sk(sk)->inet_saddr; + index = mptcp_find_address(mptcp_local, AF_INET, &saddr, + sk->sk_bound_dev_if); + if (index < 0) + goto out; + + mptcp_for_each_bit_set(fmp->rem4_bits, i) { + struct fullmesh_rem4 *rem4 = &fmp->remaddr4[i]; + + if (rem4->addr.s_addr != sk->sk_daddr) + continue; + + if (rem4->port && rem4->port != inet_sk(sk)->inet_dport) + continue; + + rem4->bitfield &= ~(1 << index); + } +#if IS_ENABLED(CONFIG_IPV6) + } else { + union inet_addr saddr; + + saddr.in6 = inet6_sk(sk)->saddr; + index = mptcp_find_address(mptcp_local, AF_INET6, &saddr, + sk->sk_bound_dev_if); + if (index < 0) + goto out; + + mptcp_for_each_bit_set(fmp->rem6_bits, i) { + struct fullmesh_rem6 *rem6 = &fmp->remaddr6[i]; + + if (!ipv6_addr_equal(&rem6->addr, &sk->sk_v6_daddr)) + continue; + + if (rem6->port && rem6->port != inet_sk(sk)->inet_dport) + continue; + + rem6->bitfield &= ~(1 << index); + } +#endif + } + +out: + rcu_read_unlock_bh(); + + /* re-schedule the creation of failed subflows */ + if (tcp_sk(sk)->mptcp->sk_err == ETIMEDOUT || sk->sk_err == ETIMEDOUT) + full_mesh_create_subflows(meta_sk); +} + +/* Output /proc/net/mptcp_fullmesh */ +static int mptcp_fm_seq_show(struct seq_file *seq, void *v) +{ + const struct net *net = seq->private; + struct mptcp_loc_addr *mptcp_local; + const struct mptcp_fm_ns *fm_ns = fm_get_ns(net); + int i; + + seq_printf(seq, "Index, Address-ID, Backup, IP-address, if-idx\n"); + + rcu_read_lock_bh(); + mptcp_local = rcu_dereference(fm_ns->local); + + seq_printf(seq, "IPv4, next v4-index: %u\n", mptcp_local->next_v4_index); + + mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) { + struct mptcp_loc4 *loc4 = &mptcp_local->locaddr4[i]; + + seq_printf(seq, "%u, %u, %u, %pI4, %u\n", i, loc4->loc4_id, + loc4->low_prio, &loc4->addr, 
loc4->if_idx); + } + + seq_printf(seq, "IPv6, next v6-index: %u\n", mptcp_local->next_v6_index); + + mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) { + struct mptcp_loc6 *loc6 = &mptcp_local->locaddr6[i]; + + seq_printf(seq, "%u, %u, %u, %pI6, %u\n", i, loc6->loc6_id, + loc6->low_prio, &loc6->addr, loc6->if_idx); + } + rcu_read_unlock_bh(); + + return 0; +} + +static int mptcp_fm_init_net(struct net *net) +{ + struct mptcp_loc_addr *mptcp_local; + struct mptcp_fm_ns *fm_ns; + int err = 0; + + fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL); + if (!fm_ns) + return -ENOBUFS; + + mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL); + if (!mptcp_local) { + err = -ENOBUFS; + goto err_mptcp_local; + } + + if (!proc_create_net_single("mptcp_fullmesh", S_IRUGO, net->proc_net, + mptcp_fm_seq_show, NULL)) { + err = -ENOMEM; + goto err_seq_fops; + } + + mptcp_local->next_v4_index = 1; + + rcu_assign_pointer(fm_ns->local, mptcp_local); + INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker); + INIT_LIST_HEAD(&fm_ns->events); + spin_lock_init(&fm_ns->local_lock); + fm_ns->net = net; + net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns; + + return 0; +err_seq_fops: + kfree(mptcp_local); +err_mptcp_local: + kfree(fm_ns); + return err; +} + +static void mptcp_fm_exit_net(struct net *net) +{ + struct mptcp_addr_event *eventq, *tmp; + struct mptcp_fm_ns *fm_ns; + struct mptcp_loc_addr *mptcp_local; + + fm_ns = fm_get_ns(net); + cancel_delayed_work_sync(&fm_ns->address_worker); + + rcu_read_lock_bh(); + + mptcp_local = rcu_dereference_bh(fm_ns->local); + kfree_rcu(mptcp_local, rcu); + + spin_lock(&fm_ns->local_lock); + list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) { + list_del(&eventq->list); + kfree(eventq); + } + spin_unlock(&fm_ns->local_lock); + + rcu_read_unlock_bh(); + + remove_proc_entry("mptcp_fullmesh", net->proc_net); + + kfree(fm_ns); +} + +static struct pernet_operations full_mesh_net_ops = { + .init = mptcp_fm_init_net, + .exit = mptcp_fm_exit_net, +}; + +static struct mptcp_pm_ops full_mesh __read_mostly = { + .new_session = full_mesh_new_session, + .release_sock = full_mesh_release_sock, + .fully_established = full_mesh_create_subflows, + .new_remote_address = full_mesh_create_subflows, + .get_local_id = full_mesh_get_local_id, + .addr_signal = full_mesh_addr_signal, + .add_raddr = full_mesh_add_raddr, + .rem_raddr = full_mesh_rem_raddr, + .delete_subflow = full_mesh_delete_subflow, + .name = "fullmesh", + .owner = THIS_MODULE, +}; + +/* General initialization of MPTCP_PM */ +static int __init full_mesh_register(void) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE); + + ret = register_pernet_subsys(&full_mesh_net_ops); + if (ret) + goto out; + + ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); + if (ret) + goto err_reg_inetaddr; + ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier); + if (ret) + goto err_reg_netdev; + +#if IS_ENABLED(CONFIG_IPV6) + ret = register_inet6addr_notifier(&inet6_addr_notifier); + if (ret) + goto err_reg_inet6addr; +#endif + + ret = mptcp_register_path_manager(&full_mesh); + if (ret) + goto err_reg_pm; + +out: + return ret; + + +err_reg_pm: +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&inet6_addr_notifier); +err_reg_inet6addr: +#endif + unregister_netdevice_notifier(&mptcp_pm_netdev_notifier); +err_reg_netdev: + unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); +err_reg_inetaddr: + unregister_pernet_subsys(&full_mesh_net_ops); + goto out; +} + +static void 
full_mesh_unregister(void) +{ +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&inet6_addr_notifier); +#endif + unregister_netdevice_notifier(&mptcp_pm_netdev_notifier); + unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier); + unregister_pernet_subsys(&full_mesh_net_ops); + mptcp_unregister_path_manager(&full_mesh); +} + +module_init(full_mesh_register); +module_exit(full_mesh_unregister); + +MODULE_AUTHOR("Christoph Paasch"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Full-Mesh MPTCP"); +MODULE_VERSION("0.88"); diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c new file mode 100644 index 000000000000..d9a5e1a2db68 --- /dev/null +++ b/net/mptcp/mptcp_input.c @@ -0,0 +1,2540 @@ +/* + * MPTCP implementation - Sending side + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +#include +#include +#include + +#include + +/* is seq1 < seq2 ? */ +static inline bool before64(const u64 seq1, const u64 seq2) +{ + return (s64)(seq1 - seq2) < 0; +} + +/* is seq1 > seq2 ? */ +#define after64(seq1, seq2) before64(seq2, seq1) + +static inline void mptcp_become_fully_estab(struct sock *sk) +{ + tcp_sk(sk)->mptcp->fully_established = 1; + + if (is_master_tp(tcp_sk(sk)) && + tcp_sk(sk)->mpcb->pm_ops->fully_established) + tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk)); +} + +/* Similar to tcp_tso_acked without any memory accounting */ +static inline int mptcp_tso_acked_reinject(const struct sock *meta_sk, + struct sk_buff *skb) +{ + const struct tcp_sock *meta_tp = tcp_sk(meta_sk); + u32 packets_acked, len, delta_truesize; + + BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)); + + packets_acked = tcp_skb_pcount(skb); + + if (skb_unclone(skb, GFP_ATOMIC)) + return 0; + + len = meta_tp->snd_una - TCP_SKB_CB(skb)->seq; + delta_truesize = __pskb_trim_head(skb, len); + + TCP_SKB_CB(skb)->seq += len; + skb->ip_summed = CHECKSUM_PARTIAL; + + if (delta_truesize) + skb->truesize -= delta_truesize; + + /* Any change of skb->len requires recalculation of tso factor. */ + if (tcp_skb_pcount(skb) > 1) + tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); + packets_acked -= tcp_skb_pcount(skb); + + if (packets_acked) { + BUG_ON(tcp_skb_pcount(skb) == 0); + BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); + } + + return packets_acked; +} + +/* Cleans the meta-socket retransmission queue and the reinject-queue. 
*/ +static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una) +{ + struct sk_buff *skb, *tmp, *next; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + bool acked = false; + u32 acked_pcount; + + for (skb = skb_rb_first(&meta_sk->tcp_rtx_queue); skb; skb = next) { + bool fully_acked = true; + + if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { + if (tcp_skb_pcount(skb) == 1 || + !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) + break; + + acked_pcount = tcp_tso_acked(meta_sk, skb); + if (!acked_pcount) + break; + + fully_acked = false; + } else { + acked_pcount = tcp_skb_pcount(skb); + } + + acked = true; + meta_tp->packets_out -= acked_pcount; + meta_tp->retrans_stamp = 0; + + if (!fully_acked) + break; + + next = skb_rb_next(skb); + + if (mptcp_is_data_fin(skb)) { + struct mptcp_tcp_sock *mptcp; + struct hlist_node *tmp; + + /* DATA_FIN has been acknowledged - now we can close + * the subflows + */ + mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + unsigned long delay = 0; + + /* If we are the passive closer, don't trigger + * subflow-fin until the subflow has been finned + * by the peer - thus we add a delay. + */ + if (mpcb->passive_close && + sk_it->sk_state == TCP_ESTABLISHED) + delay = inet_csk(sk_it)->icsk_rto << 3; + + mptcp_sub_close(sk_it, delay); + } + } + tcp_rtx_queue_unlink_and_free(skb, meta_sk); + } + /* Remove acknowledged data from the reinject queue */ + skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) { + if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { + if (tcp_skb_pcount(skb) == 1 || + !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) + break; + + mptcp_tso_acked_reinject(meta_sk, skb); + break; + } + + __skb_unlink(skb, &mpcb->reinject_queue); + __kfree_skb(skb); + } + + if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una))) + meta_tp->snd_up = meta_tp->snd_una; + + if (acked) { + tcp_rearm_rto(meta_sk); + /* Normally this is done in tcp_try_undo_loss - but MPTCP + * does not call this function. + */ + inet_csk(meta_sk)->icsk_retransmits = 0; + } +} + +/* Inspired by tcp_rcv_state_process */ +/* Returns 0 if processing the packet can continue + * -1 if connection was closed with an active reset + * 1 if connection was closed and processing should stop. + */ +static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk, + const struct sk_buff *skb, u32 data_seq, + u16 data_len) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); + const struct tcphdr *th = tcp_hdr(skb); + + /* State-machine handling if FIN has been enqueued and he has + * been acked (snd_una == write_seq) - it's important that this + * here is after sk_wmem_free_skb because otherwise + * sk_forward_alloc is wrong upon inet_csk_destroy_sock() + */ + switch (meta_sk->sk_state) { + case TCP_FIN_WAIT1: { + struct dst_entry *dst; + int tmo; + + if (meta_tp->snd_una != meta_tp->write_seq) + break; + + tcp_set_state(meta_sk, TCP_FIN_WAIT2); + meta_sk->sk_shutdown |= SEND_SHUTDOWN; + + dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); + + if (!sock_flag(meta_sk, SOCK_DEAD)) { + /* Wake up lingering close() */ + meta_sk->sk_state_change(meta_sk); + break; + } + + if (meta_tp->linger2 < 0 || + (data_len && + after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 
1 : 0), + meta_tp->rcv_nxt))) { + mptcp_send_active_reset(meta_sk, GFP_ATOMIC); + tcp_done(meta_sk); + NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); + return -1; + } + + tmo = tcp_fin_time(meta_sk); + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN); + } else if (mptcp_is_data_fin2(skb, tp) || sock_owned_by_user(meta_sk)) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + inet_csk_reset_keepalive_timer(meta_sk, tmo); + } else { + meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, tmo); + } + break; + } + case TCP_CLOSING: + case TCP_LAST_ACK: + if (meta_tp->snd_una == meta_tp->write_seq) { + tcp_done(meta_sk); + return 1; + } + break; + } + + /* step 7: process the segment text */ + switch (meta_sk->sk_state) { + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + /* RFC 793 says to queue data in these states, + * RFC 1122 says we MUST send a reset. + * BSD 4.4 also does reset. + */ + if (meta_sk->sk_shutdown & RCV_SHUTDOWN) { + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) && + !mptcp_is_data_fin2(skb, tp)) { + NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); + mptcp_send_active_reset(meta_sk, GFP_ATOMIC); + tcp_reset(meta_sk); + return -1; + } + } + break; + } + + return 0; +} + +/** + * @return: + * i) 1: Everything's fine. + * ii) -1: A reset has been sent on the subflow - csum-failure + * iii) 0: csum-failure but no reset sent, because it's the last subflow. + * Last packet should not be destroyed by the caller because it has + * been done here. + */ +static int mptcp_verif_dss_csum(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *tmp, *tmp1, *last = NULL; + __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */ + int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0; + int iter = 0; + u32 next_seq, offset_seq; + + skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) { + unsigned int csum_len; + + /* init next seq in first round */ + if (!iter) + next_seq = TCP_SKB_CB(tmp)->seq; + offset_seq = next_seq - TCP_SKB_CB(tmp)->seq; + + if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq)) + /* Mapping ends in the middle of the packet - + * csum only these bytes + */ + csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq; + else + csum_len = tmp->len; + + csum_len -= offset_seq; + offset = 0; + if (overflowed) { + char first_word[4]; + first_word[0] = 0; + first_word[1] = 0; + first_word[2] = 0; + first_word[3] = *(tmp->data + offset_seq); + csum_tcp = csum_partial(first_word, 4, csum_tcp); + offset = 1; + csum_len--; + overflowed = 0; + } + + csum_tcp = skb_checksum(tmp, offset + offset_seq, csum_len, + csum_tcp); + + /* Was it on an odd-length? Then we have to merge the next byte + * correctly (see above) + */ + if (csum_len != (csum_len & (~1))) + overflowed = 1; + + if (mptcp_is_data_seq(tmp) && !dss_csum_added) { + __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32)); + + /* If a 64-bit dss is present, we increase the offset + * by 4 bytes, as the high-order 64-bits will be added + * in the final csum_partial-call. 
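The checksum walked above is the standard Internet one's-complement sum: it runs over the mapping's payload bytes plus the DSS pseudo-header (data sequence number, subflow sequence number and data-level length), and folding the accumulated sum on the receiver must give zero. The following standalone sketch (plain user-space C, not the kernel's csum_partial()/csum_fold() helpers) shows the accumulate-and-fold idea, including the odd-byte padding that the overflowed flag takes care of in mptcp_verif_dss_csum():

#include <stdint.h>
#include <stddef.h>

/* Accumulate a buffer into a 32-bit one's-complement sum; an odd
 * trailing byte is padded with zero, like the 'overflowed' handling
 * in mptcp_verif_dss_csum().
 */
static uint32_t csum_accumulate(uint32_t sum, const uint8_t *buf, size_t len)
{
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)buf[i] << 8) | buf[i + 1];
	if (len & 1)
		sum += (uint32_t)buf[len - 1] << 8;
	return sum;
}

/* Fold to 16 bits and complement; a valid DSS checksum folds to 0. */
static uint16_t csum_fold16(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}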
+ */ + u32 offset = skb_transport_offset(tmp) + + TCP_SKB_CB(tmp)->dss_off; + if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET) + offset += 4; + + csum_tcp = skb_checksum(tmp, offset, + MPTCP_SUB_LEN_SEQ_CSUM, + csum_tcp); + + csum_tcp = csum_partial(&data_seq, + sizeof(data_seq), csum_tcp); + + dss_csum_added = 1; /* Just do it once */ + } + last = tmp; + iter++; + + if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) && + !before(TCP_SKB_CB(tmp1)->seq, + tp->mptcp->map_subseq + tp->mptcp->map_data_len)) + break; + next_seq = TCP_SKB_CB(tmp)->end_seq; + } + + /* Now, checksum must be 0 */ + if (unlikely(csum_fold(csum_tcp))) { + struct mptcp_tcp_sock *mptcp; + struct sock *sk_it = NULL; + + pr_debug("%s csum is wrong: %#x tcp-seq %u dss_csum_added %d overflowed %d iterations %d\n", + __func__, csum_fold(csum_tcp), TCP_SKB_CB(last)->seq, + dss_csum_added, overflowed, iter); + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CSUMFAIL); + tp->mptcp->send_mp_fail = 1; + + /* map_data_seq is the data-seq number of the + * mapping we are currently checking + */ + tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq; + + /* Search for another subflow that is fully established */ + mptcp_for_each_sub(tp->mpcb, mptcp) { + sk_it = mptcp_to_sock(mptcp); + + if (sk_it != sk && + tcp_sk(sk_it)->mptcp->fully_established) + break; + + sk_it = NULL; + } + + if (sk_it) { + mptcp_send_reset(sk); + ans = -1; + } else { + tp->mpcb->send_infinite_mapping = 1; + + /* Need to purge the rcv-queue as it's no more valid */ + while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + tp->copied_seq = TCP_SKB_CB(tmp)->end_seq; + kfree_skb(tmp); + } + + if (mptcp_fallback_close(tp->mpcb, sk)) + ans = -1; + else + ans = 0; + } + } + + return ans; +} + +static inline void mptcp_prepare_skb(struct sk_buff *skb, + const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + u32 inc = 0, end_seq = tcb->end_seq; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + end_seq--; + /* If skb is the end of this mapping (end is always at mapping-boundary + * thanks to the splitting/trimming), then we need to increase + * data-end-seq by 1 if this here is a data-fin. + * + * We need to do -1 because end_seq includes the subflow-FIN. + */ + if (tp->mptcp->map_data_fin && + end_seq == tp->mptcp->map_subseq + tp->mptcp->map_data_len) { + inc = 1; + + /* We manually set the fin-flag if it is a data-fin. For easy + * processing in tcp_recvmsg. + */ + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; + } else { + /* We may have a subflow-fin with data but without data-fin */ + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_FIN; + } + + /* Adapt data-seq's to the packet itself. We kinda transform the + * dss-mapping to a per-packet granularity. This is necessary to + * correctly handle overlapping mappings coming from different + * subflows. Otherwise it would be a complete mess. + */ + tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq; + tcb->end_seq = tcb->seq + skb->len + inc; +} + +static inline void mptcp_reset_mapping(struct tcp_sock *tp, u32 old_copied_seq) +{ + tp->mptcp->map_data_len = 0; + tp->mptcp->map_data_seq = 0; + tp->mptcp->map_subseq = 0; + tp->mptcp->map_data_fin = 0; + tp->mptcp->mapping_present = 0; + + /* In infinite mapping receiver mode, we have to advance the implied + * data-sequence number when we progress the subflow's data. 
+ */ + if (tp->mpcb->infinite_mapping_rcv) + tp->mpcb->infinite_rcv_seq += (tp->copied_seq - old_copied_seq); +} + +/* The DSS-mapping received on the sk only covers the second half of the skb + * (cut at seq). We trim the head from the skb. + * Data will be freed upon kfree(). + * + * Inspired by tcp_trim_head(). + */ +static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq) +{ + int len = seq - TCP_SKB_CB(skb)->seq; + u32 new_seq = TCP_SKB_CB(skb)->seq + len; + u32 delta_truesize; + + delta_truesize = __pskb_trim_head(skb, len); + + TCP_SKB_CB(skb)->seq = new_seq; + + if (delta_truesize) { + skb->truesize -= delta_truesize; + atomic_sub(delta_truesize, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, delta_truesize); + } +} + +/* The DSS-mapping received on the sk only covers the first half of the skb + * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue + * as further packets may resolve the mapping of the second half of data. + * + * Inspired by tcp_fragment(). + */ +static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq) +{ + struct sk_buff *buff; + int nsize; + int nlen, len; + u8 flags; + + len = seq - TCP_SKB_CB(skb)->seq; + nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len; + if (nsize < 0) + nsize = 0; + + /* Get a new skb... force flag on. */ + buff = alloc_skb(nsize, GFP_ATOMIC); + if (buff == NULL) + return -ENOMEM; + + skb_reserve(buff, tcp_sk(sk)->tcp_header_len); + skb_reset_transport_header(buff); + + flags = TCP_SKB_CB(skb)->tcp_flags; + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN); + TCP_SKB_CB(buff)->tcp_flags = flags; + + /* We absolutly need to call skb_set_owner_r before refreshing the + * truesize of buff, otherwise the moved data will account twice. + */ + skb_set_owner_r(buff, sk); + nlen = skb->len - len - nsize; + buff->truesize += nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + skb_split(skb, buff, len); + + __skb_queue_after(&sk->sk_receive_queue, skb, buff); + + return 0; +} + +/* @return: 0 everything is fine. Just continue processing + * 1 subflow is broken stop everything + * -1 this packet was broken - continue with the next one. + */ +static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_cb *mpcb = tp->mpcb; + + /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */ + if (!skb->len && (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && + !mptcp_is_data_fin(skb) && !mpcb->infinite_mapping_rcv) { + /* Remove a pure subflow-fin from the queue and increase + * copied_seq. + */ + tp->copied_seq = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_receive_queue); + __kfree_skb(skb); + return -1; + } + + /* If we are not yet fully established and do not know the mapping for + * this segment, this path has to fallback to infinite or be torn down. 
+ */ + if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) && + !tp->mptcp->mapping_present && !mpcb->infinite_mapping_rcv) { + pr_debug("%s %#x will fallback - pi %d from %pS, seq %u\n", + __func__, mpcb->mptcp_loc_token, + tp->mptcp->path_index, __builtin_return_address(0), + TCP_SKB_CB(skb)->seq); + + if (!is_master_tp(tp)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBDATASUB); + mptcp_send_reset(sk); + return 1; + } + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBDATAINIT); + + mpcb->infinite_mapping_snd = 1; + mpcb->infinite_mapping_rcv = 1; + mpcb->infinite_rcv_seq = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp)); + + if (mptcp_fallback_close(mpcb, sk)) + return 1; + + /* We do a seamless fallback and should not send a inf.mapping. */ + mpcb->send_infinite_mapping = 0; + tp->mptcp->fully_established = 1; + } + + /* Receiver-side becomes fully established when a whole rcv-window has + * been received without the need to fallback due to the previous + * condition. + */ + if (!tp->mptcp->fully_established) { + tp->mptcp->init_rcv_wnd -= skb->len; + if (tp->mptcp->init_rcv_wnd < 0) + mptcp_become_fully_estab(sk); + } + + return 0; +} + +static void mptcp_restart_sending(struct sock *meta_sk, uint32_t in_flight_seq) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct sk_buff *wq_head, *skb, *tmp; + + skb = tcp_rtx_queue_head(meta_sk); + + /* We resend everything that has not been acknowledged and is not in-flight, + * thus we need to move it from the rtx-tree to the write-queue. + */ + wq_head = tcp_write_queue_head(meta_sk); + + /* We artificially restart parts of the send-queue. Thus, + * it is as if no packets are in flight, minus the one that are. + */ + meta_tp->packets_out = 0; + + skb_rbtree_walk_from_safe(skb, tmp) { + if (!after(TCP_SKB_CB(skb)->end_seq, in_flight_seq)) { + meta_tp->packets_out += tcp_skb_pcount(skb); + continue; + } + + list_del(&skb->tcp_tsorted_anchor); + tcp_rtx_queue_unlink(skb, meta_sk); + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); + + if (wq_head) + __skb_queue_before(&meta_sk->sk_write_queue, wq_head, skb); + else + tcp_add_write_queue_tail(meta_sk, skb); + } + + /* If the snd_nxt already wrapped around, we have to + * undo the wrapping, as we are restarting from in_flight_seq + * on. + */ + if (meta_tp->snd_nxt < in_flight_seq) { + mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2; + mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1; + } + meta_tp->snd_nxt = in_flight_seq; + + /* Trigger a sending on the meta. */ + mptcp_push_pending_frames(meta_sk); +} + +/* @return: 0 everything is fine. Just continue processing + * 1 subflow is broken stop everything + * -1 this packet was broken - continue with the next one. + */ +static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); + struct mptcp_cb *mpcb = tp->mpcb; + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + u32 *ptr; + u32 data_seq, sub_seq, data_len, tcp_end_seq; + bool set_infinite_rcv = false; + + /* If we are in infinite-mapping-mode, the subflow is guaranteed to be + * in-order at the data-level. Thus data-seq-numbers can be inferred + * from what is expected at the data-level. + */ + if (mpcb->infinite_mapping_rcv) { + /* copied_seq may be bigger than tcb->seq (e.g., when the peer + * retransmits data that actually has already been acknowledged with + * newer data, if he did not receive our acks). Thus, we need + * to account for this overlap as well. 
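In other words, under an infinite mapping the data-level sequence of a segment is derived from the receiver's own progress, and any part of the segment that was already consumed must be subtracted so the data level does not advance twice. A tiny worked example of that arithmetic, in user-space C with made-up numbers only:

#include <stdint.h>
#include <assert.h>

int main(void)
{
	/* Hypothetical state: the peer retransmits a segment starting at
	 * subflow seq 1000, we already consumed up to 1300 (copied_seq),
	 * and the implied data level stands at 50000 (infinite_rcv_seq).
	 */
	uint32_t seq = 1000, copied_seq = 1300;
	uint64_t infinite_rcv_seq = 50000;

	/* 300 bytes of this segment were delivered before, so the implied
	 * data-level sequence of its first byte is 300 lower as well.
	 */
	uint64_t map_data_seq = infinite_rcv_seq - (copied_seq - seq);

	assert(map_data_seq == 49700);
	return 0;
}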
+ */ + tp->mptcp->map_data_seq = mpcb->infinite_rcv_seq - (tp->copied_seq - tcb->seq); + tp->mptcp->map_subseq = tcb->seq; + tp->mptcp->map_data_len = skb->len; + tp->mptcp->map_data_fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); + tp->mptcp->mapping_present = 1; + return 0; + } + + /* No mapping here? Exit - it is either already set or still on its way */ + if (!mptcp_is_data_seq(skb)) { + /* Too many packets without a mapping - this subflow is broken */ + if (!tp->mptcp->mapping_present && + tp->rcv_nxt - tp->copied_seq > 65536) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); + mptcp_send_reset(sk); + return 1; + } + + return 0; + } + + ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb); + ptr++; + sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn; + ptr++; + data_len = get_unaligned_be16(ptr); + + /* If it's an empty skb with DATA_FIN, sub_seq must get fixed. + * The draft sets it to 0, but we really would like to have the + * real value, to have an easy handling afterwards here in this + * function. + */ + if (mptcp_is_data_fin(skb) && skb->len == 0) + sub_seq = TCP_SKB_CB(skb)->seq; + + /* If there is already a mapping - we check if it maps with the current + * one. If not - we reset. + */ + if (tp->mptcp->mapping_present && + (data_seq != (u32)tp->mptcp->map_data_seq || + sub_seq != tp->mptcp->map_subseq || + data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin || + mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) { + /* Mapping in packet is different from what we want */ + pr_debug("%s Mappings do not match!\n", __func__); + pr_debug("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n", + __func__, data_seq, (u32)tp->mptcp->map_data_seq, + sub_seq, tp->mptcp->map_subseq, data_len, + tp->mptcp->map_data_len, mptcp_is_data_fin(skb), + tp->mptcp->map_data_fin); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSNOMATCH); + mptcp_send_reset(sk); + return 1; + } + + /* If the previous check was good, the current mapping is valid and we exit. */ + if (tp->mptcp->mapping_present) + return 0; + + /* Mapping not yet set on this subflow - we set it here! */ + + if (!data_len) { + mpcb->infinite_mapping_rcv = 1; + mpcb->send_infinite_mapping = 1; + tp->mptcp->fully_established = 1; + /* We need to repeat mp_fail's until the sender felt + * back to infinite-mapping - here we stop repeating it. + */ + tp->mptcp->send_mp_fail = 0; + + /* We have to fixup data_len - it must be the same as skb->len */ + data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0); + sub_seq = tcb->seq; + + if (mptcp_fallback_close(mpcb, sk)) + return 1; + + mptcp_restart_sending(tp->meta_sk, meta_tp->snd_una); + + /* data_seq and so on are set correctly */ + + /* At this point, the meta-ofo-queue has to be emptied, + * as the following data is guaranteed to be in-order at + * the data and subflow-level + */ + skb_rbtree_purge(&meta_tp->out_of_order_queue); + + set_infinite_rcv = true; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_INFINITEMAPRX); + } + + /* We are sending mp-fail's and thus are in fallback mode. + * Ignore packets which do not announce the fallback and still + * want to provide a mapping. + */ + if (tp->mptcp->send_mp_fail) { + tp->copied_seq = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_receive_queue); + __kfree_skb(skb); + return -1; + } + + /* FIN increased the mapping-length by 1 */ + if (mptcp_is_data_fin(skb)) + data_len--; + + /* Subflow-sequences of packet must be + * (at least partially) be part of the DSS-mapping's + * subflow-sequence-space. 
+	 *
+	 * Basically the mapping is not valid if any of the
+	 * following conditions is true:
+	 *
+	 * 1. It's not a data_fin and
+	 *    MPTCP-sub_seq >= TCP-end_seq
+	 *
+	 * 2. It's a data_fin and TCP-end_seq > TCP-seq and
+	 *    MPTCP-sub_seq >= TCP-end_seq
+	 *
+	 * The previous two can be merged into:
+	 *    TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq
+	 *    Because if it's not a data-fin, TCP-end_seq > TCP-seq
+	 *
+	 * 3. It's a data_fin and skb->len == 0 and
+	 *    MPTCP-sub_seq > TCP-end_seq
+	 *
+	 * 4. It's not a data_fin and TCP-end_seq > TCP-seq and
+	 *    MPTCP-sub_seq + MPTCP-data_len <= TCP-seq
+	 */
+
+	/* subflow-fin is not part of the mapping - ignore it here! */
+	tcp_end_seq = tcb->end_seq;
+	if (tcb->tcp_flags & TCPHDR_FIN)
+		tcp_end_seq--;
+	if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) ||
+	    (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) ||
+	    (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq))) {
+		/* The packet's subflow sequence numbers do not match what is
+		 * in the packet's dss-mapping. The peer is misbehaving - reset
+		 */
+		pr_debug("%s Packet's mapping does not map to the DSS sub_seq %u end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u copied_seq %u\n",
+			 __func__, sub_seq, tcb->end_seq, tcp_end_seq,
+			 tcb->seq, mptcp_is_data_fin(skb),
+			 skb->len, data_len, tp->copied_seq);
+		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSTCPMISMATCH);
+		mptcp_send_reset(sk);
+		return 1;
+	}
+
+	/* Did the DSS carry 64-bit sequence numbers? */
+	if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
+		/* Wrapped around? */
+		if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
+			tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);
+		} else {
+			/* Else, access the default high-order bits */
+			tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq);
+		}
+	} else {
+		tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq);
+
+		if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) {
+			/* We make sure that the data_seq is invalid.
+			 * It will be dropped later.
+			 */
+			tp->mptcp->map_data_seq += 0xFFFFFFFF;
+			tp->mptcp->map_data_seq += 0xFFFFFFFF;
+		}
+	}
+
+	if (set_infinite_rcv)
+		mpcb->infinite_rcv_seq = tp->mptcp->map_data_seq;
+
+	tp->mptcp->map_data_len = data_len;
+	tp->mptcp->map_subseq = sub_seq;
+	tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
+	tp->mptcp->mapping_present = 1;
+
+	return 0;
+}
+
+/* Similar to tcp_sequence(...) */
+static inline bool mptcp_sequence(const struct tcp_sock *meta_tp,
+				  u64 data_seq, u64 end_data_seq)
+{
+	const struct mptcp_cb *mpcb = meta_tp->mpcb;
+	u64 rcv_wup64;
+
+	/* Wrap-around? */
+	if (meta_tp->rcv_wup > meta_tp->rcv_nxt) {
+		rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) |
+				meta_tp->rcv_wup;
+	} else {
+		rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
+						  meta_tp->rcv_wup);
+	}
+
+	return !before64(end_data_seq, rcv_wup64) &&
+	       !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window_now(meta_tp));
+}
+
+/* @return: 0 everything is fine. Just continue processing
+ *	    -1 this packet was broken - continue with the next one.
+ */ +static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *tmp, *tmp1; + u32 tcp_end_seq; + + if (!tp->mptcp->mapping_present) + return 0; + + /* either, the new skb gave us the mapping and the first segment + * in the sub-rcv-queue has to be trimmed ... + */ + tmp = skb_peek(&sk->sk_receive_queue); + if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) && + after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSTRIMHEAD); + mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq); + } + + /* ... or the new skb (tail) has to be split at the end. */ + tcp_end_seq = TCP_SKB_CB(skb)->end_seq; + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + tcp_end_seq--; + if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) { + u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSSPLITTAIL); + if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */ + /* TODO : maybe handle this here better. + * We now just force meta-retransmission. + */ + tp->copied_seq = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_receive_queue); + __kfree_skb(skb); + return -1; + } + } + + /* Now, remove old sk_buff's from the receive-queue. + * This may happen if the mapping has been lost for these segments and + * the next mapping has already been received. + */ + if (before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) { + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { + if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq)) + break; + + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; + __skb_unlink(tmp1, &sk->sk_receive_queue); + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PURGEOLD); + /* Impossible that we could free skb here, because his + * mapping is known to be valid from previous checks + */ + __kfree_skb(tmp1); + } + } + + return 0; +} + +/* @return: 0 everything is fine. Just continue processing + * 1 subflow is broken stop everything + * -1 this mapping has been put in the meta-receive-queue + * -2 this mapping has been eaten by the application + */ +static int mptcp_queue_skb(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); + struct sock *meta_sk = mptcp_meta_sk(sk); + struct mptcp_cb *mpcb = tp->mpcb; + struct sk_buff *tmp, *tmp1; + u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp); + u32 old_copied_seq = tp->copied_seq; + bool data_queued = false; + + /* Have we not yet received the full mapping? */ + if (!tp->mptcp->mapping_present || + before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) + return 0; + + /* Is this an overlapping mapping? 
rcv_nxt >= end_data_seq + * OR + * This mapping is out of window + */ + if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) || + !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq, + tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) { + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { + __skb_unlink(tmp1, &sk->sk_receive_queue); + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; + __kfree_skb(tmp1); + + if (!skb_queue_empty(&sk->sk_receive_queue) && + !before(TCP_SKB_CB(tmp)->seq, + tp->mptcp->map_subseq + tp->mptcp->map_data_len)) + break; + } + + mptcp_reset_mapping(tp, old_copied_seq); + + return -1; + } + + /* Record it, because we want to send our data_fin on the same path */ + if (tp->mptcp->map_data_fin) { + mpcb->dfin_path_index = tp->mptcp->path_index; + mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN); + } + + /* Verify the checksum */ + if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) { + int ret = mptcp_verif_dss_csum(sk); + + if (ret <= 0) { + mptcp_reset_mapping(tp, old_copied_seq); + return 1; + } + } + + if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) { + /* Seg's have to go to the meta-ofo-queue */ + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; + mptcp_prepare_skb(tmp1, sk); + __skb_unlink(tmp1, &sk->sk_receive_queue); + /* MUST be done here, because fragstolen may be true later. + * Then, kfree_skb_partial will not account the memory. + */ + skb_orphan(tmp1); + + if (!mpcb->in_time_wait) /* In time-wait, do not receive data */ + tcp_data_queue_ofo(meta_sk, tmp1); + else + __kfree_skb(tmp1); + + if (!skb_queue_empty(&sk->sk_receive_queue) && + !before(TCP_SKB_CB(tmp)->seq, + tp->mptcp->map_subseq + tp->mptcp->map_data_len)) + break; + } + + /* Quick ACK if more 3/4 of the receive window is filled */ + if (after64(tp->mptcp->map_data_seq, + rcv_nxt64 + 3 * (tcp_receive_window_now(meta_tp) >> 2))) + tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); + + } else { + /* Ready for the meta-rcv-queue */ + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { + int eaten = 0; + bool fragstolen = false; + u32 old_rcv_nxt = meta_tp->rcv_nxt; + + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; + mptcp_prepare_skb(tmp1, sk); + __skb_unlink(tmp1, &sk->sk_receive_queue); + /* MUST be done here, because fragstolen may be true. + * Then, kfree_skb_partial will not account the memory. + */ + skb_orphan(tmp1); + + /* This segment has already been received */ + if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) { + __kfree_skb(tmp1); + goto next; + } + + if (mpcb->in_time_wait) /* In time-wait, do not receive data */ + eaten = 1; + + if (!eaten) + eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen); + + meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq; + + if (TCP_SKB_CB(tmp1)->tcp_flags & TCPHDR_FIN) + mptcp_fin(meta_sk); + + /* Check if this fills a gap in the ofo queue */ + if (!RB_EMPTY_ROOT(&meta_tp->out_of_order_queue)) + tcp_ofo_queue(meta_sk); + + mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt); + + if (eaten) + kfree_skb_partial(tmp1, fragstolen); + + data_queued = true; +next: + if (!skb_queue_empty(&sk->sk_receive_queue) && + !before(TCP_SKB_CB(tmp)->seq, + tp->mptcp->map_subseq + tp->mptcp->map_data_len)) + break; + } + } + + inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_jiffies32; + mptcp_reset_mapping(tp, old_copied_seq); + + return data_queued ? 
-1 : -2; +} + +void mptcp_data_ready(struct sock *sk) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + struct sk_buff *skb, *tmp; + int queued = 0; + + tcp_mstamp_refresh(tcp_sk(meta_sk)); + + /* restart before the check, because mptcp_fin might have changed the + * state. + */ +restart: + /* If the meta cannot receive data, there is no point in pushing data. + * If we are in time-wait, we may still be waiting for the final FIN. + * So, we should proceed with the processing. + */ + if (!mptcp_sk_can_recv(meta_sk) && !tcp_sk(sk)->mpcb->in_time_wait) { + skb_queue_purge(&sk->sk_receive_queue); + tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt; + goto exit; + } + + /* Iterate over all segments, detect their mapping (if we don't have + * one yet), validate them and push everything one level higher. + */ + skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { + int ret; + /* Pre-validation - e.g., early fallback */ + ret = mptcp_prevalidate_skb(sk, skb); + if (ret < 0) + goto restart; + else if (ret > 0) + break; + + /* Set the current mapping */ + ret = mptcp_detect_mapping(sk, skb); + if (ret < 0) + goto restart; + else if (ret > 0) + break; + + /* Validation */ + if (mptcp_validate_mapping(sk, skb) < 0) + goto restart; + + /* Push a level higher */ + ret = mptcp_queue_skb(sk); + if (ret < 0) { + if (ret == -1) + queued = ret; + goto restart; + } else if (ret == 0) { + continue; + } else { /* ret == 1 */ + break; + } + } + +exit: + if (tcp_sk(sk)->close_it && sk->sk_state == TCP_FIN_WAIT2) { + tcp_send_ack(sk); + tcp_sk(sk)->ops->time_wait(sk, TCP_TIME_WAIT, 0); + } + + if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD)) + meta_sk->sk_data_ready(meta_sk); +} + +struct mp_join *mptcp_find_join(const struct sk_buff *skb) +{ + const struct tcphdr *th = tcp_hdr(skb); + unsigned char *ptr; + int length = (th->doff * 4) - sizeof(struct tcphdr); + + /* Jump through the options to check whether JOIN is there */ + ptr = (unsigned char *)(th + 1); + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return NULL; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return NULL; + if (opsize > length) + return NULL; /* don't parse partial options */ + if (opcode == TCPOPT_MPTCP && + ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) { + return (struct mp_join *)(ptr - 2); + } + ptr += opsize - 2; + length -= opsize; + } + } + return NULL; +} + +int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw) +{ + struct sock *meta_sk; + u32 token; + bool meta_v4; + struct mp_join *join_opt = mptcp_find_join(skb); + if (!join_opt) + return 0; + + /* MPTCP structures were not initialized, so return error */ + if (mptcp_init_failed) + return -1; + + token = join_opt->u.syn.token; + meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token); + if (!meta_sk) { + MPTCP_INC_STATS(dev_net(skb_dst(skb)->dev), MPTCP_MIB_JOINNOTOKEN); + mptcp_debug("%s:mpcb not found:%x\n", __func__, token); + return -1; + } + + meta_v4 = meta_sk->sk_family == AF_INET; + if (meta_v4) { + if (skb->protocol == htons(ETH_P_IPV6)) { + mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n"); + sock_put(meta_sk); /* Taken by mptcp_hash_find */ + return -1; + } + } else if (skb->protocol == htons(ETH_P_IP) && meta_sk->sk_ipv6only) { + mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n"); + sock_put(meta_sk); /* Taken by mptcp_hash_find */ + return -1; + } + + 
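To make the control flow around mptcp_lookup_join() concrete: a SYN carrying MP_JOIN never creates a session of its own; the 32-bit token selects an existing meta-socket, and only then is the segment handed to that socket's receive path. The toy table below (user-space C with invented types, not the kernel's token hashtable) sketches that lookup step; an unknown token corresponds to the MPTCP_MIB_JOINNOTOKEN branch above:

#include <stdint.h>
#include <stddef.h>

#define TOKEN_BUCKETS 16

struct meta_session {			/* stand-in for the meta-socket */
	uint32_t token;
	struct meta_session *next;
};

static struct meta_session *token_htable[TOKEN_BUCKETS];

/* Find the MPTCP session a join with this token belongs to. */
static struct meta_session *token_lookup(uint32_t token)
{
	struct meta_session *s = token_htable[token % TOKEN_BUCKETS];

	while (s && s->token != token)
		s = s->next;
	return s;			/* NULL: refuse the join */
}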
/* Coming from time-wait-sock processing in tcp_v4_rcv. + * We have to deschedule it before continuing, because otherwise + * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req. + */ + if (tw) + inet_twsk_deschedule_put(tw); + + /* OK, this is a new syn/join, let's create a new open request and + * send syn+ack + */ + if (skb->protocol == htons(ETH_P_IP)) { + tcp_v4_do_rcv(meta_sk, skb); +#if IS_ENABLED(CONFIG_IPV6) + } else { + tcp_v6_do_rcv(meta_sk, skb); +#endif /* CONFIG_IPV6 */ + } + sock_put(meta_sk); /* Taken by mptcp_hash_find */ + return 1; +} + +int mptcp_do_join_short(struct sk_buff *skb, + const struct mptcp_options_received *mopt, + struct net *net) +{ + struct sock *meta_sk; + u32 token; + bool meta_v4; + + token = mopt->mptcp_rem_token; + meta_sk = mptcp_hash_find(net, token); + if (!meta_sk) { + MPTCP_INC_STATS(dev_net(skb_dst(skb)->dev), MPTCP_MIB_JOINNOTOKEN); + mptcp_debug("%s:mpcb not found:%x\n", __func__, token); + return -1; + } + + meta_v4 = meta_sk->sk_family == AF_INET; + if (meta_v4) { + if (skb->protocol == htons(ETH_P_IPV6)) { + mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n"); + sock_put(meta_sk); /* Taken by mptcp_hash_find */ + return -1; + } + } else if (skb->protocol == htons(ETH_P_IP) && meta_sk->sk_ipv6only) { + mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n"); + sock_put(meta_sk); /* Taken by mptcp_hash_find */ + return -1; + } + + /* OK, this is a new syn/join, let's create a new open request and + * send syn+ack + */ + + /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as + * the skb will finally be freed by tcp_v4_do_rcv (where we are + * coming from) + */ + skb_get(skb); + if (skb->protocol == htons(ETH_P_IP)) { + tcp_v4_do_rcv(meta_sk, skb); +#if IS_ENABLED(CONFIG_IPV6) + } else { /* IPv6 */ + tcp_v6_do_rcv(meta_sk, skb); +#endif /* CONFIG_IPV6 */ + } + + sock_put(meta_sk); /* Taken by mptcp_hash_find */ + return 0; +} + +/** + * Equivalent of tcp_fin() for MPTCP + * Can be called only when the FIN is validly part + * of the data seqnum space. Not before when we get holes. + */ +void mptcp_fin(struct sock *meta_sk) +{ + struct sock *sk = NULL; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct mptcp_tcp_sock *mptcp; + unsigned char state; + + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) { + sk = sk_it; + break; + } + } + + if (!sk || sk->sk_state == TCP_CLOSE) + sk = mptcp_select_ack_sock(meta_sk); + + inet_csk_schedule_ack(sk); + + if (!mpcb->in_time_wait) { + meta_sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(meta_sk, SOCK_DONE); + state = meta_sk->sk_state; + } else { + state = mpcb->mptw_state; + } + + switch (state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + /* Move to CLOSE_WAIT */ + tcp_set_state(meta_sk, TCP_CLOSE_WAIT); + inet_csk(sk)->icsk_ack.pingpong = 1; + break; + + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* Received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; + + case TCP_FIN_WAIT1: + /* This case occurs when a simultaneous close + * happens, we must ack the received FIN and + * enter the CLOSING state. + */ + tcp_send_ack(sk); + tcp_set_state(meta_sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* Received a FIN -- send ACK and enter TIME_WAIT. 
*/ + tcp_send_ack(sk); + meta_tp->ops->time_wait(meta_sk, TCP_TIME_WAIT, 0); + break; + default: + /* Only TCP_LISTEN and TCP_CLOSE are left, in these + * cases we should never reach this piece of code. + */ + pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__, + meta_sk->sk_state); + break; + } + + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ + skb_rbtree_purge(&meta_tp->out_of_order_queue); + sk_mem_reclaim(meta_sk); + + if (!sock_flag(meta_sk, SOCK_DEAD)) { + meta_sk->sk_state_change(meta_sk); + + /* Do not send POLL_HUP for half duplex close. */ + if (meta_sk->sk_shutdown == SHUTDOWN_MASK || + meta_sk->sk_state == TCP_CLOSE) + sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN); + } + + return; +} + +/* Similar to tcp_xmit_retransmit_queue */ +static void mptcp_xmit_retransmit_queue(struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct sk_buff *skb, *rtx_head; + + if (!meta_tp->packets_out) + return; + + skb = rtx_head = tcp_rtx_queue_head(meta_sk); + skb_rbtree_walk_from(skb) { + if (mptcp_retransmit_skb(meta_sk, skb)) + return; + + if (skb == rtx_head) + inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, + inet_csk(meta_sk)->icsk_rto, + TCP_RTO_MAX); + } +} + +static void mptcp_snd_una_update(struct tcp_sock *meta_tp, u32 data_ack) +{ + u32 delta = data_ack - meta_tp->snd_una; + + sock_owned_by_me((struct sock *)meta_tp); + meta_tp->bytes_acked += delta; + meta_tp->snd_una = data_ack; +} + +static void mptcp_stop_subflow_chronos(struct sock *meta_sk, + const enum tcp_chrono type) +{ + const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + tcp_chrono_stop(sk_it, type); + } +} + +/* Return false if we can continue processing packets. True, otherwise */ +static bool mptcp_process_data_ack(struct sock *sk, const struct sk_buff *skb) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + u32 prior_snd_una = meta_tp->snd_una; + int prior_packets; + u32 nwin, data_ack, data_seq; + u16 data_len = 0; + + /* A valid packet came in - subflow is operational again */ + tp->pf = 0; + + /* Even if there is no data-ack, we stop retransmitting. + * Except if this is a SYN/ACK. Then it is just a retransmission + */ + if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) { + tp->mptcp->pre_established = 0; + sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); + + if (meta_tp->mpcb->pm_ops->established_subflow) + meta_tp->mpcb->pm_ops->established_subflow(sk); + } + + /* If we are in infinite mapping mode, rx_opt.data_ack has been + * set by mptcp_handle_ack_in_infinite. + */ + if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd) + return false; + + if (unlikely(!tp->mptcp->fully_established) && + tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq) + /* As soon as a subflow-data-ack (not acking syn, thus snt_isn + 1) + * includes a data-ack, we are fully established + */ + mptcp_become_fully_estab(sk); + + /* After we did the subflow-only processing (stopping timer and marking + * subflow as established), check if we can proceed with MPTCP-level + * processing. 
+ */ + if (meta_sk->sk_state == TCP_CLOSE) + return false; + + /* Get the data_seq */ + if (mptcp_is_data_seq(skb)) { + data_seq = tp->mptcp->rx_opt.data_seq; + data_len = tp->mptcp->rx_opt.data_len; + } else { + data_seq = meta_tp->snd_wl1; + } + + data_ack = tp->mptcp->rx_opt.data_ack; + + /* If the ack is older than previous acks + * then we can probably ignore it. + */ + if (before(data_ack, prior_snd_una)) + goto exit; + + /* If the ack includes data we haven't sent yet, discard + * this segment (RFC793 Section 3.9). + */ + if (after(data_ack, meta_tp->snd_nxt)) + goto exit; + + /*** Now, update the window - inspired by tcp_ack_update_window ***/ + nwin = ntohs(tcp_hdr(skb)->window); + + if (likely(!tcp_hdr(skb)->syn)) + nwin <<= tp->rx_opt.snd_wscale; + + if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) { + tcp_update_wl(meta_tp, data_seq); + + /* Draft v09, Section 3.3.5: + * [...] It should only update its local receive window values + * when the largest sequence number allowed (i.e. DATA_ACK + + * receive window) increases. [...] + */ + if (meta_tp->snd_wnd != nwin && + !before(data_ack + nwin, tcp_wnd_end(meta_tp))) { + meta_tp->snd_wnd = nwin; + + if (nwin > meta_tp->max_window) + meta_tp->max_window = nwin; + } + } + /*** Done, update the window ***/ + + /* We passed data and got it acked, remove any soft error + * log. Something worked... + */ + sk->sk_err_soft = 0; + inet_csk(meta_sk)->icsk_probes_out = 0; + meta_tp->rcv_tstamp = tcp_jiffies32; + prior_packets = meta_tp->packets_out; + if (!prior_packets) + goto no_queue; + + mptcp_snd_una_update(meta_tp, data_ack); + + mptcp_clean_rtx_queue(meta_sk, prior_snd_una); + + /* We are in loss-state, and something got acked, retransmit the whole + * queue now! + */ + if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss && + after(data_ack, prior_snd_una)) { + mptcp_xmit_retransmit_queue(meta_sk); + inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open; + } + + /* Simplified version of tcp_new_space, because the snd-buffer + * is handled by all the subflows. + */ + if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) { + sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK); + if (meta_sk->sk_socket && + test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) + meta_sk->sk_write_space(meta_sk); + + if (meta_sk->sk_socket && + !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) { + tcp_chrono_stop(meta_sk, TCP_CHRONO_SNDBUF_LIMITED); + mptcp_stop_subflow_chronos(meta_sk, + TCP_CHRONO_SNDBUF_LIMITED); + } + } + + if (meta_sk->sk_state != TCP_ESTABLISHED) { + int ret = mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len); + + if (ret < 0) + return true; + else if (ret > 0) + return false; + } + +exit: + mptcp_push_pending_frames(meta_sk); + + return false; + +no_queue: + if (tcp_send_head(meta_sk)) + tcp_ack_probe(meta_sk); + + mptcp_push_pending_frames(meta_sk); + + return false; +} + +/* Return false if we can continue processing packets. True, otherwise */ +bool mptcp_handle_ack_in_infinite(struct sock *sk, const struct sk_buff *skb, + int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *meta_tp = mptcp_meta_tp(tp); + struct mptcp_cb *mpcb = tp->mpcb; + + /* We are already in fallback-mode. Data is in-sequence and we know + * exactly what is being sent on this subflow belongs to the current + * meta-level sequence number space. 
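Because subflow and data sequence space advance in lockstep once the connection has fallen back, the data-level ACK in this mode is reconstructed from the subflow's own send state rather than read from a DSS option: everything mapped so far minus what is still in flight. A worked sketch of that calculation, in user-space C with invented numbers:

#include <stdint.h>
#include <assert.h>

int main(void)
{
	/* Hypothetical fallback state: 4000 bytes mapped so far on this
	 * subflow (last_end_data_seq), 500 of them still unacknowledged.
	 */
	uint64_t last_end_data_seq = 4000;
	uint32_t snd_nxt = 10500, snd_una = 10000;

	/* The peer has data-acked everything except the in-flight bytes. */
	uint64_t data_ack = last_end_data_seq - (snd_nxt - snd_una);

	assert(data_ack == 3500);
	return 0;
}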
+	 */
+	if (mpcb->infinite_mapping_snd) {
+		if (mpcb->infinite_send_una_ahead &&
+		    !before(meta_tp->snd_una, tp->mptcp->last_end_data_seq - (tp->snd_nxt - tp->snd_una))) {
+			tp->mptcp->rx_opt.data_ack = meta_tp->snd_una;
+		} else {
+			/* Remember that meta snd_una is no more ahead of the game */
+			mpcb->infinite_send_una_ahead = 0;
+
+			/* The difference between both write_seq's represents the offset between
+			 * data-sequence and subflow-sequence. As we are infinite, this must
+			 * match.
+			 *
+			 * Thus, from this difference we can infer the meta snd_una.
+			 */
+			tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt -
+				(tp->snd_nxt - tp->snd_una);
+		}
+
+		goto exit;
+	}
+
+	/* If data has been acknowledged on the meta-level, fully_established
+	 * will have been set before and thus we will not fall back to infinite
+	 * mapping.
+	 */
+	if (likely(tp->mptcp->fully_established))
+		return false;
+
+	if (!(flag & MPTCP_FLAG_DATA_ACKED))
+		return false;
+
+	pr_debug("%s %#x will fallback - pi %d, src %pI4:%u dst %pI4:%u rcv_nxt %u\n",
+		 __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
+		 &inet_sk(sk)->inet_saddr, ntohs(inet_sk(sk)->inet_sport),
+		 &inet_sk(sk)->inet_daddr, ntohs(inet_sk(sk)->inet_dport),
+		 tp->rcv_nxt);
+	if (!is_master_tp(tp)) {
+		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBACKSUB);
+		return true;
+	}
+
+	/* We have sent more than what has ever been sent on the master subflow.
+	 * This means, we won't be able to fall back seamlessly because there
+	 * will now be a hole in the sequence space.
+	 */
+	if (before(tp->mptcp->last_end_data_seq, meta_tp->snd_una))
+		return true;
+
+	mpcb->infinite_mapping_snd = 1;
+	mpcb->infinite_mapping_rcv = 1;
+	mpcb->infinite_rcv_seq = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
+	tp->mptcp->fully_established = 1;
+
+	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBACKINIT);
+
+	if (mptcp_fallback_close(mpcb, sk))
+		return true;
+
+	mptcp_restart_sending(tp->meta_sk, tp->mptcp->last_end_data_seq);
+
+	/* The acknowledged data-seq at the subflow-level is:
+	 * last_end_data_seq - (tp->snd_nxt - tp->snd_una)
+	 *
+	 * If this is less than meta->snd_una, then we ignore it. Otherwise,
+	 * this becomes our data_ack.
+ */ + if (after(meta_tp->snd_una, tp->mptcp->last_end_data_seq - (tp->snd_nxt - tp->snd_una))) { + /* Remmeber that meta snd_una is ahead of the game */ + mpcb->infinite_send_una_ahead = 1; + tp->mptcp->rx_opt.data_ack = meta_tp->snd_una; + } else { + tp->mptcp->rx_opt.data_ack = tp->mptcp->last_end_data_seq - + (tp->snd_nxt - tp->snd_una); + } + +exit: + + return mptcp_process_data_ack(sk, skb); +} + +/**** static functions used by mptcp_parse_options */ + +static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id) +{ + struct mptcp_tcp_sock *mptcp; + struct hlist_node *tmp; + + mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) { + mptcp_reinject_data(sk_it, 0); + mptcp_send_reset(sk_it); + } + } +} + +static inline bool is_valid_addropt_opsize(u8 mptcp_ver, + struct mp_add_addr *mpadd, + int opsize) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (mptcp_ver < MPTCP_VERSION_1 && mpadd->ipver == 6) { + return opsize == MPTCP_SUB_LEN_ADD_ADDR6 || + opsize == MPTCP_SUB_LEN_ADD_ADDR6 + 2; + } + if (mptcp_ver >= MPTCP_VERSION_1 && mpadd->ipver == 6) + return opsize == MPTCP_SUB_LEN_ADD_ADDR6_VER1 || + opsize == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2; +#endif + if (mptcp_ver < MPTCP_VERSION_1 && mpadd->ipver == 4) { + return opsize == MPTCP_SUB_LEN_ADD_ADDR4 || + opsize == MPTCP_SUB_LEN_ADD_ADDR4 + 2; + } + if (mptcp_ver >= MPTCP_VERSION_1 && mpadd->ipver == 4) { + return opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 || + opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2; + } + return false; +} + +void mptcp_parse_options(const uint8_t *ptr, int opsize, + struct mptcp_options_received *mopt, + const struct sk_buff *skb, + struct tcp_sock *tp) +{ + const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr; + + /* If the socket is mp-capable we would have a mopt. */ + if (!mopt) + return; + + switch (mp_opt->sub) { + case MPTCP_SUB_CAPABLE: + { + const struct mp_capable *mpcapable = (struct mp_capable *)ptr; + + if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN && + opsize != MPTCP_SUB_LEN_CAPABLE_ACK) { + mptcp_debug("%s: mp_capable: bad option size %d\n", + __func__, opsize); + break; + } + + /* MPTCP-RFC 6824: + * "If receiving a message with the 'B' flag set to 1, and this + * is not understood, then this SYN MUST be silently ignored; + */ + if (mpcapable->b) { + mopt->drop_me = 1; + break; + } + + /* MPTCP-RFC 6824: + * "An implementation that only supports this method MUST set + * bit "H" to 1, and bits "C" through "G" to 0." + */ + if (!mpcapable->h) + break; + + mopt->saw_mpc = 1; + mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a; + + if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN) + mopt->mptcp_sender_key = mpcapable->sender_key; + if (opsize == MPTCP_SUB_LEN_CAPABLE_ACK) + mopt->mptcp_receiver_key = mpcapable->receiver_key; + + mopt->mptcp_ver = mpcapable->ver; + break; + } + case MPTCP_SUB_JOIN: + { + const struct mp_join *mpjoin = (struct mp_join *)ptr; + + if (opsize != MPTCP_SUB_LEN_JOIN_SYN && + opsize != MPTCP_SUB_LEN_JOIN_SYNACK && + opsize != MPTCP_SUB_LEN_JOIN_ACK) { + mptcp_debug("%s: mp_join: bad option size %d\n", + __func__, opsize); + break; + } + + /* saw_mpc must be set, because in tcp_check_req we assume that + * it is set to support falling back to reg. 
TCP if a rexmitted + * SYN has no MP_CAPABLE or MP_JOIN + */ + switch (opsize) { + case MPTCP_SUB_LEN_JOIN_SYN: + mopt->is_mp_join = 1; + mopt->saw_mpc = 1; + mopt->low_prio = mpjoin->b; + mopt->rem_id = mpjoin->addr_id; + mopt->mptcp_rem_token = mpjoin->u.syn.token; + mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce; + break; + case MPTCP_SUB_LEN_JOIN_SYNACK: + mopt->saw_mpc = 1; + mopt->low_prio = mpjoin->b; + mopt->rem_id = mpjoin->addr_id; + mopt->mptcp_recv_tmac = mpjoin->u.synack.mac; + mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce; + break; + case MPTCP_SUB_LEN_JOIN_ACK: + mopt->saw_mpc = 1; + mopt->join_ack = 1; + memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20); + break; + } + break; + } + case MPTCP_SUB_DSS: + { + const struct mp_dss *mdss = (struct mp_dss *)ptr; + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + /* We check opsize for the csum and non-csum case. We do this, + * because the draft says that the csum SHOULD be ignored if + * it has not been negotiated in the MP_CAPABLE but still is + * present in the data. + * + * It will get ignored later in mptcp_queue_skb. + */ + if (opsize != mptcp_sub_len_dss(mdss, 0) && + opsize != mptcp_sub_len_dss(mdss, 1)) { + mptcp_debug("%s: mp_dss: bad option size %d\n", + __func__, opsize); + break; + } + + ptr += 4; + + if (mdss->A) { + tcb->mptcp_flags |= MPTCPHDR_ACK; + + if (mdss->a) { + mopt->data_ack = (u32) get_unaligned_be64(ptr); + ptr += MPTCP_SUB_LEN_ACK_64; + } else { + mopt->data_ack = get_unaligned_be32(ptr); + ptr += MPTCP_SUB_LEN_ACK; + } + } + + tcb->dss_off = (ptr - skb_transport_header(skb)); + + if (mdss->M) { + if (mdss->m) { + u64 data_seq64 = get_unaligned_be64(ptr); + + tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET; + mopt->data_seq = (u32) data_seq64; + + ptr += 12; /* 64-bit dseq + subseq */ + } else { + mopt->data_seq = get_unaligned_be32(ptr); + ptr += 8; /* 32-bit dseq + subseq */ + } + mopt->data_len = get_unaligned_be16(ptr); + + tcb->mptcp_flags |= MPTCPHDR_SEQ; + + /* Is a check-sum present? */ + if (opsize == mptcp_sub_len_dss(mdss, 1)) + tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM; + + /* DATA_FIN only possible with DSS-mapping */ + if (mdss->F) + tcb->mptcp_flags |= MPTCPHDR_FIN; + } + + break; + } + case MPTCP_SUB_ADD_ADDR: + { + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; + + /* If tcp_sock is not available, MPTCP version can't be + * retrieved and ADD_ADDR opsize validation is not possible. + */ + if (!tp || !tp->mpcb) + break; + + if (!is_valid_addropt_opsize(tp->mpcb->mptcp_ver, + mpadd, opsize)) { + mptcp_debug("%s: mp_add_addr: bad option size %d\n", + __func__, opsize); + break; + } + + /* We have to manually parse the options if we got two of them. 
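Both mptcp_find_join() earlier and tcp_parse_mptcp_options() below walk the TCP option block the same way: EOL stops the walk, NOP advances a single byte, and every other option carries a length byte that must be at least 2 and must fit in the remaining space before its payload is trusted. A self-contained sketch of that walk in user-space C; the helper name and the counting are illustrative only, and TCPOPT_MPTCP is the IANA-assigned option kind 30:

#include <stdint.h>

#define TCPOPT_EOL	0
#define TCPOPT_NOP	1
#define TCPOPT_MPTCP	30

/* Count MPTCP options in a raw TCP option block of 'length' bytes. */
static int count_mptcp_options(const uint8_t *ptr, int length)
{
	int found = 0;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {	/* one-byte padding */
			length--;
			continue;
		}
		if (length < 2)
			break;			/* no room for a length byte */
		opsize = *ptr++;
		if (opsize < 2 || opsize > length)
			break;			/* malformed or truncated */
		if (opcode == TCPOPT_MPTCP)
			found++;
		ptr += opsize - 2;
		length -= opsize;
	}
	return found;
}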
*/
+ if (mopt->saw_add_addr) {
+ mopt->more_add_addr = 1;
+ break;
+ }
+ mopt->saw_add_addr = 1;
+ mopt->add_addr_ptr = ptr;
+ break;
+ }
+ case MPTCP_SUB_REMOVE_ADDR:
+ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
+ mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
+ __func__, opsize);
+ break;
+ }
+
+ if (mopt->saw_rem_addr) {
+ mopt->more_rem_addr = 1;
+ break;
+ }
+ mopt->saw_rem_addr = 1;
+ mopt->rem_addr_ptr = ptr;
+ break;
+ case MPTCP_SUB_PRIO:
+ {
+ const struct mp_prio *mpprio = (struct mp_prio *)ptr;
+
+ if (opsize != MPTCP_SUB_LEN_PRIO &&
+ opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
+ mptcp_debug("%s: mp_prio: bad option size %d\n",
+ __func__, opsize);
+ break;
+ }
+
+ mopt->saw_low_prio = 1;
+ mopt->low_prio = mpprio->b;
+
+ if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
+ mopt->saw_low_prio = 2;
+ mopt->prio_addr_id = mpprio->addr_id;
+ }
+ break;
+ }
+ case MPTCP_SUB_FAIL:
+ if (opsize != MPTCP_SUB_LEN_FAIL) {
+ mptcp_debug("%s: mp_fail: bad option size %d\n",
+ __func__, opsize);
+ break;
+ }
+ mopt->mp_fail = 1;
+ break;
+ case MPTCP_SUB_FCLOSE:
+ if (opsize != MPTCP_SUB_LEN_FCLOSE) {
+ mptcp_debug("%s: mp_fclose: bad option size %d\n",
+ __func__, opsize);
+ break;
+ }
+
+ mopt->mp_fclose = 1;
+ mopt->mptcp_sender_key = ((struct mp_fclose *)ptr)->key;
+
+ break;
+ default:
+ mptcp_debug("%s: Received unknown subtype: %d\n",
+ __func__, mp_opt->sub);
+ break;
+ }
+}
+
+/** Parse only MPTCP options */
+void tcp_parse_mptcp_options(const struct sk_buff *skb,
+ struct mptcp_options_received *mopt)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
+ const unsigned char *ptr = (const unsigned char *)(th + 1);
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ return; /* don't parse partial options */
+ if (opcode == TCPOPT_MPTCP)
+ mptcp_parse_options(ptr - 2, opsize, mopt, skb, NULL);
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+}
+
+bool mptcp_check_rtt(const struct tcp_sock *tp, int time)
+{
+ struct mptcp_cb *mpcb = tp->mpcb;
+ struct mptcp_tcp_sock *mptcp;
+ u32 rtt_max = 0;
+
+ /* In MPTCP, we take the max delay across all flows,
+ * in order to take into account meta-reordering buffers.
+ */ + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + + if (!mptcp_sk_can_recv(sk)) + continue; + + if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt_us) + rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt_us; + } + if (time < (rtt_max >> 3) || !rtt_max) + return true; + + return false; +} + +static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk) +{ + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; + __be16 port = 0; + union inet_addr addr; + sa_family_t family; + + if (mpadd->ipver == 4) { + char *recv_hmac; + u8 hash_mac_check[20]; + u8 no_key[8]; + int msg_parts = 0; + + if (mpcb->mptcp_ver < MPTCP_VERSION_1) + goto skip_hmac_v4; + + *(u64 *)no_key = 0; + recv_hmac = (char *)mpadd->u.v4.mac; + if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1) { + recv_hmac -= sizeof(mpadd->u.v4.port); + msg_parts = 2; + } else if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2) { + msg_parts = 3; + } + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, + (u8 *)no_key, + (u32 *)hash_mac_check, msg_parts, + 1, (u8 *)&mpadd->addr_id, + 4, (u8 *)&mpadd->u.v4.addr.s_addr, + 2, (u8 *)&mpadd->u.v4.port); + if (memcmp(hash_mac_check, recv_hmac, 8) != 0) + /* ADD_ADDR2 discarded */ + return; +skip_hmac_v4: + if ((mpcb->mptcp_ver == MPTCP_VERSION_0 && + mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2) || + (mpcb->mptcp_ver == MPTCP_VERSION_1 && + mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2)) + port = mpadd->u.v4.port; + family = AF_INET; + addr.in = mpadd->u.v4.addr; +#if IS_ENABLED(CONFIG_IPV6) + } else if (mpadd->ipver == 6) { + char *recv_hmac; + u8 hash_mac_check[20]; + u8 no_key[8]; + int msg_parts = 0; + + if (mpcb->mptcp_ver < MPTCP_VERSION_1) + goto skip_hmac_v6; + + *(u64 *)no_key = 0; + recv_hmac = (char *)mpadd->u.v6.mac; + if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1) { + recv_hmac -= sizeof(mpadd->u.v6.port); + msg_parts = 2; + } else if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2) { + msg_parts = 3; + } + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, + (u8 *)no_key, + (u32 *)hash_mac_check, msg_parts, + 1, (u8 *)&mpadd->addr_id, + 16, (u8 *)&mpadd->u.v6.addr.s6_addr, + 2, (u8 *)&mpadd->u.v6.port); + if (memcmp(hash_mac_check, recv_hmac, 8) != 0) + /* ADD_ADDR2 discarded */ + return; +skip_hmac_v6: + if ((mpcb->mptcp_ver == MPTCP_VERSION_0 && + mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2) || + (mpcb->mptcp_ver == MPTCP_VERSION_1 && + mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2)) + port = mpadd->u.v6.port; + family = AF_INET6; + addr.in6 = mpadd->u.v6.addr; +#endif /* CONFIG_IPV6 */ + } else { + return; + } + + if (mpcb->pm_ops->add_raddr) + mpcb->pm_ops->add_raddr(mpcb, &addr, family, port, mpadd->addr_id); + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRRX); +} + +static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk) +{ + struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; + int i; + u8 rem_id; + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; + + for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) { + rem_id = (&mprem->addrs_id)[i]; + + if (mpcb->pm_ops->rem_raddr) + mpcb->pm_ops->rem_raddr(mpcb, rem_id); + mptcp_send_reset_rem_id(mpcb, rem_id); + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_REMADDRSUB); + } + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_REMADDRRX); +} + +static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk) +{ + struct tcphdr *th = tcp_hdr(skb); + unsigned char *ptr; + int length = (th->doff * 4) - sizeof(struct tcphdr); + + /* Jump through the 
options to check whether ADD_ADDR is there */ + ptr = (unsigned char *)(th + 1); + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) + return; + if (opsize > length) + return; /* don't parse partial options */ + if (opcode == TCPOPT_MPTCP && + ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) { + u8 mptcp_ver = tcp_sk(sk)->mpcb->mptcp_ver; + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; + + if (!is_valid_addropt_opsize(mptcp_ver, mpadd, + opsize)) + goto cont; + + mptcp_handle_add_addr(ptr, sk); + } + if (opcode == TCPOPT_MPTCP && + ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) { + if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) + goto cont; + + mptcp_handle_rem_addr(ptr, sk); + } +cont: + ptr += opsize - 2; + length -= opsize; + } + } + return; +} + +static bool mptcp_mp_fastclose_rcvd(struct sock *sk) +{ + struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp; + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; + + if (likely(!mptcp->rx_opt.mp_fclose)) + return false; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FASTCLOSERX); + mptcp->rx_opt.mp_fclose = 0; + if (mptcp->rx_opt.mptcp_sender_key != mpcb->mptcp_loc_key) + return false; + + mptcp_sub_force_close_all(mpcb, NULL); + + tcp_reset(mptcp_meta_sk(sk)); + + return true; +} + +/* Returns true if we should stop processing NOW */ +static bool mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th) +{ + struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp; + struct sock *meta_sk = mptcp_meta_sk(sk); + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX); + mptcp->rx_opt.mp_fail = 0; + + if (!th->rst && !mpcb->infinite_mapping_snd) { + mpcb->send_infinite_mapping = 1; + + mptcp_restart_sending(meta_sk, tcp_sk(meta_sk)->snd_una); + + return mptcp_fallback_close(mpcb, sk); + } + + return false; +} + +static inline void mptcp_path_array_check(struct sock *meta_sk) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + + if (unlikely(mpcb->list_rcvd)) { + mpcb->list_rcvd = 0; + if (mpcb->pm_ops->new_remote_address) + mpcb->pm_ops->new_remote_address(meta_sk); + } +} + +bool mptcp_handle_options(struct sock *sk, const struct tcphdr *th, + const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_options_received *mopt = &tp->mptcp->rx_opt; + struct mptcp_cb *mpcb = tp->mpcb; + + if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd) + return false; + + if (mptcp_mp_fastclose_rcvd(sk)) + return true; + + if (sk->sk_state == TCP_RST_WAIT && !th->rst) + return true; + + if (unlikely(mopt->mp_fail) && mptcp_mp_fail_rcvd(sk, th)) + return true; + + /* RFC 6824, Section 3.3: + * If a checksum is not present when its use has been negotiated, the + * receiver MUST close the subflow with a RST as it is considered broken. + */ + if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum && + !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) { + mptcp_send_reset(sk); + return true; + } + + /* We have to acknowledge retransmissions of the third + * ack. 
+ */ + if (mopt->join_ack) { + tcp_send_delayed_ack(sk); + mopt->join_ack = 0; + } + + if (mopt->saw_add_addr || mopt->saw_rem_addr) { + if (mopt->more_add_addr || mopt->more_rem_addr) { + mptcp_parse_addropt(skb, sk); + } else { + if (mopt->saw_add_addr) + mptcp_handle_add_addr(mopt->add_addr_ptr, sk); + if (mopt->saw_rem_addr) + mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk); + } + + mopt->more_add_addr = 0; + mopt->saw_add_addr = 0; + mopt->more_rem_addr = 0; + mopt->saw_rem_addr = 0; + } + if (mopt->saw_low_prio) { + if (mopt->saw_low_prio == 1) { + tp->mptcp->rcv_low_prio = mopt->low_prio; + if (mpcb->pm_ops->prio_changed) + mpcb->pm_ops->prio_changed(sk, mopt->low_prio); + } else { + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(tp->mpcb, mptcp) { + if (mptcp->rem_id == mopt->prio_addr_id) { + mptcp->rcv_low_prio = mopt->low_prio; + if (mpcb->pm_ops->prio_changed) + mpcb->pm_ops->prio_changed(sk, + mopt->low_prio); + } + } + } + mopt->saw_low_prio = 0; + } + + if (mptcp_process_data_ack(sk, skb)) + return true; + + mptcp_path_array_check(mptcp_meta_sk(sk)); + /* Socket may have been mp_killed by a REMOVE_ADDR */ + if (tp->mp_killed) + return true; + + return false; +} + +static void _mptcp_rcv_synsent_fastopen(struct sock *meta_sk, + struct sk_buff *skb, bool rtx_queue) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk); + u32 new_mapping = meta_tp->write_seq - master_tp->snd_una; + + /* If the server only acknowledges partially the data sent in + * the SYN, we need to trim the acknowledged part because + * we don't want to retransmit this already received data. + * When we reach this point, tcp_ack() has already cleaned up + * fully acked segments. However, tcp trims partially acked + * segments only when retransmitting. Since MPTCP comes into + * play only now, we will fake an initial transmit, and + * retransmit_skb() will not be called. The following fragment + * comes from __tcp_retransmit_skb(). + */ + if (before(TCP_SKB_CB(skb)->seq, master_tp->snd_una)) { + BUG_ON(before(TCP_SKB_CB(skb)->end_seq, master_tp->snd_una)); + /* tcp_trim_head can only returns ENOMEM if skb is + * cloned. It is not the case here (see + * tcp_send_syn_data). + */ + BUG_ON(tcp_trim_head(meta_sk, skb, master_tp->snd_una - + TCP_SKB_CB(skb)->seq)); + } + + TCP_SKB_CB(skb)->seq += new_mapping; + TCP_SKB_CB(skb)->end_seq += new_mapping; + + list_del(&skb->tcp_tsorted_anchor); + + if (rtx_queue) + tcp_rtx_queue_unlink(skb, meta_sk); + + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); + + if (rtx_queue) + tcp_add_write_queue_tail(meta_sk, skb); +} + +/* In case of fastopen, some data can already be in the write queue. + * We need to update the sequence number of the segments as they + * were initially TCP sequence numbers. + */ +static void mptcp_rcv_synsent_fastopen(struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk); + struct sk_buff *skb_write_head, *skb_rtx_head, *tmp; + + skb_write_head = tcp_write_queue_head(meta_sk); + skb_rtx_head = tcp_rtx_queue_head(meta_sk); + + if (!(skb_write_head || skb_rtx_head)) + return; + + /* There should only be one skb in {write, rtx} queue: the data not + * acknowledged in the SYN+ACK. In this case, we need to map + * this data to data sequence numbers. 
+ */ + + BUG_ON(skb_write_head && skb_rtx_head); + + if (skb_write_head) { + skb_queue_walk_from_safe(&meta_sk->sk_write_queue, + skb_write_head, tmp) { + _mptcp_rcv_synsent_fastopen(meta_sk, skb_write_head, + false); + } + } + + if (skb_rtx_head) { + skb_rbtree_walk_from_safe(skb_rtx_head, tmp) { + _mptcp_rcv_synsent_fastopen(meta_sk, skb_rtx_head, + true); + } + } + + /* We can advance write_seq by the number of bytes unacknowledged + * and that were mapped in the previous loop. + */ + meta_tp->write_seq += master_tp->write_seq - master_tp->snd_una; + + /* The packets from the master_sk will be entailed to it later + * Until that time, its write queue is empty, and + * write_seq must align with snd_una + */ + master_tp->snd_nxt = master_tp->write_seq = master_tp->snd_una; + master_tp->packets_out = 0; +} + +/* The skptr is needed, because if we become MPTCP-capable, we have to switch + * from meta-socket to master-socket. + * + * @return: 1 - we want to reset this connection + * 2 - we want to discard the received syn/ack + * 0 - everything is fine - continue + */ +int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr, + const struct sk_buff *skb, + const struct mptcp_options_received *mopt) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (mptcp(tp)) { + u8 hash_mac_check[20]; + struct mptcp_cb *mpcb = tp->mpcb; + + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, + (u8 *)&mpcb->mptcp_loc_key, + (u32 *)hash_mac_check, 2, + 4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce, + 4, (u8 *)&tp->mptcp->mptcp_loc_nonce); + if (memcmp(hash_mac_check, + (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC); + mptcp_sub_force_close(sk); + return 1; + } + + /* Set this flag in order to postpone data sending + * until the 4th ack arrives. + */ + tp->mptcp->pre_established = 1; + tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio; + + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key, + (u8 *)&mpcb->mptcp_rem_key, + (u32 *)&tp->mptcp->sender_mac[0], 2, + 4, (u8 *)&tp->mptcp->mptcp_loc_nonce, + 4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce); + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); + } else if (mopt->saw_mpc) { + struct sock *meta_sk = sk; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); + if (mopt->mptcp_ver > tcp_sk(sk)->mptcp_ver) + /* TODO Consider adding new MPTCP_INC_STATS entry */ + goto fallback; + + if (mptcp_create_master_sk(sk, mopt->mptcp_sender_key, + mopt->mptcp_ver, + ntohs(tcp_hdr(skb)->window))) + return 2; + + sk = tcp_sk(sk)->mpcb->master_sk; + *skptr = sk; + tp = tcp_sk(sk); + + /* If fastopen was used data might be in the send queue. We + * need to update their sequence number to MPTCP-level seqno. + * Note that it can happen in rare cases that fastopen_req is + * NULL and syn_data is 0 but fastopen indeed occurred and + * data has been queued in the write queue (but not sent). + * Example of such rare cases: connect is non-blocking and + * TFO is configured to work without cookies. + */ + mptcp_rcv_synsent_fastopen(meta_sk); + + /* -1, because the SYN consumed 1 byte. In case of TFO, we + * start the subflow-sequence number as if the data of the SYN + * is not part of any mapping. 
+ */ + tp->mptcp->snt_isn = tp->snd_una - 1; + tp->mpcb->dss_csum = mopt->dss_csum; + if (tp->mpcb->dss_csum) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CSUMENABLED); + + tp->mptcp->include_mpc = 1; + + sk_set_socket(sk, meta_sk->sk_socket); + sk->sk_wq = meta_sk->sk_wq; + + bh_unlock_sock(sk); + /* hold in sk_clone_lock due to initialization to 2 */ + sock_put(sk); + } else { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); +fallback: + tp->request_mptcp = 0; + + if (tp->inside_tk_table) + mptcp_hash_remove_bh(tp); + } + + if (mptcp(tp)) + tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq; + + return 0; +} + +/* Similar to tcp_should_expand_sndbuf */ +bool mptcp_should_expand_sndbuf(const struct sock *sk) +{ + const struct sock *meta_sk = mptcp_meta_sk(sk); + const struct tcp_sock *meta_tp = tcp_sk(meta_sk); + const struct mptcp_tcp_sock *mptcp; + + /* We circumvent this check in tcp_check_space, because we want to + * always call sk_write_space. So, we reproduce the check here. + */ + if (!meta_sk->sk_socket || + !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) + return false; + + /* If the user specified a specific send buffer setting, do + * not modify it. + */ + if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK) + return false; + + /* If we are under global TCP memory pressure, do not expand. */ + if (tcp_under_memory_pressure(meta_sk)) + return false; + + /* If we are under soft global TCP memory pressure, do not expand. */ + if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0)) + return false; + + /* For MPTCP we look for a subsocket that could send data. + * If we found one, then we update the send-buffer. + */ + mptcp_for_each_sub(meta_tp->mpcb, mptcp) { + const struct sock *sk_it = mptcp_to_sock(mptcp); + const struct tcp_sock *tp_it = tcp_sk(sk_it); + + if (!mptcp_sk_can_send(sk_it)) + continue; + + if (tcp_packets_in_flight(tp_it) < tp_it->snd_cwnd) + return true; + } + + return false; +} + +void mptcp_tcp_set_rto(struct sock *sk) +{ + tcp_set_rto(sk); + mptcp_set_rto(sk); +} diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c new file mode 100644 index 000000000000..318ecc4c1fdd --- /dev/null +++ b/net/mptcp/mptcp_ipv4.c @@ -0,0 +1,428 @@ +/* + * MPTCP implementation - IPv4-specific functions + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) +{ + return siphash_4u32((__force u32)saddr, (__force u32)daddr, + (__force u32)sport << 16 | (__force u32)dport, + mptcp_seed++, &mptcp_secret); +} + +u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, + u32 seed) +{ + return siphash_2u64((__force u64)saddr << 32 | (__force u64)daddr, + (__force u64)seed << 32 | (__force u64)sport << 16 | (__force u64)dport, + &mptcp_secret); +} + + +static void mptcp_v4_reqsk_destructor(struct request_sock *req) +{ + mptcp_reqsk_destructor(req); + + tcp_v4_reqsk_destructor(req); +} + +static int mptcp_v4_init_req(struct request_sock *req, const struct sock *sk, + struct sk_buff *skb, bool want_cookie) +{ + tcp_request_sock_ipv4_ops.init_req(req, sk, skb, want_cookie); + + mptcp_rsk(req)->hash_entry.pprev = NULL; + mptcp_rsk(req)->is_sub = 0; + inet_rsk(req)->mptcp_rqsk = 1; + + /* In case of SYN-cookies, we wait for the isn to be generated - it is + * input to the key-generation. + */ + if (!want_cookie) + mptcp_reqsk_init(req, sk, skb, false); + + return 0; +} + +#ifdef CONFIG_SYN_COOKIES +static u32 mptcp_v4_cookie_init_seq(struct request_sock *req, const struct sock *sk, + const struct sk_buff *skb, __u16 *mssp) +{ + __u32 isn = cookie_v4_init_sequence(req, sk, skb, mssp); + + tcp_rsk(req)->snt_isn = isn; + + mptcp_reqsk_init(req, sk, skb, true); + + return isn; +} +#endif + +/* May be called without holding the meta-level lock */ +static int mptcp_v4_join_init_req(struct request_sock *req, const struct sock *meta_sk, + struct sk_buff *skb, bool want_cookie) +{ + struct mptcp_request_sock *mtreq = mptcp_rsk(req); + const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + union inet_addr addr; + int loc_id; + bool low_prio = false; + + /* We need to do this as early as possible. Because, if we fail later + * (e.g., get_local_id), then reqsk_free tries to remove the + * request-socket from the htb in mptcp_hash_request_remove as pprev + * may be different from NULL. + */ + mtreq->hash_entry.pprev = NULL; + + tcp_request_sock_ipv4_ops.init_req(req, meta_sk, skb, want_cookie); + + mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + tcp_hdr(skb)->source, + tcp_hdr(skb)->dest); + addr.ip = inet_rsk(req)->ir_loc_addr; + loc_id = mpcb->pm_ops->get_local_id(meta_sk, AF_INET, &addr, &low_prio); + if (loc_id == -1) + return -1; + mtreq->loc_id = loc_id; + mtreq->low_prio = low_prio; + + mptcp_join_reqsk_init(mpcb, req, skb); + + return 0; +} + +/* Similar to tcp_request_sock_ops */ +struct request_sock_ops mptcp_request_sock_ops __read_mostly = { + .family = PF_INET, + .obj_size = sizeof(struct mptcp_request_sock), + .rtx_syn_ack = tcp_rtx_synack, + .send_ack = tcp_v4_reqsk_send_ack, + .destructor = mptcp_v4_reqsk_destructor, + .send_reset = tcp_v4_send_reset, + .syn_ack_timeout = tcp_syn_ack_timeout, +}; + +/* Similar to: tcp_v4_conn_request + * May be called without holding the meta-level lock + */ +static int mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb) +{ + return tcp_conn_request(&mptcp_request_sock_ops, + &mptcp_join_request_sock_ipv4_ops, + meta_sk, skb); +} + +/* Similar to: tcp_v4_do_rcv + * We only process join requests here. 
(either the SYN or the final ACK)
+ */
+int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
+ struct sock *child, *rsk = NULL, *sk;
+ int ret;
+
+ sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
+ iph->saddr, th->source, iph->daddr,
+ th->dest, inet_iif(skb));
+
+ if (!sk)
+ goto new_subflow;
+
+ if (is_meta_sk(sk)) {
+ WARN("%s Did not find a sub-sk - found the meta instead!\n", __func__);
+ sock_put(sk);
+ goto discard;
+ }
+
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ inet_twsk_put(inet_twsk(sk));
+ goto discard;
+ }
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+ bool req_stolen;
+
+ if (!mptcp_can_new_subflow(meta_sk))
+ goto reset_and_discard;
+
+ local_bh_disable();
+ child = tcp_check_req(meta_sk, skb, req, false, &req_stolen);
+ if (!child) {
+ reqsk_put(req);
+ local_bh_enable();
+ goto discard;
+ }
+
+ if (child != meta_sk) {
+ ret = mptcp_finish_handshake(child, skb);
+ if (ret) {
+ rsk = child;
+ local_bh_enable();
+ goto reset_and_discard;
+ }
+
+ bh_unlock_sock(meta_sk);
+ local_bh_enable();
+ return 0;
+ }
+
+ /* tcp_check_req failed */
+ reqsk_put(req);
+
+ local_bh_enable();
+ goto discard;
+ }
+
+ ret = tcp_v4_do_rcv(sk, skb);
+ sock_put(sk);
+
+ return ret;
+
+new_subflow:
+ if (!mptcp_can_new_subflow(meta_sk))
+ goto reset_and_discard;
+
+ child = tcp_v4_cookie_check(meta_sk, skb);
+ if (!child)
+ goto discard;
+
+ if (child != meta_sk) {
+ ret = mptcp_finish_handshake(child, skb);
+ if (ret) {
+ rsk = child;
+ goto reset_and_discard;
+ }
+ }
+
+ if (tcp_hdr(skb)->syn) {
+ local_bh_disable();
+ mptcp_v4_join_request(meta_sk, skb);
+ local_bh_enable();
+ }
+
+discard:
+ kfree_skb(skb);
+ return 0;
+
+reset_and_discard:
+ tcp_v4_send_reset(rsk, skb);
+ goto discard;
+}
+
+/* Create a new IPv4 subflow.
+ *
+ * We are in user-context and meta-sock-lock is held.
+ */ +int __mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc, + __be16 sport, struct mptcp_rem4 *rem, + struct sock **subsk) +{ + struct tcp_sock *tp; + struct sock *sk; + struct sockaddr_in loc_in, rem_in; + struct socket_alloc sock_full; + struct socket *sock = (struct socket *)&sock_full; + int ret; + + /** First, create and prepare the new socket */ + memcpy(&sock_full, meta_sk->sk_socket, sizeof(sock_full)); + sock->state = SS_UNCONNECTED; + sock->ops = NULL; + + ret = inet_create(sock_net(meta_sk), sock, IPPROTO_TCP, 1); + if (unlikely(ret < 0)) { + net_err_ratelimited("%s inet_create failed ret: %d\n", + __func__, ret); + return ret; + } + + sk = sock->sk; + tp = tcp_sk(sk); + + /* All subsockets need the MPTCP-lock-class */ + lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, meta_slock_key_name); + lockdep_init_map(&(sk)->sk_lock.dep_map, meta_key_name, &meta_key, 0); + + ret = mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL); + if (ret) { + net_err_ratelimited("%s mptcp_add_sock failed ret: %d\n", + __func__, ret); + goto error; + } + + tp->mptcp->slave_sk = 1; + tp->mptcp->low_prio = loc->low_prio; + + /* Initializing the timer for an MPTCP subflow */ + timer_setup(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, 0); + + /** Then, connect the socket to the peer */ + loc_in.sin_family = AF_INET; + rem_in.sin_family = AF_INET; + loc_in.sin_port = sport; + if (rem->port) + rem_in.sin_port = rem->port; + else + rem_in.sin_port = inet_sk(meta_sk)->inet_dport; + loc_in.sin_addr = loc->addr; + rem_in.sin_addr = rem->addr; + + if (loc->if_idx) + sk->sk_bound_dev_if = loc->if_idx; + + ret = kernel_bind(sock, (struct sockaddr *)&loc_in, + sizeof(struct sockaddr_in)); + if (ret < 0) { + net_err_ratelimited("%s: token %#x bind() to %pI4 index %d failed, error %d\n", + __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, + &loc_in.sin_addr, loc->if_idx, ret); + goto error; + } + + mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d ifidx: %d\n", + __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, + tp->mptcp->path_index, &loc_in.sin_addr, + ntohs(loc_in.sin_port), &rem_in.sin_addr, + ntohs(rem_in.sin_port), loc->if_idx); + + if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4) + tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr); + + ret = kernel_connect(sock, (struct sockaddr *)&rem_in, + sizeof(struct sockaddr_in), O_NONBLOCK); + if (ret < 0 && ret != -EINPROGRESS) { + net_err_ratelimited("%s: MPTCP subsocket connect() failed, error %d\n", + __func__, ret); + goto error; + } + + MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINSYNTX); + + sk_set_socket(sk, meta_sk->sk_socket); + sk->sk_wq = meta_sk->sk_wq; + + if (subsk) + *subsk = sk; + + return 0; + +error: + /* May happen if mptcp_add_sock fails first */ + if (!mptcp(tp)) { + tcp_close(sk, 0); + } else { + local_bh_disable(); + mptcp_sub_force_close(sk); + local_bh_enable(); + } + return ret; +} +EXPORT_SYMBOL(__mptcp_init4_subsockets); + +const struct inet_connection_sock_af_ops mptcp_v4_specific = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, + .conn_request = mptcp_conn_request, + .syn_recv_sock = tcp_v4_syn_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .addr2sockaddr = inet_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in), +#ifdef CONFIG_COMPAT + 
.compat_setsockopt = compat_ip_setsockopt, + .compat_getsockopt = compat_ip_getsockopt, +#endif + .mtu_reduced = tcp_v4_mtu_reduced, +}; + +struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops; +struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops; + +/* General initialization of IPv4 for MPTCP */ +int mptcp_pm_v4_init(void) +{ + int ret = 0; + struct request_sock_ops *ops = &mptcp_request_sock_ops; + + mptcp_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; + mptcp_request_sock_ipv4_ops.init_req = mptcp_v4_init_req; +#ifdef CONFIG_SYN_COOKIES + mptcp_request_sock_ipv4_ops.cookie_init_seq = mptcp_v4_cookie_init_seq; +#endif + mptcp_join_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; + mptcp_join_request_sock_ipv4_ops.init_req = mptcp_v4_join_init_req; + + ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP"); + if (ops->slab_name == NULL) { + ret = -ENOMEM; + goto out; + } + + ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0, + SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN, + NULL); + + if (ops->slab == NULL) { + ret = -ENOMEM; + goto err_reqsk_create; + } + +out: + return ret; + +err_reqsk_create: + kfree(ops->slab_name); + ops->slab_name = NULL; + goto out; +} + +void mptcp_pm_v4_undo(void) +{ + kmem_cache_destroy(mptcp_request_sock_ops.slab); + kfree(mptcp_request_sock_ops.slab_name); +} diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c new file mode 100644 index 000000000000..aa07bf1603cb --- /dev/null +++ b/net/mptcp/mptcp_ipv6.c @@ -0,0 +1,476 @@ +/* + * MPTCP implementation - IPv6-specific functions + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer: + * Jaakko Korkeaniemi + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport) +{ + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + u32 seed; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .seed = mptcp_seed++, + .sport = sport, + .dport = dport + }; + + return siphash(&combined, offsetofend(typeof(combined), dport), + &mptcp_secret); +} + +u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport, u32 seed) +{ + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + u32 seed; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .seed = seed, + .sport = sport, + .dport = dport + }; + + return siphash(&combined, offsetofend(typeof(combined), dport), + &mptcp_secret); +} + +static void mptcp_v6_reqsk_destructor(struct request_sock *req) +{ + mptcp_reqsk_destructor(req); + + tcp_v6_reqsk_destructor(req); +} + +static int mptcp_v6_init_req(struct request_sock *req, const struct sock *sk, + struct sk_buff *skb, bool want_cookie) +{ + tcp_request_sock_ipv6_ops.init_req(req, sk, skb, want_cookie); + + mptcp_rsk(req)->hash_entry.pprev = NULL; + mptcp_rsk(req)->is_sub = 0; + inet_rsk(req)->mptcp_rqsk = 1; + + /* In case of SYN-cookies, we wait for the isn to be generated - it is + * input to the key-generation. + */ + if (!want_cookie) + mptcp_reqsk_init(req, sk, skb, false); + + return 0; +} + +#ifdef CONFIG_SYN_COOKIES +static u32 mptcp_v6_cookie_init_seq(struct request_sock *req, const struct sock *sk, + const struct sk_buff *skb, __u16 *mssp) +{ + __u32 isn = cookie_v6_init_sequence(req, sk, skb, mssp); + + tcp_rsk(req)->snt_isn = isn; + + mptcp_reqsk_init(req, sk, skb, true); + + return isn; +} +#endif + +/* May be called without holding the meta-level lock */ +static int mptcp_v6_join_init_req(struct request_sock *req, const struct sock *meta_sk, + struct sk_buff *skb, bool want_cookie) +{ + struct mptcp_request_sock *mtreq = mptcp_rsk(req); + const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + union inet_addr addr; + int loc_id; + bool low_prio = false; + + /* We need to do this as early as possible. Because, if we fail later + * (e.g., get_local_id), then reqsk_free tries to remove the + * request-socket from the htb in mptcp_hash_request_remove as pprev + * may be different from NULL. 
+ */
+ mtreq->hash_entry.pprev = NULL;
+
+ tcp_request_sock_ipv6_ops.init_req(req, meta_sk, skb, want_cookie);
+
+ mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->saddr.s6_addr32,
+ ipv6_hdr(skb)->daddr.s6_addr32,
+ tcp_hdr(skb)->source,
+ tcp_hdr(skb)->dest);
+ addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
+ loc_id = mpcb->pm_ops->get_local_id(meta_sk, AF_INET6, &addr, &low_prio);
+ if (loc_id == -1)
+ return -1;
+ mtreq->loc_id = loc_id;
+ mtreq->low_prio = low_prio;
+
+ mptcp_join_reqsk_init(mpcb, req, skb);
+
+ return 0;
+}
+
+/* Similar to tcp6_request_sock_ops */
+struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
+ .family = AF_INET6,
+ .obj_size = sizeof(struct mptcp_request_sock),
+ .rtx_syn_ack = tcp_rtx_synack,
+ .send_ack = tcp_v6_reqsk_send_ack,
+ .destructor = mptcp_v6_reqsk_destructor,
+ .send_reset = tcp_v6_send_reset,
+ .syn_ack_timeout = tcp_syn_ack_timeout,
+};
+
+/* Similar to: tcp_v6_conn_request
+ * May be called without holding the meta-level lock
+ */
+static int mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
+{
+ return tcp_conn_request(&mptcp6_request_sock_ops,
+ &mptcp_join_request_sock_ipv6_ops,
+ meta_sk, skb);
+}
+
+int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct sock *child, *rsk = NULL, *sk;
+ int ret;
+
+ sk = __inet6_lookup_established(sock_net(meta_sk),
+ &tcp_hashinfo,
+ &ip6h->saddr, th->source,
+ &ip6h->daddr, ntohs(th->dest),
+ tcp_v6_iif(skb), tcp_v6_sdif(skb));
+
+ if (!sk)
+ goto new_subflow;
+
+ if (is_meta_sk(sk)) {
+ WARN("%s Did not find a sub-sk - found the meta instead!\n", __func__);
+ sock_put(sk);
+ goto discard;
+ }
+
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ inet_twsk_put(inet_twsk(sk));
+ goto discard;
+ }
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+ bool req_stolen;
+
+ if (!mptcp_can_new_subflow(meta_sk))
+ goto reset_and_discard;
+
+ local_bh_disable();
+ child = tcp_check_req(meta_sk, skb, req, false, &req_stolen);
+ if (!child) {
+ reqsk_put(req);
+ local_bh_enable();
+ goto discard;
+ }
+
+ if (child != meta_sk) {
+ ret = mptcp_finish_handshake(child, skb);
+ if (ret) {
+ rsk = child;
+ local_bh_enable();
+ goto reset_and_discard;
+ }
+
+ bh_unlock_sock(meta_sk);
+ local_bh_enable();
+ return 0;
+ }
+
+ /* tcp_check_req failed */
+ reqsk_put(req);
+
+ local_bh_enable();
+ goto discard;
+ }
+
+ ret = tcp_v6_do_rcv(sk, skb);
+ sock_put(sk);
+
+ return ret;
+
+new_subflow:
+ if (!mptcp_can_new_subflow(meta_sk))
+ goto reset_and_discard;
+
+ child = tcp_v6_cookie_check(meta_sk, skb);
+ if (!child)
+ goto discard;
+
+ if (child != meta_sk) {
+ ret = mptcp_finish_handshake(child, skb);
+ if (ret) {
+ rsk = child;
+ goto reset_and_discard;
+ }
+ }
+
+ if (tcp_hdr(skb)->syn) {
+ local_bh_disable();
+ mptcp_v6_join_request(meta_sk, skb);
+ local_bh_enable();
+ }
+
+discard:
+ kfree_skb(skb);
+ return 0;
+
+reset_and_discard:
+ tcp_v6_send_reset(rsk, skb);
+ goto discard;
+}
+
+/* Create a new IPv6 subflow.
+ *
+ * We are in user-context and meta-sock-lock is held.
+ */ +int __mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc, + __be16 sport, struct mptcp_rem6 *rem, + struct sock **subsk) +{ + struct tcp_sock *tp; + struct sock *sk; + struct sockaddr_in6 loc_in, rem_in; + struct socket_alloc sock_full; + struct socket *sock = (struct socket *)&sock_full; + int ret; + + /** First, create and prepare the new socket */ + memcpy(&sock_full, meta_sk->sk_socket, sizeof(sock_full)); + sock->state = SS_UNCONNECTED; + sock->ops = NULL; + + ret = inet6_create(sock_net(meta_sk), sock, IPPROTO_TCP, 1); + if (unlikely(ret < 0)) { + net_err_ratelimited("%s inet6_create failed ret: %d\n", + __func__, ret); + return ret; + } + + sk = sock->sk; + tp = tcp_sk(sk); + + /* All subsockets need the MPTCP-lock-class */ + lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, meta_slock_key_name); + lockdep_init_map(&(sk)->sk_lock.dep_map, meta_key_name, &meta_key, 0); + + ret = mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL); + if (ret) { + net_err_ratelimited("%s mptcp_add_sock failed ret: %d\n", + __func__, ret); + goto error; + } + + tp->mptcp->slave_sk = 1; + tp->mptcp->low_prio = loc->low_prio; + + /* Initializing the timer for an MPTCP subflow */ + timer_setup(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, 0); + + /** Then, connect the socket to the peer */ + loc_in.sin6_family = AF_INET6; + rem_in.sin6_family = AF_INET6; + loc_in.sin6_port = sport; + if (rem->port) + rem_in.sin6_port = rem->port; + else + rem_in.sin6_port = inet_sk(meta_sk)->inet_dport; + loc_in.sin6_addr = loc->addr; + rem_in.sin6_addr = rem->addr; + + if (loc->if_idx) + sk->sk_bound_dev_if = loc->if_idx; + + ret = kernel_bind(sock, (struct sockaddr *)&loc_in, + sizeof(struct sockaddr_in6)); + if (ret < 0) { + net_err_ratelimited("%s: token %#x bind() to %pI6 index %d failed, error %d\n", + __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, + &loc_in.sin6_addr, loc->if_idx, ret); + goto error; + } + + mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d ifidx: %u\n", + __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, + tp->mptcp->path_index, &loc_in.sin6_addr, + ntohs(loc_in.sin6_port), &rem_in.sin6_addr, + ntohs(rem_in.sin6_port), loc->if_idx); + + if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6) + tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr); + + ret = kernel_connect(sock, (struct sockaddr *)&rem_in, + sizeof(struct sockaddr_in6), O_NONBLOCK); + if (ret < 0 && ret != -EINPROGRESS) { + net_err_ratelimited("%s: MPTCP subsocket connect() failed, error %d\n", + __func__, ret); + goto error; + } + + MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINSYNTX); + + sk_set_socket(sk, meta_sk->sk_socket); + sk->sk_wq = meta_sk->sk_wq; + + if (subsk) + *subsk = sk; + + return 0; + +error: + /* May happen if mptcp_add_sock fails first */ + if (!mptcp(tp)) { + tcp_close(sk, 0); + } else { + local_bh_disable(); + mptcp_sub_force_close(sk); + local_bh_enable(); + } + return ret; +} +EXPORT_SYMBOL(__mptcp_init6_subsockets); + +const struct inet_connection_sock_af_ops mptcp_v6_specific = { + .queue_xmit = inet6_csk_xmit, + .send_check = tcp_v6_send_check, + .rebuild_header = inet6_sk_rebuild_header, + .sk_rx_dst_set = inet6_sk_rx_dst_set, + .conn_request = mptcp_conn_request, + .syn_recv_sock = tcp_v6_syn_recv_sock, + .net_header_len = sizeof(struct ipv6hdr), + .net_frag_header_len = sizeof(struct frag_hdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + 
.sockaddr_len = sizeof(struct sockaddr_in6), +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ipv6_setsockopt, + .compat_getsockopt = compat_ipv6_getsockopt, +#endif + .mtu_reduced = tcp_v6_mtu_reduced, +}; + +const struct inet_connection_sock_af_ops mptcp_v6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, + .conn_request = mptcp_conn_request, + .syn_recv_sock = tcp_v6_syn_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6), +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ipv6_setsockopt, + .compat_getsockopt = compat_ipv6_getsockopt, +#endif + .mtu_reduced = tcp_v4_mtu_reduced, +}; + +struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops; +struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops; + +int mptcp_pm_v6_init(void) +{ + int ret = 0; + struct request_sock_ops *ops = &mptcp6_request_sock_ops; + + mptcp_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; + mptcp_request_sock_ipv6_ops.init_req = mptcp_v6_init_req; +#ifdef CONFIG_SYN_COOKIES + mptcp_request_sock_ipv6_ops.cookie_init_seq = mptcp_v6_cookie_init_seq; +#endif + + mptcp_join_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; + mptcp_join_request_sock_ipv6_ops.init_req = mptcp_v6_join_init_req; + + ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6"); + if (ops->slab_name == NULL) { + ret = -ENOMEM; + goto out; + } + + ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0, + SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN, + NULL); + + if (ops->slab == NULL) { + ret = -ENOMEM; + goto err_reqsk_create; + } + +out: + return ret; + +err_reqsk_create: + kfree(ops->slab_name); + ops->slab_name = NULL; + goto out; +} + +void mptcp_pm_v6_undo(void) +{ + kmem_cache_destroy(mptcp6_request_sock_ops.slab); + kfree(mptcp6_request_sock_ops.slab_name); +} diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c new file mode 100644 index 000000000000..cf019990447c --- /dev/null +++ b/net/mptcp/mptcp_ndiffports.c @@ -0,0 +1,174 @@ +#include + +#include +#include + +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif + +struct ndiffports_priv { + /* Worker struct for subflow establishment */ + struct work_struct subflow_work; + + struct mptcp_cb *mpcb; +}; + +static int num_subflows __read_mostly = 2; +module_param(num_subflows, int, 0644); +MODULE_PARM_DESC(num_subflows, "choose the number of subflows per MPTCP connection"); + +/** + * Create all new subflows, by doing calls to mptcp_initX_subsockets + * + * This function uses a goto next_subflow, to allow releasing the lock between + * new subflows and giving other processes a chance to do some work on the + * socket and potentially finishing the communication. 
+ **/ +static void create_subflow_worker(struct work_struct *work) +{ + const struct ndiffports_priv *pm_priv = container_of(work, + struct ndiffports_priv, + subflow_work); + struct mptcp_cb *mpcb = pm_priv->mpcb; + struct sock *meta_sk = mpcb->meta_sk; + int iter = 0; + +next_subflow: + if (iter) { + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + + cond_resched(); + } + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + if (!mptcp(tcp_sk(meta_sk))) + goto exit; + + iter++; + + if (sock_flag(meta_sk, SOCK_DEAD)) + goto exit; + + if (mpcb->master_sk && + !tcp_sk(mpcb->master_sk)->mptcp->fully_established) + goto exit; + + if (num_subflows > iter && num_subflows > mptcp_subflow_count(mpcb)) { + if (meta_sk->sk_family == AF_INET || + mptcp_v6_is_v4_mapped(meta_sk)) { + struct mptcp_loc4 loc; + struct mptcp_rem4 rem; + + loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr; + loc.loc4_id = 0; + loc.low_prio = 0; + if (mpcb->master_sk) + loc.if_idx = mpcb->master_sk->sk_bound_dev_if; + else + loc.if_idx = 0; + + rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr; + rem.port = inet_sk(meta_sk)->inet_dport; + rem.rem4_id = 0; /* Default 0 */ + + mptcp_init4_subsockets(meta_sk, &loc, &rem); + } else { +#if IS_ENABLED(CONFIG_IPV6) + struct mptcp_loc6 loc; + struct mptcp_rem6 rem; + + loc.addr = inet6_sk(meta_sk)->saddr; + loc.loc6_id = 0; + loc.low_prio = 0; + if (mpcb->master_sk) + loc.if_idx = mpcb->master_sk->sk_bound_dev_if; + else + loc.if_idx = 0; + + rem.addr = meta_sk->sk_v6_daddr; + rem.port = inet_sk(meta_sk)->inet_dport; + rem.rem6_id = 0; /* Default 0 */ + + mptcp_init6_subsockets(meta_sk, &loc, &rem); +#endif + } + goto next_subflow; + } + +exit: + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + mptcp_mpcb_put(mpcb); + sock_put(meta_sk); +} + +static void ndiffports_new_session(const struct sock *meta_sk) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0]; + + /* Initialize workqueue-struct */ + INIT_WORK(&fmp->subflow_work, create_subflow_worker); + fmp->mpcb = mpcb; +} + +static void ndiffports_create_subflows(struct sock *meta_sk) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0]; + + if (mptcp_in_infinite_mapping_weak(mpcb) || + mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD)) + return; + + if (!work_pending(&pm_priv->subflow_work)) { + sock_hold(meta_sk); + refcount_inc(&mpcb->mpcb_refcnt); + queue_work(mptcp_wq, &pm_priv->subflow_work); + } +} + +static int ndiffports_get_local_id(const struct sock *meta_sk, + sa_family_t family, union inet_addr *addr, + bool *low_prio) +{ + return 0; +} + +static struct mptcp_pm_ops ndiffports __read_mostly = { + .new_session = ndiffports_new_session, + .fully_established = ndiffports_create_subflows, + .get_local_id = ndiffports_get_local_id, + .name = "ndiffports", + .owner = THIS_MODULE, +}; + +/* General initialization of MPTCP_PM */ +static int __init ndiffports_register(void) +{ + BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE); + + if (mptcp_register_path_manager(&ndiffports)) + goto exit; + + return 0; + +exit: + return -1; +} + +static void ndiffports_unregister(void) +{ + mptcp_unregister_path_manager(&ndiffports); +} + +module_init(ndiffports_register); +module_exit(ndiffports_unregister); + +MODULE_AUTHOR("Christoph Paasch"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("NDIFF-PORTS MPTCP"); 
+MODULE_VERSION("0.88"); diff --git a/net/mptcp/mptcp_netlink.c b/net/mptcp/mptcp_netlink.c new file mode 100644 index 000000000000..a2de5c6fb9a7 --- /dev/null +++ b/net/mptcp/mptcp_netlink.c @@ -0,0 +1,1277 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP implementation - Netlink Path Manager + * + * Analysis, Design and Implementation: + * - Gregory Detal + * - Sébastien Barré + * - Matthieu Baerts + * - Pau Espin Pedrol + * - Detlev Casanova + * - David Verbeiren + * - Frank Vanbever + * - Antoine Maes + * - Tim Froidcoeur + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif + +#define MPTCP_MAX_ADDR 8 + +struct mptcp_nl_priv { + /* Unfortunately we need to store this to generate MP_JOINs in case + * of the peer generating a subflow (see get_local_id). + */ + u8 loc4_bits; + u8 announced4; + struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR]; + +#if IS_ENABLED(CONFIG_IPV6) + u8 loc6_bits; + u8 announced6; + struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR]; +#endif + + u16 remove_addrs; + + bool is_closed; +}; + +static struct genl_family mptcp_genl_family; + +#define MPTCP_GENL_EV_GRP_OFFSET 0 +#define MPTCP_GENL_CMD_GRP_OFFSET 1 + +static const struct genl_multicast_group mptcp_mcgrps[] = { + [MPTCP_GENL_EV_GRP_OFFSET] = { .name = MPTCP_GENL_EV_GRP_NAME, }, + [MPTCP_GENL_CMD_GRP_OFFSET] = { .name = MPTCP_GENL_CMD_GRP_NAME, }, +}; + +static const struct nla_policy mptcp_nl_genl_policy[MPTCP_ATTR_MAX + 1] = { + [MPTCP_ATTR_TOKEN] = { .type = NLA_U32, }, + [MPTCP_ATTR_FAMILY] = { .type = NLA_U16, }, + [MPTCP_ATTR_LOC_ID] = { .type = NLA_U8, }, + [MPTCP_ATTR_REM_ID] = { .type = NLA_U8, }, + [MPTCP_ATTR_SADDR4] = { .type = NLA_U32, }, + [MPTCP_ATTR_SADDR6] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr), }, + [MPTCP_ATTR_DADDR4] = { .type = NLA_U32, }, + [MPTCP_ATTR_DADDR6] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr), }, + [MPTCP_ATTR_SPORT] = { .type = NLA_U16, }, + [MPTCP_ATTR_DPORT] = { .type = NLA_U16, }, + [MPTCP_ATTR_BACKUP] = { .type = NLA_U8, }, + [MPTCP_ATTR_TIMEOUT] = { .type = NLA_U32, }, + [MPTCP_ATTR_IF_IDX] = { .type = NLA_S32, }, +}; + +/* Defines the userspace PM filter on events. Set events are ignored. */ +static u16 mptcp_nl_event_filter; + +static inline struct mptcp_nl_priv * +mptcp_nl_priv(const struct sock *meta_sk) +{ + return (struct mptcp_nl_priv *)&tcp_sk(meta_sk)->mpcb->mptcp_pm[0]; +} + +static inline bool +mptcp_nl_must_notify(u16 event, const struct sock *meta_sk) +{ + struct mptcp_nl_priv *priv = mptcp_nl_priv(meta_sk); + + /* close_session() can be called before other events because it is + * also called when doing a fallback to TCP. We don't want to send + * events to the user-space after having sent the CLOSED event. + */ + if (priv->is_closed) + return false; + + if (event == MPTCPF_EVENT_CLOSED) + priv->is_closed = true; + + if (mptcp_nl_event_filter & event) + return false; + + if (!genl_has_listeners(&mptcp_genl_family, sock_net(meta_sk), 0)) + return false; + + return true; +} + +/* Find the first free index in the bitfield starting from 0 */ +static int +mptcp_nl_find_free_index(u8 bitfield) +{ + int i; + + /* There are anyways no free bits... 
*/ + if (bitfield == 0xff) + return -1; + + i = ffs(~bitfield) - 1; + if (i < 0) + return -1; + + return i; +} + +static inline int +mptcp_nl_put_subsk(struct sk_buff *msg, struct sock *sk) +{ + struct inet_sock *isk = inet_sk(sk); + struct sock *meta_sk = mptcp_meta_sk(sk); + u8 backup; + u8 sk_err; + + if (nla_put_u16(msg, MPTCP_ATTR_FAMILY, sk->sk_family)) + goto nla_put_failure; + + if (nla_put_u8(msg, MPTCP_ATTR_LOC_ID, tcp_sk(sk)->mptcp->loc_id)) + goto nla_put_failure; + + if (nla_put_u8(msg, MPTCP_ATTR_REM_ID, tcp_sk(sk)->mptcp->rem_id)) + goto nla_put_failure; + + switch (sk->sk_family) { + case AF_INET: + if (nla_put_u32(msg, MPTCP_ATTR_SADDR4, isk->inet_saddr)) + goto nla_put_failure; + + if (nla_put_u32(msg, MPTCP_ATTR_DADDR4, isk->inet_daddr)) + goto nla_put_failure; + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (nla_put(msg, MPTCP_ATTR_SADDR6, sizeof(np->saddr), + &np->saddr)) + goto nla_put_failure; + + if (nla_put(msg, MPTCP_ATTR_DADDR6, sizeof(sk->sk_v6_daddr), + &sk->sk_v6_daddr)) + goto nla_put_failure; + break; + } +#endif + default: + goto nla_put_failure; + } + + if (nla_put_u16(msg, MPTCP_ATTR_SPORT, ntohs(isk->inet_sport))) + goto nla_put_failure; + + if (nla_put_u16(msg, MPTCP_ATTR_DPORT, ntohs(isk->inet_dport))) + goto nla_put_failure; + + backup = !!(tcp_sk(sk)->mptcp->rcv_low_prio || + tcp_sk(sk)->mptcp->low_prio); + + if (nla_put_u8(msg, MPTCP_ATTR_BACKUP, backup)) + goto nla_put_failure; + + if (nla_put_s32(msg, MPTCP_ATTR_IF_IDX, sk->sk_bound_dev_if)) + goto nla_put_failure; + + sk_err = sk->sk_err ? : tcp_sk(sk)->mptcp->sk_err; + if (unlikely(sk_err != 0) && meta_sk->sk_state == TCP_ESTABLISHED && + nla_put_u8(msg, MPTCP_ATTR_ERROR, sk_err)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static inline struct sk_buff * +mptcp_nl_mcast_prepare(struct mptcp_cb *mpcb, struct sock *sk, int cmd, + void **hdr) +{ + struct sk_buff *msg; + + /* possible optimisation: use the needed size */ + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return NULL; + + *hdr = genlmsg_put(msg, 0, 0, &mptcp_genl_family, 0, cmd); + if (!*hdr) + goto free_msg; + + if (nla_put_u32(msg, MPTCP_ATTR_TOKEN, mpcb->mptcp_loc_token)) + goto nla_put_failure; + + if (sk && mptcp_nl_put_subsk(msg, sk)) + goto nla_put_failure; + + return msg; + +nla_put_failure: + genlmsg_cancel(msg, *hdr); +free_msg: + nlmsg_free(msg); + return NULL; +} + +static inline int +mptcp_nl_mcast_send(struct mptcp_cb *mpcb, struct sk_buff *msg, void *hdr) +{ + int ret; + struct sock *meta_sk = mpcb->meta_sk; + + genlmsg_end(msg, hdr); + + ret = genlmsg_multicast_netns(&mptcp_genl_family, sock_net(meta_sk), + msg, 0, MPTCP_GENL_EV_GRP_OFFSET, + GFP_ATOMIC); + if (ret && ret != -ESRCH) + pr_err("%s: genlmsg_multicast failed with %d\n", __func__, ret); + return ret; +} + +static inline void +mptcp_nl_mcast(struct mptcp_cb *mpcb, struct sock *sk, int cmd) +{ + void *hdr; + struct sk_buff *msg; + + msg = mptcp_nl_mcast_prepare(mpcb, sk, cmd, &hdr); + if (msg) + mptcp_nl_mcast_send(mpcb, msg, hdr); + else + pr_warn("%s: unable to prepare multicast message\n", __func__); +} + +static inline void +mptcp_nl_mcast_fail(struct sk_buff *msg, void *hdr) +{ + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +static void +mptcp_nl_new(const struct sock *meta_sk, bool established) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + + mptcp_nl_mcast(mpcb, mpcb->master_sk, + established ? 
MPTCP_EVENT_ESTABLISHED + : MPTCP_EVENT_CREATED); +} + +static void +mptcp_nl_pm_new_session(const struct sock *meta_sk) +{ + if (!mptcp_nl_must_notify(MPTCPF_EVENT_CREATED, meta_sk)) + return; + + mptcp_nl_new(meta_sk, false); +} + +static inline int +mptcp_nl_loc_id_to_index_lookup(struct sock *meta_sk, sa_family_t family, + u8 addr_id) +{ + struct mptcp_nl_priv *priv = mptcp_nl_priv(meta_sk); + int i; + + switch (family) { + case AF_INET: + mptcp_for_each_bit_set(priv->loc4_bits, i) { + if (priv->locaddr4[i].loc4_id == addr_id) + return i; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + mptcp_for_each_bit_set(priv->loc6_bits, i) { + if (priv->locaddr6[i].loc6_id == addr_id) + return i; + } + break; +#endif + } + return -1; +} + +static inline void +mptcp_nl_sk_setup_locaddr(struct sock *meta_sk, struct sock *sk) +{ + struct mptcp_nl_priv *priv = mptcp_nl_priv(meta_sk); + bool backup = !!(tcp_sk(sk)->mptcp->rcv_low_prio || + tcp_sk(sk)->mptcp->low_prio); + sa_family_t family = mptcp_v6_is_v4_mapped(sk) ? AF_INET + : sk->sk_family; + u8 addr_id = tcp_sk(sk)->mptcp->loc_id; + int idx = mptcp_nl_loc_id_to_index_lookup(meta_sk, family, + addr_id); + + /* Same as in mptcp_fullmesh.c: exception for transparent sockets */ + int if_idx = inet_sk(sk)->transparent ? inet_sk(sk)->rx_dst_ifindex : + sk->sk_bound_dev_if; + + switch (family) { + case AF_INET: { + struct inet_sock *isk = inet_sk(sk); + + if (idx == -1) + idx = mptcp_nl_find_free_index(priv->loc4_bits); + if (idx == -1) { + pr_warn("No free index for sk loc_id v4\n"); + return; + } + priv->locaddr4[idx].addr.s_addr = isk->inet_saddr; + priv->locaddr4[idx].loc4_id = addr_id; + priv->locaddr4[idx].low_prio = backup; + priv->locaddr4[idx].if_idx = if_idx; + priv->loc4_bits |= 1 << idx; + priv->announced4 |= 1 << idx; + break; + } +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (idx == -1) + idx = mptcp_nl_find_free_index(priv->loc6_bits); + if (idx == -1) { + pr_warn("No free index for sk loc_id v6\n"); + return; + } + priv->locaddr6[idx].addr = np->saddr; + priv->locaddr6[idx].loc6_id = addr_id; + priv->locaddr6[idx].low_prio = backup; + priv->locaddr6[idx].if_idx = if_idx; + priv->loc6_bits |= 1 << idx; + priv->announced6 |= 1 << idx; + break; + } +#endif + } +} + +static void +mptcp_nl_pm_fully_established(struct sock *meta_sk) +{ + mptcp_nl_sk_setup_locaddr(meta_sk, tcp_sk(meta_sk)->mpcb->master_sk); + + if (!mptcp_nl_must_notify(MPTCPF_EVENT_ESTABLISHED, meta_sk)) + return; + + mptcp_nl_new(meta_sk, true); +} + +static void +mptcp_nl_pm_close_session(struct sock *meta_sk) +{ + if (!mptcp_nl_must_notify(MPTCPF_EVENT_CLOSED, meta_sk)) + return; + + mptcp_nl_mcast(tcp_sk(meta_sk)->mpcb, NULL, MPTCP_EVENT_CLOSED); +} + +static void +mptcp_nl_pm_established_subflow(struct sock *sk) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + + mptcp_nl_sk_setup_locaddr(meta_sk, sk); + + if (!mptcp_nl_must_notify(MPTCPF_EVENT_SUB_ESTABLISHED, meta_sk)) + return; + + mptcp_nl_mcast(tcp_sk(meta_sk)->mpcb, sk, MPTCP_EVENT_SUB_ESTABLISHED); +} + +static void +mptcp_nl_pm_delete_subflow(struct sock *sk) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + + if (!mptcp_nl_must_notify(MPTCPF_EVENT_SUB_CLOSED, meta_sk)) + return; + + mptcp_nl_mcast(tcp_sk(meta_sk)->mpcb, sk, MPTCP_EVENT_SUB_CLOSED); +} + +static void +mptcp_nl_pm_add_raddr(struct mptcp_cb *mpcb, const union inet_addr *addr, + sa_family_t family, __be16 port, u8 id) +{ + struct sk_buff *msg; + void *hdr; + + if 
(!mptcp_nl_must_notify(MPTCPF_EVENT_ANNOUNCED, mpcb->meta_sk)) + return; + + msg = mptcp_nl_mcast_prepare(mpcb, NULL, MPTCP_EVENT_ANNOUNCED, &hdr); + if (!msg) + return; + + if (nla_put_u8(msg, MPTCP_ATTR_REM_ID, id)) + goto nla_put_failure; + + if (nla_put_u16(msg, MPTCP_ATTR_FAMILY, family)) + goto nla_put_failure; + + switch (family) { + case AF_INET: + if (nla_put_u32(msg, MPTCP_ATTR_DADDR4, addr->ip)) + goto nla_put_failure; + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + if (nla_put(msg, MPTCP_ATTR_DADDR6, sizeof(addr->ip6), + &addr->ip6)) + goto nla_put_failure; + break; +#endif + default: + goto nla_put_failure; + } + + if (nla_put_u16(msg, MPTCP_ATTR_DPORT, ntohs(port))) + goto nla_put_failure; + + mptcp_nl_mcast_send(mpcb, msg, hdr); + + return; + +nla_put_failure: + mptcp_nl_mcast_fail(msg, hdr); +} + +static void +mptcp_nl_pm_rem_raddr(struct mptcp_cb *mpcb, u8 id) +{ + struct sk_buff *msg; + void *hdr; + + if (!mptcp_nl_must_notify(MPTCPF_EVENT_REMOVED, mpcb->meta_sk)) + return; + + msg = mptcp_nl_mcast_prepare(mpcb, NULL, MPTCP_EVENT_REMOVED, &hdr); + + if (!msg) + return; + + if (nla_put_u8(msg, MPTCP_ATTR_REM_ID, id)) + goto nla_put_failure; + + mptcp_nl_mcast_send(mpcb, msg, hdr); + + return; + +nla_put_failure: + mptcp_nl_mcast_fail(msg, hdr); +} + +static int +mptcp_nl_pm_get_local_id(const struct sock *meta_sk, sa_family_t family, + union inet_addr *addr, bool *low_prio) +{ + struct mptcp_nl_priv *priv = mptcp_nl_priv(meta_sk); + int i, id = 0; + + switch (family) { + case AF_INET: + mptcp_for_each_bit_set(priv->loc4_bits, i) { + if (addr->in.s_addr == priv->locaddr4[i].addr.s_addr) { + id = priv->locaddr4[i].loc4_id; + *low_prio = priv->locaddr4[i].low_prio; + goto out; + } + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + mptcp_for_each_bit_set(priv->loc6_bits, i) { + if (ipv6_addr_equal(&addr->in6, + &priv->locaddr6[i].addr)) { + id = priv->locaddr6[i].loc6_id; + *low_prio = priv->locaddr6[i].low_prio; + goto out; + } + } + break; +#endif + } + return -1; + +out: + return id; +} + +static void +mptcp_nl_pm_addr_signal(struct sock *sk, unsigned *size, + struct tcp_out_options *opts, struct sk_buff *skb) +{ + struct mptcp_nl_priv *priv = mptcp_nl_priv(sk); + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; + u8 unannounced; + int remove_addr_len; + + unannounced = (~priv->announced4) & priv->loc4_bits; + if (unannounced && + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) { + int i = mptcp_nl_find_free_index(~unannounced); + + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_ADD_ADDR; + opts->add_addr4.addr_id = priv->locaddr4[i].loc4_id; + opts->add_addr4.addr = priv->locaddr4[i].addr; + opts->add_addr_v4 = 1; + + if (skb) + priv->announced4 |= (1 << i); + *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN; + } + +#if IS_ENABLED(CONFIG_IPV6) + unannounced = (~priv->announced6) & priv->loc6_bits; + if (unannounced && + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) { + int i = mptcp_nl_find_free_index(~unannounced); + + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_ADD_ADDR; + opts->add_addr6.addr_id = priv->locaddr6[i].loc6_id; + opts->add_addr6.addr = priv->locaddr6[i].addr; + opts->add_addr_v6 = 1; + + if (skb) + priv->announced6 |= (1 << i); + *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN; + } +#endif + + if (likely(!priv->remove_addrs)) + goto exit; + + remove_addr_len = mptcp_sub_len_remove_addr_align(priv->remove_addrs); + if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len) + goto exit; + + opts->options |= 
OPTION_MPTCP; + opts->mptcp_options |= OPTION_REMOVE_ADDR; + opts->remove_addrs = priv->remove_addrs; + + if (skb) + priv->remove_addrs = 0; + *size += remove_addr_len; + +exit: + mpcb->addr_signal = !!((~priv->announced4) & priv->loc4_bits || +#if IS_ENABLED(CONFIG_IPV6) + (~priv->announced6) & priv->loc6_bits || +#endif + priv->remove_addrs); +} + +static void +mptcp_nl_pm_prio_changed(struct sock *sk, int low_prio) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + + if (!mptcp_nl_must_notify(MPTCPF_EVENT_SUB_PRIORITY, meta_sk)) + return; + + mptcp_nl_mcast(tcp_sk(meta_sk)->mpcb, sk, MPTCP_EVENT_SUB_PRIORITY); +} + +static int +mptcp_nl_genl_announce(struct sk_buff *skb, struct genl_info *info) +{ + struct sock *meta_sk, *subsk; + struct mptcp_cb *mpcb; + struct mptcp_nl_priv *priv; + u32 token; + u8 addr_id, backup = 0; + u16 family; + int i, ret = 0; + union inet_addr saddr; + int if_idx = 0; + bool useless; /* unused out parameter "low_prio" */ + + if (!info->attrs[MPTCP_ATTR_TOKEN] || !info->attrs[MPTCP_ATTR_FAMILY] || + !info->attrs[MPTCP_ATTR_LOC_ID]) + return -EINVAL; + + token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]); + meta_sk = mptcp_hash_find(genl_info_net(info), token); + if (!meta_sk) + return -EINVAL; + + mpcb = tcp_sk(meta_sk)->mpcb; + priv = mptcp_nl_priv(meta_sk); + family = nla_get_u16(info->attrs[MPTCP_ATTR_FAMILY]); + addr_id = nla_get_u8(info->attrs[MPTCP_ATTR_LOC_ID]); + + if (info->attrs[MPTCP_ATTR_BACKUP]) + backup = nla_get_u8(info->attrs[MPTCP_ATTR_BACKUP]); + + if (info->attrs[MPTCP_ATTR_IF_IDX]) + if_idx = nla_get_s32(info->attrs[MPTCP_ATTR_IF_IDX]); + + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + switch (family) { + case AF_INET: + if (!info->attrs[MPTCP_ATTR_SADDR4]) { + ret = -EINVAL; + goto exit; + } + + saddr.in.s_addr = nla_get_u32(info->attrs[MPTCP_ATTR_SADDR4]); + i = mptcp_nl_pm_get_local_id(meta_sk, family, + &saddr, &useless); + if (i < 0) { + i = mptcp_nl_find_free_index(priv->loc4_bits); + if (i < 0) { + ret = -ENOBUFS; + goto exit; + } + } else if (i != addr_id) { + ret = -EINVAL; + goto exit; + } + + priv->locaddr4[i].addr.s_addr = saddr.in.s_addr; + priv->locaddr4[i].loc4_id = addr_id; + priv->locaddr4[i].low_prio = !!backup; + priv->locaddr4[i].if_idx = if_idx; + priv->loc4_bits |= 1 << i; + priv->announced4 &= ~(1 << i); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + if (!info->attrs[MPTCP_ATTR_SADDR6]) { + ret = -EINVAL; + goto exit; + } + + saddr.in6 = *(struct in6_addr *) + nla_data(info->attrs[MPTCP_ATTR_SADDR6]); + i = mptcp_nl_pm_get_local_id(meta_sk, family, &saddr, &useless); + if (i < 0) { + i = mptcp_nl_find_free_index(priv->loc6_bits); + if (i < 0) { + ret = -ENOBUFS; + goto exit; + } + } else if (i != addr_id) { + ret = -EINVAL; + goto exit; + } + + priv->locaddr6[i].addr = saddr.in6; + priv->locaddr6[i].loc6_id = addr_id; + priv->locaddr6[i].low_prio = !!backup; + priv->locaddr6[i].if_idx = if_idx; + priv->loc6_bits |= 1 << i; + priv->announced6 &= ~(1 << i); + break; +#endif + default: + ret = -EINVAL; + goto exit; + } + + mpcb->addr_signal = 1; + + rcu_read_lock_bh(); + subsk = mptcp_select_ack_sock(meta_sk); + if (subsk) + tcp_send_ack(subsk); + rcu_read_unlock_bh(); + +exit: + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + sock_put(meta_sk); + return ret; +} + +static int +mptcp_nl_genl_remove(struct sk_buff *skb, struct genl_info *info) +{ + struct sock *meta_sk, *subsk; + struct mptcp_cb *mpcb; + struct mptcp_nl_priv *priv; + u32 token; + u8 addr_id; + int 
i; + int retcode; + bool found = false; + + if (!info->attrs[MPTCP_ATTR_TOKEN] || !info->attrs[MPTCP_ATTR_LOC_ID]) + return -EINVAL; + + token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]); + meta_sk = mptcp_hash_find(genl_info_net(info), token); + if (!meta_sk) + return -EINVAL; + + mpcb = tcp_sk(meta_sk)->mpcb; + priv = mptcp_nl_priv(meta_sk); + addr_id = nla_get_u8(info->attrs[MPTCP_ATTR_LOC_ID]); + + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + mptcp_for_each_bit_set(priv->loc4_bits, i) { + if (priv->locaddr4[i].loc4_id == addr_id) { + priv->loc4_bits &= ~(1 << i); + found = true; + break; + } + } + +#if IS_ENABLED(CONFIG_IPV6) + if (!found) { + mptcp_for_each_bit_set(priv->loc6_bits, i) { + if (priv->locaddr6[i].loc6_id == addr_id) { + priv->loc6_bits &= ~(1 << i); + found = true; + break; + } + } + } +#endif + + if (found) { + priv->remove_addrs |= 1 << addr_id; + mpcb->addr_signal = 1; + + rcu_read_lock_bh(); + subsk = mptcp_select_ack_sock(meta_sk); + if (subsk) + tcp_send_ack(subsk); + rcu_read_unlock_bh(); + retcode = 0; + } else { + retcode = -EINVAL; + } + + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + sock_put(meta_sk); + return retcode; +} + +static int +mptcp_nl_genl_create(struct sk_buff *skb, struct genl_info *info) +{ + struct sock *meta_sk, *subsk = NULL; + struct mptcp_cb *mpcb; + struct mptcp_nl_priv *priv; + u32 token; + u16 family, sport; + u8 loc_id, rem_id, backup = 0; + int i, ret = 0; + int if_idx; + + if (!info->attrs[MPTCP_ATTR_TOKEN] || !info->attrs[MPTCP_ATTR_FAMILY] || + !info->attrs[MPTCP_ATTR_LOC_ID] || !info->attrs[MPTCP_ATTR_REM_ID]) + return -EINVAL; + + token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]); + meta_sk = mptcp_hash_find(genl_info_net(info), token); + if (!meta_sk) + /* We use a more specific value than EINVAL here so that + * userspace can handle this specific case easily. This is + * useful to check the case in which userspace tries to create a + * subflow for a connection which was already destroyed recently + * in kernelspace, but userspace didn't have time to realize + * about it because there is a gap of time between kernel + * destroying the connection and userspace receiving the event + * through Netlink. It can easily happen for short life-time + * conns. + */ + return -EBADR; + + mpcb = tcp_sk(meta_sk)->mpcb; + + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + if (sock_flag(meta_sk, SOCK_DEAD)) { + /* Same as for the EBADR case. In this case, though, we know for + * sure the conn owner of the subflow existed at some point (no + * invalid token possibility) + */ + ret = -EOWNERDEAD; + goto unlock; + } + + if (!mptcp_can_new_subflow(meta_sk)) { + /* Same as for the EBADR and EOWNERDEAD case but here, the MPTCP + * session has just been stopped, it is no longer possible to + * create new subflows. + */ + ret = -ENOTCONN; + goto unlock; + } + + if (mpcb->master_sk && + !tcp_sk(mpcb->master_sk)->mptcp->fully_established) { + /* First condition is not only in there for safely purposes, it + * can also be triggered in the same scenario as in EBADR and + * EOWNERDEAD + */ + ret = -EAGAIN; + goto unlock; + } + + priv = mptcp_nl_priv(meta_sk); + + family = nla_get_u16(info->attrs[MPTCP_ATTR_FAMILY]); + loc_id = nla_get_u8(info->attrs[MPTCP_ATTR_LOC_ID]); + rem_id = nla_get_u8(info->attrs[MPTCP_ATTR_REM_ID]); + + sport = info->attrs[MPTCP_ATTR_SPORT] + ? 
htons(nla_get_u16(info->attrs[MPTCP_ATTR_SPORT])) : 0; + backup = info->attrs[MPTCP_ATTR_BACKUP] + ? nla_get_u8(info->attrs[MPTCP_ATTR_BACKUP]) : 0; + if_idx = info->attrs[MPTCP_ATTR_IF_IDX] + ? nla_get_s32(info->attrs[MPTCP_ATTR_IF_IDX]) : 0; + + switch (family) { + case AF_INET: { + struct mptcp_rem4 rem = { + .rem4_id = rem_id, + }; + struct mptcp_loc4 loc = { + .loc4_id = loc_id, + }; + + if (!info->attrs[MPTCP_ATTR_DADDR4] || + !info->attrs[MPTCP_ATTR_DPORT]) { + goto create_failed; + } else { + rem.addr.s_addr = + nla_get_u32(info->attrs[MPTCP_ATTR_DADDR4]); + rem.port = + ntohs(nla_get_u16(info->attrs[MPTCP_ATTR_DPORT])); + } + + if (!info->attrs[MPTCP_ATTR_SADDR4]) { + bool found = false; + + mptcp_for_each_bit_set(priv->loc4_bits, i) { + if (priv->locaddr4[i].loc4_id == loc_id) { + loc.addr = priv->locaddr4[i].addr; + loc.low_prio = + priv->locaddr4[i].low_prio; + loc.if_idx = + priv->locaddr4[i].if_idx; + found = true; + break; + } + } + + if (!found) + goto create_failed; + } else { + loc.addr.s_addr = + nla_get_u32(info->attrs[MPTCP_ATTR_SADDR4]); + loc.low_prio = backup; + loc.if_idx = if_idx; + } + + ret = __mptcp_init4_subsockets(meta_sk, &loc, sport, &rem, + &subsk); + if (ret < 0) + goto unlock; + break; + } +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: { + struct mptcp_rem6 rem = { + .rem6_id = rem_id, + }; + struct mptcp_loc6 loc = { + .loc6_id = loc_id, + }; + + if (!info->attrs[MPTCP_ATTR_DADDR6] || + !info->attrs[MPTCP_ATTR_DPORT]) { + goto create_failed; + } else { + rem.addr = *(struct in6_addr *) + nla_data(info->attrs[MPTCP_ATTR_DADDR6]); + rem.port = + ntohs(nla_get_u16(info->attrs[MPTCP_ATTR_DPORT])); + } + + if (!info->attrs[MPTCP_ATTR_SADDR6]) { + bool found = false; + + mptcp_for_each_bit_set(priv->loc6_bits, i) { + if (priv->locaddr6[i].loc6_id == loc_id) { + loc.addr = priv->locaddr6[i].addr; + loc.low_prio = + priv->locaddr6[i].low_prio; + loc.if_idx = + priv->locaddr6[i].if_idx; + + found = true; + break; + } + } + + if (!found) + goto create_failed; + } else { + loc.addr = *(struct in6_addr *) + nla_data(info->attrs[MPTCP_ATTR_SADDR6]); + loc.low_prio = backup; + loc.if_idx = if_idx; + } + + ret = __mptcp_init6_subsockets(meta_sk, &loc, sport, &rem, + &subsk); + if (ret < 0) + goto unlock; + break; + } +#endif + default: + goto create_failed; + } + +unlock: + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + sock_put(meta_sk); + return ret; + +create_failed: + ret = -EINVAL; + goto unlock; +} + +static struct sock * +mptcp_nl_subsk_lookup(struct mptcp_cb *mpcb, struct nlattr **attrs) +{ + struct sock *sk; + struct mptcp_tcp_sock *mptcp; + struct hlist_node *tmp; + u16 family; + __be16 sport, dport; + + if (!attrs[MPTCP_ATTR_FAMILY] || !attrs[MPTCP_ATTR_SPORT] || + !attrs[MPTCP_ATTR_DPORT]) + goto exit; + + family = nla_get_u16(attrs[MPTCP_ATTR_FAMILY]); + sport = htons(nla_get_u16(attrs[MPTCP_ATTR_SPORT])); + dport = htons(nla_get_u16(attrs[MPTCP_ATTR_DPORT])); + + switch (family) { + case AF_INET: { + __be32 saddr, daddr; + + if (!attrs[MPTCP_ATTR_SADDR4] || !attrs[MPTCP_ATTR_DADDR4]) + break; + + saddr = nla_get_u32(attrs[MPTCP_ATTR_SADDR4]); + daddr = nla_get_u32(attrs[MPTCP_ATTR_DADDR4]); + + mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { + struct sock *subsk = mptcp_to_sock(mptcp); + struct inet_sock *isk = inet_sk(subsk); + + if (subsk->sk_family != AF_INET) + continue; + + if (isk->inet_saddr == saddr && + isk->inet_daddr == daddr && + isk->inet_sport == sport && + isk->inet_dport == dport) { + sk = subsk; + goto found; + } + } + break; + } 
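A usage sketch may help here. MPTCP_CMD_SUB_CREATE only requires the connection token, both address IDs and the destination address/port; MPTCP_ATTR_SADDR4, MPTCP_ATTR_SPORT, MPTCP_ATTR_BACKUP and MPTCP_ATTR_IF_IDX are optional (without a source address, the local ID must have been registered earlier through MPTCP_CMD_ANNOUNCE). The fragment below is illustrative only and not part of this patch: it assumes libnl-genl-3, that the generic-netlink family name resolves via MPTCP_GENL_NAME, and that commands/attributes come from the UAPI header added by this series; the helper name and the omitted error handling are mine.

#include <errno.h>
#include <stdint.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/mptcp.h>	/* assumed location of the new UAPI header */

/* Ask the netlink path manager to open one more IPv4 subflow on the
 * MPTCP connection identified by @token.
 */
static int mptcp_sub_create(struct nl_sock *nlsk, int family_id,
			    uint32_t token, uint8_t loc_id, uint8_t rem_id,
			    const char *daddr, uint16_t dport)
{
	struct nl_msg *msg = nlmsg_alloc();
	int err;

	if (!msg)
		return -ENOMEM;

	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family_id, 0, 0,
		    MPTCP_CMD_SUB_CREATE, MPTCP_GENL_VER);
	nla_put_u32(msg, MPTCP_ATTR_TOKEN, token);
	nla_put_u16(msg, MPTCP_ATTR_FAMILY, AF_INET);
	nla_put_u8(msg, MPTCP_ATTR_LOC_ID, loc_id);
	nla_put_u8(msg, MPTCP_ATTR_REM_ID, rem_id);
	nla_put_u32(msg, MPTCP_ATTR_DADDR4, inet_addr(daddr));
	/* ports are carried in host byte order in these attributes;
	 * the handlers above convert with htons()/ntohs()
	 */
	nla_put_u16(msg, MPTCP_ATTR_DPORT, dport);

	err = nl_send_auto(nlsk, msg);
	nlmsg_free(msg);
	return err < 0 ? err : 0;
}

/* family_id is resolved once per netlink socket, e.g.:
 *	nlsk = nl_socket_alloc();
 *	genl_connect(nlsk);
 *	family_id = genl_ctrl_resolve(nlsk, MPTCP_GENL_NAME);
 */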
+#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: { + struct in6_addr saddr, daddr; + + if (!attrs[MPTCP_ATTR_SADDR6] || !attrs[MPTCP_ATTR_DADDR6]) + break; + + saddr = *(struct in6_addr *)nla_data(attrs[MPTCP_ATTR_SADDR6]); + daddr = *(struct in6_addr *)nla_data(attrs[MPTCP_ATTR_DADDR6]); + + mptcp_for_each_sub_safe(mpcb, mptcp, tmp) { + struct sock *subsk = mptcp_to_sock(mptcp); + struct inet_sock *isk = inet_sk(subsk); + struct ipv6_pinfo *np; + + if (subsk->sk_family != AF_INET6) + continue; + + np = inet6_sk(subsk); + if (ipv6_addr_equal(&saddr, &np->saddr) && + ipv6_addr_equal(&daddr, &subsk->sk_v6_daddr) && + isk->inet_sport == sport && + isk->inet_dport == dport) { + sk = subsk; + goto found; + } + } + break; + } +#endif + } + +exit: + sk = NULL; +found: + return sk; +} + +static int +mptcp_nl_genl_destroy(struct sk_buff *skb, struct genl_info *info) +{ + struct sock *meta_sk, *subsk; + struct mptcp_cb *mpcb; + int ret = 0; + u32 token; + + if (!info->attrs[MPTCP_ATTR_TOKEN]) + return -EINVAL; + + token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]); + + meta_sk = mptcp_hash_find(genl_info_net(info), token); + if (!meta_sk) + return -EINVAL; + + mpcb = tcp_sk(meta_sk)->mpcb; + + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + subsk = mptcp_nl_subsk_lookup(mpcb, info->attrs); + if (subsk) { + local_bh_disable(); + mptcp_reinject_data(subsk, 0); + mptcp_send_reset(subsk); + local_bh_enable(); + } else { + ret = -EINVAL; + } + + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + sock_put(meta_sk); + return ret; +} + +static int +mptcp_nl_genl_conn_exists(struct sk_buff *skb, struct genl_info *info) +{ + struct sock *meta_sk; + u32 token; + + if (!info->attrs[MPTCP_ATTR_TOKEN]) + return -EINVAL; + + token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]); + + meta_sk = mptcp_hash_find(genl_info_net(info), token); + if (!meta_sk) + return -ENOTCONN; + + sock_put(meta_sk); + return 0; +} + +static int +mptcp_nl_genl_priority(struct sk_buff *skb, struct genl_info *info) +{ + struct sock *meta_sk, *subsk; + struct mptcp_cb *mpcb; + int ret = 0; + u32 token; + u8 backup = 0; + + if (!info->attrs[MPTCP_ATTR_TOKEN]) + return -EINVAL; + + token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]); + if (info->attrs[MPTCP_ATTR_BACKUP]) + backup = nla_get_u8(info->attrs[MPTCP_ATTR_BACKUP]); + + meta_sk = mptcp_hash_find(genl_info_net(info), token); + if (!meta_sk) + return -EINVAL; + + mpcb = tcp_sk(meta_sk)->mpcb; + + mutex_lock(&mpcb->mpcb_mutex); + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING); + + subsk = mptcp_nl_subsk_lookup(mpcb, info->attrs); + if (subsk) { + tcp_sk(subsk)->mptcp->send_mp_prio = 1; + tcp_sk(subsk)->mptcp->low_prio = !!backup; + + local_bh_disable(); + if (mptcp_sk_can_send_ack(subsk)) + tcp_send_ack(subsk); + else + ret = -ENOTCONN; + local_bh_enable(); + } else { + ret = -EINVAL; + } + + release_sock(meta_sk); + mutex_unlock(&mpcb->mpcb_mutex); + sock_put(meta_sk); + return ret; +} + +static int +mptcp_nl_genl_set_filter(struct sk_buff *skb, struct genl_info *info) +{ + u16 flags; + + if (!info->attrs[MPTCP_ATTR_FLAGS]) + return -EINVAL; + + flags = nla_get_u16(info->attrs[MPTCP_ATTR_FLAGS]); + + /* Only want to receive events that correspond to these flags */ + mptcp_nl_event_filter = ~flags; + + return 0; +} + +static struct genl_ops mptcp_genl_ops[] = { + { + .cmd = MPTCP_CMD_ANNOUNCE, + .doit = mptcp_nl_genl_announce, + .policy = mptcp_nl_genl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_CMD_REMOVE, + .doit = 
mptcp_nl_genl_remove, + .policy = mptcp_nl_genl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_CMD_SUB_CREATE, + .doit = mptcp_nl_genl_create, + .policy = mptcp_nl_genl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_CMD_SUB_DESTROY, + .doit = mptcp_nl_genl_destroy, + .policy = mptcp_nl_genl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_CMD_SUB_PRIORITY, + .doit = mptcp_nl_genl_priority, + .policy = mptcp_nl_genl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_CMD_SET_FILTER, + .doit = mptcp_nl_genl_set_filter, + .policy = mptcp_nl_genl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_CMD_EXIST, + .doit = mptcp_nl_genl_conn_exists, + .policy = mptcp_nl_genl_policy, + .flags = GENL_ADMIN_PERM, + }, +}; + +static struct mptcp_pm_ops mptcp_nl_pm_ops = { + .new_session = mptcp_nl_pm_new_session, + .close_session = mptcp_nl_pm_close_session, + .fully_established = mptcp_nl_pm_fully_established, + .established_subflow = mptcp_nl_pm_established_subflow, + .delete_subflow = mptcp_nl_pm_delete_subflow, + .add_raddr = mptcp_nl_pm_add_raddr, + .rem_raddr = mptcp_nl_pm_rem_raddr, + .get_local_id = mptcp_nl_pm_get_local_id, + .addr_signal = mptcp_nl_pm_addr_signal, + .prio_changed = mptcp_nl_pm_prio_changed, + .name = "netlink", + .owner = THIS_MODULE, +}; + +static struct genl_family mptcp_genl_family = { + .hdrsize = 0, + .name = MPTCP_GENL_NAME, + .version = MPTCP_GENL_VER, + .maxattr = MPTCP_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = mptcp_genl_ops, + .n_ops = ARRAY_SIZE(mptcp_genl_ops), + .mcgrps = mptcp_mcgrps, + .n_mcgrps = ARRAY_SIZE(mptcp_mcgrps), +}; + +static int __init +mptcp_nl_init(void) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct mptcp_nl_priv) > MPTCP_PM_SIZE); + + ret = genl_register_family(&mptcp_genl_family); + if (ret) + goto out_genl; + + ret = mptcp_register_path_manager(&mptcp_nl_pm_ops); + if (ret) + goto out_pm; + + return 0; +out_pm: + genl_unregister_family(&mptcp_genl_family); +out_genl: + return ret; +} + +static void __exit +mptcp_nl_exit(void) +{ + mptcp_unregister_path_manager(&mptcp_nl_pm_ops); + genl_unregister_family(&mptcp_genl_family); +} + +module_init(mptcp_nl_init); +module_exit(mptcp_nl_exit); + +MODULE_AUTHOR("Gregory Detal "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MPTCP netlink-based path manager"); +MODULE_ALIAS_GENL_FAMILY(MPTCP_GENL_NAME); diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c new file mode 100644 index 000000000000..c44eb9208581 --- /dev/null +++ b/net/mptcp/mptcp_olia.c @@ -0,0 +1,318 @@ +/* + * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL: + * + * Algorithm design: + * Ramin Khalili + * Nicolas Gast + * Jean-Yves Le Boudec + * + * Implementation: + * Ramin Khalili + * + * Ported to the official MPTCP-kernel: + * Christoph Paasch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
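Before the implementation that follows: up to the 2^scale fixed-point arithmetic used below, the congestion-avoidance step corresponds, per ACK on subflow r, to the OLIA increase

	w_r += (w_r / rtt_r^2) / (sum over paths p of w_p / rtt_p)^2  +  epsilon_r / w_r

where the sum runs over the subflows that can currently send, epsilon_r is +1/(n * |B\M|) for "best" paths (largest inter-loss distance relative to rtt^2) whose window is not the largest, -1/(n * |M|) for paths whose window is the largest, and 0 otherwise, with n the number of established subflows. This is a reading of mptcp_get_rate(), mptcp_get_epsilon() and mptcp_olia_cong_avoid() as written, intended only as a guide to the code.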
+ */ + + +#include +#include + +#include + +static int scale = 10; + +struct mptcp_olia { + u32 mptcp_loss1; + u32 mptcp_loss2; + u32 mptcp_loss3; + int epsilon_num; + u32 epsilon_den; + int mptcp_snd_cwnd_cnt; +}; + +static inline int mptcp_olia_sk_can_send(const struct sock *sk) +{ + return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us; +} + +static inline u64 mptcp_olia_scale(u64 val, int scale) +{ + return (u64) val << scale; +} + +/* take care of artificially inflate (see RFC5681) + * of cwnd during fast-retransmit phase + */ +static u32 mptcp_get_crt_cwnd(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_state == TCP_CA_Recovery) + return tcp_sk(sk)->snd_ssthresh; + else + return tcp_sk(sk)->snd_cwnd; +} + +/* return the dominator of the first term of the increasing term */ +static u64 mptcp_get_rate(const struct mptcp_cb *mpcb , u32 path_rtt) +{ + struct mptcp_tcp_sock *mptcp; + u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */ + + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + struct tcp_sock *tp = tcp_sk(sk); + u64 scaled_num; + u32 tmp_cwnd; + + if (!mptcp_olia_sk_can_send(sk)) + continue; + + tmp_cwnd = mptcp_get_crt_cwnd(sk); + scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt; + rate += div_u64(scaled_num , tp->srtt_us); + } + rate *= rate; + return rate; +} + +/* find the maximum cwnd, used to find set M */ +static u32 mptcp_get_max_cwnd(const struct mptcp_cb *mpcb) +{ + struct mptcp_tcp_sock *mptcp; + u32 best_cwnd = 0; + + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + u32 tmp_cwnd; + + if (!mptcp_olia_sk_can_send(sk)) + continue; + + tmp_cwnd = mptcp_get_crt_cwnd(sk); + if (tmp_cwnd > best_cwnd) + best_cwnd = tmp_cwnd; + } + return best_cwnd; +} + +static void mptcp_get_epsilon(const struct mptcp_cb *mpcb) +{ + struct mptcp_tcp_sock *mptcp; + struct mptcp_olia *ca; + struct tcp_sock *tp; + struct sock *sk; + u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1; + u32 max_cwnd, tmp_cwnd, established_cnt = 0; + u8 M = 0, B_not_M = 0; + + /* TODO - integrate this in the following loop - we just want to iterate once */ + + max_cwnd = mptcp_get_max_cwnd(mpcb); + + /* find the best path */ + mptcp_for_each_sub(mpcb, mptcp) { + sk = mptcp_to_sock(mptcp); + tp = tcp_sk(sk); + ca = inet_csk_ca(sk); + + if (!mptcp_olia_sk_can_send(sk)) + continue; + + established_cnt++; + + tmp_rtt = (u64)tp->srtt_us * tp->srtt_us; + /* TODO - check here and rename variables */ + tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, + ca->mptcp_loss2 - ca->mptcp_loss1); + + if ((u64)tmp_int * best_rtt >= (u64)best_int * tmp_rtt) { + best_rtt = tmp_rtt; + best_int = tmp_int; + } + } + + /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */ + /* find the size of M and B_not_M */ + mptcp_for_each_sub(mpcb, mptcp) { + sk = mptcp_to_sock(mptcp); + tp = tcp_sk(sk); + ca = inet_csk_ca(sk); + + if (!mptcp_olia_sk_can_send(sk)) + continue; + + tmp_cwnd = mptcp_get_crt_cwnd(sk); + if (tmp_cwnd == max_cwnd) { + M++; + } else { + tmp_rtt = (u64)tp->srtt_us * tp->srtt_us; + tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, + ca->mptcp_loss2 - ca->mptcp_loss1); + + if ((u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) + B_not_M++; + } + } + + /* check if the path is in M or B_not_M and set the value of epsilon accordingly */ + mptcp_for_each_sub(mpcb, mptcp) { + sk = mptcp_to_sock(mptcp); + tp = tcp_sk(sk); + ca = inet_csk_ca(sk); + + if 
(!mptcp_olia_sk_can_send(sk)) + continue; + + if (B_not_M == 0) { + ca->epsilon_num = 0; + ca->epsilon_den = 1; + } else { + tmp_rtt = (u64)tp->srtt_us * tp->srtt_us; + tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2, + ca->mptcp_loss2 - ca->mptcp_loss1); + tmp_cwnd = mptcp_get_crt_cwnd(sk); + + if (tmp_cwnd < max_cwnd && + (u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) { + ca->epsilon_num = 1; + ca->epsilon_den = established_cnt * B_not_M; + } else if (tmp_cwnd == max_cwnd) { + ca->epsilon_num = -1; + ca->epsilon_den = established_cnt * M; + } else { + ca->epsilon_num = 0; + ca->epsilon_den = 1; + } + } + } +} + +/* setting the initial values */ +static void mptcp_olia_init(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_olia *ca = inet_csk_ca(sk); + + if (mptcp(tp)) { + ca->mptcp_loss1 = tp->snd_una; + ca->mptcp_loss2 = tp->snd_una; + ca->mptcp_loss3 = tp->snd_una; + ca->mptcp_snd_cwnd_cnt = 0; + ca->epsilon_num = 0; + ca->epsilon_den = 1; + } +} + +/* updating inter-loss distance and ssthresh */ +static void mptcp_olia_set_state(struct sock *sk, u8 new_state) +{ + if (!mptcp(tcp_sk(sk))) + return; + + if (new_state == TCP_CA_Loss || + new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) { + struct mptcp_olia *ca = inet_csk_ca(sk); + + if (ca->mptcp_loss3 != ca->mptcp_loss2 && + !inet_csk(sk)->icsk_retransmits) { + ca->mptcp_loss1 = ca->mptcp_loss2; + ca->mptcp_loss2 = ca->mptcp_loss3; + } + } +} + +/* main algorithm */ +static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_olia *ca = inet_csk_ca(sk); + const struct mptcp_cb *mpcb = tp->mpcb; + + u64 inc_num, inc_den, rate, cwnd_scaled; + + if (!mptcp(tp)) { + tcp_reno_cong_avoid(sk, ack, acked); + return; + } + + ca->mptcp_loss3 = tp->snd_una; + + if (!tcp_is_cwnd_limited(sk)) + return; + + /* slow start if it is in the safe area */ + if (tcp_in_slow_start(tp)) { + tcp_slow_start(tp, acked); + return; + } + + mptcp_get_epsilon(mpcb); + rate = mptcp_get_rate(mpcb, tp->srtt_us); + cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale); + inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? 
: 1; + + /* calculate the increasing term, scaling is used to reduce the rounding effect */ + if (ca->epsilon_num == -1) { + if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) { + inc_num = rate - ca->epsilon_den * + cwnd_scaled * cwnd_scaled; + ca->mptcp_snd_cwnd_cnt -= div64_u64( + mptcp_olia_scale(inc_num , scale) , inc_den); + } else { + inc_num = ca->epsilon_den * + cwnd_scaled * cwnd_scaled - rate; + ca->mptcp_snd_cwnd_cnt += div64_u64( + mptcp_olia_scale(inc_num , scale) , inc_den); + } + } else { + inc_num = ca->epsilon_num * rate + + ca->epsilon_den * cwnd_scaled * cwnd_scaled; + ca->mptcp_snd_cwnd_cnt += div64_u64( + mptcp_olia_scale(inc_num , scale) , inc_den); + } + + + if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + ca->mptcp_snd_cwnd_cnt = 0; + } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) { + tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1); + ca->mptcp_snd_cwnd_cnt = 0; + } +} + +static struct tcp_congestion_ops mptcp_olia = { + .init = mptcp_olia_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = mptcp_olia_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, + .set_state = mptcp_olia_set_state, + .owner = THIS_MODULE, + .name = "olia", +}; + +static int __init mptcp_olia_register(void) +{ + BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&mptcp_olia); +} + +static void __exit mptcp_olia_unregister(void) +{ + tcp_unregister_congestion_control(&mptcp_olia); +} + +module_init(mptcp_olia_register); +module_exit(mptcp_olia_unregister); + +MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL"); +MODULE_VERSION("0.1"); diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c new file mode 100644 index 000000000000..93a149081303 --- /dev/null +++ b/net/mptcp/mptcp_output.c @@ -0,0 +1,1936 @@ +/* + * MPTCP implementation - Sending side + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
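Once registered, the algorithm is selected like any other congestion-control module: system-wide via sysctl net.ipv4.tcp_congestion_control=olia, or per socket with the standard TCP_CONGESTION socket option. The snippet below is a minimal illustration using the regular socket API; nothing in it is specific to this patch.

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int use_olia(int fd)
{
	static const char name[] = "olia";

	/* The module must be loaded (or built in); unprivileged callers
	 * additionally need it listed in
	 * net.ipv4.tcp_allowed_congestion_control.
	 */
	return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
			  name, sizeof(name) - 1);
}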
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +static const int mptcp_dss_len = MPTCP_SUB_LEN_DSS_ALIGN + + MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN; + +static inline int mptcp_sub_len_remove_addr(u16 bitfield) +{ + unsigned int c; + for (c = 0; bitfield; c++) + bitfield &= bitfield - 1; + return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1; +} + +int mptcp_sub_len_remove_addr_align(u16 bitfield) +{ + return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4); +} +EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align); + +/* get the data-seq and end-data-seq and store them again in the + * tcp_skb_cb + */ +static bool mptcp_reconstruct_mapping(struct sk_buff *skb) +{ + const struct mp_dss *mpdss = (struct mp_dss *)TCP_SKB_CB(skb)->dss; + __be32 *p32; + __be16 *p16; + + if (!mptcp_is_data_seq(skb)) + return false; + + if (!mpdss->M) + return false; + + /* Move the pointer to the data-seq */ + p32 = (__be32 *)mpdss; + p32++; + if (mpdss->A) { + p32++; + if (mpdss->a) + p32++; + } + + TCP_SKB_CB(skb)->seq = ntohl(*p32); + + /* Get the data_len to calculate the end_data_seq */ + p32++; + p32++; + p16 = (__be16 *)p32; + TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq; + + return true; +} + +static bool mptcp_is_reinjected(const struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->mptcp_flags & MPTCP_REINJECT; +} + +static void mptcp_find_and_set_pathmask(struct sock *meta_sk, struct sk_buff *skb) +{ + struct rb_node **p = &meta_sk->tcp_rtx_queue.rb_node; + struct rb_node *parent; + struct sk_buff *skb_it; + + while (*p) { + parent = *p; + skb_it = rb_to_skb(parent); + if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb_it)->seq)) { + p = &parent->rb_left; + continue; + } + if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb_it)->seq)) { + p = &parent->rb_right; + continue; + } + + TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask; + break; + } +} + +/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are + * coming from the meta-retransmit-timer + */ +static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk, + struct sock *sk, int clone_it, + enum tcp_queue tcp_queue) +{ + struct sk_buff *skb, *skb1; + const struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + u32 seq, end_seq; + + if (clone_it) { + /* pskb_copy is necessary here, because the TCP/IP-headers + * will be changed when it's going to be reinjected on another + * subflow. 
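For reference when reading mptcp_reconstruct_mapping() above and the DSS writers further down, the option walked here follows the RFC 6824 DSS layout (4-byte data ack and data sequence in this implementation; 8-byte variants when the a/m flags are set):

	kind=30 | len | subtype=DSS, flags F/m/M/a/A
	data ack        (present if A, 4 or 8 bytes)
	data sequence   (present if M, 4 or 8 bytes)
	subflow seq     (4 bytes, relative to the subflow ISN)
	data-level len  (2 bytes)
	checksum        (2 bytes, only with dss_csum)

mptcp_reconstruct_mapping() skips the header and the optional data ack, then restores seq/end_seq of the tcp_skb_cb from the data sequence and the data-level length.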
+ */ + tcp_skb_tsorted_save(orig_skb) { + skb = pskb_copy_for_clone(orig_skb, GFP_ATOMIC); + } tcp_skb_tsorted_restore(orig_skb); + } else { + if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) { + __skb_unlink(orig_skb, &sk->sk_write_queue); + } else { + list_del(&orig_skb->tcp_tsorted_anchor); + tcp_rtx_queue_unlink(orig_skb, sk); + INIT_LIST_HEAD(&orig_skb->tcp_tsorted_anchor); + } + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + sk->sk_wmem_queued -= orig_skb->truesize; + sk_mem_uncharge(sk, orig_skb->truesize); + skb = orig_skb; + } + if (unlikely(!skb)) + return; + + /* Make sure that this list is clean */ + tcp_skb_tsorted_anchor_cleanup(skb); + + if (sk && !mptcp_reconstruct_mapping(skb)) { + __kfree_skb(skb); + return; + } + + skb->sk = meta_sk; + + /* Reset subflow-specific TCP control-data */ + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->tcp_flags &= (TCPHDR_ACK | TCPHDR_PSH); + + /* If it reached already the destination, we don't have to reinject it */ + if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { + __kfree_skb(skb); + return; + } + + /* Only reinject segments that are fully covered by the mapping */ + if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) != + TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { + struct rb_node *parent, **p = &meta_sk->tcp_rtx_queue.rb_node; + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + u32 seq = TCP_SKB_CB(skb)->seq; + + __kfree_skb(skb); + + /* Ok, now we have to look for the full mapping in the meta + * send-queue :S + */ + + /* First, find the first skb that covers us */ + while (*p) { + parent = *p; + skb = rb_to_skb(parent); + + /* Not yet at the mapping? */ + if (!after(end_seq, TCP_SKB_CB(skb)->seq)) { + p = &parent->rb_left; + continue; + } + + if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { + p = &parent->rb_right; + continue; + } + + break; + } + + if (*p) { + /* We found it, now let's reinject everything */ + skb = rb_to_skb(*p); + + skb_rbtree_walk_from(skb) { + if (after(TCP_SKB_CB(skb)->end_seq, end_seq)) + return; + __mptcp_reinject_data(skb, meta_sk, NULL, 1, + TCP_FRAG_IN_RTX_QUEUE); + } + } + return; + } + + /* Segment goes back to the MPTCP-layer. So, we need to zero the + * path_mask/dss. + */ + memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len); + + /* We need to find out the path-mask from the meta-write-queue + * to properly select a subflow. + */ + mptcp_find_and_set_pathmask(meta_sk, skb); + + /* If it's empty, just add */ + if (skb_queue_empty(&mpcb->reinject_queue)) { + skb_queue_head(&mpcb->reinject_queue, skb); + return; + } + + /* Find place to insert skb - or even we can 'drop' it, as the + * data is already covered by other skb's in the reinject-queue. + * + * This is inspired by code from tcp_data_queue. + */ + + skb1 = skb_peek_tail(&mpcb->reinject_queue); + seq = TCP_SKB_CB(skb)->seq; + while (1) { + if (!after(TCP_SKB_CB(skb1)->seq, seq)) + break; + if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) { + skb1 = NULL; + break; + } + skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); + } + + /* Do skb overlap to previous one? */ + end_seq = TCP_SKB_CB(skb)->end_seq; + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. 
Don't reinject */ + __kfree_skb(skb); + return; + } + if (seq == TCP_SKB_CB(skb1)->seq) { + if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) + skb1 = NULL; + else + skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1); + } + } + if (!skb1) + __skb_queue_head(&mpcb->reinject_queue, skb); + else + __skb_queue_after(&mpcb->reinject_queue, skb1, skb); + + /* And clean segments covered by new one as whole. */ + while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) { + skb1 = skb_queue_next(&mpcb->reinject_queue, skb); + + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) + break; + + if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) + break; + + __skb_unlink(skb1, &mpcb->reinject_queue); + __kfree_skb(skb1); + } + return; +} + +/* Inserts data into the reinject queue */ +void mptcp_reinject_data(struct sock *sk, int clone_it) +{ + struct sock *meta_sk = mptcp_meta_sk(sk); + struct sk_buff *skb_it, *tmp; + enum tcp_queue tcp_queue; + + /* It has already been closed - there is really no point in reinjecting */ + if (meta_sk->sk_state == TCP_CLOSE) + return; + + skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) { + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it); + /* Subflow syn's and fin's are not reinjected. + * + * As well as empty subflow-fins with a data-fin. + * They are reinjected below (without the subflow-fin-flag) + */ + if (tcb->tcp_flags & TCPHDR_SYN || + (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) || + (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len)) + continue; + + if (mptcp_is_reinjected(skb_it)) + continue; + + tcb->mptcp_flags |= MPTCP_REINJECT; + __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it, + TCP_FRAG_IN_WRITE_QUEUE); + } + + skb_it = tcp_rtx_queue_head(sk); + skb_rbtree_walk_from_safe(skb_it, tmp) { + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it); + + /* Subflow syn's and fin's are not reinjected. + * + * As well as empty subflow-fins with a data-fin. + * They are reinjected below (without the subflow-fin-flag) + */ + if (tcb->tcp_flags & TCPHDR_SYN || + (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) || + (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len)) + continue; + + if (mptcp_is_reinjected(skb_it)) + continue; + + tcb->mptcp_flags |= MPTCP_REINJECT; + __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it, + TCP_FRAG_IN_RTX_QUEUE); + } + + skb_it = tcp_write_queue_tail(meta_sk); + tcp_queue = TCP_FRAG_IN_WRITE_QUEUE; + + if (!skb_it) { + skb_it = skb_rb_last(&meta_sk->tcp_rtx_queue); + tcp_queue = TCP_FRAG_IN_RTX_QUEUE; + } + + /* If sk has sent the empty data-fin, we have to reinject it too. */ + if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 && + TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tcp_sk(sk)->mptcp->path_index)) { + __mptcp_reinject_data(skb_it, meta_sk, NULL, 1, tcp_queue); + } + + tcp_sk(sk)->pf = 1; + + mptcp_push_pending_frames(meta_sk); +} +EXPORT_SYMBOL(mptcp_reinject_data); + +static void mptcp_combine_dfin(const struct sk_buff *skb, + const struct sock *meta_sk, + struct sock *subsk) +{ + const struct tcp_sock *meta_tp = tcp_sk(meta_sk); + const struct mptcp_cb *mpcb = meta_tp->mpcb; + + /* In infinite mapping we always try to combine */ + if (mpcb->infinite_mapping_snd) + goto combine; + + /* Don't combine, if they didn't combine when closing - otherwise we end + * up in TIME_WAIT, even if our app is smart enough to avoid it. 
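To make the overlap handling in __mptcp_reinject_data() above concrete (sequence numbers invented for the example): if the reinject queue already holds the data-level range [100,200) and a segment covering [120,180) is reinjected, it is freed because it is fully covered; a segment covering [150,250) is queued after [100,200), and the trailing cleanup loop then drops any queued segment that the newcomer fully covers, e.g. an old [200,240).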
+ */ + if (!mptcp_sk_can_recv(meta_sk) && !mpcb->dfin_combined) + return; + + /* Don't combine if there is still outstanding data that remains to be + * DATA_ACKed, because otherwise we may never be able to deliver this. + */ + if (meta_tp->snd_una != TCP_SKB_CB(skb)->seq) + return; + +combine: + if (tcp_close_state(subsk)) { + subsk->sk_shutdown |= SEND_SHUTDOWN; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; + } +} + +static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb, + __be32 *ptr) +{ + const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + __be32 *start = ptr; + __u16 data_len; + + *ptr++ = htonl(tcb->seq); /* data_seq */ + + /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */ + if (mptcp_is_data_fin(skb) && skb->len == 0) + *ptr++ = 0; /* subseq */ + else + *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */ + + if (tcb->mptcp_flags & MPTCPHDR_INF) + data_len = 0; + else + data_len = tcb->end_seq - tcb->seq; + + if (tp->mpcb->dss_csum && data_len) { + __sum16 *p16 = (__sum16 *)ptr; + __be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb); + __wsum csum; + + *ptr = htonl(((data_len) << 16) | + (TCPOPT_EOL << 8) | + (TCPOPT_EOL)); + csum = csum_partial(ptr - 2, 12, skb->csum); + p16++; + *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum)); + } else { + *ptr++ = htonl(((data_len) << 16) | + (TCPOPT_NOP << 8) | + (TCPOPT_NOP)); + } + + return ptr - start; +} + +static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb, + __be32 *ptr) +{ + struct mp_dss *mdss = (struct mp_dss *)ptr; + __be32 *start = ptr; + + mdss->kind = TCPOPT_MPTCP; + mdss->sub = MPTCP_SUB_DSS; + mdss->rsv1 = 0; + mdss->rsv2 = 0; + mdss->F = mptcp_is_data_fin(skb) ? 1 : 0; + mdss->m = 0; + mdss->M = mptcp_is_data_seq(skb) ? 1 : 0; + mdss->a = 0; + mdss->A = 1; + mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum); + ptr++; + + *ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt); + + return ptr - start; +} + +/* RFC6824 states that once a particular subflow mapping has been sent + * out it must never be changed. However, packets may be split while + * they are in the retransmission queue (due to SACK or ACKs) and that + * arguably means that we would change the mapping (e.g. it splits it, + * our sends out a subset of the initial mapping). + * + * Furthermore, the skb checksum is not always preserved across splits + * (e.g. mptcp_fragment) which would mean that we need to recompute + * the DSS checksum in this case. + * + * To avoid this we save the initial DSS mapping which allows us to + * send the same DSS mapping even for fragmented retransmits. + */ +static void mptcp_save_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + __be32 *ptr = (__be32 *)tcb->dss; + + tcb->mptcp_flags |= MPTCPHDR_SEQ; + + ptr += mptcp_write_dss_data_ack(tp, skb, ptr); + ptr += mptcp_write_dss_mapping(tp, skb, ptr); +} + +/* Write the saved DSS mapping to the header */ +static int mptcp_write_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb, + __be32 *ptr) +{ + __be32 *start = ptr; + + memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len); + + /* update the data_ack */ + start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt); + + /* dss is in a union with inet_skb_parm and + * the IP layer expects zeroed IPCB fields. 
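As a reading aid for the two helpers above: the mptcp_dss_len bytes saved in tcb->dss by mptcp_save_dss_data_seq() hold, word by word, the DSS header (struct mp_dss), the 32-bit data ack, the data sequence, the relative subflow sequence and the data-level length (followed by the checksum, or two NOPs when checksums are off). On every (re)transmission the blob is copied verbatim into the header and only word 1 - the data ack, start[1] above - is refreshed with the current meta-level rcv_nxt, so the mapping itself never changes across retransmits.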
+ */ + memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len); + + return mptcp_dss_len/sizeof(*ptr); +} + +static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject) +{ + struct tcp_sock *tp = tcp_sk(sk); + const struct sock *meta_sk = mptcp_meta_sk(sk); + const struct mptcp_cb *mpcb = tp->mpcb; + struct tcp_skb_cb *tcb; + struct sk_buff *subskb = NULL; + + if (!reinject) + TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ? + MPTCPHDR_SEQ64_INDEX : 0); + + tcp_skb_tsorted_save(skb) { + subskb = pskb_copy_for_clone(skb, GFP_ATOMIC); + } tcp_skb_tsorted_restore(skb); + if (!subskb) + return false; + + /* At the subflow-level we need to call again tcp_init_tso_segs. We + * force this, by setting pcount to 0. It has been set to 1 prior to + * the call to mptcp_skb_entail. + */ + tcp_skb_pcount_set(subskb, 0); + + TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index); + + /* Compute checksum */ + if (tp->mpcb->dss_csum) + subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0); + + tcb = TCP_SKB_CB(subskb); + + if (tp->mpcb->send_infinite_mapping && + !tp->mpcb->infinite_mapping_snd && + !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) { + tp->mptcp->fully_established = 1; + tp->mpcb->infinite_mapping_snd = 1; + tp->mptcp->infinite_cutoff_seq = tp->write_seq; + tcb->mptcp_flags |= MPTCPHDR_INF; + } + + if (mptcp_is_data_fin(subskb)) + mptcp_combine_dfin(subskb, meta_sk, sk); + + mptcp_save_dss_data_seq(tp, subskb); + + tcb->seq = tp->write_seq; + + /* Take into account seg len */ + tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0); + tcb->end_seq = tp->write_seq; + + /* If it's a non-payload DATA_FIN (also no subflow-fin), the + * segment is not part of the subflow but on a meta-only-level. + */ + if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) { + /* Make sure that this list is clean */ + INIT_LIST_HEAD(&subskb->tcp_tsorted_anchor); + + tcp_add_write_queue_tail(sk, subskb); + sk->sk_wmem_queued += subskb->truesize; + sk_mem_charge(sk, subskb->truesize); + } else { + /* Necessary to initialize for tcp_transmit_skb. mss of 1, as + * skb->len = 0 will force tso_segs to 1. + */ + tcp_init_tso_segs(subskb, 1); + + /* Empty data-fins are sent immediatly on the subflow */ + if (tcp_transmit_skb(sk, subskb, 0, GFP_ATOMIC)) + return false; + } + + if (!tp->mptcp->fully_established) { + tp->mptcp->second_packet = 1; + tp->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq; + } + + return true; +} + +/* Fragment an skb and update the mptcp meta-data. Due to reinject, we + * might need to undo some operations done by tcp_fragment. + * + * Be careful, the skb may come from 3 different places: + * - The send-queue (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) + * - The retransmit-queue (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) + * - The reinject-queue (reinject == -1) + */ +static int mptcp_fragment(struct sock *meta_sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, + gfp_t gfp, int reinject) +{ + int ret, diff, old_factor; + struct sk_buff *buff; + u8 flags; + + if (skb_headlen(skb) < len) + diff = skb->len - len; + else + diff = skb->data_len; + old_factor = tcp_skb_pcount(skb); + + /* The mss_now in tcp_fragment is used to set the tso_segs of the skb. + * At the MPTCP-level we do not care about the absolute value. All we + * care about is that it is set to 1 for accurate packets_out + * accounting. 
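A concrete (invented) example of the mapping written by mptcp_skb_entail(): entailing a 1400-byte meta segment whose data sequence is 1000 on a subflow with snt_isn 7000 and write_seq 8400 transmits a subflow segment with seq 8400..9800 carrying the saved DSS mapping {data_seq = 1000, subseq = 8400 - 7000 = 1400, data_len = 1400}; the data-ack word is the only part filled in (with the meta rcv_nxt) at transmit time. The meta-level skb keeps its own sequence numbers and merely records the subflow in its path_mask.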
+ */ + ret = tcp_fragment(meta_sk, tcp_queue, skb, len, UINT_MAX, gfp); + if (ret) + return ret; + + if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) + buff = skb->next; + else + buff = skb_rb_next(skb); + + flags = TCP_SKB_CB(skb)->mptcp_flags; + TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN); + TCP_SKB_CB(buff)->mptcp_flags = flags; + TCP_SKB_CB(buff)->path_mask = TCP_SKB_CB(skb)->path_mask; + + /* If reinject == 1, the buff will be added to the reinject + * queue, which is currently not part of memory accounting. So + * undo the changes done by tcp_fragment and update the + * reinject queue. Also, undo changes to the packet counters. + */ + if (reinject == 1) { + int undo = buff->truesize - diff; + meta_sk->sk_wmem_queued -= undo; + sk_mem_uncharge(meta_sk, undo); + + tcp_sk(meta_sk)->mpcb->reinject_queue.qlen++; + if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) + meta_sk->sk_write_queue.qlen--; + + if (!before(tcp_sk(meta_sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { + undo = old_factor - tcp_skb_pcount(skb) - + tcp_skb_pcount(buff); + if (undo) + tcp_adjust_pcount(meta_sk, skb, -undo); + } + + /* tcp_fragment's call to sk_stream_alloc_skb initializes the + * tcp_tsorted_anchor. We need to revert this as it clashes + * with the refdst pointer. + */ + tcp_skb_tsorted_anchor_cleanup(buff); + } + + return 0; +} + +/* Inspired by tcp_write_wakeup */ +int mptcp_write_wakeup(struct sock *meta_sk, int mib) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct sk_buff *skb; + int ans = 0; + + if (meta_sk->sk_state == TCP_CLOSE) + return -1; + + skb = tcp_send_head(meta_sk); + if (skb && + before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) { + unsigned int mss; + unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq; + struct sock *subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, true); + struct tcp_sock *subtp; + + WARN_ON(TCP_SKB_CB(skb)->sacked); + + if (!subsk) + goto window_probe; + subtp = tcp_sk(subsk); + mss = tcp_current_mss(subsk); + + seg_size = min(tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq, + tcp_wnd_end(subtp) - subtp->write_seq); + + if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; + + /* We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) + */ + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; + if (mptcp_fragment(meta_sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, seg_size, GFP_ATOMIC, 0)) + return -1; + } else if (!tcp_skb_pcount(skb)) { + /* see mptcp_write_xmit on why we use UINT_MAX */ + tcp_set_skb_tso_segs(skb, UINT_MAX); + } + + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; + if (!mptcp_skb_entail(subsk, skb, 0)) + return -1; + + mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq); + tcp_event_new_data_sent(meta_sk, skb); + + __tcp_push_pending_frames(subsk, mss, TCP_NAGLE_PUSH); + tcp_update_skb_after_send(meta_tp, skb); + meta_tp->lsndtime = tcp_jiffies32; + + return 0; + } else { + struct mptcp_tcp_sock *mptcp; + +window_probe: + if (between(meta_tp->snd_up, meta_tp->snd_una + 1, + meta_tp->snd_una + 0xFFFF)) { + mptcp_for_each_sub(meta_tp->mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + + if (mptcp_sk_can_send_ack(sk_it)) + tcp_xmit_probe_skb(sk_it, 1, mib); + } + } + + /* At least one of the tcp_xmit_probe_skb's has to succeed */ + mptcp_for_each_sub(meta_tp->mpcb, 
mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + int ret; + + if (!mptcp_sk_can_send_ack(sk_it)) + continue; + + ret = tcp_xmit_probe_skb(sk_it, 0, mib); + if (unlikely(ret > 0)) + ans = ret; + } + return ans; + } +} + +bool mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp; + bool is_rwnd_limited = false; + struct mptcp_tcp_sock *mptcp; + struct sock *subsk = NULL; + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct sk_buff *skb; + int reinject = 0; + unsigned int sublimit; + __u32 path_mask = 0; + + tcp_mstamp_refresh(meta_tp); + + if (inet_csk(meta_sk)->icsk_retransmits) { + /* If the timer already once fired, retransmit the head of the + * queue to unblock us ASAP. + */ + if (meta_tp->packets_out && !mpcb->infinite_mapping_snd) + mptcp_retransmit_skb(meta_sk, tcp_rtx_queue_head(meta_sk)); + } + + while ((skb = mpcb->sched_ops->next_segment(meta_sk, &reinject, &subsk, + &sublimit))) { + enum tcp_queue tcp_queue = TCP_FRAG_IN_WRITE_QUEUE; + unsigned int limit; + + WARN(TCP_SKB_CB(skb)->sacked, "sacked: %u reinject: %u", + TCP_SKB_CB(skb)->sacked, reinject); + + subtp = tcp_sk(subsk); + mss_now = tcp_current_mss(subsk); + + if (reinject == 1) { + if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) { + /* Segment already reached the peer, take the next one */ + __skb_unlink(skb, &mpcb->reinject_queue); + __kfree_skb(skb); + continue; + } + } else if (reinject == -1) { + tcp_queue = TCP_FRAG_IN_RTX_QUEUE; + } + + /* If the segment was cloned (e.g. a meta retransmission), + * the header must be expanded/copied so that there is no + * corruption of TSO information. + */ + if (skb_unclone(skb, GFP_ATOMIC)) + break; + + if (unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now))) { + is_rwnd_limited = true; + break; + } + + /* Force tso_segs to 1 by using UINT_MAX. + * We actually don't care about the exact number of segments + * emitted on the subflow. We need just to set tso_segs, because + * we still need an accurate packets_out count in + * tcp_event_new_data_sent. + */ + tcp_set_skb_tso_segs(skb, UINT_MAX); + + /* Check for nagle, irregardless of tso_segs. If the segment is + * actually larger than mss_now (TSO segment), then + * tcp_nagle_check will have partial == false and always trigger + * the transmission. + * tcp_write_xmit has a TSO-level nagle check which is not + * subject to the MPTCP-level. It is based on the properties of + * the subflow, not the MPTCP-level. + * When the segment is a reinjection or redundant scheduled + * segment, nagle check at meta-level may prevent + * sending. This could hurt with certain schedulers, as they + * to reinjection to recover from a window-stall or reduce latency. + * Therefore, Nagle check should be disabled in that case. + */ + if (!reinject && + unlikely(!tcp_nagle_test(meta_tp, skb, mss_now, + (tcp_skb_is_last(meta_sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) + break; + + limit = mss_now; + /* skb->len > mss_now is the equivalent of tso_segs > 1 in + * tcp_write_xmit. Otherwise split-point would return 0. + */ + if (skb->len > mss_now && !tcp_urg_mode(meta_tp)) + /* We limit the size of the skb so that it fits into the + * window. Call tcp_mss_split_point to avoid duplicating + * code. + * We really only care about fitting the skb into the + * window. That's why we use UINT_MAX. 
If the skb does + * not fit into the cwnd_quota or the NIC's max-segs + * limitation, it will be split by the subflow's + * tcp_write_xmit which does the appropriate call to + * tcp_mss_split_point. + */ + limit = tcp_mss_split_point(meta_sk, skb, mss_now, + UINT_MAX / mss_now, + nonagle); + + if (sublimit) + limit = min(limit, sublimit); + + if (skb->len > limit && + unlikely(mptcp_fragment(meta_sk, tcp_queue, + skb, limit, gfp, reinject))) + break; + + if (!mptcp_skb_entail(subsk, skb, reinject)) + break; + + if (reinject <= 0) + tcp_update_skb_after_send(meta_tp, skb); + meta_tp->lsndtime = tcp_jiffies32; + + path_mask |= mptcp_pi_to_flag(subtp->mptcp->path_index); + + if (!reinject) { + mptcp_check_sndseq_wrap(meta_tp, + TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq); + tcp_event_new_data_sent(meta_sk, skb); + } + + tcp_minshall_update(meta_tp, mss_now, skb); + + if (reinject > 0) { + __skb_unlink(skb, &mpcb->reinject_queue); + kfree_skb(skb); + } + + if (push_one) + break; + } + + if (is_rwnd_limited) + tcp_chrono_start(meta_sk, TCP_CHRONO_RWND_LIMITED); + else + tcp_chrono_stop(meta_sk, TCP_CHRONO_RWND_LIMITED); + + mptcp_for_each_sub(mpcb, mptcp) { + subsk = mptcp_to_sock(mptcp); + subtp = tcp_sk(subsk); + + if (!(path_mask & mptcp_pi_to_flag(subtp->mptcp->path_index))) + continue; + + mss_now = tcp_current_mss(subsk); + + /* Nagle is handled at the MPTCP-layer, so + * always push on the subflow + */ + __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH); + } + + return !meta_tp->packets_out && tcp_send_head(meta_sk); +} + +void mptcp_write_space(struct sock *sk) +{ + mptcp_push_pending_frames(mptcp_meta_sk(sk)); +} + +u32 __mptcp_select_window(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); + struct sock *meta_sk = mptcp_meta_sk(sk); + int mss, free_space, full_space, window; + + /* MSS for the peer's data. Previous versions used mss_clamp + * here. I don't know if the value based on our guesses + * of peer's MSS is better for the performance. It's more correct + * but may be worse for the performance because of rcv_mss + * fluctuations. --SAW 1998/11/1 + */ + mss = icsk->icsk_ack.rcv_mss; + free_space = tcp_space(meta_sk); + full_space = min_t(int, meta_tp->window_clamp, + tcp_full_space(meta_sk)); + + if (mss > full_space) + mss = full_space; + + if (free_space < (full_space >> 1)) { + /* If free_space is decreasing due to mostly meta-level + * out-of-order packets, don't turn off the quick-ack mode. + */ + if (meta_tp->rcv_nxt - meta_tp->copied_seq > ((full_space - free_space) >> 1)) + icsk->icsk_ack.quick = 0; + + if (tcp_memory_pressure) + /* TODO this has to be adapted when we support different + * MSS's among the subflows. + */ + meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh, + 4U * meta_tp->advmss); + + if (free_space < mss) + return 0; + } + + if (free_space > meta_tp->rcv_ssthresh) + free_space = meta_tp->rcv_ssthresh; + + /* Don't do rounding if we are using window scaling, since the + * scaled window will not line up with the MSS boundary anyway. + */ + window = meta_tp->rcv_wnd; + if (tp->rx_opt.rcv_wscale) { + window = free_space; + + /* Advertise enough space so that it won't get scaled away. + * Import case: prevent zero window announcement if + * 1< mss. 
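A small numeric illustration of the rounding performed just below (values invented): with rcv_wscale = 7 the advertised window is sent in units of 128 bytes, so a free space of 1000 bytes would be truncated to 896 on the wire; the code instead announces ((1000 >> 7) + 1) << 7 = 1024, and in particular a small but non-zero free space is never scaled down to a zero-window advertisement.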
+ */ + if (((window >> tp->rx_opt.rcv_wscale) << tp-> + rx_opt.rcv_wscale) != window) + window = (((window >> tp->rx_opt.rcv_wscale) + 1) + << tp->rx_opt.rcv_wscale); + } else { + /* Get the largest window that is a nice multiple of mss. + * Window clamp already applied above. + * If our current window offering is within 1 mss of the + * free space we just keep it. This prevents the divide + * and multiply from happening most of the time. + * We also don't do any window rounding when the free space + * is too small. + */ + if (window <= free_space - mss || window > free_space) + window = (free_space / mss) * mss; + else if (mss == full_space && + free_space > window + (full_space >> 1)) + window = free_space; + } + + return window; +} + +void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts, + unsigned *remaining) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + opts->options |= OPTION_MPTCP; + if (is_master_tp(tp)) { + opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN; + opts->mptcp_ver = tcp_sk(sk)->mptcp_ver; + *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; + opts->mp_capable.sender_key = tp->mptcp_loc_key; + opts->dss_csum = !!sysctl_mptcp_checksum; + } else { + const struct mptcp_cb *mpcb = tp->mpcb; + + opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN; + *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN; + opts->mp_join_syns.token = mpcb->mptcp_rem_token; + opts->mp_join_syns.low_prio = tp->mptcp->low_prio; + opts->addr_id = tp->mptcp->loc_id; + opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce; + } +} + +void mptcp_synack_options(struct request_sock *req, + struct tcp_out_options *opts, unsigned *remaining) +{ + struct mptcp_request_sock *mtreq; + mtreq = mptcp_rsk(req); + + opts->options |= OPTION_MPTCP; + /* MPCB not yet set - thus it's a new MPTCP-session */ + if (!mtreq->is_sub) { + opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK; + opts->mptcp_ver = mtreq->mptcp_ver; + opts->mp_capable.sender_key = mtreq->mptcp_loc_key; + opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum; + *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN; + } else { + opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK; + opts->mp_join_syns.sender_truncated_mac = + mtreq->mptcp_hash_tmac; + opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce; + opts->mp_join_syns.low_prio = mtreq->low_prio; + opts->addr_id = mtreq->loc_id; + *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN; + } +} + +void mptcp_established_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, unsigned *size) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_cb *mpcb = tp->mpcb; + const struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; + + /* We are coming from tcp_current_mss with the meta_sk as an argument. + * It does not make sense to check for the options, because when the + * segment gets sent, another subflow will be chosen. + */ + if (!skb && is_meta_sk(sk)) + return; + + if (unlikely(tp->send_mp_fclose)) { + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_MP_FCLOSE; + opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; + *size += MPTCP_SUB_LEN_FCLOSE_ALIGN; + return; + } + + /* 1. If we are the sender of the infinite-mapping, we need the + * MPTCPHDR_INF-flag, because a retransmission of the + * infinite-announcment still needs the mptcp-option. + * + * We need infinite_cutoff_seq, because retransmissions from before + * the infinite-cutoff-moment still need the MPTCP-signalling to stay + * consistent. + * + * 2. 
If we are the receiver of the infinite-mapping, we always skip + * mptcp-options, because acknowledgments from before the + * infinite-mapping point have already been sent out. + * + * I know, the whole infinite-mapping stuff is ugly... + * + * TODO: Handle wrapped data-sequence numbers + * (even if it's very unlikely) + */ + if (unlikely(mpcb->infinite_mapping_snd) && + ((mpcb->send_infinite_mapping && tcb && + mptcp_is_data_seq(skb) && + !(tcb->mptcp_flags & MPTCPHDR_INF) && + !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) || + !mpcb->send_infinite_mapping)) + return; + + if (unlikely(tp->mptcp->include_mpc)) { + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_MP_CAPABLE | + OPTION_TYPE_ACK; + *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN; + opts->mptcp_ver = mpcb->mptcp_ver; + opts->mp_capable.sender_key = mpcb->mptcp_loc_key; + opts->mp_capable.receiver_key = mpcb->mptcp_rem_key; + opts->dss_csum = mpcb->dss_csum; + + if (skb) + tp->mptcp->include_mpc = 0; + } + if (unlikely(tp->mptcp->pre_established) && + (!skb || !(tcb->tcp_flags & (TCPHDR_FIN | TCPHDR_RST)))) { + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK; + *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN; + } + + if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal && + mpcb->mptcp_ver >= MPTCP_VERSION_1 && skb && !mptcp_is_data_seq(skb)) { + mpcb->pm_ops->addr_signal(sk, size, opts, skb); + + if (opts->add_addr_v6) + /* Skip subsequent options */ + return; + } + + if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) { + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_DATA_ACK; + /* If !skb, we come from tcp_current_mss and thus we always + * assume that the DSS-option will be set for the data-packet. + */ + if (skb && !mptcp_is_data_seq(skb)) { + *size += MPTCP_SUB_LEN_ACK_ALIGN; + } else { + /* Doesn't matter, if csum included or not. 
It will be + * either 10 or 12, and thus aligned = 12 + */ + *size += MPTCP_SUB_LEN_ACK_ALIGN + + MPTCP_SUB_LEN_SEQ_ALIGN; + } + + *size += MPTCP_SUB_LEN_DSS_ALIGN; + } + + /* In fallback mp_fail-mode, we have to repeat it until the fallback + * has been done by the sender + */ + if (unlikely(tp->mptcp->send_mp_fail) && skb && + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_FAIL) { + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_MP_FAIL; + *size += MPTCP_SUB_LEN_FAIL; + } + + if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal && + mpcb->mptcp_ver < MPTCP_VERSION_1) + mpcb->pm_ops->addr_signal(sk, size, opts, skb); + + if (unlikely(tp->mptcp->send_mp_prio) && + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) { + opts->options |= OPTION_MPTCP; + opts->mptcp_options |= OPTION_MP_PRIO; + if (skb) + tp->mptcp->send_mp_prio = 0; + *size += MPTCP_SUB_LEN_PRIO_ALIGN; + } + + return; +} + +u16 mptcp_select_window(struct sock *sk) +{ + u16 new_win = tcp_select_window(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *meta_tp = mptcp_meta_tp(tp); + + meta_tp->rcv_wnd = tp->rcv_wnd; + meta_tp->rcv_wup = meta_tp->rcv_nxt; + /* no need to use tcp_update_rcv_right_edge, because at the meta level + * right edge cannot go back + */ + meta_tp->rcv_right_edge = meta_tp->rcv_wnd + meta_tp->rcv_wup; + + return new_win; +} + +void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp, + const struct tcp_out_options *opts, + struct sk_buff *skb) +{ + if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) { + struct mp_capable *mpc = (struct mp_capable *)ptr; + + mpc->kind = TCPOPT_MPTCP; + + if ((OPTION_TYPE_SYN & opts->mptcp_options) || + (OPTION_TYPE_SYNACK & opts->mptcp_options)) { + mpc->sender_key = opts->mp_capable.sender_key; + mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN; + mpc->ver = opts->mptcp_ver; + ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2; + } else if (OPTION_TYPE_ACK & opts->mptcp_options) { + mpc->sender_key = opts->mp_capable.sender_key; + mpc->receiver_key = opts->mp_capable.receiver_key; + mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK; + mpc->ver = opts->mptcp_ver; + ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2; + } + + mpc->sub = MPTCP_SUB_CAPABLE; + mpc->a = opts->dss_csum; + mpc->b = 0; + mpc->rsv = 0; + mpc->h = 1; + } + if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) { + struct mp_join *mpj = (struct mp_join *)ptr; + + mpj->kind = TCPOPT_MPTCP; + mpj->sub = MPTCP_SUB_JOIN; + mpj->rsv = 0; + + if (OPTION_TYPE_SYN & opts->mptcp_options) { + mpj->len = MPTCP_SUB_LEN_JOIN_SYN; + mpj->u.syn.token = opts->mp_join_syns.token; + mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce; + mpj->b = opts->mp_join_syns.low_prio; + mpj->addr_id = opts->addr_id; + ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2; + } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) { + mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK; + mpj->u.synack.mac = + opts->mp_join_syns.sender_truncated_mac; + mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce; + mpj->b = opts->mp_join_syns.low_prio; + mpj->addr_id = opts->addr_id; + ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2; + } else if (OPTION_TYPE_ACK & opts->mptcp_options) { + mpj->len = MPTCP_SUB_LEN_JOIN_ACK; + mpj->addr_id = 0; /* addr_id is rsv (RFC 6824, p. 
21) */ + memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20); + ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2; + } + } + if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) { + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; + struct mptcp_cb *mpcb = tp->mpcb; + + mpadd->kind = TCPOPT_MPTCP; + if (opts->add_addr_v4) { + mpadd->sub = MPTCP_SUB_ADD_ADDR; + mpadd->ipver = 4; + mpadd->addr_id = opts->add_addr4.addr_id; + mpadd->u.v4.addr = opts->add_addr4.addr; + if (mpcb->mptcp_ver < MPTCP_VERSION_1) { + mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4; + ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2; + } else { + memcpy((char *)mpadd->u.v4.mac - 2, + (char *)&opts->add_addr4.trunc_mac, 8); + mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4_VER1; + ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1 >> 2; + } + } else if (opts->add_addr_v6) { + mpadd->sub = MPTCP_SUB_ADD_ADDR; + mpadd->ipver = 6; + mpadd->addr_id = opts->add_addr6.addr_id; + memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr, + sizeof(mpadd->u.v6.addr)); + if (mpcb->mptcp_ver < MPTCP_VERSION_1) { + mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6; + ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2; + } else { + memcpy((char *)mpadd->u.v6.mac - 2, + (char *)&opts->add_addr6.trunc_mac, 8); + mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6_VER1; + ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1 >> 2; + } + } + + MPTCP_INC_STATS(sock_net((struct sock *)tp), MPTCP_MIB_ADDADDRTX); + } + if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) { + struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; + u8 *addrs_id; + int id, len, len_align; + + len = mptcp_sub_len_remove_addr(opts->remove_addrs); + len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs); + + mprem->kind = TCPOPT_MPTCP; + mprem->len = len; + mprem->sub = MPTCP_SUB_REMOVE_ADDR; + mprem->rsv = 0; + addrs_id = &mprem->addrs_id; + + mptcp_for_each_bit_set(opts->remove_addrs, id) + *(addrs_id++) = id; + + /* Fill the rest with NOP's */ + if (len_align > len) { + int i; + for (i = 0; i < len_align - len; i++) + *(addrs_id++) = TCPOPT_NOP; + } + + ptr += len_align >> 2; + + MPTCP_INC_STATS(sock_net((struct sock *)tp), MPTCP_MIB_REMADDRTX); + } + if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) { + struct mp_fail *mpfail = (struct mp_fail *)ptr; + + mpfail->kind = TCPOPT_MPTCP; + mpfail->len = MPTCP_SUB_LEN_FAIL; + mpfail->sub = MPTCP_SUB_FAIL; + mpfail->rsv1 = 0; + mpfail->rsv2 = 0; + mpfail->data_seq = htonll(tp->mpcb->csum_cutoff_seq); + + ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2; + } + if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) { + struct mp_fclose *mpfclose = (struct mp_fclose *)ptr; + + mpfclose->kind = TCPOPT_MPTCP; + mpfclose->len = MPTCP_SUB_LEN_FCLOSE; + mpfclose->sub = MPTCP_SUB_FCLOSE; + mpfclose->rsv1 = 0; + mpfclose->rsv2 = 0; + mpfclose->key = opts->mp_capable.receiver_key; + + ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2; + } + + if (OPTION_DATA_ACK & opts->mptcp_options) { + if (!mptcp_is_data_seq(skb)) + ptr += mptcp_write_dss_data_ack(tp, skb, ptr); + else + ptr += mptcp_write_dss_data_seq(tp, skb, ptr); + } + if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) { + struct mp_prio *mpprio = (struct mp_prio *)ptr; + + mpprio->kind = TCPOPT_MPTCP; + mpprio->len = MPTCP_SUB_LEN_PRIO; + mpprio->sub = MPTCP_SUB_PRIO; + mpprio->rsv = 0; + mpprio->b = tp->mptcp->low_prio; + mpprio->addr_id = TCPOPT_NOP; + + ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2; + } +} + +/* Sends the datafin */ +void mptcp_send_fin(struct sock *meta_sk) +{ + struct sk_buff *skb, *tskb = tcp_write_queue_tail(meta_sk); + struct tcp_sock *meta_tp = 
tcp_sk(meta_sk); + int mss_now; + + if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) + meta_tp->mpcb->passive_close = 1; + + /* Optimization, tack on the FIN if we have a queue of + * unsent frames. But be careful about outgoing SACKS + * and IP options. + */ + mss_now = mptcp_current_mss(meta_sk); + + if (tskb) { + TCP_SKB_CB(tskb)->mptcp_flags |= MPTCPHDR_FIN; + TCP_SKB_CB(tskb)->end_seq++; + meta_tp->write_seq++; + } else { + /* Socket is locked, keep trying until memory is available. */ + for (;;) { + skb = alloc_skb_fclone(MAX_TCP_HEADER, + meta_sk->sk_allocation); + if (skb) + break; + yield(); + } + /* Reserve space for headers and prepare control bits. */ + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); + skb_reserve(skb, MAX_TCP_HEADER); + + tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK); + TCP_SKB_CB(skb)->end_seq++; + TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN; + tcp_queue_skb(meta_sk, skb); + } + __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF); +} + +void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct sock *sk; + + if (hlist_empty(&mpcb->conn_list)) + return; + + WARN_ON(meta_tp->send_mp_fclose); + + /* First - select a socket */ + sk = mptcp_select_ack_sock(meta_sk); + + /* May happen if no subflow is in an appropriate state, OR + * we are in infinite mode or about to go there - just send a reset + */ + if (!sk || mptcp_in_infinite_mapping_weak(mpcb)) { + /* tcp_done must be handled with bh disabled */ + if (!in_serving_softirq()) + local_bh_disable(); + + mptcp_sub_force_close_all(mpcb, NULL); + + if (!in_serving_softirq()) + local_bh_enable(); + return; + } + + tcp_mstamp_refresh(meta_tp); + + tcp_sk(sk)->send_mp_fclose = 1; + /** Reset all other subflows */ + + /* tcp_done must be handled with bh disabled */ + if (!in_serving_softirq()) + local_bh_disable(); + + mptcp_sub_force_close_all(mpcb, sk); + + tcp_set_state(sk, TCP_RST_WAIT); + + if (!in_serving_softirq()) + local_bh_enable(); + + tcp_send_ack(sk); + tcp_clear_xmit_timers(sk); + inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto); + + meta_tp->send_mp_fclose = 1; + inet_csk(sk)->icsk_retransmits = 0; + + /* Prevent exp backoff reverting on ICMP dest unreachable */ + inet_csk(sk)->icsk_backoff = 0; + + MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_FASTCLOSETX); +} + +static void mptcp_ack_retransmit_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + struct sk_buff *skb; + + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) + goto out; /* Routing failure or similar */ + + tcp_mstamp_refresh(tp); + + if (tcp_write_timeout(sk)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKRTO); + tp->mptcp->pre_established = 0; + sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); + tp->ops->send_active_reset(sk, GFP_ATOMIC); + goto out; + } + + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (skb == NULL) { + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, + jiffies + icsk->icsk_rto); + return; + } + + /* Reserve space for headers and prepare control bits */ + skb_reserve(skb, MAX_TCP_HEADER); + tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK); + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKRXMIT); + + if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) { + /* Retransmission failed because of local congestion, + * do not backoff. 
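+		 * Re-arm the ACK-timer below with the unchanged icsk_rto
+		 * instead of doubling it.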
+ */ + if (!icsk->icsk_retransmits) + icsk->icsk_retransmits = 1; + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, + jiffies + icsk->icsk_rto); + return; + } + + if (!tp->retrans_stamp) + tp->retrans_stamp = tcp_time_stamp(tp) ? : 1; + + icsk->icsk_retransmits++; + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, + jiffies + icsk->icsk_rto); + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) + __sk_dst_reset(sk); + +out:; +} + +void mptcp_ack_handler(struct timer_list *t) +{ + struct mptcp_tcp_sock *mptcp = from_timer(mptcp, t, mptcp_ack_timer); + struct sock *sk = (struct sock *)mptcp->tp; + struct sock *meta_sk = mptcp_meta_sk(sk); + + bh_lock_sock(meta_sk); + if (sock_owned_by_user(meta_sk)) { + /* Try again later */ + sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer, + jiffies + (HZ / 20)); + goto out_unlock; + } + + if (sk->sk_state == TCP_CLOSE) + goto out_unlock; + if (!tcp_sk(sk)->mptcp->pre_established) + goto out_unlock; + + mptcp_ack_retransmit_timer(sk); + + sk_mem_reclaim(sk); + +out_unlock: + bh_unlock_sock(meta_sk); + sock_put(sk); +} + +/* Similar to tcp_retransmit_skb + * + * The diff is that we handle the retransmission-stats (retrans_stamp) at the + * meta-level. + */ +int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct sock *subsk; + unsigned int limit, mss_now; + int err = -1; + + WARN_ON(TCP_SKB_CB(skb)->sacked); + + /* Do not sent more than we queued. 1/4 is reserved for possible + * copying overhead: fragmentation, tunneling, mangling etc. + * + * This is a meta-retransmission thus we check on the meta-socket. + */ + if (refcount_read(&meta_sk->sk_wmem_alloc) > + min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) { + return -EAGAIN; + } + + /* We need to make sure that the retransmitted segment can be sent on a + * subflow right now. If it is too big, it needs to be fragmented. + */ + subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, false); + if (!subsk) { + /* We want to increase icsk_retransmits, thus return 0, so that + * mptcp_meta_retransmit_timer enters the desired branch. + */ + err = 0; + goto failed; + } + mss_now = tcp_current_mss(subsk); + + /* If the segment was cloned (e.g. a meta retransmission), the header + * must be expanded/copied so that there is no corruption of TSO + * information. + */ + if (skb_unclone(skb, GFP_ATOMIC)) { + err = -ENOMEM; + goto failed; + } + + /* Must have been set by mptcp_write_xmit before */ + BUG_ON(!tcp_skb_pcount(skb)); + + limit = mss_now; + /* skb->len > mss_now is the equivalent of tso_segs > 1 in + * tcp_write_xmit. Otherwise split-point would return 0. + */ + if (skb->len > mss_now && !tcp_urg_mode(meta_tp)) + limit = tcp_mss_split_point(meta_sk, skb, mss_now, + UINT_MAX / mss_now, + TCP_NAGLE_OFF); + + limit = min(limit, tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq); + + if (skb->len > limit && + unlikely(mptcp_fragment(meta_sk, TCP_FRAG_IN_RTX_QUEUE, skb, + limit, GFP_ATOMIC, 0))) + goto failed; + + if (!mptcp_skb_entail(subsk, skb, -1)) + goto failed; + + /* Update global TCP statistics. */ + MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_RETRANSSEGS); + + /* Diff to tcp_retransmit_skb */ + + /* Save stamp of the first retransmit. 
*/ + if (!meta_tp->retrans_stamp) { + tcp_mstamp_refresh(meta_tp); + meta_tp->retrans_stamp = tcp_time_stamp(meta_tp); + } + + __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH); + tcp_update_skb_after_send(meta_tp, skb); + meta_tp->lsndtime = tcp_jiffies32; + + return 0; + +failed: + NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPRETRANSFAIL); + return err; +} + +/* Similar to tcp_retransmit_timer + * + * The diff is that we have to handle retransmissions of the FAST_CLOSE-message + * and that we don't have an srtt estimation at the meta-level. + */ +void mptcp_meta_retransmit_timer(struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct inet_connection_sock *meta_icsk = inet_csk(meta_sk); + int err; + + /* In fallback, retransmission is handled at the subflow-level */ + if (!meta_tp->packets_out || mpcb->infinite_mapping_snd) + return; + + WARN_ON(tcp_rtx_queue_empty(meta_sk)); + + if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) && + !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { + /* Receiver dastardly shrinks window. Our retransmits + * become zero probes, but we should not timeout this + * connection. If the socket is an orphan, time it out, + * we cannot allow such beasts to hang infinitely. + */ + struct inet_sock *meta_inet = inet_sk(meta_sk); + if (meta_sk->sk_family == AF_INET) { + net_dbg_ratelimited("MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", + &meta_inet->inet_daddr, + ntohs(meta_inet->inet_dport), + meta_inet->inet_num, meta_tp->snd_una, + meta_tp->snd_nxt); + } +#if IS_ENABLED(CONFIG_IPV6) + else if (meta_sk->sk_family == AF_INET6) { + net_dbg_ratelimited("MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", + &meta_sk->sk_v6_daddr, + ntohs(meta_inet->inet_dport), + meta_inet->inet_num, meta_tp->snd_una, + meta_tp->snd_nxt); + } +#endif + if (tcp_jiffies32 - meta_tp->rcv_tstamp > TCP_RTO_MAX) { + tcp_write_err(meta_sk); + return; + } + + mptcp_retransmit_skb(meta_sk, tcp_rtx_queue_head(meta_sk)); + goto out_reset_timer; + } + + if (tcp_write_timeout(meta_sk)) + return; + + if (meta_icsk->icsk_retransmits == 0) + NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS); + + meta_icsk->icsk_ca_state = TCP_CA_Loss; + + err = mptcp_retransmit_skb(meta_sk, tcp_rtx_queue_head(meta_sk)); + if (err > 0) { + /* Retransmission failed because of local congestion, + * do not backoff. + */ + if (!meta_icsk->icsk_retransmits) + meta_icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, + min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); + return; + } + + /* Increase the timeout each time we retransmit. Note that + * we do not increase the rtt estimate. rto is initialized + * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests + * that doubling rto each time is the least we can get away with. + * In KA9Q, Karn uses this for the first few times, and then + * goes to quadratic. netBSD doubles, but only goes up to *64, + * and clamps at 1 to 64 sec afterwards. Note that 120 sec is + * defined in the protocol as the maximum possible RTT. I guess + * we'll have to use something other than TCP to talk to the + * University of Mars. + * + * PAWS allows us longer timeouts and large windows, so once + * implemented ftp to mars will work nicely. We will have to fix + * the 120 second clamps though! 
+ */ + meta_icsk->icsk_backoff++; + meta_icsk->icsk_retransmits++; + +out_reset_timer: + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this + * might be increased if the stream oscillates between thin and thick, + * thus the old value might already be too high compared to the value + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without + * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating + * exponential backoff behaviour to avoid continue hammering + * linear-timeout retransmissions into a black hole + */ + if (meta_sk->sk_state == TCP_ESTABLISHED && + (meta_tp->thin_lto || sock_net(meta_sk)->ipv4.sysctl_tcp_thin_linear_timeouts) && + tcp_stream_is_thin(meta_tp) && + meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { + meta_icsk->icsk_backoff = 0; + /* We cannot do the same as in tcp_write_timer because the + * srtt is not set here. + */ + mptcp_set_rto(meta_sk); + } else { + /* Use normal (exponential) backoff */ + meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX); + } + inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX); + + return; +} + +void mptcp_sub_retransmit_timer(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_retransmit_timer(sk); + + if (!tp->fastopen_rsk) { + mptcp_reinject_data(sk, 1); + mptcp_set_rto(sk); + } +} + +/* Modify values to an mptcp-level for the initial window of new subflows */ +void mptcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, + __u32 *rcv_wnd, __u32 *window_clamp, + int wscale_ok, __u8 *rcv_wscale, + __u32 init_rcv_wnd) +{ + const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; + + *window_clamp = mpcb->orig_window_clamp; + __space = tcp_win_from_space(sk, mpcb->orig_sk_rcvbuf); + + tcp_select_initial_window(sk, __space, mss, rcv_wnd, window_clamp, + wscale_ok, rcv_wscale, init_rcv_wnd); +} + +static inline u64 mptcp_calc_rate(const struct sock *meta_sk, unsigned int mss, + unsigned int (*mss_cb)(struct sock *sk)) +{ + struct mptcp_tcp_sock *mptcp; + u64 rate = 0; + + mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + struct tcp_sock *tp = tcp_sk(sk); + int this_mss; + u64 this_rate; + + if (!mptcp_sk_can_send(sk)) + continue; + + /* Do not consider subflows without a RTT estimation yet + * otherwise this_rate >>> rate. + */ + if (unlikely(!tp->srtt_us)) + continue; + + this_mss = mss_cb(sk); + + /* If this_mss is smaller than mss, it means that a segment will + * be splitted in two (or more) when pushed on this subflow. If + * you consider that mss = 1428 and this_mss = 1420 then two + * segments will be generated: a 1420-byte and 8-byte segment. + * The latter will introduce a large overhead as for a single + * data segment 2 slots will be used in the congestion window. + * Therefore reducing by ~2 the potential throughput of this + * subflow. Indeed, 1428 will be send while 2840 could have been + * sent if mss == 1420 reducing the throughput by 2840 / 1428. + * + * The following algorithm take into account this overhead + * when computing the potential throughput that MPTCP can + * achieve when generating mss-byte segments. 
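+		 * (With the 1428/1420 example above, the ratio term of the
+		 *  formula below evaluates to 1428 / (2 * 1420) ~= 0.5, i.e.
+		 *  that subflow is credited with only about half of its raw
+		 *  cwnd/srtt rate.)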
+ * + * The formulae is the following: + * \sum_{\forall sub} ratio * \frac{mss * cwnd_sub}{rtt_sub} + * Where ratio is computed as follows: + * \frac{mss}{\ceil{mss / mss_sub} * mss_sub} + * + * ratio gives the reduction factor of the theoretical + * throughput a subflow can achieve if MPTCP uses a specific + * MSS value. + */ + this_rate = div64_u64((u64)mss * mss * (USEC_PER_SEC << 3) * + max(tp->snd_cwnd, tp->packets_out), + (u64)tp->srtt_us * + DIV_ROUND_UP(mss, this_mss) * this_mss); + rate += this_rate; + } + + return rate; +} + +static unsigned int __mptcp_current_mss(const struct sock *meta_sk, + unsigned int (*mss_cb)(struct sock *sk)) +{ + struct mptcp_tcp_sock *mptcp; + unsigned int mss = 0; + u64 rate = 0; + + mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + int this_mss; + u64 this_rate; + + if (!mptcp_sk_can_send(sk)) + continue; + + this_mss = mss_cb(sk); + + /* Same mss values will produce the same throughput. */ + if (this_mss == mss) + continue; + + /* See whether using this mss value can theoretically improve + * the performances. + */ + this_rate = mptcp_calc_rate(meta_sk, this_mss, mss_cb); + if (this_rate >= rate) { + mss = this_mss; + rate = this_rate; + } + } + + return mss; +} + +unsigned int mptcp_current_mss(struct sock *meta_sk) +{ + unsigned int mss = __mptcp_current_mss(meta_sk, tcp_current_mss); + + /* If no subflow is available, we take a default-mss from the + * meta-socket. + */ + return !mss ? tcp_current_mss(meta_sk) : mss; +} + +static unsigned int mptcp_select_size_mss(struct sock *sk) +{ + return tcp_sk(sk)->mss_cache; +} + +int mptcp_select_size(const struct sock *meta_sk, bool first_skb, bool zc) +{ + unsigned int mss = __mptcp_current_mss(meta_sk, mptcp_select_size_mss); + + if (mptcp_can_sg(meta_sk)) { + if (zc) + return 0; + + if (!tcp_sk(meta_sk)->mpcb->dss_csum) { + mss = linear_payload_sz(first_skb); + } else { + int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + + if (mss >= pgbreak && + mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) + mss = pgbreak; + } + } + + return !mss ? 
tcp_sk(meta_sk)->mss_cache : mss; +} + +int mptcp_check_snd_buf(const struct tcp_sock *tp) +{ + const struct mptcp_tcp_sock *mptcp; + u32 rtt_max = tp->srtt_us; + u64 bw_est; + + if (!tp->srtt_us) + return tp->reordering + 1; + + mptcp_for_each_sub(tp->mpcb, mptcp) { + const struct sock *sk = mptcp_to_sock(mptcp); + + if (!mptcp_sk_can_send(sk)) + continue; + + if (rtt_max < tcp_sk(sk)->srtt_us) + rtt_max = tcp_sk(sk)->srtt_us; + } + + bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16, + (u64)tp->srtt_us); + + return max_t(unsigned int, (u32)(bw_est >> 16), + tp->reordering + 1); +} + +unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now, + int large_allowed) +{ + u32 xmit_size_goal = 0; + + if (large_allowed && !tcp_sk(meta_sk)->mpcb->dss_csum) { + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + int this_size_goal; + + if (!mptcp_sk_can_send(sk)) + continue; + + this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1); + if (this_size_goal > xmit_size_goal) + xmit_size_goal = this_size_goal; + } + } + + return max(xmit_size_goal, mss_now); +} + diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c new file mode 100644 index 000000000000..0e24e0aaa70a --- /dev/null +++ b/net/mptcp/mptcp_pm.c @@ -0,0 +1,226 @@ +/* + * MPTCP implementation - MPTCP-subflow-management + * + * Initial Design & Implementation: + * Sébastien Barré + * + * Current Maintainer & Author: + * Christoph Paasch + * + * Additional authors: + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien Duchêne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include +#include + +static DEFINE_SPINLOCK(mptcp_pm_list_lock); +static LIST_HEAD(mptcp_pm_list); + +static int mptcp_default_id(const struct sock *meta_sk, sa_family_t family, + union inet_addr *addr, bool *low_prio) +{ + return 0; +} + +struct mptcp_pm_ops mptcp_pm_default = { + .get_local_id = mptcp_default_id, /* We do not care */ + .name = "default", + .owner = THIS_MODULE, +}; + +static struct mptcp_pm_ops *mptcp_pm_find(const char *name) +{ + struct mptcp_pm_ops *e; + + list_for_each_entry_rcu(e, &mptcp_pm_list, list) { + if (strcmp(e->name, name) == 0) + return e; + } + + return NULL; +} + +int mptcp_register_path_manager(struct mptcp_pm_ops *pm) +{ + int ret = 0; + + if (!pm->get_local_id) + return -EINVAL; + + spin_lock(&mptcp_pm_list_lock); + if (mptcp_pm_find(pm->name)) { + pr_notice("%s already registered\n", pm->name); + ret = -EEXIST; + } else { + list_add_tail_rcu(&pm->list, &mptcp_pm_list); + pr_info("%s registered\n", pm->name); + } + spin_unlock(&mptcp_pm_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(mptcp_register_path_manager); + +void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm) +{ + spin_lock(&mptcp_pm_list_lock); + list_del_rcu(&pm->list); + spin_unlock(&mptcp_pm_list_lock); + + /* Wait for outstanding readers to complete before the + * module gets removed entirely. + * + * A try_module_get() should fail by now as our module is + * in "going" state since no refs are held anymore and + * module_exit() handler being called. 
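+	 * The synchronize_rcu() below therefore only has to wait for the
+	 * rcu_read_lock() sections that walk mptcp_pm_list, e.g. in
+	 * mptcp_init_path_manager().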
+ */ + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager); + +void mptcp_get_default_path_manager(char *name) +{ + struct mptcp_pm_ops *pm; + + BUG_ON(list_empty(&mptcp_pm_list)); + + rcu_read_lock(); + pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list); + strncpy(name, pm->name, MPTCP_PM_NAME_MAX); + rcu_read_unlock(); +} + +int mptcp_set_default_path_manager(const char *name) +{ + struct mptcp_pm_ops *pm; + int ret = -ENOENT; + + spin_lock(&mptcp_pm_list_lock); + pm = mptcp_pm_find(name); +#ifdef CONFIG_MODULES + if (!pm && capable(CAP_NET_ADMIN)) { + spin_unlock(&mptcp_pm_list_lock); + + request_module("mptcp_%s", name); + spin_lock(&mptcp_pm_list_lock); + pm = mptcp_pm_find(name); + } +#endif + + if (pm) { + list_move(&pm->list, &mptcp_pm_list); + ret = 0; + } else { + pr_info("%s is not available\n", name); + } + spin_unlock(&mptcp_pm_list_lock); + + return ret; +} + +static struct mptcp_pm_ops *__mptcp_pm_find_autoload(const char *name) +{ + struct mptcp_pm_ops *pm = mptcp_pm_find(name); +#ifdef CONFIG_MODULES + if (!pm && capable(CAP_NET_ADMIN)) { + rcu_read_unlock(); + request_module("mptcp_%s", name); + rcu_read_lock(); + pm = mptcp_pm_find(name); + } +#endif + return pm; +} + +void mptcp_init_path_manager(struct mptcp_cb *mpcb) +{ + struct mptcp_pm_ops *pm; + struct sock *meta_sk = mpcb->meta_sk; + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + + rcu_read_lock(); + /* if path manager was set using socket option */ + if (meta_tp->mptcp_pm_setsockopt) { + pm = __mptcp_pm_find_autoload(meta_tp->mptcp_pm_name); + if (pm && try_module_get(pm->owner)) { + mpcb->pm_ops = pm; + goto out; + } + } + + list_for_each_entry_rcu(pm, &mptcp_pm_list, list) { + if (try_module_get(pm->owner)) { + mpcb->pm_ops = pm; + break; + } + } +out: + rcu_read_unlock(); +} + +/* Change path manager for socket */ +int mptcp_set_path_manager(struct sock *sk, const char *name) +{ + struct mptcp_pm_ops *pm; + int err = 0; + + rcu_read_lock(); + pm = __mptcp_pm_find_autoload(name); + + if (!pm) { + err = -ENOENT; + } else if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + err = -EPERM; + } else { + strcpy(tcp_sk(sk)->mptcp_pm_name, name); + tcp_sk(sk)->mptcp_pm_setsockopt = 1; + } + rcu_read_unlock(); + + return err; +} + +/* Manage refcounts on socket close. */ +void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb) +{ + module_put(mpcb->pm_ops->owner); +} + +/* Fallback to the default path-manager. */ +void mptcp_fallback_default(struct mptcp_cb *mpcb) +{ + struct mptcp_pm_ops *pm; + + mptcp_cleanup_path_manager(mpcb); + pm = mptcp_pm_find("default"); + + /* Cannot fail - it's the default module */ + try_module_get(pm->owner); + mpcb->pm_ops = pm; +} +EXPORT_SYMBOL_GPL(mptcp_fallback_default); + +/* Set default value from kernel configuration at bootup */ +static int __init mptcp_path_manager_default(void) +{ + return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM); +} +late_initcall(mptcp_path_manager_default); diff --git a/net/mptcp/mptcp_redundant.c b/net/mptcp/mptcp_redundant.c new file mode 100644 index 000000000000..3db4e69acef2 --- /dev/null +++ b/net/mptcp/mptcp_redundant.c @@ -0,0 +1,395 @@ +/* + * MPTCP Scheduler to reduce latency and jitter. + * + * This scheduler sends all packets redundantly on all available subflows. 
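+ * This trades aggregate throughput for latency: meta-level data counts as
+ * delivered as soon as the fastest subflow gets a copy through.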
+ * + * Initial Design & Implementation: + * Tobias Erbshaeusser + * Alexander Froemmgen + * + * Initial corrections & modifications: + * Christian Pinedo + * Igor Lopez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +/* Struct to store the data of a single subflow */ +struct redsched_priv { + /* The skb or NULL */ + struct sk_buff *skb; + /* Start/end sequence number of the skb. This number should be checked + * to be valid before the skb field is used + */ + u32 skb_start_seq; + u32 skb_end_seq; +}; + +/* Struct to store the data of the control block */ +struct redsched_cb { + /* The next subflow where a skb should be sent or NULL */ + struct tcp_sock *next_subflow; +}; + +/* Returns the socket data from a given subflow socket */ +static struct redsched_priv *redsched_get_priv(struct tcp_sock *tp) +{ + return (struct redsched_priv *)&tp->mptcp->mptcp_sched[0]; +} + +/* Returns the control block data from a given meta socket */ +static struct redsched_cb *redsched_get_cb(struct tcp_sock *tp) +{ + return (struct redsched_cb *)&tp->mpcb->mptcp_sched[0]; +} + +static bool redsched_get_active_valid_sks(struct sock *meta_sk) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct mptcp_tcp_sock *mptcp; + int active_valid_sks = 0; + + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + + if (subflow_is_active((struct tcp_sock *)sk) && + !mptcp_is_def_unavailable(sk)) + active_valid_sks++; + } + + return active_valid_sks; +} + +static bool redsched_use_subflow(struct sock *meta_sk, + int active_valid_sks, + struct tcp_sock *tp, + struct sk_buff *skb) +{ + if (!skb || !mptcp_is_available((struct sock *)tp, skb, false)) + return false; + + if (TCP_SKB_CB(skb)->path_mask != 0) + return subflow_is_active(tp); + + if (TCP_SKB_CB(skb)->path_mask == 0) { + if (active_valid_sks == -1) + active_valid_sks = redsched_get_active_valid_sks(meta_sk); + + if (subflow_is_backup(tp) && active_valid_sks > 0) + return false; + else + return true; + } + + return false; +} + +#define mptcp_entry_next_rcu(__mptcp) \ + hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ + &(__mptcp)->node)), struct mptcp_tcp_sock, node) + +static void redsched_update_next_subflow(struct tcp_sock *tp, + struct redsched_cb *red_cb) +{ + struct mptcp_tcp_sock *mptcp = mptcp_entry_next_rcu(tp->mptcp); + + if (mptcp) + red_cb->next_subflow = mptcp->tp; + else + red_cb->next_subflow = NULL; +} + +static struct sock *red_get_available_subflow(struct sock *meta_sk, + struct sk_buff *skb, + bool zero_wnd_test) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct redsched_cb *red_cb = redsched_get_cb(meta_tp); + struct tcp_sock *first_tp = red_cb->next_subflow, *tp; + struct mptcp_tcp_sock *mptcp; + int found = 0; + + /* Answer data_fin on same subflow */ + if (meta_sk->sk_shutdown & RCV_SHUTDOWN && + skb && mptcp_is_data_fin(skb)) { + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + + if (tcp_sk(sk)->mptcp->path_index == + mpcb->dfin_path_index && + mptcp_is_available(sk, skb, zero_wnd_test)) + return sk; + } + } + + if (!first_tp && !hlist_empty(&mpcb->conn_list)) { + first_tp = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&mpcb->conn_list)), + struct 
mptcp_tcp_sock, node)->tp; + } + tp = first_tp; + + /* still NULL (no subflow in conn_list?) */ + if (!first_tp) + return NULL; + + /* Search for a subflow to send it. + * + * We want to pick a subflow that is after 'first_tp' in the list of subflows. + * Thus, the first mptcp_for_each_sub()-loop tries to walk the list up + * to the subflow 'tp' and then checks whether any one of the remaining + * ones is eligible to send. + * The second mptcp_for_each-sub()-loop is then iterating from the + * beginning of the list up to 'first_tp'. + */ + mptcp_for_each_sub(mpcb, mptcp) { + /* We go up to the subflow 'tp' and start from there */ + if (tp == mptcp->tp) + found = 1; + + if (!found) + continue; + tp = mptcp->tp; + + if (mptcp_is_available((struct sock *)tp, skb, + zero_wnd_test)) { + redsched_update_next_subflow(tp, red_cb); + return (struct sock *)tp; + } + } + + mptcp_for_each_sub(mpcb, mptcp) { + tp = mptcp->tp; + + if (tp == first_tp) + break; + + if (mptcp_is_available((struct sock *)tp, skb, + zero_wnd_test)) { + redsched_update_next_subflow(tp, red_cb); + return (struct sock *)tp; + } + } + + /* No space */ + return NULL; +} + +/* Corrects the stored skb pointers if they are invalid */ +static void redsched_correct_skb_pointers(struct sock *meta_sk, + struct redsched_priv *red_p) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + + if (red_p->skb && + (!after(red_p->skb_start_seq, meta_tp->snd_una) || + after(red_p->skb_end_seq, meta_tp->snd_nxt))) + red_p->skb = NULL; +} + +/* Returns the next skb from the queue */ +static struct sk_buff *redsched_next_skb_from_queue(struct sk_buff_head *queue, + struct sk_buff *previous, + struct sock *meta_sk) +{ + struct sk_buff *skb; + + if (!previous) + return tcp_rtx_queue_head(meta_sk) ? : skb_peek(queue); + + /* sk_data->skb stores the last scheduled packet for this subflow. + * If sk_data->skb was scheduled but not sent (e.g., due to nagle), + * we have to schedule it again. + * + * For the redundant scheduler, there are two cases: + * 1. sk_data->skb was not sent on another subflow: + * we have to schedule it again to ensure that we do not + * skip this packet. + * 2. sk_data->skb was already sent on another subflow: + * with regard to the redundant semantic, we have to + * schedule it again. However, we keep it simple and ignore it, + * as it was already sent by another subflow. + * This might be changed in the future. + * + * For case 1, send_head is equal previous, as only a single + * packet can be skipped. + */ + if (tcp_send_head(meta_sk) == previous) + return tcp_send_head(meta_sk); + + skb = skb_rb_next(previous); + if (skb) + return skb; + + return tcp_send_head(meta_sk); +} + +static struct sk_buff *mptcp_red_next_segment(struct sock *meta_sk, + int *reinject, + struct sock **subsk, + unsigned int *limit) +{ + struct tcp_sock *meta_tp = tcp_sk(meta_sk); + struct mptcp_cb *mpcb = meta_tp->mpcb; + struct redsched_cb *red_cb = redsched_get_cb(meta_tp); + struct tcp_sock *first_tp = red_cb->next_subflow, *tp; + struct mptcp_tcp_sock *mptcp; + int active_valid_sks = -1; + struct sk_buff *skb; + int found = 0; + + /* As we set it, we have to reset it as well. 
*/ + *limit = 0; + + if (skb_queue_empty(&mpcb->reinject_queue) && + skb_queue_empty(&meta_sk->sk_write_queue) && + tcp_rtx_queue_empty(meta_sk)) + /* Nothing to send */ + return NULL; + + /* First try reinjections */ + skb = skb_peek(&mpcb->reinject_queue); + if (skb) { + *subsk = get_available_subflow(meta_sk, skb, false); + if (!*subsk) + return NULL; + *reinject = 1; + return skb; + } + + /* Then try indistinctly redundant and normal skbs */ + + if (!first_tp && !hlist_empty(&mpcb->conn_list)) { + first_tp = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&mpcb->conn_list)), + struct mptcp_tcp_sock, node)->tp; + } + + /* still NULL (no subflow in conn_list?) */ + if (!first_tp) + return NULL; + + tp = first_tp; + + *reinject = 0; + active_valid_sks = redsched_get_active_valid_sks(meta_sk); + + /* We want to pick a subflow that is after 'first_tp' in the list of subflows. + * Thus, the first mptcp_for_each_sub()-loop tries to walk the list up + * to the subflow 'tp' and then checks whether any one of the remaining + * ones can send a segment. + * The second mptcp_for_each-sub()-loop is then iterating from the + * beginning of the list up to 'first_tp'. + */ + mptcp_for_each_sub(mpcb, mptcp) { + struct redsched_priv *red_p; + + if (tp == mptcp->tp) + found = 1; + + if (!found) + continue; + + tp = mptcp->tp; + + /* Correct the skb pointers of the current subflow */ + red_p = redsched_get_priv(tp); + redsched_correct_skb_pointers(meta_sk, red_p); + + skb = redsched_next_skb_from_queue(&meta_sk->sk_write_queue, + red_p->skb, meta_sk); + if (skb && redsched_use_subflow(meta_sk, active_valid_sks, tp, + skb)) { + red_p->skb = skb; + red_p->skb_start_seq = TCP_SKB_CB(skb)->seq; + red_p->skb_end_seq = TCP_SKB_CB(skb)->end_seq; + redsched_update_next_subflow(tp, red_cb); + *subsk = (struct sock *)tp; + + if (TCP_SKB_CB(skb)->path_mask) + *reinject = -1; + return skb; + } + } + + mptcp_for_each_sub(mpcb, mptcp) { + struct redsched_priv *red_p; + + tp = mptcp->tp; + + if (tp == first_tp) + break; + + /* Correct the skb pointers of the current subflow */ + red_p = redsched_get_priv(tp); + redsched_correct_skb_pointers(meta_sk, red_p); + + skb = redsched_next_skb_from_queue(&meta_sk->sk_write_queue, + red_p->skb, meta_sk); + if (skb && redsched_use_subflow(meta_sk, active_valid_sks, tp, + skb)) { + red_p->skb = skb; + red_p->skb_start_seq = TCP_SKB_CB(skb)->seq; + red_p->skb_end_seq = TCP_SKB_CB(skb)->end_seq; + redsched_update_next_subflow(tp, red_cb); + *subsk = (struct sock *)tp; + + if (TCP_SKB_CB(skb)->path_mask) + *reinject = -1; + return skb; + } + } + + /* Nothing to send */ + return NULL; +} + +static void redsched_release(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct redsched_cb *red_cb = redsched_get_cb(tp); + + /* Check if the next subflow would be the released one. 
If yes correct + * the pointer + */ + if (red_cb->next_subflow == tp) + redsched_update_next_subflow(tp, red_cb); +} + +static struct mptcp_sched_ops mptcp_sched_red = { + .get_subflow = red_get_available_subflow, + .next_segment = mptcp_red_next_segment, + .release = redsched_release, + .name = "redundant", + .owner = THIS_MODULE, +}; + +static int __init red_register(void) +{ + BUILD_BUG_ON(sizeof(struct redsched_priv) > MPTCP_SCHED_SIZE); + BUILD_BUG_ON(sizeof(struct redsched_cb) > MPTCP_SCHED_DATA_SIZE); + + if (mptcp_register_scheduler(&mptcp_sched_red)) + return -1; + + return 0; +} + +static void red_unregister(void) +{ + mptcp_unregister_scheduler(&mptcp_sched_red); +} + +module_init(red_register); +module_exit(red_unregister); + +MODULE_AUTHOR("Tobias Erbshaeusser, Alexander Froemmgen"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("REDUNDANT MPTCP"); +MODULE_VERSION("0.90"); diff --git a/net/mptcp/mptcp_rr.c b/net/mptcp/mptcp_rr.c new file mode 100644 index 000000000000..396e8aaf4762 --- /dev/null +++ b/net/mptcp/mptcp_rr.c @@ -0,0 +1,309 @@ +/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */ + +#include +#include + +static unsigned char num_segments __read_mostly = 1; +module_param(num_segments, byte, 0644); +MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst"); + +static bool cwnd_limited __read_mostly = 1; +module_param(cwnd_limited, bool, 0644); +MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows"); + +struct rrsched_priv { + unsigned char quota; +}; + +static struct rrsched_priv *rrsched_get_priv(const struct tcp_sock *tp) +{ + return (struct rrsched_priv *)&tp->mptcp->mptcp_sched[0]; +} + +/* If the sub-socket sk available to send the skb? */ +static bool mptcp_rr_is_available(const struct sock *sk, const struct sk_buff *skb, + bool zero_wnd_test, bool cwnd_test) +{ + const struct tcp_sock *tp = tcp_sk(sk); + unsigned int space, in_flight; + + /* Set of states for which we are allowed to send data */ + if (!mptcp_sk_can_send(sk)) + return false; + + /* We do not send data on this subflow unless it is + * fully established, i.e. the 4th ack has been received. + */ + if (tp->mptcp->pre_established) + return false; + + if (tp->pf) + return false; + + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { + /* If SACK is disabled, and we got a loss, TCP does not exit + * the loss-state until something above high_seq has been acked. + * (see tcp_try_undo_recovery) + * + * high_seq is the snd_nxt at the moment of the RTO. As soon + * as we have an RTO, we won't push data on the subflow. + * Thus, snd_una can never go beyond high_seq. + */ + if (!tcp_is_reno(tp)) + return false; + else if (tp->snd_una != tp->high_seq) + return false; + } + + if (!tp->mptcp->fully_established) { + /* Make sure that we send in-order data */ + if (skb && tp->mptcp->second_packet && + tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq) + return false; + } + + if (!cwnd_test) + goto zero_wnd_test; + + in_flight = tcp_packets_in_flight(tp); + /* Not even a single spot in the cwnd */ + if (in_flight >= tp->snd_cwnd) + return false; + + /* Now, check if what is queued in the subflow's send-queue + * already fills the cwnd. 
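+	 * space is the unused part of the cwnd in bytes, while
+	 * write_seq - snd_nxt is what is already queued on the subflow
+	 * but not yet sent out.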
+ */ + space = (tp->snd_cwnd - in_flight) * tp->mss_cache; + + if (tp->write_seq - tp->snd_nxt > space) + return false; + +zero_wnd_test: + if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp))) + return false; + + return true; +} + +/* Are we not allowed to reinject this skb on tp? */ +static int mptcp_rr_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb) +{ + /* If the skb has already been enqueued in this sk, try to find + * another one. + */ + return skb && + /* Has the skb already been enqueued into this subsocket? */ + mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask; +} + +/* We just look for any subflow that is available */ +static struct sock *rr_get_available_subflow(struct sock *meta_sk, + struct sk_buff *skb, + bool zero_wnd_test) +{ + const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sock *sk = NULL, *bestsk = NULL, *backupsk = NULL; + struct mptcp_tcp_sock *mptcp; + + /* Answer data_fin on same subflow!!! */ + if (meta_sk->sk_shutdown & RCV_SHUTDOWN && + skb && mptcp_is_data_fin(skb)) { + mptcp_for_each_sub(mpcb, mptcp) { + sk = mptcp_to_sock(mptcp); + if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && + mptcp_rr_is_available(sk, skb, zero_wnd_test, true)) + return sk; + } + } + + /* First, find the best subflow */ + mptcp_for_each_sub(mpcb, mptcp) { + struct tcp_sock *tp; + + sk = mptcp_to_sock(mptcp); + tp = tcp_sk(sk); + + if (!mptcp_rr_is_available(sk, skb, zero_wnd_test, true)) + continue; + + if (mptcp_rr_dont_reinject_skb(tp, skb)) { + backupsk = sk; + continue; + } + + bestsk = sk; + } + + if (bestsk) { + sk = bestsk; + } else if (backupsk) { + /* It has been sent on all subflows once - let's give it a + * chance again by restarting its pathmask. + */ + if (skb) + TCP_SKB_CB(skb)->path_mask = 0; + sk = backupsk; + } + + return sk; +} + +/* Returns the next segment to be sent from the mptcp meta-queue. + * (chooses the reinject queue if any segment is waiting in it, otherwise, + * chooses the normal write queue). + * Sets *@reinject to 1 if the returned segment comes from the + * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk, + * and sets it to -1 if it is a meta-level retransmission to optimize the + * receive-buffer. + */ +static struct sk_buff *__mptcp_rr_next_segment(const struct sock *meta_sk, int *reinject) +{ + const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sk_buff *skb = NULL; + + *reinject = 0; + + /* If we are in fallback-mode, just take from the meta-send-queue */ + if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping) + return tcp_send_head(meta_sk); + + skb = skb_peek(&mpcb->reinject_queue); + + if (skb) + *reinject = 1; + else + skb = tcp_send_head(meta_sk); + return skb; +} + +static struct sk_buff *mptcp_rr_next_segment(struct sock *meta_sk, + int *reinject, + struct sock **subsk, + unsigned int *limit) +{ + const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sock *choose_sk = NULL; + struct mptcp_tcp_sock *mptcp; + struct sk_buff *skb = __mptcp_rr_next_segment(meta_sk, reinject); + unsigned char split = num_segments; + unsigned char iter = 0, full_subs = 0; + + /* As we set it, we have to reset it as well. 
*/ + *limit = 0; + + if (!skb) + return NULL; + + if (*reinject) { + *subsk = rr_get_available_subflow(meta_sk, skb, false); + if (!*subsk) + return NULL; + + return skb; + } + +retry: + + /* First, we look for a subflow who is currently being used */ + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + struct tcp_sock *tp_it = tcp_sk(sk_it); + struct rrsched_priv *rr_p = rrsched_get_priv(tp_it); + + if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited)) + continue; + + iter++; + + /* Is this subflow currently being used? */ + if (rr_p->quota > 0 && rr_p->quota < num_segments) { + split = num_segments - rr_p->quota; + choose_sk = sk_it; + goto found; + } + + /* Or, it's totally unused */ + if (!rr_p->quota) { + split = num_segments; + choose_sk = sk_it; + } + + /* Or, it must then be fully used */ + if (rr_p->quota >= num_segments) + full_subs++; + } + + /* All considered subflows have a full quota, and we considered at + * least one. + */ + if (iter && iter == full_subs) { + /* So, we restart this round by setting quota to 0 and retry + * to find a subflow. + */ + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk_it = mptcp_to_sock(mptcp); + struct tcp_sock *tp_it = tcp_sk(sk_it); + struct rrsched_priv *rr_p = rrsched_get_priv(tp_it); + + if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited)) + continue; + + rr_p->quota = 0; + } + + goto retry; + } + +found: + if (choose_sk) { + unsigned int mss_now; + struct tcp_sock *choose_tp = tcp_sk(choose_sk); + struct rrsched_priv *rr_p = rrsched_get_priv(choose_tp); + + if (!mptcp_rr_is_available(choose_sk, skb, false, true)) + return NULL; + + *subsk = choose_sk; + mss_now = tcp_current_mss(*subsk); + *limit = split * mss_now; + + if (skb->len > mss_now) + rr_p->quota += DIV_ROUND_UP(skb->len, mss_now); + else + rr_p->quota++; + + return skb; + } + + return NULL; +} + +static struct mptcp_sched_ops mptcp_sched_rr = { + .get_subflow = rr_get_available_subflow, + .next_segment = mptcp_rr_next_segment, + .name = "roundrobin", + .owner = THIS_MODULE, +}; + +static int __init rr_register(void) +{ + BUILD_BUG_ON(sizeof(struct rrsched_priv) > MPTCP_SCHED_SIZE); + + if (mptcp_register_scheduler(&mptcp_sched_rr)) + return -1; + + return 0; +} + +static void rr_unregister(void) +{ + mptcp_unregister_scheduler(&mptcp_sched_rr); +} + +module_init(rr_register); +module_exit(rr_unregister); + +MODULE_AUTHOR("Christoph Paasch"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ROUNDROBIN MPTCP"); +MODULE_VERSION("0.89"); diff --git a/net/mptcp/mptcp_sched.c b/net/mptcp/mptcp_sched.c new file mode 100644 index 000000000000..9ae5deaddd6f --- /dev/null +++ b/net/mptcp/mptcp_sched.c @@ -0,0 +1,675 @@ +/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */ + +#include +#include +#include +#include + +static DEFINE_SPINLOCK(mptcp_sched_list_lock); +static LIST_HEAD(mptcp_sched_list); + +struct defsched_priv { + u32 last_rbuf_opti; +}; + +static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp) +{ + return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0]; +} + +bool mptcp_is_def_unavailable(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + /* Set of states for which we are allowed to send data */ + if (!mptcp_sk_can_send(sk)) + return true; + + /* We do not send data on this subflow unless it is + * fully established, i.e. the 4th ack has been received. 
+ */ + if (tp->mptcp->pre_established) + return true; + + if (tp->pf) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(mptcp_is_def_unavailable); + +/* estimate number of segments currently in flight + unsent in + * the subflow socket. + */ +static int mptcp_subflow_queued(struct sock *sk, u32 max_tso_segs) +{ + const struct tcp_sock *tp = tcp_sk(sk); + unsigned int queued; + + /* estimate the max number of segments in the write queue + * this is an overestimation, avoiding to iterate over the queue + * to make a better estimation. + * Having only one skb in the queue however might trigger tso deferral, + * delaying the sending of a tso segment in the hope that skb_entail + * will append more data to the skb soon. + * Therefore, in the case only one skb is in the queue, we choose to + * potentially underestimate, risking to schedule one skb too many onto + * the subflow rather than not enough. + */ + if (sk->sk_write_queue.qlen > 1) + queued = sk->sk_write_queue.qlen * max_tso_segs; + else + queued = sk->sk_write_queue.qlen; + + return queued + tcp_packets_in_flight(tp); +} + +static bool mptcp_is_temp_unavailable(struct sock *sk, + const struct sk_buff *skb, + bool zero_wnd_test) +{ + const struct tcp_sock *tp = tcp_sk(sk); + unsigned int mss_now; + + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { + /* If SACK is disabled, and we got a loss, TCP does not exit + * the loss-state until something above high_seq has been + * acked. (see tcp_try_undo_recovery) + * + * high_seq is the snd_nxt at the moment of the RTO. As soon + * as we have an RTO, we won't push data on the subflow. + * Thus, snd_una can never go beyond high_seq. + */ + if (!tcp_is_reno(tp)) + return true; + else if (tp->snd_una != tp->high_seq) + return true; + } + + if (!tp->mptcp->fully_established) { + /* Make sure that we send in-order data */ + if (skb && tp->mptcp->second_packet && + tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq) + return true; + } + + mss_now = tcp_current_mss(sk); + + /* Not even a single spot in the cwnd */ + if (mptcp_subflow_queued(sk, tcp_tso_segs(sk, mss_now)) >= tp->snd_cwnd) + return true; + + if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp))) + return true; + + /* Don't send on this subflow if we bypass the allowed send-window at + * the per-subflow level. Similar to tcp_snd_wnd_test, but manually + * calculated end_seq (because here at this point end_seq is still at + * the meta-level). + */ + if (skb && zero_wnd_test && + after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp))) + return true; + + return false; +} + +/* Is the sub-socket sk available to send the skb? */ +bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb, + bool zero_wnd_test) +{ + return !mptcp_is_def_unavailable(sk) && + !mptcp_is_temp_unavailable(sk, skb, zero_wnd_test); +} +EXPORT_SYMBOL_GPL(mptcp_is_available); + +/* Are we not allowed to reinject this skb on tp? */ +static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb) +{ + /* If the skb has already been enqueued in this sk, try to find + * another one. + */ + return skb && + /* Has the skb already been enqueued into this subsocket? 
*/ + mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask; +} + +bool subflow_is_backup(const struct tcp_sock *tp) +{ + return tp->mptcp->rcv_low_prio || tp->mptcp->low_prio; +} +EXPORT_SYMBOL_GPL(subflow_is_backup); + +bool subflow_is_active(const struct tcp_sock *tp) +{ + return !tp->mptcp->rcv_low_prio && !tp->mptcp->low_prio; +} +EXPORT_SYMBOL_GPL(subflow_is_active); + +/* Generic function to iterate over used and unused subflows and to select the + * best one + */ +static struct sock +*get_subflow_from_selectors(struct mptcp_cb *mpcb, struct sk_buff *skb, + bool (*selector)(const struct tcp_sock *), + bool zero_wnd_test, bool *force) +{ + struct sock *bestsk = NULL; + u32 min_srtt = 0xffffffff; + bool found_unused = false; + bool found_unused_una = false; + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(mpcb, mptcp) { + struct sock *sk = mptcp_to_sock(mptcp); + struct tcp_sock *tp = tcp_sk(sk); + bool unused = false; + + /* First, we choose only the wanted sks */ + if (!(*selector)(tp)) + continue; + + if (!mptcp_dont_reinject_skb(tp, skb)) + unused = true; + else if (found_unused) + /* If a unused sk was found previously, we continue - + * no need to check used sks anymore. + */ + continue; + + if (mptcp_is_def_unavailable(sk)) + continue; + + if (mptcp_is_temp_unavailable(sk, skb, zero_wnd_test)) { + if (unused) + found_unused_una = true; + continue; + } + + if (unused) { + if (!found_unused) { + /* It's the first time we encounter an unused + * sk - thus we reset the bestsk (which might + * have been set to a used sk). + */ + min_srtt = 0xffffffff; + bestsk = NULL; + } + found_unused = true; + } + + if (tp->srtt_us < min_srtt) { + min_srtt = tp->srtt_us; + bestsk = sk; + } + } + + if (bestsk) { + /* The force variable is used to mark the returned sk as + * previously used or not-used. + */ + if (found_unused) + *force = true; + else + *force = false; + } else { + /* The force variable is used to mark if there are temporally + * unavailable not-used sks. + */ + if (found_unused_una) + *force = true; + else + *force = false; + } + + return bestsk; +} + +/* This is the scheduler. This function decides on which flow to send + * a given MSS. If all subflows are found to be busy, NULL is returned + * The flow is selected based on the shortest RTT. + * If all paths have full cong windows, we simply return NULL. + * + * Additionally, this function is aware of the backup-subflows. + */ +struct sock *get_available_subflow(struct sock *meta_sk, struct sk_buff *skb, + bool zero_wnd_test) +{ + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; + struct sock *sk; + bool looping = false, force; + + /* Answer data_fin on same subflow!!! 
*/ + if (meta_sk->sk_shutdown & RCV_SHUTDOWN && + skb && mptcp_is_data_fin(skb)) { + struct mptcp_tcp_sock *mptcp; + + mptcp_for_each_sub(mpcb, mptcp) { + sk = mptcp_to_sock(mptcp); + + if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index && + mptcp_is_available(sk, skb, zero_wnd_test)) + return sk; + } + } + + /* Find the best subflow */ +restart: + sk = get_subflow_from_selectors(mpcb, skb, &subflow_is_active, + zero_wnd_test, &force); + if (force) + /* one unused active sk or one NULL sk when there is at least + * one temporally unavailable unused active sk + */ + return sk; + + sk = get_subflow_from_selectors(mpcb, skb, &subflow_is_backup, + zero_wnd_test, &force); + if (!force && skb) { + /* one used backup sk or one NULL sk where there is no one + * temporally unavailable unused backup sk + * + * the skb passed through all the available active and backups + * sks, so clean the path mask + */ + TCP_SKB_CB(skb)->path_mask = 0; + + if (!looping) { + looping = true; + goto restart; + } + } + return sk; +} +EXPORT_SYMBOL_GPL(get_available_subflow); + +static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal) +{ + struct sock *meta_sk; + const struct tcp_sock *tp = tcp_sk(sk); + struct mptcp_tcp_sock *mptcp; + struct sk_buff *skb_head; + struct defsched_priv *def_p = defsched_get_priv(tp); + + meta_sk = mptcp_meta_sk(sk); + skb_head = tcp_rtx_queue_head(meta_sk); + + if (!skb_head) + return NULL; + + /* If penalization is optional (coming from mptcp_next_segment() and + * We are not send-buffer-limited we do not penalize. The retransmission + * is just an optimization to fix the idle-time due to the delay before + * we wake up the application. + */ + if (!penal && sk_stream_memory_free(meta_sk)) + goto retrans; + + /* Only penalize again after an RTT has elapsed */ + if (tcp_jiffies32 - def_p->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3)) + goto retrans; + + /* Half the cwnd of the slow flows */ + mptcp_for_each_sub(tp->mpcb, mptcp) { + struct tcp_sock *tp_it = mptcp->tp; + + if (tp_it != tp && + TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { + if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) { + u32 prior_cwnd = tp_it->snd_cwnd; + + tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U); + + /* If in slow start, do not reduce the ssthresh */ + if (prior_cwnd >= tp_it->snd_ssthresh) + tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U); + + def_p->last_rbuf_opti = tcp_jiffies32; + } + } + } + +retrans: + + /* Segment not yet injected into this path? Take it!!! */ + if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) { + bool do_retrans = false; + mptcp_for_each_sub(tp->mpcb, mptcp) { + struct tcp_sock *tp_it = mptcp->tp; + + if (tp_it != tp && + TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) { + if (tp_it->snd_cwnd <= 4) { + do_retrans = true; + break; + } + + if (4 * tp->srtt_us >= tp_it->srtt_us) { + do_retrans = false; + break; + } else { + do_retrans = true; + } + } + } + + if (do_retrans && mptcp_is_available(sk, skb_head, false)) { + trace_mptcp_retransmit(sk, skb_head); + return skb_head; + } + } + return NULL; +} + +/* Returns the next segment to be sent from the mptcp meta-queue. + * (chooses the reinject queue if any segment is waiting in it, otherwise, + * chooses the normal write queue). + * Sets *@reinject to 1 if the returned segment comes from the + * reinject queue. 
+ * Sets it to 0 if it is the regular send-head of the meta-sk,
+ * and sets it to -1 if it is a meta-level retransmission to optimize the
+ * receive-buffer.
+ */
+static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject)
+{
+	const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+	struct sk_buff *skb = NULL;
+
+	*reinject = 0;
+
+	/* If we are in fallback-mode, just take from the meta-send-queue */
+	if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
+		return tcp_send_head(meta_sk);
+
+	skb = skb_peek(&mpcb->reinject_queue);
+
+	if (skb) {
+		*reinject = 1;
+	} else {
+		skb = tcp_send_head(meta_sk);
+
+		if (!skb && meta_sk->sk_socket &&
+		    test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
+		    sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
+			struct sock *subsk;
+
+			/* meta is send buffer limited */
+			tcp_chrono_start(meta_sk, TCP_CHRONO_SNDBUF_LIMITED);
+
+			subsk = get_available_subflow(meta_sk, NULL, false);
+			if (!subsk)
+				return NULL;
+
+			skb = mptcp_rcv_buf_optimization(subsk, 0);
+			if (skb)
+				*reinject = -1;
+			else
+				tcp_chrono_start(subsk,
+						 TCP_CHRONO_SNDBUF_LIMITED);
+		}
+	}
+	return skb;
+}
+
+static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
+					  int *reinject,
+					  struct sock **subsk,
+					  unsigned int *limit)
+{
+	struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
+	unsigned int mss_now;
+	u32 max_len, gso_max_segs, max_segs, max_tso_segs, window;
+	struct tcp_sock *subtp;
+	int queued;
+
+	/* As we set it, we have to reset it as well. */
+	*limit = 0;
+
+	if (!skb)
+		return NULL;
+
+	*subsk = get_available_subflow(meta_sk, skb, false);
+	if (!*subsk)
+		return NULL;
+
+	subtp = tcp_sk(*subsk);
+	mss_now = tcp_current_mss(*subsk);
+
+	if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) {
+		/* an active flow is selected, but segment will not be sent due
+		 * to no more space in send window
+		 * this means the meta is receive window limited
+		 * the subflow might also be, if we have nothing to reinject
+		 */
+		tcp_chrono_start(meta_sk, TCP_CHRONO_RWND_LIMITED);
+		skb = mptcp_rcv_buf_optimization(*subsk, 1);
+		if (skb)
+			*reinject = -1;
+		else
+			return NULL;
+	}
+
+	if (!*reinject) {
+		/* this will stop any other chronos on the meta */
+		tcp_chrono_start(meta_sk, TCP_CHRONO_BUSY);
+	}
+
+	/* No splitting required, as we will only send one single segment */
+	if (skb->len <= mss_now)
+		return skb;
+
+	max_tso_segs = tcp_tso_segs(*subsk, tcp_current_mss(*subsk));
+	queued = mptcp_subflow_queued(*subsk, max_tso_segs);
+
+	/* this condition should already have been established in
+	 * mptcp_is_temp_unavailable when selecting available flows
+	 */
+	WARN_ONCE(subtp->snd_cwnd <= queued, "Selected subflow no cwnd room");
+
+	gso_max_segs = (*subsk)->sk_gso_max_segs;
+	if (!gso_max_segs) /* No gso supported on the subflow's NIC */
+		gso_max_segs = 1;
+
+	max_segs = min_t(unsigned int, subtp->snd_cwnd - queued, gso_max_segs);
+	if (!max_segs)
+		return NULL;
+
+	/* if there is room for a segment, schedule up to a complete TSO
+	 * segment to avoid TSO splitting. Even if it is more than allowed by
+	 * the congestion window.
+	 */
+	max_segs = max_t(unsigned int, max_tso_segs, max_segs);
+
+	max_len = min(mss_now * max_segs, skb->len);
+
+	window = tcp_wnd_end(subtp) - subtp->write_seq;
+
+	/* max_len now also respects the announced receive-window */
+	max_len = min(max_len, window);
+
+	*limit = max_len;
+
+	return skb;
+}
+
+static void defsched_init(struct sock *sk)
+{
+	struct defsched_priv *def_p = defsched_get_priv(tcp_sk(sk));
+
+	def_p->last_rbuf_opti = tcp_jiffies32;
+}
+
+struct mptcp_sched_ops mptcp_sched_default = {
+	.get_subflow = get_available_subflow,
+	.next_segment = mptcp_next_segment,
+	.init = defsched_init,
+	.name = "default",
+	.owner = THIS_MODULE,
+};
+
+static struct mptcp_sched_ops *mptcp_sched_find(const char *name)
+{
+	struct mptcp_sched_ops *e;
+
+	list_for_each_entry_rcu(e, &mptcp_sched_list, list) {
+		if (strcmp(e->name, name) == 0)
+			return e;
+	}
+
+	return NULL;
+}
+
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
+{
+	int ret = 0;
+
+	if (!sched->get_subflow || !sched->next_segment)
+		return -EINVAL;
+
+	spin_lock(&mptcp_sched_list_lock);
+	if (mptcp_sched_find(sched->name)) {
+		pr_notice("%s already registered\n", sched->name);
+		ret = -EEXIST;
+	} else {
+		list_add_tail_rcu(&sched->list, &mptcp_sched_list);
+		pr_info("%s registered\n", sched->name);
+	}
+	spin_unlock(&mptcp_sched_list_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mptcp_register_scheduler);
+
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
+{
+	spin_lock(&mptcp_sched_list_lock);
+	list_del_rcu(&sched->list);
+	spin_unlock(&mptcp_sched_list_lock);
+
+	/* Wait for outstanding readers to complete before the
+	 * module gets removed entirely.
+	 *
+	 * A try_module_get() should fail by now as our module is
+	 * in "going" state since no refs are held anymore and
+	 * the module_exit() handler is being called.
+	 */
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler);
+
+void mptcp_get_default_scheduler(char *name)
+{
+	struct mptcp_sched_ops *sched;
+
+	BUG_ON(list_empty(&mptcp_sched_list));
+
+	rcu_read_lock();
+	sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list);
+	strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX);
+	rcu_read_unlock();
+}
+
+int mptcp_set_default_scheduler(const char *name)
+{
+	struct mptcp_sched_ops *sched;
+	int ret = -ENOENT;
+
+	spin_lock(&mptcp_sched_list_lock);
+	sched = mptcp_sched_find(name);
+#ifdef CONFIG_MODULES
+	if (!sched && capable(CAP_NET_ADMIN)) {
+		spin_unlock(&mptcp_sched_list_lock);
+
+		request_module("mptcp_%s", name);
+		spin_lock(&mptcp_sched_list_lock);
+		sched = mptcp_sched_find(name);
+	}
+#endif
+
+	if (sched) {
+		list_move(&sched->list, &mptcp_sched_list);
+		ret = 0;
+	} else {
+		pr_info("%s is not available\n", name);
+	}
+	spin_unlock(&mptcp_sched_list_lock);
+
+	return ret;
+}
+
+/* Must be called with rcu lock held */
+static struct mptcp_sched_ops *__mptcp_sched_find_autoload(const char *name)
+{
+	struct mptcp_sched_ops *sched = mptcp_sched_find(name);
+#ifdef CONFIG_MODULES
+	if (!sched && capable(CAP_NET_ADMIN)) {
+		rcu_read_unlock();
+		request_module("mptcp_%s", name);
+		rcu_read_lock();
+		sched = mptcp_sched_find(name);
+	}
+#endif
+	return sched;
+}
+
+void mptcp_init_scheduler(struct mptcp_cb *mpcb)
+{
+	struct mptcp_sched_ops *sched;
+	struct sock *meta_sk = mpcb->meta_sk;
+	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+
+	rcu_read_lock();
+	/* if scheduler was set using socket option */
+	if (meta_tp->mptcp_sched_setsockopt) {
+		sched = __mptcp_sched_find_autoload(meta_tp->mptcp_sched_name);
+		if (sched && try_module_get(sched->owner)) {
+			mpcb->sched_ops = sched;
+			goto out;
+		}
+	}
+
+	list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
+		if (try_module_get(sched->owner)) {
+			mpcb->sched_ops = sched;
+			break;
+		}
+	}
+out:
+	rcu_read_unlock();
+}
+
+/* Change scheduler for socket */
+int mptcp_set_scheduler(struct sock *sk, const char *name)
+{
+	struct mptcp_sched_ops *sched;
+	int err = 0;
+
+	rcu_read_lock();
+	sched = __mptcp_sched_find_autoload(name);
+
+	if (!sched) {
+		err = -ENOENT;
+	} else if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+		err = -EPERM;
+	} else {
+		strcpy(tcp_sk(sk)->mptcp_sched_name, name);
+		tcp_sk(sk)->mptcp_sched_setsockopt = 1;
+	}
+	rcu_read_unlock();
+
+	return err;
+}
+
+/* Manage refcounts on socket close. */
+void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb)
+{
+	module_put(mpcb->sched_ops->owner);
+}
+
+/* Set default value from kernel configuration at bootup */
+static int __init mptcp_scheduler_default(void)
+{
+	BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE);
+
+	return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED);
+}
+late_initcall(mptcp_scheduler_default);
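(Aside, not part of the patch: the mptcp_sched_ops interface above is what an out-of-tree scheduler would build on. A minimal sketch follows; the module name "example" and both callbacks are hypothetical, get_subflow simply defers to the exported get_available_subflow(), and next_segment is only a stub a real scheduler would have to fill in.)

	/* Illustrative sketch only: skeleton of an out-of-tree MPTCP scheduler
	 * built on the mptcp_sched_ops API added above. Names are made up.
	 */
	#include <linux/module.h>
	#include <net/mptcp.h>

	static struct sock *example_get_subflow(struct sock *meta_sk,
						struct sk_buff *skb,
						bool zero_wnd_test)
	{
		/* Defer to the default RTT-based selection exported above. */
		return get_available_subflow(meta_sk, skb, zero_wnd_test);
	}

	static struct sk_buff *example_next_segment(struct sock *meta_sk,
						    int *reinject,
						    struct sock **subsk,
						    unsigned int *limit)
	{
		/* Stub: a real scheduler picks a segment and a subflow here. */
		*reinject = 0;
		*limit = 0;
		*subsk = NULL;
		return NULL;
	}

	static struct mptcp_sched_ops mptcp_sched_example = {
		.get_subflow	= example_get_subflow,
		.next_segment	= example_next_segment,
		.name		= "example",
		.owner		= THIS_MODULE,
	};

	static int __init example_register(void)
	{
		return mptcp_register_scheduler(&mptcp_sched_example);
	}

	static void __exit example_unregister(void)
	{
		mptcp_unregister_scheduler(&mptcp_sched_example);
	}

	module_init(example_register);
	module_exit(example_unregister);
	MODULE_LICENSE("GPL");

(Once registered, such a scheduler can be made the default via mptcp_set_default_scheduler("example"); as shown above, that path also tries request_module("mptcp_example") when the name is not yet on mptcp_sched_list.)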
diff --git a/net/mptcp/mptcp_wvegas.c b/net/mptcp/mptcp_wvegas.c
new file mode 100644
index 000000000000..787ddaab98a2
--- /dev/null
+++ b/net/mptcp/mptcp_wvegas.c
@@ -0,0 +1,271 @@
+/*
+ * MPTCP implementation - WEIGHTED VEGAS
+ *
+ * Algorithm design:
+ * Yu Cao
+ * Mingwei Xu
+ * Xiaoming Fu
+ *
+ * Implementation:
+ * Yu Cao
+ * Enhuan Dong
+ *
+ * Ported to the official MPTCP-kernel:
+ * Christoph Paasch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/mptcp.h>
+#include <linux/module.h>
+#include <linux/tcp.h>
+
+static int initial_alpha = 2;
+static int total_alpha = 10;
+static int gamma = 1;
+
+module_param(initial_alpha, int, 0644);
+MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows");
+module_param(total_alpha, int, 0644);
+MODULE_PARM_DESC(total_alpha, "total alpha for all subflows");
+module_param(gamma, int, 0644);
+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
+
+#define MPTCP_WVEGAS_SCALE 16
+
+/* wVegas variables */
+struct wvegas {
+	u32	beg_snd_nxt;	/* right edge during last RTT */
+	u8	doing_wvegas_now;	/* if true, do wvegas for this RTT */
+
+	u16	cnt_rtt;	/* # of RTTs measured within last RTT */
+	u32	sampled_rtt;	/* cumulative RTTs measured within last RTT (in usec) */
+	u32	base_rtt;	/* the min of all wVegas RTT measurements seen (in usec) */
+
+	u64	instant_rate;	/* cwnd / srtt_us, unit: pkts/us * 2^16 */
+	u64	weight;		/* the ratio of subflow's rate to the total rate, * 2^16 */
+	int	alpha;		/* alpha for each subflow */
+
+	u32	queue_delay;	/* queue delay */
+};
+
+static inline u64 mptcp_wvegas_scale(u32 val, int scale)
+{
+	return (u64)val << scale;
+}
+
+static void wvegas_enable(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct wvegas *wvegas = inet_csk_ca(sk);
+
+	wvegas->doing_wvegas_now = 1;
+
+	wvegas->beg_snd_nxt = tp->snd_nxt;
+
+	wvegas->cnt_rtt = 0;
+	wvegas->sampled_rtt = 0;
+
+	wvegas->instant_rate = 0;
+	wvegas->alpha = initial_alpha;
+	wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE);
+
+	wvegas->queue_delay = 0;
+}
+
+static inline void wvegas_disable(const struct sock *sk)
+{
+	struct wvegas *wvegas = inet_csk_ca(sk);
+
+	wvegas->doing_wvegas_now = 0;
+}
+
+static void mptcp_wvegas_init(struct sock *sk)
+{
+	struct wvegas *wvegas = inet_csk_ca(sk);
+
+	wvegas->base_rtt = 0x7fffffff;
+	wvegas_enable(sk);
+}
+
+static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us)
+{
+	return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us);
+}
+
+static void mptcp_wvegas_pkts_acked(struct sock *sk,
+				    const struct ack_sample *sample)
+{
+	struct wvegas *wvegas = inet_csk_ca(sk);
+	u32 vrtt;
+
+	if (sample->rtt_us < 0)
+		return;
+
+	vrtt = sample->rtt_us + 1;
+
+	if (vrtt < wvegas->base_rtt)
+		wvegas->base_rtt = vrtt;
+
+	wvegas->sampled_rtt += vrtt;
+	wvegas->cnt_rtt++;
+}
+
+static void mptcp_wvegas_state(struct sock *sk, u8 ca_state)
+{
+	if (ca_state == TCP_CA_Open)
+		wvegas_enable(sk);
+	else
+		wvegas_disable(sk);
+}
+
+static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	if (event == CA_EVENT_CWND_RESTART) {
+		mptcp_wvegas_init(sk);
+	} else if (event == CA_EVENT_LOSS) {
+		struct wvegas *wvegas = inet_csk_ca(sk);
+
+		wvegas->instant_rate = 0;
+	}
+}
+
+static inline u32 mptcp_wvegas_ssthresh(const struct tcp_sock *tp)
+{
+	return min(tp->snd_ssthresh, tp->snd_cwnd);
+}
+
+static u64 mptcp_wvegas_weight(const struct mptcp_cb *mpcb, const struct sock *sk)
+{
+	u64 total_rate = 0;
+	const struct wvegas *wvegas = inet_csk_ca(sk);
+	struct mptcp_tcp_sock *mptcp;
+
+	if (!mpcb)
+		return wvegas->weight;
+
+	mptcp_for_each_sub(mpcb, mptcp) {
+		struct sock *sub_sk = mptcp_to_sock(mptcp);
+		struct wvegas *sub_wvegas = inet_csk_ca(sub_sk);
+
+		/* sampled_rtt is initialized to 0 */
+		if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0))
+			total_rate += sub_wvegas->instant_rate;
+	}
+
+	if (total_rate && wvegas->instant_rate)
+		return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate);
+	else
+		return wvegas->weight;
+}
+
+static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct wvegas *wvegas = inet_csk_ca(sk);
+
+	if (!wvegas->doing_wvegas_now) {
+		tcp_reno_cong_avoid(sk, ack, acked);
+		return;
+	}
+
+	if (after(ack, wvegas->beg_snd_nxt)) {
+		wvegas->beg_snd_nxt = tp->snd_nxt;
+
+		if (wvegas->cnt_rtt <= 2) {
+			tcp_reno_cong_avoid(sk, ack, acked);
+		} else {
+			u32 rtt, diff, q_delay;
+			u64 target_cwnd;
+
+			rtt = wvegas->sampled_rtt / wvegas->cnt_rtt;
+			target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt);
+
+			diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt);
+
+			if (diff > gamma && tcp_in_slow_start(tp)) {
+				tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd + 1);
+				tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
+
+			} else if (tcp_in_slow_start(tp)) {
+				tcp_slow_start(tp, acked);
+			} else {
+				if (diff >= wvegas->alpha) {
+					wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt);
+					wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk);
+					wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE));
+				}
+				if (diff > wvegas->alpha) {
+					tp->snd_cwnd--;
+					tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
+				} else if (diff < wvegas->alpha) {
+					tp->snd_cwnd++;
+				}
+
+				/* Try to drain link queue if needed */
+				q_delay = rtt - wvegas->base_rtt;
+				if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay))
+					wvegas->queue_delay = q_delay;
+
+				if (q_delay >= 2 * wvegas->queue_delay) {
+					u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt);
+
+					tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE;
+					wvegas->queue_delay = 0;
+				}
+			}
+
+			if (tp->snd_cwnd < 2)
+				tp->snd_cwnd = 2;
+			else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+				tp->snd_cwnd = tp->snd_cwnd_clamp;
+
+			tp->snd_ssthresh = tcp_current_ssthresh(sk);
+		}
+
+		wvegas->cnt_rtt = 0;
+		wvegas->sampled_rtt = 0;
+	}
+	/* Use normal slow start */
+	else if (tcp_in_slow_start(tp))
+		tcp_slow_start(tp, acked);
+}
+
+static struct tcp_congestion_ops mptcp_wvegas __read_mostly = {
+	.init		= mptcp_wvegas_init,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= mptcp_wvegas_cong_avoid,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+	.pkts_acked	= mptcp_wvegas_pkts_acked,
+	.set_state	= mptcp_wvegas_state,
+	.cwnd_event	= mptcp_wvegas_cwnd_event,
+
+	.owner		= THIS_MODULE,
+	.name		= "wvegas",
+};
+
+static int __init mptcp_wvegas_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE);
+	tcp_register_congestion_control(&mptcp_wvegas);
+	return 0;
+}
+
+static void __exit mptcp_wvegas_unregister(void)
+{
+	tcp_unregister_congestion_control(&mptcp_wvegas);
+}
+
+module_init(mptcp_wvegas_register);
+module_exit(mptcp_wvegas_unregister);
+
+MODULE_AUTHOR("Yu Cao, Enhuan Dong");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPTCP wVegas");
+MODULE_VERSION("0.1");
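(Aside, not part of the patch: a standalone illustration of the Vegas-style arithmetic in mptcp_wvegas_cong_avoid() above, using made-up sample values. It only mirrors the target_cwnd/diff computation to show how the per-subflow alpha steers the congestion window.)

	/* Standalone illustration of the wVegas cwnd arithmetic (sample values). */
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t snd_cwnd = 20;		/* congestion window, packets */
		uint32_t base_rtt = 40000;	/* lowest RTT observed, usec */
		uint32_t rtt = 50000;		/* mean RTT over the last RTT, usec */
		uint32_t alpha = 2;		/* per-subflow target backlog */

		/* target_cwnd = cwnd * base_rtt / rtt = 20 * 40000 / 50000 = 16 */
		uint64_t target_cwnd = (uint64_t)snd_cwnd * base_rtt / rtt;

		/* diff = cwnd * (rtt - base_rtt) / rtt estimates the packets
		 * queued in the network: 20 * 10000 / 50000 = 4.
		 */
		uint32_t diff = (uint64_t)snd_cwnd * (rtt - base_rtt) / rtt;

		printf("target_cwnd=%llu diff=%u\n",
		       (unsigned long long)target_cwnd, diff);

		/* Outside slow start, diff (4) > alpha (2) makes the kernel
		 * code above decrement snd_cwnd; diff < alpha would increment
		 * it, so the backlog converges towards alpha packets. alpha
		 * itself is rescaled from the subflow's weight (its share of
		 * the total rate) times total_alpha.
		 */
		return 0;
	}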
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9e060c6a01ac..96cc3e65787b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2686,6 +2686,7 @@ enum {
 	BPF_TCP_LISTEN,
 	BPF_TCP_CLOSING,	/* Now a valid state */
 	BPF_TCP_NEW_SYN_RECV,
+	BPF_TCP_RST_WAIT,
 
 	BPF_TCP_MAX_STATES	/* Leave at the end! */
 };
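(Aside, not part of the patch: once the wvegas module added above is loaded, it is selected like any other congestion-control module, for example per socket through the standard TCP_CONGESTION socket option.)

	/* Userspace sketch: pick the "wvegas" algorithm registered above. */
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int main(void)
	{
		const char name[] = "wvegas";
		int fd = socket(AF_INET, SOCK_STREAM, 0);

		if (fd < 0)
			return 1;
		if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
			       name, strlen(name)) < 0)
			perror("setsockopt(TCP_CONGESTION)");
		close(fd);
		return 0;
	}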