/*
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley. More details are available at:
 * http://caia.swin.edu.au/urp/newtcp/
 */
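
/*
 * For reference, the window equations from the I-D, with W_max the cwnd at
 * the last congestion event, beta the multiplicative decrease factor, C a
 * scaling constant and t the time since the last congestion event, are
 * roughly:
 *
 *	W_cubic(t) = C * (t - K)^3 + W_max
 *	K          = cbrt(W_max * (1 - beta) / C)
 *	W_tcp(t)   = W_max * beta + (3 * (1 - beta) / (1 + beta)) * t / RTT
 *
 * cubic_cwnd(), cubic_k() and tf_cwnd() in cc_cubic.h evaluate these using
 * fixed point arithmetic with CUBIC_SHIFT bits of precision.
 */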

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/time.h>

#include <inet/tcp_impl.h>
#include <inet/cc.h>
#include <inet/cc/cc_cubic.h>
#include <inet/cc/cc_module.h>

static struct modlmisc cc_cubic_modlmisc = {
	&mod_miscops,
	"Cubic Congestion Control"
};

static struct modlinkage cc_cubic_modlinkage = {
	MODREV_1,
	&cc_cubic_modlmisc,
	NULL
};

/*
 * CUBIC borrows NewReno's after_idle implementation, and uses NewReno's
 * ack_received callback during slow start.
 */
static struct cc_algo *newreno_cc_algo;

static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void	cubic_cb_destroy(struct cc_var *ccv);
static int	cubic_cb_init(struct cc_var *ccv);
static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void	cubic_conn_init(struct cc_var *ccv);
static void	cubic_post_recovery(struct cc_var *ccv);
static void	cubic_record_rtt(struct cc_var *ccv);
static void	cubic_ssthresh_update(struct cc_var *ccv);

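/*
 * Per-connection CUBIC state, allocated in cubic_cb_init() and hung off
 * ccv->cc_data.
 */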
struct cubic {
	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in nanoseconds. */
	hrtime_t	sum_rtt_nsecs;
	/* cwnd at the most recent congestion event. */
	uint32_t	max_cwnd;
	/* cwnd at the previous congestion event. */
	uint32_t	prev_max_cwnd;
	/* Number of congestion events. */
	uint32_t	num_cong_events;
	/* Minimum observed rtt in nanoseconds. */
	hrtime_t	min_rtt_nsecs;
	/* Mean observed rtt between congestion epochs. */
	hrtime_t	mean_rtt_nsecs;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Time of last congestion event in nanoseconds. */
	hrtime_t	t_last_cong;
};

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.post_recovery = cubic_post_recovery,
};

int
_init(void)
{
	int err;

	if ((newreno_cc_algo = cc_load_algo("newreno")) == NULL)
		return (EINVAL);

	/*
	 * Borrow NewReno's after_idle callback before registering, so that
	 * the algorithm is never visible to connections with a partially
	 * initialized callback table.
	 */
	cubic_cc_algo.after_idle = newreno_cc_algo->after_idle;

	if ((err = cc_register_algo(&cubic_cc_algo)) == 0) {
		if ((err = mod_install(&cc_cubic_modlinkage)) != 0)
			(void) cc_deregister_algo(&cubic_cc_algo);
	}

	return (err);
}

int
_fini(void)
{
	/* XXX Not unloadable for now */
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&cc_cubic_modlinkage, modinfop));
}

static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	uint32_t w_tf, w_cubic_next;
	hrtime_t nsecs_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	/*
	 * Only grow cwnd for a regular ACK when we're not in congestion or
	 * fast recovery, we're cwnd limited, and one of the following holds:
	 * ABC is disabled, we're in slow start, or ABC is enabled and we've
	 * sent a cwnd's worth of bytes.
	 */
	if (type == CC_ACK && !IN_RECOVERY(ccv->flags) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!CC_ABC(ccv) ||
	    CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
	    (CC_ABC(ccv) && (ccv->flags & CCF_ABC_SENTAWND)))) {
		/* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)
			newreno_cc_algo->ack_received(ccv, type);
		else {
			nsecs_since_cong = gethrtime() -
			    cubic_data->t_last_cong;

			/*
			 * The mean RTT is used to best reflect the equations
			 * in the I-D. Using min_rtt in the tf_cwnd
			 * calculation causes w_tf to grow much faster than it
			 * should if the RTT is dominated by network buffering
			 * rather than propagation delay.
			 */
			w_tf = tf_cwnd(nsecs_since_cong,
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss));

			w_cubic_next = cubic_cwnd(nsecs_since_cong +
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

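			/*
			 * Follow the TCP-friendly window while the cubic
			 * window lags behind it; otherwise grow toward the
			 * cubic target. Without ABC, the per-ACK increase
			 * below is scaled so that roughly one window's worth
			 * of ACKs brings cwnd up to w_cubic_next.
			 */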
			if (w_cubic_next < w_tf) {
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				CCV(ccv, tcp_cwnd) = w_tf;
			} else if (CCV(ccv, tcp_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 */
				if (CC_ABC(ccv))
					CCV(ccv, tcp_cwnd) = w_cubic_next;
				else
					CCV(ccv, tcp_cwnd) += ((w_cubic_next -
					    CCV(ccv, tcp_cwnd)) *
					    CCV(ccv, tcp_mss)) /
					    CCV(ccv, tcp_cwnd);
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when hostcache has a relevant entry),
			 * keep updating our current estimate of the
			 * max_cwnd.
			 */
			if (cubic_data->num_cong_events == 0 &&
			    cubic_data->max_cwnd < CCV(ccv, tcp_cwnd))
				cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
		}
	}
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{
	if (ccv->cc_data != NULL)
		kmem_free(ccv->cc_data, sizeof (struct cubic));
}

static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = kmem_zalloc(sizeof (struct cubic), KM_NOSLEEP);

	if (cubic_data == NULL)
		return (ENOMEM);

	/*
	 * Init some key variables with sensible defaults; the remaining
	 * state (counters, K, the window maxima) relies on starting zeroed,
	 * courtesy of kmem_zalloc().
	 */
	cubic_data->t_last_cong = gethrtime();
	cubic_data->min_rtt_nsecs = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_nsecs = 1;

	ccv->cc_data = cubic_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;
	uint32_t cwin;
	uint32_t mss;

	cubic_data = ccv->cc_data;
	cwin = CCV(ccv, tcp_cwnd);
	mss = CCV(ccv, tcp_mss);

	switch (type) {
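	/*
	 * The duplicate ACK threshold was reached: update ssthresh and the
	 * recorded window maxima, then enter fast recovery.
	 */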
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(ccv->flags)) {
			if (!IN_CONGRECOVERY(ccv->flags)) {
				cubic_ssthresh_update(ccv);
				cubic_data->num_cong_events++;
				cubic_data->prev_max_cwnd =
				    cubic_data->max_cwnd;
				cubic_data->max_cwnd = cwin;
				CCV(ccv, tcp_cwnd) =
				    CCV(ccv, tcp_cwnd_ssthresh);
			}
			ENTER_RECOVERY(ccv->flags);
		}
		break;

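	/*
	 * Congestion signalled via ECN: apply the same window reduction as
	 * for loss, recording the congestion event time up front.
	 */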
	case CC_ECN:
		if (!IN_CONGRECOVERY(ccv->flags)) {
			cubic_ssthresh_update(ccv);
			cubic_data->num_cong_events++;
			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
			cubic_data->max_cwnd = cwin;
			cubic_data->t_last_cong = gethrtime();
			CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
			ENTER_CONGRECOVERY(ccv->flags);
		}
		break;

	case CC_RTO:
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was. Only record it when the
		 * timeout has fired more than once, as there is a reasonable
		 * chance the first one is a false alarm and may not indicate
		 * congestion.
		 */
		if (CCV(ccv, tcp_timer_backoff) >= 2) {
			cubic_data->num_cong_events++;
			cubic_data->t_last_cong = gethrtime();
			cubic_ssthresh_update(ccv);
			cubic_data->max_cwnd = cwin;
			CCV(ccv, tcp_cwnd) = mss;
		}
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded. Without
	 * it, the window calculations in cubic_ack_received() misbehave when
	 * the initial cwnd is seeded from a TCP hostcache entry.
	 */
	cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Fast convergence heuristic: if the new window maximum is lower
	 * than the previous one, reduce it further so that bandwidth is
	 * released to competing flows sooner.
	 */
	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) {
		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
		    >> CUBIC_SHIFT;
	}

	if (IN_FASTRECOVERY(ccv->flags)) {
		/* Update cwnd based on beta and adjusted max_cwnd. */
		CCV(ccv, tcp_cwnd) = max(1, ((CUBIC_BETA *
		    cubic_data->max_cwnd) >> CUBIC_SHIFT));
	}
	cubic_data->t_last_cong = gethrtime();

	/*
	 * Calculate the average RTT between congestion epochs. Requiring
	 * the sum to be at least epoch_ack_count guards against integer
	 * division yielding a zero mean.
	 */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_nsecs >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_nsecs =
		    (cubic_data->sum_rtt_nsecs / cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_nsecs = 0;
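	/*
	 * Recompute K, the time the cubic function takes to grow the window
	 * back to max_cwnd (expressed here in MSS-sized segments).
	 */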
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss));
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	hrtime_t t_srtt_nsecs;

	/* Ignore srtt until a min number of samples have been taken. */
	if (CCV(ccv, tcp_rtt_update) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		/* tcp_rtt_sa is 8 * smoothed RTT in nanoseconds */
		t_srtt_nsecs = CCV(ccv, tcp_rtt_sa) >> 3;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialized
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_nsecs < cubic_data->min_rtt_nsecs ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)) {
			cubic_data->min_rtt_nsecs = max(1, t_srtt_nsecs);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_nsecs with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_nsecs >
			    cubic_data->mean_rtt_nsecs)
				cubic_data->mean_rtt_nsecs =
				    cubic_data->min_rtt_nsecs;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_nsecs += t_srtt_nsecs;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5; on
	 * subsequent congestion events, set it to cwnd * beta.
	 */
	if (cubic_data->num_cong_events == 0)
		CCV(ccv, tcp_cwnd_ssthresh) = CCV(ccv, tcp_cwnd) >> 1;
	else
		CCV(ccv, tcp_cwnd_ssthresh) =
		    (CCV(ccv, tcp_cwnd) * CUBIC_BETA) >> CUBIC_SHIFT;
}