/*
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley. More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/time.h>

#include <inet/tcp_impl.h>
#include <inet/cc.h>
#include <inet/cc/cc_cubic.h>
#include <inet/cc/cc_module.h>

static struct modlmisc cc_cubic_modlmisc = {
	&mod_miscops,
	"Cubic Congestion Control"
};

static struct modlinkage cc_cubic_modlinkage = {
	MODREV_1,
	&cc_cubic_modlmisc,
	NULL
};
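
/*
 * A usage sketch (assumptions, not part of the original sources): once this
 * module is loaded and cc_register_algo() has run, the algorithm is expected
 * to be selectable through the TCP congestion control protocol properties,
 * e.g. with ipadm:
 *
 *	ipadm set-prop -p cong_enabled+=cubic tcp
 *	ipadm set-prop -p cong_default=cubic tcp
 */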

/*
 * CUBIC uses the NewReno implementation of after_idle and falls back to
 * NewReno's ack_received callback during slow start.
 */
static struct cc_algo *newreno_cc_algo;

static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void	cubic_cb_destroy(struct cc_var *ccv);
static int	cubic_cb_init(struct cc_var *ccv);
static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void	cubic_conn_init(struct cc_var *ccv);
static void	cubic_post_recovery(struct cc_var *ccv);
static void	cubic_record_rtt(struct cc_var *ccv);
static void	cubic_ssthresh_update(struct cc_var *ccv);

struct cubic {
	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in nanoseconds. */
	hrtime_t	sum_rtt_nsecs;
	/* cwnd at the most recent congestion event. */
	uint32_t	max_cwnd;
	/* cwnd at the previous congestion event. */
	uint32_t	prev_max_cwnd;
	/* Number of congestion events. */
	uint32_t	num_cong_events;
	/* Minimum observed rtt in nanoseconds. */
	hrtime_t	min_rtt_nsecs;
	/* Mean observed rtt between congestion epochs. */
	hrtime_t	mean_rtt_nsecs;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Time of last congestion event in nanoseconds. */
	hrtime_t	t_last_cong;
};

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.post_recovery = cubic_post_recovery,
};

int
_init(void)
{
	int err;

	if ((newreno_cc_algo = cc_load_algo("newreno")) == NULL)
		return (EINVAL);

	if ((err = cc_register_algo(&cubic_cc_algo)) == 0) {
		if ((err = mod_install(&cc_cubic_modlinkage)) != 0)
			(void) cc_deregister_algo(&cubic_cc_algo);
	}
	cubic_cc_algo.after_idle = newreno_cc_algo->after_idle;
	return (err);
}

int
_fini(void)
{
	/* XXX Not unloadable for now */
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&cc_cubic_modlinkage, modinfop));
}

static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	uint32_t w_tf, w_cubic_next;
	hrtime_t nsecs_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	/*
	 * Grow the congestion window if this is a regular ACK, we are not
	 * in congestion or fast recovery, the connection is cwnd limited,
	 * and we are either not doing ABC, are in slow start, or are doing
	 * ABC and have sent a cwnd's worth of bytes.
	 */
	if (type == CC_ACK && !IN_RECOVERY(ccv->flags) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!CC_ABC(ccv) ||
	    CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
	    (CC_ABC(ccv) && (ccv->flags & CCF_ABC_SENTAWND)))) {
		/* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)
			newreno_cc_algo->ack_received(ccv, type);
		else {
			nsecs_since_cong = gethrtime() -
			    cubic_data->t_last_cong;

			/*
			 * The mean RTT is used to best reflect the equations
			 * in the I-D. Using min_rtt in the tf_cwnd
			 * calculation causes w_tf to grow much faster than it
			 * should if the RTT is dominated by network buffering
			 * rather than propagation delay.
			 */
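			/*
			 * For reference, a sketch of the I-D's window
			 * equations that tf_cwnd() and cubic_cwnd()
			 * approximate in fixed point (assuming the
			 * FreeBSD-derived constants in cc_cubic.h,
			 * C = 0.4 and beta = 0.7, with windows in segments
			 * and t in seconds since the last congestion event):
			 *
			 *	K = cbrt(max_cwnd * (1 - beta) / C)
			 *	W_cubic(t) = C * (t - K)^3 + max_cwnd
			 *	W_tf(t) = max_cwnd * beta +
			 *	    [3 * (1 - beta) / (1 + beta)] * t / RTT
			 *
			 * Note that w_cubic_next below is evaluated at
			 * t + mean RTT, i.e. the window CUBIC wants one RTT
			 * from now.
			 */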
			w_tf = tf_cwnd(nsecs_since_cong,
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss));

			w_cubic_next = cubic_cwnd(nsecs_since_cong +
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

			if (w_cubic_next < w_tf) {
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				CCV(ccv, tcp_cwnd) = w_tf;
			} else if (CCV(ccv, tcp_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 */
				if (CC_ABC(ccv))
					CCV(ccv, tcp_cwnd) = w_cubic_next;
				else
					CCV(ccv, tcp_cwnd) += ((w_cubic_next -
					    CCV(ccv, tcp_cwnd)) *
					    CCV(ccv, tcp_mss)) /
					    CCV(ccv, tcp_cwnd);
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when the hostcache has a relevant entry),
			 * keep updating our current estimate of the
			 * max_cwnd.
			 */
			if (cubic_data->num_cong_events == 0 &&
			    cubic_data->max_cwnd < CCV(ccv, tcp_cwnd))
				cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
		}
	}
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{
	if (ccv->cc_data != NULL)
		kmem_free(ccv->cc_data, sizeof (struct cubic));
}

static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	/*
	 * Zero the allocation: fields such as num_cong_events and
	 * epoch_ack_count are read before they are otherwise set.
	 */
	cubic_data = kmem_zalloc(sizeof (struct cubic), KM_NOSLEEP);

	if (cubic_data == NULL)
		return (ENOMEM);

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = gethrtime();
	cubic_data->min_rtt_nsecs = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_nsecs = 1;

	ccv->cc_data = cubic_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;
	uint32_t cwin;
	uint32_t mss;

	cubic_data = ccv->cc_data;
	cwin = CCV(ccv, tcp_cwnd);
	mss = CCV(ccv, tcp_mss);

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(ccv->flags)) {
			if (!IN_CONGRECOVERY(ccv->flags)) {
				cubic_ssthresh_update(ccv);
				cubic_data->num_cong_events++;
				cubic_data->prev_max_cwnd =
				    cubic_data->max_cwnd;
				cubic_data->max_cwnd = cwin;
				CCV(ccv, tcp_cwnd) =
				    CCV(ccv, tcp_cwnd_ssthresh);
			}
			ENTER_RECOVERY(ccv->flags);
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(ccv->flags)) {
			cubic_ssthresh_update(ccv);
			cubic_data->num_cong_events++;
			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
			cubic_data->max_cwnd = cwin;
			cubic_data->t_last_cong = gethrtime();
			CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
			ENTER_CONGRECOVERY(ccv->flags);
		}
		break;

	case CC_RTO:
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was. Only record it when the
		 * timeout has fired more than once, as there is a reasonable
		 * chance the first one is a false alarm and may not indicate
		 * congestion.
		 */
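		/*
		 * Assumed semantics, not stated here: tcp_timer_backoff
		 * counts consecutive RTO firings for the same data (the
		 * exponential backoff shift), so a value of 2 or more means
		 * the retransmit timer has expired at least twice without
		 * progress.
		 */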
		if (CCV(ccv, tcp_timer_backoff) >= 2) {
			cubic_data->num_cong_events++;
			cubic_data->t_last_cong = gethrtime();
			cubic_ssthresh_update(ccv);
			cubic_data->max_cwnd = cwin;
			CCV(ccv, tcp_cwnd) = mss;
		}
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded.
	 * Without it, bad things happen when entries from the TCP hostcache
	 * get used.
	 */
	cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/* Fast convergence heuristic. */
	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) {
		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
		    >> CUBIC_SHIFT;
	}

	if (IN_FASTRECOVERY(ccv->flags)) {
		/* Update cwnd based on beta and adjusted max_cwnd. */
		CCV(ccv, tcp_cwnd) = max(1, ((CUBIC_BETA *
		    cubic_data->max_cwnd) >> CUBIC_SHIFT));
	}
	cubic_data->t_last_cong = gethrtime();

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_nsecs >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_nsecs =
		    (cubic_data->sum_rtt_nsecs / cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_nsecs = 0;
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss));
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_nsecs;

	/* Ignore srtt until a minimum number of samples have been taken. */
	if (CCV(ccv, tcp_rtt_update) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		/* tcp_rtt_sa is 8 * smoothed RTT in nanoseconds. */
		t_srtt_nsecs = CCV(ccv, tcp_rtt_sa) >> 3;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialized
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_nsecs < cubic_data->min_rtt_nsecs ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)) {
			cubic_data->min_rtt_nsecs = max(1, t_srtt_nsecs);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_nsecs with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_nsecs >
			    cubic_data->mean_rtt_nsecs)
				cubic_data->mean_rtt_nsecs =
				    cubic_data->min_rtt_nsecs;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_nsecs += t_srtt_nsecs;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5; on
	 * subsequent congestion events, set it to cwnd * beta.
	 */
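	/*
	 * A worked sketch, assuming the nominal beta of 0.7 from the
	 * FreeBSD-derived cc_cubic.h: with cwnd = 100 * MSS, ssthresh
	 * becomes 50 * MSS on the first congestion event, but ~70 * MSS on
	 * subsequent ones. CUBIC's shallower backoff relies on the cubic
	 * growth function, rather than a deep window reduction, to find the
	 * new operating point.
	 */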
	if (cubic_data->num_cong_events == 0)
		CCV(ccv, tcp_cwnd_ssthresh) = CCV(ccv, tcp_cwnd) >> 1;
	else
		CCV(ccv, tcp_cwnd_ssthresh) =
		    (CCV(ccv, tcp_cwnd) * CUBIC_BETA) >> CUBIC_SHIFT;
}