1 /* 2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 3 */ 4 5 /* 6 * This file contains code imported from the OFED rds source file threads.c 7 * Oracle elects to have and use the contents of threads.c under and governed 8 * by the OpenIB.org BSD license (see below for full license text). However, 9 * the following notice accompanied the original version of this file: 10 */ 11 12 /* 13 * Copyright (c) 2006 Oracle. All rights reserved. 14 * 15 * This software is available to you under a choice of one of two 16 * licenses. You may choose to be licensed under the terms of the GNU 17 * General Public License (GPL) Version 2, available from the file 18 * COPYING in the main directory of this source tree, or the 19 * OpenIB.org BSD license below: 20 * 21 * Redistribution and use in source and binary forms, with or 22 * without modification, are permitted provided that the following 23 * conditions are met: 24 * 25 * - Redistributions of source code must retain the above 26 * copyright notice, this list of conditions and the following 27 * disclaimer. 28 * 29 * - Redistributions in binary form must reproduce the above 30 * copyright notice, this list of conditions and the following 31 * disclaimer in the documentation and/or other materials 32 * provided with the distribution. 33 * 34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41 * SOFTWARE. 42 * 43 */ 44 #include <sys/rds.h> 45 #include <sys/sunddi.h> 46 47 #include <sys/ib/clients/rdsv3/rdsv3.h> 48 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 49 50 /* 51 * All of connection management is simplified by serializing it through 52 * work queues that execute in a connection managing thread. 53 * 54 * TCP wants to send acks through sendpage() in response to data_ready(), 55 * but it needs a process context to do so. 56 * 57 * The receive paths need to allocate but can't drop packets (!) so we have 58 * a thread around to block allocating if the receive fast path sees an 59 * allocation failure. 60 */ 61 62 /* 63 * Grand Unified Theory of connection life cycle: 64 * At any point in time, the connection can be in one of these states: 65 * DOWN, CONNECTING, UP, DISCONNECTING, ERROR 66 * 67 * The following transitions are possible: 68 * ANY -> ERROR 69 * UP -> DISCONNECTING 70 * ERROR -> DISCONNECTING 71 * DISCONNECTING -> DOWN 72 * DOWN -> CONNECTING 73 * CONNECTING -> UP 74 * 75 * Transition to state DISCONNECTING/DOWN: 76 * - Inside the shutdown worker; synchronizes with xmit path 77 * through c_send_lock, and with connection management callbacks 78 * via c_cm_lock. 79 * 80 * For receive callbacks, we rely on the underlying transport 81 * (TCP, IB/RDMA) to provide the necessary synchronisation. 82 */ 83 struct rdsv3_workqueue_struct_s *rdsv3_wq; 84 85 void 86 rdsv3_connect_complete(struct rdsv3_connection *conn) 87 { 88 RDSV3_DPRINTF4("rdsv3_connect_complete", "Enter(conn: %p)", conn); 89 90 if (!rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING, 91 RDSV3_CONN_UP)) { 92 RDSV3_DPRINTF2("rdsv3_connect_complete", 93 "%s: Cannot transition to state UP, " 94 "current state is %d", 95 __func__, 96 atomic_get(&conn->c_state)); 97 conn->c_state = RDSV3_CONN_ERROR; 98 rdsv3_queue_work(rdsv3_wq, &conn->c_down_w); 99 return; 100 } 101 102 RDSV3_DPRINTF2("rdsv3_connect_complete", 103 "conn %p for %u.%u.%u.%u to %u.%u.%u.%u complete", 104 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); 105 106 conn->c_reconnect_jiffies = 0; 107 conn->c_last_connect_jiffies = ddi_get_lbolt(); 108 109 set_bit(0, &conn->c_map_queued); 110 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); 111 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0); 112 113 RDSV3_DPRINTF4("rdsv3_connect_complete", "Return(conn: %p)", conn); 114 } 115 116 /* 117 * This random exponential backoff is relied on to eventually resolve racing 118 * connects. 119 * 120 * If connect attempts race then both parties drop both connections and come 121 * here to wait for a random amount of time before trying again. Eventually 122 * the backoff range will be so much greater than the time it takes to 123 * establish a connection that one of the pair will establish the connection 124 * before the other's random delay fires. 125 * 126 * Connection attempts that arrive while a connection is already established 127 * are also considered to be racing connects. This lets a connection from 128 * a rebooted machine replace an existing stale connection before the transport 129 * notices that the connection has failed. 130 * 131 * We should *always* start with a random backoff; otherwise a broken connection 132 * will always take several iterations to be re-established. 133 */ 134 void 135 rdsv3_queue_reconnect(struct rdsv3_connection *conn) 136 { 137 unsigned long rand; 138 139 RDSV3_DPRINTF2("rdsv3_queue_reconnect", 140 "conn %p for %u.%u.%u.%u to %u.%u.%u.%u reconnect jiffies %lu", 141 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), 142 conn->c_reconnect_jiffies); 143 144 set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags); 145 if (conn->c_reconnect_jiffies == 0) { 146 conn->c_reconnect_jiffies = rdsv3_sysctl_reconnect_min_jiffies; 147 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0); 148 return; 149 } 150 151 (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand)); 152 153 RDSV3_DPRINTF5("rdsv3", 154 "%lu delay %lu ceil conn %p for %u.%u.%u.%u -> %u.%u.%u.%u", 155 rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, 156 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); 157 158 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 159 rand % conn->c_reconnect_jiffies); 160 161 conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, 162 rdsv3_sysctl_reconnect_max_jiffies); 163 } 164 165 void 166 rdsv3_connect_worker(struct rdsv3_work_s *work) 167 { 168 struct rdsv3_connection *conn = container_of(work, 169 struct rdsv3_connection, c_conn_w.work); 170 int ret; 171 172 RDSV3_DPRINTF2("rdsv3_connect_worker", "Enter(work: %p)", work); 173 174 clear_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags); 175 if (rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, 176 RDSV3_CONN_CONNECTING)) { 177 ret = conn->c_trans->conn_connect(conn); 178 179 RDSV3_DPRINTF5("rdsv3", 180 "connect conn %p for %u.%u.%u.%u -> %u.%u.%u.%u " 181 "ret %d", conn, NIPQUAD(conn->c_laddr), 182 NIPQUAD(conn->c_faddr), ret); 183 184 RDSV3_DPRINTF2("rdsv3_connect_worker", 185 "conn %p for %u.%u.%u.%u to %u.%u.%u.%u dispatched, ret %d", 186 conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr), ret); 187 188 if (ret) { 189 if (rdsv3_conn_transition(conn, RDSV3_CONN_CONNECTING, 190 RDSV3_CONN_DOWN)) 191 rdsv3_queue_reconnect(conn); 192 else { 193 RDSV3_DPRINTF2("rdsv3_connect_worker", 194 "RDS: connect failed: %p", conn); 195 rdsv3_conn_drop(conn); 196 } 197 } 198 } 199 200 RDSV3_DPRINTF2("rdsv3_connect_worker", "Return(work: %p)", work); 201 } 202 203 void 204 rdsv3_send_worker(struct rdsv3_work_s *work) 205 { 206 struct rdsv3_connection *conn = container_of(work, 207 struct rdsv3_connection, c_send_w.work); 208 int ret; 209 210 RDSV3_DPRINTF4("rdsv3_send_worker", "Enter(work: %p)", work); 211 212 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 213 ret = rdsv3_send_xmit(conn); 214 RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret); 215 switch (ret) { 216 case -EAGAIN: 217 rdsv3_stats_inc(s_send_immediate_retry); 218 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); 219 break; 220 case -ENOMEM: 221 rdsv3_stats_inc(s_send_delayed_retry); 222 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 2); 223 default: 224 break; 225 } 226 } 227 228 RDSV3_DPRINTF4("rdsv3_send_worker", "Return(work: %p)", work); 229 } 230 231 void 232 rdsv3_recv_worker(struct rdsv3_work_s *work) 233 { 234 struct rdsv3_connection *conn = container_of(work, 235 struct rdsv3_connection, c_recv_w.work); 236 int ret; 237 238 RDSV3_DPRINTF4("rdsv3_recv_worker", "Enter(work: %p)", work); 239 240 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 241 ret = conn->c_trans->recv(conn); 242 RDSV3_DPRINTF5("rdsv3", "conn %p ret %d", conn, ret); 243 switch (ret) { 244 case -EAGAIN: 245 rdsv3_stats_inc(s_recv_immediate_retry); 246 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0); 247 break; 248 case -ENOMEM: 249 rdsv3_stats_inc(s_recv_delayed_retry); 250 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 2); 251 default: 252 break; 253 } 254 } 255 256 RDSV3_DPRINTF4("rdsv3_recv_worker", "Return(work: %p)", work); 257 } 258 259 void 260 rdsv3_shutdown_worker(struct rdsv3_work_s *work) 261 { 262 struct rdsv3_connection *conn = container_of(work, 263 struct rdsv3_connection, c_down_w); 264 rdsv3_conn_shutdown(conn); 265 } 266 267 #define time_after(a, b) ((long)(b) - (long)(a) < 0) 268 269 void 270 rdsv3_reaper_worker(struct rdsv3_work_s *work) 271 { 272 struct rdsv3_connection *conn = container_of(work, 273 struct rdsv3_connection, c_reap_w.work); 274 275 if (rdsv3_conn_state(conn) != RDSV3_CONN_UP && 276 !time_after(conn->c_last_connect_jiffies, 277 ddi_get_lbolt() - RDSV3_REAPER_WAIT_JIFFIES)) { 278 rdsv3_conn_destroy(conn); 279 } else { 280 rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_reap_w, 281 RDSV3_REAPER_WAIT_JIFFIES); 282 } 283 } 284 285 void 286 rdsv3_threads_exit(void) 287 { 288 rdsv3_destroy_task_workqueue(rdsv3_wq); 289 } 290 291 int 292 rdsv3_threads_init(void) 293 { 294 rdsv3_wq = rdsv3_create_task_workqueue("krdsd"); 295 if (!rdsv3_wq) 296 return (-ENOMEM); 297 298 return (0); 299 }