/*! \file */ /* * (C) 2006 by Argonne National Laboratory. * See COPYRIGHT in top-level directory. */ /* Copyright (c) 2001-2022, The Ohio State University. All rights * reserved. * * This file is part of the MVAPICH2 software package developed by the * team members of The Ohio State University's Network-Based Computing * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. * * For detailed copyright and licensing information, please refer to the * copyright file COPYRIGHT in the top level MVAPICH2 directory. * */ #include #include "mpidimpl.h" #include "mpidbg.h" #include "upmi.h" #include "ib_errors.h" #include "ib_process.h" #include "ib_hca.h" /** * Define ENABLE_HCA_REPORT in order to print a report on the local HCAs status. * The function MPID_nem_ib_hca_check prints the report. * * #define ENABLE_HCA_REPORT */ #undef ENABLE_HCA_REPORT /** * Number of HCSs. * (Was rdma_num_hcas). */ int ib_hca_num_hcas = 1; /** * Number of ports. * (Was rdma_num_ports). */ int ib_hca_num_ports = 1; /** * The list of the HCAs found in the system. */ MPID_nem_ib_nem_hca hca_list[MAX_NUM_HCAS]; /** * Check the ibv_port_attr and ibv_device_attr. */ static int check_attrs( struct ibv_port_attr *port_attr, struct ibv_device_attr *dev_attr) { int ret = 0; #ifdef _ENABLE_XRC_ if (USE_XRC && !(dev_attr->device_cap_flags & IBV_DEVICE_XRC)) { fprintf (stderr, "HCA does not support XRC. Disable MV2_USE_XRC.\n"); ret = 1; } #endif /* _ENABLE_XRC_ */ if(port_attr->active_mtu < rdma_default_mtu) { MPL_error_printf( "Active MTU is %d, MV2_DEFAULT_MTU set to %d. See User Guide\n", port_attr->active_mtu, rdma_default_mtu); ret = 1; } if(dev_attr->max_qp_rd_atom < rdma_default_qp_ous_rd_atom) { MPL_error_printf( "Max MV2_DEFAULT_QP_OUS_RD_ATOM is %d, set to %d\n", dev_attr->max_qp_rd_atom, rdma_default_qp_ous_rd_atom); ret = 1; } if(process_info.has_srq) { if(dev_attr->max_srq_sge < rdma_default_max_sg_list) { MPL_error_printf( "Max MV2_DEFAULT_MAX_SG_LIST is %d, set to %d\n", dev_attr->max_srq_sge, rdma_default_max_sg_list); ret = 1; } if(dev_attr->max_srq_wr < mv2_srq_alloc_size) { MPL_error_printf( "Max MV2_SRQ_SIZE is %d, set to %d\n", dev_attr->max_srq_wr, (int) mv2_srq_alloc_size); ret = 1; } } else { if(dev_attr->max_sge < rdma_default_max_sg_list) { MPL_error_printf( "Max MV2_DEFAULT_MAX_SG_LIST is %d, set to %d\n", dev_attr->max_sge, rdma_default_max_sg_list); ret = 1; } if(dev_attr->max_qp_wr < rdma_default_max_send_wqe) { MPL_error_printf( "Max MV2_DEFAULT_MAX_SEND_WQE is %d, set to %d\n", dev_attr->max_qp_wr, (int) rdma_default_max_send_wqe); ret = 1; } } if(dev_attr->max_cqe < rdma_default_max_cq_size) { MPL_error_printf( "Max MV2_DEFAULT_MAX_CQ_SIZE is %d, set to %d\n", dev_attr->max_cqe, (int) rdma_default_max_cq_size); ret = 1; } return ret; } /* * Function: rdma_find_active_port * * Description: * Finds if the given device has any active ports. * * Input: * context - Pointer to the device context obtained by opening device. * ib_dev - Pointer to the device from ibv_get_device_list. * * Return: * Success: Port number of the active port. * Failure: ERROR (-1). */ static int rdma_find_active_port(struct ibv_context *context,struct ibv_device *ib_dev) { int j = 0; const char *dev_name = NULL; struct ibv_port_attr port_attr; if (NULL == ib_dev) { return -1; } else { dev_name = ibv_get_device_name(ib_dev); } for (j = 1; j <= RDMA_DEFAULT_MAX_PORTS; ++ j) { if ((! ibv_query_port(context, j, &port_attr)) && port_attr.state == IBV_PORT_ACTIVE) { if (!strncmp(dev_name, "cxgb3", 5) || !strncmp(dev_name, "cxgb4", 5) || port_attr.lid) { /* Chelsio RNIC's don't get LID's as they're not IB devices. * So dont do this check for them. */ DEBUG_PRINT("Active port number = %d, state = %s, lid = %d\r\n", j, (port_attr.state==IBV_PORT_ACTIVE)?"Active":"Not Active", port_attr.lid); return j; } } } return -1; } #ifdef ENABLE_HCA_REPORT static char *port_state_str[] = { "???", "Down", "Initializing", "Armed", "Active" }; static char *port_phy_state_str[] = { "No state change", "Sleep", "Polling", "Disabled", "PortConfigurationTraining", "LinkUp", "LinkErrorRecovery", "PhyTest" }; #endif #undef FUNCNAME #define FUNCNAME MPID_nem_ib_init_hca #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) /** * Initialize the HCAs * Look at rdma_open_hca() & rdma_iba_hca_init_noqp() in * mvapich2/trunk/src/mpid/ch3/channels/mrail/src/gen2/rdma_iba_priv.c * * Store all the HCA info in mv2_nem_dev_info_t->hca[hca_num] * * Output: * hca_list: fill it with the HCAs information * * \see hca_list */ int MPID_nem_ib_init_hca() { int mpi_errno = MPI_SUCCESS; MPIDI_STATE_DECL(MPID_STATE_MPIDI_INIT_HCA); MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_INIT_HCA); struct ibv_device *ib_dev = NULL; struct ibv_device **dev_list = NULL; int nHca; int num_devices = 0; #ifdef CRC_CHECK gen_crc_table(); #endif memset( hca_list, 0, sizeof(hca_list) ); /* Get the list of devices */ dev_list = ibv_get_device_list(&num_devices); if (dev_list==NULL) { MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s", "No IB device found"); } /* Runtime checks */ MPIU_Assert( num_devices<=MAX_NUM_HCAS ); if ( num_devices> MAX_NUM_HCAS) { MPL_error_printf( "WARNING: found %d IB devices, the maximum is %d (MAX_NUM_HCAS). ", num_devices, MAX_NUM_HCAS); num_devices = MAX_NUM_HCAS; } if ( ib_hca_num_hcas > num_devices) { MPL_error_printf( "WARNING: user requested %d IB devices, the available number is %d. ", ib_hca_num_hcas, num_devices); ib_hca_num_hcas = num_devices; } MPIU_DBG_MSG_P( CH3_CHANNEL, VERBOSE, "[HCA] Found %d HCAs\n", num_devices); MPIU_DBG_MSG_P( CH3_CHANNEL, VERBOSE, "[HCA] User requested %d\n", ib_hca_num_hcas); /* Retrieve information for each found device */ for (nHca = 0; nHca < ib_hca_num_hcas; nHca++) { /* Check for user choice */ if( (rdma_iba_hca[0]==0) || (strncmp(rdma_iba_hca, RDMA_IBA_NULL_HCA, 32)==0) || (ib_hca_num_hcas > 1)) { /* User hasn't specified any HCA name, or the number of HCAs is greater then 1 */ ib_dev = dev_list[nHca]; } else { /* User specified a HCA, try to look for it */ int dev_count; dev_count = 0; while(dev_list[dev_count]) { if(!strncmp(ibv_get_device_name(dev_list[dev_count]), rdma_iba_hca, 32)) { ib_dev = dev_list[dev_count]; break; } dev_count++; } } /* Check if device has been identified */ hca_list[nHca].ib_dev = ib_dev; if (!ib_dev) { MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s", "No IB device found"); } MPIU_DBG_MSG_P( CH3_CHANNEL, VERBOSE, "[HCA] HCA device %d found\n", nHca); hca_list[nHca].nic_context = ibv_open_device(ib_dev); if (hca_list[nHca].nic_context==NULL) { MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**fail", "%s %d", "Failed to open HCA number", nHca); } hca_list[nHca].ptag = ibv_alloc_pd(hca_list[nHca].nic_context); if (!hca_list[nHca].ptag) { MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**fail", "%s%d", "Failed to alloc pd number ", nHca); } /* Set the hca type */ #if defined(RDMA_CM) if (process_info.use_iwarp_mode) { if ((mpi_errno = rdma_cm_get_hca_type(process_info.use_iwarp_mode, &process_info.hca_type)) != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); } if (process_info.hca_type == CHELSIO_T3) { process_info.use_iwarp_mode = 1; } } else #endif /* defined(RDMA_CM) */ { process_info.hca_type = hca_list[nHca].hca_type = mv2_get_hca_type(hca_list[nHca].ib_dev); process_info.arch_hca_type = mv2_get_arch_hca_type(hca_list[nHca].ib_dev); } } if (!strncmp(rdma_iba_hca, RDMA_IBA_NULL_HCA, 32) && (ib_hca_num_hcas==1) && (num_devices > nHca) && (rdma_find_active_port(hca_list[0].nic_context, hca_list[nHca].ib_dev)==-1)) { /* Trac #376 - There are multiple rdma capable devices (num_devices) in * the system. The user has asked us to use ANY (!strncmp) ONE device * (rdma_num_hcas), and the first device does not have an active port. So * try to find some other device with an active port. */ int j; for (j = 0; dev_list[j]; j++) { ib_dev = dev_list[j]; if (ib_dev) { hca_list[0].nic_context = ibv_open_device(ib_dev); if (!hca_list[0].nic_context) { /* Go to next device */ continue; } if (rdma_find_active_port(hca_list[0].nic_context, ib_dev)!=-1) { hca_list[0].ib_dev = ib_dev; hca_list[0].ptag = ibv_alloc_pd(hca_list[0].nic_context); if (!hca_list[0].ptag) { MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**fail", "%s%d", "Failed to alloc pd number ", nHca); } } } } } fn_exit: /* Clean up before exit */ if (dev_list!=NULL) ibv_free_device_list(dev_list); MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_INIT_HCA); return mpi_errno; fn_fail: goto fn_exit; } /** * Create a srq using process info data. */ struct ibv_srq *create_srq(int hca_num) { struct ibv_srq_init_attr srq_init_attr; struct ibv_srq *srq_ptr = NULL; memset(&srq_init_attr, 0, sizeof(srq_init_attr)); srq_init_attr.srq_context = hca_list[hca_num].nic_context; srq_init_attr.attr.max_wr = mv2_srq_alloc_size; srq_init_attr.attr.max_sge = 1; /* The limit value should be ignored during SRQ create */ srq_init_attr.attr.srq_limit = mv2_srq_limit; srq_ptr = ibv_create_srq(hca_list[hca_num].ptag, &srq_init_attr); if (!srq_ptr) { ibv_error_abort(-1, "Error creating SRQ\n"); } return srq_ptr; } #undef FUNCNAME #define FUNCNAME MPID_nem_ib_open_ports #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) /** * the first step in original MPID_nem_ib_setup_conn() function * open hca, create ptags and create cqs */ int MPID_nem_ib_open_ports() { int mpi_errno = MPI_SUCCESS; /* Infiniband Verb Structures */ struct ibv_port_attr port_attr; struct ibv_device_attr dev_attr; int nHca; /* , curRank, rail_index ; */ MPIDI_STATE_DECL(MPID_STATE_MPIDI_OPEN_HCA); MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_OPEN_HCA); for (nHca = 0; nHca < ib_hca_num_hcas; nHca++) { if (ibv_query_device(hca_list[nHca].nic_context, &dev_attr)) { MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s", "Error getting HCA attributes"); } /* detecting active ports */ if (rdma_default_port < 0 || ib_hca_num_ports > 1) { int nPort; int k = 0; for (nPort = 1; nPort <= RDMA_DEFAULT_MAX_PORTS; nPort ++) { if ((! ibv_query_port(hca_list[nHca].nic_context, nPort, &port_attr)) && port_attr.state == IBV_PORT_ACTIVE && (port_attr.lid || (!port_attr.lid && use_iboeth))) { if (use_iboeth) { if (ibv_query_gid(hca_list[nHca].nic_context, nPort, 0, &hca_list[nHca].gids[k])) { /* new error information function needed */ MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail", "Failed to retrieve gid on rank %d", process_info.rank); } DEBUG_PRINT("[%d] %s(%d): Getting gid[%d][%d] for" " port %d subnet_prefix = %llx," " intf_id = %llx\r\n", process_info.rank, __FUNCTION__, __LINE__, nHca, k, k, hca_list[nHca].gids[k].global.subnet_prefix, hca_list[nHca].gids[k].global.interface_id); } else { hca_list[nHca].lids[k] = port_attr.lid; } hca_list[nHca].ports[k++] = nPort; if (check_attrs(&port_attr, &dev_attr)) { MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s", "Attributes failed sanity check"); } } } if (k < ib_hca_num_ports) { MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**activeports", "**activeports %d", ib_hca_num_ports); } } else { if(ibv_query_port(hca_list[nHca].nic_context, rdma_default_port, &port_attr) || (!port_attr.lid && !use_iboeth) || (port_attr.state != IBV_PORT_ACTIVE)) { MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**portquery", "**portquery %d", rdma_default_port); } hca_list[nHca].ports[0] = rdma_default_port; if (use_iboeth) { if (ibv_query_gid(hca_list[nHca].nic_context, 0, 0, &hca_list[nHca].gids[0])) { /* new error function needed */ MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail", "Failed to retrieve gid on rank %d", process_info.rank); } if (check_attrs(&port_attr, &dev_attr)) { MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s", "Attributes failed sanity check"); } } else { hca_list[nHca].lids[0] = port_attr.lid; } } if (rdma_use_blocking) { hca_list[nHca].comp_channel = ibv_create_comp_channel(hca_list[nHca].nic_context); if (!hca_list[nHca].comp_channel) { MPIR_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto fn_fail, "**fail", "**fail %s", "cannot create completion channel"); } hca_list[nHca].send_cq_hndl = NULL; hca_list[nHca].recv_cq_hndl = NULL; hca_list[nHca].cq_hndl = ibv_create_cq(hca_list[nHca].nic_context, rdma_default_max_cq_size, NULL, hca_list[nHca].comp_channel, 0); if (!hca_list[nHca].cq_hndl) { MPIR_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto fn_fail, "**fail", "**fail %s", "cannot create cq"); } if (ibv_req_notify_cq(hca_list[nHca].cq_hndl, 0)) { MPIR_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto fn_fail, "**fail", "**fail %s", "cannot request cq notification"); } } else { /* Allocate the completion queue handle for the HCA */ hca_list[nHca].send_cq_hndl = NULL; hca_list[nHca].recv_cq_hndl = NULL; hca_list[nHca].cq_hndl = ibv_create_cq(hca_list[nHca].nic_context, rdma_default_max_cq_size, NULL, NULL, 0); if (!hca_list[nHca].cq_hndl) { MPIR_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto fn_fail, "**fail", "**fail %s", "cannot create cq"); } } /* to decouple process_info, may need to store has_srq to hca structure??? */ if (process_info.has_srq) { hca_list[nHca].srq_hndl = create_srq(nHca); } } rdma_default_port = hca_list[0].ports[0]; fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_OPEN_HCA); return mpi_errno; fn_fail: goto fn_exit; }