/*!\file */ /* Copyright (c) 2001-2022, The Ohio State University. All rights * reserved. * * This file is part of the MVAPICH2 software package developed by the * team members of The Ohio State University's Network-Based Computing * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. * * For detailed copyright and licensing information, please refer to the * copyright file COPYRIGHT in the top level MVAPICH2 directory. * */ #ifndef IB_CM_H #define IB_CM_H #include #include "ib_channel_manager.h" #include "ib_vbuf.h" /* add mpidimpl.h to pass MPIDI_PG_t *pg to MPID_nem_ib_setup_conn(); */ typedef enum { MPID_NEM_IB_CONN_NONE = 1, MPID_NEM_IB_CONN_IN_PROGRESS = 2, MPID_NEM_IB_CONN_RC = 3, } MPID_nem_ib_cm_conn_type_t; /* TODO: move srp to a independent structure in order to keep modulized*/ typedef struct _ibv_backlog_queue_t { int len; /* length of backlog queue */ vbuf *vbuf_head; /* head of backlog queue */ vbuf *vbuf_tail; /* tail of backlog queue */ } ibv_backlog_queue_t; #define BACKLOG_ENQUEUE(q,v) { \ v->desc.next = NULL; \ if (q->vbuf_tail == NULL) { \ q->vbuf_head = v; \ } else { \ q->vbuf_tail->desc.next = v; \ } \ q->vbuf_tail = v; \ q->len++; \ } #define BACKLOG_DEQUEUE(q,v) { \ v = q->vbuf_head; \ q->vbuf_head = v->desc.next; \ if (v == q->vbuf_tail) { \ q->vbuf_tail = NULL; \ } \ q->len--; \ v->desc.next = NULL; \ } typedef struct MPIDI_CH3I_MRAILI_RDMAPATH_VC { /********************************************************** * Following part of the structure is shared by all rails * **********************************************************/ /* RDMA buffers */ void *RDMA_send_buf_DMA; void *RDMA_recv_buf_DMA; struct vbuf *RDMA_send_buf; struct vbuf *RDMA_recv_buf; struct ibv_mr *RDMA_send_buf_mr[MAX_NUM_HCAS]; struct ibv_mr *RDMA_recv_buf_mr[MAX_NUM_HCAS]; uint32_t RDMA_remote_buf_rkey[MAX_NUM_HCAS]; /* current flow control credit accumulated for remote side */ uint8_t rdma_credit; /* RDMA buffer address on the remote side */ char *remote_RDMA_buf; int phead_RDMA_send; int ptail_RDMA_send; /* pointer to the head of free receive buffers * this is also where we should poll for incoming * rdma write messages */ /* this pointer advances when we receive packets */ int p_RDMA_recv; int p_RDMA_recv_tail; int eager_start_cnt; int in_polling_set; int rdma_failed; #ifndef MV2_DISABLE_HEADER_CACHING void *cached_outgoing; void *cached_outgoing_iheader; void *cached_incoming; void *cached_incoming_iheader; int cached_hit; int cached_miss; #endif } MPIDI_CH3I_MRAILI_RDMAPATH_VC; /** * MPIDI_CH3I_MRAILI_SR_VC */ typedef struct MPIDI_NEM_IB_SR_VC { struct { /* how many vbufs can be consumed on remote end. */ uint8_t remote_credit; /* accumulate vbuf credit locally here */ uint8_t local_credit; /* number of vbufs currently preposted */ uint8_t preposts; uint32_t pending_r3_sends; uint8_t remote_cc; uint8_t initialized; int rendezvous_packets_expected; ibv_backlog_queue_t backlog; } *credits; } MPIDI_NEM_IB_SR_VC; #define INDEX_GLOBAL(_cmanager,_global_index) (_global_index) #define INDEX_LOCAL(_cmanager,_local_index) \ (((_cmanager)->num_channels - (_cmanager)->num_local_pollings) + (_local_index)) struct process_init_info { int **hostid; uint16_t **lid; uint32_t **qp_num_rdma; union ibv_gid **gid; uint64_t *vc_addr; mv2_arch_hca_type *arch_hca_type; }; struct mrail_rail { int hca_index; int s_weight; int used_send_cq; int used_recv_cq; struct ibv_qp *qp_hndl; int send_wqes_avail; vbuf *ext_sendq_head; vbuf *ext_sendq_tail; int ext_sendq_size; }; typedef struct _MPID_nem_ib_connection_tag { struct mrail_rail *rails; /* move to channel_manager.h */ /* MPID_nem_ib_channel_manager cmanager; */ /* number of send wqes available */ uint16_t next_packet_expected; uint16_t next_packet_tosend; /* what buffer are we currently packing */ struct vbuf *coalesce_vbuf; /* how many eager sends do we have outstanding */ int outstanding_eager_vbufs; MPIDI_CH3I_MRAILI_RDMAPATH_VC rfp; MPIDI_NEM_IB_SR_VC srp; /* Buffered receiving request for packetized transfer */ void *packetized_recv; /* these fields are used to remember data transfer operations * that are currently in progress on this connection. The * send handle list is a queue of send handles representing * in-progress rendezvous transfers. It is processed in FIFO * order (because of MPI ordering rules) so there is both a head * and a tail. * * The receive handle is a pointer to a single * in-progress eager receive. We require that an eager sender * send *all* packets associated with an eager receive before * sending any others, so when we receive the first packet of * an eager series, we remember it by caching the rhandle * on the connection. * */ void *sreq_head; /* "queue" of send handles to process */ void *sreq_tail; /* these two fields are used *only* by MPID_DeviceCheck to * build up a list of connections that have received new * flow control credit so that pending operations should be * pushed. nextflow is a pointer to the next connection on the * list, and inflow is 1 (true) or 0 (false) to indicate whether * the connection is currently on the flowlist. This is needed * to prevent a circular list. */ void *nextflow; int inflow; uint64_t remote_vc_addr; /* Used to find vc at remote side */ } MPID_nem_ib_connection_t; /** * The list of all connections. */ typedef struct _MPID_nem_ib_connections { int size; int rank; MPID_nem_ib_connection_t *connections; struct process_init_info *init_info; } MPID_nem_ib_conn_info_t; /** * all the connections existing for current process */ extern MPID_nem_ib_conn_info_t conn_info; struct MPIDI_PG; int MPID_nem_ib_init_connection(int size, int rank); int MPID_nem_ib_alloc_process_init_info(); int MPID_nem_ib_setup_conn(struct MPIDI_PG *pg); int MPID_nem_ib_establish_conn(); int MPID_nem_ib_exchange_conn(struct MPIDI_PG *pg, int rank); int MPID_nem_ib_free_conn_info(int size); int MPID_nem_ib_setup_startup_ring(struct MPIDI_PG *pg, int rank); extern int mv2_pmi_max_keylen; extern int mv2_pmi_max_vallen; extern char *mv2_pmi_key; extern char *mv2_pmi_val; /* * mv2_allocate_pmi_keyval * Allocate a Key-Value pair of correct length * Return 0 on success, non-zero on failure */ int mv2_allocate_pmi_keyval(void); /* * mv2_free_pmi_keyval * Free a previously allocated Key-Value pair */ void mv2_free_pmi_keyval(void); /** * set credits info into packet header */ #define PACKET_SET_CREDIT(_p, _c, _rail_index) \ { \ (_p)->rdma_credit = (_c)->rfp.rdma_credit;\ (_c)->rfp.rdma_credit = 0; \ (_p)->vbuf_credit = (_c)->srp.credits[(_rail_index)].local_credit; \ (_p)->remote_credit = (_c)->srp.credits[(_rail_index)].remote_credit; \ (_c)->srp.credits[(_rail_index)].local_credit = 0; \ } #define PACKET_SET_RDMA_CREDIT(_p, _c) \ { \ (_p)->rdma_credit = (_c)->rfp.rdma_credit; \ (_c)->rfp.rdma_credit = 0; \ (_p)->vbuf_credit = 0; \ (_p)->remote_credit = 0; \ } #endif