/* example_coll.c */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <mpi.h>

#define Y_DIM 3
int main(int argc, char *argv[])
{
int rank, size;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
fprintf(stdout,"Process %i out of %i\n",rank,size);
assert(size % Y_DIM == 0);
MPI_Comm topo_comm;
MPI_Request topo_request;
// Request the creation of a 2-D Torus
// More a naming scheme than a real topology
int ndims = 2;
int dims[2];
dims[0] = size / Y_DIM;
dims[1] = Y_DIM;
int periods[2] = {1, 1};
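// Hedged aside (not in the original example): MPI_Dims_create is the standard
// helper for letting MPI factor the process count into a balanced grid. It is
// only computed for illustration here; the hand-picked dims above are the ones
// actually used, so that dims[1] == Y_DIM is guaranteed.
int auto_dims[2] = {0, 0};
MPI_Dims_create(size, 2, auto_dims);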
int reorder = 1; // allow the "initial" processes to be reordered, because "Row-major
                 // numbering is always used for the processes in a Cartesian
                 // structure"
                 // Is reorder useful for other graph naming schemes as well? (e.g.
                 // MPI_Igraph_create or MPI_Idist_graph_create)
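// Hedged sketch (not part of the original example): a nonblocking distributed
// graph variant could plausibly mirror the existing blocking
// MPI_Dist_graph_create_adjacent signature plus an output request. The name and
// signature below are hypothetical:
//
//   MPI_Idist_graph_create_adjacent(MPI_COMM_WORLD,
//                                   indegree, sources, MPI_UNWEIGHTED,
//                                   outdegree, destinations, MPI_UNWEIGHTED,
//                                   MPI_INFO_NULL, reorder,
//                                   &graph_comm, &graph_request);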
// Naming scheme creation
MPI_Icart_coords_create(MPI_COMM_WORLD, ndims, dims, periods, reorder, &topo_comm, &topo_request); // new api
// rather than MPI_Icart_create(MPI_COMM_WORLD, ndims, dims, periods, reorder, &topo_comm, &request); ??
// The naming scheme is likely to be a local operation, but the communicator creation is likely to be
// non-local.
// However, in case of reorder, some information might need to be exchanged, making the call non-local?
// The list/kind of operations/procedures that can use the topo_comm handle just
// after this call returns must be determined with care.
// Basically, the handle can be used in calls to incomplete (nonblocking) procedures,
// or in procedures that query the naming scheme, e.g.:
// - MPI_Cart_get: info about the topology + coords of the calling process
// - MPI_Cart_rank: cartesian coords -> rank
// - MPI_Cart_coords: linear rank -> coords
// Since reordering is enabled, processes need to get their new ranks/coords.
// Hardware topologies can be leveraged at this step already.
// However, scheduled communications cannot.
int out_dims[2] = {0};
int out_periods[2] = {0};
int new_coords[2] = {0};
MPI_Cart_get(topo_comm, ndims, out_dims, out_periods, new_coords);
int new_rank;
MPI_Cart_rank(topo_comm,new_coords,&new_rank);
fprintf(stdout,(new_rank == 0) ?
"I'm root process in cart comm (rank in COMM_WORLD %i)\n" :
"Not root process (rank in COMM_WORLD %i)\n",rank);
// Schedule a first operation, e.g a Bcast
// in this example (0,0) will send an integer to processes E (1, 0), W (size/Y_DIM-1, 0), N (0, 1) and S (0, Y_DIM-1)
// If the naming scheme restricts the links effectively usable, what does this bcast mean
// for other processes? Does that make sense at all?
// (A neighborhood-collective alternative is sketched after the waits below.)
/*
Example with 12 processes in a 4 x 3 grid, with periodicity 1 on the X and Y dims.
S
* --- * --- * --- *
| | | |
N * --- * --- * --- *
| | | |
* --- * --- * --- *
Root E W
*/
MPI_Request bcast_request, reduction_request;
int data = 0;
if( !new_rank )
data = 333;
MPI_Bcast_init(&data,1,MPI_INT,0,topo_comm,MPI_INFO_NULL,&bcast_request);
int data2 = 1;
int result = 0;
// Schedule another operation.
MPI_Allreduce_init(&data2, &result, 1, MPI_INT, MPI_SUM, topo_comm, MPI_INFO_NULL, &reduction_request);
// Actual communicator creation, definitely a non-local op.
// Communication schedules can be taken into account at this moment.
MPI_Wait(&topo_request, MPI_STATUS_IGNORE);
// After MPI_Wait returns/completes, the communicator handle can be fully used.
// Do we want to authorize other, non-scheduled ops on the communicator?
// If not, only non-blocking/immediate and/or persistent communication operations
// can be supported.
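// Hedged illustration (not in the original example): if non-scheduled ops were
// authorized once the wait completes, a plain blocking collective would become
// legal at this point, e.g.:
// MPI_Barrier(topo_comm);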
MPI_Start(&bcast_request);
MPI_Start(&reduction_request);
// Do some work here
MPI_Wait(&bcast_request, MPI_STATUS_IGNORE);
MPI_Wait(&reduction_request, MPI_STATUS_IGNORE);
MPI_Request_free(&bcast_request);
MPI_Request_free(&reduction_request);
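// Hedged sketch (not part of the original example): the questions above about
// what a Bcast means under a restricted-links naming scheme suggest neighborhood
// collectives as a natural fit. MPI_Neighbor_allgather_init is the standard
// MPI 4.0 persistent variant; on this periodic 2-D Cartesian communicator every
// process has exactly 2*ndims = 4 neighbors (ordered per dimension: -/+ dim 0,
// then -/+ dim 1).
int my_val = new_rank;
int neighbor_vals[4] = {0}; // one slot per neighbor
MPI_Request neigh_request;
MPI_Neighbor_allgather_init(&my_val, 1, MPI_INT, neighbor_vals, 1, MPI_INT,
                            topo_comm, MPI_INFO_NULL, &neigh_request);
MPI_Start(&neigh_request);
MPI_Wait(&neigh_request, MPI_STATUS_IGNORE);
MPI_Request_free(&neigh_request);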
// authorize non-scheduled comms to occur on topo_comm??
// ==> means non optimized
MPI_Comm_disconnect(&topo_comm);
MPI_Finalize();
exit(EXIT_SUCCESS);
}