pub / pi

Commit d935c24d
authored Oct 15, 2020 by Neil Gershenfeld

wip

parent 29d3f0dd
Pipeline #8246 passed in 1 second
Changes: 2
CUDA/cudampipi.cu (new file, 0 → 100644)
//
// cudampipi.cu
// Neil Gershenfeld 10/14/20
// calculation of pi by a CUDA+MPI sum
// assumes one GPU per MPI rank
// pi = 3.14159265358979323846
//
#include <iostream>
#include <cstdint>
#include <mpi.h>
//
using namespace std;
//
uint64_t blocks = 1024;
uint64_t threads = 1024;
uint64_t npts = 1000000;
uint64_t nthreads = blocks*threads;
int nloop = 10;
//
__global__ void init(double *arr,uint64_t npts,uint64_t nthreads,int rank) {
   uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
   uint64_t start = npts*index+npts*nthreads*rank+1;
   uint64_t end = npts*(index+1)+npts*nthreads*rank+1;
   arr[index] = 0;
   for (uint64_t j = start; j < end; ++j)
      arr[index] += 0.5/((j-0.75)*(j-0.25));
   }
//
__global__ void reduce_sum(double *arr,uint64_t len) {
   uint64_t index = blockIdx.x*blockDim.x+threadIdx.x;
   if (index < len)
      arr[index] += arr[index+len];
   }
//
void reduce(double *arr) {
   uint64_t len = nthreads >> 1;
   while (1) {
      reduce_sum<<<blocks,threads>>>(arr,len);
      len = len >> 1;
      if (len == 0)
         return;
      }
   }
//
int main(int argc, char** argv) {
   int rank,nranks;
   MPI_Init(&argc,&argv);
   MPI_Comm_rank(MPI_COMM_WORLD,&rank);
   MPI_Comm_size(MPI_COMM_WORLD,&nranks);
   double *arr,result,pi;
   cudaMalloc(&arr,nthreads*sizeof(double));
   if (rank == 0) {
      for (int i = 0; i < nloop; ++i) {
         MPI_Barrier(MPI_COMM_WORLD);
         double tstart = MPI_Wtime();
         init<<<blocks,threads>>>(arr,npts,nthreads,rank);
         reduce(arr);
         cudaDeviceSynchronize();
         cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost);
         MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
         double tend = MPI_Wtime();
         double dt = tend-tstart;
         double gflops = npts*nthreads*nranks*5.0/dt/1e9;
         printf("npts = %ld, nthreads = %ld, nranks = %d, pi = %lf\n",npts,nthreads,nranks,pi);
         printf("time = %f, estimated GFlops = %f\n",dt,gflops);
         }
      }
   else {
      for (int i = 0; i < nloop; ++i) {
         MPI_Barrier(MPI_COMM_WORLD);
         init<<<blocks,threads>>>(arr,npts,nthreads,rank);
         reduce(arr);
         cudaDeviceSynchronize();
         cudaMemcpy(&result,arr,8,cudaMemcpyDeviceToHost);
         MPI_Reduce(&result,&pi,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
         }
      }
   cudaFree(arr);
   MPI_Finalize();
   return 0;
   }
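The quantity accumulated in `init` is a partial-fraction form of the Leibniz series, so the full sum converges to π:

```math
\sum_{j=1}^{\infty} \frac{1/2}{\left(j-\frac{3}{4}\right)\left(j-\frac{1}{4}\right)}
= \sum_{j=1}^{\infty} \left( \frac{4}{4j-3} - \frac{4}{4j-1} \right)
= 4 \left( 1 - \frac{1}{3} + \frac{1}{5} - \frac{1}{7} + \cdots \right)
= \pi
```

Each GPU thread sums `npts` consecutive terms, offset by `rank` so that ranks cover disjoint index ranges; `reduce` collapses the per-thread partials into `arr[0]` with a tree reduction on the device (which relies on `nthreads` being a power of two), and `MPI_Reduce` adds the per-rank results into `pi` on rank 0.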
README.md
...
@@ -3,6 +3,7 @@
|estimated GFlops|code|description|system|date|
|---|---|---|---|---|
|2,493,988|[cudampipi.cu](CUDA/cudampipi.cu)|C++, CUDA+MPI, 256 nodes, 1536 ranks<br>nvcc -arch=sm_70 -std=c++11|Oak Ridge OLCF Summit<br>IBM POWER9+NVIDIA V100|October 15, 2020|
|88,333|[mpimppi.c](hybrid/mpimppi.c)|C, MPI+OpenMP, 1024 nodes, 64 cores/node, 4 threads/core<br>cc mpimppi.c -o mpimppi -O3 -ffast-math -fopenmp|Argonne ALCF Theta<br>Cray XC40|Oct 9, 2019|
|12,589|[cudapit.cu](CUDA/cudapit.cu)|C++, CUDA, 8 GPUs, 5120 cores/GPU|NVIDIA V100|March 3, 2020|
|11,083|[mpithreadpi.cpp](hybrid/mpithreadpi.cpp)|C++, MPI+threads, 128 nodes, 64 cores/node, 4 threads/core<br>CC mpithreadpi.cpp -o mpithreadpi -O3 -ffast-math -std=c++11|Argonne ALCF Theta<br>Cray XC40|Mar 8, 2020|
...
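For the `cudampipi.cu` row, the `estimated GFlops` value is the rate the program itself prints. The inner loop of `init` performs five floating-point operations per series term (two subtractions, a multiply, a divide, and an accumulate), which appears to be what the factor of 5.0 in the host code counts:

```math
\text{GFlops} \approx \frac{5 \cdot \mathit{npts} \cdot \mathit{nthreads} \cdot \mathit{nranks}}{\Delta t \cdot 10^{9}}
```

With npts = 10^6, nthreads = 1024 × 1024, and nranks = 1536 (the Summit run above), each timed iteration works out to roughly 8 × 10^15 floating-point operations.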