Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
pub
pi
Commits
29d3f0dd
Commit
29d3f0dd
authored
Jul 14, 2020
by
Neil Gershenfeld
Browse files
try peer
parent
f932159b
Pipeline
#5659
passed with stage
in 1 second
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
CUDA/cudapip.cu
0 → 100755
View file @
29d3f0dd
//
// cudapip.cu
// Neil Gershenfeld 7/14/20
// calculation of pi by a CUDA multi-GPU peer sum
// pi = 3.14159265358979323846
//
#include <iostream>
#include <chrono>
#include <cstdint>
uint64_t
blocks
=
1024
;
uint64_t
threads
=
1024
;
uint64_t
nloop
=
1000000
;
uint64_t
npts
=
blocks
*
threads
;
using
namespace
std
;
__global__
void
init
(
double
*
arr
,
uint64_t
nloop
,
uint64_t
npts
,
int
ngpus
,
int
index
)
{
uint64_t
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
uint64_t
start
=
nloop
*
i
+
npts
*
nloop
*
index
+
1
;
uint64_t
end
=
nloop
*
(
i
+
1
)
+
npts
*
nloop
*
index
+
1
;
arr
[
i
+
index
*
npts
]
=
0
;
for
(
uint64_t
j
=
start
;
j
<
end
;
++
j
)
arr
[
i
+
index
*
npts
]
+=
0.5
/
((
j
-
0.75
)
*
(
j
-
0.25
));
}
void
cudaCheck
(
string
msg
)
{
cudaError
err
;
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
cerr
<<
msg
<<
": "
<<
cudaGetErrorString
(
err
)
<<
endl
;
}
int
main
(
void
)
{
double
*
arr
,
*
darr
;
int
ngpus
;
cudaGetDeviceCount
(
&
ngpus
);
arr
=
new
double
[
ngpus
*
npts
];
cudaSetDevice
(
0
);
cudaMalloc
(
&
darr
,
ngpus
*
npts
*
sizeof
(
double
));
for
(
int
i
=
1
;
i
<
ngpus
;
++
i
)
{
cudaSetDevice
(
i
);
cudaDeviceEnablePeerAccess
(
0
,
0
);
cudaCheck
(
"peer access"
);
}
auto
tstart
=
chrono
::
high_resolution_clock
::
now
();
for
(
int
i
=
0
;
i
<
ngpus
;
++
i
)
{
cudaSetDevice
(
i
);
init
<<<
blocks
,
threads
>>>
(
darr
,
nloop
,
npts
,
ngpus
,
i
);
}
for
(
int
i
=
1
;
i
<
ngpus
;
++
i
)
{
cudaSetDevice
(
i
);
cudaDeviceSynchronize
();
}
cudaSetDevice
(
0
);
cudaMemcpy
(
arr
,
darr
,
ngpus
*
npts
*
sizeof
(
double
),
cudaMemcpyDeviceToHost
);
double
pi
=
0
;
for
(
int
i
=
0
;
i
<
ngpus
*
npts
;
++
i
)
pi
+=
arr
[
i
];
auto
tend
=
chrono
::
high_resolution_clock
::
now
();
auto
dt
=
chrono
::
duration_cast
<
std
::
chrono
::
microseconds
>
(
tend
-
tstart
).
count
();
auto
gflops
=
npts
*
nloop
*
ngpus
*
5.0
/
dt
/
1e3
;
std
::
cout
<<
"npts: "
<<
npts
<<
" nloop: "
<<
nloop
<<
" ngpus: "
<<
ngpus
<<
" pi: "
<<
pi
<<
'\n'
;
std
::
cout
<<
"time: "
<<
1e-6
*
dt
<<
" estimated GFlops: "
<<
gflops
<<
'\n'
;
return
0
;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment