-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatrix_add_kokkos.cpp
94 lines (76 loc) · 2.76 KB
/
matrix_add_kokkos.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>
#include <cuda_runtime.h>
#include <omp.h>
#include <iostream>
#include <iomanip>
// Function to get the number of GPUs available
int getNumGPUs() {
int numGPUs = 0;
cudaGetDeviceCount(&numGPUs);
return numGPUs;
}
// Matrix addition function with detailed output
double matrixAddMultiGPU(const int N, const int numGPUs) {
Kokkos::Timer timer;
// Use OpenMP to create threads for each GPU
#pragma omp parallel num_threads(numGPUs)
{
int gpu = omp_get_thread_num(); // GPU ID corresponds to thread ID
// Set the CUDA device for this thread
cudaSetDevice(gpu);
// Calculate the number of rows per GPU
int rowsPerGPU = N / numGPUs;
int remainder = N % numGPUs;
// Calculate the start and end rows for this GPU
int startRow = gpu * rowsPerGPU;
int endRow = startRow + rowsPerGPU;
if (gpu == numGPUs - 1) {
endRow += remainder; // Add remaining rows to the last GPU
}
// Allocate matrices A, B, and C
Kokkos::View<double**> A("A", N, N);
Kokkos::View<double**> B("B", N, N);
Kokkos::View<double**> C("C", N, N);
// Initialize A and B
Kokkos::parallel_for("Initialize A and B", Kokkos::RangePolicy<>(startRow, endRow), KOKKOS_LAMBDA(const int i) {
for (int j = 0; j < N; ++j) {
A(i, j) = i * N + j;
B(i, j) = (i * N + j) * 0.5;
}
});
// Perform matrix addition
Kokkos::parallel_for("Matrix Addition", Kokkos::RangePolicy<>(startRow, endRow), KOKKOS_LAMBDA(const int i) {
for (int j = 0; j < N; ++j) {
C(i, j) = A(i, j) + B(i, j);
}
});
// Synchronize threads
Kokkos::fence();
// Output which GPU completed which rows
#pragma omp critical
{
std::cout << "GPU " << gpu << " completed computation on rows " << startRow
<< " to " << endRow - 1 << std::endl;
}
}
// Return elapsed time
return timer.seconds();
}
int main(int argc, char* argv[]) {
// Initialize Kokkos
Kokkos::initialize(argc, argv);
{
int N = 3500; // Matrix size for this test
int numGPUs = getNumGPUs();
std::cout << "Matrix size: " << N << "x" << N << std::endl;
std::cout << "Number of GPUs: " << numGPUs << std::endl;
// Perform matrix addition
double elapsed = matrixAddMultiGPU(N, numGPUs);
std::cout << "Time taken for matrix addition with size " << N << "x" << N
<< " using " << numGPUs << " GPUs: " << elapsed << " seconds." << std::endl;
}
// Finalize Kokkos
Kokkos::finalize();
return 0;
}