diff --git a/README.md b/README.md
index d512f91..d5bc86e 100644
--- a/README.md
+++ b/README.md
@@ -63,9 +63,10 @@ Optional dependencies are:
 
 Compilers tested include:
 
-* gfortran-9.4.0
-* ifort-2021.4
-* ifx-2021.4
+* flang-new 20.0.0
+* gfortran 13.2.0, 14.0.1
+* ifort 2021.13.1
+* ifx 2024.2.1
 
 ### Building with fpm
 
@@ -85,7 +86,7 @@ Once installed, use the compiler wrappers `caf` and `cafrun` to build and execut
 in parallel, respectively:
 
 ```
-fpm build --compiler caf --profile release
+fpm build --compiler caf --profile release --flag "-cpp -DPARALLEL"
 ```
 
 #### Testing with fpm
@@ -107,7 +108,7 @@ See the [Fortran Package Manager](https://github.com/fortran-lang/fpm) for more
 
 ```
 mkdir build
 cd build
-cmake .. -DSERIAL=1
+cmake ..
 make
 ```
@@ -122,7 +123,7 @@ in parallel, respectively:
 
 ```
-FC=caf cmake ..
+FC=caf cmake .. -DPARALLEL
 make
 cafrun -n 4 bin/mnist # run MNIST example on 4 cores
 ```
@@ -139,7 +140,7 @@ FC=ifort cmake ..
 for a parallel build of neural-fortran, or
 
 ```
-FC=ifort cmake .. -DSERIAL=1
+FC=ifort cmake ..
 ```
 
 for a serial build.
diff --git a/cmake/compilers.cmake b/cmake/compilers.cmake
index 3fd608f..0abc163 100644
--- a/cmake/compilers.cmake
+++ b/cmake/compilers.cmake
@@ -1,11 +1,12 @@
 # compiler flags for gfortran
 if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
 
-  if(SERIAL)
-    message(STATUS "Configuring to build with -fcoarray=single")
-    add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:-fcoarray=single>")
+  if(PARALLEL)
+    message(STATUS "Configuring to build with -fcoarray=lib")
+    add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:-fcoarray=lib>")
+    add_compile_definitions(PARALLEL)
   else()
-    add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:-fcoarray=lib>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:-fcoarray=single>")
   endif()
 
   if(BLAS)
@@ -14,21 +15,22 @@ if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
     message(STATUS "Configuring build to use BLAS from ${BLAS}")
   endif()
 
-  add_compile_options("$<$<AND:$<COMPILE_LANGUAGE:Fortran>,$<CONFIG:Debug>>:-fcheck=bounds;-fbacktrace>")
-  add_compile_options("$<$<AND:$<COMPILE_LANGUAGE:Fortran>,$<CONFIG:Release>>:-Ofast;-fno-frontend-optimize;-fno-backtrace>")
+  add_compile_options("$<$<AND:$<COMPILE_LANGUAGE:Fortran>,$<CONFIG:Debug>>:-cpp;-fcheck=bounds;-fbacktrace>")
+  add_compile_options("$<$<AND:$<COMPILE_LANGUAGE:Fortran>,$<CONFIG:Release>>:-cpp;-Ofast;-fno-frontend-optimize;-fno-backtrace>")
 
 elseif(CMAKE_Fortran_COMPILER_ID MATCHES "^Intel")
   # compiler flags for ifort
 
-  if(SERIAL)
-    message(STATUS "Configuring to build with -coarray=single")
+  if(PARALLEL)
+    message(STATUS "Configuring to build with -coarray=shared")
     if(WIN32)
-      add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:/Qcoarray:single>")
-      add_link_options("$<$<COMPILE_LANGUAGE:Fortran>:/Qcoarray:single>")
+      add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:/Qcoarray:shared>")
+      add_link_options("$<$<COMPILE_LANGUAGE:Fortran>:/Qcoarray:shared>")
     else()
-      add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:-coarray=single>")
-      add_link_options("$<$<COMPILE_LANGUAGE:Fortran>:-coarray=single>")
+      add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:-coarray=shared>")
+      add_link_options("$<$<COMPILE_LANGUAGE:Fortran>:-coarray=shared>")
     endif()
+    add_compile_definitions(PARALLEL)
   else()
     if(WIN32)
       add_compile_options("$<$<COMPILE_LANGUAGE:Fortran>:/Qcoarray:shared>")
@@ -40,16 +42,16 @@ elseif(CMAKE_Fortran_COMPILER_ID MATCHES "^Intel")
   endif()
 
   if(WIN32)
-    string(APPEND CMAKE_Fortran_FLAGS " /assume:byterecl")
+    string(APPEND CMAKE_Fortran_FLAGS " /assume:byterecl /fpp")
   else()
-    string(APPEND CMAKE_Fortran_FLAGS " -assume byterecl")
+    string(APPEND CMAKE_Fortran_FLAGS " -assume byterecl -fpp")
   endif()
 
-  add_compile_options("$<$<AND:$<COMPILE_LANGUAGE:Fortran>,$<CONFIG:Debug>>:-check;-traceback>")
-  # add_compile_options("$<$<AND:$<COMPILE_LANGUAGE:Fortran>,$<CONFIG:Release>>:-O3>")
+  add_compile_options("$<$<AND:$<COMPILE_LANGUAGE:Fortran>,$<CONFIG:Debug>>:-fpp;-check;-traceback>")
+  add_compile_options("$<$<AND:$<COMPILE_LANGUAGE:Fortran>,$<CONFIG:Release>>:-fpp;-O3>")
 
 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
   # compiler flags for Cray ftn
   string(APPEND CMAKE_Fortran_FLAGS " -h noomp")
-  add_compile_options("$<$<AND:$<COMPILE_LANGUAGE:Fortran>,$<CONFIG:Debug>>:-O0;-g>")
add_compile_options("$<$,$>:-O3>") + add_compile_options("$<$,$>:-e Z;-O0;-g>") + add_compile_options("$<$,$>:-e Z;-O3>") endif() diff --git a/cmake/options.cmake b/cmake/options.cmake index 24bcf76..2f2b838 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -1,4 +1,4 @@ -option(SERIAL "Serial execution") +option(PARALLEL "Parallel execution") option(${PROJECT_NAME}_BUILD_TESTING "build ${PROJECT_NAME} tests" true) option(${PROJECT_NAME}_BUILD_EXAMPLES "build ${PROJECT_NAME} examples" true) @@ -8,10 +8,10 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -if(SERIAL) - message(STATUS "Configuring build for serial execution") -else() +if(PARALLEL) message(STATUS "Configuring build for parallel execution") +else() + message(STATUS "Configuring build for serial execution; configure with -DPARALLEL=1 for a parallel build") endif() # --- Generally useful CMake project options diff --git a/example/cnn_mnist.f90 b/example/cnn_mnist.f90 index fecb7de..bec50b8 100644 --- a/example/cnn_mnist.f90 +++ b/example/cnn_mnist.f90 @@ -40,9 +40,8 @@ program cnn_mnist optimizer=sgd(learning_rate=3.) & ) - if (this_image() == 1) & - print '(a,i2,a,f5.2,a)', 'Epoch ', n, ' done, Accuracy: ', accuracy( & - net, validation_images, label_digits(validation_labels)) * 100, ' %' + print '(a,i2,a,f5.2,a)', 'Epoch ', n, ' done, Accuracy: ', accuracy( & + net, validation_images, label_digits(validation_labels)) * 100, ' %' end do epochs diff --git a/example/dense_mnist.f90 b/example/dense_mnist.f90 index 3b0ec07..c26d0ce 100644 --- a/example/dense_mnist.f90 +++ b/example/dense_mnist.f90 @@ -24,9 +24,8 @@ program dense_mnist call net % print_info() - if (this_image() == 1) & - print '(a,f5.2,a)', 'Initial accuracy: ', accuracy( & - net, validation_images, label_digits(validation_labels)) * 100, ' %' + print '(a,f5.2,a)', 'Initial accuracy: ', accuracy( & + net, validation_images, label_digits(validation_labels)) * 100, ' %' epochs: do n = 1, num_epochs @@ -44,10 +43,9 @@ program dense_mnist ! 2 metrics; 1st is default loss function (quadratic), other is Pearson corr. output_metrics = net % evaluate(validation_images, label_digits(validation_labels), metric=corr()) mean_metrics = sum(output_metrics, 1) / size(output_metrics, 1) - if (this_image() == 1) & - print '(a,i2,3(a,f6.3))', 'Epoch ', n, ' done, Accuracy: ', & - accuracy(net, validation_images, label_digits(validation_labels)) * 100, & - '%, Loss: ', mean_metrics(1), ', Pearson correlation: ', mean_metrics(2) + print '(a,i2,3(a,f6.3))', 'Epoch ', n, ' done, Accuracy: ', & + accuracy(net, validation_images, label_digits(validation_labels)) * 100, & + '%, Loss: ', mean_metrics(1), ', Pearson correlation: ', mean_metrics(2) end block end do epochs diff --git a/src/nf/nf_activation.f90 b/src/nf/nf_activation.f90 index 237ce26..309b43d 100644 --- a/src/nf/nf_activation.f90 +++ b/src/nf/nf_activation.f90 @@ -557,7 +557,7 @@ pure function eval_3d_celu_prime(self, x) result(res) end where end function - pure function get_activation_by_name(activation_name) result(res) + function get_activation_by_name(activation_name) result(res) ! Workaround to get activation_function with some ! hardcoded default parameters by its name. ! 
     ! Need this function since we get only activation name
diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90
index 2f28603..4b79376 100644
--- a/src/nf/nf_conv2d_layer.f90
+++ b/src/nf/nf_conv2d_layer.f90
@@ -41,7 +41,7 @@ module nf_conv2d_layer
   end type conv2d_layer
 
   interface conv2d_layer
-    pure module function conv2d_layer_cons(filters, kernel_size, activation) &
+    module function conv2d_layer_cons(filters, kernel_size, activation) &
       result(res)
       !! `conv2d_layer` constructor function
       integer, intent(in) :: filters
diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90
index 5f71084..45a2c1d 100644
--- a/src/nf/nf_conv2d_layer_submodule.f90
+++ b/src/nf/nf_conv2d_layer_submodule.f90
@@ -7,7 +7,7 @@
 
 contains
 
-  pure module function conv2d_layer_cons(filters, kernel_size, activation) result(res)
+  module function conv2d_layer_cons(filters, kernel_size, activation) result(res)
     implicit none
     integer, intent(in) :: filters
     integer, intent(in) :: kernel_size
diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90
index c573579..862f4cd 100644
--- a/src/nf/nf_dense_layer.f90
+++ b/src/nf/nf_dense_layer.f90
@@ -42,7 +42,7 @@ module nf_dense_layer
   end type dense_layer
 
   interface dense_layer
-    elemental module function dense_layer_cons(output_size, activation) &
+    module function dense_layer_cons(output_size, activation) &
       result(res)
       !! This function returns the `dense_layer` instance.
       integer, intent(in) :: output_size
diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90
index 50d5b10..a424cf9 100644
--- a/src/nf/nf_dense_layer_submodule.f90
+++ b/src/nf/nf_dense_layer_submodule.f90
@@ -8,7 +8,7 @@
 
 contains
 
-  elemental module function dense_layer_cons(output_size, activation) &
+  module function dense_layer_cons(output_size, activation) &
     result(res)
     integer, intent(in) :: output_size
     class(activation_function), intent(in) :: activation
@@ -129,7 +129,9 @@ module subroutine init(self, input_shape)
     self % weights = self % weights / self % input_size
 
     ! Broadcast weights to all other images, if any.
+#ifdef PARALLEL
     call co_broadcast(self % weights, 1)
+#endif
 
     allocate(self % biases(self % output_size))
     self % biases = 0
diff --git a/src/nf/nf_layer_constructors.f90 b/src/nf/nf_layer_constructors.f90
index ce9a724..309be6e 100644
--- a/src/nf/nf_layer_constructors.f90
+++ b/src/nf/nf_layer_constructors.f90
@@ -12,7 +12,7 @@ module nf_layer_constructors
 
   interface input
 
-    pure module function input1d(layer_size) result(res)
+    module function input1d(layer_size) result(res)
       !! 1-d input layer constructor.
       !!
       !! This layer is for inputting 1-d data to the network.
@@ -35,7 +35,7 @@ pure module function input1d(layer_size) result(res)
        !! Resulting layer instance
     end function input1d
 
-    pure module function input3d(layer_shape) result(res)
+    module function input3d(layer_shape) result(res)
       !! 3-d input layer constructor.
       !!
       !! This layer is for inputting 3-d data to the network.
@@ -62,7 +62,7 @@ end function input3d
 
   interface
 
-    pure module function dense(layer_size, activation) result(res)
+    module function dense(layer_size, activation) result(res)
       !! Dense (fully-connected) layer constructor.
       !!
       !! This layer is a building block for dense, fully-connected networks,
@@ -85,7 +85,7 @@ pure module function dense(layer_size, activation) result(res)
       !! Resulting layer instance
     end function dense
 
-    pure module function flatten() result(res)
+    module function flatten() result(res)
       !! Flatten (3-d -> 1-d) layer constructor.
      !!
      !! Use this layer to chain layers with 3-d outputs to layers with 1-d
@@ -106,7 +106,7 @@ pure module function flatten() result(res)
       !! Resulting layer instance
     end function flatten
 
-    pure module function conv2d(filters, kernel_size, activation) result(res)
+    module function conv2d(filters, kernel_size, activation) result(res)
       !! 2-d convolutional layer constructor.
       !!
       !! This layer is for building 2-d convolutional network.
@@ -133,7 +133,7 @@ pure module function conv2d(filters, kernel_size, activation) result(res)
       !! Resulting layer instance
     end function conv2d
 
-    pure module function maxpool2d(pool_size, stride) result(res)
+    module function maxpool2d(pool_size, stride) result(res)
       !! 2-d maxpooling layer constructor.
       !!
       !! This layer is for downscaling other layers, typically `conv2d`.
@@ -155,7 +155,7 @@ pure module function maxpool2d(pool_size, stride) result(res)
       !! Resulting layer instance
     end function maxpool2d
 
-    pure module function reshape(output_shape) result(res)
+    module function reshape(output_shape) result(res)
       !! Rank-1 to rank-any reshape layer constructor.
       !! Currently implemented is only rank-3 for the output of the reshape.
       !!
diff --git a/src/nf/nf_layer_constructors_submodule.f90 b/src/nf/nf_layer_constructors_submodule.f90
index 91de36e..234b20b 100644
--- a/src/nf/nf_layer_constructors_submodule.f90
+++ b/src/nf/nf_layer_constructors_submodule.f90
@@ -14,7 +14,7 @@
 
 contains
 
-  pure module function conv2d(filters, kernel_size, activation) result(res)
+  module function conv2d(filters, kernel_size, activation) result(res)
    integer, intent(in) :: filters
    integer, intent(in) :: kernel_size
    class(activation_function), intent(in), optional :: activation
@@ -40,7 +40,7 @@ pure module function conv2d(filters, kernel_size, activation) result(res)
 
   end function conv2d
 
-  pure module function dense(layer_size, activation) result(res)
+  module function dense(layer_size, activation) result(res)
    integer, intent(in) :: layer_size
    class(activation_function), intent(in), optional :: activation
    type(layer) :: res
@@ -63,14 +63,14 @@ pure module function dense(layer_size, activation) result(res)
 
   end function dense
 
-  pure module function flatten() result(res)
+  module function flatten() result(res)
    type(layer) :: res
    res % name = 'flatten'
    allocate(res % p, source=flatten_layer())
  end function flatten
 
-  pure module function input1d(layer_size) result(res)
+  module function input1d(layer_size) result(res)
    integer, intent(in) :: layer_size
    type(layer) :: res
    res % name = 'input'
@@ -81,7 +81,7 @@ pure module function input1d(layer_size) result(res)
 
   end function input1d
 
-  pure module function input3d(layer_shape) result(res)
+  module function input3d(layer_shape) result(res)
    integer, intent(in) :: layer_shape(3)
    type(layer) :: res
    res % name = 'input'
@@ -91,7 +91,7 @@ pure module function input3d(layer_shape) result(res)
    res % initialized = .true.
  end function input3d
 
-  pure module function maxpool2d(pool_size, stride) result(res)
+  module function maxpool2d(pool_size, stride) result(res)
    integer, intent(in) :: pool_size
    integer, intent(in), optional :: stride
    integer :: stride_
@@ -119,7 +119,7 @@ pure module function maxpool2d(pool_size, stride) result(res)
 
   end function maxpool2d
 
-  pure module function reshape(output_shape) result(res)
+  module function reshape(output_shape) result(res)
    integer, intent(in) :: output_shape(:)
    type(layer) :: res
 
diff --git a/src/nf/nf_metrics.f90 b/src/nf/nf_metrics.f90
index 43bc12d..abc2dce 100644
--- a/src/nf/nf_metrics.f90
+++ b/src/nf/nf_metrics.f90
@@ -36,7 +36,7 @@ end function metric_interface
 
 contains
 
-  pure module function corr_eval(true, predicted) result(res)
+  pure function corr_eval(true, predicted) result(res)
    !! Pearson correlation function:
    !!
    real, intent(in) :: true(:)
diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90
index bcf10ae..8afa888 100644
--- a/src/nf/nf_network.f90
+++ b/src/nf/nf_network.f90
@@ -77,7 +77,7 @@ end function evaluate_batch_1d
 
   interface forward
 
-    pure module subroutine forward_1d(self, input)
+    module subroutine forward_1d(self, input)
      !! Apply a forward pass through the network.
      !!
      !! This changes the state of layers on the network.
@@ -91,7 +91,7 @@ pure module subroutine forward_1d(self, input)
        !! 1-d input data
     end subroutine forward_1d
 
-    pure module subroutine forward_3d(self, input)
+    module subroutine forward_3d(self, input)
      !! Apply a forward pass through the network.
      !!
      !! This changes the state of layers on the network.
@@ -153,7 +153,7 @@ end function predict_batch_3d
 
   interface
 
-    pure module subroutine backward(self, output, loss)
+    module subroutine backward(self, output, loss)
      !! Apply one backward pass through the network.
      !! This changes the state of layers on the network.
      !! Typically used only internally from the `train` method,
@@ -166,7 +166,7 @@ pure module subroutine backward(self, output, loss)
        !! Loss instance to use. If not provided, the default is quadratic().
     end subroutine backward
 
-    pure module integer function get_num_params(self)
+    module integer function get_num_params(self)
      !! Get the number of parameters in the network.
      class(network), intent(in) :: self
        !! Network instance
diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90
index a3404dc..140c922 100644
--- a/src/nf/nf_network_submodule.f90
+++ b/src/nf/nf_network_submodule.f90
@@ -93,7 +93,7 @@ module function network_from_layers(layers) result(res)
 
   end function network_from_layers
 
-  pure module subroutine backward(self, output, loss)
+  module subroutine backward(self, output, loss)
    class(network), intent(in out) :: self
    real, intent(in) :: output(:)
    class(loss_type), intent(in), optional :: loss
@@ -167,20 +167,20 @@ module function evaluate_batch_1d(self, input_data, output_data, metric) result(
 
    allocate(res(size(output, dim=1), n))
 
-    do concurrent (i = 1:size(output, dim=1))
+    do i = 1, size(output, dim=1)
      res(i,1) = self % loss % eval(output_data(i,:), output(i,:))
    end do
 
    if (.not. present(metric)) return
 
-    do concurrent (i = 1:size(output, dim=1))
+    do i = 1, size(output, dim=1)
      res(i,2) = metric % eval(output_data(i,:), output(i,:))
    end do
 
  end function evaluate_batch_1d
 
-  pure module subroutine forward_1d(self, input)
+  module subroutine forward_1d(self, input)
    class(network), intent(in out) :: self
    real, intent(in) :: input(:)
    integer :: n
@@ -197,7 +197,7 @@ pure module subroutine forward_1d(self, input)
 
  end subroutine forward_1d
 
-  pure module subroutine forward_3d(self, input)
+  module subroutine forward_3d(self, input)
    class(network), intent(in out) :: self
    real, intent(in) :: input(:,:,:)
    integer :: n
@@ -273,7 +273,7 @@ module function predict_batch_1d(self, input) result(res)
 
    allocate(res(output_size, batch_size))
 
-    batch: do concurrent(i = 1:size(res, dim=2))
+    batch: do i = 1, size(res, dim=2)
 
      call self % forward(input(:,i))
 
@@ -303,7 +303,7 @@ module function predict_batch_3d(self, input) result(res)
 
    allocate(res(output_size, batch_size))
 
-    batch: do concurrent(i = 1:batch_size)
+    batch: do i = 1, batch_size
 
      call self % forward(input(:,:,:,i))
 
@@ -330,7 +330,7 @@ module subroutine print_info(self)
 
  end subroutine print_info
 
-  pure module function get_num_params(self)
+  module function get_num_params(self)
    class(network), intent(in) :: self
    integer :: get_num_params
 
@@ -443,15 +443,17 @@ module subroutine train(self, input_data, output_data, batch_size, &
      call random_number(pos)
      batch_start = int(pos * (dataset_size - batch_size + 1)) + 1
 
+#ifdef PARALLEL
      ! FIXME shuffle in a way that doesn't require co_broadcast
      call co_broadcast(batch_start, 1)
+#endif
 
      ! Distribute the batch in nearly equal pieces to all images
      indices = tile_indices(batch_size)
      istart = indices(1) + batch_start - 1
      iend = indices(2) + batch_start - 1
 
-      do concurrent(j = istart:iend)
+      do j = istart, iend
        call self % forward(input_data(:,j))
        call self % backward(output_data(:,j))
      end do
@@ -494,6 +496,7 @@ module subroutine update(self, optimizer, batch_size)
      batch_size_ = 1
    end if
 
+#ifdef PARALLEL
    ! Sum weight and bias gradients across images, if any
    do n = 2, size(self % layers)
      select type(this_layer => self % layers(n) % p)
@@ -505,13 +508,14 @@ module subroutine update(self, optimizer, batch_size)
        call co_sum(this_layer % db)
      end select
    end do
+#endif
 
    params = self % get_params()
    call self % optimizer % minimize(params, self % get_gradients() / batch_size_)
    call self % set_params(params)
 
    ! Flush network gradients to zero.
-    do concurrent(n = 2:size(self % layers))
+    do n = 2, size(self % layers)
      select type(this_layer => self % layers(n) % p)
      type is(dense_layer)
        this_layer % dw = 0
diff --git a/src/nf/nf_parallel_submodule.f90 b/src/nf/nf_parallel_submodule.f90
index 6af1b57..04f1770 100644
--- a/src/nf/nf_parallel_submodule.f90
+++ b/src/nf/nf_parallel_submodule.f90
@@ -1,3 +1,8 @@
+#ifndef PARALLEL
+#define num_images() 1
+#define this_image() 1
+#endif
+
 submodule(nf_parallel) nf_parallel_submodule
  implicit none
contains
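
Net effect of the hunks above: serial becomes the default build mode, the `SERIAL` CMake option is replaced by `PARALLEL`, coarray statements (`co_broadcast`, `co_sum`) compile only under `#ifdef PARALLEL`, and `nf_parallel_submodule.f90` preprocesses `num_images()`/`this_image()` to 1 when the macro is absent. The commands below put the two build modes side by side; this is a sketch assembled from the README hunks above, and it assumes OpenCoarrays provides the `caf` and `cafrun` wrappers on your PATH:

```
# Serial build (the new default): no coarray flags, no PARALLEL macro
mkdir build && cd build
cmake ..
make

# Parallel build with CMake: defines PARALLEL and enables the coarray flags
FC=caf cmake .. -DPARALLEL=1
make
cafrun -n 4 bin/mnist  # run the MNIST example on 4 images

# Parallel build with fpm: PARALLEL must be passed through the preprocessor
fpm build --compiler caf --profile release --flag "-cpp -DPARALLEL"
```

Mapping `num_images()` and `this_image()` to 1 in the preprocessor lets `tile_indices` and the training loop compile unchanged in both modes, so the serial path carries no coarray runtime dependency.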