From 6dfaed004f4d8ecabe8738a753d87c39ffbd93dc Mon Sep 17 00:00:00 2001
From: Jeremie Vandenplas
Date: Thu, 9 May 2024 13:11:49 -0400
Subject: [PATCH] Replacement of a matmul + use of merge (#181)

* dense_layer: replace a matmul(reshape) by a do concurrent

* nf_activation: replace some where statements by merge intrinsic

* Set correct size for self%gradient in dense_layer

* remove some unneeded pack()

* Remove notes on -fno-frontend-optimize (no longer necessary)

* Bump patch version

---------

Co-authored-by: Vandenplas, Jeremie
Co-authored-by: milancurcic
---
 README.md                            | 12 ++++------
 fpm.toml                             |  2 +-
 src/nf/nf_activation.f90             | 36 +++++-----------------
 src/nf/nf_conv2d_layer_submodule.f90 |  4 ++--
 src/nf/nf_dense_layer_submodule.f90  | 14 +++++++----
 5 files changed, 23 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index 2b05e56..f8b2174 100644
--- a/README.md
+++ b/README.md
@@ -80,23 +80,21 @@ With gfortran, the following will create an optimized build of neural-fortran:
 
 ```
 fpm build \
   --profile release \
-  --flag "-fno-frontend-optimize -I$HDF5INC -L$HDF5LIB"
+  --flag "-I$HDF5INC -L$HDF5LIB"
 ```
 
 HDF5 is now a required dependency, so you have to provide it to fpm.
 The above command assumes that the `HDF5INC` and `HDF5LIB` environment
 variables are set to the include and library paths, respectively, of your
 HDF5 install.
 
-The `-fno-frontend-optimize` disables some optimizations that may be harmful
-when building neural-fortran.
 
 If you use Conda, the following instructions work:
 
 ```
 conda create -n nf hdf5
 conda activate nf
-fpm build --profile release --flag "-fno-frontend-optimize -I$CONDA_PREFIX/include -L$CONDA_PREFIX/lib -Wl,-rpath -Wl,$CONDA_PREFIX/lib"
-fpm test --profile release --flag "-fno-frontend-optimize -I$CONDA_PREFIX/include -L$CONDA_PREFIX/lib -Wl,-rpath -Wl,$CONDA_PREFIX/lib"
+fpm build --profile release --flag "-I$CONDA_PREFIX/include -L$CONDA_PREFIX/lib -Wl,-rpath -Wl,$CONDA_PREFIX/lib"
+fpm test --profile release --flag "-I$CONDA_PREFIX/include -L$CONDA_PREFIX/lib -Wl,-rpath -Wl,$CONDA_PREFIX/lib"
 ```
 
 #### Building in parallel mode
@@ -110,7 +108,7 @@ in parallel, respectively:
 fpm build \
   --compiler caf \
   --profile release \
-  --flag "-fno-frontend-optimize -I$HDF5INC -L$HDF5LIB"
+  --flag "-I$HDF5INC -L$HDF5LIB"
 ```
 
 #### Testing with fpm
@@ -118,7 +116,7 @@ fpm build \
 ```
 fpm test \
   --profile release \
-  --flag "-fno-frontend-optimize -I$HDF5INC -L$HDF5LIB"
+  --flag "-I$HDF5INC -L$HDF5LIB"
 ```
 
 For the time being, you need to specify the same compiler flags to `fpm test`
diff --git a/fpm.toml b/fpm.toml
index 5252ab8..4fc21b3 100644
--- a/fpm.toml
+++ b/fpm.toml
@@ -1,5 +1,5 @@
 name = "neural-fortran"
-version = "0.16.0"
+version = "0.16.1"
 license = "MIT"
 author = "Milan Curcic"
 maintainer = "milancurcic@hey.com"
diff --git a/src/nf/nf_activation.f90 b/src/nf/nf_activation.f90
index 67034a3..e413243 100644
--- a/src/nf/nf_activation.f90
+++ b/src/nf/nf_activation.f90
@@ -295,11 +295,7 @@ pure function eval_1d_relu_prime(self, x) result(res)
     class(relu), intent(in) :: self
     real, intent(in) :: x(:)
     real :: res(size(x))
-    where (x > 0)
-      res = 1
-    elsewhere
-      res = 0
-    end where
+    res = merge(1., 0., x > 0)
   end function eval_1d_relu_prime
 
   pure function eval_3d_relu(self, x) result(res)
@@ -315,11 +311,7 @@ pure function eval_3d_relu_prime(self, x) result(res)
     class(relu), intent(in) :: self
     real, intent(in) :: x(:,:,:)
     real :: res(size(x,1),size(x,2),size(x,3))
-    where (x > 0)
-      res = 1
-    elsewhere
-      res = 0
-    end where
+    res = merge(1., 0., x > 0)
   end function eval_3d_relu_prime
 
   pure function eval_1d_leaky_relu(self, x) result(res)
@@ -335,11 +327,7 @@ pure function eval_1d_leaky_relu_prime(self, x) result(res)
     class(leaky_relu), intent(in) :: self
     real, intent(in) :: x(:)
     real :: res(size(x))
-    where (x > 0)
-      res = 1
-    elsewhere
-      res = self % alpha
-    end where
+    res = merge(1., self%alpha, x > 0)
   end function eval_1d_leaky_relu_prime
 
   pure function eval_3d_leaky_relu(self, x) result(res)
@@ -355,11 +343,7 @@ pure function eval_3d_leaky_relu_prime(self, x) result(res)
     class(leaky_relu), intent(in) :: self
     real, intent(in) :: x(:,:,:)
     real :: res(size(x,1),size(x,2),size(x,3))
-    where (x > 0)
-      res = 1
-    elsewhere
-      res = self % alpha
-    end where
+    res = merge(1., self%alpha, x > 0)
   end function eval_3d_leaky_relu_prime
 
   pure function eval_1d_sigmoid(self, x) result(res)
@@ -465,11 +449,7 @@ pure function eval_1d_step(self, x) result(res)
     class(step), intent(in) :: self
     real, intent(in) :: x(:)
    real :: res(size(x))
-    where (x > 0)
-      res = 1
-    elsewhere
-      res = 0
-    end where
+    res = merge(1., 0., x > 0)
   end function eval_1d_step
 
   pure function eval_1d_step_prime(self, x) result(res)
@@ -485,11 +465,7 @@ pure function eval_3d_step(self, x) result(res)
     class(step), intent(in) :: self
     real, intent(in) :: x(:,:,:)
     real :: res(size(x,1),size(x,2),size(x,3))
-    where (x > 0)
-      res = 1
-    elsewhere
-      res = 0
-    end where
+    res = merge(1., 0., x > 0)
   end function eval_3d_step
 
   pure function eval_3d_step_prime(self, x) result(res)
diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90
index a1733c4..b480424 100644
--- a/src/nf/nf_conv2d_layer_submodule.f90
+++ b/src/nf/nf_conv2d_layer_submodule.f90
@@ -195,7 +195,7 @@ pure module function get_params(self) result(params)
 
     params = [ &
       pack(self % kernel, .true.), &
-      pack(self % biases, .true.) &
+      self % biases &
     ]
 
   end function get_params
@@ -207,7 +207,7 @@ pure module function get_gradients(self) result(gradients)
 
     gradients = [ &
       pack(self % dw, .true.), &
-      pack(self % db, .true.) &
+      self % db &
     ]
 
   end function get_gradients
diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90
index 5944f0f..4be23e3 100644
--- a/src/nf/nf_dense_layer_submodule.f90
+++ b/src/nf/nf_dense_layer_submodule.f90
@@ -27,11 +27,15 @@ pure module subroutine backward(self, input, gradient)
     real, intent(in) :: gradient(:)
     real :: db(self % output_size)
     real :: dw(self % input_size, self % output_size)
+    integer :: i
 
     db = gradient * self % activation % eval_prime(self % z)
-    dw = matmul(reshape(input, [size(input), 1]), reshape(db, [1, size(db)]))
+!   dw = matmul(reshape(input, [size(input), 1]), reshape(db, [1, size(db)]))
+    do concurrent (i = 1:size(db))
+      self % dw(:,i) = self % dw(:,i) + input(:) * db(i)
+    enddo
     self % gradient = matmul(self % weights, db)
-    self % dw = self % dw + dw
+!   self % dw = self % dw + dw
     self % db = self % db + db
 
   end subroutine backward
@@ -63,7 +67,7 @@ pure module function get_params(self) result(params)
 
     params = [ &
      pack(self % weights, .true.), &
-      pack(self % biases, .true.) &
+      self % biases &
     ]
 
   end function get_params
@@ -75,7 +79,7 @@ pure module function get_gradients(self) result(gradients)
 
     gradients = [ &
      pack(self % dw, .true.), &
-      pack(self % db, .true.) &
+      self % db &
     ]
 
   end function get_gradients
@@ -135,7 +139,7 @@ module subroutine init(self, input_shape)
     allocate(self % db(self % output_size))
     self % db = 0
 
-    allocate(self % gradient(self % output_size))
+    allocate(self % gradient(self % input_size))
     self % gradient = 0
 
   end subroutine init
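
For reference, the two refactoring patterns applied in this patch (where/elsewhere blocks replaced by the merge intrinsic, and an outer product built with matmul(reshape(...)) replaced by a do concurrent loop) can be checked in isolation. The program below is a minimal illustrative sketch, not part of the patch; the program name and all variables are made up for the demonstration.

program demo_refactor_patterns
  ! Illustrative sketch only: verifies that the two replacements used in the
  ! patch reproduce the results of the constructs they replace.
  implicit none
  real :: x(5), res_where(5), res_merge(5)
  real :: input(3), db(2), dw_matmul(3,2), dw_loop(3,2)
  integer :: i

  x = [-2., -1., 0., 1., 2.]

  ! Original pattern: where/elsewhere assignment
  where (x > 0)
    res_where = 1
  elsewhere
    res_where = 0
  end where

  ! Replacement: a single elemental merge
  res_merge = merge(1., 0., x > 0)

  print *, 'where vs merge agree:          ', all(res_where == res_merge)

  ! Original pattern: outer product via matmul of reshaped arrays
  input = [1., 2., 3.]
  db = [10., 20.]
  dw_matmul = matmul(reshape(input, [3, 1]), reshape(db, [1, 2]))

  ! Replacement: accumulate the outer product column by column
  dw_loop = 0
  do concurrent (i = 1:size(db))
    dw_loop(:,i) = dw_loop(:,i) + input(:) * db(i)
  end do

  print *, 'matmul vs do concurrent agree: ', all(abs(dw_matmul - dw_loop) < 1e-6)

end program demo_refactor_patterns

Both comparisons should print T: merge(1., 0., x > 0) is elementally equivalent to the where/elsewhere construct, and the do concurrent loop builds the same rank-2 outer product column by column while also letting the accumulation into self % dw happen in place.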