diff --git a/regtest/basic/rt0-gpu/COLVAR b/regtest/basic/rt0-gpu/COLVAR
new file mode 100644
index 0000000000..a4a9fa7d4a
--- /dev/null
+++ b/regtest/basic/rt0-gpu/COLVAR
@@ -0,0 +1 @@
+# check that this file is backupped
diff --git a/regtest/basic/rt0-gpu/COLVAR.reference b/regtest/basic/rt0-gpu/COLVAR.reference
new file mode 100644
index 0000000000..9a6ab24410
--- /dev/null
+++ b/regtest/basic/rt0-gpu/COLVAR.reference
@@ -0,0 +1,6 @@
+#! FIELDS time rmsd0 rmsds0 rmsd1 rmsds1 rmsd2 rmsds2 rmsd3 rmsds3 rmsd4 rmsds4 sum @12.bias @12.force2
+ 0.000000  1.496  1.643  1.496  1.643  1.337  1.486  1.336  1.486  2.329  2.130 16.381  6.550 13.100
+ 0.050000  1.502  1.650  1.503  1.650  1.347  1.497  1.347  1.496  2.332  2.128 16.452  6.294 12.589
+ 0.100000  1.503  1.653  1.504  1.653  1.351  1.502  1.350  1.501  2.324  2.120 16.461  6.262 12.525
+ 0.150000  1.502  1.653  1.502  1.654  1.350  1.503  1.350  1.503  2.308  2.108 16.433  6.361 12.721
+ 0.200000  1.496  1.649  1.496  1.649  1.347  1.500  1.346  1.499  2.278  2.080 16.340  6.696 13.392
diff --git a/regtest/basic/rt0-gpu/Makefile b/regtest/basic/rt0-gpu/Makefile
new file mode 100644
index 0000000000..3703b27cea
--- /dev/null
+++ b/regtest/basic/rt0-gpu/Makefile
@@ -0,0 +1 @@
+include ../../scripts/test.make
diff --git a/regtest/basic/rt0-gpu/bck.0.COLVAR.reference b/regtest/basic/rt0-gpu/bck.0.COLVAR.reference
new file mode 100644
index 0000000000..a4a9fa7d4a
--- /dev/null
+++ b/regtest/basic/rt0-gpu/bck.0.COLVAR.reference
@@ -0,0 +1 @@
+# check that this file is backupped
diff --git a/regtest/basic/rt0-gpu/config b/regtest/basic/rt0-gpu/config
new file mode 100644
index 0000000000..be67468de8
--- /dev/null
+++ b/regtest/basic/rt0-gpu/config
@@ -0,0 +1,6 @@
+type=driver
+# this is to test a different name
+arg="--plumed plumed.dat --trajectory-stride 10 --timestep 0.005 --ixyz trajectory.xyz --dump-forces forces --dump-forces-fmt=%8.4f --restart"
+extra_files="../../trajectories/trajectory.xyz"
+plumed_needs="arrayfire"
+PLUMED_ALLOW_SKIP_ON_TRAVIS=yes
\ No newline at end of file
diff --git a/regtest/basic/rt0-gpu/forces.reference b/regtest/basic/rt0-gpu/forces.reference
new file mode 100644
index 0000000000..844853373c
--- /dev/null
+++ b/regtest/basic/rt0-gpu/forces.reference
@@ -0,0 +1,550 @@
+108
+ -3.6537 -23.9554 -30.6233
+X   0.0000   0.0000   0.0000
+X   0.8129  -2.0125  -1.8501
+X   0.0000   0.0000   0.0000
+X  -0.5094  -0.9374  -2.0431
+X   0.0000   0.0000   0.0000
+X   0.5404  -1.8349   0.1546
+X   0.5004  -0.8763  -0.9617
+X  -0.9351  -1.1169   0.2720
+X  -0.4854  -2.1684   1.0329
+X   0.4763  -2.2261   2.0409
+X   0.0000   0.0000   0.0000
+X  -0.3397  -1.1427   2.0762
+X   0.0000   0.0000   0.0000
+X   0.4771   0.8746  -1.7698
+X   0.0000   0.0000   0.0000
+X  -0.7138   2.5481  -2.8942
+X   0.0000   0.0000   0.0000
+X   0.3510   0.8475   0.1527
+X   0.5913   2.7041  -1.0864
+X  -0.7376   2.7042   0.2320
+X  -0.7197   1.2681   1.5213
+X   0.6914   1.3686   3.1226
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+108
+ -3.6097 -23.4265 -30.3767
+X   0.0000   0.0000   0.0000
+X   0.8263  -1.9997  -1.8213
+X   0.0000   0.0000   0.0000
+X  -0.5079  -0.8670  -2.0396
+X   0.0000   0.0000   0.0000
+X   0.5076  -1.7401   0.1283
+X   0.5236  -0.8420  -0.9629
+X  -0.9936  -1.2450   0.2600
+X  -0.5684  -2.1038   1.0153
+X   0.3845  -2.1673   1.9982
+X   0.0000   0.0000   0.0000
+X  -0.3383  -1.0899   2.0200
+X   0.0000   0.0000   0.0000
+X   0.4291   0.8627  -1.7565
+X   0.0000   0.0000   0.0000
+X  -0.6496   2.4891  -2.8679
+X   0.0000   0.0000   0.0000
+X   0.2830   0.7679   0.1178
+X   0.5860   2.6269  -0.9610
+X  -0.6743   2.6856   0.3024
+X  -0.5560   1.2417   1.5061
+X   0.7480   1.3809   3.0611
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+108
+ -3.8169 -23.1693 -30.3205
+X   0.0000   0.0000   0.0000
+X   0.8816  -2.0323  -1.7946
+X   0.0000   0.0000   0.0000
+X  -0.5597  -0.8440  -2.0441
+X   0.0000   0.0000   0.0000
+X   0.5068  -1.6757   0.1353
+X   0.5877  -0.8014  -0.9633
+X  -1.0568  -1.3227   0.2104
+X  -0.6328  -2.1181   1.0457
+X   0.2900  -2.1384   1.9801
+X   0.0000   0.0000   0.0000
+X  -0.2498  -1.0810   2.0413
+X   0.0000   0.0000   0.0000
+X   0.4287   0.8766  -1.7525
+X   0.0000   0.0000   0.0000
+X  -0.6185   2.4985  -2.8929
+X   0.0000   0.0000   0.0000
+X   0.3006   0.7210   0.0988
+X   0.5727   2.5586  -0.9302
+X  -0.7280   2.6416   0.3691
+X  -0.4722   1.2371   1.4881
+X   0.7497   1.4802   3.0089
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+108
+ -4.2180 -23.0710 -30.3148
+X   0.0000   0.0000   0.0000
+X   0.9800  -2.0435  -1.7618
+X   0.0000   0.0000   0.0000
+X  -0.6155  -0.8580  -2.0768
+X   0.0000   0.0000   0.0000
+X   0.5316  -1.6385   0.1667
+X   0.6465  -0.7953  -0.9931
+X  -1.1237  -1.3029   0.2106
+X  -0.7018  -2.1617   1.0736
+X   0.2124  -2.1681   1.9788
+X   0.0000   0.0000   0.0000
+X  -0.1526  -1.1097   2.0794
+X   0.0000   0.0000   0.0000
+X   0.4605   0.9036  -1.7455
+X   0.0000   0.0000   0.0000
+X  -0.6030   2.5214  -2.9309
+X   0.0000   0.0000   0.0000
+X   0.3564   0.6945   0.1017
+X   0.5674   2.5415  -0.9662
+X  -0.7997   2.5898   0.4174
+X  -0.4888   1.2679   1.4785
+X   0.7302   1.5590   2.9674
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+108
+ -4.6840 -23.2396 -30.7391
+X   0.0000   0.0000   0.0000
+X   1.0667  -2.0511  -1.7927
+X   0.0000   0.0000   0.0000
+X  -0.6627  -0.8873  -2.1631
+X   0.0000   0.0000   0.0000
+X   0.5742  -1.6315   0.2008
+X   0.6715  -0.8239  -1.0595
+X  -1.2003  -1.2170   0.2710
+X  -0.7572  -2.2573   1.0986
+X   0.1630  -2.2602   2.0289
+X   0.0000   0.0000   0.0000
+X  -0.0912  -1.1643   2.1521
+X   0.0000   0.0000   0.0000
+X   0.5296   0.9238  -1.7478
+X   0.0000   0.0000   0.0000
+X  -0.5774   2.5939  -2.9905
+X   0.0000   0.0000   0.0000
+X   0.4352   0.6851   0.1088
+X   0.5771   2.5823  -1.0172
+X  -0.8899   2.5799   0.4292
+X  -0.5882   1.3165   1.5149
+X   0.7495   1.6111   2.9667
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
+X   0.0000   0.0000   0.0000
diff --git a/regtest/basic/rt0-gpu/plumed.dat b/regtest/basic/rt0-gpu/plumed.dat
new file mode 100644
index 0000000000..1ee1b859aa
--- /dev/null
+++ b/regtest/basic/rt0-gpu/plumed.dat
@@ -0,0 +1,40 @@
+RESTART NO # revert command line option --restart
+
+rmsd0:   RMSD TYPE=OPTIMAL REFERENCE=test0.pdb NOPBC GPU=on DEVICEID=0
+rmsds0:  RMSD TYPE=SIMPLE  REFERENCE=test0.pdb NOPBC GPU=on DEVICEID=0
+rmsd1:   RMSD TYPE=OPTIMAL REFERENCE=test1.pdb NOPBC GPU=on DEVICEID=0
+rmsds1:  RMSD TYPE=SIMPLE  REFERENCE=test1.pdb NOPBC GPU=on DEVICEID=0
+rmsd2:   RMSD TYPE=OPTIMAL REFERENCE=test2.pdb NOPBC GPU=on DEVICEID=0
+rmsds2:  RMSD TYPE=SIMPLE  REFERENCE=test2.pdb NOPBC GPU=on DEVICEID=0
+rmsd3:   RMSD TYPE=OPTIMAL REFERENCE=test3.pdb NOPBC GPU=on DEVICEID=0
+rmsds3:  RMSD TYPE=SIMPLE  REFERENCE=test3.pdb NOPBC GPU=on DEVICEID=0
+rmsd4:   RMSD TYPE=OPTIMAL REFERENCE=test4.pdb NOPBC GPU=on DEVICEID=0
+rmsds4:  RMSD TYPE=SIMPLE  REFERENCE=test4.pdb NOPBC GPU=on DEVICEID=0
+
+sum: COMBINE ARG=* PERIODIC=NO
+
+RESTRAINT ARG=sum AT=20.0 KAPPA=1.0
+
+PRINT ...
+  STRIDE=1
+  ARG=*
+  FILE=COLVAR FMT=%6.3f
+... PRINT
+
+##  # This is to check numerical derivatives:
+##  rmsd0n:   RMSD TYPE=OPTIMAL REFERENCE=test0.pdb NUMERICAL_DERIVATIVES
+##  rmsd1n:   RMSD TYPE=OPTIMAL REFERENCE=test1.pdb NUMERICAL_DERIVATIVES
+##  rmsd2n:   RMSD TYPE=OPTIMAL REFERENCE=test2.pdb NUMERICAL_DERIVATIVES
+##  rmsd3n:   RMSD TYPE=OPTIMAL REFERENCE=test3.pdb NUMERICAL_DERIVATIVES
+##  rmsd4n:   RMSD TYPE=OPTIMAL REFERENCE=test4.pdb NUMERICAL_DERIVATIVES
+##  DUMPDERIVATIVES ARG=rmsd0,rmsd0n FILE=check0
+##  DUMPDERIVATIVES ARG=rmsd1,rmsd1n FILE=check1
+##  DUMPDERIVATIVES ARG=rmsd2,rmsd2n FILE=check2
+##  DUMPDERIVATIVES ARG=rmsd3,rmsd3n FILE=check3
+##  DUMPDERIVATIVES ARG=rmsd4,rmsd4n FILE=check4
+
+DEBUG DETAILED_TIMERS
+
+ENDPLUMED
+
+text here should be ignored
diff --git a/regtest/basic/rt0-gpu/test0.pdb b/regtest/basic/rt0-gpu/test0.pdb
new file mode 100644
index 0000000000..aa13f04635
--- /dev/null
+++ b/regtest/basic/rt0-gpu/test0.pdb
@@ -0,0 +1,15 @@
+ATOM      2  O   ALA     2      -0.926  -2.447  -0.497  0.00  0.00      DIA  O
+ATOM      4  HNT ALA     2       0.533  -0.396   1.184  0.00  0.00      DIA  H
+ATOM      6  HT1 ALA     2      -0.216  -2.590   1.371  0.00  0.00      DIA  H
+ATOM      7  HT2 ALA     2      -0.309  -1.255   2.315  0.00  0.00      DIA  H
+ATOM      8  HT3 ALA     2      -1.480  -1.560   1.212  0.00  0.00      DIA  H
+ATOM      9  CAY ALA     2      -0.096   2.144  -0.669  0.00  0.00      DIA  C
+ATOM     10  HY1 ALA     2       0.871   2.385  -0.588  0.00  0.00      DIA  H
+ATOM     12  HY3 ALA     2      -0.520   2.679  -1.400  0.00  0.00      DIA  H
+ATOM     14  OY  ALA     2      -1.139   0.931  -0.973  0.00  0.00      DIA  O
+ATOM     16  HN  ALA     2       1.713   1.021  -0.873  0.00  0.00      DIA  H
+ATOM     18  HA  ALA     2       0.099  -0.774  -2.218  0.00  0.00      DIA  H
+ATOM     19  CB  ALA     2       2.063  -1.223  -1.276  0.00  0.00      DIA  C
+ATOM     20  HB1 ALA     2       2.670  -0.716  -2.057  0.00  0.00      DIA  H
+ATOM     21  HB2 ALA     2       2.556  -1.051  -0.295  0.00  0.00      DIA  H
+ATOM     22  HB3 ALA     2       2.070  -2.314  -1.490  0.00  0.00      DIA  H
diff --git a/regtest/basic/rt0-gpu/test1.pdb b/regtest/basic/rt0-gpu/test1.pdb
new file mode 100644
index 0000000000..924aa4b057
--- /dev/null
+++ b/regtest/basic/rt0-gpu/test1.pdb
@@ -0,0 +1,15 @@
+ATOM      2  O   ALA     2      -0.926  -2.447  -0.497  1.00  1.01      DIA  O
+ATOM      4  HNT ALA     2       0.533  -0.396   1.184  1.00  1.00      DIA  H
+ATOM      6  HT1 ALA     2      -0.216  -2.590   1.371  1.00  1.00      DIA  H
+ATOM      7  HT2 ALA     2      -0.309  -1.255   2.315  1.00  1.00      DIA  H
+ATOM      8  HT3 ALA     2      -1.480  -1.560   1.212  1.00  1.00      DIA  H
+ATOM      9  CAY ALA     2      -0.096   2.144  -0.669  1.00  1.00      DIA  C
+ATOM     10  HY1 ALA     2       0.871   2.385  -0.588  1.00  1.00      DIA  H
+ATOM     12  HY3 ALA     2      -0.520   2.679  -1.400  1.00  1.00      DIA  H
+ATOM     14  OY  ALA     2      -1.139   0.931  -0.973  1.00  1.00      DIA  O
+ATOM     16  HN  ALA     2       1.713   1.021  -0.873  1.00  1.00      DIA  H
+ATOM     18  HA  ALA     2       0.099  -0.774  -2.218  1.00  1.00      DIA  H
+ATOM     19  CB  ALA     2       2.063  -1.223  -1.276  1.00  0.99      DIA  C
+ATOM     20  HB1 ALA     2       2.670  -0.716  -2.057  1.00  1.00      DIA  H
+ATOM     21  HB2 ALA     2       2.556  -1.051  -0.295  1.00  1.00      DIA  H
+ATOM     22  HB3 ALA     2       2.070  -2.314  -1.490  1.00  1.00      DIA  H
diff --git a/regtest/basic/rt0-gpu/test2.pdb b/regtest/basic/rt0-gpu/test2.pdb
new file mode 100644
index 0000000000..f2585025b3
--- /dev/null
+++ b/regtest/basic/rt0-gpu/test2.pdb
@@ -0,0 +1,15 @@
+ATOM      2  O   ALA     2      -0.926  -2.447  -0.497  1.10  1.10      DIA  O
+ATOM      4  HNT ALA     2       0.533  -0.396   1.184  1.00  1.00      DIA  H
+ATOM      6  HT1 ALA     2      -0.216  -2.590   1.371  1.00  1.00      DIA  H
+ATOM      7  HT2 ALA     2      -0.309  -1.255   2.315  1.00  1.00      DIA  H
+ATOM      8  HT3 ALA     2      -1.480  -1.560   1.212  5.00  5.00      DIA  H
+ATOM      9  CAY ALA     2      -0.096   2.144  -0.669  1.00  1.00      DIA  C
+ATOM     10  HY1 ALA     2       0.871   2.385  -0.588  1.00  1.00      DIA  H
+ATOM     12  HY3 ALA     2      -0.520   2.679  -1.400  1.00  1.00      DIA  H
+ATOM     14  OY  ALA     2      -1.139   0.931  -0.973  0.00  0.00      DIA  O
+ATOM     16  HN  ALA     2       1.713   1.021  -0.873  1.00  1.00      DIA  H
+ATOM     18  HA  ALA     2       0.099  -0.774  -2.218  0.00  0.00      DIA  H
+ATOM     19  CB  ALA     2       2.063  -1.223  -1.276  1.00  1.00      DIA  C
+ATOM     20  HB1 ALA     2       2.670  -0.716  -2.057  1.00  1.00      DIA  H
+ATOM     21  HB2 ALA     2       2.556  -1.051  -0.295  1.00  1.00      DIA  H
+ATOM     22  HB3 ALA     2       2.070  -2.314  -1.490  1.00  1.00      DIA  H
diff --git a/regtest/basic/rt0-gpu/test3.pdb b/regtest/basic/rt0-gpu/test3.pdb
new file mode 100644
index 0000000000..4b04045e2a
--- /dev/null
+++ b/regtest/basic/rt0-gpu/test3.pdb
@@ -0,0 +1,15 @@
+ATOM      2  O   ALA     2      -0.926  -2.447  -0.497  1.10  1.10      DIA  O
+ATOM      4  HNT ALA     2       0.533  -0.396   1.184  1.00  1.00      DIA  H
+ATOM      6  HT1 ALA     2      -0.216  -2.590   1.371  1.00  1.00      DIA  H
+ATOM      7  HT2 ALA     2      -0.309  -1.255   2.315  1.00  1.00      DIA  H
+ATOM      8  HT3 ALA     2      -1.480  -1.560   1.212  5.00  5.01      DIA  H
+ATOM      9  CAY ALA     2      -0.096   2.144  -0.669  1.00  1.00      DIA  C
+ATOM     10  HY1 ALA     2       0.871   2.385  -0.588  1.00  1.00      DIA  H
+ATOM     12  HY3 ALA     2      -0.520   2.679  -1.400  1.00  1.00      DIA  H
+ATOM     14  OY  ALA     2      -1.139   0.931  -0.973  0.00  0.00      DIA  O
+ATOM     16  HN  ALA     2       1.713   1.021  -0.873  1.00  1.00      DIA  H
+ATOM     18  HA  ALA     2       0.099  -0.774  -2.218  0.00  0.00      DIA  H
+ATOM     19  CB  ALA     2       2.063  -1.223  -1.276  1.00  1.00      DIA  C
+ATOM     20  HB1 ALA     2       2.670  -0.716  -2.057  1.00  1.00      DIA  H
+ATOM     21  HB2 ALA     2       2.556  -1.051  -0.295  1.00  1.00      DIA  H
+ATOM     22  HB3 ALA     2       2.070  -2.314  -1.490  1.00  1.00      DIA  H
diff --git a/regtest/basic/rt0-gpu/test4.pdb b/regtest/basic/rt0-gpu/test4.pdb
new file mode 100644
index 0000000000..455f79f38c
--- /dev/null
+++ b/regtest/basic/rt0-gpu/test4.pdb
@@ -0,0 +1,15 @@
+ATOM      2  O   ALA     2      -0.926  -2.447  -0.497  1.00  0.00      DIA  O
+ATOM      4  HNT ALA     2       0.533  -0.396   1.184  1.00  0.00      DIA  H
+ATOM      6  HT1 ALA     2      -0.216  -2.590   1.371  1.00  0.00      DIA  H
+ATOM      7  HT2 ALA     2      -0.309  -1.255   2.315  1.00  0.00      DIA  H
+ATOM      8  HT3 ALA     2      -1.480  -1.560   1.212  1.00  0.00      DIA  H
+ATOM      9  CAY ALA     2      -0.096   2.144  -0.669  1.00  0.00      DIA  C
+ATOM     10  HY1 ALA     2       0.871   2.385  -0.588  1.00  0.00      DIA  H
+ATOM     12  HY3 ALA     2      -0.520   2.679  -1.400  1.00  0.00      DIA  H
+ATOM     14  OY  ALA     2      -1.139   0.931  -0.973  0.00  1.00      DIA  O
+ATOM     16  HN  ALA     2       1.713   1.021  -0.873  0.00  1.00      DIA  H
+ATOM     18  HA  ALA     2       0.099  -0.774  -2.218  0.00  1.00      DIA  H
+ATOM     19  CB  ALA     2       2.063  -1.223  -1.276  0.00  1.00      DIA  C
+ATOM     20  HB1 ALA     2       2.670  -0.716  -2.057  0.00  1.00      DIA  H
+ATOM     21  HB2 ALA     2       2.556  -1.051  -0.295  0.00  1.00      DIA  H
+ATOM     22  HB3 ALA     2       2.070  -2.314  -1.490  0.00  1.00      DIA  H
diff --git a/src/colvar/RMSD.cpp b/src/colvar/RMSD.cpp
index e6090597ab..e54cc08a49 100644
--- a/src/colvar/RMSD.cpp
+++ b/src/colvar/RMSD.cpp
@@ -27,6 +27,10 @@
 #include "reference/MetricRegister.h"
 #include "core/Atoms.h"
 
+#ifdef __PLUMED_HAS_ARRAYFIRE
+#include <arrayfire.h>
+#endif
+
 namespace PLMD {
 namespace colvar {
 
@@ -37,6 +41,8 @@ class RMSD : public Colvar {
   std::unique_ptr<PLMD::RMSDBase> rmsd;
   bool squared;
   bool nopbc;
+  bool gpu;
+  int  deviceid;
 
 public:
   explicit RMSD(const ActionOptions&);
@@ -166,6 +172,8 @@ void RMSD::registerKeywords(Keywords& keys) {
   keys.add("compulsory","REFERENCE","a file in pdb format containing the reference structure and the atoms involved in the CV.");
   keys.add("compulsory","TYPE","SIMPLE","the manner in which RMSD alignment is performed.  Should be OPTIMAL or SIMPLE.");
   keys.addFlag("SQUARED",false," This should be set if you want mean squared displacement instead of RMSD ");
+  keys.add("compulsory","GPU","off","whether to calculate the RMSD using ARRAYFIRE on an accelerator device.  Should be on or off."); // valued keyword (GPU=on/off): the constructor reads it with parse(), not parseFlag()
+  keys.add("compulsory","DEVICEID","0","Identifier of the GPU to be used");
 }
 
 RMSD::RMSD(const ActionOptions&ao):
@@ -173,7 +181,9 @@ RMSD::RMSD(const ActionOptions&ao):
   myvals(1,0),
   mypack(0,0,myvals),
   squared(false),
-  nopbc(false)
+  nopbc(false),
+  gpu(false),
+  deviceid(0)
 {
   std::string reference;
   parse("REFERENCE",reference);
@@ -183,6 +193,27 @@ RMSD::RMSD(const ActionOptions&ao):
   parseFlag("SQUARED",squared);
   parseFlag("NOPBC",nopbc);
 
+  std::string gpuuse;
+  gpuuse.assign("off");
+  parse("GPU",gpuuse);
+  if (gpuuse=="on" || gpuuse=="ON")
+    gpu = true;
+  else if (gpuuse=="off" || gpuuse=="OFF")
+    gpu = false;
+  else
+    plumed_merror("unknown GPU on/off");
+#ifndef  __PLUMED_HAS_ARRAYFIRE
+  if(gpu) error("To use the GPU mode PLUMED must be compiled with ARRAYFIRE");
+#endif
+
+  parse("DEVICEID",deviceid);
+#ifdef  __PLUMED_HAS_ARRAYFIRE
+  if(gpu) {
+    af::setDevice(deviceid);
+    af::info();
+  }
+#endif
+
   checkRead();
 
 
@@ -222,7 +253,7 @@ RMSD::RMSD(const ActionOptions&ao):
 // calculator
 void RMSD::calculate() {
   if(!nopbc) makeWhole();
-  double r=rmsd->calculate( getPositions(), mypack, squared );
+  double r=rmsd->calculate_cpugpu( getPositions(), mypack, squared, gpu, deviceid );
 
   setValue(r);
   for(unsigned i=0; i<getNumberOfAtoms(); i++) setAtomsDerivatives( i, mypack.getAtomDerivative(i) );
diff --git a/src/reference/OptimalRMSD.cpp b/src/reference/OptimalRMSD.cpp
index a667f53aa1..b2dabf8a74 100644
--- a/src/reference/OptimalRMSD.cpp
+++ b/src/reference/OptimalRMSD.cpp
@@ -34,6 +34,7 @@ class OptimalRMSD : public RMSDBase {
   explicit OptimalRMSD(const ReferenceConfigurationOptions& ro);
   void read( const PDB& ) override;
   double calc( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared ) const override;
+  double calc_cpugpu( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared, const bool& gpu, const int& deviceid ) const override;
   bool pcaIsEnabledForThisReference() override { return true; }
   void setupRMSDObject() override { myrmsd.clear(); myrmsd.set(getAlign(),getDisplace(),getReferencePositions(),"OPTIMAL"); }
   void setupPCAStorage( ReferenceValuePack& mypack ) override {
@@ -59,6 +60,25 @@ void OptimalRMSD::read( const PDB& pdb ) {
   readReference( pdb ); setupRMSDObject();
 }
 
+// Returns the value computed by myrmsd for pos and copies the per-atom derivatives into myder; gpu/deviceid are only handed to optimalAlignment (the PCA branch does not use them).
+double OptimalRMSD::calc_cpugpu( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared, const bool& gpu, const int& deviceid ) const {
+  double dist;
+  if( myder.calcUsingPCAOption() ) {
+    std::vector<Vector> centeredreference( getNumberOfAtoms () );
+    dist=myrmsd.calc_PCAelements(pos,myder.getAtomVector(),myder.rot[0],myder.DRotDPos,myder.getAtomsDisplacementVector(),myder.centeredpos,centeredreference,squared);
+    for(unsigned k=0, natoms=pos.size(); k<natoms; ++k) { myder.getAtomsDisplacementVector()[k] -= getReferencePosition(k); myder.getAtomsDisplacementVector()[k] *= getDisplace()[k]; }
+  } else {
+    const bool sameweights = ( getAlign()==getDisplace() ); // picks the second template argument; the first one is !fast
+    if( fast && sameweights ) dist=myrmsd.optimalAlignment<false,true>(getAlign(),getDisplace(),pos,getReferencePositions(),myder.getAtomVector(),squared, gpu, deviceid);
+    else if( fast ) dist=myrmsd.optimalAlignment<false,false>(getAlign(),getDisplace(),pos,getReferencePositions(),myder.getAtomVector(),squared, gpu, deviceid);
+    else if( sameweights ) dist=myrmsd.optimalAlignment<true,true>(getAlign(),getDisplace(),pos,getReferencePositions(),myder.getAtomVector(),squared, gpu, deviceid);
+    else dist=myrmsd.optimalAlignment<true,false>(getAlign(),getDisplace(),pos,getReferencePositions(),myder.getAtomVector(),squared, gpu, deviceid);
+  }
+  myder.clear(); for(unsigned k=0; k<pos.size(); ++k) myder.setAtomDerivatives( k, myder.getAtomVector()[k] ); // the atom vector filled above becomes the derivatives
+  if( !myder.updateComplete() ) myder.updateDynamicLists();
+  return dist;
+}
+
 double OptimalRMSD::calc( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared ) const {
   double d;
   if( myder.calcUsingPCAOption() ) {
diff --git a/src/reference/RMSDBase.cpp b/src/reference/RMSDBase.cpp
index 46a30a0b81..18d81e65b4 100644
--- a/src/reference/RMSDBase.cpp
+++ b/src/reference/RMSDBase.cpp
@@ -34,6 +34,10 @@ double RMSDBase::calculate( const std::vector<Vector>& pos, ReferenceValuePack&
   return calc( pos, myder, squared );
 }
 
+double RMSDBase::calculate_cpugpu( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared, const bool& gpu, const int& deviceid ) const {
+  return this->calc_cpugpu( pos, myder, squared, gpu, deviceid );   // forwards to the pure-virtual calc_cpugpu
+}
+
 double RMSDBase::calc( const std::vector<Vector>& pos, const Pbc& pbc, ReferenceValuePack& myder, const bool& squared ) const {
   plumed_dbg_assert( pos.size()==getNumberOfAtoms() );
   return calc( pos, myder, squared );
diff --git a/src/reference/RMSDBase.h b/src/reference/RMSDBase.h
index c6e5947917..148b80d0b3 100644
--- a/src/reference/RMSDBase.h
+++ b/src/reference/RMSDBase.h
@@ -37,7 +37,9 @@ class RMSDBase : public SingleDomainRMSD {
   explicit RMSDBase( const ReferenceConfigurationOptions& ro );
   double calc( const std::vector<Vector>& pos, const Pbc& pbc, ReferenceValuePack& myder, const bool& squared ) const override;
   double calculate( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared ) const ;
+  double calculate_cpugpu( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared, const bool& gpu, const int& deviceid ) const ;
   virtual double calc( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared ) const=0;
+  virtual double calc_cpugpu( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared, const bool& gpu, const int& deviceid ) const=0;
 };
 
 }
diff --git a/src/reference/SimpleRMSD.cpp b/src/reference/SimpleRMSD.cpp
index f5a2549dcd..cf9e09dbef 100644
--- a/src/reference/SimpleRMSD.cpp
+++ b/src/reference/SimpleRMSD.cpp
@@ -32,6 +32,7 @@ class SimpleRMSD : public RMSDBase {
   explicit SimpleRMSD( const ReferenceConfigurationOptions& ro );
   void read( const PDB& ) override;
   double calc( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared ) const override;
+  double calc_cpugpu( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared, const bool& gpu, const int& deviceid ) const override;
   bool pcaIsEnabledForThisReference() override { return true; }
   void setupPCAStorage( ReferenceValuePack& mypack ) override {
     mypack.switchOnPCAOption(); mypack.getAtomsDisplacementVector().resize( getNumberOfAtoms() );
@@ -52,6 +53,14 @@ void SimpleRMSD::read( const PDB& pdb ) {
   readReference( pdb );
 }
 
+double SimpleRMSD::calc_cpugpu( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared, const bool& gpu, const int& deviceid ) const {
+  auto& shifts=myder.getAtomsDisplacementVector(); if( shifts.size()!=pos.size() ) shifts.resize( pos.size() );
+  const double dist=myrmsd.simpleAlignment( getAlign(), getDisplace(), pos, getReferencePositions(), myder.getAtomVector(), shifts, squared );   // gpu and deviceid are not consulted
+  myder.clear(); for(unsigned k=0; k<pos.size(); ++k) myder.setAtomDerivatives( k, myder.getAtomVector()[k] );
+  if( !myder.updateComplete() ) myder.updateDynamicLists();
+  return dist;
+}
+
 double SimpleRMSD::calc( const std::vector<Vector>& pos, ReferenceValuePack& myder, const bool& squared ) const {
   if( myder.getAtomsDisplacementVector().size()!=pos.size() ) myder.getAtomsDisplacementVector().resize( pos.size() );
   double d=myrmsd.simpleAlignment( getAlign(), getDisplace(), pos, getReferencePositions(), myder.getAtomVector(), myder.getAtomsDisplacementVector(), squared );
diff --git a/src/tools/RMSD.cpp b/src/tools/RMSD.cpp
index e07d7804a1..1370e2b639 100644
--- a/src/tools/RMSD.cpp
+++ b/src/tools/RMSD.cpp
@@ -27,9 +27,13 @@
 #include <iostream>
 #include "Tools.h"
 
+#ifdef __PLUMED_HAS_ARRAYFIRE
+#include <arrayfire.h>
+#endif
+
 namespace PLMD {
 
-RMSD::RMSD() : alignmentMethod(SIMPLE),reference_center_is_calculated(false),reference_center_is_removed(false),positions_center_is_calculated(false),positions_center_is_removed(false) {}
+RMSD::RMSD() : alignmentMethod(SIMPLE),reference_center_is_calculated(false),reference_center_is_removed(false),positions_center_is_calculated(false),positions_center_is_removed(false),rr11(0.0) {} // rr11 starts at zero; setrr11_gpu overwrites it when ArrayFire is compiled in
 
 ///
 /// general method to set all the rmsd property at once by using a pdb where occupancy column sets the weights for the atoms involved in the
@@ -47,6 +51,17 @@ void RMSD::set(const std::vector<double> & align, const std::vector<double> & di
   setDisplace(displace, normalize_weights);  // this is does not affect any calculation of the weights
   setType(mytype);
 
+  bool gpu = true;
+  if (gpu) {
+#ifdef __PLUMED_HAS_ARRAYFIRE
+    const unsigned n=reference.size();
+    setReference_gpu(reference_device, reference,n);
+    setAlign_gpu(align_device, align, n);
+    setDisplace_gpu(displace_device, displace, n);
+    setrr11_gpu(rr11);
+    setHostmem_gpu(derivatives_host, rr01_host, cpositions_host, ddist_drotation_host, n);
+#endif
+  }
 }
 
 void RMSD::setType(const std::string & mytype) {
@@ -171,6 +186,49 @@ void RMSD::setDisplace(const std::vector<double> & displace, bool normalize_weig
 std::vector<double> RMSD::getDisplace() {
   return displace;
 }
+#ifdef __PLUMED_HAS_ARRAYFIRE
+void RMSD::setReference_gpu(af::array & reference_device, const std::vector<Vector> & reference, int n) {
+  // Flatten the first n atoms into xyz triplets: read column-major this is a 3 x n matrix, one atom per column.
+  std::vector<double> reference_host(3*n);
+  for (int iat=0; iat<n; iat++) {   // int index, like n: avoids the signed/unsigned comparison
+    reference_host[3*iat]   = reference[iat][0];
+    reference_host[3*iat+1] = reference[iat][1];
+    reference_host[3*iat+2] = reference[iat][2];
+  }
+  // 3,n,1,1 ; data() rather than &front(), which is undefined on an empty vector
+  reference_device = af::array(3, n, reference_host.data());
+}
+af::array RMSD::getReference_gpu() {   // device array filled by setReference_gpu
+  return reference_device;
+}
+void RMSD::setAlign_gpu(af::array & align_device, const std::vector<double> & align, int n) {
+  // 1,n,1,1 ; data() rather than &front(), which is undefined on an empty vector
+  align_device = af::array(1, n, align.data());
+}
+af::array RMSD::getAlign_gpu() {   // device array filled by setAlign_gpu
+  return align_device;
+}
+void RMSD::setDisplace_gpu(af::array & displace_device, const std::vector<double> & displace, int n) {
+  // 1,n,1,1 ; data() rather than &front(), which is undefined on an empty vector
+  displace_device = af::array(1, n, displace.data());
+}
+af::array RMSD::getDisplace_gpu() {   // device array filled by setDisplace_gpu
+  return displace_device;
+}
+void RMSD::setrr11_gpu(double & rr11) {
+  // 1,1,1,1 : sum over atoms of the align weight times the squared norm of the reference position
+  rr11 = af::sum<double>(af::sum(reference_device*reference_device)*align_device);
+}
+double RMSD::getrr11_gpu() {   // value cached by setrr11_gpu
+  return rr11;
+}
+void RMSD::setHostmem_gpu(std::vector<double> & derivatives_host, std::vector<double> & rr01_host, std::vector<double> & cpositions_host, std::vector<double> & ddist_drotation_host, int n) {
+  derivatives_host.resize(3*n);        // three components per atom
+  rr01_host.resize(9);                 // flattened 3x3
+  cpositions_host.resize(3);           // one vector
+  ddist_drotation_host.resize(9);      // flattened 3x3
+}
+#endif
 ///
 /// This is the main workhorse for rmsd that decides to use specific optimal alignment versions
 ///
@@ -423,12 +481,81 @@ double RMSD::simpleAlignment(const  std::vector<double>  & align,
 #ifdef OLDRMSD
 // notice that in the current implementation the safe argument only makes sense for
 // align==displace
+void RMSD::optimalAlignment_gpu(const std::vector<Vector> & positions,
+                                double & rr00, double & rr11, Tensor & rr01, Vector & cpositions, int deviceid)const {
+// ArrayFire evaluation of the weighted centre of positions and of the moments rr00, rr11, rr01 used by
+// optimalAlignment(). Reference and align weights are the device copies held as members; rr11 is the cached member.
+#ifdef __PLUMED_HAS_ARRAYFIRE
+  const unsigned n=positions.size();
+
+  // flatten the atoms into xyz triplets: read column-major this is a 3 x n matrix, one atom per column
+  std::vector<double> positions_host(3*n);
+  for (unsigned iat=0; iat<n; iat++) {
+    positions_host[3*iat]   = positions[iat][0];
+    positions_host[3*iat+1] = positions[iat][1];
+    positions_host[3*iat+2] = positions[iat][2];
+  }
+
+  // NOTE(review): the member device arrays were created before this setDevice call; confirm they live on deviceid
+  af::setDevice(deviceid);
+  // 3,n,1,1
+  const af::array positions_device(3, n, positions_host.data());
+  // 3,n,1,1 : align weights replicated on the three cartesian rows
+  const af::array weights3 = af::tile(align_device,3,1);
+
+// first expensive loop: compute centers on GPU
+  // 3,1,1,1
+  const af::array cpositions_device = af::sum((positions_device * weights3).T()).T();
+  // 3,n,1,1 : positions relative to the weighted centre, built once and reused for rr00 and rr01
+  const af::array centered = positions_device - af::tile(cpositions_device,1,n);
+
+// second expensive loop: compute second moments wrt centers on GPU
+  // 1,1,1,1
+  rr00 = af::sum<double>(af::sum(centered*centered)*align_device);
+  // 1,1,1,1
+  rr11 = this->rr11;
+  // 3,3,1,1 -> 9,1,1,1
+  const af::array rr01_device = af::flat( af::matmul(centered*weights3,reference_device.T()) );
+
+  // Download into stack buffers rather than the mutable member vectors: those are only sized by set() and
+  // would be shared scratch space between calls on the same object.
+  double rr01_buf[9];
+  rr01_device.host(rr01_buf);
+  // column-major 3x3: element (i,j) sits at i+3*j
+  for (unsigned j=0; j<3; j++) for (unsigned i=0; i<3; i++) rr01[i][j] = rr01_buf[i+3*j];
+
+  double cpositions_buf[3];
+  cpositions_device.host(cpositions_buf);
+  for (unsigned i=0; i<3; i++) cpositions[i] = cpositions_buf[i];
+#else
+  plumed_merror("RMSD::optimalAlignment_gpu was called but PLUMED was compiled without ArrayFire support");
+#endif
+}
+void RMSD::optimalAlignment_cpu(const  std::vector<double>  & align,
+                                const std::vector<Vector> & positions,
+                                const std::vector<Vector> & reference,
+                                double & rr00, double & rr11, Tensor & rr01, Vector & cpositions)const {
+// host implementation: every output is accumulated with += onto the value passed in by the caller
+  const unsigned natoms=reference.size();
+// first expensive loop: compute centers
+  for(unsigned k=0; k<natoms; k++) cpositions+=positions[k]*align[k];
+
+// second expensive loop: compute second moments wrt centers
+  for(unsigned k=0; k<natoms; k++) {
+    const double weight=align[k];
+    // position relative to the weighted centre accumulated above
+    const Vector shifted=positions[k]-cpositions;
+    rr00+=dotProduct(shifted,shifted)*weight;
+    rr11+=dotProduct(reference[k],reference[k])*weight;
+    rr01+=Tensor(shifted,reference[k])*weight;
+  }
+}
 template <bool safe,bool alEqDis>
 double RMSD::optimalAlignment(const  std::vector<double>  & align,
                               const  std::vector<double>  & displace,
                               const std::vector<Vector> & positions,
                               const std::vector<Vector> & reference,
-                              std::vector<Vector>  & derivatives, bool squared)const {
+                              std::vector<Vector>  & derivatives, bool squared, bool gpu, int deviceid)const {
   const unsigned n=reference.size();
 // This is the trace of positions*positions + reference*reference
   double rr00(0);
@@ -440,19 +567,8 @@ double RMSD::optimalAlignment(const  std::vector<double>  & align,
 
   Vector cpositions;
 
-// first expensive loop: compute centers
-  for(unsigned iat=0; iat<n; iat++) {
-    double w=align[iat];
-    cpositions+=positions[iat]*w;
-  }
-
-// second expensive loop: compute second moments wrt centers
-  for(unsigned iat=0; iat<n; iat++) {
-    double w=align[iat];
-    rr00+=dotProduct(positions[iat]-cpositions,positions[iat]-cpositions)*w;
-    rr11+=dotProduct(reference[iat],reference[iat])*w;
-    rr01+=Tensor(positions[iat]-cpositions,reference[iat])*w;
-  }
+  if (gpu) optimalAlignment_gpu(positions, rr00, rr11, rr01, cpositions, deviceid);
+  else optimalAlignment_cpu(align, positions, reference, rr00, rr11, rr01, cpositions);
 
   Tensor4d m;
 
@@ -619,7 +735,7 @@ double RMSD::optimalAlignment(const  std::vector<double>  & align,
                               const std::vector<Vector> & positions,
                               const std::vector<Vector> & reference,
                               std::vector<Vector>  & derivatives,
-                              bool squared) const {
+                              bool squared, bool gpu, int deviceid) const {
   //std::cerr<<"setting up the core data \n";
   RMSDCoreData cd(align,displace,positions,reference);
 
@@ -957,15 +1073,240 @@ double RMSD::optimalAlignment_Fit(const  std::vector<double>  & align,
 
 
 
+void RMSDCoreData::doCoreCalc_gpu(bool safe,bool alEqDis,bool only_rotation) {   // ArrayFire variant selected by doCoreCalc
+#ifdef __PLUMED_HAS_ARRAYFIRE
+  retrieve_only_rotation=only_rotation;
+  const unsigned n=static_cast<unsigned int>(reference.size());
 
+  plumed_massert(creference_is_calculated,"the center of the reference frame must be already provided at this stage");
+  plumed_massert(cpositions_is_calculated,"the center of the positions frame must be already provided at this stage");
 
+// This is the trace of positions*positions + reference*reference
+  rr00=0.;
+  rr11=0.;
+// This is positions*reference
+  Tensor rr01;
+// center of mass managing: must subtract the center from the position or not?
+  Vector cp; cp.zero(); if(!cpositions_is_removed)cp=cpositions;
+  Vector cr; cr.zero(); if(!creference_is_removed)cr=creference;
+// second expensive loop: compute second moments wrt centers
+  // GPU
+  // on gpu only the master rank run the calculation
+  // if(comm_rank==0) {
+  std::vector<double> positions_host;
+  std::vector<double> reference_host;
+  std::vector<double> cp_host;
+  std::vector<double> cr_host;
+  std::vector<double> align_host;
+  positions_host.resize(3*n);
+  reference_host.resize(3*n);
+  cp_host.resize(3);
+  cr_host.resize(3);
+  align_host.resize(n);
+  cp_host[0] = cp[0];  cp_host[1] = cp[1];  cp_host[2] = cp[2];
+  cr_host[0] = cr[0];  cr_host[1] = cr[1];  cr_host[2] = cr[2];
+  #pragma omp parallel for num_threads(OpenMP::getNumThreads())
+  for (unsigned iat=0; iat<n; iat++) {
+    positions_host[3*iat]   = positions[iat][0];   // .d[0] is a private member, so components are read through operator[]
+    positions_host[3*iat+1] = positions[iat][1];
+    positions_host[3*iat+2] = positions[iat][2];
+    reference_host[3*iat]   = reference[iat][0];
+    reference_host[3*iat+1] = reference[iat][1];
+    reference_host[3*iat+2] = reference[iat][2];
+    align_host[iat] = align[iat];
+  }
 
-/// This calculates the elements needed by the quaternion to calculate everything that is needed
-/// additional calls retrieve different components
-/// note that this considers that the centers of both reference and positions are already setted
-/// but automatically should properly account for non removed components: if not removed then it
-/// removes prior to calculation of the alignment
-void RMSDCoreData::doCoreCalc(bool safe,bool alEqDis, bool only_rotation) {
+  // //set GPU device no.
+  // af::setDevice(2);
+  //3,n,1,1
+  positions_device = af::array(3, n, &positions_host.front());
+  reference_device = af::array(3, n, &reference_host.front());
+  // 3,1,1,1
+  cp_device = af::array(3, 1, &cp_host.front());
+  cr_device = af::array(3, 1, &cr_host.front());
+  // 1,n,1,1
+  align_device = af::array(1, n, &align_host.front());
+
+  rr00 = af::sum<double>(af::sum((positions_device - af::tile(cp_device,1,n))*(positions_device - af::tile(cp_device,1,n)))*align_device);
+  rr11 = af::sum<double>(af::sum((reference_device - af::tile(cr_device,1,n))*(reference_device - af::tile(cr_device,1,n)))*align_device);
+  // 3,3,1,1 -> 9,1,1,1
+  rr01_device = af::flat( af::matmul((positions_device - af::tile(cp_device,1,n))*af::tile(align_device,3,1),(reference_device - af::tile(cr_device,1,n)).T()) );
+
+  double rr01_host[9]; rr01_device.host(rr01_host);   // copy into a stack buffer: nothing to allocate or release
+
+  rr01[0][0] = rr01_host[0];
+  rr01[1][0] = rr01_host[1];
+  rr01[2][0] = rr01_host[2];
+  rr01[0][1] = rr01_host[3];
+  rr01[1][1] = rr01_host[4];
+  rr01[2][1] = rr01_host[5];
+  rr01[0][2] = rr01_host[6];
+  rr01[1][2] = rr01_host[7];
+  rr01[2][2] = rr01_host[8];
+
+  // (rr01_host is a stack buffer, so there is no delete[] here)
+
+  // }
+
+  // the quaternion matrix: this is internal
+  Tensor4d m;
+
+  m[0][0]=2.0*(-rr01[0][0]-rr01[1][1]-rr01[2][2]);
+  m[1][1]=2.0*(-rr01[0][0]+rr01[1][1]+rr01[2][2]);
+  m[2][2]=2.0*(+rr01[0][0]-rr01[1][1]+rr01[2][2]);
+  m[3][3]=2.0*(+rr01[0][0]+rr01[1][1]-rr01[2][2]);
+  m[0][1]=2.0*(-rr01[1][2]+rr01[2][1]);
+  m[0][2]=2.0*(+rr01[0][2]-rr01[2][0]);
+  m[0][3]=2.0*(-rr01[0][1]+rr01[1][0]);
+  m[1][2]=2.0*(-rr01[0][1]-rr01[1][0]);
+  m[1][3]=2.0*(-rr01[0][2]-rr01[2][0]);
+  m[2][3]=2.0*(-rr01[1][2]-rr01[2][1]);
+  m[1][0] = m[0][1];
+  m[2][0] = m[0][2];
+  m[2][1] = m[1][2];
+  m[3][0] = m[0][3];
+  m[3][1] = m[1][3];
+  m[3][2] = m[2][3];
+
+
+  Tensor dm_drr01[4][4];
+  if(!alEqDis or !retrieve_only_rotation) {
+    dm_drr01[0][0] = 2.0*Tensor(-1.0, 0.0, 0.0,  0.0,-1.0, 0.0,  0.0, 0.0,-1.0);
+    dm_drr01[1][1] = 2.0*Tensor(-1.0, 0.0, 0.0,  0.0,+1.0, 0.0,  0.0, 0.0,+1.0);
+    dm_drr01[2][2] = 2.0*Tensor(+1.0, 0.0, 0.0,  0.0,-1.0, 0.0,  0.0, 0.0,+1.0);
+    dm_drr01[3][3] = 2.0*Tensor(+1.0, 0.0, 0.0,  0.0,+1.0, 0.0,  0.0, 0.0,-1.0);
+    dm_drr01[0][1] = 2.0*Tensor( 0.0, 0.0, 0.0,  0.0, 0.0,-1.0,  0.0,+1.0, 0.0);
+    dm_drr01[0][2] = 2.0*Tensor( 0.0, 0.0,+1.0,  0.0, 0.0, 0.0, -1.0, 0.0, 0.0);
+    dm_drr01[0][3] = 2.0*Tensor( 0.0,-1.0, 0.0, +1.0, 0.0, 0.0,  0.0, 0.0, 0.0);
+    dm_drr01[1][2] = 2.0*Tensor( 0.0,-1.0, 0.0, -1.0, 0.0, 0.0,  0.0, 0.0, 0.0);
+    dm_drr01[1][3] = 2.0*Tensor( 0.0, 0.0,-1.0,  0.0, 0.0, 0.0, -1.0, 0.0, 0.0);
+    dm_drr01[2][3] = 2.0*Tensor( 0.0, 0.0, 0.0,  0.0, 0.0,-1.0,  0.0,-1.0, 0.0);
+    dm_drr01[1][0] = dm_drr01[0][1];
+    dm_drr01[2][0] = dm_drr01[0][2];
+    dm_drr01[2][1] = dm_drr01[1][2];
+    dm_drr01[3][0] = dm_drr01[0][3];
+    dm_drr01[3][1] = dm_drr01[1][3];
+    dm_drr01[3][2] = dm_drr01[2][3];
+  }
+
+  Vector4d q;
+
+  Tensor dq_drr01[4];
+  if(!alEqDis or !only_rotation) {
+    diagMatSym(m, eigenvals, eigenvecs );
+    q=Vector4d(eigenvecs[0][0],eigenvecs[0][1],eigenvecs[0][2],eigenvecs[0][3]);
+    double dq_dm[4][4][4];
+    for(unsigned i=0; i<4; i++) for(unsigned j=0; j<4; j++) for(unsigned k=0; k<4; k++) {
+          double tmp=0.0;
+// perturbation theory for matrix m
+          for(unsigned l=1; l<4; l++) tmp+=eigenvecs[l][j]*eigenvecs[l][i]/(eigenvals[0]-eigenvals[l])*eigenvecs[0][k];
+          dq_dm[i][j][k]=tmp;
+        }
+// propagation to _drr01
+    for(unsigned i=0; i<4; i++) {
+      Tensor tmp;
+      for(unsigned j=0; j<4; j++) for(unsigned k=0; k<4; k++) {
+          tmp+=dq_dm[i][j][k]*dm_drr01[j][k];
+        }
+      dq_drr01[i]=tmp;
+    }
+  } else {
+    TensorGeneric<1,4> here_eigenvecs;
+    VectorGeneric<1> here_eigenvals;
+    diagMatSym(m, here_eigenvals, here_eigenvecs );
+    for(unsigned i=0; i<4; i++) eigenvecs[0][i]=here_eigenvecs[0][i];
+    eigenvals[0]=here_eigenvals[0];
+    q=Vector4d(eigenvecs[0][0],eigenvecs[0][1],eigenvecs[0][2],eigenvecs[0][3]);
+  }
+
+// This is the rotation matrix that brings reference to positions
+// i.e. matmul(rotation,reference[iat])+shift is fitted to positions[iat]
+
+  rotation[0][0]=q[0]*q[0]+q[1]*q[1]-q[2]*q[2]-q[3]*q[3];
+  rotation[1][1]=q[0]*q[0]-q[1]*q[1]+q[2]*q[2]-q[3]*q[3];
+  rotation[2][2]=q[0]*q[0]-q[1]*q[1]-q[2]*q[2]+q[3]*q[3];
+  rotation[0][1]=2*(+q[0]*q[3]+q[1]*q[2]);
+  rotation[0][2]=2*(-q[0]*q[2]+q[1]*q[3]);
+  rotation[1][2]=2*(+q[0]*q[1]+q[2]*q[3]);
+  rotation[1][0]=2*(-q[0]*q[3]+q[1]*q[2]);
+  rotation[2][0]=2*(+q[0]*q[2]+q[1]*q[3]);
+  rotation[2][1]=2*(-q[0]*q[1]+q[2]*q[3]);
+
+
+  if(!alEqDis or !only_rotation) {
+    drotation_drr01[0][0]=2*q[0]*dq_drr01[0]+2*q[1]*dq_drr01[1]-2*q[2]*dq_drr01[2]-2*q[3]*dq_drr01[3];
+    drotation_drr01[1][1]=2*q[0]*dq_drr01[0]-2*q[1]*dq_drr01[1]+2*q[2]*dq_drr01[2]-2*q[3]*dq_drr01[3];
+    drotation_drr01[2][2]=2*q[0]*dq_drr01[0]-2*q[1]*dq_drr01[1]-2*q[2]*dq_drr01[2]+2*q[3]*dq_drr01[3];
+    drotation_drr01[0][1]=2*(+(q[0]*dq_drr01[3]+dq_drr01[0]*q[3])+(q[1]*dq_drr01[2]+dq_drr01[1]*q[2]));
+    drotation_drr01[0][2]=2*(-(q[0]*dq_drr01[2]+dq_drr01[0]*q[2])+(q[1]*dq_drr01[3]+dq_drr01[1]*q[3]));
+    drotation_drr01[1][2]=2*(+(q[0]*dq_drr01[1]+dq_drr01[0]*q[1])+(q[2]*dq_drr01[3]+dq_drr01[2]*q[3]));
+    drotation_drr01[1][0]=2*(-(q[0]*dq_drr01[3]+dq_drr01[0]*q[3])+(q[1]*dq_drr01[2]+dq_drr01[1]*q[2]));
+    drotation_drr01[2][0]=2*(+(q[0]*dq_drr01[2]+dq_drr01[0]*q[2])+(q[1]*dq_drr01[3]+dq_drr01[1]*q[3]));
+    drotation_drr01[2][1]=2*(-(q[0]*dq_drr01[1]+dq_drr01[0]*q[1])+(q[2]*dq_drr01[3]+dq_drr01[2]*q[3]));
+  }
+
+  d.resize(n);
+
+  // calculate rotation matrix derivatives and components distances needed for components only when align!=displacement
+  if(!alEqDis)ddist_drotation.zero();
+
+  // GPU
+  std::vector<double> rotation_host;
+  rotation_host.resize(9);
+  rotation_host[0] = rotation[0][0];
+  rotation_host[1] = rotation[1][0];
+  rotation_host[2] = rotation[2][0];
+  rotation_host[3] = rotation[0][1];
+  rotation_host[4] = rotation[1][1];
+  rotation_host[5] = rotation[2][1];
+  rotation_host[6] = rotation[0][2];
+  rotation_host[7] = rotation[1][2];
+  rotation_host[8] = rotation[2][2];
+  // 3,3,1,1
+  rotation_device = af::array(3, 3, &rotation_host.front());
+  // 3,n,1,1
+  d_device = positions_device - af::tile(cp_device,1,n) - af::matmul(rotation_device, reference_device - af::tile(cr_device,1,n));
+  // NOTE(review): d was only resized above; the per-atom differences exist in d_device alone.
+  // Confirm that nothing reads the host vector d after this GPU path.
+  // ddist_drotation if needed
+
+  // GPU
+  if(!alEqDis or !only_rotation) {
+    std::vector<double> displace_host;
+    displace_host.resize(n);
+    #pragma omp parallel for num_threads(OpenMP::getNumThreads())
+    for (unsigned iat=0; iat<n; iat++) {
+      displace_host[iat] = displace[iat];
+    }
+    // 1,n,1,1
+    displace_device = af::array(1, n, &displace_host.front());
+    // 3,3,1,1 -> 9,1,1,1
+    ddist_drotation_device = af::flat( af::matmul(-2 * af::tile(displace_device,3,1) * d_device, (reference_device - af::tile(cr_device,1,n)).T()) );
+
+    double ddist_drotation_host[9]; ddist_drotation_device.host(ddist_drotation_host);   // stack buffer: the heap copy from host<double>() was never released
+
+    ddist_drotation[0][0] = ddist_drotation_host[0];
+    ddist_drotation[1][0] = ddist_drotation_host[1];
+    ddist_drotation[2][0] = ddist_drotation_host[2];
+    ddist_drotation[0][1] = ddist_drotation_host[3];
+    ddist_drotation[1][1] = ddist_drotation_host[4];
+    ddist_drotation[2][1] = ddist_drotation_host[5];
+    ddist_drotation[0][2] = ddist_drotation_host[6];
+    ddist_drotation[1][2] = ddist_drotation_host[7];
+    ddist_drotation[2][2] = ddist_drotation_host[8];
+  }
+
+  if(!alEqDis or !only_rotation) {
+    ddist_drr01.zero();
+    for(unsigned i=0; i<3; i++) for(unsigned j=0; j<3; j++) ddist_drr01+=ddist_drotation[i][j]*drotation_drr01[i][j];
+  }
+  // transfer this bools to the cd so that this settings will be reflected in the other calls
+  this->alEqDis=alEqDis;
+  this->safe=safe;
+  isInitialized=true;
+#endif
+}
+void RMSDCoreData::doCoreCalc_cpu(bool safe,bool alEqDis, bool only_rotation) {
 
   retrieve_only_rotation=only_rotation;
   const unsigned n=static_cast<unsigned int>(reference.size());
@@ -1114,8 +1455,50 @@ void RMSDCoreData::doCoreCalc(bool safe,bool alEqDis, bool only_rotation) {
   isInitialized=true;
 
 }
-/// just retrieve the distance already calculated
-double RMSDCoreData::getDistance( bool squared) {
+/// Entry point of the core calculation: evaluates the elements the quaternion needs so that
+/// additional calls can retrieve the different components.
+/// The centers of both reference and positions are expected to be already set; when they have
+/// not been removed from the coordinates they are removed prior to the calculation of the alignment.
+/// The work itself is delegated to doCoreCalc_cpu or doCoreCalc_gpu.
+void RMSDCoreData::doCoreCalc(bool safe, bool alEqDis, bool only_rotation) {
+  const bool gpu = false;   // the ArrayFire variant is switched off here
+  if (!gpu) doCoreCalc_cpu(safe, alEqDis, only_rotation);
+  else doCoreCalc_gpu(safe, alEqDis, only_rotation);
+}
+
+
+
+double RMSDCoreData::getDistance_gpu(bool squared) {
+#ifdef __PLUMED_HAS_ARRAYFIRE
+  if(!isInitialized)plumed_merror("getDistance cannot calculate the distance without being initialized first by doCoreCalc ");
+
+  // Three cases, selected by the flags stored by the core calculation:
+  //  - align!=displace      : displace-weighted sum of the squared components of d_device
+  //  - align==displace, safe: the same sum, weighted by align
+  //  - align==displace, fast: no sum over atoms, eigenvals[0]+rr00+rr11 is used directly
+  double msd;
+  if(!alEqDis) {
+    msd = af::sum<double>(displace_device * af::sum(d_device*d_device));
+  } else if(safe) {
+    msd = af::sum<double>(align_device * af::sum(d_device*d_device));
+  } else {
+    msd = eigenvals[0]+rr00+rr11;
+  }
+
+  if(squared) {
+    dist=msd;
+    distanceIsMSD=true;
+  } else {
+    dist=sqrt(msd);
+    distanceIsMSD=false;
+  }
+  hasDistance=true;
+  return dist;
+#else
+  return 0.0;
+#endif
+}
+double RMSDCoreData::getDistance_cpu(bool squared) {
 
   if(!isInitialized)plumed_merror("getDistance cannot calculate the distance without being initialized first by doCoreCalc ");
 
@@ -1142,6 +1525,12 @@ double RMSDCoreData::getDistance( bool squared) {
   hasDistance=true;
   return dist;
 }
+/// Return the distance, delegating to the CPU or to the ArrayFire variant
+double RMSDCoreData::getDistance(bool squared) {
+  const bool gpu = false;   // the ArrayFire variant is switched off here
+  return gpu ? getDistance_gpu(squared)
+         : getDistance_cpu(squared);
+}
 
 void RMSDCoreData::doCoreCalcWithCloseStructure(bool safe,bool alEqDis, const Tensor & rotationPosClose, const Tensor & rotationRefClose, std::array<std::array<Tensor,3>,3> & drotationPosCloseDrr01) {
 
@@ -1450,23 +1839,21 @@ template double RMSD::optimalAlignment<true,true>(const  std::vector<double>  &
     const  std::vector<double>  & displace,
     const std::vector<Vector> & positions,
     const std::vector<Vector> & reference,
-    std::vector<Vector>  & derivatives, bool squared)const;
+    std::vector<Vector>  & derivatives, bool squared, bool gpu, int deviceid)const;
 template double RMSD::optimalAlignment<true,false>(const  std::vector<double>  & align,
     const  std::vector<double>  & displace,
     const std::vector<Vector> & positions,
     const std::vector<Vector> & reference,
-    std::vector<Vector>  & derivatives, bool squared)const;
+    std::vector<Vector>  & derivatives, bool squared, bool gpu, int deviceid)const;
 template double RMSD::optimalAlignment<false,true>(const  std::vector<double>  & align,
     const  std::vector<double>  & displace,
     const std::vector<Vector> & positions,
     const std::vector<Vector> & reference,
-    std::vector<Vector>  & derivatives, bool squared)const;
+    std::vector<Vector>  & derivatives, bool squared, bool gpu, int deviceid)const;
 template double RMSD::optimalAlignment<false,false>(const  std::vector<double>  & align,
     const  std::vector<double>  & displace,
     const std::vector<Vector> & positions,
     const std::vector<Vector> & reference,
-    std::vector<Vector>  & derivatives, bool squared)const;
-
-
+    std::vector<Vector>  & derivatives, bool squared, bool gpu, int deviceid)const;
 
 }
diff --git a/src/tools/RMSD.h b/src/tools/RMSD.h
index a91ffe19db..a9f61e7cb7 100644
--- a/src/tools/RMSD.h
+++ b/src/tools/RMSD.h
@@ -29,6 +29,10 @@
 #include <string>
 #include <array>
 
+#ifdef __PLUMED_HAS_ARRAYFIRE
+#include <arrayfire.h>
+#endif
+
 namespace PLMD {
 
 class Log;
@@ -79,6 +83,17 @@ class RMSD
   Vector positions_center;
   bool positions_center_is_calculated;
   bool positions_center_is_removed;
+// arrayfire GPU variable for rmsd calculation
+  double rr11;
+#ifdef __PLUMED_HAS_ARRAYFIRE
+  af::array reference_device;
+  af::array align_device;
+  af::array displace_device;
+  mutable std::vector<double> rr01_host;
+  mutable std::vector<double> cpositions_host;
+  mutable std::vector<double> ddist_drotation_host;
+  mutable std::vector<double> derivatives_host;
+#endif
 // calculates the center from the position provided
   Vector calculateCenter(const std::vector<Vector> &p,const std::vector<double> &w) {
     plumed_massert(p.size()==w.size(),"mismatch in dimension of position/align arrays while calculating the center");
@@ -109,12 +124,25 @@ class RMSD
 /// set reference coordinates, remove the com by using uniform weights
   void setReference(const std::vector<Vector> & reference);
   std::vector<Vector> getReference();
-/// set weights and remove the center from reference with normalized weights. If the com has been removed, it resets to the new value
-  void setAlign(const std::vector<double> & align, bool normalize_weights=true, bool remove_center=true);
-  std::vector<double> getAlign();
 /// set align
   void setDisplace(const std::vector<double> & displace, bool normalize_weights=true);
   std::vector<double> getDisplace();
+/// set weights and remove the center from reference with normalized weights. If the com has been removed, it resets to the new value
+  void setAlign(const std::vector<double> & align, bool normalize_weights=true, bool remove_center=true);
+  std::vector<double> getAlign();
+#ifdef __PLUMED_HAS_ARRAYFIRE
+  void setReference_gpu(af::array & reference_device, const std::vector<Vector> & reference, int n);
+  af::array getReference_gpu();
+  void setAlign_gpu(af::array & align_device, const std::vector<double> & align, int n);
+  af::array getAlign_gpu();
+  void setDisplace_gpu(af::array & displace_device, const std::vector<double> & displace, int n);
+  af::array getDisplace_gpu();
+/// set rr11 for gpu calculate
+  void setrr11_gpu(double & rr11);
+  double getrr11_gpu();
+/// set host transfer memory
+  void setHostmem_gpu(std::vector<double> & derivatives_host, std::vector<double> & rr01_host, std::vector<double> & cpositions_host, std::vector<double> & ddist_drotation_host, int n);
+#endif
 ///
   std::string getMethod();
 /// workhorses
@@ -125,12 +153,18 @@ class RMSD
                          std::vector<Vector>  & derivatives,
                          std::vector<Vector>  & displacement,
                          bool squared=false)const;
+  void optimalAlignment_gpu(const std::vector<Vector> & positions,
+                            double & rr00, double & rr11, Tensor & rr01, Vector & cpositions, int deviceid) const;
+  void optimalAlignment_cpu(const  std::vector<double>  & align,
+                            const std::vector<Vector> & positions,
+                            const std::vector<Vector> & reference,
+                            double & rr00, double & rr11, Tensor & rr01, Vector & cpositions) const;
   template <bool safe,bool alEqDis>
   double optimalAlignment(const  std::vector<double>  & align,
                           const  std::vector<double>  & displace,
                           const std::vector<Vector> & positions,
                           const std::vector<Vector> & reference,
-                          std::vector<Vector>  & DDistDPos, bool squared=false)const;
+                          std::vector<Vector>  & DDistDPos, bool squared=false, bool gpu=false, int deviceid=0)const;
 
   template <bool safe, bool alEqDis>
   double optimalAlignmentWithCloseStructure(const  std::vector<double>  & align,
@@ -294,6 +328,18 @@ class RMSDCoreData
   Tensor ddist_drr01;
   Tensor ddist_drotation;
   std::vector<Vector> d; // difference of components
+#ifdef __PLUMED_HAS_ARRAYFIRE
+  af::array positions_device;
+  af::array reference_device;
+  af::array cp_device;
+  af::array cr_device;
+  af::array rr01_device;
+  af::array rotation_device;
+  af::array d_device;
+  af::array align_device;
+  af::array displace_device;
+  af::array ddist_drotation_device;
+#endif
 public:
   /// the constructor (note: only references are passed, therefore is rather fast)
   /// note: this aligns the reference onto the positions
@@ -331,10 +377,14 @@ class RMSDCoreData
   //  does the core calc : first thing to call after the constructor:
   // only_rotation=true does not retrieve the derivatives, just retrieve the optimal rotation (the same calc cannot be exploit further)
   void doCoreCalc(bool safe,bool alEqDis, bool only_rotation=false);
+  void doCoreCalc_cpu(bool safe,bool alEqDis, bool only_rotation=false);
+  void doCoreCalc_gpu(bool safe,bool alEqDis, bool only_rotation=false);
   // do calculation with close structure data structures
   void doCoreCalcWithCloseStructure(bool safe,bool alEqDis, const Tensor & rotationPosClose, const Tensor & rotationRefClose, std::array<std::array<Tensor,3>,3> & drotationPosCloseDrr01);
   // retrieve the distance if required after doCoreCalc
   double getDistance(bool squared);
+  double getDistance_cpu(bool squared);
+  double getDistance_gpu(bool squared);
   // retrieve the derivative of the distance respect to the position
   std::vector<Vector> getDDistanceDPositions();
   // retrieve the derivative of the distance respect to the reference