diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d7b72543c..788bbd81b8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,9 +16,14 @@ we hit release version 1.0.0.
       sisl.geom.graphene
 
 ### Fixed
-
 - `projection` arguments of several functions has been streamlined
 
+### Changed
+- internal Cython code for performance improvements.
+  This yields significant perf. improvements for DFT sparse matrices
+  with *many* edges in the sparse matrix, but a perf. hit for very
+  small TB matrices.
+
 
 ## [0.15.2] - 2024-11-06
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e44950fb77..a50c371a69 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,7 +65,6 @@ add_compile_definitions(CYTHON_NO_PYINIT_EXPORT=1)
 #: lib, perhaps we should change this
 set(CMAKE_SHARED_MODULE_PREFIX "")
 
-
 # Determine whether we are in CIBUILDWHEEL
 # and whether we are building for the universal target
 set(_def_fortran TRUE)
@@ -81,6 +80,8 @@ option(WITH_FORTRAN
 
 # Define all options for the user
 if( WITH_FORTRAN )
+  enable_language(Fortran)
+
   set(F2PY_REPORT_ON_ARRAY_COPY 10
     CACHE STRING
     "The minimum (element) size of arrays before warning about copies")
@@ -209,6 +210,18 @@ if(WITH_FORTRAN)
 endif(WITH_FORTRAN)
 
 
+message(STATUS "Python variables:")
+list(APPEND CMAKE_MESSAGE_INDENT "  ")
+
+cmake_print_variables(Python_INCLUDE_DIRS)
+cmake_print_variables(Python_NumPy_INCLUDE_DIRS)
+if(WITH_FORTRAN)
+  cmake_print_variables(Python_NumPy_F2Py_INCLUDE_DIR)
+endif()
+
+list(POP_BACK CMAKE_MESSAGE_INDENT)
+
+
 message(STATUS "sisl options")
 list(APPEND CMAKE_MESSAGE_INDENT "  ")
 
@@ -230,18 +243,6 @@ endif()
 list(POP_BACK CMAKE_MESSAGE_INDENT)
 
 
-message(STATUS "Python variables:")
-list(APPEND CMAKE_MESSAGE_INDENT "  ")
-
-cmake_print_variables(Python_INCLUDE_DIRS)
-cmake_print_variables(Python_NumPy_INCLUDE_DIRS)
-if(WITH_FORTRAN)
-  cmake_print_variables(Python_NumPy_F2Py_INCLUDE_DIR)
-endif()
-
-list(POP_BACK CMAKE_MESSAGE_INDENT)
-
-
 
 # Return in _result whether the _file should be built, or not
 # It checks whether the file is present in the NO_COMPILATION
diff --git a/benchmarks/optimizations/hamiltonian.ipynb b/benchmarks/optimizations/hamiltonian.ipynb
new file mode 100644
index 0000000000..e6edd7ff5c
--- /dev/null
+++ b/benchmarks/optimizations/hamiltonian.ipynb
@@ -0,0 +1,77 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we test and check the performance of the `Hk` implementation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from pathlib import Path\n",
+    "import numpy as np\n",
+    "import sisl as si\n",
+    "\n",
+    "files = Path(os.environ[\"SISL_FILES_TESTS\"])\n",
+    "siesta = files / \"siesta\"\n",
+    "\n",
+    "N = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "H = si.Hamiltonian.read(siesta / \"Si_pdos_k\" / \"Si_pdos.TSHS\").tile(N, 0).tile(N, 1)\n",
+    "\n",
+    "%timeit H.Hk()\n",
+    "%timeit H.Hk([0.1] * 3)\n",
+    "%timeit H.Hk(format=\"array\")\n",
+    "%timeit H.Hk([0.1] * 3, format=\"array\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "H = si.Hamiltonian.read(siesta / \"Pt2_soc\" / \"Pt2_xx.TSHS\").tile(N, 0).tile(N // 2, 1)\n",
+    "\n",
+    "%timeit H.Hk()\n",
+    "%timeit H.Hk([0.1] * 3)\n",
+    "%timeit H.Hk(format=\"array\")\n",
+    "%timeit H.Hk([0.1] * 3, format=\"array\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
index 5390c0c125..8762b8ab42 100755
--- a/benchmarks/run.sh
+++ b/benchmarks/run.sh
@@ -15,6 +15,5 @@ profile=$base.profile
 # Stats
 stats=$base.stats
 
-python -m cProfile -o $profile $script $@
-python stats.py $profile > $stats
-
+python3 -m cProfile -o $profile $script $@
+python3 stats.py $profile > $stats
diff --git a/benchmarks/run3.sh b/benchmarks/run3.sh
deleted file mode 100755
index d3586bb313..0000000000
--- a/benchmarks/run3.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-script=graphene.py
-if [ $# -gt 0 ]; then
-    script=$1
-    shift
-fi
-
-# Base name
-base=${script%.py}
-
-# Determine output profile
-profile=$base.profile
-
-# Stats
-stats=$base.stats
-
-python3 -m cProfile -o $profile $script $@
-python3 stats.py $profile > $stats
-
diff --git a/src/sisl/CMakeLists.txt b/src/sisl/CMakeLists.txt
index b94e5a741e..87a6346477 100644
--- a/src/sisl/CMakeLists.txt
+++ b/src/sisl/CMakeLists.txt
@@ -1,3 +1,9 @@
+set_property(DIRECTORY
+  APPEND
+  PROPERTY INCLUDE_DIRECTORIES
+  ${CMAKE_CURRENT_SOURCE_DIR}/_core
+  )
+
 foreach(source _indices _math_small)
   add_cython_library(
     SOURCE ${source}.pyx
@@ -29,6 +35,7 @@ endforeach()
 get_directory_property( SISL_DEFINITIONS DIRECTORY
   ${CMAKE_CURRENT_SOURCE_DIR}
   COMPILE_DEFINITIONS )
+
 # Join to stringify list
 list(JOIN SISL_DEFINITIONS " " SISL_DEFINITIONS)
 
diff --git a/src/sisl/__init__.py b/src/sisl/__init__.py
index 8b1f7b065b..3841fc2dac 100644
--- a/src/sisl/__init__.py
+++ b/src/sisl/__init__.py
@@ -88,6 +88,8 @@
 # import the common options used
 from ._common import *
 
+from ._core import *
+
 # Import warning classes
 # We currently do not import warn and info
 # as they are too generic names in case one does from sisl import *
@@ -106,8 +108,6 @@
 # Below are sisl-specific imports
 from .shape import *
 
-from ._core import *
-
 # Physical quantities and required classes
 from .physics import *
 
diff --git a/src/sisl/_core/CMakeLists.txt b/src/sisl/_core/CMakeLists.txt
index 5453a1c9f6..f2cea9c7c6 100644
--- a/src/sisl/_core/CMakeLists.txt
+++ b/src/sisl/_core/CMakeLists.txt
@@ -1,4 +1,4 @@
-foreach(source _lattice _sparse)
+foreach(source _lattice _dtypes _sparse)
   add_cython_library(
     SOURCE ${source}.pyx
     LIBRARY ${source}
diff --git a/src/sisl/_core/_dtypes.pxd b/src/sisl/_core/_dtypes.pxd
new file mode 100644
index 0000000000..9d2247c9e5
--- /dev/null
+++ b/src/sisl/_core/_dtypes.pxd
@@ -0,0 +1,102 @@
+"""
+Shared header for fused dtypes
+"""
+cimport cython
+
+import numpy as np
+
+cimport numpy as cnp
+from numpy cimport (
+    complex64_t,
+    complex128_t,
+    float32_t,
+    float64_t,
+    int8_t,
+    int16_t,
+    int32_t,
+    int64_t,
+    uint8_t,
+    uint16_t,
+    uint32_t,
+    uint64_t,
+)
+
+# Generic typedefs for sisl internal naming convention
+ctypedef size_t size_st
+ctypedef Py_ssize_t ssize_st
+
+
+ctypedef fused ints_st:
+    int
+    long
+
+
+ctypedef fused floats_st:
+    float
+    double
+
+
+ctypedef fused complexs_st:
+    float complex
+    double complex
+
+
+ctypedef fused floatcomplexs_st:
+    float
+    double
+    float complex
+    double complex
+
+
+# We need this fused data-type to omit complex data-types
+ctypedef fused reals_st:
+    int
+    long
+    float
+    double
+
+ctypedef fused numerics_st:
+    int
+    long
+    float
+    double
+    float complex
+    double complex
+
+ctypedef fused _type2dtype_types_st:
+    short
+    int
+    long
+    float
+    double
+    float complex
+    double complex
+    float32_t
+    float64_t
+    #complex64_t # not usable...
+    #complex128_t
+    int8_t
+    int16_t
+    int32_t
+    int64_t
+    uint8_t
+    uint16_t
+    uint32_t
+    uint64_t
+
+
+cdef object type2dtype(const _type2dtype_types_st v)
+
+
+ctypedef fused _inline_sum_st:
+    short
+    int
+    long
+    int16_t
+    int32_t
+    int64_t
+    uint16_t
+    uint32_t
+    uint64_t
+
+cdef ssize_st inline_sum(const _inline_sum_st[::1] array) noexcept nogil
diff --git a/src/sisl/_core/_dtypes.pyx b/src/sisl/_core/_dtypes.pyx
new file mode 100644
index 0000000000..a57d775e35
--- /dev/null
+++ b/src/sisl/_core/_dtypes.pyx
@@ -0,0 +1,80 @@
+"""
+Inline-sum (all useful shared code could be placed here)
+"""
+cimport cython
+
+import numpy as np
+
+cimport numpy as cnp
+from numpy cimport (
+    complex64_t,
+    complex128_t,
+    float32_t,
+    float64_t,
+    int8_t,
+    int16_t,
+    int32_t,
+    int64_t,
+    uint8_t,
+    uint16_t,
+    uint32_t,
+    uint64_t,
+)
+
+
+@cython.initializedcheck(False)
+cdef inline object type2dtype(const _type2dtype_types_st v):
+    if _type2dtype_types_st is int8_t:
+        return np.int8
+    elif _type2dtype_types_st is int16_t:
+        return np.int16
+    elif _type2dtype_types_st is cython.short:
+        return np.int16
+    elif _type2dtype_types_st is int32_t:
+        return np.int32
+    elif _type2dtype_types_st is cython.int:
+        return np.int32
+    elif _type2dtype_types_st is int64_t:
+        return np.int64
+    elif _type2dtype_types_st is cython.long:
+        return np.int64
+    elif _type2dtype_types_st is float32_t:
+        return np.float32
+    elif _type2dtype_types_st is cython.float:
+        return np.float32
+    elif _type2dtype_types_st is float64_t:
+        return np.float64
+    elif _type2dtype_types_st is cython.double:
+        return np.float64
+    elif _type2dtype_types_st is complex64_t:
+        return np.complex64
+    elif _type2dtype_types_st is cython.floatcomplex:
+        return np.complex64
+    elif _type2dtype_types_st is complex128_t:
+        return np.complex128
+    elif _type2dtype_types_st is cython.doublecomplex:
+        return np.complex128
+
+    # More special cases
+    elif _type2dtype_types_st is uint8_t:
+        return np.uint8
+    elif _type2dtype_types_st is uint16_t:
+        return np.uint16
+    elif _type2dtype_types_st is uint32_t:
+        return np.uint32
+    elif _type2dtype_types_st is uint64_t:
+        return np.uint64
+
+
+
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.boundscheck(False)
+cdef inline ssize_st inline_sum(const _inline_sum_st[::1] array) noexcept nogil:
+    cdef ssize_st total, i
+
+    total = 0
+    for i in range(array.shape[0]):
+        total += array[i]
+
+    return total
diff --git a/src/sisl/_core/_sparse.pxd b/src/sisl/_core/_sparse.pxd
index d36d0f24d2..a588c5d149 100644
--- a/src/sisl/_core/_sparse.pxd
+++ b/src/sisl/_core/_sparse.pxd
@@ -1,2 +1,6 @@
 # Define the interfaces for the functions exposed through cimport
-cdef Py_ssize_t inline_sum(const int[::1] array) nogil
+from sisl._core._dtypes cimport ints_st
+
+
+cdef void ncol2ptr_nc(const ints_st nr, const ints_st[::1] ncol, ints_st[::1] ptr, const
+ints_st per_elem) noexcept nogil
diff --git a/src/sisl/_core/_sparse.pyx b/src/sisl/_core/_sparse.pyx
index 484aefbffb..c0ff04e706 100644
--- a/src/sisl/_core/_sparse.pyx
+++ b/src/sisl/_core/_sparse.pyx
@@ -2,76 +2,89 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 cimport cython
-from libc.math cimport fabs
 
 import numpy as np
 
-# This enables Cython enhanced compatibilities
-
-cimport numpy as np
+cimport numpy as cnp
+from numpy cimport dtype, ndarray
 
+from sisl._core._dtypes cimport inline_sum, ints_st, numerics_st, ssize_st, type2dtype
 from sisl._indices cimport in_1d
 
-__all__ = ["fold_csr_matrix", "fold_csr_matrix_nc",
-           "fold_csr_diagonal_nc", "sparse_dense", "inline_sum"]
-
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef inline Py_ssize_t inline_sum(const int[::1] array) noexcept nogil:
-    cdef Py_ssize_t total, i
+cdef void ncol2ptr_nc(const ints_st nr, const ints_st[::1] ncol, ints_st[::1] ptr, const ints_st per_elem) noexcept nogil:
+    cdef ssize_st r, rr
+
+    # this is NC/SOC
+    ptr[0] = 0
+    ptr[1] = ncol[0] * per_elem
+    for r in range(1, nr):
+        rr = r * 2
+        # do both
+        ptr[rr] = ptr[rr - 1] + ncol[r-1] * per_elem
+        ptr[rr+1] = ptr[rr] + ncol[r] * per_elem
 
-    total = 0
-    for i in range(array.shape[0]):
-        total += array[i]
-    return total
+    ptr[nr * 2] = ptr[nr * 2 - 1] + ncol[nr - 1] * per_elem
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
 @cython.cdivision(True)
-def fold_csr_matrix(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                    np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                    np.ndarray[np.int32_t, ndim=1, mode='c'] COL):
+def fold_csr_matrix(ints_st[::1] ptr,
+                    ints_st[::1] ncol,
+                    ints_st[::1] col):
     """ Fold all columns into a square matrix """
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
+
     # Number of rows
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] FOLD_ptr = np.empty([nr + 1], dtype=np.int32)
-    cdef int[::1] fold_ptr = FOLD_ptr
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] FOLD_ncol = np.empty([nr], dtype=np.int32)
-    cdef int[::1] fold_ncol = FOLD_ncol
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] FOLD_col = np.empty([inline_sum(ncol)], dtype=np.int32)
-    cdef int[::1] fold_col = FOLD_col
+    cdef ints_st nr = ncol.shape[0]
+
+    cdef object dtype = type2dtype[ints_st](1)
+    cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr + 1], dtype=dtype)
+    cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr], dtype=dtype)
+    cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol)], dtype=dtype)
+
+    cdef ints_st[::1] fold_ptr = FOLD_ptr
+    cdef ints_st[::1] fold_ncol = FOLD_ncol
+    cdef ints_st[::1] fold_col = FOLD_col
+
     # local variables
-    cdef Py_ssize_t r, ind, nz, c
-    cdef int[::1] tmp
+    cdef ints_st r, c, nz, ind
+    cdef ints_st[::1] tmp
 
     nz = 0
     fold_ptr[0] = 0
+
     # Loop on all rows
     for r in range(nr):
 
         # Initialize the pointer arrays
-        if ncol[r] > 0:
-            fold_ncol[r] = 1
-            fold_col[fold_ptr[r]] = col[ptr[r]] % nr
-        else:
-            fold_ncol[r] = 0
-
-        for ind in range(ptr[r] + 1, ptr[r] + ncol[r]):
-            c = col[ind] % nr
-            if not in_1d(fold_col[fold_ptr[r]:fold_ptr[r] + fold_ncol[r]], c):
-                fold_col[fold_ptr[r] + fold_ncol[r]] = c
-                fold_ncol[r] += 1
-
-        # Sort indices (we should implement our own sorting algorithm)
-        tmp = np.sort(fold_col[fold_ptr[r]:fold_ptr[r] + fold_ncol[r]])
-        for ind in range(fold_ncol[r]):
+        # Even though large supercells have *many* double entries (after folding)
+        # this turns out to be faster than incrementally searching
+        # the array.
+        # This kind-of-makes sense.
+        # We can do:
+        #  1.
+        #    a) build a full list of folded items
+        #    b) find unique (and sorted) elements
+        # or
+        #  2.
+        #    a) incrementally add a value, only
+        #       if it does not exist.
+        # 1. creates a bigger temporary array, but only
+        #    adds unique values 1 time through numpy fast algorithm
+        # 2. searches an array (of seemingly small arrays) ncol times
+        #    which can be quite heavy.
+        tmp = col[ptr[r]:ptr[r] + ncol[r]].copy()
+        for ind in range(ncol[r]):
+            tmp[ind] %= nr
+
+        tmp = np.unique(tmp)
+        fold_ncol[r] = tmp.shape[0]
+        for ind in range(tmp.shape[0]):
             fold_col[fold_ptr[r] + ind] = tmp[ind]
 
         fold_ptr[r + 1] = fold_ptr[r] + fold_ncol[r]
@@ -88,63 +101,53 @@ def fold_csr_matrix(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
 @cython.wraparound(False)
 @cython.initializedcheck(False)
 @cython.cdivision(True)
-def fold_csr_matrix_nc(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] COL):
+def fold_csr_matrix_nc(ints_st[::1] ptr,
+                       ints_st[::1] ncol,
+                       ints_st[::1] col):
     """ Fold all columns into a square matrix """
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
     # Number of rows
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] FOLD_ptr = np.empty([nr * 2 + 1], dtype=np.int32)
-    cdef int[::1] fold_ptr = FOLD_ptr
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] FOLD_ncol = np.empty([nr * 2], dtype=np.int32)
-    cdef int[::1] fold_ncol = FOLD_ncol
-    # We have to multiply by 4, 2 times the number of rows, and each row couples to 2 more elements
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] FOLD_col = np.empty([inline_sum(ncol) * 4], dtype=np.int32)
-    cdef int[::1] fold_col = FOLD_col
+    cdef ints_st nr = ncol.shape[0]
+
+    cdef object dtype = type2dtype[ints_st](1)
+    cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr * 2 + 1], dtype=dtype)
+    cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr * 2], dtype=dtype)
+    # We have to multiply by 4, 2 times for the extra rows, and another
+    # 2 for the possible double couplings
+    cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol) * 4], dtype=dtype)
+
+    cdef ints_st[::1] fold_ptr = FOLD_ptr
+    cdef ints_st[::1] fold_ncol = FOLD_ncol
+    cdef ints_st[::1] fold_col = FOLD_col
+
     # local variables
-    cdef Py_ssize_t r, rr, ind, nz, c
-    cdef int[::1] tmp
+    cdef ints_st r, rr, ind, nz, c
+    cdef ints_st[::1] tmp
 
     nz = 0
     fold_ptr[0] = 0
+
     # Loop on all rows
     for r in range(nr):
         rr = r * 2
 
-        # Initialize the pointer arrays
-        if ncol[r] > 0:
-            c = (col[ptr[r]] % nr) * 2
-            fold_ncol[rr] = 2
-            fold_col[fold_ptr[rr]] = c
-            fold_col[fold_ptr[rr] + 1] = c + 1
-        else:
-            fold_ncol[rr] = 0
+        tmp = col[ptr[r]:ptr[r] + ncol[r]].copy()
+        for ind in range(ncol[r]):
+            tmp[ind] = (tmp[ind] % nr) * 2
 
-        for ind in range(ptr[r] + 1, ptr[r] + ncol[r]):
-            c = (col[ind] % nr) * 2
-            if not in_1d(fold_col[fold_ptr[rr]:fold_ptr[rr] + fold_ncol[rr]], c):
-                fold_col[fold_ptr[rr] + fold_ncol[rr]] = c
-                fold_col[fold_ptr[rr] + fold_ncol[rr] + 1] = c + 1
-                fold_ncol[rr] += 2
+        tmp = np.unique(tmp)
 
         # Duplicate pointers and counters for next row (off-diagonal)
-        fold_ptr[rr + 1] = fold_ptr[rr] + fold_ncol[rr]
+        fold_ncol[rr] = tmp.shape[0] * 2
         fold_ncol[rr + 1] = fold_ncol[rr]
+        fold_ptr[rr + 1] = fold_ptr[rr] + fold_ncol[rr]
+        fold_ptr[rr + 2] = fold_ptr[rr + 1] + fold_ncol[rr]
 
-        # Sort indices (we should implement our own sorting algorithm)
-        tmp = np.sort(fold_col[fold_ptr[rr]:fold_ptr[rr] + fold_ncol[rr]])
-        for ind in range(fold_ncol[rr]):
-            c = tmp[ind]
-            fold_col[fold_ptr[rr] + ind] = c
-            # Copy to next row as well
-            fold_col[fold_ptr[rr+1] + ind] = c
+        for ind in range(tmp.shape[0]):
+            fold_col[fold_ptr[rr] + ind * 2] = tmp[ind]
+            fold_col[fold_ptr[rr] + ind * 2 + 1] = tmp[ind] + 1
+            fold_col[fold_ptr[rr+1] + ind * 2] = tmp[ind]
+            fold_col[fold_ptr[rr+1] + ind * 2 + 1] = tmp[ind] + 1
 
-        # Increment the next row
-        fold_ptr[rr + 2] = fold_ptr[rr + 1] + fold_ncol[rr + 1]
         nz += fold_ncol[rr] * 2
 
     if nz > fold_col.shape[0]:
@@ -158,29 +161,30 @@ def fold_csr_matrix_nc(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
 @cython.wraparound(False)
 @cython.initializedcheck(False)
 @cython.cdivision(True)
-def fold_csr_diagonal_nc(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] COL):
+def fold_csr_matrix_nc_diag(ints_st[::1] ptr,
+                            ints_st[::1] ncol,
+                            ints_st[::1] col):
     """ Fold all columns into a square matrix """
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
     # Number of rows
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] FOLD_ptr = np.empty([nr * 2 + 1], dtype=np.int32)
-    cdef int[::1] fold_ptr = FOLD_ptr
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] FOLD_ncol = np.empty([nr * 2], dtype=np.int32)
-    cdef int[::1] fold_ncol = FOLD_ncol
-    # We have to multiply by 2, 2 times the number of rows
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] FOLD_col = np.empty([inline_sum(ncol) * 2], dtype=np.int32)
-    cdef int[::1] fold_col = FOLD_col
+    cdef ints_st nr = ncol.shape[0]
+
+    cdef object dtype = type2dtype[ints_st](1)
+    cdef ndarray[ints_st, mode='c'] FOLD_ptr = np.empty([nr * 2 + 1], dtype=dtype)
+    cdef ndarray[ints_st, mode='c'] FOLD_ncol = np.empty([nr * 2], dtype=dtype)
+    # We have to multiply by 2 for the extra rows
+    cdef ndarray[ints_st, mode='c'] FOLD_col = np.empty([inline_sum(ncol) * 2], dtype=dtype)
+
+    cdef ints_st[::1] fold_ptr = FOLD_ptr
+    cdef ints_st[::1] fold_ncol = FOLD_ncol
+    cdef ints_st[::1] fold_col = FOLD_col
+
     # local variables
-    cdef Py_ssize_t r, rr, ind, nz, c
-    cdef int[::1] tmp
+    cdef ints_st r, rr, ind, nz, c
+    cdef ints_st[::1] tmp
 
     nz = 0
     fold_ptr[0] = 0
+
     # Loop on all rows
     for r in range(nr):
         rr = r * 2
@@ -222,45 +226,25 @@ def fold_csr_diagonal_nc(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
     return FOLD_ptr, FOLD_ncol, FOLD_col[:nz].copy()
 
 
-# Here we have the int + long
-# For some analysis it may be useful
-ctypedef fused numeric_complex:
-    int
-    long
-    float
-    double
-    float complex
-    double complex
-
-
 def sparse_dense(M):
-    return _sparse_dense(M.shape, M.ptr, M.ncol, M.col, M._D, M.dtype)
+    cdef cnp.ndarray dense = np.zeros(M.shape, dtype=M.dtype)
+    _sparse_dense(M.ptr, M.ncol, M.col, M._D, dense)
+    return dense
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-@cython.cdivision(True)
-def _sparse_dense(shape,
-                  np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                  np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                  np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                  numeric_complex[:, ::1] D, dtype):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef V = np.zeros(shape, dtype=dtype)
-    cdef VV = V[:, ::1]
-    cdef Py_ssize_t r, ind, ix, s2
-
-    s2 = shape[2]
-    for r in range(nr):
+def _sparse_dense(ints_st[::1] ptr,
+                   ints_st[::1] ncol,
+                   ints_st[::1] col,
+                   numerics_st[:, ::1] data,
+                   numerics_st[:, :, ::1] dense):
+
+    cdef ints_st r, ind, ix, s2
+
+    s2 = dense.shape[2]
+    for r in range(ncol.shape[0]):
         for ind in range(ptr[r], ptr[r] + ncol[r]):
             for ix in range(s2):
-                VV[r, col[ind], ix] += D[ind, ix]
-
-    return V
+                dense[r, col[ind], ix] += data[ind, ix]
diff --git a/src/sisl/_core/sparse.py b/src/sisl/_core/sparse.py
index aa75fdc39b..e8cabbb0a4 100644
--- a/src/sisl/_core/sparse.py
+++ b/src/sisl/_core/sparse.py
@@ -404,7 +404,7 @@ def diags(self, diagonals, offsets=0, dim=None, dtype=None):
         shape[2] = dim
         shape = tuple(shape)
 
-        offsets = array_fill_repeat(offsets, shape[0], cls=dtype)
+        offsets = array_fill_repeat(offsets, shape[0], cls=np.int32)
 
         # Create the index-pointer, data and values
         data = array_fill_repeat(diagonals, shape[0], axis=0, cls=dtype)
@@ -488,7 +488,7 @@ def finalized(self):
         """Whether the contained data is finalized and non-used elements have been removed"""
         return self._finalized
 
-    def finalize(self, sort=True):
+    def finalize(self, sort: bool = True):
         """Finalizes the sparse matrix by removing all non-set elements
 
         One may still interact with the sparse matrix as one would previously.
diff --git a/src/sisl/_core/sparse_geometry.py b/src/sisl/_core/sparse_geometry.py
index 862eb750f3..45c96f081f 100644
--- a/src/sisl/_core/sparse_geometry.py
+++ b/src/sisl/_core/sparse_geometry.py
@@ -652,7 +652,7 @@ def create_construct(self, R, params):
         """
         if len(R) != len(params):
             raise ValueError(
-                f"{self.__class__.__name__}.create_construct got different lengths of `R` and `param`"
+                f"{self.__class__.__name__}.create_construct got different lengths of 'R' and 'params'"
             )
 
         def func(self, ia, atoms, atoms_xyz=None):
@@ -1003,15 +1003,17 @@ def unrepeat(
         atoms = np.arange(self.geometry.na).reshape(-1, reps).T.ravel()
         return self.sub(atoms).untile(reps, axis, segment, *args, sym=sym, **kwargs)
 
-    def finalize(self):
+    def finalize(self, *args, **kwargs):
         """Finalizes the model
 
         Finalizes the model so that all non-used elements are removed. I.e. this simply reduces the memory requirement for the sparse matrix.
 
-        Note that adding more elements to the sparse matrix is more time-consuming than for a non-finalized sparse matrix due to the
+        Notes
+        -----
+        Adding more elements to the sparse matrix is more time-consuming than for a non-finalized sparse matrix due to the
         internal data-representation.
         """
-        self._csr.finalize()
+        self._csr.finalize(*args, **kwargs)
 
     def tocsr(self, dim: int = 0, isc=None, **kwargs):
         """Return a :class:`~scipy.sparse.csr_matrix` for the specified dimension
diff --git a/src/sisl/_indices.pxd b/src/sisl/_indices.pxd
index 261207e919..5922b5bd71 100644
--- a/src/sisl/_indices.pxd
+++ b/src/sisl/_indices.pxd
@@ -1,3 +1,17 @@
 # Define the interfaces for the functions exposed through cimport
-cdef int in_1d(const int[::1] array, const int v) nogil
-cdef Py_ssize_t _index_sorted(const int[::1] array, const int v) nogil
+from numpy cimport int16_t, int32_t, int64_t
+
+from sisl._core._dtypes cimport ints_st, ssize_st
+
+
+cdef bint in_1d(const ints_st[::1] array, const ints_st v) noexcept nogil
+
+ctypedef fused _ints_index_sorted_st:
+    short
+    int
+    long
+    int16_t
+    int32_t
+    int64_t
+
+cdef ssize_st _index_sorted(const ints_st[::1] array, const _ints_index_sorted_st v) noexcept nogil
diff --git a/src/sisl/_indices.pyx b/src/sisl/_indices.pyx
index 2c270ee220..3795342a79 100644
--- a/src/sisl/_indices.pyx
+++ b/src/sisl/_indices.pyx
@@ -2,394 +2,349 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 cimport cython
-from libc.math cimport fabs, sqrt
+from libc.math cimport fabs, fabsf, sqrt, sqrtf
 
 import numpy as np
 
-# This enables Cython enhanced compatibilities
+cimport numpy as cnp
+from numpy cimport dtype, ndarray
 
-cimport numpy as np
+from sisl._core._dtypes cimport floats_st, ints_st, ssize_st, type2dtype
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def indices_only(np.ndarray[np.int32_t, ndim=1, mode='c'] element, np.ndarray[np.int32_t, ndim=1, mode='c'] test_element):
+@cython.initializedcheck(False)
+def indices_only(ints_st[::1] element, ints_st[::1] test_element):
     """ Return indices of all `test_element` in the element array.
 
     Parameters
     ----------
-    element : np.ndarray(np.int32)
+    element :
         array to search in
-    test_element : np.ndarray(np.int32)
+    test_element :
         values to find the indices of in `element`
     """
     # Ensure contiguous arrays
-    cdef int[::1] ELEMENT = element
-    cdef int[::1] TEST_ELEMENT = test_element
-    cdef Py_ssize_t n_element = ELEMENT.shape[0]
-    cdef Py_ssize_t n_test_element = TEST_ELEMENT.shape[0]
+    cdef ssize_st n_element = element.shape[0]
+    cdef ssize_st n_test_element = test_element.shape[0]
 
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] idx = np.empty([max(n_test_element, n_element)], dtype=np.int32)
-    cdef int[::1] IDX = idx
+    cdef object dtype = type2dtype[ints_st](1)
+    cdef ndarray[ints_st, mode='c'] IDX = np.empty([max(n_test_element, n_element)], dtype=dtype)
+    cdef ints_st[::1] idx = IDX
 
-    cdef Py_ssize_t n = _indices_only(n_element, ELEMENT, n_test_element, TEST_ELEMENT, IDX)
+    cdef ssize_st i, j, n
 
-    return idx[:n]
+    n = 0
+    with nogil:
 
+        # Fast return
+        if n_test_element == 0:
+            pass
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-cdef Py_ssize_t _indices_only(const Py_ssize_t n_element, const int[::1] element,
-                       const Py_ssize_t n_test_element, const int[::1] test_element,
-                       int[::1] idx) noexcept nogil:
-    cdef Py_ssize_t i, j, n
+        elif n_element == 0:
+            pass
 
-    # Fast return
-    if n_test_element == 0:
-        return 0
-    elif n_element == 0:
-        return 0
+        elif n_test_element > n_element:
+            for j in range(n_test_element):
+                for i in range(n_element):
+                    if test_element[j] == element[i]:
+                        idx[n] = <ints_st> i
+                        n += 1
+                        break
 
-    elif n_test_element > n_element:
-        n = 0
-        for j in range(n_test_element):
+        else:
             for i in range(n_element):
-                if test_element[j] == element[i]:
-                    idx[n] = i
-                    n += 1
-                    break
+                for j in range(n_test_element):
+                    if test_element[j] == element[i]:
+                        idx[n] = <ints_st> i
+                        n += 1
+                        break
+
+    return IDX[:n].copy()
 
-    else:
-        n = 0
-        for i in range(n_element):
-            for j in range(n_test_element):
-                if test_element[j] == element[i]:
-                    idx[n] = i
-                    n += 1
-                    break
-    return n
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def indices(np.ndarray[np.int32_t, ndim=1, mode='c'] element, np.ndarray[np.int32_t, ndim=1, mode='c'] test_element, int offset=0, both_sorted=False):
+@cython.initializedcheck(False)
+def indices(ints_st[::1] element, ints_st[::1] test_element, ints_st offset=0,
+            both_sorted: bool = False):
     """ Return indices of all `test_element` in the search array. If not found the index will be ``-1``
 
     Parameters
     ----------
-    element : np.ndarray(np.int32)
+    element :
         array to search in
-    test_element : np.ndarray(np.int32)
+    test_element :
         values to find the indices of in `element`
-    offset : int
+    offset :
         index offset
     """
     # Ensure contiguous arrays
-    cdef int[::1] ELEMENT = element
-    cdef int[::1] TEST_ELEMENT = test_element
-    cdef Py_ssize_t n_element = ELEMENT.shape[0]
-    cdef Py_ssize_t n_test_element = TEST_ELEMENT.shape[0]
+    cdef ssize_st n_element = element.shape[0]
+    cdef ssize_st n_test_element = test_element.shape[0]
 
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] idx = np.empty([n_test_element], dtype=np.int32)
-    cdef int[::1] IDX = idx
+    cdef object dtype = type2dtype[ints_st](1)
+    cdef ndarray[ints_st, mode='c'] IDX = np.empty([n_test_element], dtype=dtype)
+    cdef ints_st[::1] idx = IDX
+    cdef ssize_st i, j
+    cdef ints_st ctest_element, celement
 
     if offset < 0:
         raise ValueError(f"indices requires offset argument >=0, got {offset}")
 
-    if both_sorted:
-        _indices_sorted_arrays(n_element, ELEMENT, n_test_element, TEST_ELEMENT, offset, IDX)
-    else:
-        _indices(n_element, ELEMENT, n_test_element, TEST_ELEMENT, offset, IDX)
-
-    return idx
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-cdef void _indices(const Py_ssize_t n_element, const int[::1] element,
-                   const Py_ssize_t n_test_element, const int[::1] test_element,
-                   const int offset, int[::1] idx) noexcept nogil:
-    cdef Py_ssize_t i, j
-
-    # Fast return
     if n_test_element == 0:
+        # fast return
         pass
+
     elif n_element == 0:
-        for j in range(n_test_element):
-            idx[j] = -1
-        pass
 
-    elif n_test_element > n_element:
         for j in range(n_test_element):
-            idx[j] = -1
-            for i in range(n_element):
-                if test_element[j] == element[i]:
-                    idx[j] = offset + i
-                    break
+            idx[j] = <ints_st> -1
+
+    elif both_sorted:
+
+        i = j = 0
+        while (i < n_element) and (j < n_test_element):
+            celement = element[i]
+            ctest_element = test_element[j]
+            if celement == ctest_element:
+                idx[j] = <ints_st> (i + offset)
+                j += 1
+            elif celement < ctest_element:
+                i += 1
+            elif celement > ctest_element:
+                idx[j] = <ints_st> -1
+                j += 1
+        for i in range(j, n_test_element):
+            idx[i] = <ints_st> -1
 
     else:
-        # We need to initialize
-        for j in range(n_test_element):
-            idx[j] = -1
-        for i in range(n_element):
+        if n_test_element > n_element:
             for j in range(n_test_element):
-                if test_element[j] == element[i]:
-                    idx[j] = offset + i
-                    break
+                idx[j] = <ints_st> -1
+                for i in range(n_element):
+                    if test_element[j] == element[i]:
+                        idx[j] = <ints_st> (offset + i)
+                        break
 
+        else:
+            # We need to initialize
+            for j in range(n_test_element):
+                idx[j] = <ints_st> -1
+            for i in range(n_element):
+                for j in range(n_test_element):
+                    if test_element[j] == element[i]:
+                        idx[j] = <ints_st> (offset + i)
+                        break
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-cdef void _indices_sorted_arrays(const Py_ssize_t n_element, const int[::1] element,
-                                 const Py_ssize_t n_test_element, const int[::1] test_element,
-                                 const int offset, int[::1] idx) noexcept nogil:
-    cdef Py_ssize_t i, j
-    cdef int ctest_element, celement
+    return IDX
 
-    # Fast return
-    if n_test_element == 0:
-        pass
-    elif n_element == 0:
-        for j in range(n_test_element):
-            idx[j] = -1
-        return
-
-    i = 0
-    j = 0
-    while (i < n_element) and (j < n_test_element):
-        celement = element[i]
-        ctest_element = test_element[j]
-        if celement == ctest_element:
-            idx[j] = i + offset
-            j += 1
-        elif celement < ctest_element:
-            i += 1
-        elif celement > ctest_element:
-            idx[j] = -1
-            j += 1
-    for j in range(j, n_test_element):
-        idx[j] = -1
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def indices_in_cylinder(np.ndarray[np.float64_t, ndim=2, mode='c'] dxyz, const double R, const double h):
+@cython.initializedcheck(False)
+def indices_in_cylinder(floats_st[:, ::1] dxyz, const floats_st R, const floats_st h):
     """ Indices for all coordinates that are within a cylinde radius `R` and height `h`
 
     Parameters
     ----------
-    dxyz : ndarray(np.float64)
+    dxyz :
        coordinates centered around the cylinder
-    R : float
+    R :
        radius of cylinder to check
-    h : float
+    h :
        height of cylinder to check
 
     Returns
     -------
-    index : np.ndarray(np.int32)
+    index :
        indices of all dxyz coordinates that are within the cylinder
     """
-    cdef double[:, ::1] dXYZ = dxyz
-    cdef Py_ssize_t n = dXYZ.shape[0]
-    cdef np.ndarray[np.int32_t, ndim=1] idx = np.empty([n], dtype=np.int32)
-    cdef int[::1] IDX = idx
+    cdef ssize_st n = dxyz.shape[0]
+    cdef ssize_st nxyz = dxyz.shape[1] - 1
 
-    n = _indices_in_cylinder(dXYZ, R, h, IDX)
+    cdef ndarray[int32_t] IDX = np.empty([n], dtype=np.int32)
+    cdef int[::1] idx = IDX
 
-    if n == 0:
-        return np.empty([0], dtype=np.int32)
-    return idx[:n].copy()
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-cdef Py_ssize_t _indices_in_cylinder(const double[:, ::1] dxyz, const double R, const double h, int[::1] idx) noexcept nogil:
-    cdef Py_ssize_t N = dxyz.shape[0]
-    cdef Py_ssize_t xyz = dxyz.shape[1]
-    cdef double R2 = R * R
-    cdef double L2
-    cdef Py_ssize_t i, j, n
-    cdef int skip
+    cdef floats_st R2 = R * R
+    cdef floats_st L2
+    cdef ssize_st i, j, m
+    cdef bint skip
 
     # Reset number of elements
-    n = 0
-
-    for i in range(N):
-        skip = 0
-        for j in range(xyz-1):
-            skip |= dxyz[i, j] > R
-        if skip or dxyz[i, xyz-1] > h: continue
-
-        L2 = 0.
-        for j in range(xyz-1):
-            L2 += dxyz[i, j] * dxyz[i, j]
-        if L2 > R2: continue
-        idx[n] = i
-        n += 1
-
-    return n
+    m = 0
+
+    with nogil:
+        for i in range(n):
+            skip = 0
+            for j in range(nxyz):
+                skip |= dxyz[i, j] > R
+            if skip or dxyz[i, nxyz] > h: continue
+
+            L2 = 0.
+            for j in range(nxyz):
+                L2 += dxyz[i, j] * dxyz[i, j]
+            if L2 > R2: continue
+            idx[m] = <int> i
+            m += 1
+
+    if m == 0:
+        return np.empty([0], dtype=np.int32)
+    return IDX[:m].copy()
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def indices_in_sphere(np.ndarray[np.float64_t, ndim=2, mode='c'] dxyz, const double R):
+@cython.initializedcheck(False)
+def indices_in_sphere(floats_st[:, ::1] dxyz, const floats_st R):
     """ Indices for all coordinates that are within a sphere of radius `R`
 
     Parameters
     ----------
-    dxyz : ndarray(np.float64)
+    dxyz :
        coordinates centered around the sphere
-    R : float
+    R :
        radius of sphere to check
 
     Returns
     -------
-    index : np.ndarray(np.int32)
+    index :
        indices of all dxyz coordinates that are within the sphere of radius `R`
     """
-    cdef double[:, ::1] dXYZ = dxyz
-    cdef Py_ssize_t n = dXYZ.shape[0]
-    cdef np.ndarray[np.int32_t, ndim=1] idx = np.empty([n], dtype=np.int32)
-    cdef int[::1] IDX = idx
+    cdef ssize_st n = dxyz.shape[0]
+    cdef ndarray[int32_t, mode='c'] IDX = np.empty([n], dtype=np.int32)
+    cdef int[::1] idx = IDX
 
-    n = _indices_in_sphere(dXYZ, R, IDX)
+    cdef floats_st R2 = R * R
+    cdef ssize_st i, m
 
-    if n == 0:
+    # Reset number of elements
+    m = 0
+
+    with nogil:
+        for i in range(n):
+            if all_fabs_le(dxyz, i, R):
+                if fabs2(dxyz, i) <= R2:
+                    idx[m] = <int> i
+                    m += 1
+    if m == 0:
         return np.empty([0], dtype=np.int32)
-    return idx[:n].copy()
+    return IDX[:m].copy()
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef Py_ssize_t _indices_in_sphere(const double[:, ::1] dxyz, const double R, int[::1] idx) noexcept nogil:
-    cdef Py_ssize_t N = dxyz.shape[0]
-    cdef Py_ssize_t xyz = dxyz.shape[1]
-    cdef double R2 = R * R
-    cdef Py_ssize_t i, n
-
-    # Reset number of elements
-    n = 0
-
-    for i in range(N):
-        if all_fabs_le(dxyz, i, R):
-            if fabs2(dxyz, i) <= R2:
-                idx[n] = i
-                n += 1
-    return n
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def indices_in_sphere_with_dist(np.ndarray[np.float64_t, ndim=2, mode='c'] dxyz, const double R):
+def indices_in_sphere_with_dist(floats_st[:, ::1] dxyz, const floats_st R):
     """ Indices and the distances for all coordinates that are within a sphere of radius `R`
 
     Parameters
     ----------
-    dxyz : ndarray(np.float64)
+    dxyz :
        coordinates centered around the sphere
     R : float
        radius of sphere to check
 
     Returns
     -------
-    index : np.ndarray(np.int32)
+    index :
        indices of all dxyz coordinates that are within the sphere of radius `R`
-    dist : np.ndarray(np.float64)
+    dist :
        distances for the coordinates within the sphere of radius `R` (corresponds to `index`)
     """
-    cdef double[:, ::1] dXYZ = dxyz
-    cdef Py_ssize_t n = dXYZ.shape[0]
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] idx = np.empty([n], dtype=np.int32)
-    cdef np.ndarray[np.float64_t, ndim=1, mode='c'] dist = np.empty([n], dtype=np.float64)
-    cdef int[::1] IDX = idx
-    cdef double[::1] DIST = dist
+    cdef ssize_st n = dxyz.shape[0]
+    cdef ndarray[int32_t, mode='c'] IDX = np.empty([n], dtype=np.int32)
+    cdef object dtype = type2dtype[floats_st](1)
+    cdef ndarray[floats_st, mode='c'] DIST = np.empty([n], dtype=dtype)
+    cdef int[::1] idx = IDX
+    cdef floats_st[::1] dist = DIST
+
+    cdef floats_st R2 = R * R
+    cdef floats_st d
+    cdef ssize_st i, m
+
+    with nogil:
+
+        # Reset number of elements
+        m = 0
+
+        if floats_st is cython.float:
+            for i in range(n):
+                if all_fabs_le(dxyz, i, R):
+                    d = fabs2(dxyz, i)
+                    if d <= R2:
+                        dist[m] = sqrtf(d)
+                        idx[m] = <int> i
+                        m += 1
 
-    n = _indices_in_sphere_with_dist(dXYZ, R, DIST, IDX)
+        else:
+            for i in range(n):
+                if all_fabs_le(dxyz, i, R):
+                    d = fabs2(dxyz, i)
+                    if d <= R2:
+                        dist[m] = sqrt(d)
+                        idx[m] = <int> i
+                        m += 1
 
-    if n == 0:
-        return np.empty([0], dtype=np.int32), np.empty([0], dtype=np.float64)
-    return idx[:n].copy(), dist[:n].copy()
+    if m == 0:
+        return np.empty([0], dtype=np.int32), np.empty([0], dtype=dtype)
+    return IDX[:m].copy(), DIST[:m].copy()
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef Py_ssize_t _indices_in_sphere_with_dist(const double[:, ::1] dxyz, const double R,
-                                             double[::1] dist, int[::1] idx) noexcept nogil:
-    cdef Py_ssize_t N = dxyz.shape[0]
-    cdef double R2 = R * R
-    cdef double d
-    cdef Py_ssize_t i, n
-
-    # Reset number of elements
-    n = 0
-
-    for i in range(N):
-        if all_fabs_le(dxyz, i, R):
-            d = fabs2(dxyz, i)
-            if d <= R2:
-                dist[n] = sqrt(d)
-                idx[n] = i
-                n += 1
-    return n
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def indices_le(np.ndarray a, const double V):
+def indices_le(ndarray a, const floats_st V):
     """ Indices for all values in `a` that are ``<= V``
 
     Parameters
     ----------
-    a : np.ndarray(np.float64)
+    a :
        array to check if 2D, all last dimension values must be ``<= V``
     V : float
        value that is checked against
 
     Returns
     -------
-    index : np.ndarray(np.int32)
+    index :
        indices for the values in `a` which are less than or equal to `V`
     """
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] idx = np.empty([a.shape[0]], dtype=np.int32)
-    cdef int[::1] IDX = idx
-
-    cdef Py_ssize_t ndim = a.ndim
-    cdef double[::1] A1
-    cdef double[:, ::1] A2
-    cdef Py_ssize_t n
+    cdef ndarray[int32_t, mode='c'] IDX = np.empty([a.shape[0]], dtype=np.int32)
+    cdef int[::1] idx = IDX
 
-    if a.dtype != np.float64:
-        raise ValueError('indices_le requires input array to be of float64 type')
+    cdef ssize_st ndim = a.ndim
+    cdef floats_st[::1] A1
+    cdef floats_st[:, ::1] A2
+    cdef ssize_st n
 
     if ndim == 1:
         A1 = a
-        n = _indices_le1(A1, V, IDX)
+        n = _indices_le1(A1, V, idx)
 
     elif ndim == 2:
         A2 = a
-        n = _indices_le2(A2, V, IDX)
+        n = _indices_le2(A2, V, idx)
+
+    else:
+        raise NotImplementedError("indices_le not implemented for ndim>2")
 
     if n == 0:
         return np.empty([0], dtype=np.int32)
-    return idx[:n].copy()
+    return IDX[:n].copy()
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef Py_ssize_t _indices_le1(const double[::1] a, const double V, int[::1] idx) noexcept nogil:
-    cdef Py_ssize_t N = a.shape[0]
-    cdef Py_ssize_t i, n
+cdef ssize_st _indices_le1(const floats_st[::1] a, const floats_st V, int[::1] idx) noexcept nogil:
+    cdef ssize_st N = a.shape[0]
+    cdef ssize_st i, n
     n = 0
     for i in range(N):
         if a[i] <= V:
-            idx[n] = i
+            idx[n] = <int> i
             n += 1
     return n
 
@@ -397,8 +352,8 @@ cdef Py_ssize_t _indices_le1(const double[::1] a, const double V, int[::1] idx)
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef inline int all_le(const double[:, ::1] a, const Py_ssize_t i, const double V) noexcept nogil:
-    cdef Py_ssize_t j
+cdef inline bint all_le(const floats_st[:, ::1] a, const ssize_st i, const floats_st V) noexcept nogil:
+    cdef ssize_st j
     for j in range(a.shape[1]):
         if a[i, j] > V:
             return 0
@@ -408,65 +363,66 @@ cdef inline int all_le(const double[:, ::1] a, const Py_ssize_t i, const double
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef Py_ssize_t _indices_le2(const double[:, ::1] a, const double V, int[::1] idx) noexcept nogil:
-    cdef Py_ssize_t N = a.shape[0]
-    cdef Py_ssize_t i, n
+cdef ssize_st _indices_le2(const floats_st[:, ::1] a, const floats_st V, int[::1] idx) noexcept nogil:
+    cdef ssize_st N = a.shape[0]
+    cdef ssize_st i, n
     n = 0
     for i in range(N):
         if all_le(a, i, V):
-            idx[n] = i
+            idx[n] = <int> i
             n += 1
     return n
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def indices_fabs_le(np.ndarray a, const double V):
+@cython.initializedcheck(False)
+def indices_fabs_le(ndarray a, const floats_st V):
     """ Indices for all values in `a` that are ``| | <= V``
 
     Parameters
     ----------
-    a : np.ndarray(np.float64)
+    a :
        array to check if 2D, all last dimension values must be ``| | <= V``
-    V : float
+    V :
        value that is checked against
 
     Returns
     -------
-    index : np.ndarray(np.int32)
+    index :
        indices for the values in ``|a|`` which are less than or equal to `V`
     """
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] idx = np.empty([a.shape[0]], dtype=np.int32)
-    cdef int[::1] IDX = idx
-
-    cdef Py_ssize_t ndim = a.ndim
-    cdef double[::1] A1
-    cdef double[:, ::1] A2
-    cdef Py_ssize_t n
+    cdef ndarray[int32_t, mode='c'] IDX = np.empty([a.shape[0]], dtype=np.int32)
+    cdef int[::1] idx = IDX
 
-    if a.dtype != np.float64:
-        raise ValueError('indices_fabs_le requires input array to be of float64 type')
+    cdef ssize_st ndim = a.ndim
+    cdef floats_st[::1] A1
+    cdef floats_st[:, ::1] A2
+    cdef ssize_st n
 
     if ndim == 1:
         A1 = a
-        n = _indices_fabs_le1(A1, V, IDX)
+        n = _indices_fabs_le1(A1, V, idx)
 
     elif ndim == 2:
         A2 = a
-        n = _indices_fabs_le2(A2, V, IDX)
+        n = _indices_fabs_le2(A2, V, idx)
+
+    else:
+        raise NotImplementedError("indices_fabs_le not implemented for ndim>2")
 
     if n == 0:
         return np.empty([0], dtype=np.int32)
-    return idx[:n].copy()
+    return IDX[:n].copy()
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef inline double fabs2(const double[:, ::1] a, const Py_ssize_t i) noexcept nogil:
-    cdef Py_ssize_t j
-    cdef double abs2
-    abs2 = 0.
+cdef inline floats_st fabs2(const floats_st[:, ::1] a, const ssize_st i) noexcept nogil:
+    cdef ssize_st j
+    cdef floats_st abs2 = 0.
+
     for j in range(a.shape[1]):
         abs2 += a[i, j]*a[i, j]
     return abs2
@@ -475,117 +431,140 @@ cdef inline double fabs2(const double[:, ::1] a, const Py_ssize_t i) noexcept no
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef Py_ssize_t _indices_fabs_le1(const double[::1] a, const double V, int[::1] idx) noexcept nogil:
-    cdef Py_ssize_t N = a.shape[0]
-    cdef Py_ssize_t i, n
+cdef ssize_st _indices_fabs_le1(const floats_st[::1] a, const floats_st V, int[::1] idx) noexcept nogil:
+    cdef ssize_st N = a.shape[0]
+    cdef ssize_st i, n
     n = 0
-    for i in range(N):
-        if fabs(a[i]) <= V:
-            idx[n] = i
-            n += 1
+    if floats_st is cython.float:
+        for i in range(N):
+            if fabsf(a[i]) <= V:
+                idx[n] = <int> i
+                n += 1
+    else:
+        for i in range(N):
+            if fabs(a[i]) <= V:
+                idx[n] = <int> i
+                n += 1
     return n
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef inline int all_fabs_le(const double[:, ::1] a, const Py_ssize_t i, const double V) noexcept nogil:
-    cdef Py_ssize_t j
-    for j in range(a.shape[1]):
-        if fabs(a[i, j]) > V:
-            return 0
+cdef inline bint all_fabs_le(const floats_st[:, ::1] a, const ssize_st i, const floats_st V) noexcept nogil:
+    cdef ssize_st j
+
+    if floats_st is cython.float:
+        for j in range(a.shape[1]):
+            if fabsf(a[i, j]) > V:
+                return 0
+
+    else:
+        for j in range(a.shape[1]):
+            if fabs(a[i, j]) > V:
+                return 0
+
     return 1
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef int _indices_fabs_le2(const double[:, ::1] a, const double V, int[::1] idx) noexcept nogil:
-    cdef Py_ssize_t N = a.shape[0]
-    cdef Py_ssize_t i, n
+cdef ssize_st _indices_fabs_le2(const floats_st[:, ::1] a, const floats_st V, int[::1] idx) noexcept nogil:
+    cdef ssize_st N = a.shape[0]
+    cdef ssize_st i, n
     n = 0
     for i in range(N):
         if all_fabs_le(a, i, V):
-            idx[n] = i
+            idx[n] = <int> i
             n += 1
+
     return n
 
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def indices_gt_le(np.ndarray a, const double V1, const double V2):
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] idx = np.empty([a.shape[0]], dtype=np.int32)
-    cdef int[::1] IDX = idx
-
-    cdef Py_ssize_t ndim = a.ndim
-    cdef double[::1] A1
-    cdef double[:, ::1] A2
-    cdef Py_ssize_t n
+@cython.initializedcheck(False)
+def indices_gt_le(ndarray a, const floats_st V1, const floats_st V2):
+    cdef ndarray[int32_t, mode='c'] IDX = np.empty([a.shape[0]], dtype=np.int32)
+    cdef int[::1] idx = IDX
 
-    if a.dtype != np.float64:
-        raise ValueError('indices_gt_le requires input array to be of float64 type')
+    cdef ssize_st ndim = a.ndim
+    cdef floats_st[::1] A1
+    cdef floats_st[:, ::1] A2
+    cdef ssize_st n
 
     if ndim == 1:
         A1 = a
-        n = _indices_gt_le1(A1, V1, V2, IDX)
+        n = _indices_gt_le1(A1, V1, V2, idx)
 
     elif ndim == 2:
         A2 = a
-        n = _indices_gt_le2(A2, V1, V2, IDX)
+        n = _indices_gt_le2(A2, V1, V2, idx)
+
+    else:
+        raise NotImplementedError("indices_gt_le not implemented for ndim>2")
 
     if n == 0:
         return np.empty([0], dtype=np.int32)
-    return idx[:n].copy()
+
+    return IDX[:n].copy()
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef Py_ssize_t _indices_gt_le1(const double[::1] a, const double V1, const double V2, int[::1] idx) noexcept nogil:
-    cdef Py_ssize_t N = a.shape[0]
-    cdef Py_ssize_t i, n
+cdef ssize_st _indices_gt_le1(const floats_st[::1] a, const floats_st V1, const floats_st
+                                V2, int[::1] idx) noexcept nogil:
+    cdef ssize_st N = a.shape[0]
+    cdef ssize_st i, n
     n = 0
     for i in range(N):
         if V1 < a[i]:
             if a[i] <= V2:
-                idx[n] = i
+                idx[n] = <int> i
                 n += 1
+
     return n
 
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-cdef inline int all_gt_le(const double[:, ::1] a, const Py_ssize_t i, const double V1, const double V2) noexcept nogil:
-    cdef Py_ssize_t j
-    for j in range(a.shape[1]):
-        if a[i, j] <= V1:
-            return 0
-        elif V2 < a[i, j]:
-            return 0
-    return 1
-
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef Py_ssize_t _indices_gt_le2(const double[:, ::1] a, const double V1, const double V2, int[::1] idx) noexcept nogil:
-    cdef Py_ssize_t N = a.shape[0]
-    cdef Py_ssize_t i, n
+cdef ssize_st _indices_gt_le2(const floats_st[:, ::1] a, const floats_st V1, const floats_st
+                              V2, int[::1] idx) noexcept nogil:
+    cdef ssize_st N = a.shape[0]
+    cdef ssize_st i, n
     n = 0
     for i in range(N):
         if all_gt_le(a, i, V1, V2):
-            idx[n] = i
+            idx[n] = <int> i
             n += 1
+
     return n
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef inline int in_1d(const int[::1] array, const int v) noexcept nogil:
-    cdef Py_ssize_t N = array.shape[0]
-    cdef Py_ssize_t i
+cdef inline bint all_gt_le(const floats_st[:, ::1] a, const ssize_st i, const floats_st V1,
+                          const floats_st V2) noexcept nogil:
+    cdef ssize_st j
+    for j in range(a.shape[1]):
+        if a[i, j] <= V1:
+            return 0
+        elif V2 < a[i, j]:
+            return 0
+    return 1
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef inline bint in_1d(const ints_st[::1] array, const ints_st v) noexcept nogil:
+    cdef ssize_st N = array.shape[0]
+    cdef ssize_st i
     for i in range(N):
         if array[i] == v:
             return 1
@@ -594,14 +573,15 @@ cdef inline int in_1d(const int[::1] array, const int v) noexcept nogil:
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def index_sorted(np.ndarray[np.int32_t, ndim=1, mode='c'] a, const int v):
+@cython.initializedcheck(False)
+def index_sorted(ints_st[::1] a, const ints_st v):
     """ Return index for the value v in a sorted array, otherwise return -1
 
     Parameters
     ----------
-    a : int[::1]
+    a :
         sorted array to check
-    v : int
+    v :
         value to find
 
     Returns
@@ -609,60 +589,63 @@ def index_sorted(np.ndarray[np.int32_t, ndim=1, mode='c'] a, const int v):
     int : -1 if not found, otherwise the first index in `a` that is equal to `v`
     """
     # Ensure contiguous arrays
-    cdef int[::1] A = a
-    return <int> _index_sorted(A, v)
+    return _index_sorted(a, v)
+
 
+# This small routine needs all integer variants of the searched value `v`.
+# The variants (a fused type) are declared in the _indices.pxd file
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef Py_ssize_t _index_sorted(const int[::1] a, const int v) noexcept nogil:
+@cython.cdivision(True)
+cdef ssize_st _index_sorted(const ints_st[::1] a, const _ints_index_sorted_st v) noexcept nogil:
     """ Return index for the value v in a sorted array, otherwise return -1
 
     This implements a binary search method
 
     Parameters
     ----------
-    a : int[::1]
+    a :
         sorted array to check
-    v : int
+    v :
         value to find
 
     Returns
     -------
     int : 0 if not unique, otherwise 1.
     """
-    cdef Py_ssize_t i, L, R
+    cdef ssize_st MIN1 = -1
+    cdef ssize_st i, L, R
 
     # Simple binary search
+    R = a.shape[0] - 1
+    if R == -1:
+        return MIN1
+    elif a[R] < v:
+        return MIN1
+
     L = 0
-    R = a.shape[0]
-    if R == 0:
-        return -1
-    elif v < a[L]:
-        return -1
-
-    while L < R:
-        i = (L + R) // 2
+    while L <= R:
+        i = (L + R) / 2
         if a[i] < v:
             L = i + 1
-        elif a[i] == v:
-            return i
+        elif v < a[i]:
+            R = i - 1
         else:
-            R = i
-    if a[R] == v:
-        return R
-    return -1
+            return i
+    return MIN1
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def sorted_unique(np.ndarray[np.int32_t, ndim=1, mode='c'] a):
+@cython.initializedcheck(False)
+def is_sorted_unique(ints_st[::1] a):
     """ Return True/False if all elements of the sorted array `a` are unique
 
     Parameters
     ----------
-    a : np.ndarray(np.int32)
+    a :
         sorted array to check
 
     Returns
@@ -670,31 +653,24 @@ def sorted_unique(np.ndarray[np.int32_t, ndim=1, mode='c'] a):
     int : 0 if not unique, otherwise 1.
     """
     # Ensure contiguous arrays
-    cdef int[::1] A = a
-    cdef Py_ssize_t n = A.shape[0]
+    cdef ssize_st n = a.shape[0]
+    cdef ssize_st i, ret = 1
+
+    if n > 1:
+        # only arrays with more than 1 element need to be checked
+        with nogil:
+            for i in range(n - 1):
+                if a[i] == a[i+1]:
+                    ret = 0
+                    break
+    return ret
 
-    return _sorted_unique(n, A)
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef int _sorted_unique(const Py_ssize_t n_a, const int[::1] a) noexcept nogil:
-    cdef Py_ssize_t i
-
-    # Fast return
-    if n_a <= 1:
-        return 1
-
-    for i in range(n_a - 1):
-        if a[i] == a[i+1]:
-            return 0
-    return 1
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def list_index_le(np.ndarray[np.int32_t, ndim=1, mode='c'] a, np.ndarray[np.int32_t, ndim=1, mode='c'] b):
+def list_index_le(ints_st[::1] a, ints_st[::1] b):
     """ Find indices for each ``a`` such that the returned ``a[i] <= b[ret[i]]`` where `b` is assumed sorted
 
     This corresponds to:
@@ -704,34 +680,25 @@ def list_index_le(np.ndarray[np.int32_t, ndim=1, mode='c'] a, np.ndarray[np.int3
 
     Parameters
     ----------
-    a : np.ndarray(np.int32)
+    a :
         values to check indicies of
-    b : np.ndarray(np.int32)
+    b :
         sorted array to check against
 
     Returns
     -------
-    np.ndarray(np.int32): same length as `a` with indicies
+    indices with same length as `a`
     """
     # Ensure contiguous arrays
-    cdef int[::1] A = a
-    cdef int[::1] B = b
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] c = np.empty([A.shape[0]], dtype=np.int32)
-    cdef int[::1] C = c
+    cdef ssize_st na = a.shape[0]
+    cdef ssize_st nb = b.shape[0]
+    cdef object dtype = type2dtype[ints_st](1)
+    cdef ndarray[ints_st] C = np.empty([na], dtype=dtype)
+    cdef ints_st[::1] c = C
 
-    _list_index_le(A, B, C)
-    return c
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-cdef inline void _list_index_le(const int[::1] a, const int[::1] b, int[::1] c) noexcept nogil:
-    cdef Py_ssize_t na = a.shape[0]
-    cdef Py_ssize_t nb = b.shape[0]
-    cdef Py_ssize_t ia, ib
-    cdef int ai, alast
-    cdef Py_ssize_t start = 0
+    cdef ssize_st ia, ib
+    cdef ints_st ai, alast
+    cdef ssize_st start = 0
 
     if na > 0:
         alast = a[0]
@@ -743,6 +710,8 @@ cdef inline void _list_index_le(const int[::1] a, const int[::1] b, int[::1] c)
         alast = ai
         for ib in range(start, nb):
             if ai <= b[ib]:
-                c[ia] = ib
+                c[ia] = <ints_st> ib
                 start = ib
                 break
+
+    return C
diff --git a/src/sisl/_math_small.pyx b/src/sisl/_math_small.pyx
index 5b8cd9b15a..c5e7b39e01 100644
--- a/src/sisl/_math_small.pyx
+++ b/src/sisl/_math_small.pyx
@@ -1,46 +1,51 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# file, you can obtain one at https://mozilla.org/MPL/2.0/.
 cimport cython
-from libc.math cimport atan2, sqrt
+from libc.math cimport atan2, atan2f, sqrt, sqrtf
 
 import numpy as np
 
-# This enables Cython enhanced compatibilities
+from numpy cimport dtype, ndarray
 
-cimport numpy as np
+from sisl._core._dtypes cimport floats_st, ssize_st, type2dtype
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def cross3(np.ndarray[np.float64_t, ndim=1, mode='c'] u, np.ndarray[np.float64_t, ndim=1, mode='c'] v):
-    cdef np.ndarray[np.float64_t, ndim=1, mode='c'] y = np.empty([3], dtype=np.float64)
+def cross3(const floats_st[::1] u, const floats_st[::1] v):
+    cdef object dtyp = type2dtype[floats_st](1)
+    cdef ndarray[floats_st, mode='c'] Y = np.empty([3], dtype=dtyp)
+    cdef floats_st[::1] y = Y
     y[0] = u[1] * v[2] - u[2] * v[1]
     y[1] = u[2] * v[0] - u[0] * v[2]
     y[2] = u[0] * v[1] - u[1] * v[0]
-    return y
+    return Y
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def dot3(np.ndarray[np.float64_t, ndim=1, mode='c'] u, np.ndarray[np.float64_t, ndim=1, mode='c'] v):
-    return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]
+def dot3(const floats_st[::1] u, const floats_st[::1] v):
+    cdef floats_st r
+    r = u[0] * v[0] + u[1] * v[1] + u[2] * v[2]
+    return r
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def product3(np.ndarray[np.float64_t, ndim=1, mode='c'] v):
-    return v[0] * v[1] * v[2]
+def product3(const floats_st[::1] v):
+    cdef floats_st r
+    r = v[0] * v[1] * v[2]
+    return r
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-def is_ascending(np.ndarray[np.float64_t, ndim=1, mode='c'] v):
-    cdef double[::1] V = v
-    cdef Py_ssize_t i
-    for i in range(1, V.shape[0]):
-        if V[i-1] > V[i]:
+def is_ascending(const floats_st[::1] v):
+    cdef ssize_st i
+    for i in range(1, v.shape[0]):
+        if v[i-1] > v[i]:
             return 0
     return 1
 
@@ -49,26 +54,37 @@ def is_ascending(np.ndarray[np.float64_t, ndim=1, mode='c'] v):
 @cython.wraparound(False)
 @cython.initializedcheck(False)
 @cython.cdivision(True)
-def xyz_to_spherical_cos_phi(np.ndarray[np.float64_t, ndim=1, mode='c'] x,
-                             np.ndarray[np.float64_t, ndim=1, mode='c'] y,
-                             np.ndarray[np.float64_t, ndim=1, mode='c'] z):
+def xyz_to_spherical_cos_phi(floats_st[::1] x,
+                             floats_st[::1] y,
+                             floats_st[::1] z):
     """ In x, y, z coordinates shifted to origo
 
     Returns x = R, y = theta, z = cos_phi
     """
-    cdef double[::1] X = x
-    cdef double[::1] Y = y
-    cdef double[::1] Z = z
-    cdef Py_ssize_t i
-    cdef double R
-    for i in range(X.shape[0]):
-        # theta (radians)
-        R = sqrt(X[i] * X[i] + Y[i] * Y[i] + Z[i] * Z[i])
-        Y[i] = atan2(Y[i], X[i])
-        # Radius
-        X[i] = R
-        # cos(phi)
-        if R > 0.:
-            Z[i] = Z[i] / R
-        else:
-            Z[i] = 0.
+    cdef ssize_st i
+    cdef floats_st R
+
+    if floats_st is cython.float:
+        for i in range(x.shape[0]):
+            # theta (radians)
+            R = sqrtf(x[i] * x[i] + y[i] * y[i] + z[i] * z[i])
+            y[i] = atan2f(y[i], x[i])
+            # Radius
+            x[i] = R
+            # cos(phi)
+            if R > 0.:
+                z[i] = z[i] / R
+            else:
+                z[i] = 0.
+    else:
+        for i in range(x.shape[0]):
+            # theta (radians)
+            R = sqrt(x[i] * x[i] + y[i] * y[i] + z[i] * z[i])
+            y[i] = atan2(y[i], x[i])
+            # Radius
+            x[i] = R
+            # cos(phi)
+            if R > 0.:
+                z[i] = z[i] / R
+            else:
+                z[i] = 0.
diff --git a/src/sisl/io/siesta/_help.py b/src/sisl/io/siesta/_help.py
index 7d3c47244d..d7ac67585d 100644
--- a/src/sisl/io/siesta/_help.py
+++ b/src/sisl/io/siesta/_help.py
@@ -11,7 +11,7 @@
 __all__ = ["_siesta_sc_off"]
 __all__ += ["_csr_from_siesta", "_csr_from_sc_off"]
 __all__ += ["_csr_to_siesta", "_csr_to_sc_off"]
-__all__ += ["_mat_spin_convert", "_fc_correct"]
+__all__ += ["_mat_sisl2siesta", "_mat_siesta2sisl", "_fc_correct"]
 
 
 def _siesta_sc_off(nsc):
@@ -98,45 +98,179 @@ def _csr_from(col_from, csr):
     csr.translate_columns(col_from, col_to)
 
 
-def _mat_spin_convert(M, spin=None):
+def _mat2dtype(M, dtype: np.dtype) -> None:
+    """Convert the internal CSR data of `M` (in-place) so that it follows `dtype`"""
+
+    if M.dtype == dtype:
+        return M
+
+    spin = M.spin
+    csr = M._csr
+    shape = csr._D.shape
+
+    # Normalize both data-types so that their `kind` can be compared
+    old_dtype = np.dtype(M.dtype)
+    new_dtype = np.dtype(dtype)
+
+    def toc(D, re, im):
+        return (D[..., re] + 1j * D[..., im]).astype(dtype, copy=False)
+
+    if old_dtype.kind in ("f", "i"):
+        if new_dtype.kind in ("f", "i"):
+            # this is just simple casting
+            csr._D = csr._D.astype(dtype)
+        elif new_dtype.kind == "c":
+            # we need to collect the real/imag columns into complex columns
+            if spin.is_diagonal:
+                # this is just simple casting,
+                # each diagonal component has its own index
+                csr._D = csr._D.astype(dtype)
+            elif spin.is_noncolinear:
+                D = np.empty(shape[:-1] + (shape[-1] - 1,), dtype=dtype)
+                # These should be real only anyways!
+                D[..., [0, 1]] = csr._D[..., [0, 1]].real.astype(dtype)
+                D[..., 2] = toc(csr._D, 2, 3)
+                if D.shape[-1] > 3:
+                    D[..., 3:] = csr._D[..., 4:].astype(dtype)
+                csr._D = D
+            elif spin.is_spinorbit:
+                D = np.empty(shape[:-1] + (shape[-1] - 4,), dtype=dtype)
+                D[..., 0] = toc(csr._D, 0, 4)
+                D[..., 1] = toc(csr._D, 1, 5)
+                D[..., 2] = toc(csr._D, 2, 3)
+                D[..., 3] = toc(csr._D, 6, 7)
+                if D.shape[-1] > 4:
+                    D[..., 4:] = csr._D[..., 8:].astype(dtype)
+                csr._D = D
+            else:
+                raise NotImplementedError
+        else:
+            raise NotImplementedError
+
+    elif old_dtype.kind == "c":
+        if new_dtype.kind == "c":
+            # this is just simple casting
+            csr._D = csr._D.astype(dtype)
+        elif new_dtype.kind in ("f", "i"):
+            # we need to split the complex columns into real/imag columns
+            if spin.is_diagonal:
+                # this is just simple casting,
+                # each diagonal component has its own index
+                csr._D = csr._D.astype(dtype)
+            elif spin.is_noncolinear:
+                D = np.empty(shape[:-1] + (shape[-1] + 1,), dtype=dtype)
+                # These should be real only anyways!
+                D[..., [0, 1]] = csr._D[..., [0, 1]].real.astype(dtype)
+                D[..., 2] = csr._D[..., 2].real.astype(dtype)
+                D[..., 3] = csr._D[..., 2].imag.astype(dtype)
+                if D.shape[-1] > 4:
+                    D[..., 4:] = csr._D[..., 3:].real.astype(dtype)
+                csr._D = D
+            elif spin.is_spinorbit:
+                D = np.empty(shape[:-1] + (shape[-1] + 4,), dtype=dtype)
+                D[..., 0] = csr._D[..., 0].real.astype(dtype)
+                D[..., 1] = csr._D[..., 1].real.astype(dtype)
+                D[..., 2] = csr._D[..., 2].real.astype(dtype)
+                D[..., 3] = csr._D[..., 2].imag.astype(dtype)
+                D[..., 4] = csr._D[..., 0].imag.astype(dtype)
+                D[..., 5] = csr._D[..., 1].imag.astype(dtype)
+                D[..., 6] = csr._D[..., 3].real.astype(dtype)
+                D[..., 7] = csr._D[..., 3].imag.astype(dtype)
+                if D.shape[-1] > 8:
+                    D[..., 8:] = csr._D[..., 4:].real.astype(dtype)
+                csr._D = D
+            else:
+                raise NotImplementedError
+        else:
+            raise NotImplementedError
+    M._reset()
+
+
+def _mat_siesta2sisl(M, dtype: Optional[np.dtype] = None) -> None:
     """Conversion of Siesta spin matrices to sisl spin matrices
 
     The matrices from Siesta are given in a format adheering to the following
-    concept:
+    concept.
+
+    There are two cases:
+
+    1. A non-colinear calculation:
+
+       Siesta uses this convention:
 
-    A non-colinear calculation has the following entries (in C-index) for
-    the sparse matrix:
+            H[:, [0, 1, 2, 3]]
+            H11 == H[:, 0]
+            H22 == H[:, 1]
+            H12 == H[:, 2] - 1j H[:, 3] # spin-box Hermitian
+            H21 == H[:, 2] + 1j H[:, 3]
 
-    H[:, [0, 1, 2, 3]]
-    H11 == H[:, 0]
-    H22 == H[:, 1]
-    H12 == H[:, 2] - 1j H[:, 3] # spin-box Hermitian
-    H21 == H[:, 2] + 1j H[:, 3]
+       In sisl we use this convention, see `Hamiltonian`:
 
-    Although it really does not make sense to change anything, we
-    do change it to adhere to the spin-orbit case (see below).
-    I.e. what Siesta *saves* is the -Im[H12], which we now store
-    as Im[H12].
+            H11 == H[:, 0]
+            H22 == H[:, 1]
+            H12 == H[:, 2] + 1j H[:, 3] # spin-box Hermitian
+            H21 == H[:, 2] - 1j H[:, 3]
 
+    2. A spin-orbit calculation:
 
-    A spin-orbit calculation has the following entries (in C-index) for
-    the sparse matrix:
+       Siesta uses this convention:
 
-    H[:, [0, 1, 2, 3, 4, 5, 6, 7]]
-    H11 == H[:, 0] + 1j H[:, 4]
-    H22 == H[:, 1] + 1j H[:, 5]
-    H12 == H[:, 2] + 1j H[:, 3] # spin-box Hermitian
-    H21 == H[:, 6] + 1j H[:, 7]
+            H[:, [0, 1, 2, 3, 4, 5, 6, 7]]
+            H11 == H[:, 0] + 1j H[:, 4]
+            H22 == H[:, 1] + 1j H[:, 5]
+            H12 == H[:, 2] - 1j H[:, 3]
+            H21 == H[:, 6] + 1j H[:, 7]
+
+       In sisl we use this convention, see `Hamiltonian`:
+
+            H[:, [0, 1, 2, 3, 4, 5, 6, 7]]
+            H11 == H[:, 0] + 1j H[:, 4]
+            H22 == H[:, 1] + 1j H[:, 5]
+            H12 == H[:, 2] + 1j H[:, 3]
+            H21 == H[:, 6] + 1j H[:, 7]
+
+    On top of this it depends on whether the data-type is complex
+    or not.
     """
-    if spin is None:
-        if M.spin.is_noncolinear:
+    if dtype is None:
+        dtype = M.dtype
+
+    spin = M.spin
+
+    if spin.is_noncolinear:
+        if np.dtype(M.dtype).kind in ("f", "i"):
             M._csr._D[:, 3] = -M._csr._D[:, 3]
-        elif M.spin.is_spinorbit:
+        else:
+            M._csr._D[:, 2] = M._csr._D[:, 2].conj()
+    elif spin.is_spinorbit:
+        if np.dtype(M.dtype).kind in ("f", "i"):
+            M._csr._D[:, 3] = -M._csr._D[:, 3]
+        else:
+            M._csr._D[:, 2] = M._csr._D[:, 2].conj()
+
+    _mat2dtype(M, dtype)
+
+
+def _mat_sisl2siesta(M, dtype: Optional[np.dtype] = None) -> None:
+    """Conversion of sisl to Siesta spin matrices"""
+    if dtype is None:
+        dtype = M.dtype
+
+    # convert to the requested data-type before changing the sign convention
+    _mat2dtype(M, dtype)
+
+    spin = M.spin
+
+    if spin.is_noncolinear:
+        if np.dtype(M.dtype).kind in ("f", "i"):
             M._csr._D[:, 3] = -M._csr._D[:, 3]
-    elif spin.is_noncolinear:
-        M._D[:, 3] = -M._D[:, 3]
+        else:
+            M._csr._D[:, 2] = M._csr._D[:, 2].conj()
     elif spin.is_spinorbit:
-        M._D[:, 3] = -M._D[:, 3]
+        if np.dtype(M.dtype).kind in ("f", "i"):
+            M._csr._D[:, 3] = -M._csr._D[:, 3]
+        else:
+            M._csr._D[:, 2] = M._csr._D[:, 2].conj()
 
 
 def _geom2hsx(geometry):
diff --git a/src/sisl/io/siesta/binaries.py b/src/sisl/io/siesta/binaries.py
index b33164b638..fd7224b038 100644
--- a/src/sisl/io/siesta/binaries.py
+++ b/src/sisl/io/siesta/binaries.py
@@ -396,6 +396,8 @@ def read_hamiltonian(self, geometry=None, **kwargs) -> Hamiltonian:
         )
 
         # Check whether it is an orthogonal basis set
+        # TODO, this is not an exhaustive test, but is *fine* for most
+        # cases
         orthogonal = np.abs(dS).sum() == geom.no
 
         # Create the Hamiltonian container
@@ -418,7 +420,7 @@ def read_hamiltonian(self, geometry=None, **kwargs) -> Hamiltonian:
             H._csr._D[:, :spin] = dH[:, :] * _Ry2eV
             H._csr._D[:, spin] = dS[:]
 
-        _mat_spin_convert(H)
+        _mat_siesta2sisl(H, dtype=kwargs.get("dtype"))
 
         # Convert to sisl supercell
         # equivalent as _csr_from_siesta with explicit isc from file
@@ -442,7 +444,8 @@ def write_hamiltonian(self, H, **kwargs):
         """Writes the Hamiltonian to a siesta.TSHS file"""
         # we sort below, so no need to do it here
         # see onlysSileSiesta.read_overlap for .transpose()
-        csr = H.transpose(spin=False, sort=False)._csr
+        H = H.transpose(spin=False, sort=False)
+        csr = H._csr
         if csr.nnz == 0:
             raise SileError(
                 f"{self!r}.write_hamiltonian cannot write "
@@ -454,7 +457,7 @@ def write_hamiltonian(self, H, **kwargs):
         # Convert to siesta CSR
         _csr_to_siesta(H.geometry, csr, diag=True)
         csr.finalize(sort=sort)
-        _mat_spin_convert(csr, H.spin)
+        _mat_sisl2siesta(H, dtype=np.float64)
 
         # Extract the data to pass to the fortran routine
         cell = H.geometry.cell
@@ -566,7 +569,7 @@ def read_density_matrix(self, **kwargs) -> DensityMatrix:
         # DM file does not contain overlap matrix... so neglect it for now.
         DM._csr._D[:, spin] = 0.0
 
-        _mat_spin_convert(DM)
+        _mat_siesta2sisl(DM, dtype=kwargs.get("dtype"))
 
         # Convert the supercells to sisl supercells
         if nsc[0] != 0 or geom.no_s >= col.max():
@@ -584,7 +587,8 @@ def read_density_matrix(self, **kwargs) -> DensityMatrix:
 
     def write_density_matrix(self, DM, **kwargs):
         """Writes the density matrix to a siesta.DM file"""
-        csr = DM.transpose(spin=False, sort=False)._csr
+        DM = DM.transpose(spin=False, sort=False)
+        csr = DM._csr
         # This ensures that we don"t have any *empty* elements
         if csr.nnz == 0:
             raise SileError(
@@ -596,7 +600,8 @@ def write_density_matrix(self, DM, **kwargs):
         # We do not really need to sort this one, but we do for consistency
         # of the interface.
         csr.finalize(sort=kwargs.get("sort", True))
-        _mat_spin_convert(csr, DM.spin)
+
+        _mat_sisl2siesta(DM, dtype=np.float64)
 
         # Get DM
         if DM.orthogonal:
@@ -674,7 +679,7 @@ def read_energy_density_matrix(self, **kwargs) -> EnergyDensityMatrix:
         # EDM file does not contain overlap matrix... so neglect it for now.
         EDM._csr._D[:, spin] = 0.0
 
-        _mat_spin_convert(EDM)
+        _mat_siesta2sisl(EDM, dtype=kwargs.get("dtype"))
 
         # Convert the supercells to sisl supercells
         if nsc[0] != 0 or geom.no_s >= col.max():
@@ -704,7 +709,7 @@ def read_fermi_level(self) -> float:
         self._fortran_check("read_fermi_level", "could not read fermi-level.")
         return Ef
 
-    def write_density_matrices(self, DM, EDM, Ef=0.0, **kwargs):
+    def write_density_matrices(self, DM, EDM, Ef: float = 0.0, **kwargs):
         r"""Writes the density matrix to a siesta.DM file
 
         Parameters
@@ -713,31 +718,32 @@ def write_density_matrices(self, DM, EDM, Ef=0.0, **kwargs):
            density matrix to write to the file
         EDM : EnergyDensityMatrix
            energy density matrix to write to the file
-        Ef : float, optional
+        Ef :
            fermi-level to be contained
         """
-        DMcsr = DM.transpose(spin=False, sort=False)._csr
-        EDMcsr = EDM.transpose(spin=False, sort=False)._csr
-        DMcsr.align(EDMcsr)
-        EDMcsr.align(DMcsr)
+        sort = kwargs.get("sort", True)
+        DM = DM.transpose(spin=False, sort=sort)
+        EDM = EDM.transpose(spin=False, sort=sort)
+        DM._csr.align(EDM._csr)
+        EDM._csr.align(DM._csr)
 
-        if DMcsr.nnz == 0:
+        if DM._csr.nnz == 0:
             raise SileError(
                 f"{self!r}.write_density_matrices cannot write "
                 "a zero element sparse matrix!"
             )
 
-        _csr_to_siesta(DM.geometry, DMcsr)
-        _csr_to_siesta(DM.geometry, EDMcsr)
-        sort = kwargs.get("sort", True)
-        DMcsr.finalize(sort=sort)
-        EDMcsr.finalize(sort=sort)
-        _mat_spin_convert(DMcsr, DM.spin)
-        _mat_spin_convert(EDMcsr, EDM.spin)
+        _csr_to_siesta(DM.geometry, DM._csr)
+        _csr_to_siesta(DM.geometry, EDM._csr)
+        DM._csr.finalize(sort=sort)
+        EDM._csr.finalize(sort=sort)
+        _mat_sisl2siesta(DM, dtype=np.float64)
+        _mat_sisl2siesta(EDM, dtype=np.float64)
 
         # Ensure everything is correct
         if not (
-            np.allclose(DMcsr.ncol, EDMcsr.ncol) and np.allclose(DMcsr.col, EDMcsr.col)
+            np.allclose(DM._csr.ncol, EDM._csr.ncol)
+            and np.allclose(DM._csr.col, EDM._csr.col)
         ):
             raise ValueError(
                 f"{self!r}.write_density_matrices got non compatible "
@@ -745,21 +751,21 @@ def write_density_matrices(self, DM, EDM, Ef=0.0, **kwargs):
             )
 
         if DM.orthogonal:
-            dm = DMcsr._D
+            dm = DM._csr._D
         else:
-            dm = DMcsr._D[:, : DM.S_idx]
+            dm = DM._csr._D[:, : DM.S_idx]
         if EDM.orthogonal:
-            edm = EDMcsr._D
+            edm = EDM._csr._D
         else:
-            edm = EDMcsr._D[:, : EDM.S_idx]
+            edm = EDM._csr._D[:, : EDM.S_idx]
 
         nsc = DM.geometry.lattice.nsc.astype(np.int32)
 
         _siesta.write_tsde_dm_edm(
             self.file,
             nsc,
-            DMcsr.ncol,
-            DMcsr.col + 1,
+            DM._csr.ncol,
+            DM._csr.col + 1,
             _toF(dm, np.float64),
             _toF(edm, np.float64, _eV2Ry),
             Ef * _eV2Ry,
@@ -1348,7 +1354,7 @@ def _r_hamiltonian_v0(self, **kwargs):
             )
 
         # Create the Hamiltonian container
-        H = Hamiltonian(geom, spin, nnzpr=1, dtype=np.float32, orthogonal=False)
+        H = Hamiltonian(geom, spin, nnzpr=1, orthogonal=False)
 
         # Create the new sparse matrix
         H._csr.ncol = ncol.astype(np.int32, copy=False)
@@ -1361,7 +1367,7 @@ def _r_hamiltonian_v0(self, **kwargs):
         H._csr._D[:, :spin] = dH[:, :] * _Ry2eV
         H._csr._D[:, spin] = dS[:]
 
-        _mat_spin_convert(H)
+        _mat_siesta2sisl(H, dtype=kwargs.get("dtype"))
 
         # Convert the supercells to sisl supercells
         if no_s // no == np.prod(geom.nsc):
@@ -1392,7 +1398,7 @@ def _r_hamiltonian_v1(self, **kwargs):
             )
 
         # Create the Hamiltonian container
-        H = Hamiltonian(geom, spin, nnzpr=1, dtype=np.float32, orthogonal=False)
+        H = Hamiltonian(geom, spin, nnzpr=1, orthogonal=False)
 
         # Create the new sparse matrix
         H._csr.ncol = ncol.astype(np.int32, copy=False)
@@ -1406,7 +1412,7 @@ def _r_hamiltonian_v1(self, **kwargs):
         H._csr._D[:, :spin] = dH[:, :] * _Ry2eV
         H._csr._D[:, spin] = dS[:]
 
-        _mat_spin_convert(H)
+        _mat_siesta2sisl(H, dtype=kwargs.get("dtype"))
 
         # Convert the supercells to sisl supercells
         _csr_from_sc_off(H.geometry, isc.T, H._csr)
@@ -1440,7 +1446,7 @@ def _r_overlap_v0(self, **kwargs):
             )
 
         # Create the Hamiltonian container
-        S = Overlap(geom, nnzpr=1, dtype=np.float32)
+        S = Overlap(geom, nnzpr=1)
 
         # Create the new sparse matrix
         S._csr.ncol = ncol.astype(np.int32, copy=False)
diff --git a/src/sisl/io/siesta/siesta_nc.py b/src/sisl/io/siesta/siesta_nc.py
index 91eb35eefa..16d795e53d 100644
--- a/src/sisl/io/siesta/siesta_nc.py
+++ b/src/sisl/io/siesta/siesta_nc.py
@@ -250,11 +250,11 @@ def read_hamiltonian(self, **kwargs) -> Hamiltonian:
             H._csr._D[:, i] = sp.variables["H"][i, :] * Ry2eV
 
         # fix siesta specific notation
-        _mat_spin_convert(H)
+        _mat_siesta2sisl(H, dtype=kwargs.get("dtype"))
 
         # Shift to the Fermi-level
-        Ef = -self._value("Ef")[:] * Ry2eV
-        H.shift(Ef)
+        Ef = self._value("Ef")[:] * Ry2eV
+        H.shift(-Ef)
 
         return H.transpose(spin=False, sort=kwargs.get("sort", True))
 
@@ -285,7 +285,7 @@ def read_density_matrix(self, **kwargs) -> DensityMatrix:
             DM._csr._D[:, i] = sp.variables["DM"][i, :]
 
         # fix siesta specific notation
-        _mat_spin_convert(DM)
+        _mat_siesta2sisl(DM, dtype=kwargs.get("dtype"))
 
         return DM.transpose(spin=False, sort=kwargs.get("sort", True))
 
@@ -305,7 +305,7 @@ def read_energy_density_matrix(self, **kwargs) -> EnergyDensityMatrix:
                 EDM._csr._D[:, i] -= sp.variables["DM"][i, :] * Ef[i]
 
         # fix siesta specific notation
-        _mat_spin_convert(EDM)
+        _mat_siesta2sisl(EDM, dtype=kwargs.get("dtype"))
 
         return EDM.transpose(spin=False, sort=kwargs.get("sort", True))
 
@@ -613,7 +613,8 @@ def write_hamiltonian(self, H, **kwargs):
         Ef : float, optional
            the Fermi level of the electronic structure (in eV), default to 0.
         """
-        csr = H.transpose(spin=False, sort=False)._csr
+        H = H.transpose(spin=False, sort=False)
+        csr = H._csr
         if csr.nnz == 0:
             raise SileError(
                 f"{self}.write_hamiltonian cannot write a zero element sparse matrix!"
@@ -622,7 +623,8 @@ def write_hamiltonian(self, H, **kwargs):
         # Convert to siesta CSR
         _csr_to_siesta(H.geometry, csr)
         csr.finalize(sort=kwargs.get("sort", True))
-        _mat_spin_convert(csr, H.spin)
+
+        _mat_sisl2siesta(H, dtype=np.float64)
 
         # Ensure that the geometry is written
         self.write_geometry(H.geometry)
@@ -671,7 +673,8 @@ def write_density_matrix(self, DM, **kwargs):
         DM : DensityMatrix
            the model to be saved in the NC file
         """
-        csr = DM.transpose(spin=False, sort=False)._csr
+        DM = DM.transpose(spin=False, sort=False)
+        csr = DM._csr
         if csr.nnz == 0:
             raise SileError(
                 f"{self}.write_density_matrix cannot write a zero element sparse matrix!"
@@ -680,7 +683,7 @@ def write_density_matrix(self, DM, **kwargs):
         # Convert to siesta CSR (we don't need to sort this matrix)
         _csr_to_siesta(DM.geometry, csr)
         csr.finalize(sort=kwargs.get("sort", True))
-        _mat_spin_convert(csr, DM.spin)
+        _mat_sisl2siesta(DM, dtype=np.float64)
 
         # Ensure that the geometry is written
         self.write_geometry(DM.geometry)
@@ -728,7 +731,8 @@ def write_energy_density_matrix(self, EDM, **kwargs):
         EDM : EnergyDensityMatrix
            the model to be saved in the NC file
         """
-        csr = EDM.transpose(spin=False, sort=False)._csr
+        EDM = EDM.transpose(spin=False, sort=False)
+        csr = EDM._csr
         if csr.nnz == 0:
             raise SileError(
                 f"{self}.write_energy_density_matrix cannot write a zero element sparse matrix!"
@@ -737,7 +741,7 @@ def write_energy_density_matrix(self, EDM, **kwargs):
         # no need to sort this matrix
         _csr_to_siesta(EDM.geometry, csr)
         csr.finalize(sort=kwargs.get("sort", True))
-        _mat_spin_convert(csr, EDM.spin)
+        _mat_sisl2siesta(EDM, dtype=np.float64)
 
         # Ensure that the geometry is written
         self.write_geometry(EDM.geometry)
diff --git a/src/sisl/io/siesta/tests/test_matrices.py b/src/sisl/io/siesta/tests/test_matrices.py
new file mode 100644
index 0000000000..e43c0b710d
--- /dev/null
+++ b/src/sisl/io/siesta/tests/test_matrices.py
@@ -0,0 +1,116 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+import sisl
+from sisl.io.siesta._help import _mat2dtype
+
+pytestmark = [pytest.mark.io, pytest.mark.siesta]
+
+listify = sisl.utils.listify
+
+
+@pytest.mark.parametrize("sort", [True, False])
+@pytest.mark.parametrize(
+    "matrix,ext",
+    (map(lambda x: ("Hamiltonian", x), ["nc", "TSHS"]) | listify)
+    + (map(lambda x: ("DensityMatrix", x), ["nc", "DM"]) | listify)
+    + (map(lambda x: ("EnergyDensityMatrix", x), ["nc"]) | listify),
+)
+@pytest.mark.parametrize("read_dtype", [np.float64, np.complex128])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.complex128])
+def test_non_colinear(sisl_tmp, sort, matrix, ext, dtype, read_dtype):
+    if ext == "nc":
+        pytest.importorskip("netCDF4")
+
+    M = getattr(sisl, matrix)(sisl.geom.graphene(), spin=sisl.Spin("NC"), dtype=dtype)
+    if np.issubdtype(dtype, np.complexfloating):
+        onsite = [0.1 + 0j, 0.2 + 0j, 0.3 + 0.4j]
+        nn = [0.2, 0.3, 0.4 + 0.5j]
+    else:
+        onsite = [0.1, 0.2, 0.3, 0.4]
+        nn = [0.2, 0.3, 0.4, 0.5]
+    M.construct(([0.1, 1.44], [onsite, nn]))
+
+    f1 = sisl_tmp(f"M1.{ext}")
+    f2 = sisl_tmp(f"M2.{ext}")
+    M.write(f1, sort=sort)
+    M.finalize()
+    with sisl.get_sile(f1) as sile:
+        M2 = M.read(sile, dtype=read_dtype)
+    M2.write(f2, sort=sort)
+    with sisl.get_sile(f2) as sile:
+        M3 = M2.read(sile, dtype=read_dtype)
+
+    if sort:
+        M.finalize(sort=sort)
+    assert M._csr.spsame(M2._csr)
+    assert M._csr.spsame(M3._csr)
+
+    from sisl.io.siesta._help import _mat2dtype
+
+    # Convert to the same dtype
+    _mat2dtype(M2, dtype)
+    _mat2dtype(M3, dtype)
+    if M.orthogonal and not M2.orthogonal:
+        assert np.allclose(M._csr._D, M2._csr._D[..., :-1])
+    else:
+        assert np.allclose(M._csr._D, M2._csr._D)
+    if M.orthogonal and not M3.orthogonal:
+        assert np.allclose(M._csr._D, M3._csr._D[..., :-1])
+    else:
+        assert np.allclose(M._csr._D, M3._csr._D)
+
+
+@pytest.mark.parametrize("sort", [True, False])
+@pytest.mark.parametrize(
+    "matrix,ext",
+    (map(lambda x: ("Hamiltonian", x), ["nc", "TSHS"]) | listify)
+    + (map(lambda x: ("DensityMatrix", x), ["nc", "DM"]) | listify)
+    + (map(lambda x: ("EnergyDensityMatrix", x), ["nc"]) | listify),
+)
+@pytest.mark.parametrize("read_dtype", [np.float64, np.complex128])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.complex128])
+def test_spin_orbit(sisl_tmp, sort, matrix, ext, dtype, read_dtype):
+    if ext == "nc":
+        pytest.importorskip("netCDF4")
+
+    M = getattr(sisl, matrix)(sisl.geom.graphene(), spin=sisl.Spin("SO"), dtype=dtype)
+    if np.issubdtype(dtype, np.complexfloating):
+        onsite = [0.1 + 0j, 0.2 + 0j, 0.3 + 0.4j, 0.3 - 0.4j]
+        nn = [0.2 + 0.1j, 0.3 + 0.3j, 0.4 + 0.5j, 0.4 - 0.5j]
+    else:
+        onsite = [0.1, 0.2, 0.3, 0.4, 0, 0, 0.3, -0.4]
+        nn = [0.2, 0.3, 0.4, 0.5, 0.1, 0.3, 0.4, -0.5]
+    M.construct(([0.1, 1.44], [onsite, nn]))
+
+    f1 = sisl_tmp(f"M1.{ext}")
+    f2 = sisl_tmp(f"M2.{ext}")
+    M.write(f1, sort=sort)
+    M.finalize()
+    with sisl.get_sile(f1) as sile:
+        M2 = M.read(sile, dtype=read_dtype)
+    M2.write(f2, sort=sort)
+    with sisl.get_sile(f2) as sile:
+        M3 = M2.read(sile, dtype=read_dtype)
+
+    if sort:
+        M.finalize(sort=sort)
+    assert M._csr.spsame(M2._csr)
+    assert M._csr.spsame(M3._csr)
+
+    # Convert to the same dtype
+    _mat2dtype(M2, dtype)
+    _mat2dtype(M3, dtype)
+    if M.orthogonal and not M2.orthogonal:
+        assert np.allclose(M._csr._D, M2._csr._D[..., :-1])
+    else:
+        assert np.allclose(M._csr._D, M2._csr._D)
+    if M.orthogonal and not M3.orthogonal:
+        assert np.allclose(M._csr._D, M3._csr._D[..., :-1])
+    else:
+        assert np.allclose(M._csr._D, M3._csr._D)
diff --git a/src/sisl/io/siesta/tests/test_siesta.py b/src/sisl/io/siesta/tests/test_siesta.py
index 6f62ec52ee..e959fe6ed3 100644
--- a/src/sisl/io/siesta/tests/test_siesta.py
+++ b/src/sisl/io/siesta/tests/test_siesta.py
@@ -161,99 +161,6 @@ def test_nc_density_matrix(sisl_tmp, sisl_system):
     assert sisl_system.g.atoms.equal(ndm.atoms, R=False)
 
 
-def test_nc_H_non_colinear(sisl_tmp):
-    H1 = Hamiltonian(sisl.geom.graphene(), spin=sisl.Spin("NC"))
-    H1.construct(([0.1, 1.44], [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]]))
-
-    f1 = sisl_tmp("H1.nc")
-    f2 = sisl_tmp("H2.nc")
-    H1.write(f1)
-    H1.finalize()
-    with sisl.get_sile(f1) as sile:
-        H2 = sile.read_hamiltonian()
-    H2.write(f2)
-    with sisl.get_sile(f2) as sile:
-        H3 = sile.read_hamiltonian()
-    assert H1._csr.spsame(H2._csr)
-    assert np.allclose(H1._csr._D, H2._csr._D)
-    assert H1._csr.spsame(H3._csr)
-    assert np.allclose(H1._csr._D, H3._csr._D)
-
-
-def test_nc_DM_non_colinear(sisl_tmp):
-    DM1 = DensityMatrix(sisl.geom.graphene(), spin=sisl.Spin("NC"))
-    DM1.construct(([0.1, 1.44], [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]]))
-
-    f1 = sisl_tmp("DM1.nc")
-    f2 = sisl_tmp("DM2.nc")
-    DM1.write(f1)
-    DM1.finalize()
-    with sisl.get_sile(f1) as sile:
-        DM2 = sile.read_density_matrix()
-    DM2.write(f2)
-    with sisl.get_sile(f2) as sile:
-        DM3 = sile.read_density_matrix()
-    assert DM1._csr.spsame(DM2._csr)
-    assert DM1._csr.spsame(DM3._csr)
-    # DM1 is finalized, but DM2 is not finalized
-    assert np.allclose(DM1._csr._D, DM2._csr._D)
-    # DM2 and DM3 are the same
-    assert np.allclose(DM2._csr._D, DM3._csr._D)
-    DM2.finalize()
-    assert np.allclose(DM1._csr._D, DM2._csr._D)
-
-
-def test_nc_EDM_non_colinear(sisl_tmp):
-    EDM1 = EnergyDensityMatrix(sisl.geom.graphene(), spin=sisl.Spin("NC"))
-    EDM1.construct(([0.1, 1.44], [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]]))
-
-    f1 = sisl_tmp("EDM1.nc")
-    f2 = sisl_tmp("EDM2.nc")
-    EDM1.write(f1, sort=False)
-    EDM1.finalize()
-    with sisl.get_sile(f1) as sile:
-        EDM2 = sile.read_energy_density_matrix(sort=False)
-    EDM2.write(f2, sort=False)
-    with sisl.get_sile(f2) as sile:
-        EDM3 = sile.read_energy_density_matrix(sort=False)
-    assert EDM1._csr.spsame(EDM2._csr)
-    assert EDM1._csr.spsame(EDM3._csr)
-    # EDM1 is finalized, but EDM2 is not finalized
-    assert not np.allclose(EDM1._csr._D, EDM2._csr._D)
-    # EDM2 and EDM3 are the same
-    assert np.allclose(EDM2._csr._D, EDM3._csr._D)
-    EDM2.finalize()
-    assert np.allclose(EDM1._csr._D, EDM2._csr._D)
-
-
-@pytest.mark.filterwarnings("ignore", message="*is NOT Hermitian for on-site")
-def test_nc_H_spin_orbit(sisl_tmp):
-    H1 = Hamiltonian(sisl.geom.graphene(), spin=sisl.Spin("SO"))
-    H1.construct(
-        (
-            [0.1, 1.44],
-            [
-                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
-                [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
-            ],
-        )
-    )
-
-    f1 = sisl_tmp("H1.nc")
-    f2 = sisl_tmp("H2.nc")
-    H1.write(f1)
-    H1.finalize()
-    with sisl.get_sile(f1) as sile:
-        H2 = sile.read_hamiltonian()
-    H2.write(f2)
-    with sisl.get_sile(f2) as sile:
-        H3 = sile.read_hamiltonian()
-    assert H1._csr.spsame(H2._csr)
-    assert np.allclose(H1._csr._D, H2._csr._D)
-    assert H1._csr.spsame(H3._csr)
-    assert np.allclose(H1._csr._D, H3._csr._D)
-
-
 @pytest.mark.filterwarnings("ignore", message="*is NOT Hermitian for on-site")
 def test_nc_H_spin_orbit_nc2tshs2nc(sisl_tmp):
     H1 = Hamiltonian(sisl.geom.graphene(), spin=sisl.Spin("SO"))
@@ -282,34 +189,6 @@ def test_nc_H_spin_orbit_nc2tshs2nc(sisl_tmp):
     assert np.allclose(H1._csr._D, H3._csr._D)
 
 
-@pytest.mark.filterwarnings("ignore", message="*is NOT Hermitian for on-site")
-def test_nc_DM_spin_orbit(sisl_tmp):
-    DM1 = DensityMatrix(sisl.geom.graphene(), spin=sisl.Spin("SO"))
-    DM1.construct(
-        (
-            [0.1, 1.44],
-            [
-                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
-                [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
-            ],
-        )
-    )
-
-    f1 = sisl_tmp("DM1.nc")
-    f2 = sisl_tmp("DM2.nc")
-    DM1.write(f1)
-    DM1.finalize()
-    with sisl.get_sile(f1) as sile:
-        DM2 = sile.read_density_matrix()
-    DM2.write(f2)
-    with sisl.get_sile(f2) as sile:
-        DM3 = sile.read_density_matrix()
-    assert DM1._csr.spsame(DM2._csr)
-    assert np.allclose(DM1._csr._D, DM2._csr._D)
-    assert DM1._csr.spsame(DM3._csr)
-    assert np.allclose(DM1._csr._D, DM3._csr._D)
-
-
 @pytest.mark.filterwarnings("ignore", message="*is NOT Hermitian for on-site")
 def test_nc_DM_spin_orbit_nc2dm2nc(sisl_tmp):
     DM1 = DensityMatrix(sisl.geom.graphene(), orthogonal=False, spin=sisl.Spin("SO"))
diff --git a/src/sisl/io/siesta/tests/test_tsde.py b/src/sisl/io/siesta/tests/test_tsde.py
index 58822a25f2..e251b3468c 100644
--- a/src/sisl/io/siesta/tests/test_tsde.py
+++ b/src/sisl/io/siesta/tests/test_tsde.py
@@ -48,6 +48,34 @@ def test_si_pdos_kgrid_tsde_edm(sisl_files):
     assert np.allclose(EDM1._csr._D[:, :-1], EDM2._csr._D[:, :-1])
 
 
+@pytest.mark.filterwarnings("ignore", message="*Casting complex values")
+@pytest.mark.parametrize(("matrix"), ["density", "energy_density"])
+def test_si_pdos_kgrid_tsde_edm_dtypes(sisl_files, sisl_tmp, matrix):
+    """Check all read dtypes agree on mulliken(), also after a netCDF4 round-trip."""
+    pytest.importorskip("netCDF4")
+    fdf = sisl.get_sile(
+        sisl_files("siesta", "Si_pdos_k", "Si_pdos.fdf"),
+        base=sisl_files("siesta", "Si_pdos_k"),
+    )
+    data = []
+    mull = None
+    for dtype in (np.float32, np.float64, np.complex64, np.complex128):
+        M = getattr(fdf, f"read_{matrix}_matrix")(dtype=dtype)
+        data.append(M)
+        assert M.dtype == dtype
+        if mull is None:
+            mull = M.mulliken()
+        else:
+            assert np.allclose(mull, M.mulliken(), atol=1e-5)
+
+    fnc = sisl_tmp("tmp.nc")
+    for M in data:
+        M.write(fnc)
+        # The overlap should be here...
+        M1 = M.read(fnc)
+        assert np.allclose(mull, M1.mulliken(), atol=1e-5)
+
+
 @pytest.mark.filterwarnings("ignore", message="*wrong sparse pattern")
 def test_si_pdos_kgrid_tsde_dm_edm_rw(sisl_files, sisl_tmp):
     fdf = sisl.get_sile(
diff --git a/src/sisl/io/siesta/tests/test_tshs.py b/src/sisl/io/siesta/tests/test_tshs.py
index 6ece2f9e16..20e6fc4759 100644
--- a/src/sisl/io/siesta/tests/test_tshs.py
+++ b/src/sisl/io/siesta/tests/test_tshs.py
@@ -28,6 +28,34 @@ def test_tshs_si_pdos_kgrid(sisl_files, sisl_tmp):
     assert np.allclose(HS1._csr._D, HS2._csr._D)
 
 
+@pytest.mark.filterwarnings("ignore", message="*Casting complex values")
+def test_tshs_si_pdos_dtypes_eigs(sisl_files, sisl_tmp):
+    """Check all read dtypes agree on eigh(k), also after TSHS/netCDF4 round-trips."""
+    pytest.importorskip("netCDF4")
+    si = sisl.get_sile(sisl_files("siesta", "Si_pdos_k", "Si_pdos.TSHS"))
+    data = []
+    eigs = None
+    k = [0.1] * 3
+    for dtype in (np.float32, np.float64, np.complex64, np.complex128):
+        HS = si.read_hamiltonian(dtype=dtype)
+        data.append(HS)
+        assert HS.dtype == dtype
+
+        if eigs is None:
+            eigs = HS.eigh(k)
+        else:
+            assert np.allclose(eigs, HS.eigh(k), atol=1e-5)
+
+    f = sisl_tmp("tmp.TSHS")
+    fnc = sisl_tmp("tmp.nc")
+    for HS in data:
+        # write + read back through each file format, TSHS first then netCDF
+        for fname in (f, fnc):
+            HS.write(fname)
+            HS1 = HS.read(fname)
+            assert np.allclose(eigs, HS1.eigh(k), atol=1e-5)
+
+
 def test_tshs_si_pdos_kgrid_tofromnc(sisl_files, sisl_tmp):
     pytest.importorskip("netCDF4")
     si = sisl.get_sile(sisl_files("siesta", "Si_pdos_k", "Si_pdos.TSHS"))
@@ -80,6 +108,34 @@ def test_tshs_soc_pt2_xx(sisl_files, sisl_tmp):
     assert np.allclose(HS1._csr._D, HS2._csr._D)
 
 
+@pytest.mark.filterwarnings("ignore", message="*Casting complex values")
+def test_tshs_soc_pt2_xx_dtypes(sisl_files, sisl_tmp):
+    """Check all read dtypes agree on eigh(k), also after TSHS/netCDF4 round-trips."""
+    pytest.importorskip("netCDF4")
+    fdf = sisl.get_sile(sisl_files("siesta", "Pt2_soc", "Pt2.fdf"))
+    data = []
+    eigs = None
+    k = [0.1] * 3
+    for dtype in (np.float32, np.float64, np.complex64, np.complex128):
+        HS = fdf.read_hamiltonian(dtype=dtype)
+        data.append(HS)
+        assert HS.dtype == dtype
+
+        if eigs is None:
+            eigs = HS.eigh(k)
+        else:
+            assert np.allclose(eigs, HS.eigh(k), atol=1e-5)
+
+    f = sisl_tmp("tmp.TSHS")
+    fnc = sisl_tmp("tmp.nc")
+    for HS in data:
+        # write + read back through each file format, TSHS first then netCDF
+        for fname in (f, fnc):
+            HS.write(fname)
+            HS1 = HS.read(fname)
+            assert np.allclose(eigs, HS1.eigh(k), atol=1e-5)
+
+
 def test_tshs_soc_pt2_xx_pdos(sisl_files):
     fdf = sisl.get_sile(sisl_files("siesta", "Pt2_soc", "Pt2.fdf"))
     sc = fdf.read_lattice(order="TSHS")
@@ -137,32 +193,6 @@ def test_tshs_si_pdos_kgrid_overlap(sisl_files):
     assert np.allclose(HS._csr._D[:, HS.S_idx], S._csr._D[:, 0])
 
 
-@pytest.mark.filterwarnings("ignore", message="*is NOT Hermitian for on-site")
-def test_tshs_spin_orbit(sisl_tmp):
-    H1 = sisl.Hamiltonian(sisl.geom.graphene(), spin=sisl.Spin("SO"))
-    H1.construct(
-        (
-            [0.1, 1.44],
-            [
-                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
-                [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
-            ],
-        )
-    )
-
-    f1 = sisl_tmp("tmp1.TSHS")
-    f2 = sisl_tmp("tmp2.TSHS")
-    H1.write(f1)
-    H1.finalize()
-    H2 = sisl.get_sile(f1).read_hamiltonian()
-    H2.write(f2)
-    H3 = sisl.get_sile(f2).read_hamiltonian()
-    assert H1._csr.spsame(H2._csr)
-    assert np.allclose(H1._csr._D, H2._csr._D)
-    assert H1._csr.spsame(H3._csr)
-    assert np.allclose(H1._csr._D, H3._csr._D)
-
-
 @pytest.mark.filterwarnings("ignore", message="*is NOT Hermitian for on-site")
 def test_tshs_spin_orbit_tshs2nc2tshs(sisl_tmp):
     pytest.importorskip("netCDF4")
diff --git a/src/sisl/io/sile.py b/src/sisl/io/sile.py
index 7469c2e72f..de9c41012f 100644
--- a/src/sisl/io/sile.py
+++ b/src/sisl/io/sile.py
@@ -1303,7 +1303,7 @@ def __getattr__(self, attr):
 
             exe = Path(sys.executable).name
             msg = f"Could not import netCDF4. Please install it using '{exe} -m pip install netCDF4'"
-            raise SileError(msg) from e
+            raise SileError(msg)
 
     netCDF4 = _mock_netCDF4()
 
diff --git a/src/sisl/io/tbtrans/delta.py b/src/sisl/io/tbtrans/delta.py
index d2c0594eb9..8a0955dc28 100644
--- a/src/sisl/io/tbtrans/delta.py
+++ b/src/sisl/io/tbtrans/delta.py
@@ -21,7 +21,8 @@
 from ..siesta._help import (
     _csr_from_sc_off,
     _csr_to_siesta,
-    _mat_spin_convert,
+    _mat_siesta2sisl,
+    _mat_sisl2siesta,
     _siesta_sc_off,
 )
 from ..sile import SileError, add_sile, sile_raise_write
@@ -436,7 +437,8 @@ def write_delta(self, delta, **kwargs):
         The input options for `TBtrans`_ determine whether this is a self-energy term
         or a Hamiltonian term.
         """
-        csr = delta._csr.copy()
+        out_delta = delta.copy()
+        csr = out_delta._csr
         if csr.nnz == 0:
             raise SileError(
                 f"{self!s}.write_overlap cannot write a zero element sparse matrix!"
@@ -446,7 +448,7 @@ def write_delta(self, delta, **kwargs):
         _csr_to_siesta(delta.geometry, csr, diag=False)
         # delta should always write sorted matrices
         csr.finalize(sort=True)
-        _mat_spin_convert(csr, delta.spin)
+        _mat_sisl2siesta(out_delta)
 
         # Ensure that the geometry is written
         self.write_geometry(delta.geometry)
@@ -557,9 +559,9 @@ def write_delta(self, delta, **kwargs):
         csize[-1] = csr.nnz
 
         if delta.spin.kind > delta.spin.POLARIZED:
-            print(delta.spin)
             raise ValueError(
-                f"{self.__class__.__name__}.write_delta only allows spin-polarized delta values"
+                f"{self.__class__.__name__}.write_delta only allows spin-polarized "
+                f"delta values, got {delta.spin!s}"
             )
 
         if delta.dtype.kind == "c":
@@ -667,7 +669,7 @@ def _r_class(self, cls, **kwargs):
 
         # Convert from isc to sisl isc
         _csr_from_sc_off(C.geometry, lvl.variables["isc_off"][:, :], C._csr)
-        _mat_spin_convert(C)
+        _mat_siesta2sisl(C, dtype=kwargs.get("dtype"))
 
         return C
 
diff --git a/src/sisl/physics/CMakeLists.txt b/src/sisl/physics/CMakeLists.txt
index a144553ee0..f5eb34a534 100644
--- a/src/sisl/physics/CMakeLists.txt
+++ b/src/sisl/physics/CMakeLists.txt
@@ -4,15 +4,14 @@ set_property(DIRECTORY
   APPEND
   PROPERTY INCLUDE_DIRECTORIES
   ${CMAKE_CURRENT_SOURCE_DIR}/..
+  ${CMAKE_CURRENT_SOURCE_DIR}/../_core
   )
 
-foreach(source 
+foreach(source
     _bloch _phase
     _matrix_utils
     _matrix_k _matrix_dk _matrix_ddk
-    _matrix_phase _matrix_phase_nc_diag _matrix_phase_nc _matrix_phase_so
-    _matrix_phase3 _matrix_phase3_nc _matrix_phase3_so
-    _matrix_sc_phase _matrix_sc_phase_nc_diag _matrix_sc_phase_nc _matrix_sc_phase_so
+    _matrix_phase _matrix_phase_sc _matrix_phase3
     )
   add_cython_library(
     SOURCE ${source}.pyx
diff --git a/src/sisl/physics/_matrix_ddk.pyx b/src/sisl/physics/_matrix_ddk.pyx
index d1fa01cc41..fc83ac76df 100644
--- a/src/sisl/physics/_matrix_ddk.pyx
+++ b/src/sisl/physics/_matrix_ddk.pyx
@@ -2,26 +2,19 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 cimport cython
-from libc.math cimport fabs
 
 import numpy as np
-
-cimport numpy as np
+cimport numpy as cnp
 
 from ._common import comply_gauge
+from sisl._core._dtypes cimport floats_st
 from ._matrix_phase3 import *
-from ._matrix_phase3_nc import *
-from ._matrix_phase3_so import *
-from ._matrix_phase_nc_diag import *
 from ._phase import *
 
-_dot = np.dot
-_roll = np.roll
-
 __all__ = ["matrix_ddk", "matrix_ddk_nc", "matrix_ddk_nc_diag", "matrix_ddk_so"]
 
 
-def _phase_ddk(gauge, M, sc, np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype):
+def _phase_ddk(gauge, M, sc, cnp.ndarray[floats_st] k, dtype):
     # dtype *must* be passed through phase_dtype
     gauge = comply_gauge(gauge)
 
@@ -34,10 +27,10 @@ def _phase_ddk(gauge, M, sc, np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype
     #  Rd = dx^2, dy^2, dz^2, dzy, dxz, dyx
     if gauge == 'cell':
         phases = phase_rsc(sc, k, dtype).reshape(-1, 1)
-        Rs = _dot(sc.sc_off, sc.cell)
+        Rs = np.dot(sc.sc_off, sc.cell)
         Rd = - (Rs * Rs * phases).astype(dtype, copy=False)
-        Ro = - (_roll(Rs, 1, axis=1) * phases).astype(dtype, copy=False) # z, x, y
-        Ro *= _roll(Rs, -1, axis=1) # y, z, x
+        Ro = - (np.roll(Rs, 1, axis=1) * phases).astype(dtype, copy=False) # z, x, y
+        Ro *= np.roll(Rs, -1, axis=1) # y, z, x
         del phases, Rs
         p_opt = 1
 
@@ -46,151 +39,73 @@ def _phase_ddk(gauge, M, sc, np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype
         rij = M.Rij()._csr._D
         phases = phase_rij(rij, sc, k, dtype).reshape(-1, 1)
         Rd = - (rij * rij * phases).astype(dtype, copy=False)
-        Ro = - (_roll(rij, 1, axis=1) * phases).astype(dtype, copy=False) # z, x, y
-        Ro *= _roll(rij, -1, axis=1) # y, z, x
+        Ro = - (np.roll(rij, 1, axis=1) * phases).astype(dtype, copy=False) # z, x, y
+        Ro *= np.roll(rij, -1, axis=1) # y, z, x
         del rij, phases
-        p_opt = 1
+        p_opt = 0
 
     return p_opt, Rd, Ro
 
 
-def matrix_ddk(gauge, M, const int idx, sc,
-               np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype, format):
+def matrix_ddk(gauge, M, const int idx, sc, cnp.ndarray[floats_st] k, dtype, format):
     dtype = phase_dtype(k, M.dtype, dtype)
     p_opt, Rd, Ro = _phase_ddk(gauge, M, sc, k, dtype)
-    return _matrix_ddk(M._csr, idx, Rd, Ro, dtype, format, p_opt)
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_ddk(csr, const int idx, Rd, Ro, dtype, format, p_opt):
 
     # Return list
     dd = [None, None, None, None, None, None]
 
-    if dtype == np.complex128:
-
-        if format in ("array", "matrix", "dense"):
-            dd[:3] = _phase3_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rd, p_opt)
-            dd[3:] = _phase3_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ro, p_opt)
-
-        else:
-            # Default must be something else.
-            dd[:3] = _phase3_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rd, p_opt)
-            dd[3:] = _phase3_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ro, p_opt)
-            dd[0] = dd[0].asformat(format)
-            dd[1] = dd[1].asformat(format)
-            dd[2] = dd[2].asformat(format)
-            dd[3] = dd[3].asformat(format)
-            dd[4] = dd[4].asformat(format)
-            dd[5] = dd[5].asformat(format)
-
-    elif dtype == np.float64:
-        if format in ("array", "matrix", "dense"):
-            dd[:3] = _phase3_array_f64(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rd, p_opt)
-            dd[3:] = _phase3_array_f64(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ro, p_opt)
-        else:
-            dd[:3] = _phase3_csr_f64(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rd, p_opt)
-            dd[3:] = _phase3_csr_f64(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ro, p_opt)
-            dd[0] = dd[0].asformat(format)
-            dd[1] = dd[1].asformat(format)
-            dd[2] = dd[2].asformat(format)
-            dd[3] = dd[3].asformat(format)
-            dd[4] = dd[4].asformat(format)
-            dd[5] = dd[5].asformat(format)
-
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            dd[:3] = _phase3_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rd, p_opt)
-            dd[3:] = _phase3_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ro, p_opt)
-        else:
-            dd[:3] = _phase3_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rd, p_opt)
-            dd[3:] = _phase3_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ro, p_opt)
-            dd[0] = dd[0].asformat(format)
-            dd[1] = dd[1].asformat(format)
-            dd[2] = dd[2].asformat(format)
-            dd[3] = dd[3].asformat(format)
-            dd[4] = dd[4].asformat(format)
-            dd[5] = dd[5].asformat(format)
-
-    elif dtype == np.float32:
-        if format in ("array", "matrix", "dense"):
-            dd[:3] = _phase3_array_f32(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rd, p_opt)
-            dd[3:] = _phase3_array_f32(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ro, p_opt)
-        else:
-            dd[:3] = _phase3_csr_f32(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rd, p_opt)
-            dd[3:] = _phase3_csr_f32(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ro, p_opt)
-            dd[0] = dd[0].asformat(format)
-            dd[1] = dd[1].asformat(format)
-            dd[2] = dd[2].asformat(format)
-            dd[3] = dd[3].asformat(format)
-            dd[4] = dd[4].asformat(format)
-            dd[5] = dd[5].asformat(format)
+    csr = M._csr
+
+    if format in ("array", "matrix", "dense"):
+        dd[:3] = _phase3_array(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rd, p_opt)
+        dd[3:] = _phase3_array(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ro, p_opt)
 
     else:
-        raise ValueError("matrix_ddk: currently only supports dtype in [float32, float64, complex64, complex128].")
+        # Default must be something else.
+        dd[:3] = _phase3_csr(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rd, p_opt)
+        dd[3:] = _phase3_csr(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ro, p_opt)
+        dd[0] = dd[0].asformat(format)
+        dd[1] = dd[1].asformat(format)
+        dd[2] = dd[2].asformat(format)
+        dd[3] = dd[3].asformat(format)
+        dd[4] = dd[4].asformat(format)
+        dd[5] = dd[5].asformat(format)
 
     return dd
 
 
-def matrix_ddk_nc(gauge, M, sc,
-                  np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype, format):
+def matrix_ddk_nc(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format):
     dtype = phase_dtype(k, M.dtype, dtype, True)
     p_opt, Rd, Ro = _phase_ddk(gauge, M, sc, k, dtype)
-    return _matrix_ddk_nc(M._csr, Rd, Ro, dtype, format, p_opt)
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_ddk_nc(csr, Rd, Ro, dtype, format, p_opt):
 
     # Return list
     dd = [None, None, None, None, None, None]
 
-    if dtype == np.complex128:
-
-        if format in ("array", "matrix", "dense"):
-            dd[:3] = _phase3_nc_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt)
-            dd[3:] = _phase3_nc_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt)
-
-        else:
-            # Default must be something else.
-            dd[:3] = _phase3_nc_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt)
-            dd[3:] = _phase3_nc_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt)
-            dd[0] = dd[0].asformat(format)
-            dd[1] = dd[1].asformat(format)
-            dd[2] = dd[2].asformat(format)
-            dd[3] = dd[3].asformat(format)
-            dd[4] = dd[4].asformat(format)
-            dd[5] = dd[5].asformat(format)
-
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            dd[:3] = _phase3_nc_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt)
-            dd[3:] = _phase3_nc_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt)
-        else:
-            dd[:3] = _phase3_nc_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt)
-            dd[3:] = _phase3_nc_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt)
-            dd[0] = dd[0].asformat(format)
-            dd[1] = dd[1].asformat(format)
-            dd[2] = dd[2].asformat(format)
-            dd[3] = dd[3].asformat(format)
-            dd[4] = dd[4].asformat(format)
-            dd[5] = dd[5].asformat(format)
+    csr = M._csr
+
+    if format in ("array", "matrix", "dense"):
+        dd[:3] = _phase3_array_nc(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt)
+        dd[3:] = _phase3_array_nc(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt)
 
     else:
-        raise ValueError("matrix_ddk_nc: currently only supports dtype in [complex64, complex128].")
+        # Default must be something else.
+        dd[:3] = _phase3_csr_nc(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt)
+        dd[3:] = _phase3_csr_nc(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt)
+        dd[0] = dd[0].asformat(format)
+        dd[1] = dd[1].asformat(format)
+        dd[2] = dd[2].asformat(format)
+        dd[3] = dd[3].asformat(format)
+        dd[4] = dd[4].asformat(format)
+        dd[5] = dd[5].asformat(format)
 
     return dd
 
 
-def matrix_ddk_nc_diag(gauge, M, const int idx, sc,
-                       np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype, format):
+def matrix_ddk_nc_diag(gauge, M, const int idx, sc, cnp.ndarray[floats_st] k, dtype, format):
     dtype = phase_dtype(k, M.dtype, dtype, True)
     p_opt, Rd, Ro = _phase_ddk(gauge, M, sc, k, dtype)
 
+    # We need the phases to be consecutive in memory
     Rxx = Rd[:, 0].copy()
     Ryy = Rd[:, 1].copy()
     Rzz = Rd[:, 2].copy()
@@ -200,84 +115,49 @@ def matrix_ddk_nc_diag(gauge, M, const int idx, sc,
     Ryx = Ro[:, 2].copy()
     del Ro
 
-    # Get each of them
-    dxx = _matrix_ddk_nc_diag(M._csr, idx, Rxx, dtype, format, p_opt)
-    dyy = _matrix_ddk_nc_diag(M._csr, idx, Ryy, dtype, format, p_opt)
-    dzz = _matrix_ddk_nc_diag(M._csr, idx, Rzz, dtype, format, p_opt)
-    dzy = _matrix_ddk_nc_diag(M._csr, idx, Rzy, dtype, format, p_opt)
-    dxz = _matrix_ddk_nc_diag(M._csr, idx, Rxz, dtype, format, p_opt)
-    dyx = _matrix_ddk_nc_diag(M._csr, idx, Ryx, dtype, format, p_opt)
-    return dxx, dyy, dzz, dzy, dxz, dyx
+    csr = M._csr
 
+    if format in ("array", "matrix", "dense"):
+        dxx = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rxx, p_opt)
+        dyy = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ryy, p_opt)
+        dzz = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rzz, p_opt)
+        dzy = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rzy, p_opt)
+        dxz = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rxz, p_opt)
+        dyx = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ryx, p_opt)
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_ddk_nc_diag(csr, const int idx, phases, dtype, format, p_opt):
-
-    if dtype == np.complex128:
-
-        if format in ("array", "matrix", "dense"):
-            return _phase_nc_diag_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt)
-
-        # Default must be something else.
-        return _phase_nc_diag_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt).asformat(format)
-
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            return _phase_nc_diag_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt)
-        return _phase_nc_diag_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt).asformat(format)
+    else:
+        dxx = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rxx, p_opt).asformat(format)
+        dyy = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ryy, p_opt).asformat(format)
+        dzz = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rzz, p_opt).asformat(format)
+        dzy = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rzy, p_opt).asformat(format)
+        dxz = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Rxz, p_opt).asformat(format)
+        dyx = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, Ryx, p_opt).asformat(format)
 
-    raise ValueError("matrix_ddk_nc_diag: only supports dtype in [complex64, complex128].")
+    return dxx, dyy, dzz, dzy, dxz, dyx
 
 
-def matrix_ddk_so(gauge, M, sc,
-                  np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype, format):
+def matrix_ddk_so(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format):
     dtype = phase_dtype(k, M.dtype, dtype, True)
     p_opt, Rd, Ro = _phase_ddk(gauge, M, sc, k, dtype)
-    return _matrix_ddk_so(M._csr, Rd, Ro, dtype, format, p_opt)
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_ddk_so(csr, Rd, Ro, dtype, format, p_opt):
 
     # Return list
     dd = [None, None, None, None, None, None]
 
-    if dtype == np.complex128:
-
-        if format in ("array", "matrix", "dense"):
-            dd[:3] = _phase3_so_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt)
-            dd[3:] = _phase3_so_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt)
-
-        else:
-            # Default must be something else.
-            dd[:3] = _phase3_so_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt)
-            dd[3:] = _phase3_so_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt)
-            dd[0] = dd[0].asformat(format)
-            dd[1] = dd[1].asformat(format)
-            dd[2] = dd[2].asformat(format)
-            dd[3] = dd[3].asformat(format)
-            dd[4] = dd[4].asformat(format)
-            dd[5] = dd[5].asformat(format)
-
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            dd[:3] = _phase3_so_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt)
-            dd[3:] = _phase3_so_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt)
-        else:
-            dd[:3] = _phase3_so_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt)
-            dd[3:] = _phase3_so_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt)
-            dd[0] = dd[0].asformat(format)
-            dd[1] = dd[1].asformat(format)
-            dd[2] = dd[2].asformat(format)
-            dd[3] = dd[3].asformat(format)
-            dd[4] = dd[4].asformat(format)
-            dd[5] = dd[5].asformat(format)
+    csr = M._csr
+
+    if format in ("array", "matrix", "dense"):
+        dd[:3] = _phase3_array_so(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt)
+        dd[3:] = _phase3_array_so(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt)
 
     else:
-        raise ValueError("matrix_ddk_so: currently only supports dtype in [complex64, complex128].")
+        # Default must be something else.
+        dd[:3] = _phase3_csr_so(csr.ptr, csr.ncol, csr.col, csr._D, Rd, p_opt)
+        dd[3:] = _phase3_csr_so(csr.ptr, csr.ncol, csr.col, csr._D, Ro, p_opt)
+        dd[0] = dd[0].asformat(format)
+        dd[1] = dd[1].asformat(format)
+        dd[2] = dd[2].asformat(format)
+        dd[3] = dd[3].asformat(format)
+        dd[4] = dd[4].asformat(format)
+        dd[5] = dd[5].asformat(format)
 
     return dd
diff --git a/src/sisl/physics/_matrix_dk.pyx b/src/sisl/physics/_matrix_dk.pyx
index 0523e7a8ba..3a937f3495 100644
--- a/src/sisl/physics/_matrix_dk.pyx
+++ b/src/sisl/physics/_matrix_dk.pyx
@@ -2,108 +2,73 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 cimport cython
-from libc.math cimport fabs
 
 import numpy as np
 
-cimport numpy as np
+cimport numpy as cnp
+
+from sisl._core._dtypes cimport floats_st, ints_st
 
 from ._common import comply_gauge
+from ._matrix_phase import *
 from ._matrix_phase3 import *
-from ._matrix_phase3_nc import *
-from ._matrix_phase3_so import *
-from ._matrix_phase_nc_diag import *
 from ._phase import *
 
-_dot = np.dot
-
 __all__ = ["matrix_dk", "matrik_dk_nc", "matrik_dk_nc_diag", "matrik_dk_so"]
 
 
-def _phase_dk(gauge, M, sc, np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype):
+def _phase_dk(gauge, M, sc, cnp.ndarray[floats_st] k, dtype):
     # dtype *must* be passed through phase_dtype
     gauge = comply_gauge(gauge)
 
     # This is the differentiated matrix with respect to k
     # See _phase.pyx, we are using exp(i k.R/r)
     #  i R
-    if gauge == 'cell':
-        iRs = phase_rsc(sc, k, dtype).reshape(-1, 1)
-        iRs = (1j * _dot(sc.sc_off, sc.cell) * iRs).astype(dtype, copy=False)
-        p_opt = 1
 
-    elif gauge == 'atom':
+    if gauge == 'atom':
         M.finalize()
         rij = M.Rij()._csr._D
         iRs = (1j * rij * phase_rij(rij, sc, k, dtype).reshape(-1, 1)).astype(dtype, copy=False)
         del rij
         p_opt = 0
 
+    elif gauge == 'cell':
+        iRs = phase_rsc(sc, k, dtype).reshape(-1, 1)
+        iRs = (1j * np.dot(sc.sc_off, sc.cell) * iRs).astype(dtype, copy=False)
+        p_opt = 1
+
     return p_opt, iRs
 
 
-def matrix_dk(gauge, M, const int idx, sc,
-              np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype, format):
+def matrix_dk(gauge, M, const ints_st idx, sc, cnp.ndarray[floats_st] k, dtype, format):
     dtype = phase_dtype(k, M.dtype, dtype, True)
     p_opt, iRs = _phase_dk(gauge, M, sc, k, dtype)
-    return _matrix_dk(M._csr, idx, iRs, dtype, format, p_opt)
 
+    csr = M._csr
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_dk(csr, const int idx, iRs, dtype, format, p_opt):
+    if format in ("array", "matrix", "dense"):
+        return _phase3_array(csr.ptr, csr.ncol, csr.col, csr._D, idx, iRs, p_opt)
 
-    if dtype == np.complex128:
+    # Default must be something else.
+    d1, d2, d3 = _phase3_csr(csr.ptr, csr.ncol, csr.col, csr._D, idx, iRs, p_opt)
+    return d1.asformat(format), d2.asformat(format), d3.asformat(format)
 
-        if format in ("array", "matrix", "dense"):
-            return _phase3_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, iRs, p_opt)
 
-        # Default must be something else.
-        d1, d2, d3 = _phase3_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, iRs, p_opt)
-        return d1.asformat(format), d2.asformat(format), d3.asformat(format)
-
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            return _phase3_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, iRs, p_opt)
-        d1, d2, d3 = _phase3_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, iRs, p_opt)
-        return d1.asformat(format), d2.asformat(format), d3.asformat(format)
-
-    raise ValueError("matrix_dk: currently only supports dtype in [complex64, complex128].")
-
-
-def matrix_dk_nc(gauge, M, sc,
-                 np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype, format):
+def matrix_dk_nc(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format):
     dtype = phase_dtype(k, M.dtype, dtype, True)
     p_opt, iRs = _phase_dk(gauge, M, sc, k, dtype)
-    return _matrix_dk_nc(M._csr, iRs, dtype, format, p_opt)
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_dk_nc(csr, iRs, dtype, format, p_opt):
 
-    if dtype == np.complex128:
+    csr = M._csr
 
-        if format in ("array", "matrix", "dense"):
-            return _phase3_nc_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt)
+    if format in ("array", "matrix", "dense"):
+        return _phase3_array_nc(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt)
 
-        # Default must be something else.
-        d1, d2, d3 = _phase3_nc_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt)
-        return d1.asformat(format), d2.asformat(format), d3.asformat(format)
+    # Default must be something else.
+    d1, d2, d3 = _phase3_csr_nc(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt)
+    return d1.asformat(format), d2.asformat(format), d3.asformat(format)
 
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            return _phase3_nc_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt)
-        d1, d2, d3 = _phase3_nc_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt)
-        return d1.asformat(format), d2.asformat(format), d3.asformat(format)
 
-    raise ValueError("matrix_dk_nc: currently only supports dtype in [complex64, complex128].")
-
-
-def matrix_dk_nc_diag(gauge, M, const int idx, sc,
-                      np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype, format):
+def matrix_dk_nc_diag(gauge, M, const ints_st idx, sc, cnp.ndarray[floats_st] k, dtype, format):
     dtype = phase_dtype(k, M.dtype, dtype, True)
     p_opt, iRs = _phase_dk(gauge, M, sc, k, dtype)
 
@@ -112,59 +77,30 @@ def matrix_dk_nc_diag(gauge, M, const int idx, sc,
     phz = iRs[:, 2].copy()
     del iRs
 
-    # Get each of them
-    x = _matrix_dk_nc_diag(M._csr, idx, phx, dtype, format, p_opt)
-    y = _matrix_dk_nc_diag(M._csr, idx, phy, dtype, format, p_opt)
-    z = _matrix_dk_nc_diag(M._csr, idx, phz, dtype, format, p_opt)
-    return x, y, z
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_dk_nc_diag(csr, const int idx, phases, dtype, format, p_opt):
-
-    if dtype == np.complex128:
+    csr = M._csr
 
-        if format in ("array", "matrix", "dense"):
-            return _phase_nc_diag_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt)
+    if format in ("array", "matrix", "dense"):
+        x = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phx, p_opt)
+        y = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phy, p_opt)
+        z = _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phz, p_opt)
 
-        # Default must be something else.
-        return _phase_nc_diag_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt).asformat(format)
+    else:
+        x = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phx, p_opt).asformat(format)
+        y = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phy, p_opt).asformat(format)
+        z = _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phz, p_opt).asformat(format)
 
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            return _phase_nc_diag_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt)
-        return _phase_nc_diag_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt).asformat(format)
-
-    raise ValueError("matrix_dk_nc_diag: only supports dtype in [complex64, complex128].")
+    return x, y, z
 
 
-def matrix_dk_so(gauge, M, sc,
-                 np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype, format):
+def matrix_dk_so(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format):
     dtype = phase_dtype(k, M.dtype, dtype, True)
     p_opt, iRs = _phase_dk(gauge, M, sc, k, dtype)
-    return _matrix_dk_so(M._csr, iRs, dtype, format, p_opt)
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_dk_so(csr, iRs, dtype, format, p_opt):
-
-    if dtype == np.complex128:
-
-        if format in ("array", "matrix", "dense"):
-            return _phase3_so_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt)
 
-        # Default must be something else.
-        d1, d2, d3 = _phase3_so_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt)
-        return d1.asformat(format), d2.asformat(format), d3.asformat(format)
+    csr = M._csr
 
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            return _phase3_so_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt)
-        d1, d2, d3 = _phase3_so_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt)
-        return d1.asformat(format), d2.asformat(format), d3.asformat(format)
+    if format in ("array", "matrix", "dense"):
+        return _phase3_array_so(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt)
 
-    raise ValueError("matrix_dk_so: currently only supports dtype in [complex64, complex128].")
+    # Default must be something else.
+    d1, d2, d3 = _phase3_csr_so(csr.ptr, csr.ncol, csr.col, csr._D, iRs, p_opt)
+    return d1.asformat(format), d2.asformat(format), d3.asformat(format)
diff --git a/src/sisl/physics/_matrix_k.pyx b/src/sisl/physics/_matrix_k.pyx
index be8b80a451..eb8a78e14f 100644
--- a/src/sisl/physics/_matrix_k.pyx
+++ b/src/sisl/physics/_matrix_k.pyx
@@ -4,38 +4,46 @@
 cimport cython
 
 import numpy as np
+cimport numpy as cnp
 
-cimport numpy as np
-
+from sisl._core._dtypes cimport floats_st, ints_st
 from ._common import comply_gauge
 from ._matrix_phase import *
-from ._matrix_phase_nc import *
-from ._matrix_phase_nc_diag import *
-from ._matrix_phase_so import *
-from ._matrix_sc_phase import *
-from ._matrix_sc_phase_nc import *
-from ._matrix_sc_phase_nc_diag import *
-from ._matrix_sc_phase_so import *
+from ._matrix_phase_sc import *
 from ._phase import *
+from ._phase cimport is_gamma
 
 __all__ = ["matrix_k", "matrix_k_nc", "matrix_k_so", "matrix_k_nc_diag"]
 
 
-def matrix_k(gauge, M, const int idx, sc,
-             np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype, format):
-    dtype = phase_dtype(k, M.dtype, dtype)
+def _phase_k(gauge, M, sc, cnp.ndarray[floats_st] K, dtype):
+    cdef floats_st[::1] k = K
+
+    # dtype *must* be passed through phase_dtype
     gauge = comply_gauge(gauge)
 
-    if gauge == 'cell':
-        phases = phase_rsc(sc, k, dtype)
-        p_opt = 1
+    if is_gamma(k):
+        # no phases are required
+        p_opt = -1
+        phases = np.empty([0], dtype=dtype)
 
-    elif gauge == 'atom':
+    elif gauge == "atom":
         M.finalize()
         phases = phase_rij(M.Rij()._csr._D, sc, k, dtype)
         p_opt = 0
+
+    elif gauge == "cell":
+        phases = phase_rsc(sc, k, dtype)
+        p_opt = 1
+
     else:
-        raise ValueError("matrix_k: gauge must be in [cell, atom]")
+        raise ValueError("phase_k: gauge must be in [cell, atom]")
+
+    return p_opt, phases
+
+def matrix_k(gauge, M, const ints_st idx, sc, cnp.ndarray[floats_st] k, dtype, format):
+    dtype = phase_dtype(k, M.dtype, dtype)
+    p_opt, phases = _phase_k(gauge, M, sc, k, dtype)
 
     # Check that the dimension *works*
     if idx < 0:
@@ -44,83 +52,32 @@ def matrix_k(gauge, M, const int idx, sc,
         d = M.shape[-1]
         raise ValueError(f"matrix_k: unknown index specification {idx} must be in 0:{d}")
 
+    csr = M._csr
+
     if format.startswith("sc:") or format == "sc":
         if format == "sc":
             format = "csr"
         else:
             format = format[3:]
         nc = M.geometry.no_s
-        return _matrix_sc_k(M._csr, nc, idx, phases, dtype, format, p_opt)
-
-    return _matrix_k(M._csr, idx, phases, dtype, format, p_opt)
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_k(csr, const int idx, phases, dtype, format, p_opt):
-
-    if dtype == np.complex128:
 
         if format in ("array", "matrix", "dense"):
-            return _phase_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt)
+            return _phase_sc_array(csr.ptr, csr.ncol, csr.col, nc, csr._D, idx, phases, p_opt)
 
-        # Default must be something else.
-        return _phase_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt).asformat(format)
+        return _phase_sc_csr(csr.ptr, csr.ncol, csr.col, nc, csr._D, idx, phases, p_opt).asformat(format)
 
-    elif dtype == np.float64:
-        if format in ("array", "matrix", "dense"):
-            return _array_f64(csr.ptr, csr.ncol, csr.col, csr._D, idx)
-        return _csr_f64(csr.ptr, csr.ncol, csr.col, csr._D, idx).asformat(format)
 
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            return _phase_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt)
-        return _phase_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt).asformat(format)
+    if format in ("array", "matrix", "dense"):
+        return _phase_array(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt)
 
-    elif dtype == np.float32:
-        if format in ("array", "matrix", "dense"):
-            return _array_f32(csr.ptr, csr.ncol, csr.col, csr._D, idx)
-        return _csr_f32(csr.ptr, csr.ncol, csr.col, csr._D, idx).asformat(format)
-
-    raise ValueError("matrix_k: currently only supports dtype in [float32, float64, complex64, complex128].")
+    return _phase_csr(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt).asformat(format)
 
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_sc_k(csr, const int nc, const int idx, phases, dtype, format, p_opt):
-    if dtype == np.complex128:
-        if format in ("array", "matrix", "dense"):
-            return _sc_phase_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, nc, idx, phases, p_opt)
-        return _sc_phase_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, nc, idx, phases, p_opt).asformat(format)
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            return _sc_phase_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, nc, idx, phases, p_opt)
-        return _sc_phase_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, nc, idx, phases, p_opt).asformat(format)
-    elif dtype in (np.float32, np.float64):
-        # direct conversion, should be simple (generally only at Gamma-point)
-        m = csr.tocsr(idx)
-        if format in ("array", "matrix", "dense"):
-            return m.toarray()
-        return m
-
-    raise ValueError("matrix_k: (supercell format) currently only supports dtype in [float32, float64, complex64, complex128].")
-
-
-def matrix_k_nc(gauge, M, sc,
-                np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype, format):
+def matrix_k_nc(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format):
     dtype = phase_dtype(k, M.dtype, dtype, True)
-    gauge = comply_gauge(gauge)
-    if gauge == 'cell':
-        phases = phase_rsc(sc, k, dtype)
-        p_opt = 1
-    elif gauge == 'atom':
-        M.finalize()
-        phases = phase_rij(M.Rij()._csr._D, sc, k, dtype)
-        p_opt = 0
-    else:
-        raise ValueError("matrix_k_nc: gauge must be in [cell, atom]")
+    p_opt, phases = _phase_k(gauge, M, sc, k, dtype)
+
+    csr = M._csr
 
     if format.startswith("sc:") or format == "sc":
         if format == "sc":
@@ -128,165 +85,61 @@ def matrix_k_nc(gauge, M, sc,
         else:
             format = format[3:]
         nc = M.geometry.no_s
-        return _matrix_sc_k_nc(M._csr, nc, phases, dtype, format, p_opt)
-    return _matrix_k_nc(M._csr, phases, dtype, format, p_opt)
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_k_nc(csr, phases, dtype, format, p_opt):
 
-    if csr.shape[2] < 4:
-        raise ValueError("matrix_k_nc requires input matrix to have 4 components")
-
-    if dtype == np.complex128:
-        if format in ("array", "matrix", "dense"):
-            return _phase_nc_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt)
-        return _phase_nc_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt).asformat(format)
-    elif dtype == np.complex64:
         if format in ("array", "matrix", "dense"):
-            return _phase_nc_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt)
-        return _phase_nc_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt).asformat(format)
-
-    raise ValueError("matrix_k_nc: only supports dtype in [complex64, complex128].")
+            return _phase_sc_array_nc(csr.ptr, csr.ncol, csr.col, nc, csr._D, phases, p_opt)
 
+        return _phase_sc_csr_nc(csr.ptr, csr.ncol, csr.col, nc, csr._D, phases, p_opt).asformat(format)
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_sc_k_nc(csr, nc, phases, dtype, format, p_opt):
+    if format in ("array", "matrix", "dense"):
+        return _phase_array_nc(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt)
 
-    if csr.shape[2] < 4:
-        raise ValueError("matrix_k_nc: (supercell format) requires input matrix to have 4 components")
+    return _phase_csr_nc(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt).asformat(format)
 
-    if dtype == np.complex128:
-        if format in ("array", "matrix", "dense"):
-            return _sc_phase_nc_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, nc, phases, p_opt)
-        return _sc_phase_nc_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, nc, phases, p_opt).asformat(format)
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            return _sc_phase_nc_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, nc, phases, p_opt)
-        return _sc_phase_nc_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, nc, phases, p_opt).asformat(format)
 
-    raise ValueError("matrix_k_nc: (supercell format) only supports dtype in [complex64, complex128].")
+def matrix_k_nc_diag(gauge, M, const ints_st idx, sc, cnp.ndarray[floats_st] k, dtype, format):
+    dtype = phase_dtype(k, M.dtype, dtype, True)
+    p_opt, phases = _phase_k(gauge, M, sc, k, dtype)
 
+    csr = M._csr
 
-def matrix_k_so(gauge, M, sc,
-                np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype, format):
-    dtype = phase_dtype(k, M.dtype, dtype, True)
-    gauge = comply_gauge(gauge)
-    if gauge == 'cell':
-        phases = phase_rsc(sc, k, dtype)
-        p_opt = 1
-    elif gauge == 'atom':
-        M.finalize()
-        phases = phase_rij(M.Rij()._csr._D, sc, k, dtype)
-        p_opt = 0
-    else:
-        raise ValueError("matrix_k_so: gauge must be in [r, R]")
     if format.startswith("sc:") or format == "sc":
         if format == "sc":
             format = "csr"
         else:
             format = format[3:]
         nc = M.geometry.no_s
-        return _matrix_sc_k_so(M._csr, nc, phases, dtype, format, p_opt)
-    return _matrix_k_so(M._csr, phases, dtype, format, p_opt)
-
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_k_so(csr, phases, dtype, format, p_opt):
-
-    if csr.shape[2] < 8:
-        raise ValueError("matrix_k_so requires input matrix to have 8 components")
-
-    if dtype == np.complex128:
         if format in ("array", "matrix", "dense"):
-            return _phase_so_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt)
-        return _phase_so_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt).asformat(format)
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            return _phase_so_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt)
-        return _phase_so_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt).asformat(format)
+            return _phase_sc_array_nc_diag(csr.ptr, csr.ncol, csr.col, nc, csr._D, idx, phases, p_opt)
 
-    raise ValueError("matrix_k_so: only supports dtype in [complex64, complex128].")
+        return _phase_sc_csr_nc_diag(csr.ptr, csr.ncol, csr.col, nc, csr._D, idx, phases, p_opt).asformat(format)
 
+    if format in ("array", "matrix", "dense"):
+        return _phase_array_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt)
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_sc_k_so(csr, nc, phases, dtype, format, p_opt):
+    return _phase_csr_nc_diag(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt).asformat(format)
 
-    if csr.shape[2] < 8:
-        raise ValueError("matrix_k_so: (supercell format) requires input matrix to have 8 components")
 
-    if dtype == np.complex128:
-        if format in ("array", "matrix", "dense"):
-            return _sc_phase_so_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, nc, phases, p_opt)
-        return _sc_phase_so_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, nc, phases, p_opt).asformat(format)
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            return _sc_phase_so_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, nc, phases, p_opt)
-        return _sc_phase_so_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, nc, phases, p_opt).asformat(format)
-
-    raise ValueError("matrix_k_so: (supercell format) only supports dtype in [complex64, complex128].")
+def matrix_k_so(gauge, M, sc, cnp.ndarray[floats_st] k, dtype, format):
+    dtype = phase_dtype(k, M.dtype, dtype, True)
+    p_opt, phases = _phase_k(gauge, M, sc, k, dtype)
 
+    csr = M._csr
 
-def matrix_k_nc_diag(gauge, M, const int idx, sc,
-                     np.ndarray[np.float64_t, ndim=1, mode='c'] k, dtype, format):
-    dtype = phase_dtype(k, M.dtype, dtype, True)
-    gauge = comply_gauge(gauge)
-    if gauge == 'cell':
-        phases = phase_rsc(sc, k, dtype)
-        p_opt = 1
-    elif gauge == 'atom':
-        M.finalize()
-        phases = phase_rij(M.Rij()._csr._D, sc, k, dtype)
-        p_opt = 0
-    else:
-        raise ValueError("matrix_k_nc_diag: gauge must be in [r, R]")
     if format.startswith("sc:") or format == "sc":
         if format == "sc":
             format = "csr"
         else:
             format = format[3:]
         nc = M.geometry.no_s
-        return _matrix_sc_k_nc_diag(M._csr, nc, idx, phases, dtype, format, p_opt)
-    return _matrix_k_nc_diag(M._csr, idx, phases, dtype, format, p_opt)
-
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_k_nc_diag(csr, const int idx, phases, dtype, format, p_opt):
-
-    if dtype == np.complex128:
-        if format in ("array", "matrix", "dense"):
-            return _phase_nc_diag_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt)
-        return _phase_nc_diag_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt).asformat(format)
-    elif dtype == np.complex64:
         if format in ("array", "matrix", "dense"):
-            return _phase_nc_diag_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt)
-        return _phase_nc_diag_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, idx, phases, p_opt).asformat(format)
+            return _phase_sc_array_so(csr.ptr, csr.ncol, csr.col, nc, csr._D, phases, p_opt)
 
-    raise ValueError("matrix_k_nc_diag: only supports dtype in [complex64, complex128].")
+        return _phase_sc_csr_so(csr.ptr, csr.ncol, csr.col, nc, csr._D, phases, p_opt).asformat(format)
 
+    if format in ("array", "matrix", "dense"):
+        return _phase_array_so(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt)
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.initializedcheck(False)
-def _matrix_sc_k_nc_diag(csr, const int nc, const int idx, phases, dtype, format, p_opt):
-
-    if dtype == np.complex128:
-        if format in ("array", "matrix", "dense"):
-            return _sc_phase_nc_diag_array_c128(csr.ptr, csr.ncol, csr.col, csr._D, nc, idx, phases, p_opt)
-        return _sc_phase_nc_diag_csr_c128(csr.ptr, csr.ncol, csr.col, csr._D, nc, idx, phases, p_opt).asformat(format)
-    elif dtype == np.complex64:
-        if format in ("array", "matrix", "dense"):
-            return _sc_phase_nc_diag_array_c64(csr.ptr, csr.ncol, csr.col, csr._D, nc, idx, phases, p_opt)
-        return _sc_phase_nc_diag_csr_c64(csr.ptr, csr.ncol, csr.col, csr._D, nc, idx, phases, p_opt).asformat(format)
-
-    raise ValueError("matrix_k_nc_diag: (supercell format) only supports dtype in [complex64, complex128].")
+    return _phase_csr_so(csr.ptr, csr.ncol, csr.col, csr._D, phases, p_opt).asformat(format)
diff --git a/src/sisl/physics/_matrix_phase.pyx b/src/sisl/physics/_matrix_phase.pyx
index ac5bf3832a..fba3e6d0cb 100644
--- a/src/sisl/physics/_matrix_phase.pyx
+++ b/src/sisl/physics/_matrix_phase.pyx
@@ -1,276 +1,633 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
 cimport cython
 
 import numpy as np
-cimport numpy as np
-from scipy.sparse import csr_matrix
-
-from sisl._indices cimport _index_sorted
-from sisl._core._sparse import fold_csr_matrix
-
-__all__ = ['_csr_f32', '_csr_f64', '_phase_csr_c64', '_phase_csr_c128',
-           '_array_f32', '_array_f64', '_phase_array_c64', '_phase_array_c128']
 
-# The fused data-types forces the data input to be of "correct" values.
-ctypedef fused numeric_real:
-    float
-    double
-
-ctypedef fused numeric_complex:
-    float
-    double
-    float complex
-    double complex
+cimport numpy as cnp
 
+from scipy.sparse import csr_matrix
 
-def _csr_f32(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-             np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-             np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-             numeric_real[:, ::1] D, const int idx):
+from sisl._indices cimport _index_sorted
 
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
+from sisl._core._sparse import (
+    fold_csr_matrix,
+    fold_csr_matrix_nc,
+    fold_csr_matrix_nc_diag,
+)
+
+from sisl._core._dtypes cimport (
+    complexs_st,
+    floatcomplexs_st,
+    floats_st,
+    ints_st,
+    numerics_st,
+    reals_st,
+    ssize_st,
+    type2dtype,
+)
+
+from ._matrix_utils cimport (
+    _f_matrix_box_nc,
+    _f_matrix_box_so,
+    _matrix_box_nc_cmplx,
+    _matrix_box_nc_real,
+    _matrix_box_so_cmplx,
+    _matrix_box_so_real,
+)
+
+__all__ = [
+    "_phase_csr",
+    "_phase_array",
+    "_phase_csr_nc",
+    "_phase_array_nc",
+    "_phase_csr_nc_diag",
+    "_phase_array_nc_diag",
+    "_phase_csr_so",
+    "_phase_array_so",
+]
+
+"""
+In this Cython code we use `p_opt` to signal whether the resulting
+matrices will use the phases variable.
+
+There are 3 cases:
+
+p_opt == -1:
+    no phases are added, the `phases` array will not be accessed
+p_opt == 0:
+    the phases are *per* sparse index, i.e. the array is as big
+    as the sparse data.
+p_opt == 1:
+    the phases are in reduced format where each column block
+    uses the same phase. A column block is defined as `col[ind] / nr` which
+    results in a unique index.
+"""
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_csr(ints_st[::1] ptr,
+               ints_st[::1] ncol,
+               ints_st[::1] col,
+               numerics_st[:, ::1] D,
+               const ints_st idx,
+               floatcomplexs_st[::1] phases,
+               const int p_opt):
 
     # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef Py_ssize_t nr = v_ncol.shape[0]
-    cdef np.ndarray[np.float32_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.float32)
-    cdef float[::1] v = V
-    cdef Py_ssize_t r, ind, s_idx
-    cdef int c
-
-    for r in range(nr):
-        for ind in range(ptr[r], ptr[r] + ncol[r]):
-            c = col[ind] % nr
-            s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-            v[v_ptr[r] + s_idx] += D[ind, idx]
+    V_PTR, V_NCOL, V_COL = fold_csr_matrix(ptr, ncol, col)
+    cdef ints_st[::1] v_ptr = V_PTR
+    cdef ints_st[::1] v_ncol = V_NCOL
+    cdef ints_st[::1] v_col = V_COL
+    cdef ints_st[::1] tmp
+
+    # This may fail when numerics_st is complex but floatcomplexs_st is a float type
+    cdef object dtype = type2dtype[floatcomplexs_st](1)
+    cdef cnp.ndarray[floatcomplexs_st, mode='c'] V = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef floatcomplexs_st[::1] v = V
+
+    # Local columns
+    cdef ints_st nr = ncol.shape[0]
+    cdef ints_st r, ind, s, s_idx, c
+
+    with nogil:
+        if p_opt == -1:
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] % nr
+
+                    tmp = v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]]
+                    s_idx = _index_sorted(tmp, c)
+                    v[v_ptr[r] + s_idx] += <floatcomplexs_st> (D[ind, idx])
+
+        elif p_opt == 0:
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] % nr
+
+                    tmp = v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]]
+                    s_idx = _index_sorted(tmp, c)
+                    v[v_ptr[r] + s_idx] += <floatcomplexs_st> (D[ind, idx] * phases[ind])
+
+        else:
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] % nr
+                    s = col[ind] / nr
+
+                    tmp = v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]]
+                    s_idx = _index_sorted(tmp, c)
+                    v[v_ptr[r] + s_idx] += <floatcomplexs_st> (D[ind, idx] * phases[s])
 
     return csr_matrix((V, V_COL, V_PTR), shape=(nr, nr))
 
 
-def _csr_f64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-             np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-             np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-             numeric_real[:, ::1] D, const int idx):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef Py_ssize_t nr = v_ncol.shape[0]
-    cdef np.ndarray[np.float64_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.float64)
-    cdef double[::1] v = V
-    cdef Py_ssize_t r, ind, s_idx
-    cdef int c
-
-    for r in range(nr):
-        for ind in range(ptr[r], ptr[r] + ncol[r]):
-            c = col[ind] % nr
-            s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-            v[v_ptr[r] + s_idx] += D[ind, idx]
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr, nr))
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_array(ints_st[::1] ptr,
+                 ints_st[::1] ncol,
+                 ints_st[::1] col,
+                 numerics_st[:, ::1] D,
+                 const int idx,
+                 floatcomplexs_st[::1] phases,
+                 const int p_opt):
+
+    cdef ints_st[::1] tmp
+    cdef ints_st nr = ncol.shape[0]
+
+    cdef object dtype = type2dtype[floatcomplexs_st](1)
+    cdef cnp.ndarray[floatcomplexs_st, ndim=2, mode='c'] V = np.zeros([nr, nr], dtype=dtype)
+    cdef floatcomplexs_st[:, ::1] v = V
+
+    # Local columns
+    cdef ints_st r, ind, s, c
+
+    with nogil:
+        if p_opt == -1:
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] % nr
+                    v[r, c] += <floatcomplexs_st> (D[ind, idx])
+
+        elif p_opt == 0:
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] % nr
+                    v[r, c] += <floatcomplexs_st> (D[ind, idx] * phases[ind])
+
+        else:
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] % nr
+                    s = col[ind] / nr
+                    v[r, c] += <floatcomplexs_st> (D[ind, idx] * phases[s])
 
+    return V
 
-def _phase_csr_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                   np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                   np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                   numeric_complex[:, ::1] D, const int idx,
-                   np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
 
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_csr_nc_diag(ints_st[::1] ptr,
+                       ints_st[::1] ncol,
+                       ints_st[::1] col,
+                       numerics_st[:, ::1] D,
+                       const int idx,
+                       complexs_st[::1] phases,
+                       const int p_opt):
 
     # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef Py_ssize_t nr = v_ncol.shape[0]
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef float complex[::1] v = V
-    cdef Py_ssize_t r, ind, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-                v[v_ptr[r] + s_idx] += D[ind, idx] * phases[ind]
-    else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-                v[v_ptr[r] + s_idx] += D[ind, idx] * phases[col[ind] / nr]
-
+    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc_diag(ptr, ncol, col)
+    cdef ints_st[::1] v_ptr = V_PTR
+    cdef ints_st[::1] v_ncol = V_NCOL
+    cdef ints_st[::1] v_col = V_COL
+    cdef ints_st[::1] tmp
+
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, mode='c'] V = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef complexs_st[::1] v = V
+
+    # Local columns
+    cdef ints_st nr = ncol.shape[0]
+    cdef ints_st r, rr, ind, s, s_idx, c
+
+    cdef complexs_st d
+
+    with nogil:
+        if p_opt == -1:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+
+                    tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]]
+                    s_idx = _index_sorted(tmp, c)
+
+                    d = <complexs_st> D[ind, idx]
+                    v[v_ptr[rr] + s_idx] += d
+                    v[v_ptr[rr+1] + s_idx] += d
+
+        elif p_opt == 0:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+
+                    tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]]
+                    s_idx = _index_sorted(tmp, c)
+
+                    d = (phases[ind] * D[ind, idx])
+                    v[v_ptr[rr] + s_idx] += d
+                    v[v_ptr[rr+1] + s_idx] += d
+
+        else:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    s = col[ind] / nr
+
+                    tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]]
+                    s_idx = _index_sorted(tmp, c)
+
+                    d = (phases[s] * D[ind, idx])
+                    v[v_ptr[rr] + s_idx] += d
+                    v[v_ptr[rr+1] + s_idx] += d
+
+    nr = nr * 2
     return csr_matrix((V, V_COL, V_PTR), shape=(nr, nr))
 
 
-def _phase_csr_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                    np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                    np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                    numeric_complex[:, ::1] D, const int idx,
-                    np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_array_nc_diag(ints_st[::1] ptr,
+                         ints_st[::1] ncol,
+                         ints_st[::1] col,
+                         numerics_st[:, ::1] D,
+                         const int idx,
+                         complexs_st[::1] phases,
+                         const int p_opt):
+    # Returns a dense (2*nr, 2*nr) array: each D[ind, idx] is added to both diagonal slots of its 2x2 block.
+    cdef ints_st[::1] tmp  # NOTE(review): declared but never used in this function
+    cdef ints_st nr = ncol.shape[0]
+
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] V = np.zeros([nr * 2, nr * 2], dtype=dtype)
+    cdef complexs_st[:, ::1] v = V
+
+    # Loop variables: r input row, rr = 2 * r output row, ind stored-entry index, s index into phases, c output column
+    cdef ints_st r, rr, ind, s, c
+
+    cdef complexs_st d
+
+    with nogil:
+        if p_opt == -1:  # no phase factor is applied
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2  # column modulo nr, doubled: first column of the 2x2 block
+                    d = D[ind, idx]
+                    v[rr, c] += d
+                    v[rr + 1, c + 1] += d
+
+        elif p_opt == 0:  # one phase per stored entry: phases[ind]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    d = (phases[ind] * D[ind, idx])
+                    v[rr, c] += d
+                    v[rr + 1, c + 1] += d
+
+        else:  # one phase per column block: phases[col[ind] / nr]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    s = col[ind] / nr  # NOTE(review): relies on `/` giving the integer quotient; `//` would be explicit - confirm
+                    d = (phases[s] * D[ind, idx])
+                    v[rr, c] += d
+                    v[rr + 1, c + 1] += d
+
+    return V
 
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_csr_nc(ints_st[::1] ptr,
+                  ints_st[::1] ncol,
+                  ints_st[::1] col,
+                  numerics_st[:, ::1] D,
+                  complexs_st[::1] phases,
+                  const int p_opt):  # returns a (2*nr, 2*nr) csr_matrix assembled from 2x2 blocks
 
     # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef Py_ssize_t nr = v_ncol.shape[0]
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef double complex[::1] v = V
-    cdef Py_ssize_t r, ind, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-                v[v_ptr[r] + s_idx] += D[ind, idx] * phases[ind]
+    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(ptr, ncol, col)
+    cdef ints_st[::1] v_ptr = V_PTR
+    cdef ints_st[::1] v_ncol = V_NCOL
+    cdef ints_st[::1] v_col = V_COL
+    cdef ints_st[::1] tmp  # holds the column slice of output row rr for the _index_sorted lookup
+
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, mode='c'] V = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef complexs_st[::1] v = V
+
+    # nr: number of input rows; r input row, rr = 2 * r output row, ind stored-entry index, s index into phases, s_idx offset within row rr, c output column
+    cdef ints_st nr = ncol.shape[0]
+    cdef ints_st r, rr, ind, s, s_idx, c
+
+    cdef complexs_st ph
+    cdef _f_matrix_box_nc func
+    cdef numerics_st *d
+    cdef complexs_st *M = [0, 0, 0, 0]  # scratch for the four block values, read after each func() call
+
+    if numerics_st in complexs_st:
+        func = _matrix_box_nc_cmplx
     else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-                v[v_ptr[r] + s_idx] += D[ind, idx] * phases[col[ind] / nr]
-
+        func = _matrix_box_nc_real
+
+    with nogil:
+        if p_opt == -1:  # no phase factor: the block is built directly from D[ind, 0..3]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2  # column modulo nr, doubled: first column of the 2x2 block
+
+                    tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]]
+                    s_idx = _index_sorted(tmp, c)  # used as the offset of column c within row rr
+
+                    v[v_ptr[rr] + s_idx] += D[ind, 0]
+                    ph = (D[ind, 2] + 1j * D[ind, 3])  # off-diagonal value; its conjugate goes to the mirrored slot
+                    v[v_ptr[rr] + s_idx+1] += ph
+                    v[v_ptr[rr+1] + s_idx] += ph.conjugate()  # reuses s_idx for row rr+1: assumes both rows share a column layout - TODO confirm
+                    v[v_ptr[rr+1] + s_idx+1] += D[ind, 1]
+
+        elif p_opt == 0:  # one phase per stored entry: phases[ind]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    ph = phases[ind]
+
+                    tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]]
+                    s_idx = _index_sorted(tmp, c)
+
+                    d = &D[ind, 0]  # address of the first element of row ind of D
+                    func(d, ph, M)
+                    v[v_ptr[rr] + s_idx] += M[0]
+                    v[v_ptr[rr] + s_idx+1] += M[1]
+                    v[v_ptr[rr+1] + s_idx] += M[2]
+                    v[v_ptr[rr+1] + s_idx+1] += M[3]
+
+        else:  # one phase per column block: phases[col[ind] / nr]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    s = col[ind] / nr  # NOTE(review): relies on `/` giving the integer quotient; `//` would be explicit - confirm
+                    ph = phases[s]
+
+                    tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]]
+                    s_idx = _index_sorted(tmp, c)
+
+                    d = &D[ind, 0]
+                    func(d, ph, M)
+                    v[v_ptr[rr] + s_idx] += M[0]
+                    v[v_ptr[rr] + s_idx+1] += M[1]
+                    v[v_ptr[rr+1] + s_idx] += M[2]
+                    v[v_ptr[rr+1] + s_idx+1] += M[3]
+
+    nr = nr * 2  # the output matrix has two rows/columns per input row
     return csr_matrix((V, V_COL, V_PTR), shape=(nr, nr))
 
 
-def _array_f32(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-               np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-               np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-               numeric_real[:, ::1] D, const int idx):
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_array_nc(ints_st[::1] ptr,
+                    ints_st[::1] ncol,
+                    ints_st[::1] col,
+                    numerics_st[:, ::1] D,
+                    complexs_st[::1] phases,
+                    const int p_opt):  # returns a dense (2*nr, 2*nr) array assembled from 2x2 blocks
 
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.float32_t, ndim=2, mode='c'] V = np.zeros([nr, nr], dtype=np.float32)
-    cdef float[:, ::1] v = V
-    cdef Py_ssize_t r, ind
-
-    for r in range(nr):
-        for ind in range(ptr[r], ptr[r] + ncol[r]):
-            v[r, col[ind] % nr] += D[ind, idx]
-
-    return V
+    cdef ints_st[::1] tmp  # NOTE(review): declared but never used in this function
+    cdef ints_st nr = ncol.shape[0]
 
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] V = np.zeros([nr * 2, nr * 2], dtype=dtype)
+    cdef complexs_st[:, ::1] v = V
 
-def _array_f64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-               np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-               np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-               numeric_real[:, ::1] D, const int idx):
+    # Loop variables: r input row, rr = 2 * r output row, ind stored-entry index, s index into phases, c output column
+    cdef ints_st r, rr, ind, s, c
 
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
+    cdef complexs_st ph
+    cdef _f_matrix_box_nc func
+    cdef numerics_st *d
+    cdef complexs_st *M = [0, 0, 0, 0]  # scratch for the four block values, read after each func() call
 
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.float64_t, ndim=2, mode='c'] V = np.zeros([nr, nr], dtype=np.float64)
-    cdef double[:, ::1] v = V
-    cdef Py_ssize_t r, ind
-
-    for r in range(nr):
-        for ind in range(ptr[r], ptr[r] + ncol[r]):
-            v[r, col[ind] % nr] += D[ind, idx]
+    if numerics_st in complexs_st:
+        func = _matrix_box_nc_cmplx
+    else:
+        func = _matrix_box_nc_real
+
+    with nogil:
+        if p_opt == -1:  # no phase factor: the block is built directly from D[ind, 0..3]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2  # column modulo nr, doubled: first column of the 2x2 block
+
+                    v[rr, c] += <complexs_st> D[ind, 0]
+                    ph = <complexs_st> (D[ind, 2] + 1j * D[ind, 3])  # off-diagonal value; its conjugate goes to the mirrored slot
+                    v[rr, c + 1] += ph
+                    v[rr + 1, c] += ph.conjugate()
+                    v[rr + 1, c + 1] += <complexs_st> D[ind, 1]
+
+        elif p_opt == 0:  # one phase per stored entry: phases[ind]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    ph = phases[ind]
+
+                    d = &D[ind, 0]  # address of the first element of row ind of D
+                    func(d, ph, M)
+                    v[rr, c] += M[0]
+                    v[rr, c + 1] += M[1]
+                    v[rr + 1, c] += M[2]
+                    v[rr + 1, c + 1] += M[3]
+
+        else:  # one phase per column block: phases[col[ind] / nr]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    s = col[ind] / nr  # NOTE(review): relies on `/` giving the integer quotient; `//` would be explicit - confirm
+                    ph = phases[s]
+
+                    d = &D[ind, 0]
+                    func(d, ph, M)
+                    v[rr, c] += M[0]
+                    v[rr, c + 1] += M[1]
+                    v[rr + 1, c] += M[2]
+                    v[rr + 1, c + 1] += M[3]
 
     return V
 
 
-def _phase_array_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                     np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                     np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                     numeric_complex[:, ::1] D, const int idx,
-                     np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] V = np.zeros([nr, nr], dtype=np.complex64)
-    cdef float complex[:, ::1] v = V
-    cdef Py_ssize_t r, ind, c
 
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                v[r, c] += D[ind, idx] * phases[ind]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_csr_so(ints_st[::1] ptr,
+                  ints_st[::1] ncol,
+                  ints_st[::1] col,
+                  numerics_st[:, ::1] D,
+                  complexs_st[::1] phases,
+                  const int p_opt):  # returns a (2*nr, 2*nr) csr_matrix assembled from 2x2 blocks
 
+    # Build the output sparsity pattern (row pointers, per-row counts, columns) with fold_csr_matrix_nc
+    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(ptr, ncol, col)
+    cdef ints_st[::1] v_ptr = V_PTR
+    cdef ints_st[::1] v_ncol = V_NCOL
+    cdef ints_st[::1] v_col = V_COL
+    cdef ints_st[::1] tmp  # holds the column slice of output row rr for the _index_sorted lookup
+
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, mode='c'] V = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef complexs_st[::1] v = V
+
+    # nr: number of input rows; r input row, rr = 2 * r output row, ind stored-entry index, s index into phases, s_idx offset within row rr, c output column
+    cdef ints_st nr = ncol.shape[0]
+    cdef ints_st r, rr, ind, s, s_idx, c
+
+    cdef complexs_st ph
+    cdef _f_matrix_box_so func
+    cdef numerics_st *d
+    cdef complexs_st *M = [0, 0, 0, 0]  # scratch for the four block values, read after each func() call
+
+    if numerics_st in complexs_st:
+        func = _matrix_box_so_cmplx
     else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                v[r, c] += D[ind, idx] * phases[col[ind] / nr]
+        func = _matrix_box_so_real
+
+    with nogil:
+        if p_opt == -1:  # no phase factor: the block is built directly from D[ind, 0..7]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2  # column modulo nr, doubled: first column of the 2x2 block
+
+                    tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]]
+                    s_idx = _index_sorted(tmp, c)  # used as the offset of column c within row rr
+
+                    v[v_ptr[rr] + s_idx] += (D[ind, 0] + 1j * D[ind, 4])
+                    v[v_ptr[rr] + s_idx+1] += (D[ind, 2] + 1j * D[ind, 3])
+                    v[v_ptr[rr+1] + s_idx] += (D[ind, 6] + 1j * D[ind, 7])  # reuses s_idx for row rr+1: assumes both rows share a column layout - TODO confirm
+                    v[v_ptr[rr+1] + s_idx+1] += (D[ind, 1] + 1j * D[ind, 5])
+
+        elif p_opt == 0:  # one phase per stored entry: phases[ind]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    ph = phases[ind]
+
+                    tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]]
+                    s_idx = _index_sorted(tmp, c)
+
+                    d = &D[ind, 0]  # address of the first element of row ind of D
+                    func(d, ph, M)
+                    v[v_ptr[rr] + s_idx] += M[0]
+                    v[v_ptr[rr] + s_idx+1] += M[1]
+                    v[v_ptr[rr+1] + s_idx] += M[2]
+                    v[v_ptr[rr+1] + s_idx+1] += M[3]
+
+        else:  # one phase per column block: phases[col[ind] / nr]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    s = col[ind] / nr  # NOTE(review): relies on `/` giving the integer quotient; `//` would be explicit - confirm
+                    ph = phases[s]
+
+                    tmp = v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]]
+                    s_idx = _index_sorted(tmp, c)
+
+                    d = &D[ind, 0]
+                    func(d, ph, M)
+                    v[v_ptr[rr] + s_idx] += M[0]
+                    v[v_ptr[rr] + s_idx+1] += M[1]
+                    v[v_ptr[rr+1] + s_idx] += M[2]
+                    v[v_ptr[rr+1] + s_idx+1] += M[3]
+
+    nr = nr * 2  # the output matrix has two rows/columns per input row
+    return csr_matrix((V, V_COL, V_PTR), shape=(nr, nr))
 
-    return V
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_array_so(ints_st[::1] ptr,
+                    ints_st[::1] ncol,
+                    ints_st[::1] col,
+                    numerics_st[:, ::1] D,
+                    complexs_st[::1] phases,
+                    const int p_opt):  # returns a dense (2*nr, 2*nr) array assembled from 2x2 blocks
 
-def _phase_array_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                      numeric_complex[:, ::1] D, const int idx,
-                      np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
+    cdef ints_st nr = ncol.shape[0]
 
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] V = np.zeros([nr * 2, nr * 2], dtype=dtype)
+    cdef complexs_st[:, ::1] v = V
 
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] V = np.zeros([nr, nr], dtype=np.complex128)
-    cdef double complex[:, ::1] v = V
-    cdef Py_ssize_t r, ind, c
+    # Loop variables: r input row, rr = 2 * r output row, s index into phases, c output column, ind stored-entry index
+    cdef ints_st r, rr, s, c, ind
 
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                v[r, c] += D[ind, idx] * phases[ind]
+    cdef complexs_st ph
+    cdef _f_matrix_box_so func
+    cdef numerics_st *d
+    cdef complexs_st *M = [0, 0, 0, 0]  # scratch for the four block values, read after each func() call
 
+    if numerics_st in complexs_st:
+        func = _matrix_box_so_cmplx
     else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                v[r, c] += D[ind, idx] * phases[col[ind] / nr]
+        func = _matrix_box_so_real
+
+    with nogil:
+        if p_opt == -1:  # no phase factor: the block is built directly from D[ind, 0..7]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2  # column modulo nr, doubled: first column of the 2x2 block
+
+                    v[rr, c] += (D[ind, 0] + 1j * D[ind, 4])
+                    v[rr, c + 1] += (D[ind, 2] + 1j * D[ind, 3])
+                    v[rr + 1, c] += (D[ind, 6] + 1j * D[ind, 7])
+                    v[rr + 1, c + 1] += (D[ind, 1] + 1j * D[ind, 5])
+
+        elif p_opt == 0:  # one phase per stored entry: phases[ind]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    ph = phases[ind]
+
+                    d = &D[ind, 0]  # address of the first element of row ind of D
+                    func(d, ph, M)
+                    v[rr, c] += M[0]
+                    v[rr, c + 1] += M[1]
+                    v[rr + 1, c] += M[2]
+                    v[rr + 1, c + 1] += M[3]
+
+        else:  # one phase per column block: phases[col[ind] / nr]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    s = col[ind] / nr  # NOTE(review): relies on `/` giving the integer quotient; `//` would be explicit - confirm
+                    ph = phases[s]
+
+                    d = &D[ind, 0]
+                    func(d, ph, M)
+                    v[rr, c] += M[0]
+                    v[rr, c + 1] += M[1]
+                    v[rr + 1, c] += M[2]
+                    v[rr + 1, c + 1] += M[3]
 
     return V
diff --git a/src/sisl/physics/_matrix_phase3.pyx b/src/sisl/physics/_matrix_phase3.pyx
index b40155b738..220a8570b9 100644
--- a/src/sisl/physics/_matrix_phase3.pyx
+++ b/src/sisl/physics/_matrix_phase3.pyx
@@ -1,417 +1,549 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
 cimport cython
 
 import numpy as np
-cimport numpy as np
-from scipy.sparse import csr_matrix
-
-from sisl._indices cimport _index_sorted
-from sisl._core._sparse import fold_csr_matrix
-
-__all__ = ['_phase3_csr_f32', '_phase3_csr_f64',
-           '_phase3_csr_c64', '_phase3_csr_c128',
-           '_phase3_array_f32', '_phase3_array_f64',
-           '_phase3_array_c64', '_phase3_array_c128']
-
-# The fused data-types forces the data input to be of "correct" values.
-ctypedef fused numeric_real:
-    float
-    double
 
-ctypedef fused numeric_complex:
-    float
-    double
-    float complex
-    double complex
+cimport numpy as cnp
 
+from scipy.sparse import csr_matrix
 
-def _phase3_csr_f32(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                    np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                    np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                    numeric_real[:, ::1] D, const int idx,
-                    np.ndarray[np.float32_t, ndim=2, mode='c'] PHASES, const int p_opt):
+from sisl._indices cimport _index_sorted
 
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float[:, ::1] phases = PHASES
+from sisl._core._sparse import fold_csr_matrix, fold_csr_matrix_nc
+
+from sisl._core._dtypes cimport (
+    complexs_st,
+    floatcomplexs_st,
+    floats_st,
+    ints_st,
+    numerics_st,
+    ssize_st,
+    type2dtype,
+)
+
+from ._matrix_utils cimport (
+    _f_matrix_box_nc,
+    _f_matrix_box_so,
+    _matrix_box_nc_cmplx,
+    _matrix_box_nc_real,
+    _matrix_box_so_cmplx,
+    _matrix_box_so_real,
+)
+
+__all__ = [
+    "_phase3_csr",
+    "_phase3_array",
+    "_phase3_csr_nc",
+    "_phase3_array_nc",
+    "_phase3_csr_so",
+    "_phase3_array_so",
+]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase3_csr(ints_st[::1] ptr,
+                ints_st[::1] ncol,
+                ints_st[::1] col,
+                numerics_st[:, ::1] D,
+                const int idx,
+                floatcomplexs_st[:, ::1] phases,
+                const int p_opt):  # returns three (nr, nr) csr_matrix objects, one per phase column 0, 1, 2
 
     # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef Py_ssize_t nr = v_ncol.shape[0]
-    cdef np.ndarray[np.float32_t, ndim=1, mode='c'] Vx = np.zeros([v_col.shape[0]], dtype=np.float32)
-    cdef np.ndarray[np.float32_t, ndim=1, mode='c'] Vy = np.zeros([v_col.shape[0]], dtype=np.float32)
-    cdef np.ndarray[np.float32_t, ndim=1, mode='c'] Vz = np.zeros([v_col.shape[0]], dtype=np.float32)
-    cdef float[::1] vx = Vx
-    cdef float[::1] vy = Vy
-    cdef float[::1] vz = Vz
-    cdef float d
-    cdef Py_ssize_t r, ind, s, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-                d = <float> D[ind, idx]
-                vx[v_ptr[r] + s_idx] += d * phases[ind, 0]
-                vy[v_ptr[r] + s_idx] += d * phases[ind, 1]
-                vz[v_ptr[r] + s_idx] += d * phases[ind, 2]
-
-    else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s = col[ind] / nr
-                s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-                d = <float> D[ind, idx]
-                vx[v_ptr[r] + s_idx] += d * phases[s, 0]
-                vy[v_ptr[r] + s_idx] += d * phases[s, 1]
-                vz[v_ptr[r] + s_idx] += d * phases[s, 2]
+    V_PTR, V_NCOL, V_COL = fold_csr_matrix(ptr, ncol, col)
+    cdef ints_st[::1] v_ptr = V_PTR
+    cdef ints_st[::1] v_ncol = V_NCOL
+    cdef ints_st[::1] v_col = V_COL
+
+    # NOTE: this may fail when numerics_st is complex but floatcomplexs_st is a real type
+    cdef object dtype = type2dtype[floatcomplexs_st](1)
+    cdef cnp.ndarray[floatcomplexs_st, mode='c'] Vx = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef cnp.ndarray[floatcomplexs_st, mode='c'] Vy = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef cnp.ndarray[floatcomplexs_st, mode='c'] Vz = np.zeros([v_col.shape[0]], dtype=dtype)
+
+    # nr: number of rows; r row, ind stored-entry index, s index into phases, s_idx offset within row r, c output column
+    cdef ints_st nr = ncol.shape[0]
+    cdef ints_st r, ind, s, s_idx, c
+
+    cdef numerics_st d
+
+    with nogil:
+        if p_opt == 0:  # one phase triple per stored entry: phases[ind, 0..2]
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] % nr  # column modulo nr: column in the (nr, nr) output
+                    s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)  # used as the offset of column c within row r
+                    d = D[ind, idx]
+                    Vx[v_ptr[r] + s_idx] += <floatcomplexs_st> (d * phases[ind, 0])
+                    Vy[v_ptr[r] + s_idx] += <floatcomplexs_st> (d * phases[ind, 1])
+                    Vz[v_ptr[r] + s_idx] += <floatcomplexs_st> (d * phases[ind, 2])
+
+        else:  # one phase triple per column block: phases[col[ind] / nr, 0..2]
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] % nr
+                    s = col[ind] / nr  # NOTE(review): relies on `/` giving the integer quotient; `//` would be explicit - confirm
+                    s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
+                    d = D[ind, idx]
+                    Vx[v_ptr[r] + s_idx] += <floatcomplexs_st> (d * phases[s, 0])
+                    Vy[v_ptr[r] + s_idx] += <floatcomplexs_st> (d * phases[s, 1])
+                    Vz[v_ptr[r] + s_idx] += <floatcomplexs_st> (d * phases[s, 2])
 
     return csr_matrix((Vx, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vy, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vz, V_COL, V_PTR), shape=(nr, nr))
 
 
-def _phase3_csr_f64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                    np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                    np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                    numeric_real[:, ::1] D, const int idx,
-                    np.ndarray[np.float64_t, ndim=2, mode='c'] PHASES, const int p_opt):
 
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double[:, ::1] phases = PHASES
 
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef Py_ssize_t nr = v_ncol.shape[0]
-    cdef np.ndarray[np.float64_t, ndim=1, mode='c'] Vx = np.zeros([v_col.shape[0]], dtype=np.float64)
-    cdef np.ndarray[np.float64_t, ndim=1, mode='c'] Vy = np.zeros([v_col.shape[0]], dtype=np.float64)
-    cdef np.ndarray[np.float64_t, ndim=1, mode='c'] Vz = np.zeros([v_col.shape[0]], dtype=np.float64)
-    cdef double[::1] vx = Vx
-    cdef double[::1] vy = Vy
-    cdef double[::1] vz = Vz
-    cdef double d
-    cdef Py_ssize_t r, ind, s, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-                d = <double> D[ind, idx]
-                vx[v_ptr[r] + s_idx] += d * phases[ind, 0]
-                vy[v_ptr[r] + s_idx] += d * phases[ind, 1]
-                vz[v_ptr[r] + s_idx] += d * phases[ind, 2]
-
-    else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s = col[ind] / nr
-                s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-                d = <double> D[ind, idx]
-                vx[v_ptr[r] + s_idx] += d * phases[s, 0]
-                vy[v_ptr[r] + s_idx] += d * phases[s, 1]
-                vz[v_ptr[r] + s_idx] += d * phases[s, 2]
-
-    return csr_matrix((Vx, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vy, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vz, V_COL, V_PTR), shape=(nr, nr))
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase3_array(ints_st[::1] ptr,
+                  ints_st[::1] ncol,
+                  ints_st[::1] col,
+                  numerics_st[:, ::1] D,
+                  const int idx,
+                  floatcomplexs_st[:, ::1] phases,
+                  const int p_opt):
+    # Returns three dense (nr, nr) arrays: D[ind, idx] multiplied by phase columns 0, 1 and 2 respectively.
+    cdef ints_st nr = ncol.shape[0]
+
+    cdef object dtype = type2dtype[floatcomplexs_st](1)
+    cdef cnp.ndarray[floatcomplexs_st, ndim=2, mode='c'] Vx = np.zeros([nr, nr], dtype=dtype)
+    cdef cnp.ndarray[floatcomplexs_st, ndim=2, mode='c'] Vy = np.zeros([nr, nr], dtype=dtype)
+    cdef cnp.ndarray[floatcomplexs_st, ndim=2, mode='c'] Vz = np.zeros([nr, nr], dtype=dtype)
+
+    # Loop variables: r row, ind stored-entry index, s index into phases, c output column; d holds D[ind, idx]
+    cdef ints_st r, ind, s, c
+    cdef numerics_st d
+
+    with nogil:
+        if p_opt == 0:  # one phase triple per stored entry: phases[ind, 0..2]
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] % nr  # column modulo nr: column in the (nr, nr) output
+                    d = D[ind, idx]
+                    Vx[r, c] += <floatcomplexs_st> (d * phases[ind, 0])
+                    Vy[r, c] += <floatcomplexs_st> (d * phases[ind, 1])
+                    Vz[r, c] += <floatcomplexs_st> (d * phases[ind, 2])
+
+        else:  # one phase triple per column block: phases[col[ind] / nr, 0..2]
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] % nr
+                    s = col[ind] / nr  # NOTE(review): relies on `/` giving the integer quotient; `//` would be explicit - confirm
+                    d = D[ind, idx]
+                    Vx[r, c] += <floatcomplexs_st> (d * phases[s, 0])
+                    Vy[r, c] += <floatcomplexs_st> (d * phases[s, 1])
+                    Vz[r, c] += <floatcomplexs_st> (d * phases[s, 2])
 
-
-def _phase3_csr_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                    np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                    np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                    numeric_complex[:, ::1] D, const int idx,
-                    np.ndarray[np.complex64_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[:, ::1] phases = PHASES
-
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef Py_ssize_t nr = v_ncol.shape[0]
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] Vx = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] Vy = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] Vz = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef float complex[::1] vx = Vx
-    cdef float complex[::1] vy = Vy
-    cdef float complex[::1] vz = Vz
-    cdef float complex d
-    cdef Py_ssize_t r, ind, s, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-                d = <float complex> D[ind, idx]
-                vx[v_ptr[r] + s_idx] += d * phases[ind, 0]
-                vy[v_ptr[r] + s_idx] += d * phases[ind, 1]
-                vz[v_ptr[r] + s_idx] += d * phases[ind, 2]
-
-    else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s = col[ind] / nr
-                s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-                d = <float complex> D[ind, idx]
-                vx[v_ptr[r] + s_idx] += d * phases[s, 0]
-                vy[v_ptr[r] + s_idx] += d * phases[s, 1]
-                vz[v_ptr[r] + s_idx] += d * phases[s, 2]
-
-    return csr_matrix((Vx, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vy, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vz, V_COL, V_PTR), shape=(nr, nr))
+    return Vx, Vy, Vz
 
 
-def _phase3_csr_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                     np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                     np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                     numeric_complex[:, ::1] D, const int idx,
-                     np.ndarray[np.complex128_t, ndim=2, mode='c'] PHASES, const int p_opt):
+###
+# Non-collinear code
+###
 
-    # Convert to memory viewsz
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[:, ::1] phases = PHASES
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase3_csr_nc(ints_st[::1] ptr,
+                   ints_st[::1] ncol,
+                   ints_st[::1] col,
+                   numerics_st[:, ::1] D,
+                   complexs_st[:, ::1] phases,
+                   const int p_opt):
 
     # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef Py_ssize_t nr = v_ncol.shape[0]
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] Vx = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] Vy = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] Vz = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef double complex[::1] vx = Vx
-    cdef double complex[::1] vy = Vy
-    cdef double complex[::1] vz = Vz
-    cdef double complex d
-    cdef Py_ssize_t r, ind, s, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-                d = <double complex> D[ind, idx]
-                vx[v_ptr[r] + s_idx] += d * phases[ind, 0]
-                vy[v_ptr[r] + s_idx] += d * phases[ind, 1]
-                vz[v_ptr[r] + s_idx] += d * phases[ind, 2]
-
+    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(ptr, ncol, col)
+    cdef ints_st[::1] v_ptr = V_PTR
+    cdef ints_st[::1] v_ncol = V_NCOL
+    cdef ints_st[::1] v_col = V_COL
+
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, mode='c'] Vx = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef cnp.ndarray[complexs_st, mode='c'] Vy = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef cnp.ndarray[complexs_st, mode='c'] Vz = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef complexs_st ph, v12
+
+    # Local columns (not in NC form)
+    cdef ints_st nr = ncol.shape[0]
+    cdef ints_st r, rr, ind, s, c
+    cdef ints_st s_idx
+    cdef numerics_st *d
+    cdef _f_matrix_box_nc func
+    cdef complexs_st *M = [0, 0, 0, 0]
+
+    if numerics_st in complexs_st:
+        func = _matrix_box_nc_cmplx
     else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s = col[ind] / nr
-                s_idx = _index_sorted(v_col[v_ptr[r]:v_ptr[r] + v_ncol[r]], c)
-                d = <double complex> D[ind, idx]
-                vx[v_ptr[r] + s_idx] += d * phases[s, 0]
-                vy[v_ptr[r] + s_idx] += d * phases[s, 1]
-                vz[v_ptr[r] + s_idx] += d * phases[s, 2]
-
+        func = _matrix_box_nc_real
+
+    with nogil:
+        if p_opt == 0:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
+
+                    d = &D[ind, 0]
+
+                    ph = phases[ind, 0]
+                    func(d, ph, M)
+                    Vx[v_ptr[rr] + s_idx] += M[0]
+                    Vx[v_ptr[rr] + s_idx+1] += M[1]
+                    Vx[v_ptr[rr+1] + s_idx] += M[2]
+                    Vx[v_ptr[rr+1] + s_idx+1] += M[3]
+
+                    ph = phases[ind, 1]
+                    func(d, ph, M)
+                    Vy[v_ptr[rr] + s_idx] += M[0]
+                    Vy[v_ptr[rr] + s_idx+1] += M[1]
+                    Vy[v_ptr[rr+1] + s_idx] += M[2]
+                    Vy[v_ptr[rr+1] + s_idx+1] += M[3]
+
+                    ph = phases[ind, 2]
+                    func(d, ph, M)
+                    Vz[v_ptr[rr] + s_idx] += M[0]
+                    Vz[v_ptr[rr] + s_idx+1] += M[1]
+                    Vz[v_ptr[rr+1] + s_idx] += M[2]
+                    Vz[v_ptr[rr+1] + s_idx+1] += M[3]
+
+        else:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    s = col[ind] / nr
+
+                    s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
+
+                    d = &D[ind, 0]
+
+                    ph = phases[s, 0]
+                    func(d, ph, M)
+                    Vx[v_ptr[rr] + s_idx] += M[0]
+                    Vx[v_ptr[rr] + s_idx+1] += M[1]
+                    Vx[v_ptr[rr+1] + s_idx] += M[2]
+                    Vx[v_ptr[rr+1] + s_idx+1] += M[3]
+
+                    ph = phases[s, 1]
+                    func(d, ph, M)
+                    Vy[v_ptr[rr] + s_idx] += M[0]
+                    Vy[v_ptr[rr] + s_idx+1] += M[1]
+                    Vy[v_ptr[rr+1] + s_idx] += M[2]
+                    Vy[v_ptr[rr+1] + s_idx+1] += M[3]
+
+                    ph = phases[s, 2]
+                    func(d, ph, M)
+                    Vz[v_ptr[rr] + s_idx] += M[0]
+                    Vz[v_ptr[rr] + s_idx+1] += M[1]
+                    Vz[v_ptr[rr+1] + s_idx] += M[2]
+                    Vz[v_ptr[rr+1] + s_idx+1] += M[3]
+
+    nr = nr * 2
     return csr_matrix((Vx, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vy, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vz, V_COL, V_PTR), shape=(nr, nr))
 
 
-def _phase3_array_f32(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                      numeric_real[:, ::1] D, const int idx,
-                      np.ndarray[np.float32_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float[:, ::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.float32_t, ndim=2, mode='c'] Vx = np.zeros([nr, nr], dtype=np.float32)
-    cdef np.ndarray[np.float32_t, ndim=2, mode='c'] Vy = np.zeros([nr, nr], dtype=np.float32)
-    cdef np.ndarray[np.float32_t, ndim=2, mode='c'] Vz = np.zeros([nr, nr], dtype=np.float32)
-    cdef float[:, ::1] vx = Vx
-    cdef float[:, ::1] vy = Vy
-    cdef float[:, ::1] vz = Vz
-    cdef float d
-    cdef Py_ssize_t r, ind, s, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                d = <float> D[ind, idx]
-                vx[r, c] += d * phases[ind, 0]
-                vy[r, c] += d * phases[ind, 1]
-                vz[r, c] += d * phases[ind, 2]
-
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase3_array_nc(ints_st[::1] ptr,
+                     ints_st[::1] ncol,
+                     ints_st[::1] col,
+                     numerics_st[:, ::1] D,
+                     complexs_st[:, ::1] phases,
+                     const int p_opt):
+
+    cdef ints_st nr = ncol.shape[0]
+
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] Vx = np.zeros([nr * 2, nr * 2], dtype=dtype)
+    cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] Vy = np.zeros([nr * 2, nr * 2], dtype=dtype)
+    cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] Vz = np.zeros([nr * 2, nr * 2], dtype=dtype)
+
+    cdef complexs_st ph
+    cdef ints_st r, rr, ind, s, c
+    cdef ints_st s_idx
+    cdef numerics_st *d
+    cdef _f_matrix_box_nc func
+    cdef complexs_st *M = [0, 0, 0, 0]
+
+    if numerics_st in complexs_st:
+        func = _matrix_box_nc_cmplx
     else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s = col[ind] / nr
-                d = <float> D[ind, idx]
-                vx[r, c] += d * phases[s, 0]
-                vy[r, c] += d * phases[s, 1]
-                vz[r, c] += d * phases[s, 2]
+        func = _matrix_box_nc_real
+
+    with nogil:
+        if p_opt == 0:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+
+                    d = &D[ind, 0]
+
+                    ph = phases[ind, 0]
+                    func(d, ph, M)
+                    Vx[rr, c] += M[0]
+                    Vx[rr, c+1] += M[1]
+                    Vx[rr+1, c] += M[2]
+                    Vx[rr+1, c+1] += M[3]
+
+                    ph = phases[ind, 1]
+                    func(d, ph, M)
+                    Vy[rr, c] += M[0]
+                    Vy[rr, c+1] += M[1]
+                    Vy[rr+1, c] += M[2]
+                    Vy[rr+1, c+1] += M[3]
+
+                    ph = phases[ind, 2]
+                    func(d, ph, M)
+                    Vz[rr, c] += M[0]
+                    Vz[rr, c+1] += M[1]
+                    Vz[rr+1, c] += M[2]
+                    Vz[rr+1, c+1] += M[3]
+
+        else:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    s = col[ind] / nr
+
+                    d = &D[ind, 0]
+
+                    ph = phases[s, 0]
+                    func(d, ph, M)
+                    Vx[rr, c] += M[0]
+                    Vx[rr, c+1] += M[1]
+                    Vx[rr+1, c] += M[2]
+                    Vx[rr+1, c+1] += M[3]
+
+                    ph = phases[s, 1]
+                    func(d, ph, M)
+                    Vy[rr, c] += M[0]
+                    Vy[rr, c+1] += M[1]
+                    Vy[rr+1, c] += M[2]
+                    Vy[rr+1, c+1] += M[3]
+
+                    ph = phases[s, 2]
+                    func(d, ph, M)
+                    Vz[rr, c] += M[0]
+                    Vz[rr, c+1] += M[1]
+                    Vz[rr+1, c] += M[2]
+                    Vz[rr+1, c+1] += M[3]
 
     return Vx, Vy, Vz
 
 
-def _phase3_array_f64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                      numeric_real[:, ::1] D, const int idx,
-                      np.ndarray[np.float64_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double[:, ::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.float64_t, ndim=2, mode='c'] Vx = np.zeros([nr, nr], dtype=np.float64)
-    cdef np.ndarray[np.float64_t, ndim=2, mode='c'] Vy = np.zeros([nr, nr], dtype=np.float64)
-    cdef np.ndarray[np.float64_t, ndim=2, mode='c'] Vz = np.zeros([nr, nr], dtype=np.float64)
-    cdef double[:, ::1] vx = Vx
-    cdef double[:, ::1] vy = Vy
-    cdef double[:, ::1] vz = Vz
-    cdef double d
-    cdef Py_ssize_t r, ind, s, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                d = <double> D[ind, idx]
-                vx[r, c] += d * phases[ind, 0]
-                vy[r, c] += d * phases[ind, 1]
-                vz[r, c] += d * phases[ind, 2]
 
-    else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s = col[ind] / nr
-                d = <double> D[ind, idx]
-                vx[r, c] += d * phases[s, 0]
-                vy[r, c] += d * phases[s, 1]
-                vz[r, c] += d * phases[s, 2]
-
-    return Vx, Vy, Vz
+###
+# Spin-orbit coupling matrices
+###
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase3_csr_so(ints_st[::1] ptr,
+                   ints_st[::1] ncol,
+                   ints_st[::1] col,
+                   numerics_st[:, ::1] D,
+                   complexs_st[:, ::1] phases,
+                   const int p_opt):
 
-def _phase3_array_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                      numeric_complex[:, ::1] D, const int idx,
-                      np.ndarray[np.complex64_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory viezws
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[:, ::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] Vx = np.zeros([nr, nr], dtype=np.complex64)
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] Vy = np.zeros([nr, nr], dtype=np.complex64)
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] Vz = np.zeros([nr, nr], dtype=np.complex64)
-    cdef float complex[:, ::1] vx = Vx
-    cdef float complex[:, ::1] vy = Vy
-    cdef float complex[:, ::1] vz = Vz
-    cdef float complex d
-
-    cdef Py_ssize_t r, ind, s, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                d = <float complex> D[ind, idx]
-                vx[r, c] += d * phases[ind, 0]
-                vy[r, c] += d * phases[ind, 1]
-                vz[r, c] += d * phases[ind, 2]
-
+    # Now create the folded sparse elements
+    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(ptr, ncol, col)
+    cdef ints_st[::1] v_ptr = V_PTR
+    cdef ints_st[::1] v_ncol = V_NCOL
+    cdef ints_st[::1] v_col = V_COL
+
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, mode='c'] Vx = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef cnp.ndarray[complexs_st, mode='c'] Vy = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef cnp.ndarray[complexs_st, mode='c'] Vz = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef complexs_st ph
+
+    # Local columns (not in NC form)
+    cdef ints_st nr = ncol.shape[0]
+    cdef ints_st r, rr, ind, s, c
+    cdef ints_st s_idx
+    cdef _f_matrix_box_so func
+    cdef numerics_st *d
+    cdef complexs_st *M = [0, 0, 0, 0]
+
+    if numerics_st in complexs_st:
+        func = _matrix_box_so_cmplx
     else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s = col[ind] / nr
-                d = <float complex> D[ind, idx]
-                vx[r, c] += d * phases[s, 0]
-                vy[r, c] += d * phases[s, 1]
-                vz[r, c] += d * phases[s, 2]
-
-    return Vx, Vy, Vz
-
+        func = _matrix_box_so_real
+
+    with nogil:
+        if p_opt == 0:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
+
+                    d = &D[ind, 0]
+
+                    ph = phases[ind, 0]
+                    func(d, ph, M)
+                    Vx[v_ptr[rr] + s_idx] += M[0]
+                    Vx[v_ptr[rr] + s_idx+1] += M[1]
+                    Vx[v_ptr[rr+1] + s_idx] += M[2]
+                    Vx[v_ptr[rr+1] + s_idx+1] += M[3]
+
+                    ph = phases[ind, 1]
+                    func(d, ph, M)
+                    Vy[v_ptr[rr] + s_idx] += M[0]
+                    Vy[v_ptr[rr] + s_idx+1] += M[1]
+                    Vy[v_ptr[rr+1] + s_idx] += M[2]
+                    Vy[v_ptr[rr+1] + s_idx+1] += M[3]
+
+                    ph = phases[ind, 2]
+                    func(d, ph, M)
+                    Vz[v_ptr[rr] + s_idx] += M[0]
+                    Vz[v_ptr[rr] + s_idx+1] += M[1]
+                    Vz[v_ptr[rr+1] + s_idx] += M[2]
+                    Vz[v_ptr[rr+1] + s_idx+1] += M[3]
+
+        else:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    s = col[ind] / nr
+
+                    s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
+
+                    d = &D[ind, 0]
+
+                    ph = phases[s, 0]
+                    func(d, ph, M)
+                    Vx[v_ptr[rr] + s_idx] += M[0]
+                    Vx[v_ptr[rr] + s_idx+1] += M[1]
+                    Vx[v_ptr[rr+1] + s_idx] += M[2]
+                    Vx[v_ptr[rr+1] + s_idx+1] += M[3]
+
+                    ph = phases[s, 1]
+                    func(d, ph, M)
+                    Vy[v_ptr[rr] + s_idx] += M[0]
+                    Vy[v_ptr[rr] + s_idx+1] += M[1]
+                    Vy[v_ptr[rr+1] + s_idx] += M[2]
+                    Vy[v_ptr[rr+1] + s_idx+1] += M[3]
+
+                    ph = phases[s, 2]
+                    func(d, ph, M)
+                    Vz[v_ptr[rr] + s_idx] += M[0]
+                    Vz[v_ptr[rr] + s_idx+1] += M[1]
+                    Vz[v_ptr[rr+1] + s_idx] += M[2]
+                    Vz[v_ptr[rr+1] + s_idx+1] += M[3]
+
+    nr = nr * 2
+    return csr_matrix((Vx, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vy, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vz, V_COL, V_PTR), shape=(nr, nr))
 
-def _phase3_array_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                       numeric_complex[:, ::1] D, const int idx,
-                       np.ndarray[np.complex128_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[:, ::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] Vx = np.zeros([nr, nr], dtype=np.complex128)
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] Vy = np.zeros([nr, nr], dtype=np.complex128)
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] Vz = np.zeros([nr, nr], dtype=np.complex128)
-    cdef double complex[:, ::1] vx = Vx
-    cdef double complex[:, ::1] vy = Vy
-    cdef double complex[:, ::1] vz = Vz
-    cdef double complex d
-    cdef Py_ssize_t r, ind, s, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                d = <double complex> D[ind, idx]
-                vx[r, c] += d * phases[ind, 0]
-                vy[r, c] += d * phases[ind, 1]
-                vz[r, c] += d * phases[ind, 2]
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase3_array_so(ints_st[::1] ptr,
+                     ints_st[::1] ncol,
+                     ints_st[::1] col,
+                     numerics_st[:, ::1] D,
+                     complexs_st[:, ::1] phases,
+                     const int p_opt):
+
+    cdef ints_st nr = ncol.shape[0]
+
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] Vx = np.zeros([nr * 2, nr * 2], dtype=dtype)
+    cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] Vy = np.zeros([nr * 2, nr * 2], dtype=dtype)
+    cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] Vz = np.zeros([nr * 2, nr * 2], dtype=dtype)
+    cdef complexs_st[:, ::1] vx = Vx
+    cdef complexs_st[:, ::1] vy = Vy
+    cdef complexs_st[:, ::1] vz = Vz
+
+    cdef complexs_st ph
+    cdef ints_st r, rr, ind, s, c
+    cdef ints_st s_idx
+    cdef _f_matrix_box_so func
+    cdef numerics_st *d
+    cdef complexs_st *M = [0, 0, 0, 0]
+
+    if numerics_st in complexs_st:
+        func = _matrix_box_so_cmplx
     else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] % nr
-                s = col[ind] / nr
-                d = <double complex> D[ind, idx]
-                vx[r, c] += d * phases[s, 0]
-                vy[r, c] += d * phases[s, 1]
-                vz[r, c] += d * phases[s, 2]
+        func = _matrix_box_so_real
+
+    with nogil:
+        if p_opt == 0:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+
+                    d = &D[ind, 0]
+
+                    ph = phases[ind, 0]
+                    func(d, ph, M)
+                    vx[rr, c] += M[0]
+                    vx[rr, c+1] += M[1]
+                    vx[rr+1, c] += M[2]
+                    vx[rr+1, c+1] += M[3]
+
+                    ph = phases[ind, 1]
+                    func(d, ph, M)
+                    vy[rr, c] += M[0]
+                    vy[rr, c+1] += M[1]
+                    vy[rr+1, c] += M[2]
+                    vy[rr+1, c+1] += M[3]
+
+                    ph = phases[ind, 2]
+                    func(d, ph, M)
+                    vz[rr, c] += M[0]
+                    vz[rr, c+1] += M[1]
+                    vz[rr+1, c] += M[2]
+                    vz[rr+1, c+1] += M[3]
+
+        else:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = (col[ind] % nr) * 2
+                    s = col[ind] / nr
+
+                    d = &D[ind, 0]
+
+                    ph = phases[s, 0]
+                    func(d, ph, M)
+                    vx[rr, c] += M[0]
+                    vx[rr, c+1] += M[1]
+                    vx[rr+1, c] += M[2]
+                    vx[rr+1, c+1] += M[3]
+
+                    ph = phases[s, 1]
+                    func(d, ph, M)
+                    vy[rr, c] += M[0]
+                    vy[rr, c+1] += M[1]
+                    vy[rr+1, c] += M[2]
+                    vy[rr+1, c+1] += M[3]
+
+                    ph = phases[s, 2]
+                    func(d, ph, M)
+                    vz[rr, c] += M[0]
+                    vz[rr, c+1] += M[1]
+                    vz[rr+1, c] += M[2]
+                    vz[rr+1, c+1] += M[3]
 
     return Vx, Vy, Vz
diff --git a/src/sisl/physics/_matrix_phase3_nc.pyx b/src/sisl/physics/_matrix_phase3_nc.pyx
deleted file mode 100644
index 9aee5b8cf0..0000000000
--- a/src/sisl/physics/_matrix_phase3_nc.pyx
+++ /dev/null
@@ -1,366 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
-cimport cython
-
-import numpy as np
-cimport numpy as np
-from scipy.sparse import csr_matrix
-
-from sisl._indices cimport _index_sorted
-from sisl._core._sparse import fold_csr_matrix_nc
-
-__all__ = ["_phase3_nc_csr_c64", "_phase3_nc_csr_c128",
-           "_phase3_nc_array_c64", "_phase3_nc_array_c128"]
-
-# The fused data-types forces the data input to be of "correct" values.
-ctypedef fused numeric_complex:
-    float
-    double
-    float complex
-    double complex
-
-
-def _phase3_nc_csr_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                       numeric_complex[:, ::1] D,
-                       np.ndarray[np.complex64_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[:, ::1] phases = PHASES
-    # Local columns (not in NC form)
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] Vx = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] Vy = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] Vz = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef float complex[::1] vx = Vx
-    cdef float complex[::1] vy = Vy
-    cdef float complex[::1] vz = Vz
-    cdef float complex ph, v12
-    cdef Py_ssize_t r, rr, ind, s, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                ph = phases[ind, 0]
-                vx[v_ptr[rr] + s_idx] = vx[v_ptr[rr] + s_idx] + <float complex> (ph * D[ind, 0])
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[v_ptr[rr] + s_idx+1] = vx[v_ptr[rr] + s_idx+1] + ph * v12
-                vx[v_ptr[rr+1] + s_idx] = vx[v_ptr[rr+1] + s_idx] + ph * v12.conjugate()
-                vx[v_ptr[rr+1] + s_idx+1] = vx[v_ptr[rr+1] + s_idx+1] + <float complex> (ph * D[ind, 1])
-
-                ph = phases[ind, 1]
-                vy[v_ptr[rr] + s_idx] = vy[v_ptr[rr] + s_idx] + <float complex> (ph * D[ind, 0])
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[v_ptr[rr] + s_idx+1] = vy[v_ptr[rr] + s_idx+1] + ph * v12
-                vy[v_ptr[rr+1] + s_idx] = vy[v_ptr[rr+1] + s_idx] + ph * v12.conjugate()
-                vy[v_ptr[rr+1] + s_idx+1] = vy[v_ptr[rr+1] + s_idx+1] + <float complex> (ph * D[ind, 1])
-
-                ph = phases[ind, 2]
-                vz[v_ptr[rr] + s_idx] = vz[v_ptr[rr] + s_idx] + <float complex> (ph * D[ind, 0])
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[v_ptr[rr] + s_idx+1] = vz[v_ptr[rr] + s_idx+1] + ph * v12
-                vz[v_ptr[rr+1] + s_idx] = vz[v_ptr[rr+1] + s_idx] + ph * v12.conjugate()
-                vz[v_ptr[rr+1] + s_idx+1] = vz[v_ptr[rr+1] + s_idx+1] + <float complex> (ph * D[ind, 1])
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s = col[ind] / nr
-
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                ph = phases[s, 0]
-                vx[v_ptr[rr] + s_idx] = vx[v_ptr[rr] + s_idx] + <float complex> (ph * D[ind, 0])
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[v_ptr[rr] + s_idx+1] = vx[v_ptr[rr] + s_idx+1] + ph * v12
-                vx[v_ptr[rr+1] + s_idx] = vx[v_ptr[rr+1] + s_idx] + ph * v12.conjugate()
-                vx[v_ptr[rr+1] + s_idx+1] = vx[v_ptr[rr+1] + s_idx+1] + <float complex> (ph * D[ind, 1])
-
-                ph = phases[s, 1]
-                vy[v_ptr[rr] + s_idx] = vy[v_ptr[rr] + s_idx] + <float complex> (ph * D[ind, 0])
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[v_ptr[rr] + s_idx+1] = vy[v_ptr[rr] + s_idx+1] + ph * v12
-                vy[v_ptr[rr+1] + s_idx] = vy[v_ptr[rr+1] + s_idx] + ph * v12.conjugate()
-                vy[v_ptr[rr+1] + s_idx+1] = vy[v_ptr[rr+1] + s_idx+1] + <float complex> (ph * D[ind, 1])
-
-                ph = phases[s, 2]
-                vz[v_ptr[rr] + s_idx] = vz[v_ptr[rr] + s_idx] + <float complex> (ph * D[ind, 0])
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[v_ptr[rr] + s_idx+1] = vz[v_ptr[rr] + s_idx+1] + ph * v12
-                vz[v_ptr[rr+1] + s_idx] = vz[v_ptr[rr+1] + s_idx] + ph * v12.conjugate()
-                vz[v_ptr[rr+1] + s_idx+1] = vz[v_ptr[rr+1] + s_idx+1] + <float complex> (ph * D[ind, 1])
-
-    nr = nr * 2
-    return csr_matrix((Vx, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vy, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vz, V_COL, V_PTR), shape=(nr, nr))
-
-
-def _phase3_nc_csr_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                        np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                        np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                        numeric_complex[:, ::1] D,
-                        np.ndarray[np.complex128_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[:, ::1] phases = PHASES
-    # Local columns (not in NC form)
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] Vx = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] Vy = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] Vz = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef double complex[::1] vx = Vx
-    cdef double complex[::1] vy = Vy
-    cdef double complex[::1] vz = Vz
-    cdef double complex ph, v12
-    cdef Py_ssize_t r, rr, ind, s, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                ph = phases[ind, 0]
-                vx[v_ptr[rr] + s_idx] = vx[v_ptr[rr] + s_idx] + <double complex> (ph * D[ind, 0])
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[v_ptr[rr] + s_idx+1] = vx[v_ptr[rr] + s_idx+1] + ph * v12
-                vx[v_ptr[rr+1] + s_idx] = vx[v_ptr[rr+1] + s_idx] + ph * v12.conjugate()
-                vx[v_ptr[rr+1] + s_idx+1] = vx[v_ptr[rr+1] + s_idx+1] + <double complex> (ph * D[ind, 1])
-
-                ph = phases[ind, 1]
-                vy[v_ptr[rr] + s_idx] = vy[v_ptr[rr] + s_idx] + <double complex> (ph * D[ind, 0])
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[v_ptr[rr] + s_idx+1] = vy[v_ptr[rr] + s_idx+1] + ph * v12
-                vy[v_ptr[rr+1] + s_idx] = vy[v_ptr[rr+1] + s_idx] + ph * v12.conjugate()
-                vy[v_ptr[rr+1] + s_idx+1] = vy[v_ptr[rr+1] + s_idx+1] + <double complex> (ph * D[ind, 1])
-
-                ph = phases[ind, 2]
-                vz[v_ptr[rr] + s_idx] = vz[v_ptr[rr] + s_idx] + <double complex> (ph * D[ind, 0])
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[v_ptr[rr] + s_idx+1] = vz[v_ptr[rr] + s_idx+1] + ph * v12
-                vz[v_ptr[rr+1] + s_idx] = vz[v_ptr[rr+1] + s_idx] + ph * v12.conjugate()
-                vz[v_ptr[rr+1] + s_idx+1] = vz[v_ptr[rr+1] + s_idx+1] + <double complex> (ph * D[ind, 1])
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s = col[ind] / nr
-
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                ph = phases[s, 0]
-                vx[v_ptr[rr] + s_idx] = vx[v_ptr[rr] + s_idx] + <double complex> (ph * D[ind, 0])
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[v_ptr[rr] + s_idx+1] = vx[v_ptr[rr] + s_idx+1] + ph * v12
-                vx[v_ptr[rr+1] + s_idx] = vx[v_ptr[rr+1] + s_idx] + ph * v12.conjugate()
-                vx[v_ptr[rr+1] + s_idx+1] = vx[v_ptr[rr+1] + s_idx+1] + <double complex> (ph * D[ind, 1])
-
-                ph = phases[s, 1]
-                vy[v_ptr[rr] + s_idx] = vy[v_ptr[rr] + s_idx] + <double complex> (ph * D[ind, 0])
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[v_ptr[rr] + s_idx+1] = vy[v_ptr[rr] + s_idx+1] + ph * v12
-                vy[v_ptr[rr+1] + s_idx] = vy[v_ptr[rr+1] + s_idx] + ph * v12.conjugate()
-                vy[v_ptr[rr+1] + s_idx+1] = vy[v_ptr[rr+1] + s_idx+1] + <double complex> (ph * D[ind, 1])
-
-                ph = phases[s, 2]
-                vz[v_ptr[rr] + s_idx] = vz[v_ptr[rr] + s_idx] + <double complex> (ph * D[ind, 0])
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[v_ptr[rr] + s_idx+1] = vz[v_ptr[rr] + s_idx+1] + ph * v12
-                vz[v_ptr[rr+1] + s_idx] = vz[v_ptr[rr+1] + s_idx] + ph * v12.conjugate()
-                vz[v_ptr[rr+1] + s_idx+1] = vz[v_ptr[rr+1] + s_idx+1] + <double complex> (ph * D[ind, 1])
-
-    nr = nr * 2
-    return csr_matrix((Vx, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vy, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vz, V_COL, V_PTR), shape=(nr, nr))
-
-
-def _phase3_nc_array_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                         numeric_complex[:, ::1] D,
-                         np.ndarray[np.complex64_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[:, ::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] Vx = np.zeros([nr * 2, nr * 2], dtype=np.complex64)
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] Vy = np.zeros([nr * 2, nr * 2], dtype=np.complex64)
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] Vz = np.zeros([nr * 2, nr * 2], dtype=np.complex64)
-    cdef float complex[:, ::1] vx = Vx
-    cdef float complex[:, ::1] vy = Vy
-    cdef float complex[:, ::1] vz = Vz
-    cdef float complex ph, v12
-    cdef Py_ssize_t r, rr, ind, c, s
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-
-                ph = phases[ind, 0]
-                vx[rr, c] = vx[rr, c] + <float complex> (ph * D[ind, 0])
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[rr, c+1] = vx[rr, c+1] + ph * v12
-                vx[rr+1, c] = vx[rr+1, c] + ph * v12.conjugate()
-                vx[rr+1, c+1] = vx[rr+1, c+1] + <float complex> (ph * D[ind, 1])
-
-                ph = phases[ind, 1]
-                vy[rr, c] = vy[rr, c] + <float complex> (ph * D[ind, 0])
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[rr, c+1] = vy[rr, c+1] + ph * v12
-                vy[rr+1, c] = vy[rr+1, c] + ph * v12.conjugate()
-                vy[rr+1, c+1] = vy[rr+1, c+1] + <float complex> (ph * D[ind, 1])
-
-                ph = phases[ind, 2]
-                vz[rr, c] = vz[rr, c] + <float complex> (ph * D[ind, 0])
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[rr, c+1] = vz[rr, c+1] + ph * v12
-                vz[rr+1, c] = vz[rr+1, c] + ph * v12.conjugate()
-                vz[rr+1, c+1] = vz[rr+1, c+1] + <float complex> (ph * D[ind, 1])
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s = col[ind] / nr
-
-                ph = phases[s, 0]
-                vx[rr, c] = vx[rr, c] + <float complex> (ph * D[ind, 0])
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[rr, c+1] = vx[rr, c+1] + ph * v12
-                vx[rr+1, c] = vx[rr+1, c] + ph * v12.conjugate()
-                vx[rr+1, c+1] = vx[rr+1, c+1] + <float complex> (ph * D[ind, 1])
-
-                ph = phases[s, 1]
-                vy[rr, c] = vy[rr, c] + <float complex> (ph * D[ind, 0])
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[rr, c+1] = vy[rr, c+1] + ph * v12
-                vy[rr+1, c] = vy[rr+1, c] + ph * v12.conjugate()
-                vy[rr+1, c+1] = vy[rr+1, c+1] + <float complex> (ph * D[ind, 1])
-
-                ph = phases[s, 2]
-                vz[rr, c] = vz[rr, c] + <float complex> (ph * D[ind, 0])
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[rr, c+1] = vz[rr, c+1] + ph * v12
-                vz[rr+1, c] = vz[rr+1, c] + ph * v12.conjugate()
-                vz[rr+1, c+1] = vz[rr+1, c+1] + <float complex> (ph * D[ind, 1])
-
-    return Vx, Vy, Vz
-
-
-def _phase3_nc_array_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                          np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                          np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                          numeric_complex[:, ::1] D,
-                          np.ndarray[np.complex128_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[:, ::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] Vx = np.zeros([nr * 2, nr * 2], dtype=np.complex128)
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] Vy = np.zeros([nr * 2, nr * 2], dtype=np.complex128)
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] Vz = np.zeros([nr * 2, nr * 2], dtype=np.complex128)
-    cdef double complex[:, ::1] vx = Vx
-    cdef double complex[:, ::1] vy = Vy
-    cdef double complex[:, ::1] vz = Vz
-    cdef double complex ph, v12
-    cdef Py_ssize_t r, rr, ind, c, s
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-
-                ph = phases[ind, 0]
-                vx[rr, c] = vx[rr, c] + <double complex> (ph * D[ind, 0])
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[rr, c+1] = vx[rr, c+1] + ph * v12
-                vx[rr+1, c] = vx[rr+1, c] + ph * v12.conjugate()
-                vx[rr+1, c+1] = vx[rr+1, c+1] + <double complex> (ph * D[ind, 1])
-
-                ph = phases[ind, 1]
-                vy[rr, c] = vy[rr, c] + <double complex> (ph * D[ind, 0])
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[rr, c+1] = vy[rr, c+1] + ph * v12
-                vy[rr+1, c] = vy[rr+1, c] + ph * v12.conjugate()
-                vy[rr+1, c+1] = vy[rr+1, c+1] + <double complex> (ph * D[ind, 1])
-
-                ph = phases[ind, 2]
-                vz[rr, c] = vz[rr, c] + <double complex> (ph * D[ind, 0])
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[rr, c+1] = vz[rr, c+1] + ph * v12
-                vz[rr+1, c] = vz[rr+1, c] + ph * v12.conjugate()
-                vz[rr+1, c+1] = vz[rr+1, c+1] + <double complex> (ph * D[ind, 1])
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s = col[ind] / nr
-
-                ph = phases[s, 0]
-                vx[rr, c] = vx[rr, c] + <double complex> (ph * D[ind, 0])
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[rr, c+1] = vx[rr, c+1] + ph * v12
-                vx[rr+1, c] = vx[rr+1, c] + ph * v12.conjugate()
-                vx[rr+1, c+1] = vx[rr+1, c+1] + <double complex> (ph * D[ind, 1])
-
-                ph = phases[s, 1]
-                vy[rr, c] = vy[rr, c] + <double complex> (ph * D[ind, 0])
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[rr, c+1] = vy[rr, c+1] + ph * v12
-                vy[rr+1, c] = vy[rr+1, c] + ph * v12.conjugate()
-                vy[rr+1, c+1] = vy[rr+1, c+1] + <double complex> (ph * D[ind, 1])
-
-                ph = phases[s, 2]
-                vz[rr, c] = vz[rr, c] + <double complex> (ph * D[ind, 0])
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[rr, c+1] = vz[rr, c+1] + ph * v12
-                vz[rr+1, c] = vz[rr+1, c] + ph * v12.conjugate()
-                vz[rr+1, c+1] = vz[rr+1, c+1] + <double complex> (ph * D[ind, 1])
-
-    return Vx, Vy, Vz
diff --git a/src/sisl/physics/_matrix_phase3_so.pyx b/src/sisl/physics/_matrix_phase3_so.pyx
deleted file mode 100644
index 2fba15af46..0000000000
--- a/src/sisl/physics/_matrix_phase3_so.pyx
+++ /dev/null
@@ -1,438 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
-cimport cython
-
-import numpy as np
-cimport numpy as np
-from scipy.sparse import csr_matrix
-
-from sisl._indices cimport _index_sorted
-from sisl._core._sparse import fold_csr_matrix_nc
-
-__all__ = ["_phase3_so_csr_c64", "_phase3_so_csr_c128",
-           "_phase3_so_array_c64", "_phase3_so_array_c128"]
-
-# The fused data-types forces the data input to be of "correct" values.
-ctypedef fused numeric_complex:
-    float
-    double
-    float complex
-    double complex
-
-
-def _phase3_so_csr_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                       numeric_complex[:, ::1] D,
-                       np.ndarray[np.complex64_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[:, ::1] phases = PHASES
-    # Local columns (not in NC form)
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] Vx = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] Vy = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] Vz = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef float complex[::1] vx = Vx
-    cdef float complex[::1] vy = Vy
-    cdef float complex[::1] vz = Vz
-    cdef float complex ph, vv
-    cdef Py_ssize_t r, rr, ind, s, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                ph = phases[ind, 0]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                vx[v_ptr[rr] + s_idx] = vx[v_ptr[rr] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[v_ptr[rr] + s_idx+1] = vx[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                vx[v_ptr[rr+1] + s_idx] = vx[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                vx[v_ptr[rr+1] + s_idx+1] = vx[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-                ph = phases[ind, 1]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                vy[v_ptr[rr] + s_idx] = vy[v_ptr[rr] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[v_ptr[rr] + s_idx+1] = vy[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                vy[v_ptr[rr+1] + s_idx] = vy[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                vy[v_ptr[rr+1] + s_idx+1] = vy[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-                ph = phases[ind, 2]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                vz[v_ptr[rr] + s_idx] = vz[v_ptr[rr] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[v_ptr[rr] + s_idx+1] = vz[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                vz[v_ptr[rr+1] + s_idx] = vz[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                vz[v_ptr[rr+1] + s_idx+1] = vz[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s = col[ind] / nr
-
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                ph = phases[s, 0]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                vx[v_ptr[rr] + s_idx] = vx[v_ptr[rr] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[v_ptr[rr] + s_idx+1] = vx[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                vx[v_ptr[rr+1] + s_idx] = vx[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                vx[v_ptr[rr+1] + s_idx+1] = vx[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-                ph = phases[s, 1]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                vy[v_ptr[rr] + s_idx] = vy[v_ptr[rr] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[v_ptr[rr] + s_idx+1] = vy[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                vy[v_ptr[rr+1] + s_idx] = vy[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                vy[v_ptr[rr+1] + s_idx+1] = vy[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-                ph = phases[s, 2]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                vz[v_ptr[rr] + s_idx] = vz[v_ptr[rr] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[v_ptr[rr] + s_idx+1] = vz[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                vz[v_ptr[rr+1] + s_idx] = vz[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                vz[v_ptr[rr+1] + s_idx+1] = vz[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-    nr = nr * 2
-    return csr_matrix((Vx, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vy, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vz, V_COL, V_PTR), shape=(nr, nr))
-
-
-def _phase3_so_csr_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                        np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                        np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                        numeric_complex[:, ::1] D,
-                        np.ndarray[np.complex128_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[:, ::1] phases = PHASES
-    # Local columns (not in NC form)
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] Vx = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] Vy = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] Vz = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef double complex[::1] vx = Vx
-    cdef double complex[::1] vy = Vy
-    cdef double complex[::1] vz = Vz
-    cdef double complex ph, vv
-    cdef Py_ssize_t r, rr, ind, s, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                ph = phases[ind, 0]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                vx[v_ptr[rr] + s_idx] = vx[v_ptr[rr] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[v_ptr[rr] + s_idx+1] = vx[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                vx[v_ptr[rr+1] + s_idx] = vx[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                vx[v_ptr[rr+1] + s_idx+1] = vx[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-                ph = phases[ind, 1]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                vy[v_ptr[rr] + s_idx] = vy[v_ptr[rr] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[v_ptr[rr] + s_idx+1] = vy[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                vy[v_ptr[rr+1] + s_idx] = vy[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                vy[v_ptr[rr+1] + s_idx+1] = vy[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-                ph = phases[ind, 2]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                vz[v_ptr[rr] + s_idx] = vz[v_ptr[rr] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[v_ptr[rr] + s_idx+1] = vz[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                vz[v_ptr[rr+1] + s_idx] = vz[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                vz[v_ptr[rr+1] + s_idx+1] = vz[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s = col[ind] / nr
-
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                ph = phases[s, 0]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                vx[v_ptr[rr] + s_idx] = vx[v_ptr[rr] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[v_ptr[rr] + s_idx+1] = vx[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                vx[v_ptr[rr+1] + s_idx] = vx[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                vx[v_ptr[rr+1] + s_idx+1] = vx[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-                ph = phases[s, 1]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                vy[v_ptr[rr] + s_idx] = vy[v_ptr[rr] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[v_ptr[rr] + s_idx+1] = vy[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                vy[v_ptr[rr+1] + s_idx] = vy[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                vy[v_ptr[rr+1] + s_idx+1] = vy[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-                ph = phases[s, 2]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                vz[v_ptr[rr] + s_idx] = vz[v_ptr[rr] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[v_ptr[rr] + s_idx+1] = vz[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                vz[v_ptr[rr+1] + s_idx] = vz[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                vz[v_ptr[rr+1] + s_idx+1] = vz[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-    nr = nr * 2
-    return csr_matrix((Vx, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vy, V_COL, V_PTR), shape=(nr, nr)), csr_matrix((Vz, V_COL, V_PTR), shape=(nr, nr))
-
-
-def _phase3_so_array_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                         numeric_complex[:, ::1] D,
-                         np.ndarray[np.complex64_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[:, ::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] Vx = np.zeros([nr * 2, nr * 2], dtype=np.complex64)
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] Vy = np.zeros([nr * 2, nr * 2], dtype=np.complex64)
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] Vz = np.zeros([nr * 2, nr * 2], dtype=np.complex64)
-    cdef float complex[:, ::1] vx = Vx
-    cdef float complex[:, ::1] vy = Vy
-    cdef float complex[:, ::1] vz = Vz
-    cdef float complex ph, v12
-    cdef Py_ssize_t r, rr, ind, s, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-
-                ph = phases[ind, 0]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                vx[rr, c] = vx[rr, c] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[rr, c+1] = vx[rr, c+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                vx[rr+1, c] = vx[rr+1, c] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                vx[rr+1, c+1] = vx[rr+1, c+1] + ph * vv
-
-                ph = phases[ind, 1]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                vy[rr, c] = vy[rr, c] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[rr, c+1] = vy[rr, c+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                vy[rr+1, c] = vy[rr+1, c] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                vy[rr+1, c+1] = vy[rr+1, c+1] + ph * vv
-
-                ph = phases[ind, 2]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                vz[rr, c] = vz[rr, c] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[rr, c+1] = vz[rr, c+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                vz[rr+1, c] = vz[rr+1, c] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                vz[rr+1, c+1] = vz[rr+1, c+1] + ph * vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s = col[ind] / nr
-
-                ph = phases[s, 0]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                vx[rr, c] = vx[rr, c] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[rr, c+1] = vx[rr, c+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                vx[rr+1, c] = vx[rr+1, c] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                vx[rr+1, c+1] = vx[rr+1, c+1] + ph * vv
-
-                ph = phases[s, 1]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                vy[rr, c] = vy[rr, c] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[rr, c+1] = vy[rr, c+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                vy[rr+1, c] = vy[rr+1, c] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                vy[rr+1, c+1] = vy[rr+1, c+1] + ph * vv
-
-                ph = phases[s, 2]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                vz[rr, c] = vz[rr, c] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[rr, c+1] = vz[rr, c+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                vz[rr+1, c] = vz[rr+1, c] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                vz[rr+1, c+1] = vz[rr+1, c+1] + ph * vv
-
-    return Vx, Vy, Vz
-
-
-def _phase3_so_array_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                          np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                          np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                          numeric_complex[:, ::1] D,
-                          np.ndarray[np.complex128_t, ndim=2, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[:, ::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] Vx = np.zeros([nr * 2, nr * 2], dtype=np.complex128)
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] Vy = np.zeros([nr * 2, nr * 2], dtype=np.complex128)
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] Vz = np.zeros([nr * 2, nr * 2], dtype=np.complex128)
-    cdef double complex[:, ::1] vx = Vx
-    cdef double complex[:, ::1] vy = Vy
-    cdef double complex[:, ::1] vz = Vz
-    cdef double complex ph, vv
-    cdef Py_ssize_t r, rr, ind, s, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-
-                ph = phases[ind, 0]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                vx[rr, c] = vx[rr, c] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[rr, c+1] = vx[rr, c+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                vx[rr+1, c] = vx[rr+1, c] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                vx[rr+1, c+1] = vx[rr+1, c+1] + ph * vv
-
-                ph = phases[ind, 1]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                vy[rr, c] = vy[rr, c] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[rr, c+1] = vy[rr, c+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                vy[rr+1, c] = vy[rr+1, c] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                vy[rr+1, c+1] = vy[rr+1, c+1] + ph * vv
-
-                ph = phases[ind, 2]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                vz[rr, c] = vz[rr, c] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[rr, c+1] = vz[rr, c+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                vz[rr+1, c] = vz[rr+1, c] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                vz[rr+1, c+1] = vz[rr+1, c+1] + ph * vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s = col[ind] / nr
-
-                ph = phases[s, 0]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                vx[rr, c] = vx[rr, c] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vx[rr, c+1] = vx[rr, c+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                vx[rr+1, c] = vx[rr+1, c] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                vx[rr+1, c+1] = vx[rr+1, c+1] + ph * vv
-
-                ph = phases[s, 1]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                vy[rr, c] = vy[rr, c] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vy[rr, c+1] = vy[rr, c+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                vy[rr+1, c] = vy[rr+1, c] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                vy[rr+1, c+1] = vy[rr+1, c+1] + ph * vv
-
-                ph = phases[s, 2]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                vz[rr, c] = vz[rr, c] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                vz[rr, c+1] = vz[rr, c+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                vz[rr+1, c] = vz[rr+1, c] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                vz[rr+1, c+1] = vz[rr+1, c+1] + ph * vv
-
-    return Vx, Vy, Vz
diff --git a/src/sisl/physics/_matrix_phase_nc.pyx b/src/sisl/physics/_matrix_phase_nc.pyx
deleted file mode 100644
index 5b10ae6cf2..0000000000
--- a/src/sisl/physics/_matrix_phase_nc.pyx
+++ /dev/null
@@ -1,224 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
-cimport cython
-
-import numpy as np
-cimport numpy as np
-from scipy.sparse import csr_matrix
-
-from sisl._indices cimport _index_sorted
-from sisl._core._sparse import fold_csr_matrix_nc
-
-__all__ = ['_phase_nc_csr_c64', '_phase_nc_csr_c128',
-           '_phase_nc_array_c64', '_phase_nc_array_c128']
-
-# The fused data-types forces the data input to be of "correct" values.
-ctypedef fused numeric_complex:
-    float
-    double
-    float complex
-    double complex
-
-
-def _phase_nc_csr_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                      numeric_complex[:, ::1] D,
-                      np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef float complex[::1] v = V
-    cdef float complex ph, v12
-    cdef Py_ssize_t r, rr, ind, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[ind]
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                v[v_ptr[rr] + s_idx] += D[ind, 0] * ph
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + s_idx+1] += v12 * ph
-                v[v_ptr[rr+1] + s_idx] += v12.conjugate() * ph
-                v[v_ptr[rr+1] + s_idx+1] += D[ind, 1] * ph
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[col[ind] / nr]
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                v[v_ptr[rr] + s_idx] += D[ind, 0] * ph
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + s_idx+1] += v12 * ph
-                v[v_ptr[rr+1] + s_idx] += v12.conjugate() * ph
-                v[v_ptr[rr+1] + s_idx+1] += D[ind, 1] * ph
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nr * 2))
-
-
-def _phase_nc_csr_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                       numeric_complex[:, ::1] D,
-                       np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef double complex[::1] v = V
-    cdef double complex ph, v12
-    cdef Py_ssize_t r, rr, ind, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[ind]
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                v[v_ptr[rr] + s_idx] += D[ind, 0] * ph
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + s_idx+1] += v12 * ph
-                v[v_ptr[rr+1] + s_idx] += v12.conjugate() * ph
-                v[v_ptr[rr+1] + s_idx+1] += D[ind, 1] * ph
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[col[ind] / nr]
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                v[v_ptr[rr] + s_idx] += D[ind, 0] * ph
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + s_idx+1] += v12 * ph
-                v[v_ptr[rr+1] + s_idx] += v12.conjugate() * ph
-                v[v_ptr[rr+1] + s_idx+1] += D[ind, 1] * ph
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nr * 2))
-
-
-def _phase_nc_array_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                        np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                        np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                        numeric_complex[:, ::1] D,
-                        np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] V = np.zeros([nr * 2, nr * 2], dtype=np.complex64)
-    cdef float complex[:, ::1] v = V
-    cdef float complex ph, v12
-    cdef Py_ssize_t r, rr, ind, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[ind]
-                v[rr, c] += D[ind, 0] * ph
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] += v12 * ph
-                v[rr+1, c] += v12.conjugate() * ph
-                v[rr+1, c+1] += D[ind, 1] * ph
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[col[ind] / nr]
-                v[rr, c] += D[ind, 0] * ph
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] += v12 * ph
-                v[rr+1, c] += v12.conjugate() * ph
-                v[rr+1, c+1] += D[ind, 1] * ph
-
-    return V
-
-
-def _phase_nc_array_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                         numeric_complex[:, ::1] D,
-                         np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] V = np.zeros([nr * 2, nr * 2], dtype=np.complex128)
-    cdef double complex[:, ::1] v = V
-    cdef double complex ph, v12
-    cdef Py_ssize_t r, rr, ind, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[ind]
-                v[rr, c] += D[ind, 0] * ph
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] += v12 * ph
-                v[rr+1, c] += v12.conjugate() * ph
-                v[rr+1, c+1] += D[ind, 1] * ph
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[col[ind] / nr]
-                v[rr, c] += D[ind, 0] * ph
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] += v12 * ph
-                v[rr+1, c] += v12.conjugate() * ph
-                v[rr+1, c+1] += D[ind, 1] * ph
-
-    return V
diff --git a/src/sisl/physics/_matrix_phase_nc_diag.pyx b/src/sisl/physics/_matrix_phase_nc_diag.pyx
deleted file mode 100644
index 8a590f79c6..0000000000
--- a/src/sisl/physics/_matrix_phase_nc_diag.pyx
+++ /dev/null
@@ -1,198 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
-cimport cython
-
-import numpy as np
-cimport numpy as np
-from scipy.sparse import csr_matrix
-
-from sisl._indices cimport _index_sorted
-from sisl._core._sparse import fold_csr_diagonal_nc
-
-__all__ = ['_phase_nc_diag_csr_c64', '_phase_nc_diag_csr_c128',
-           '_phase_nc_diag_array_c64', '_phase_nc_diag_array_c128']
-
-# The fused data-types forces the data input to be of "correct" values.
-ctypedef fused numeric_complex:
-    float
-    double
-    float complex
-    double complex
-
-
-def _phase_nc_diag_csr_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                           np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                           np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                           numeric_complex[:, ::1] D, const int idx,
-                           np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_diagonal_nc(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef float complex[::1] v = V
-    cdef float complex vv
-    cdef Py_ssize_t r, rr, ind, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-                vv = <float complex> (phases[ind] * D[ind, idx])
-                v[v_ptr[rr] + s_idx] = v[v_ptr[rr] + s_idx] + vv
-                v[v_ptr[rr+1] + s_idx] = v[v_ptr[rr+1] + s_idx] + vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-                vv = <float complex> (phases[col[ind] / nr] * D[ind, idx])
-                v[v_ptr[rr] + s_idx] = v[v_ptr[rr] + s_idx] + vv
-                v[v_ptr[rr+1] + s_idx] = v[v_ptr[rr+1] + s_idx] + vv
-
-    nr = nr * 2
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr, nr))
-
-
-def _phase_nc_diag_csr_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                            np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                            np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                            numeric_complex[:, ::1] D, const int idx,
-                            np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_diagonal_nc(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef double complex[::1] v = V
-    cdef double complex vv
-    cdef Py_ssize_t r, rr, ind, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-                vv = <double complex> (phases[ind] * D[ind, idx])
-                v[v_ptr[rr] + s_idx] = v[v_ptr[rr] + s_idx] + vv
-                v[v_ptr[rr+1] + s_idx] = v[v_ptr[rr+1] + s_idx] + vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-                vv = <double complex> (phases[col[ind] / nr] * D[ind, idx])
-                v[v_ptr[rr] + s_idx] = v[v_ptr[rr] + s_idx] + vv
-                v[v_ptr[rr+1] + s_idx] = v[v_ptr[rr+1] + s_idx] + vv
-
-    nr = nr * 2
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr, nr))
-
-
-def _phase_nc_diag_array_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                             np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                             np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                             numeric_complex[:, ::1] D, const int idx,
-                             np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] V = np.zeros([nr * 2, nr * 2], dtype=np.complex64)
-    cdef float complex[:, ::1] v = V
-    cdef float complex vv
-    cdef Py_ssize_t r, rr, ind, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                vv = <float complex> (phases[ind] * D[ind, idx])
-                v[rr, c] = v[rr, c] + vv
-                v[rr+1, c+1] = v[rr+1, c+1] + vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                vv = <float complex> (phases[col[ind] / nr] * D[ind, idx])
-                v[rr, c] = v[rr, c] + vv
-                v[rr+1, c+1] = v[rr+1, c+1] + vv
-
-    return V
-
-
-def _phase_nc_diag_array_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                              np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                              np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                              numeric_complex[:, ::1] D, const int idx,
-                              np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] V = np.zeros([nr * 2, nr * 2], dtype=np.complex128)
-    cdef double complex[:, ::1] v = V
-    cdef double complex vv
-    cdef Py_ssize_t r, rr, ind, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                vv = <double complex> (phases[ind] * D[ind, idx])
-                v[rr, c] = v[rr, c] + vv
-                v[rr+1, c+1] = v[rr+1, c+1] + vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                vv = <double complex> (phases[col[ind] / nr] * D[ind, idx])
-                v[rr, c] = v[rr, c] + vv
-                v[rr+1, c+1] = v[rr+1, c+1] + vv
-
-    return V
diff --git a/src/sisl/physics/_matrix_phase_sc.pyx b/src/sisl/physics/_matrix_phase_sc.pyx
new file mode 100644
index 0000000000..ad9ede2faf
--- /dev/null
+++ b/src/sisl/physics/_matrix_phase_sc.pyx
@@ -0,0 +1,675 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+cimport cython
+
+import numpy as np
+
+cimport numpy as cnp
+
+from scipy.sparse import csr_matrix
+
+from sisl._core._dtypes cimport (
+    complexs_st,
+    floatcomplexs_st,
+    floats_st,
+    inline_sum,
+    ints_st,
+    numerics_st,
+    ssize_st,
+    type2dtype,
+)
+from sisl._core._sparse cimport ncol2ptr_nc
+from sisl._indices cimport _index_sorted
+
+from ._matrix_utils cimport (
+    _f_matrix_box_nc,
+    _f_matrix_box_so,
+    _matrix_box_nc_cmplx,
+    _matrix_box_nc_real,
+    _matrix_box_so_cmplx,
+    _matrix_box_so_real,
+)
+
+__all__ = [
+    "_phase_sc_csr",
+    "_phase_sc_array",
+    "_phase_sc_csr_nc",
+    "_phase_sc_array_nc",
+    "_phase_sc_csr_nc_diag",
+    "_phase_sc_array_nc_diag",
+    "_phase_sc_csr_so",
+    "_phase_sc_array_so",
+]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_sc_csr(ints_st[::1] ptr,
+                  ints_st[::1] ncol,
+                  ints_st[::1] col,
+                  const ints_st nc,
+                  numerics_st[:, ::1] D,
+                  const int idx,
+                  floatcomplexs_st[::1] phases,
+                  const int p_opt):
+    # Copy column `idx` of D, optionally scaled by a phase, into a new (nr, nc) scipy csr_matrix.
+    # Now copy the sparse matrix form
+    cdef ints_st nr = ncol.shape[0]
+    cdef object idtype = type2dtype[ints_st](1)  # presumably the NumPy dtype matching ints_st -- confirm in _dtypes
+    cdef cnp.ndarray[ints_st, mode='c'] V_PTR = np.empty([nr + 1], dtype=idtype)
+    cdef cnp.ndarray[ints_st, mode='c'] V_NCOL = np.empty([nr], dtype=idtype)
+    cdef cnp.ndarray[ints_st, mode='c'] V_COL = np.empty([inline_sum(ncol)], dtype=idtype)
+
+    cdef ints_st[::1] v_ptr = V_PTR
+    cdef ints_st[::1] v_ncol = V_NCOL
+    cdef ints_st[::1] v_col = V_COL
+
+    cdef object dtype = type2dtype[floatcomplexs_st](1)
+    cdef cnp.ndarray[floatcomplexs_st, mode='c'] V = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef floatcomplexs_st[::1] v = V
+
+    cdef ints_st r, c, nz, ind, cind  # NOTE(review): c and nz are never used in this function
+    cdef floatcomplexs_st ph
+
+    # Copy ncol (V_NCOL is filled here but is not part of the returned csr_matrix)
+    v_ncol[:] = ncol[:]
+
+    # Rows are read as ptr[r] .. ptr[r] + ncol[r] and written back-to-back at cind (non-finalized CSR is fine)
+    cind = 0
+
+    with nogil:
+        if p_opt == -1:  # no phase: plain copy of D[ind, idx]
+            for r in range(nr):
+                v_ptr[r] = cind
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    v[cind] = <floatcomplexs_st> D[ind, idx]
+                    v_col[cind] = col[ind]
+                    cind = cind + 1
+
+        elif p_opt == 0:  # one phase per stored element: phases[ind]
+            for r in range(nr):
+                v_ptr[r] = cind
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    ph = phases[ind]
+                    v[cind] = <floatcomplexs_st> (D[ind, idx] * ph)
+                    v_col[cind] = col[ind]
+                    cind = cind + 1
+
+        else:  # any other p_opt: phase looked up by the quotient col[ind] / nr
+            for r in range(nr):
+                v_ptr[r] = cind
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    ph = phases[col[ind] / nr]
+                    v[cind] = <floatcomplexs_st> (D[ind, idx] * ph)
+                    v_col[cind] = col[ind]
+                    cind = cind + 1
+
+    v_ptr[nr] = cind  # closing row pointer: total number of elements written
+
+    return csr_matrix((V, V_COL, V_PTR), shape=(nr, nc))
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_sc_array(ints_st[::1] ptr,
+                    ints_st[::1] ncol,
+                    ints_st[::1] col,
+                    const ints_st nc,
+                    numerics_st[:, ::1] D,
+                    const int idx,
+                    floatcomplexs_st[::1] phases,
+                    const int p_opt):
+    # Scatter column `idx` of D, optionally scaled by a phase, into a dense (nr, nc) array.
+    cdef ints_st nr = ncol.shape[0]
+
+    cdef object dtype = type2dtype[floatcomplexs_st](1)  # presumably the NumPy dtype matching floatcomplexs_st -- confirm
+    cdef cnp.ndarray[floatcomplexs_st, ndim=2, mode='c'] V = np.zeros([nr, nc], dtype=dtype)
+    cdef floatcomplexs_st[:, ::1] v = V
+
+    cdef ints_st r, c, nz, ind  # NOTE(review): c and nz are never used in this function
+    cdef floatcomplexs_st ph
+
+    with nogil:
+        if p_opt == -1:  # no phase: plain copy of D[ind, idx]
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    v[r, col[ind]] = <floatcomplexs_st> D[ind, idx]  # assignment, not accumulation
+
+        elif p_opt == 0:  # one phase per stored element: phases[ind]
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    ph = phases[ind]
+                    v[r, col[ind]] = <floatcomplexs_st> (D[ind, idx] * ph)
+
+        else:  # any other p_opt: phase looked up by the quotient col[ind] / nr
+            for r in range(nr):
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    ph = phases[col[ind] / nr]
+                    v[r, col[ind]] = <floatcomplexs_st> (D[ind, idx] * ph)
+
+    return V
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_sc_csr_nc(ints_st[::1] ptr,
+                     ints_st[::1] ncol,
+                     ints_st[::1] col,
+                     const ints_st nc,
+                     numerics_st[:, ::1] D,
+                     complexs_st[::1] phases,
+                     const int p_opt):
+    # Expand every stored element into a 2x2 box and return a (2*nr, 2*nc) scipy csr_matrix.
+    # Now copy the sparse matrix form
+    cdef ints_st nr = ncol.shape[0]
+    cdef object idtype = type2dtype[ints_st](1)  # presumably the NumPy dtype matching ints_st -- confirm in _dtypes
+    cdef cnp.ndarray[ints_st, mode='c'] V_PTR = np.empty([nr*2 + 1], dtype=idtype)
+    cdef cnp.ndarray[ints_st, mode='c'] V_NCOL = np.empty([nr*2], dtype=idtype)
+    cdef cnp.ndarray[ints_st, mode='c'] V_COL = np.empty([inline_sum(ncol)*4], dtype=idtype)
+
+    cdef ints_st[::1] v_ptr = V_PTR
+    cdef ints_st[::1] v_ncol = V_NCOL
+    cdef ints_st[::1] v_col = V_COL
+
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, mode='c'] V = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef complexs_st[::1] v = V
+
+    cdef ints_st r, rr, cind, c, nz, ind  # NOTE(review): nz is never used in this function
+    cdef complexs_st ph
+    cdef _f_matrix_box_nc func
+    cdef numerics_st *d
+    cdef complexs_st *M = [0, 0, 0, 0]  # scratch for the four box values written by func
+
+    if numerics_st in complexs_st:  # pick the helper matching whether D holds complex or real values
+        func = _matrix_box_nc_cmplx
+    else:
+        func = _matrix_box_nc_real
+
+    # We have to do it manually due to the double elements per matrix element
+    ncol2ptr_nc(nr, ncol, v_ptr, 2)  # presumably fills v_ptr for the 2*nr output rows -- see sisl._core._sparse
+
+    with nogil:
+        if p_opt == -1:  # no phase: box is built directly from D columns 0..3
+            for r in range(nr):
+                rr = r * 2
+                v_ncol[rr] = ncol[r] * 2
+                v_ncol[rr+1] = ncol[r] * 2
+
+                cind = 0  # write offset within output rows rr and rr+1
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+
+                    v[v_ptr[rr] + cind] = <complexs_st> D[ind, 0]
+                    v_col[v_ptr[rr] + cind] = c
+                    ph = <complexs_st> (D[ind, 2] + 1j * D[ind, 3])  # ph is reused here as the off-diagonal value
+                    v[v_ptr[rr] + cind+1] = ph
+                    v_col[v_ptr[rr] + cind+1] = c + 1
+                    v[v_ptr[rr+1] + cind] = ph.conjugate()
+                    v_col[v_ptr[rr+1] + cind] = c
+                    v[v_ptr[rr+1] + cind+1] = <complexs_st> D[ind, 1]
+                    v_col[v_ptr[rr+1] + cind+1] = c + 1
+
+                    cind = cind + 2
+
+        elif p_opt == 0:  # one phase per stored element: phases[ind]
+            for r in range(nr):
+                rr = r * 2
+                v_ncol[rr] = ncol[r] * 2
+                v_ncol[rr+1] = ncol[r] * 2
+
+                cind = 0
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    ph = phases[ind]
+
+                    d = &D[ind, 0]  # pointer to the start of row ind of D
+                    func(d, ph, M)  # helper from _matrix_utils; the values read back below are M[0..3]
+                    v[v_ptr[rr] + cind] = M[0]
+                    v_col[v_ptr[rr] + cind] = c
+                    v[v_ptr[rr] + cind+1] = M[1]
+                    v_col[v_ptr[rr] + cind+1] = c + 1
+                    v[v_ptr[rr+1] + cind] = M[2]
+                    v_col[v_ptr[rr+1] + cind] = c
+                    v[v_ptr[rr+1] + cind+1] = M[3]
+                    v_col[v_ptr[rr+1] + cind+1] = c + 1
+
+                    cind = cind + 2
+
+        else:  # any other p_opt: phase looked up by the quotient col[ind] / nr
+            for r in range(nr):
+                rr = r * 2
+                v_ncol[rr] = ncol[r] * 2
+                v_ncol[rr+1] = ncol[r] * 2
+
+                cind = 0
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    ph = phases[col[ind] / nr]
+
+                    d = &D[ind, 0]
+                    func(d, ph, M)
+
+                    v[v_ptr[rr] + cind] = M[0]
+                    v_col[v_ptr[rr] + cind] = c
+                    v[v_ptr[rr] + cind+1] = M[1]
+                    v_col[v_ptr[rr] + cind+1] = c + 1
+                    v[v_ptr[rr+1] + cind] = M[2]
+                    v_col[v_ptr[rr+1] + cind] = c
+                    v[v_ptr[rr+1] + cind+1] = M[3]
+                    v_col[v_ptr[rr+1] + cind+1] = c + 1
+
+                    cind = cind + 2
+
+    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nc * 2))
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_sc_array_nc(ints_st[::1] ptr,
+                       ints_st[::1] ncol,
+                       ints_st[::1] col,
+                       const ints_st nc,
+                       numerics_st[:, ::1] D,
+                       complexs_st[::1] phases,
+                       const int p_opt):
+    # Expand every stored element into a 2x2 box inside a dense (2*nr, 2*nc) array.
+    cdef ints_st nr = ncol.shape[0]
+
+    cdef object dtype = type2dtype[complexs_st](1)  # presumably the NumPy dtype matching complexs_st -- confirm
+    cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] V = np.zeros([nr*2, nc*2], dtype=dtype)
+    cdef complexs_st[:, ::1] v = V
+
+    cdef complexs_st ph
+    cdef ints_st r, rr, c, nz, ind  # NOTE(review): nz is never used in this function
+    cdef numerics_st *d
+    cdef _f_matrix_box_nc func
+    cdef complexs_st *M = [0, 0, 0, 0]  # scratch for the four box values written by func
+
+    if numerics_st in complexs_st:  # pick the helper matching whether D holds complex or real values
+        func = _matrix_box_nc_cmplx
+    else:
+        func = _matrix_box_nc_real
+
+    with nogil:
+        if p_opt == -1:  # no phase: box is built directly from D columns 0..3
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    v[rr, c] = <complexs_st> D[ind, 0]
+                    ph = <complexs_st> (D[ind, 2] + 1j * D[ind, 3])  # ph is reused here as the off-diagonal value
+                    v[rr, c+1] = ph
+                    v[rr+1, c] = ph.conjugate()
+                    v[rr+1, c+1] = <complexs_st> D[ind, 1]
+
+        elif p_opt == 0:  # one phase per stored element: phases[ind]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    ph = phases[ind]
+
+                    d = &D[ind, 0]  # pointer to the start of row ind of D
+                    func(d, ph, M)  # helper from _matrix_utils; the values read back below are M[0..3]
+                    v[rr, c] = M[0]
+                    v[rr, c+1] = M[1]
+                    v[rr+1, c] = M[2]
+                    v[rr+1, c+1] = M[3]
+
+        else:  # any other p_opt: phase looked up by the quotient col[ind] / nr
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    ph = phases[col[ind] / nr]
+
+                    d = &D[ind, 0]
+                    func(d, ph, M)
+                    v[rr, c] = M[0]
+                    v[rr, c+1] = M[1]
+                    v[rr+1, c] = M[2]
+                    v[rr+1, c+1] = M[3]
+
+    return V
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_sc_csr_nc_diag(ints_st[::1] ptr,
+                          ints_st[::1] ncol,
+                          ints_st[::1] col,
+                          const ints_st nc,
+                          numerics_st[:, ::1] D,
+                          const int idx,
+                          complexs_st[::1] phases,
+                          const int p_opt):
+    # Put D[ind, idx] (optionally phase-scaled) on both diagonal slots of a 2x2 box; returns a (2*nr, 2*nc) csr_matrix.
+    # Now copy the sparse matrix form
+    cdef ints_st nr = ncol.shape[0]
+    cdef object idtype = type2dtype[ints_st](1)  # presumably the NumPy dtype matching ints_st -- confirm in _dtypes
+    cdef cnp.ndarray[ints_st, mode='c'] V_PTR = np.empty([nr*2 + 1], dtype=idtype)
+    cdef cnp.ndarray[ints_st, mode='c'] V_NCOL = np.empty([nr*2], dtype=idtype)
+    cdef cnp.ndarray[ints_st, mode='c'] V_COL = np.empty([inline_sum(ncol)*2], dtype=idtype)
+
+    cdef ints_st[::1] v_ptr = V_PTR
+    cdef ints_st[::1] v_ncol = V_NCOL
+    cdef ints_st[::1] v_col = V_COL
+
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, mode='c'] V = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef complexs_st[::1] v = V
+
+    cdef ints_st r, rr, cind, c, ind
+    cdef complexs_st ph
+
+    # Two output rows per input row, each holding ncol[r] entries (only the box diagonal is stored)
+    ncol2ptr_nc(nr, ncol, v_ptr, 1)  # presumably fills v_ptr for the 2*nr output rows -- see sisl._core._sparse
+
+    with nogil:
+        if p_opt == -1:  # no phase: plain copy of D[ind, idx]
+            for r in range(nr):
+                rr = r * 2
+                v_ncol[rr] = ncol[r]
+                v_ncol[rr+1] = ncol[r]
+
+                cind = 0  # write offset within output rows rr and rr+1
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+
+                    v[v_ptr[rr] + cind] = <complexs_st> D[ind, idx]
+                    v_col[v_ptr[rr] + cind] = c
+                    v[v_ptr[rr+1] + cind] = <complexs_st> D[ind, idx]
+                    v_col[v_ptr[rr+1] + cind] = c + 1
+
+                    cind = cind + 1
+
+        elif p_opt == 0:  # one phase per stored element: phases[ind]
+            for r in range(nr):
+                rr = r * 2
+                v_ncol[rr] = ncol[r]  # one entry per input element, same as the p_opt == -1 branch
+                v_ncol[rr+1] = ncol[r]
+
+                cind = 0
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    ph = phases[ind]
+
+                    v[v_ptr[rr] + cind] = <complexs_st> (D[ind, idx] * ph)
+                    v_col[v_ptr[rr] + cind] = c
+                    v[v_ptr[rr+1] + cind] = <complexs_st> (D[ind, idx] * ph)
+                    v_col[v_ptr[rr+1] + cind] = c + 1
+
+                    cind = cind + 1
+
+        else:  # any other p_opt: phase looked up by the quotient col[ind] / nr
+            for r in range(nr):
+                rr = r * 2
+                v_ncol[rr] = ncol[r]  # one entry per input element, same as the p_opt == -1 branch
+                v_ncol[rr+1] = ncol[r]
+
+                cind = 0
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    ph = phases[col[ind] / nr]
+
+
+                    v[v_ptr[rr] + cind] = <complexs_st> (D[ind, idx] * ph)
+                    v_col[v_ptr[rr] + cind] = c
+                    v[v_ptr[rr+1] + cind] = <complexs_st> (D[ind, idx] * ph)
+                    v_col[v_ptr[rr+1] + cind] = c + 1
+
+                    cind = cind + 1
+
+    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nc * 2))
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_sc_array_nc_diag(ints_st[::1] ptr,
+                            ints_st[::1] ncol,
+                            ints_st[::1] col,
+                            const ints_st nc,
+                            numerics_st[:, ::1] D,
+                            const int idx,
+                            complexs_st[::1] phases,
+                            const int p_opt):
+    # Put D[ind, idx] (optionally phase-scaled) on both diagonal slots of a 2x2 box in a dense (2*nr, 2*nc) array.
+    cdef ints_st nr = ncol.shape[0]
+
+    cdef object dtype = type2dtype[complexs_st](1)  # presumably the NumPy dtype matching complexs_st -- confirm
+    cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] V = np.zeros([nr*2, nc*2], dtype=dtype)
+    cdef complexs_st[:, ::1] v = V
+
+    cdef complexs_st d  # value written to both diagonal slots of the current box
+    cdef ints_st r, rr, c, nz, ind  # NOTE(review): nz is never used in this function
+
+    with nogil:
+        if p_opt == -1:  # no phase: plain copy of D[ind, idx]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    d = <complexs_st> D[ind, idx]
+                    v[rr, c] = d
+                    v[rr+1, c+1] = d
+
+        elif p_opt == 0:  # one phase per stored element: phases[ind]
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    d = <complexs_st> (D[ind, idx] * phases[ind])
+
+                    v[rr, c] = d
+                    v[rr+1, c+1] = d
+
+        else:  # any other p_opt: phase looked up by the quotient col[ind] / nr
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    d = <complexs_st> (D[ind, idx] * phases[col[ind] / nr])
+
+                    v[rr, c] = d
+                    v[rr+1, c+1] = d
+
+    return V
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_sc_csr_so(ints_st[::1] ptr,
+                     ints_st[::1] ncol,
+                     ints_st[::1] col,
+                     const ints_st nc,
+                     numerics_st[:, ::1] D,
+                     complexs_st[::1] phases,
+                     const int p_opt):
+    # Return a (2*nr, 2*nc) csr_matrix in which every stored element of the input becomes a full 2x2 block.
+    # Allocate the index arrays of the result (four output entries are written per stored input element)
+    cdef ints_st nr = ncol.shape[0]
+    cdef object idtype = type2dtype[ints_st](1)
+    cdef cnp.ndarray[ints_st, mode='c'] V_PTR = np.empty([nr*2 + 1], dtype=idtype)
+    cdef cnp.ndarray[ints_st, mode='c'] V_NCOL = np.empty([nr*2], dtype=idtype)
+    cdef cnp.ndarray[ints_st, mode='c'] V_COL = np.empty([inline_sum(ncol)*4], dtype=idtype)
+    # v_ncol is filled in the loops below, but only V, V_COL and V_PTR go into the returned csr_matrix.
+    cdef ints_st[::1] v_ptr = V_PTR
+    cdef ints_st[::1] v_ncol = V_NCOL
+    cdef ints_st[::1] v_col = V_COL
+
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, mode='c'] V = np.zeros([v_col.shape[0]], dtype=dtype)
+    cdef complexs_st[::1] v = V
+
+    cdef ints_st r, rr, cind, c, ind
+    cdef complexs_st ph
+    cdef _f_matrix_box_so func
+    cdef numerics_st *d
+    cdef complexs_st *M = [0, 0, 0, 0]
+    # Pick the block builder matching the element type of D; M is the 4-entry scratch buffer read back after each call.
+    if numerics_st in complexs_st:
+        func = _matrix_box_so_cmplx
+    else:
+        func = _matrix_box_so_real
+
+    # We have to do it manually due to the double elements per matrix element
+    ncol2ptr_nc(nr, ncol, v_ptr, 2)
+    # p_opt == -1: no phase factor; p_opt == 0: phases[ind]; any other value: phases[col[ind] / nr].
+    with nogil:
+        if p_opt == -1:
+            for r in range(nr):
+                rr = r * 2
+                v_ncol[rr] = ncol[r] * 2
+                v_ncol[rr+1] = ncol[r] * 2
+
+                cind = 0
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    # NOTE(review): columns 0..7 of D are read directly here, without the `func` dispatch — confirm this is valid when D is complex.
+                    v[v_ptr[rr] + cind] = <complexs_st> (D[ind, 0] + 1j * D[ind, 4])
+                    v_col[v_ptr[rr] + cind] = c
+                    v[v_ptr[rr] + cind+1] = <complexs_st> (D[ind, 2] + 1j * D[ind, 3])
+                    v_col[v_ptr[rr] + cind+1] = c + 1
+                    v[v_ptr[rr+1] + cind] = <complexs_st> (D[ind, 6] + 1j * D[ind, 7])
+                    v_col[v_ptr[rr+1] + cind] = c
+                    v[v_ptr[rr+1] + cind+1] = <complexs_st> (D[ind, 1] + 1j * D[ind, 5])
+                    v_col[v_ptr[rr+1] + cind+1] = c + 1
+
+                    cind = cind + 2
+
+        elif p_opt == 0:
+            for r in range(nr):
+                rr = r * 2
+                v_ncol[rr] = ncol[r] * 2
+                v_ncol[rr+1] = ncol[r] * 2
+
+                cind = 0
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    ph = phases[ind]
+
+                    d = &D[ind, 0]
+                    func(d, ph, M)
+
+                    v[v_ptr[rr] + cind] = M[0]
+                    v_col[v_ptr[rr] + cind] = c
+                    v[v_ptr[rr] + cind+1] = M[1]
+                    v_col[v_ptr[rr] + cind+1] = c + 1
+                    v[v_ptr[rr+1] + cind] = M[2]
+                    v_col[v_ptr[rr+1] + cind] = c
+                    v[v_ptr[rr+1] + cind+1] = M[3]
+                    v_col[v_ptr[rr+1] + cind+1] = c + 1
+
+                    cind = cind + 2
+
+        else:
+            for r in range(nr):
+                rr = r * 2
+                v_ncol[rr] = ncol[r] * 2
+                v_ncol[rr+1] = ncol[r] * 2
+
+                cind = 0
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    ph = phases[col[ind] / nr]
+
+                    d = &D[ind, 0]
+                    func(d, ph, M)
+
+                    v[v_ptr[rr] + cind] = M[0]
+                    v_col[v_ptr[rr] + cind] = c
+                    v[v_ptr[rr] + cind+1] = M[1]
+                    v_col[v_ptr[rr] + cind+1] = c + 1
+                    v[v_ptr[rr+1] + cind] = M[2]
+                    v_col[v_ptr[rr+1] + cind] = c
+                    v[v_ptr[rr+1] + cind+1] = M[3]
+                    v_col[v_ptr[rr+1] + cind+1] = c + 1
+
+                    cind = cind + 2
+
+    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nc * 2))
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+def _phase_sc_array_so(ints_st[::1] ptr,
+                       ints_st[::1] ncol,
+                       ints_st[::1] col,
+                       const ints_st nc,
+                       numerics_st[:, ::1] D,
+                       complexs_st[::1] phases,
+                       const int p_opt):
+    # Return a dense (2*nr, 2*nc) complex array in which every stored element of the input fills a full 2x2 block.
+    cdef ints_st nr = ncol.shape[0]
+
+    cdef object dtype = type2dtype[complexs_st](1)
+    cdef cnp.ndarray[complexs_st, ndim=2, mode='c'] V = np.zeros([nr*2, nc*2], dtype=dtype)
+    cdef complexs_st[:, ::1] v = V
+
+    cdef complexs_st ph
+    cdef ints_st r, rr, c, ind
+    cdef _f_matrix_box_so func
+    cdef numerics_st *d
+    cdef complexs_st *M = [0, 0, 0, 0]
+    # Pick the block builder matching the element type of D; M is the 4-entry scratch buffer read back after each call.
+    if numerics_st in complexs_st:
+        func = _matrix_box_so_cmplx
+    else:
+        func = _matrix_box_so_real
+    # p_opt == -1: no phase factor; p_opt == 0: phases[ind]; any other value: phases[col[ind] / nr].
+    with nogil:
+        if p_opt == -1:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    # NOTE(review): columns 0..7 of D are read directly here, without the `func` dispatch — confirm this is valid when D is complex.
+                    v[rr, c] = <complexs_st> (D[ind, 0] + 1j * D[ind, 4])
+                    v[rr, c+1] = <complexs_st> (D[ind, 2] + 1j * D[ind, 3])
+                    v[rr+1, c] = <complexs_st> (D[ind, 6] + 1j * D[ind, 7])
+                    v[rr+1, c+1] = <complexs_st> (D[ind, 1] + 1j * D[ind, 5])
+
+        elif p_opt == 0:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    ph = phases[ind]
+
+                    d = &D[ind, 0]
+                    func(d, ph, M)
+                    v[rr, c] = M[0]
+                    v[rr, c+1] = M[1]
+                    v[rr+1, c] = M[2]
+                    v[rr+1, c+1] = M[3]
+
+        else:
+            for r in range(nr):
+                rr = r * 2
+                for ind in range(ptr[r], ptr[r] + ncol[r]):
+                    c = col[ind] * 2
+                    ph = phases[col[ind] / nr]
+
+                    d = &D[ind, 0]
+                    func(d, ph, M)
+                    v[rr, c] = M[0]
+                    v[rr, c+1] = M[1]
+                    v[rr+1, c] = M[2]
+                    v[rr+1, c+1] = M[3]
+
+    return V
diff --git a/src/sisl/physics/_matrix_phase_so.pyx b/src/sisl/physics/_matrix_phase_so.pyx
deleted file mode 100644
index ea2b6b1572..0000000000
--- a/src/sisl/physics/_matrix_phase_so.pyx
+++ /dev/null
@@ -1,248 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
-cimport cython
-
-import numpy as np
-cimport numpy as np
-from scipy.sparse import csr_matrix
-
-from sisl._indices cimport _index_sorted
-from sisl._core._sparse import fold_csr_matrix_nc
-
-__all__ = ['_phase_so_csr_c64', '_phase_so_csr_c128',
-           '_phase_so_array_c64', '_phase_so_array_c128']
-
-# The fused data-types forces the data input to be of "correct" values.
-ctypedef fused numeric_complex:
-    float
-    double
-    float complex
-    double complex
-
-
-def _phase_so_csr_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                      numeric_complex[:, ::1] D,
-                      np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef float complex[::1] v = V
-    cdef float complex ph, vv
-    cdef Py_ssize_t r, rr, ind, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[ind]
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[v_ptr[rr] + s_idx] = v[v_ptr[rr] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + s_idx+1] = v[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[v_ptr[rr+1] + s_idx] = v[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[v_ptr[rr+1] + s_idx+1] = v[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[col[ind] / nr]
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[v_ptr[rr] + s_idx] = v[v_ptr[rr] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + s_idx+1] = v[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[v_ptr[rr+1] + s_idx] = v[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[v_ptr[rr+1] + s_idx+1] = v[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nr * 2))
-
-
-def _phase_so_csr_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                       numeric_complex[:, ::1] D,
-                       np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    # Now create the folded sparse elements
-    V_PTR, V_NCOL, V_COL = fold_csr_matrix_nc(PTR, NCOL, COL)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef double complex[::1] v = V
-    cdef double complex ph, vv
-    cdef Py_ssize_t r, rr, ind, s_idx
-    cdef int c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[ind]
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[v_ptr[rr] + s_idx] = v[v_ptr[rr] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + s_idx+1] = v[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[v_ptr[rr+1] + s_idx] = v[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[v_ptr[rr+1] + s_idx+1] = v[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[col[ind] / nr]
-                s_idx = _index_sorted(v_col[v_ptr[rr]:v_ptr[rr] + v_ncol[rr]], c)
-
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[v_ptr[rr] + s_idx] = v[v_ptr[rr] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + s_idx+1] = v[v_ptr[rr] + s_idx+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[v_ptr[rr+1] + s_idx] = v[v_ptr[rr+1] + s_idx] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[v_ptr[rr+1] + s_idx+1] = v[v_ptr[rr+1] + s_idx+1] + ph * vv
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nr * 2))
-
-
-def _phase_so_array_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                        np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                        np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                        numeric_complex[:, ::1] D,
-                        np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] V = np.zeros([nr * 2, nr * 2], dtype=np.complex64)
-    cdef float complex[:, ::1] v = V
-    cdef float complex ph, vv
-    cdef Py_ssize_t r, rr, ind, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[ind]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[rr, c] = v[rr, c] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] = v[rr, c+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[rr+1, c] = v[rr+1, c] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[rr+1, c+1] = v[rr+1, c+1] + ph * vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[col[ind] / nr]
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[rr, c] = v[rr, c] + ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] = v[rr, c+1] + ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[rr+1, c] = v[rr+1, c] + ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[rr+1, c+1] = v[rr+1, c+1] + ph * vv
-
-    return V
-
-
-def _phase_so_array_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                         numeric_complex[:, ::1] D,
-                         np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] V = np.zeros([nr * 2, nr * 2], dtype=np.complex128)
-    cdef double complex[:, ::1] v = V
-    cdef double complex ph, vv
-    cdef Py_ssize_t r, rr, ind, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[ind]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[rr, c] = v[rr, c] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] = v[rr, c+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[rr+1, c] = v[rr+1, c] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[rr+1, c+1] = v[rr+1, c+1] + ph * vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = (col[ind] % nr) * 2
-                ph = phases[col[ind] / nr]
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[rr, c] = v[rr, c] + ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] = v[rr, c+1] + ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[rr+1, c] = v[rr+1, c] + ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[rr+1, c+1] = v[rr+1, c+1] + ph * vv
-
-    return V
diff --git a/src/sisl/physics/_matrix_sc_phase.pyx b/src/sisl/physics/_matrix_sc_phase.pyx
deleted file mode 100644
index 12a7f17aff..0000000000
--- a/src/sisl/physics/_matrix_sc_phase.pyx
+++ /dev/null
@@ -1,185 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
-cimport cython
-
-import numpy as np
-cimport numpy as np
-from scipy.sparse import csr_matrix
-
-from sisl._core._sparse cimport inline_sum
-
-__all__ = ['_sc_phase_csr_c64', '_sc_phase_csr_c128',
-           '_sc_phase_array_c64', '_sc_phase_array_c128']
-
-# The fused data-types forces the data input to be of "correct" values.
-ctypedef fused numeric_real:
-    float
-    double
-
-ctypedef fused numeric_complex:
-    float
-    double
-    float complex
-    double complex
-
-
-def _sc_phase_csr_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                      np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                      numeric_complex[:, ::1] D,
-                      const int nc, const int idx,
-                      np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] ph = PHASES
-
-    # Now copy the sparse matrix form
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_PTR = np.empty([nr + 1], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_NCOL = np.empty([nr], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_COL = np.empty([inline_sum(ncol)], dtype=np.int32)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef float complex[::1] v = V
-    cdef Py_ssize_t r, ind, cind
-
-    # Copy ncol
-    v_ncol[:] = ncol[:]
-
-    cind = 0
-    if p_opt == 0:
-        for r in range(nr):
-            v_ptr[r] = cind
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                v[cind] = D[ind, idx] * ph[ind]
-                v_col[cind] = col[ind]
-                cind = cind + 1
-    else:
-        for r in range(nr):
-            v_ptr[r] = cind
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                v[cind] = D[ind, idx] * ph[col[ind] / nr]
-                v_col[cind] = col[ind]
-                cind = cind + 1
-    v_ptr[nr] = cind
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr, nc))
-
-
-def _sc_phase_csr_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                       np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                       numeric_complex[:, ::1] D,
-                       const int nc, const int idx,
-                       np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] ph = PHASES
-
-    # Now copy the sparse matrix form
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_PTR = np.empty([nr + 1], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_NCOL = np.empty([nr], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_COL = np.empty([inline_sum(ncol)], dtype=np.int32)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef double complex[::1] v = V
-    cdef Py_ssize_t r, ind, cind
-
-    # Copy ncol
-    v_ncol[:] = ncol[:]
-
-    cind = 0
-    if p_opt == 0:
-        for r in range(nr):
-            v_ptr[r] = cind
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                v[cind] = D[ind, idx] * ph[ind]
-                v_col[cind] = col[ind]
-                cind = cind + 1
-    else:
-        for r in range(nr):
-            v_ptr[r] = cind
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                v[cind] = D[ind, idx] * ph[col[ind] / nr]
-                v_col[cind] = col[ind]
-                cind = cind + 1
-    v_ptr[nr] = cind
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr, nc))
-
-
-def _sc_phase_array_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                        np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                        np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                        numeric_complex[:, ::1] D,
-                        const int nc, const int idx,
-                        np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] ph = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] V = np.zeros([nr, nc], dtype=np.complex64)
-    cdef float complex[:, ::1] v = V
-    cdef Py_ssize_t r, ind
-
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                v[r, col[ind]] = D[ind, idx] * ph[ind]
-
-    else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                v[r, col[ind]] = D[ind, idx] * ph[col[ind] / nr]
-
-    return V
-
-
-def _sc_phase_array_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                         numeric_complex[:, ::1] D,
-                         const int nc, const int idx,
-                         np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] ph = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] V = np.zeros([nr, nc], dtype=np.complex128)
-    cdef double complex[:, ::1] v = V
-    cdef Py_ssize_t r, ind
-
-    if p_opt == 0:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                v[r, col[ind]] = D[ind, idx] * ph[ind]
-
-    else:
-        for r in range(nr):
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                v[r, col[ind]] = D[ind, idx] * ph[col[ind] / nr]
-
-    return V
diff --git a/src/sisl/physics/_matrix_sc_phase_nc.pyx b/src/sisl/physics/_matrix_sc_phase_nc.pyx
deleted file mode 100644
index 2c98d45d62..0000000000
--- a/src/sisl/physics/_matrix_sc_phase_nc.pyx
+++ /dev/null
@@ -1,272 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
-cimport cython
-
-import numpy as np
-
-cimport numpy as np
-
-from scipy.sparse import csr_matrix
-
-from sisl._core._sparse cimport inline_sum
-from sisl.physics._matrix_utils cimport ncol2ptr_double
-
-__all__ = ['_sc_phase_nc_csr_c64', '_sc_phase_nc_csr_c128',
-           '_sc_phase_nc_array_c64', '_sc_phase_nc_array_c128']
-
-# The fused data-types forces the data input to be of "correct" values.
-ctypedef fused numeric_complex:
-    float
-    double
-    float complex
-    double complex
-
-
-def _sc_phase_nc_csr_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                         numeric_complex[:, ::1] D,
-                         const int nc,
-                         np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-
-    # Now copy the sparse matrix form
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_PTR = np.empty([nr*2 + 1], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_NCOL = np.empty([nr*2], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_COL = np.empty([inline_sum(ncol)*4], dtype=np.int32)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef float complex[::1] v = V
-    cdef float complex ph, v12
-    cdef Py_ssize_t r, rr, ind, cind, c
-
-    # We have to do it manually due to the double elements per matrix element
-    ncol2ptr_double(nr, ncol, v_ptr)
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            v_ncol[rr] = ncol[r] * 2
-            v_ncol[rr+1] = ncol[r] * 2
-
-            cind = 0
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[ind]
-                c = col[ind] * 2
-
-                v[v_ptr[rr] + cind] = D[ind, 0] * ph
-                v_col[v_ptr[rr] + cind] = c
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + cind+1] = v12 * ph
-                v_col[v_ptr[rr] + cind+1] = c + 1
-                v[v_ptr[rr+1] + cind] = v12.conjugate() * ph
-                v_col[v_ptr[rr+1] + cind] = c
-                v[v_ptr[rr+1] + cind+1] = D[ind, 1] * ph
-                v_col[v_ptr[rr+1] + cind+1] = c + 1
-
-                cind = cind + 2
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            v_ncol[rr] = ncol[r] * 2
-            v_ncol[rr+1] = ncol[r] * 2
-
-            cind = 0
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[col[ind] / nr]
-                c = col[ind] * 2
-
-                v[v_ptr[rr] + cind] = D[ind, 0] * ph
-                v_col[v_ptr[rr] + cind] = c
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + cind+1] = v12 * ph
-                v_col[v_ptr[rr] + cind+1] = c + 1
-                v[v_ptr[rr+1] + cind] = v12.conjugate() * ph
-                v_col[v_ptr[rr+1] + cind] = c
-                v[v_ptr[rr+1] + cind+1] = D[ind, 1] * ph
-                v_col[v_ptr[rr+1] + cind+1] = c + 1
-
-                cind = cind + 2
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nc * 2))
-
-
-def _sc_phase_nc_csr_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                          np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                          np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                          numeric_complex[:, ::1] D,
-                          const int nc,
-                          np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
-
-    # Now copy the sparse matrix form
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_PTR = np.empty([nr*2 + 1], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_NCOL = np.empty([nr*2], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_COL = np.empty([inline_sum(ncol)*4], dtype=np.int32)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef double complex[::1] v = V
-    cdef double complex ph, v12
-    cdef Py_ssize_t r, rr, ind, cind, c
-
-    ncol2ptr_double(nr, ncol, v_ptr)
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            v_ncol[rr] = ncol[r] * 2
-            v_ncol[rr+1] = ncol[r] * 2
-
-            cind = 0
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[ind]
-                c = col[ind] * 2
-
-                v[v_ptr[rr] + cind] = D[ind, 0] * ph
-                v_col[v_ptr[rr] + cind] = c
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + cind+1] = v12 * ph
-                v_col[v_ptr[rr] + cind+1] = c + 1
-                v[v_ptr[rr+1] + cind] = v12.conjugate() * ph
-                v_col[v_ptr[rr+1] + cind] = c
-                v[v_ptr[rr+1] + cind+1] = D[ind, 1] * ph
-                v_col[v_ptr[rr+1] + cind+1] = c + 1
-
-                cind = cind + 2
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            v_ncol[rr] = ncol[r] * 2
-            v_ncol[rr+1] = ncol[r] * 2
-
-            cind = 0
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[col[ind] / nr]
-                c = col[ind] * 2
-
-                v[v_ptr[rr] + cind] = D[ind, 0] * ph
-                v_col[v_ptr[rr] + cind] = c
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + cind+1] = v12 * ph
-                v_col[v_ptr[rr] + cind+1] = c + 1
-                v[v_ptr[rr+1] + cind] = v12.conjugate() * ph
-                v_col[v_ptr[rr+1] + cind] = c
-                v[v_ptr[rr+1] + cind+1] = D[ind, 1] * ph
-                v_col[v_ptr[rr+1] + cind+1] = c + 1
-                cind = cind + 2
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nc * 2))
-
-
-def _sc_phase_nc_array_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                           np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                           np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                           numeric_complex[:, ::1] D,
-                           const int nc,
-                           np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] V = np.zeros([nr * 2, nc * 2], dtype=np.complex64)
-    cdef float complex[:, ::1] v = V
-    cdef float complex ph, v12
-    cdef Py_ssize_t r, rr, ind, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[ind]
-                c = col[ind] * 2
-                v[rr, c] = D[ind, 0] * ph
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] = v12 * ph
-                v[rr+1, c] = v12.conjugate() * ph
-                v[rr+1, c+1] = D[ind, 1] * ph
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[col[ind] / nr]
-                c = col[ind] * 2
-                v[rr, c] = D[ind, 0] * ph
-                v12 = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] = v12 * ph
-                v[rr+1, c] = v12.conjugate() * ph
-                v[rr+1, c+1] = D[ind, 1] * ph
-
-    return V
-
-
-def _sc_phase_nc_array_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                            np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                            np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                            numeric_complex[:, ::1] D,
-                            const int nc,
-                            np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] V = np.zeros([nr * 2, nc * 2], dtype=np.complex128)
-    cdef double complex[:, ::1] v = V
-    cdef double complex ph, v12
-    cdef Py_ssize_t r, rr, ind, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[ind]
-                c = col[ind] * 2
-                v[rr, c] = D[ind, 0] * ph
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] = v12 * ph
-                v[rr+1, c] = v12.conjugate() * ph
-                v[rr+1, c+1] = D[ind, 1] * ph
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[col[ind] / nr]
-                c = col[ind] * 2
-                v[rr, c] = D[ind, 0] * ph
-                v12 = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] = v12 * ph
-                v[rr+1, c] = v12.conjugate() * ph
-                v[rr+1, c+1] = D[ind, 1] * ph
-
-    return V
diff --git a/src/sisl/physics/_matrix_sc_phase_nc_diag.pyx b/src/sisl/physics/_matrix_sc_phase_nc_diag.pyx
deleted file mode 100644
index d2ac71b721..0000000000
--- a/src/sisl/physics/_matrix_sc_phase_nc_diag.pyx
+++ /dev/null
@@ -1,234 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
-cimport cython
-
-import numpy as np
-
-cimport numpy as np
-
-from scipy.sparse import csr_matrix
-
-from sisl._core._sparse cimport inline_sum
-from sisl.physics._matrix_utils cimport ncol2ptr_single
-
-__all__ = ['_sc_phase_nc_diag_csr_c64', '_sc_phase_nc_diag_csr_c128',
-           '_sc_phase_nc_diag_array_c64', '_sc_phase_nc_diag_array_c128']
-
-# The fused data-types forces the data input to be of "correct" values.
-ctypedef fused numeric_complex:
-    float
-    double
-    float complex
-    double complex
-
-
-def _sc_phase_nc_diag_csr_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                              np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                              np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                              numeric_complex[:, ::1] D,
-                              const int nc, const int idx,
-                              np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-
-    # Now copy the sparse matrix form
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_PTR = np.empty([nr*2 + 1], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_NCOL = np.empty([nr*2], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_COL = np.empty([inline_sum(ncol)*2], dtype=np.int32)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef float complex[::1] v = V
-    cdef float complex vv
-    cdef Py_ssize_t r, rr, ind, cind, c
-
-    ncol2ptr_single(nr, ncol, v_ptr)
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            v_ncol[rr] = ncol[r]
-            v_ncol[rr+1] = ncol[r]
-
-            cind = 0
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] * 2
-                vv = <float complex> (phases[ind] * D[ind, idx])
-                v[v_ptr[rr] + cind] = vv
-                v_col[v_ptr[rr] + cind] = c
-                v[v_ptr[rr+1] + cind] = vv
-                v_col[v_ptr[rr+1] + cind] = c + 1
-
-                cind = cind + 1
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            v_ncol[rr] = ncol[r]
-            v_ncol[rr+1] = ncol[r]
-
-            cind = 0
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] * 2
-                vv = <float complex> (phases[col[ind] / nr] * D[ind, idx])
-                v[v_ptr[rr] + cind] = vv
-                v_col[v_ptr[rr] + cind] = c
-                v[v_ptr[rr+1] + cind] = vv
-                v_col[v_ptr[rr+1] + cind] = c + 1
-                cind = cind + 1
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nc * 2))
-
-
-def _sc_phase_nc_diag_csr_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                               np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                               np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                               numeric_complex[:, ::1] D,
-                               const int nc, const int idx,
-                               np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
-
-    # Now copy the sparse matrix form
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_PTR = np.empty([nr*2 + 1], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_NCOL = np.empty([nr*2], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_COL = np.empty([inline_sum(ncol)*2], dtype=np.int32)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef double complex[::1] v = V
-    cdef double complex vv
-    cdef Py_ssize_t r, rr, ind, cind, c
-
-    # We have to do it manually due to the double elements per matrix element
-    ncol2ptr_single(nr, ncol, v_ptr)
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            v_ncol[rr] = ncol[r]
-            v_ncol[rr+1] = ncol[r]
-
-            cind = 0
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] * 2
-                vv = <double complex> (phases[ind] * D[ind, idx])
-                v[v_ptr[rr] + cind] = vv
-                v_col[v_ptr[rr] + cind] = c
-                v[v_ptr[rr+1] + cind] = vv
-                v_col[v_ptr[rr+1] + cind] = c + 1
-                cind = cind + 1
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            v_ncol[rr] = ncol[r]
-            v_ncol[rr+1] = ncol[r]
-
-            cind = 0
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] * 2
-                vv = <double complex> (phases[col[ind] / nr] * D[ind, idx])
-                v[v_ptr[rr] + cind] = vv
-                v_col[v_ptr[rr] + cind] = c
-                v[v_ptr[rr+1] + cind] = vv
-                v_col[v_ptr[rr+1] + cind] = c + 1
-                cind = cind + 1
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nc * 2))
-
-
-def _sc_phase_nc_diag_array_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                                np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                                np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                                numeric_complex[:, ::1] D,
-                                const int nc, const int idx,
-                                np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] V = np.zeros([nr * 2, nc * 2], dtype=np.complex64)
-    cdef float complex[:, ::1] v = V
-    cdef float complex vv
-    cdef Py_ssize_t r, rr, ind, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] * 2
-                vv = <float complex> (phases[ind] * D[ind, idx])
-                v[rr, c] = vv
-                v[rr+1, c+1] = vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] * 2
-                vv = <float complex> (phases[col[ind] / nr] * D[ind, idx])
-                v[rr, c] = vv
-                v[rr+1, c+1] = vv
-
-    return V
-
-
-def _sc_phase_nc_diag_array_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                                 np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                                 np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                                 numeric_complex[:, ::1] D,
-                                 const int nc, const int idx,
-                                 np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] V = np.zeros([nr * 2, nc * 2], dtype=np.complex128)
-    cdef double complex[:, ::1] v = V
-    cdef double complex vv
-    cdef Py_ssize_t r, rr, ind, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] * 2
-                vv = <double complex> (phases[ind] * D[ind, idx])
-                v[rr, c] = vv
-                v[rr+1, c+1] = vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                c = col[ind] * 2
-                vv = <double complex> (phases[col[ind] / nr] * D[ind, idx])
-                v[rr, c] = vv
-                v[rr+1, c+1] = vv
-
-    return V
diff --git a/src/sisl/physics/_matrix_sc_phase_so.pyx b/src/sisl/physics/_matrix_sc_phase_so.pyx
deleted file mode 100644
index 60d3327cfa..0000000000
--- a/src/sisl/physics/_matrix_sc_phase_so.pyx
+++ /dev/null
@@ -1,293 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
-cimport cython
-
-import numpy as np
-
-cimport numpy as np
-
-from scipy.sparse import csr_matrix
-
-from sisl._core._sparse cimport inline_sum
-from sisl.physics._matrix_utils cimport ncol2ptr_double
-
-__all__ = ['_sc_phase_so_csr_c64', '_sc_phase_so_csr_c128',
-           '_sc_phase_so_array_c64', '_sc_phase_so_array_c128']
-
-# The fused data-types forces the data input to be of "correct" values.
-ctypedef fused numeric_complex:
-    float
-    double
-    float complex
-    double complex
-
-
-def _sc_phase_so_csr_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                         np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                         numeric_complex[:, ::1] D,
-                         const int nc,
-                         np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-
-    # Now copy the sparse matrix form
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_PTR = np.empty([nr*2 + 1], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_NCOL = np.empty([nr*2], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_COL = np.empty([inline_sum(ncol)*4], dtype=np.int32)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex64_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex64)
-    cdef float complex[::1] v = V
-    cdef float complex ph, vv
-    cdef Py_ssize_t r, rr, ind, cind, c
-
-    ncol2ptr_double(nr, ncol, v_ptr)
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            v_ncol[rr] = ncol[r] * 2
-            v_ncol[rr+1] = ncol[r] * 2
-
-            cind = 0
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[ind]
-                c = col[ind] * 2
-
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[v_ptr[rr] + cind] = ph * vv
-                v_col[v_ptr[rr] + cind] = c
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + cind+1] = ph * vv
-                v_col[v_ptr[rr] + cind+1] = c + 1
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[v_ptr[rr+1] + cind] = ph * vv
-                v_col[v_ptr[rr+1] + cind] = c
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[v_ptr[rr+1] + cind+1] = ph * vv
-                v_col[v_ptr[rr+1] + cind+1] = c + 1
-
-                cind = cind + 2
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            v_ncol[rr] = ncol[r] * 2
-            v_ncol[rr+1] = ncol[r] * 2
-
-            cind = 0
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[col[ind] / nr]
-                c = col[ind] * 2
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[v_ptr[rr] + cind] = ph * vv
-                v_col[v_ptr[rr] + cind] = c
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + cind+1] = ph * vv
-                v_col[v_ptr[rr] + cind+1] = c + 1
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[v_ptr[rr+1] + cind] = ph * vv
-                v_col[v_ptr[rr+1] + cind] = c
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[v_ptr[rr+1] + cind+1] = ph * vv
-                v_col[v_ptr[rr+1] + cind+1] = c + 1
-                cind = cind + 2
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nc * 2))
-
-
-def _sc_phase_so_csr_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                          np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                          np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                          numeric_complex[:, ::1] D,
-                          const int nc,
-                          np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
-
-    # Now copy the sparse matrix form
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_PTR = np.empty([nr*2 + 1], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_NCOL = np.empty([nr*2], dtype=np.int32)
-    cdef np.ndarray[np.int32_t, ndim=1, mode='c'] V_COL = np.empty([inline_sum(ncol)*4], dtype=np.int32)
-    cdef int[::1] v_ptr = V_PTR
-    cdef int[::1] v_ncol = V_NCOL
-    cdef int[::1] v_col = V_COL
-
-    cdef np.ndarray[np.complex128_t, ndim=1, mode='c'] V = np.zeros([v_col.shape[0]], dtype=np.complex128)
-    cdef double complex[::1] v = V
-    cdef double complex ph, vv
-    cdef Py_ssize_t r, rr, ind, cind
-
-    # We have to do it manually due to the double elements per matrix element
-    ncol2ptr_double(nr, ncol, v_ptr)
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            v_ncol[rr] = ncol[r] * 2
-            v_ncol[rr+1] = ncol[r] * 2
-
-            cind = 0
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[ind]
-                c = col[ind] * 2
-
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[v_ptr[rr] + cind] = ph * vv
-                v_col[v_ptr[rr] + cind] = c
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + cind+1] = ph * vv
-                v_col[v_ptr[rr] + cind+1] = c + 1
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[v_ptr[rr+1] + cind] = ph * vv
-                v_col[v_ptr[rr+1] + cind] = c
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[v_ptr[rr+1] + cind+1] = ph * vv
-                v_col[v_ptr[rr+1] + cind+1] = c + 1
-                cind = cind + 2
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            v_ncol[rr] = ncol[r] * 2
-            v_ncol[rr+1] = ncol[r] * 2
-
-            cind = 0
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[col[ind] / nr]
-                c = col[ind] * 2
-
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[v_ptr[rr] + cind] = ph * vv
-                v_col[v_ptr[rr] + cind] = c
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[v_ptr[rr] + cind+1] = ph * vv
-                v_col[v_ptr[rr] + cind+1] = c + 1
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[v_ptr[rr+1] + cind] = ph * vv
-                v_col[v_ptr[rr+1] + cind] = c
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[v_ptr[rr+1] + cind+1] = ph * vv
-                v_col[v_ptr[rr+1] + cind+1] = c + 1
-                cind = cind + 2
-
-    return csr_matrix((V, V_COL, V_PTR), shape=(nr * 2, nc * 2))
-
-
-def _sc_phase_so_array_c64(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                           np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                           np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                           numeric_complex[:, ::1] D,
-                           const int nc,
-                           np.ndarray[np.complex64_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef float complex[::1] phases = PHASES
-
-    cdef Py_ssize_t nr = ncol.shape[0]
-    cdef np.ndarray[np.complex64_t, ndim=2, mode='c'] V = np.zeros([nr * 2, nc * 2], dtype=np.complex64)
-    cdef float complex[:, ::1] v = V
-    cdef float complex ph, vv
-    cdef Py_ssize_t r, rr, ind, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[ind]
-                c = col[ind] * 2
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[rr, c] = ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] = ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[rr+1, c] = ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[rr+1, c+1] = ph * vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[col[ind] / nr]
-                c = col[ind] * 2
-                vv = <float complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[rr, c] = ph * vv
-                vv = <float complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] = ph * vv
-                vv = <float complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[rr+1, c] = ph * vv
-                vv = <float complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[rr+1, c+1] = ph * vv
-
-    return V
-
-
-def _sc_phase_so_array_c128(np.ndarray[np.int32_t, ndim=1, mode='c'] PTR,
-                            np.ndarray[np.int32_t, ndim=1, mode='c'] NCOL,
-                            np.ndarray[np.int32_t, ndim=1, mode='c'] COL,
-                            numeric_complex[:, ::1] D,
-                            const int nc,
-                            np.ndarray[np.complex128_t, ndim=1, mode='c'] PHASES, const int p_opt):
-
-    # Convert to memory views
-    cdef int[::1] ptr = PTR
-    cdef int[::1] ncol = NCOL
-    cdef int[::1] col = COL
-    cdef double complex[::1] phases = PHASES
-    cdef Py_ssize_t nr = ncol.shape[0]
-
-    cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] V = np.zeros([nr * 2, nc * 2], dtype=np.complex128)
-    cdef double complex[:, ::1] v = V
-    cdef double complex ph, vv
-    cdef Py_ssize_t r, rr, ind, c
-
-    if p_opt == 0:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[ind]
-                c = col[ind] * 2
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[rr, c] = ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] = ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[rr+1, c] = ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[rr+1, c+1] = ph * vv
-
-    else:
-        for r in range(nr):
-            rr = r * 2
-            for ind in range(ptr[r], ptr[r] + ncol[r]):
-                ph = phases[col[ind] / nr]
-                c = col[ind] * 2
-                vv = <double complex> (D[ind, 0] + 1j * D[ind, 4])
-                v[rr, c] = ph * vv
-                vv = <double complex> (D[ind, 2] + 1j * D[ind, 3])
-                v[rr, c+1] = ph * vv
-                vv = <double complex> (D[ind, 6] + 1j * D[ind, 7])
-                v[rr+1, c] = ph * vv
-                vv = <double complex> (D[ind, 1] + 1j * D[ind, 5])
-                v[rr+1, c+1] = ph * vv
-
-    return V
diff --git a/src/sisl/physics/_matrix_utils.pxd b/src/sisl/physics/_matrix_utils.pxd
index c83feed2b8..b235ca106b 100644
--- a/src/sisl/physics/_matrix_utils.pxd
+++ b/src/sisl/physics/_matrix_utils.pxd
@@ -1,3 +1,38 @@
-# Define the interfaces for the functions exposed through cimport
-cdef void ncol2ptr_double(const int nr, const int[::1] ncol, int[::1] ptr) nogil
-cdef void ncol2ptr_single(const int nr, const int[::1] ncol, int[::1] ptr) nogil
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+cimport cython
+
+import numpy as np
+
+cimport numpy as cnp
+
+from sisl._core._dtypes cimport complexs_st, numerics_st, reals_st
+
+ctypedef fused _internal_complexs_st:  # complex-only fused type, used for the `data` argument of the *_cmplx routines
+    float complex
+    double complex
+
+ctypedef void(*_f_matrix_box_nc)(const numerics_st *data,  # function-pointer type with the (data, phase, M) argument layout of the _matrix_box_nc_* routines
+                                 const complexs_st phase,
+                                 complexs_st *M) noexcept nogil
+
+cdef void _matrix_box_nc_real(const reals_st *data,  # variant taking real-valued data
+                              const complexs_st phase,
+                              complexs_st *M) noexcept nogil
+
+cdef void _matrix_box_nc_cmplx(const _internal_complexs_st *data,  # variant taking complex-valued data
+                               const complexs_st phase,
+                               complexs_st *M) noexcept nogil
+
+ctypedef void(*_f_matrix_box_so)(const numerics_st *data,  # function-pointer type with the (data, phase, M) argument layout of the _matrix_box_so_* routines
+                                 const complexs_st phase,
+                                 complexs_st *M) noexcept nogil
+
+cdef void _matrix_box_so_real(const reals_st *data,  # variant taking real-valued data
+                              const complexs_st phase,
+                              complexs_st *M) noexcept nogil
+
+cdef void _matrix_box_so_cmplx(const _internal_complexs_st *data,  # variant taking complex-valued data
+                               const complexs_st phase,
+                               complexs_st *M) noexcept nogil
diff --git a/src/sisl/physics/_matrix_utils.pyx b/src/sisl/physics/_matrix_utils.pyx
index 0f9014a094..7b0e2fb904 100644
--- a/src/sisl/physics/_matrix_utils.pyx
+++ b/src/sisl/physics/_matrix_utils.pyx
@@ -1,37 +1,75 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
 cimport cython
 
-__all__ = ["ncol2ptr_double", "ncol2ptr_single"]
+import numpy as np
 
+cimport numpy as cnp
 
-cdef void ncol2ptr_double(const int nr, const int[::1] ncol, int[::1] ptr) noexcept nogil:
-    cdef Py_ssize_t r, rr
+from sisl._core._dtypes cimport complexs_st, numerics_st, reals_st
 
-    # this is NC/SOC
-    ptr[0] = 0
-    ptr[1] = ncol[0] * 2
-    for r in range(1, nr):
-        rr = r * 2
-        # do both
-        ptr[rr] = ptr[rr - 1] + ncol[r-1] * 2
-        ptr[rr+1] = ptr[rr] + ncol[r] * 2
+"""
+These routines convert an array of n values into a 2x2 spin-box matrix.
 
-    ptr[nr * 2] = ptr[nr * 2 - 1] + ncol[nr - 1] * 2
+In all cases, the result is written to the linear (flattened) array `M`,
+which has 4 entries:
 
+M[0] == spin[0, 0]
+M[1] == spin[0, 1]
+M[2] == spin[1, 0]
+M[3] == spin[1, 1]
+"""
 
-cdef void ncol2ptr_single(const int nr, const int[::1] ncol, int[::1] ptr) noexcept nogil:
-    cdef Py_ssize_t r, rr
 
-    # this is NC/SOC
-    ptr[0] = 0
-    ptr[1] = ncol[0]
-    for r in range(1, nr):
-        rr = r * 2
-        # do both
-        ptr[rr] = ptr[rr - 1] + ncol[r-1]
-        ptr[rr+1] = ptr[rr] + ncol[r]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+cdef inline void _matrix_box_nc_real(const reals_st *data,
+                                     const complexs_st phase,
+                                     complexs_st *M) noexcept nogil:
+    M[0] = <complexs_st> (data[0] * phase)  # spin[0, 0]: data[0] times the phase
+    M[1] = <complexs_st> ((data[2] + 1j * data[3]) * phase)  # spin[0, 1]: data[2] is the real part, data[3] the imaginary part
+    M[2] = <complexs_st> ((data[2] + 1j * data[3]).conjugate() * phase)  # spin[1, 0]: conjugate of the [0, 1] value, taken before the phase is applied
+    M[3] = <complexs_st> (data[1] * phase)  # spin[1, 1]: data[1] times the phase
 
-    ptr[nr * 2] = ptr[nr * 2 - 1] + ncol[nr - 1]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+cdef inline void _matrix_box_nc_cmplx(const _internal_complexs_st *data,
+                                      const complexs_st phase,
+                                      complexs_st *M) noexcept nogil:
+    M[0] = <complexs_st> (data[0] * phase)  # spin[0, 0]
+    M[1] = <complexs_st> (data[2] * phase)  # spin[0, 1]: data[2] is already complex
+    M[2] = <complexs_st> (data[2].conjugate() * phase)  # spin[1, 0]: conjugate of data[2], taken before the phase is applied
+    M[3] = <complexs_st> (data[1] * phase)  # spin[1, 1]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+cdef inline void _matrix_box_so_real(const reals_st *data,
+                                     const complexs_st phase,
+                                     complexs_st *M) noexcept nogil:
+    M[0] = <complexs_st> ((data[0] + 1j * data[4]) * phase)  # spin[0, 0]: real part data[0], imaginary part data[4]
+    M[1] = <complexs_st> ((data[2] + 1j * data[3]) * phase)  # spin[0, 1]: real part data[2], imaginary part data[3]
+    M[2] = <complexs_st> ((data[6] + 1j * data[7]) * phase)  # spin[1, 0]: real part data[6], imaginary part data[7]
+    M[3] = <complexs_st> ((data[1] + 1j * data[5]) * phase)  # spin[1, 1]: real part data[1], imaginary part data[5]
+
+
+# the interfaces have to be duplicated: complex-valued data gets its own variant
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+@cython.cdivision(True)
+cdef inline void _matrix_box_so_cmplx(const _internal_complexs_st *data,
+                                      const complexs_st phase,
+                                      complexs_st *M) noexcept nogil:
+    M[0] = <complexs_st> (data[0] * phase)  # spin[0, 0]
+    M[1] = <complexs_st> (data[2] * phase)  # spin[0, 1]
+    M[2] = <complexs_st> (data[3] * phase)  # spin[1, 0]: read from its own element, data[3] (no conjugation)
+    M[3] = <complexs_st> (data[1] * phase)  # spin[1, 1]
diff --git a/src/sisl/physics/_phase.pxd b/src/sisl/physics/_phase.pxd
new file mode 100644
index 0000000000..7449aaa779
--- /dev/null
+++ b/src/sisl/physics/_phase.pxd
@@ -0,0 +1,7 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+from sisl._core._dtypes cimport floats_st
+
+
+cdef bint is_gamma(const floats_st[::1] k) noexcept nogil
diff --git a/src/sisl/physics/_phase.pyx b/src/sisl/physics/_phase.pyx
index 183804867b..ddd784cce9 100644
--- a/src/sisl/physics/_phase.pyx
+++ b/src/sisl/physics/_phase.pyx
@@ -2,33 +2,36 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 cimport cython
-from libc.math cimport fabs
-
-import numpy as np
-
-cimport numpy as np
+from libc.math cimport fabs, fabsf
 
 from numpy import complex64, complex128, dot, exp, float32, float64, ndarray, ones, pi
 
-from numpy cimport complex64_t, complex128_t, float32_t, float64_t, ndarray
-
-__all__ = ['phase_dtype', 'phase_rsc', 'phase_rij']
+from sisl._core._dtypes cimport floats_st
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.initializedcheck(False)
-cdef inline int is_gamma(const double[::1] k) noexcept nogil:
-    if fabs(k[0]) > 0.0000001:
-        return 0
-    if fabs(k[1]) > 0.0000001:
-        return 0
-    if fabs(k[2]) > 0.0000001:
-        return 0
+cdef inline bint is_gamma(const floats_st[::1] k) noexcept nogil:
+    if floats_st is cython.float:
+        if fabsf(k[0]) > 0.0000001:
+            return 0
+        if fabsf(k[1]) > 0.0000001:
+            return 0
+        if fabsf(k[2]) > 0.0000001:
+            return 0
+
+    else:
+        if fabs(k[0]) > 0.0000001:
+            return 0
+        if fabs(k[1]) > 0.0000001:
+            return 0
+        if fabs(k[2]) > 0.0000001:
+            return 0
     return 1
 
 
-def phase_dtype(ndarray[float64_t, ndim=1, mode='c'] k, M_dtype, R_dtype, force_complex=False):
+def phase_dtype(const floats_st[::1] k, M_dtype, R_dtype, force_complex: bool=False):
     if is_gamma(k) and not force_complex:
         if R_dtype is None:
             return M_dtype
@@ -52,7 +55,7 @@ def phase_dtype(ndarray[float64_t, ndim=1, mode='c'] k, M_dtype, R_dtype, force_
     return R_dtype
 
 
-def phase_rsc(sc, ndarray[float64_t, ndim=1, mode='c'] k, dtype):
+def phase_rsc(sc, const floats_st[::1] k, dtype):
     """ Calculate the phases for the supercell interactions using k """
 
     # Figure out if this is a Gamma point or not
@@ -66,7 +69,7 @@ def phase_rsc(sc, ndarray[float64_t, ndim=1, mode='c'] k, dtype):
     return phases
 
 
-def phase_rij(rij, sc, ndarray[float64_t, ndim=1, mode='c'] k, dtype):
+def phase_rij(rij, sc, const floats_st[::1] k, dtype):
     """ Calculate the phases for the distance matrix using k """
 
     # Figure out if this is a Gamma point or not
diff --git a/src/sisl/physics/densitymatrix.py b/src/sisl/physics/densitymatrix.py
index eddab35867..4de3f7a7cb 100644
--- a/src/sisl/physics/densitymatrix.py
+++ b/src/sisl/physics/densitymatrix.py
@@ -24,40 +24,12 @@
 from sisl.messages import deprecate_argument, progressbar, warn
 from sisl.typing import AtomsIndex, GaugeType, SeqFloat
 
-from .sparse import SparseOrbitalBZSpin
+from .sparse import SparseOrbitalBZSpin, _get_spin
 from .spin import Spin
 
 __all__ = ["DensityMatrix"]
 
 
-def _get_density(DM, orthogonal, what="sum"):
-    DM = DM.T
-    if orthogonal:
-        off = 0
-    else:
-        off = 1
-    if what == "sum":
-        if DM.shape[0] in (2 + off, 4 + off, 8 + off):
-            return DM[0] + DM[1]
-        return DM[0]
-    if what == "spin":
-        m = np.empty([3, DM.shape[1]], dtype=DM.dtype)
-        if DM.shape[0] == 8 + off:
-            m[0] = DM[2] + DM[6]
-            m[1] = -DM[3] + DM[7]
-            m[2] = DM[0] - DM[1]
-        elif DM.shape[0] == 4 + off:
-            m[0] = 2 * DM[2]
-            m[1] = -2 * DM[3]
-            m[2] = DM[0] - DM[1]
-        elif DM.shape[0] == 2 + off:
-            m[:2, :] = 0.0
-            m[2] = DM[0] - DM[1]
-        elif DM.shape[0] == 1 + off:
-            m[...] = 0.0
-        return m
-
-
 class _densitymatrix(SparseOrbitalBZSpin):
     def spin_rotate(self, angles: SeqFloat, rad: bool = False):
         r"""Rotates spin-boxes by fixed angles around the :math:`x`, :math:`y` and :math:`z` axis, respectively.
@@ -539,10 +511,10 @@ def bond_order(
         m, *opts = method.split(":")
 
         # only extract the summed density
-        what = "sum"
+        what = "trace"
         if "spin" in opts:
             # do this for each spin x, y, z
-            what = "spin"
+            what = "vector"
             del opts[opts.index("spin")]
 
         # Check that there are no un-used options
@@ -556,7 +528,7 @@ def bond_order(
         rows, cols, DM = _to_coo(self._csr)
 
         # Convert to requested matrix form
-        D = _get_density(DM, self.orthogonal, what)
+        D = _get_spin(DM, self.spin, what).T
 
         # Define a matrix-matrix multiplication
         def mm(A, B):
diff --git a/src/sisl/physics/energydensitymatrix.py b/src/sisl/physics/energydensitymatrix.py
index 895189e6bf..b11aa221d5 100644
--- a/src/sisl/physics/energydensitymatrix.py
+++ b/src/sisl/physics/energydensitymatrix.py
@@ -326,7 +326,7 @@ def shift(self, E, DM):
             return
 
         for i in range(self.spin.spinor):
-            self._csr._D[:, i] += DM._csr._D[:, i] * E[i]
+            self._csr._D[:, i].real += DM._csr._D[:, i].real * E[i]
 
     @staticmethod
     def read(sile, *args, **kwargs):
diff --git a/src/sisl/physics/hamiltonian.py b/src/sisl/physics/hamiltonian.py
index 6a243dde60..60b02e0f86 100644
--- a/src/sisl/physics/hamiltonian.py
+++ b/src/sisl/physics/hamiltonian.py
@@ -328,7 +328,7 @@ def shift(self, E):
             # For non-collinear and SO only the diagonal (real) components
             # should be shifted.
             for i in range(self.spin.spinor):
-                self._csr._D[:, i] += self._csr._D[:, self.S_idx] * E[i]
+                self._csr._D[:, i].real += self._csr._D[:, self.S_idx].real * E[i]
 
     def eigenvalue(self, k=(0, 0, 0), gauge: GaugeType = "cell", **kwargs):
         """Calculate the eigenvalues at `k` and return an `EigenvalueElectron` object containing all eigenvalues for a given `k`
diff --git a/src/sisl/physics/sparse.py b/src/sisl/physics/sparse.py
index f2c531aa6d..c5e8a79fd4 100644
--- a/src/sisl/physics/sparse.py
+++ b/src/sisl/physics/sparse.py
@@ -4,6 +4,7 @@
 from __future__ import annotations
 
 import warnings
+from typing import Literal
 
 import numpy as np
 from scipy.sparse import SparseEfficiencyWarning, csr_matrix
@@ -13,6 +14,7 @@
 from sisl import Geometry
 from sisl._core.sparse import issparse
 from sisl._core.sparse_geometry import SparseOrbital
+from sisl._help import dtype_complex_to_real, dtype_real_to_complex
 from sisl._internal import set_module
 from sisl.messages import warn
 from sisl.typing import AtomsIndex, GaugeType, KPoint
@@ -29,6 +31,84 @@
 warnings.filterwarnings("ignore", category=SparseEfficiencyWarning)
 
 
+def _get_spin(M, spin, what: Literal["trace", "box", "vector"] = "box"):
+    if what == "trace":
+        if spin.spinor == 2:
+            # we have both up+down
+            # TODO fix spin-orbit with complex values
+            return M[..., 0] + M[..., 1]
+        return M[..., 0]
+
+    if what == "vector":
+        m = np.empty(M.shape[:-1] + (3,), dtype=dtype_complex_to_real(M.dtype))
+        if spin.is_unpolarized:
+            # no spin-density
+            m[...] = 0.0
+        else:
+            # Same for all spin-configurations
+            m[..., 2] = (M[..., 0] - M[..., 1]).real
+
+            # These last-axis indices must match the M<x><y>[r|i] index attributes
+            # (M11, M22, M12r, ...) assigned in the `_reset` method of this module
+            if spin.is_polarized:
+                m[..., :2] = 0.0
+            elif spin.is_noncolinear:
+                if spin.dkind in ("f", "i"):
+                    m[..., 0] = 2 * M[..., 2]
+                    m[..., 1] = -2 * M[..., 3]
+                else:
+                    m[..., 0] = 2 * M[..., 2].real
+                    m[..., 1] = -2 * M[..., 2].imag
+            else:
+                # spin-orbit
+                if spin.dkind in ("f", "i"):
+                    m[..., 0] = M[..., 2] + M[..., 6]
+                    m[..., 1] = -M[..., 3] + M[..., 7]
+                else:
+                    tmp = M[..., 2].conj() + M[..., 3]
+                    m[..., 0] = tmp.real
+                    m[..., 1] = tmp.imag
+        return m
+
+    if what == "box":
+        m = np.empty(M.shape[:-1] + (2, 2), dtype=dtype_real_to_complex(M.dtype))
+        if spin.is_unpolarized:
+            # no spin-density
+            m[...] = 0.0
+            m[..., 0, 0] = M[..., 0]
+            m[..., 1, 1] = M[..., 0]
+        elif spin.is_polarized:
+            m[...] = 0.0
+            m[..., 0, 0] = M[..., 0]
+            m[..., 1, 1] = M[..., 1]
+        elif spin.is_noncolinear:
+            if spin.dkind in ("f", "i"):
+                m[..., 0, 0] = M[..., 0]
+                m[..., 1, 1] = M[..., 1]
+                m[..., 0, 1] = M[..., 2] + 1j * M[..., 3]
+                m[..., 1, 0] = m[..., 0, 1].conj()
+            else:
+                m[..., 0, 0] = M[..., 0]
+                m[..., 1, 1] = M[..., 1]
+                m[..., 0, 1] = M[..., 2]
+                m[..., 1, 0] = M[..., 2].conj()
+        else:
+            if spin.dkind in ("f", "i"):
+                m[..., 0, 0] = M[..., 0] + 1j * M[..., 4]
+                m[..., 1, 1] = M[..., 1] + 1j * M[..., 5]
+                m[..., 0, 1] = M[..., 2] + 1j * M[..., 3]
+                m[..., 1, 0] = M[..., 6] + 1j * M[..., 7]
+            else:
+                m[..., 0, 0] = M[..., 0]
+                m[..., 1, 1] = M[..., 1]
+                m[..., 0, 1] = M[..., 2]
+                m[..., 1, 0] = M[..., 3]
+
+        return m
+
+    raise ValueError(f"Wrong 'what' argument got {what}.")
+
+
 @set_module("sisl.physics")
 class SparseOrbitalBZ(SparseOrbital):
     r"""Sparse object containing the orbital connections in a Brillouin zone
@@ -84,6 +164,8 @@ def __init__(
 
     def _reset(self):
         r"""Reset object according to the options, please refer to `SparseOrbital.reset` for details"""
+        # Update the shape
+        self._csr._shape = self.shape[:-1] + self._csr._D.shape[-1:]
         if self.orthogonal:
             self.Sk = self._Sk_diagonal
             self.S_idx = -100
@@ -763,6 +845,9 @@ def _reset(self):
         r"""Reset object according to the options, please refer to `SparseOrbital.reset` for details"""
         super()._reset()
 
+        # Update the dtype of the spin
+        self._spin = Spin(self.spin, dtype=self.dtype)
+
         if self.spin.is_unpolarized:
             self.UP = 0
             self.DOWN = 0
@@ -780,7 +865,7 @@ def _reset(self):
             self.dSk = self._dSk
 
         elif self.spin.is_noncolinear:
-            if self.spin.dkind == "f":
+            if self.spin.dkind in ("f", "i"):
                 self.M11 = 0
                 self.M22 = 1
                 self.M12r = 2
@@ -789,7 +874,6 @@ def _reset(self):
                 self.M11 = 0
                 self.M22 = 1
                 self.M12 = 2
-                raise NotImplementedError("Currently not implemented")
             self.Pk = self._Pk_non_colinear
             self.Sk = self._Sk_non_colinear
             self.dPk = self._dPk_non_colinear
@@ -798,7 +882,7 @@ def _reset(self):
             self.ddSk = self._ddSk_non_colinear
 
         elif self.spin.is_spinorbit:
-            if self.spin.dkind == "f":
+            if self.spin.dkind in ("f", "i"):
                 self.SX = np.array([0, 0, 1, 0, 0, 0, 1, 0], self.dtype)
                 self.SY = np.array([0, 0, 0, -1, 0, 0, 0, 1], self.dtype)
                 self.SZ = np.array([1, -1, 0, 0, 0, 0, 0, 0], self.dtype)
@@ -815,7 +899,7 @@ def _reset(self):
                 self.M22 = 1
                 self.M12 = 2
                 self.M21 = 3
-                raise NotImplementedError("Currently not implemented")
+
             # The overlap is the same as non-collinear
             self.Pk = self._Pk_spin_orbit
             self.Sk = self._Sk_non_colinear
@@ -836,7 +920,7 @@ def spin(self):
         r"""Associated spin class"""
         return self._spin
 
-    def create_construct(self, R, param):
+    def create_construct(self, R, params):
         r"""Create a simple function for passing to the `construct` function.
 
         This is to relieve the creation of simplistic
@@ -846,7 +930,7 @@ def create_construct(self, R, param):
 
         >>> def func(self, ia, atoms, atoms_xyz=None):
         ...     idx = self.geometry.close(ia, R=R, atoms=atoms, atoms_xyz=atoms_xyz)
-        ...     for ix, p in zip(idx, param):
+        ...     for ix, p in zip(idx, params):
         ...         self[ia, ix] = p
 
         In the non-colinear case the matrix element :math:`\mathbf M_{ij}` will be set
@@ -865,79 +949,97 @@ def create_construct(self, R, param):
 
         Parameters
         ----------
-        R : array_like
+        R :
            radii parameters for different shells.
-           Must have same length as `param` or one less.
+           Must have same length as `params` or one less.
            If one less it will be extended with ``R[0]/100``
-        param : array_like
+        params :
            coupling constants corresponding to the `R`
-           ranges. ``param[0,:]`` are the elements
+           ranges. ``params[0,:]`` are the elements
            for the all atoms within ``R[0]`` of each atom.
 
         See Also
         --------
         construct : routine to create the sparse matrix from a generic function (as returned from `create_construct`)
         """
-        if len(R) != len(param):
+        if len(R) != len(params):
             raise ValueError(
-                f"{self.__class__.__name__}.create_construct got different lengths of `R` and `param`"
+                f"{self.__class__.__name__}.create_construct got different lengths of 'R' and 'params'"
             )
         if not self.spin.is_diagonal:
+            # This portion of code splits the construct into doing Hermitian
+            # assignments. This probably needs rigorous testing.
+
+            dtype_cplx = dtype_real_to_complex(self.dtype)
+
             is_complex = self.dkind == "c"
             if self.spin.is_spinorbit:
                 if is_complex:
                     nv = 4
                     # Hermitian parameters
-                    paramH = [
-                        [p[0].conj(), p[1].conj(), p[3].conj(), p[2].conj(), *p[4:]]
-                        for p in param
+                    # The input order is [uu, dd, ud, du]
+                    paramsH = [
+                        [
+                            p[0].conjugate(),
+                            p[1].conjugate(),
+                            p[3].conjugate(),
+                            p[2].conjugate(),
+                            *p[4:],
+                        ]
+                        for p in params
                     ]
                 else:
                     nv = 8
                     # Hermitian parameters
-                    paramH = [
+                    # The input order is [Ruu, Rdd, Rud, Iud, Iuu, Idd, Rdu, Idu]
+                    paramsH = [
                         [p[0], p[1], p[6], -p[7], -p[4], -p[5], p[2], -p[3], *p[8:]]
-                        for p in param
+                        for p in params
                     ]
                 if not self.orthogonal:
                     nv += 1
 
                 # ensure we have correct number of values
-                assert all(len(p) == nv for p in param)
+                assert all(len(p) == nv for p in params)
 
                 if R[0] <= 0.1001:  # no atom closer than 0.1001 Ang!
                     # We check that the the parameters here is Hermitian
-                    p = param[0]
+                    p = params[0]
                     if is_complex:
-                        onsite = np.array([[p[0], p[2]], [p[3], p[1]]], self.dtype)
+                        onsite = np.array([[p[0], p[2]], [p[3], p[1]]], dtype_cplx)
                     else:
                         onsite = np.array(
                             [
                                 [p[0] + 1j * p[4], p[2] + 1j * p[3]],
                                 [p[6] + 1j * p[7], p[1] + 1j * p[5]],
                             ],
-                            np.complex128,
+                            dtype_cplx,
                         )
-                    if not np.allclose(onsite, onsite.T.conj()):
+                    if not np.allclose(onsite, onsite.T.conjugate()):
                         warn(
-                            f"{self.__class__.__name__}.create_construct is NOT Hermitian for on-site terms. This is your responsibility!"
+                            f"{self.__class__.__name__}.create_construct is NOT "
+                            "Hermitian for on-site terms. This is your responsibility! "
+                            "The code will continue silently, be AWARE!"
                         )
 
             elif self.spin.is_noncolinear:
                 if is_complex:
                     nv = 3
                     # Hermitian parameters
-                    paramH = [[p[0].conj(), p[1].conj(), p[2], *p[3:]] for p in param]
+                    paramsH = [
+                        [p[0].conjugate(), p[1].conjugate(), p[2], *p[3:]]
+                        for p in params
+                    ]
                 else:
                     nv = 4
                     # Hermitian parameters
-                    # Note that we don"t need to do anything here.
+                    # Note that we don't need to do anything here.
                     # H_ij = [[0, 2 + 1j 3],
                     #         [2 - 1j 3, 1]]
                     # H_ji = [[0, 2 + 1j 3],
                     #         [2 - 1j 3, 1]]
                     # H_ij^H == H_ji^H
-                    paramH = param
+                    paramsH = params
                 if not self.orthogonal:
                     nv += 1
 
@@ -945,21 +1047,25 @@ def create_construct(self, R, param):
                 # Since the values are ensured Hermitian in the on-site case anyways.
 
                 # ensure we have correct number of values
-                assert all(len(p) == nv for p in param)
+                assert all(len(p) == nv for p in params)
 
             na = self.geometry.na
 
             # Now create the function that returns the assignment function
             def func(self, ia, atoms, atoms_xyz=None):
                 idx = self.geometry.close(ia, R=R, atoms=atoms, atoms_xyz=atoms_xyz)
-                for ix, p, pc in zip(idx, param, paramH):
+                for ix, p, pc in zip(idx, params, paramsH):
                     ix_ge = (ix % na) >= ia
                     self[ia, ix[ix_ge]] = p
                     self[ia, ix[~ix_ge]] = pc
 
+            func.R = R
+            func.params = params
+            func.paramsH = paramsH
+
             return func
 
-        return super().create_construct(R, param)
+        return super().create_construct(R, params)
 
     def __len__(self):
         r"""Returns number of rows in the basis (if non-collinear or spin-orbit, twice the number of orbitals)"""
@@ -1403,7 +1509,7 @@ def transpose(self, hermitian: bool = False, spin: bool = True, sort: bool = Tru
         if sp.is_spinorbit:
             if hermitian and spin:
                 # conjugate the imaginary value and transpose spin-box
-                if sp.dkind == "f":
+                if sp.dkind in ("f", "i"):
                     # imaginary components (including transposing)
                     #    12,11,22,21
                     D[:, [3, 4, 5, 7]] = -D[:, [7, 4, 5, 3]]
@@ -1413,7 +1519,7 @@ def transpose(self, hermitian: bool = False, spin: bool = True, sort: bool = Tru
                     D[:, [0, 1, 2, 3]] = np.conj(D[:, [0, 1, 3, 2]])
             elif hermitian:
                 # conjugate the imaginary value
-                if sp.dkind == "f":
+                if sp.dkind in ("f", "i"):
                     # imaginary components
                     #    12,11,22,21
                     D[:, [3, 4, 5, 7]] *= -1.0
@@ -1421,7 +1527,7 @@ def transpose(self, hermitian: bool = False, spin: bool = True, sort: bool = Tru
                     D[:, :] = np.conj(D[:, :])
             elif spin:
                 # transpose spin-box, 12 <-> 21
-                if sp.dkind == "f":
+                if sp.dkind in ("f", "i"):
                     D[:, [2, 3, 6, 7]] = D[:, [6, 7, 2, 3]]
                 else:
                     D[:, [2, 3]] = D[:, [3, 2]]
@@ -1438,7 +1544,7 @@ def transpose(self, hermitian: bool = False, spin: bool = True, sort: bool = Tru
                 # So for transposing we should negate the sign
                 # to ensure we put the opposite value in the
                 # correct place.
-                if sp.dkind == "f":
+                if sp.dkind in ("f", "i"):
                     D[:, 3] = -D[:, 3]
                 else:
                     D[:, 2] = np.conj(D[:, 2])
@@ -1462,7 +1568,7 @@ def trs(self):
 
         # Apply Pauli-Y on the left and right of each spin-box
         if sp.is_spinorbit:
-            if sp.dkind == "f":
+            if sp.dkind in ("f", "i"):
                 # [R11, R22, R12, I12, I11, I22, R21, I21]
                 # [R11, R22] = [R22, R11]
                 # [I12, I21] = [I21, I12] (conj + Y @ Y[sign-changes conj])
@@ -1473,7 +1579,7 @@ def trs(self):
             else:
                 raise NotImplementedError
         elif sp.is_noncolinear:
-            if sp.dkind == "f":
+            if sp.dkind in ("f", "i"):
                 # [R11, R22, R12, I12]
                 D[:, 2] = -D[:, 2]
             else:
@@ -1519,6 +1625,12 @@ def transform(self, matrix=None, dtype=None, spin=None, orthogonal=None):
         The transformation matrix does *not* act on the rows and columns, only on the
         final dimension of the matrix.
 
+        The matrix transformation is done like this:
+
+        >>> out = M_in @ matrix.T
+
+        I.e. ``matrix[i, :]`` holds the input-element factors for ``out[..., i]``.
+
         Parameters
         ----------
         matrix : array_like, optional
@@ -1593,7 +1705,7 @@ def transform(self, matrix=None, dtype=None, spin=None, orthogonal=None):
         )
         new._csr = self._csr.transform(matrix, dtype=dtype)
 
-        if not orthogonal and self.orthogonal:
+        if self.orthogonal and not orthogonal:
             # set identity overlap matrix, loop over rows
             for i in range(new._csr.shape[0]):
                 new._csr[i, i, -1] = 1.0
diff --git a/src/sisl/physics/spin.py b/src/sisl/physics/spin.py
index c86510ee69..191c510916 100644
--- a/src/sisl/physics/spin.py
+++ b/src/sisl/physics/spin.py
@@ -56,7 +56,7 @@ class Spin:
     #: The :math:`\boldsymbol\sigma_z` Pauli matrix
     Z = np.array([[1, 0], [0, -1]], np.complex128)
 
-    __slots__ = ("_size", "_kind", "_dtype")
+    __slots__ = ("_kind", "_dtype")
 
     def __init__(self, kind="", dtype=None):
         if isinstance(kind, Spin):
@@ -64,7 +64,6 @@ def __init__(self, kind="", dtype=None):
                 dtype = kind._dtype
             self._kind = kind._kind
             self._dtype = dtype
-            self._size = kind._size
             return
 
         if dtype is None:
@@ -107,24 +106,6 @@ def __init__(self, kind="", dtype=None):
         # Now assert the checks
         self._kind = kind
 
-        if np.dtype(dtype).kind == "c":
-            size = {
-                self.UNPOLARIZED: 1,
-                self.POLARIZED: 2,
-                self.NONCOLINEAR: 4,
-                self.SPINORBIT: 4,
-            }.get(kind)
-
-        else:
-            size = {
-                self.UNPOLARIZED: 1,
-                self.POLARIZED: 2,
-                self.NONCOLINEAR: 4,
-                self.SPINORBIT: 8,
-            }.get(kind)
-
-        self._size = size
-
     def __str__(self):
         if self.is_unpolarized:
             return f"{self.__class__.__name__}{{unpolarized, kind={self.dkind}}}"
@@ -151,12 +132,32 @@ def dkind(self):
     @property
     def size(self):
         """Number of elements to describe the spin-components"""
-        return self._size
+        size = {
+            "c": {
+                self.UNPOLARIZED: 1,
+                self.POLARIZED: 2,
+                self.NONCOLINEAR: 3,
+                self.SPINORBIT: 4,
+            },
+            "i": {
+                self.UNPOLARIZED: 1,
+                self.POLARIZED: 2,
+                self.NONCOLINEAR: 4,
+                self.SPINORBIT: 8,
+            },
+            "f": {
+                self.UNPOLARIZED: 1,
+                self.POLARIZED: 2,
+                self.NONCOLINEAR: 4,
+                self.SPINORBIT: 8,
+            },
+        }[self.dkind][self.kind]
+        return size
 
     @property
     def spinor(self):
         """Number of spinor components (1 or 2)"""
-        return min(2, self._size)
+        return min(2, self.size)
 
     @property
     def kind(self):
@@ -196,7 +197,7 @@ def is_spinorbit(self):
         return self.kind == Spin.SPINORBIT
 
     def __len__(self):
-        return self._size
+        return self.size
 
     # Comparisons
     def __lt__(self, other):
@@ -221,6 +222,5 @@ def __getstate__(self):
         return {"size": self.size, "kind": self.kind, "dtype": self.dtype}
 
     def __setstate__(self, state):
-        self._size = state["size"]
         self._kind = state["kind"]
         self._dtype = state["dtype"]