From ef20647d0d24d0c75aa586d938b974f631976421 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 25 Oct 2024 17:42:19 -0700 Subject: [PATCH 01/69] import converter dependencies lazily --- hls4ml/converters/keras/qkeras.py | 4 ++-- hls4ml/converters/keras_to_hls.py | 4 ++-- hls4ml/converters/onnx_to_hls.py | 7 ++++--- hls4ml/converters/pytorch_to_hls.py | 4 ++-- hls4ml/model/quantizers.py | 20 ++++++++++++-------- hls4ml/optimization/__init__.py | 4 +--- 6 files changed, 23 insertions(+), 20 deletions(-) diff --git a/hls4ml/converters/keras/qkeras.py b/hls4ml/converters/keras/qkeras.py index 7357d95aed..d1910c070d 100644 --- a/hls4ml/converters/keras/qkeras.py +++ b/hls4ml/converters/keras/qkeras.py @@ -1,5 +1,3 @@ -from qkeras.quantizers import get_quantizer - from hls4ml.converters.keras.convolution import parse_conv1d_layer, parse_conv2d_layer from hls4ml.converters.keras.core import parse_batchnorm_layer, parse_dense_layer from hls4ml.converters.keras.recurrent import parse_rnn_layer @@ -88,6 +86,8 @@ def parse_qrnn_layer(keras_layer, input_names, input_shapes, data_reader): @keras_handler('QActivation') def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader): + from qkeras.quantizers import get_quantizer + assert keras_layer['class_name'] == 'QActivation' supported_activations = [ 'quantized_relu', diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index e31e2b96a9..9fc63cf398 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -160,9 +160,9 @@ def get_model_arch(config): # Model instance passed in config from API keras_model = config['KerasModel'] if isinstance(keras_model, str): - from tensorflow.keras.models import load_model + import keras - keras_model = load_model(keras_model) + keras_model = keras.models.load_model(keras_model) model_arch = json.loads(keras_model.to_json()) reader = KerasModelReader(keras_model) elif 'KerasJson' in config: diff --git a/hls4ml/converters/onnx_to_hls.py b/hls4ml/converters/onnx_to_hls.py index 75850fa93e..99281888f3 100644 --- a/hls4ml/converters/onnx_to_hls.py +++ b/hls4ml/converters/onnx_to_hls.py @@ -1,6 +1,3 @@ -import onnx -from onnx import helper, numpy_helper - from hls4ml.model import ModelGraph @@ -21,6 +18,8 @@ def replace_char_inconsitency(name): def get_onnx_attribute(operation, name, default=None): + from onnx import helper + attr = next((x for x in operation.attribute if x.name == name), None) if attr is None: value = default @@ -76,6 +75,7 @@ def get_input_shape(graph, node): def get_constant_value(graph, constant_name): tensor = next((x for x in graph.initializer if x.name == constant_name), None) + from onnx import numpy_helper return numpy_helper.to_array(tensor) @@ -273,6 +273,7 @@ def onnx_to_hls(config): # Extract model architecture print('Interpreting Model ...') + import onnx onnx_model = onnx.load(config['OnnxModel']) if isinstance(config['OnnxModel'], str) else config['OnnxModel'] layer_list, input_layers, output_layers = parse_onnx_model(onnx_model) diff --git a/hls4ml/converters/pytorch_to_hls.py b/hls4ml/converters/pytorch_to_hls.py index 79ca1fa5c6..3ec5b17691 100644 --- a/hls4ml/converters/pytorch_to_hls.py +++ b/hls4ml/converters/pytorch_to_hls.py @@ -1,5 +1,3 @@ -import torch - from hls4ml.model import ModelGraph @@ -26,6 +24,8 @@ def get_weights_data(self, layer_name, var_name): class PyTorchFileReader(PyTorchModelReader): # Inherit get_weights_data method def __init__(self, config): + import torch + self.config = config 
if not torch.cuda.is_available(): diff --git a/hls4ml/model/quantizers.py b/hls4ml/model/quantizers.py index a5b9ceb8c4..b445c70af3 100644 --- a/hls4ml/model/quantizers.py +++ b/hls4ml/model/quantizers.py @@ -5,8 +5,6 @@ """ import numpy as np -import tensorflow as tf -from qkeras.quantizers import get_quantizer from hls4ml.model.types import ( ExponentPrecisionType, @@ -87,6 +85,8 @@ class QKerasQuantizer(Quantizer): """ def __init__(self, config): + from qkeras.quantizers import get_quantizer + self.quantizer_fn = get_quantizer(config) self.alpha = config['config'].get('alpha', None) if config['class_name'] == 'quantized_bits': @@ -106,8 +106,8 @@ def __init__(self, config): self.hls_type = FixedPrecisionType(width=16, integer=6, signed=True) def __call__(self, data): - tf_data = tf.convert_to_tensor(data) - return self.quantizer_fn(tf_data).numpy() + data = np.array(data, dtype='float32') + return self.quantizer_fn(data).numpy() # return self.quantizer_fn(data) def _get_type(self, quantizer_config): @@ -132,6 +132,8 @@ class QKerasBinaryQuantizer(Quantizer): """ def __init__(self, config, xnor=False): + from qkeras.quantizers import get_quantizer + self.bits = 1 if xnor else 2 self.hls_type = XnorPrecisionType() if xnor else IntegerPrecisionType(width=2, signed=True) self.alpha = config['config']['alpha'] @@ -141,8 +143,8 @@ def __init__(self, config, xnor=False): self.binary_quantizer = BinaryQuantizer(1) if xnor else BinaryQuantizer(2) def __call__(self, data): - x = tf.convert_to_tensor(data) - y = self.quantizer_fn(x).numpy() + data = np.array(data, dtype='float32') + y = self.quantizer_fn(data).numpy() return self.binary_quantizer(y) @@ -154,14 +156,16 @@ class QKerasPO2Quantizer(Quantizer): """ def __init__(self, config): + from qkeras.quantizers import get_quantizer + self.bits = config['config']['bits'] self.quantizer_fn = get_quantizer(config) self.hls_type = ExponentPrecisionType(width=self.bits, signed=True) def __call__(self, data): # Weights are quantized to nearest power of two - x = tf.convert_to_tensor(data) - y = self.quantizer_fn(x) + data = np.array(data, dtype='float32') + y = self.quantizer_fn(data) if hasattr(y, 'numpy'): y = y.numpy() return y diff --git a/hls4ml/optimization/__init__.py b/hls4ml/optimization/__init__.py index c626b70c2b..2b49886e39 100644 --- a/hls4ml/optimization/__init__.py +++ b/hls4ml/optimization/__init__.py @@ -1,3 +1 @@ -from .dsp_aware_pruning import optimize_keras_model_for_hls4ml # noqa: F401 -from .dsp_aware_pruning.attributes import get_attributes_from_keras_model_and_hls4ml_config # noqa: F401 -from .dsp_aware_pruning.keras import optimize_model # noqa: F401 +# No imports as each of the optimization modules may contain different dependencies. 
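
The pattern applied throughout the patch above: third-party imports (qkeras, tensorflow, onnx, torch) move from module scope into the function or method bodies that actually use them, so that `import hls4ml` no longer requires those packages to be installed. A minimal sketch of the idiom, using a hypothetical helper name rather than code from the patch (the real call sites are in the hunks above, e.g. `QKerasQuantizer.__init__` and `parse_qactivation_layer`):

    def _quantizer_from_config(quantizer_config):
        # Imported inside the function rather than at module scope, so that
        # importing hls4ml succeeds without qkeras installed; an ImportError
        # only surfaces when a QKeras quantizer is actually requested.
        from qkeras.quantizers import get_quantizer

        return get_quantizer(quantizer_config)

Patches 10/69 and 11/69 below formalize this idiom with a `requires()` decorator in `hls4ml/utils/dependency.py` that prints an installation hint when one of these deferred imports fails.
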
From 028b4d0dd63257c2662d32e818fd606b72c1eca6 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 25 Oct 2024 20:12:28 -0700 Subject: [PATCH 02/69] make tf and qkeras optionl, stop assuming keras is tf.keras --- hls4ml/converters/__init__.py | 6 ++--- hls4ml/model/optimizer/passes/qkeras.py | 3 ++- hls4ml/model/profiling.py | 26 ++++++++++++++----- .../dsp_aware_pruning/keras/__init__.py | 4 --- hls4ml/utils/config.py | 4 +-- hls4ml/writer/catapult_writer.py | 4 ++- hls4ml/writer/quartus_writer.py | 4 ++- hls4ml/writer/vivado_writer.py | 4 ++- 8 files changed, 35 insertions(+), 20 deletions(-) diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py index 3d7ce1fe56..1343907b54 100644 --- a/hls4ml/converters/__init__.py +++ b/hls4ml/converters/__init__.py @@ -93,10 +93,10 @@ def parse_yaml_config(config_file): """ def construct_keras_model(loader, node): - from tensorflow.keras.models import load_model - model_str = loader.construct_scalar(node) - return load_model(model_str) + import keras + + return keras.models.load_model(model_str) yaml.add_constructor('!keras_model', construct_keras_model, Loader=yaml.SafeLoader) diff --git a/hls4ml/model/optimizer/passes/qkeras.py b/hls4ml/model/optimizer/passes/qkeras.py index 03690bed0d..fb02d4eccf 100644 --- a/hls4ml/model/optimizer/passes/qkeras.py +++ b/hls4ml/model/optimizer/passes/qkeras.py @@ -1,5 +1,4 @@ import numpy as np -import tensorflow as tf from hls4ml.model.layers import ApplyAlpha from hls4ml.model.optimizer import ConfigurableOptimizerPass, OptimizerPass, register_pass @@ -113,6 +112,8 @@ def match(self, node): def transform(self, model, node): # The quantizer has to be applied to set the scale attribute # This must be applied to the _unquantized_ weights to obtain the correct scale + import tensorflow as tf + quantizer = node.weights['weight'].quantizer.quantizer_fn # get QKeras quantizer weights = node.weights['weight'].data_unquantized # get weights qweights = quantizer(tf.convert_to_tensor(weights)) diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py index 84a83de23e..a7fee506e5 100644 --- a/hls4ml/model/profiling.py +++ b/hls4ml/model/profiling.py @@ -13,12 +13,11 @@ from hls4ml.model.layers import GRU, LSTM, SeparableConv1D, SeparableConv2D try: - import qkeras - from tensorflow import keras + import keras - __tf_profiling_enabled__ = True + __keras_profiling_enabled__ = True except ImportError: - __tf_profiling_enabled__ = False + __keras_profiling_enabled__ = False try: import torch @@ -27,6 +26,19 @@ except ImportError: __torch_profiling_enabled__ = False +try: + import qkeras + + __qkeras_profiling_enabled__ = True +except ImportError: + __qkeras_profiling_enabled__ = False + +_activations = list() +if __keras_profiling_enabled__: + _activations.append(keras.layers.Activation) +if __qkeras_profiling_enabled__: + _activations.append(qkeras.qactivations) + def get_unoptimized_hlsmodel(model): from hls4ml.converters import convert_from_config @@ -482,7 +494,7 @@ def numerical(model=None, hls_model=None, X=None, plot='boxplot'): if hls_model_present: data = weights_hlsmodel(hls_model_unoptimized, fmt='summary', plot=plot) elif model_present: - if __tf_profiling_enabled__ and isinstance(model, keras.Model): + if __keras_profiling_enabled__ and isinstance(model, keras.Model): data = weights_keras(model, fmt='summary', plot=plot) elif __torch_profiling_enabled__ and isinstance(model, torch.nn.Sequential): data = weights_torch(model, fmt='summary', plot=plot) @@ -520,7 +532,7 @@ def 
numerical(model=None, hls_model=None, X=None, plot='boxplot'): if X is not None: print("Profiling activations" + before) data = None - if __tf_profiling_enabled__ and isinstance(model, keras.Model): + if __keras_profiling_enabled__ and isinstance(model, keras.Model): data = activations_keras(model, X, fmt='summary', plot=plot) elif __torch_profiling_enabled__ and isinstance(model, torch.nn.Sequential): data = activations_torch(model, X, fmt='summary', plot=plot) @@ -590,7 +602,7 @@ def get_ymodel_keras(keras_model, X): if ( hasattr(layer, 'activation') and layer.activation is not None - and not isinstance(layer, (keras.layers.Activation, qkeras.qlayers.QActivation)) + and not isinstance(layer, _activations) and layer.activation.__name__ != 'linear' ): tmp_activation = layer.activation diff --git a/hls4ml/optimization/dsp_aware_pruning/keras/__init__.py b/hls4ml/optimization/dsp_aware_pruning/keras/__init__.py index 29012bd39e..b525f58a33 100644 --- a/hls4ml/optimization/dsp_aware_pruning/keras/__init__.py +++ b/hls4ml/optimization/dsp_aware_pruning/keras/__init__.py @@ -4,9 +4,6 @@ import numpy as np import tensorflow as tf -# Enables printing of loss tensors during custom training loop -from tensorflow.python.ops.numpy_ops import np_config - import hls4ml.optimization.dsp_aware_pruning.keras.utils as utils from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES from hls4ml.optimization.dsp_aware_pruning.keras.builder import build_optimizable_model, remove_custom_regularizers @@ -15,7 +12,6 @@ from hls4ml.optimization.dsp_aware_pruning.keras.reduction import reduce_model from hls4ml.optimization.dsp_aware_pruning.scheduler import OptimizationScheduler -np_config.enable_numpy_behavior() default_regularization_range = np.logspace(-6, -2, num=16).tolist() diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index e450084095..6a356f5f27 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -1,7 +1,5 @@ import json -import qkeras - import hls4ml @@ -48,6 +46,8 @@ def create_config(output_dir='my-hls-test', project_name='myproject', backend='V def _get_precision_from_quantizer(quantizer): if isinstance(quantizer, str): + import qkeras + quantizer_obj = qkeras.get_quantizer(quantizer) quantizer = {} # Some activations are classes with get_config method diff --git a/hls4ml/writer/catapult_writer.py b/hls4ml/writer/catapult_writer.py index 7db1063206..9a48460995 100755 --- a/hls4ml/writer/catapult_writer.py +++ b/hls4ml/writer/catapult_writer.py @@ -889,7 +889,9 @@ def keras_model_representer(dumper, keras_model): return dumper.represent_scalar('!keras_model', model_path) try: - from tensorflow.keras import Model as KerasModel + import keras + + KerasModel = keras.models.Model yaml.add_multi_representer(KerasModel, keras_model_representer) except Exception: diff --git a/hls4ml/writer/quartus_writer.py b/hls4ml/writer/quartus_writer.py index 932a8b6a6d..1d61bde1f4 100644 --- a/hls4ml/writer/quartus_writer.py +++ b/hls4ml/writer/quartus_writer.py @@ -1327,7 +1327,9 @@ def keras_model_representer(dumper, keras_model): return dumper.represent_scalar('!keras_model', model_path) try: - from tensorflow.keras import Model as KerasModel + import keras + + KerasModel = keras.models.Model yaml.add_multi_representer(KerasModel, keras_model_representer) except Exception: diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 0341959045..6531f9db87 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -817,7 
+817,9 @@ def keras_model_representer(dumper, keras_model): return dumper.represent_scalar('!keras_model', model_path) try: - from tensorflow.keras import Model as KerasModel + import keras + + KerasModel = keras.models.Model yaml.add_multi_representer(KerasModel, keras_model_representer) except Exception: From 72eb0531b2a824f437f00d1a4c357c702db5148c Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 25 Oct 2024 20:34:29 -0700 Subject: [PATCH 03/69] less mandatory dependency --- hls4ml/model/profiling.py | 8 ++++---- hls4ml/report/quartus_report.py | 6 +++--- setup.cfg | 18 +++++++++++++----- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py index a7fee506e5..6def53f7d1 100644 --- a/hls4ml/model/profiling.py +++ b/hls4ml/model/profiling.py @@ -33,11 +33,11 @@ except ImportError: __qkeras_profiling_enabled__ = False -_activations = list() +__keras_activations = list() if __keras_profiling_enabled__: - _activations.append(keras.layers.Activation) + __keras_activations.append(keras.layers.Activation) if __qkeras_profiling_enabled__: - _activations.append(qkeras.qactivations) + __keras_activations.append(qkeras.QActivation) def get_unoptimized_hlsmodel(model): @@ -602,7 +602,7 @@ def get_ymodel_keras(keras_model, X): if ( hasattr(layer, 'activation') and layer.activation is not None - and not isinstance(layer, _activations) + and not isinstance(layer, tuple(__keras_activations)) and layer.activation.__name__ != 'linear' ): tmp_activation = layer.activation diff --git a/hls4ml/report/quartus_report.py b/hls4ml/report/quartus_report.py index c337e5de10..47fc43c132 100644 --- a/hls4ml/report/quartus_report.py +++ b/hls4ml/report/quartus_report.py @@ -2,9 +2,6 @@ import webbrowser from ast import literal_eval -from calmjs.parse import asttypes, es5 -from tabulate import tabulate - def parse_quartus_report(hls_dir, write_to_file=True): ''' @@ -53,6 +50,8 @@ def read_quartus_report(hls_dir, open_browser=False): Returns: None ''' + from tabulate import tabulate + report = parse_quartus_report(hls_dir) print('HLS Resource Summary\n') @@ -100,6 +99,7 @@ def read_js_object(js_script): Returns: Dictionary of variables defines in script ''' + from calmjs.parse import asttypes, es5 def visit(node): if isinstance(node, asttypes.Program): diff --git a/setup.cfg b/setup.cfg index 0b81e7b592..c987f1c317 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,17 +22,15 @@ description_file = README.md [options] packages = find: install_requires = - calmjs.parse h5py numpy - onnx>=1.4.0 + pyyaml pydigitalwavetools==1.1 pyparsing pyyaml - tabulate - tensorflow>=2.8.0,<=2.14.1 - tensorflow-model-optimization<=0.7.5 + python_requires = >=3.10, <3.12 +python_requires = >=3.10 include_package_data = True scripts = scripts/hls4ml @@ -51,14 +49,24 @@ profiling = matplotlib pandas seaborn +qkeras = + qkeras + tensorflow>=2.8.0,<=2.14.1 + tensorflow-model-optimization<=0.7.5 +quantus_report = + calmjs.parse + tabulate sr = sympy testing = HGQ~=0.2.0 + calmjs.parse + onnx>=1.4.0 pytest pytest-cov pytest-randomly qonnx + tabulate torch [check-manifest] From 63af2acd17abb09a2bcb619c61dc0edaebb85691 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sat, 26 Oct 2024 09:11:34 -0700 Subject: [PATCH 04/69] fix dsp_aware_pruning test import path --- test/pytest/test_optimization/test_attributes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/test_optimization/test_attributes.py b/test/pytest/test_optimization/test_attributes.py index 
a42d3a6751..c9e22091f2 100644 --- a/test/pytest/test_optimization/test_attributes.py +++ b/test/pytest/test_optimization/test_attributes.py @@ -1,7 +1,7 @@ from tensorflow.keras.layers import Conv2D, Dense, Flatten, ReLU from tensorflow.keras.models import Sequential -from hls4ml.optimization import get_attributes_from_keras_model_and_hls4ml_config +from hls4ml.optimization.dsp_aware_pruning import get_attributes_from_keras_model_and_hls4ml_config from hls4ml.utils.config import config_from_keras_model From c11dddb59c8666dbd4429ee879f0f0a385630f41 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 15 Dec 2024 06:31:39 +0000 Subject: [PATCH 05/69] fix broken setup.cfg after rebase, rm pyparsing --- setup.cfg | 4 ---- 1 file changed, 4 deletions(-) diff --git a/setup.cfg b/setup.cfg index c987f1c317..1d4241f063 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,12 +24,8 @@ packages = find: install_requires = h5py numpy - pyyaml pydigitalwavetools==1.1 - pyparsing pyyaml - -python_requires = >=3.10, <3.12 python_requires = >=3.10 include_package_data = True scripts = scripts/hls4ml From d9aaa1a73273cf25362669aae1972d9b51aaf131 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 15 Dec 2024 06:31:59 +0000 Subject: [PATCH 06/69] purge qkeras workaround --- hls4ml/__init__.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/hls4ml/__init__.py b/hls4ml/__init__.py index e3a7247b0d..0ff5e52ac9 100644 --- a/hls4ml/__init__.py +++ b/hls4ml/__init__.py @@ -1,33 +1,3 @@ -# Temporary workaround for QKeras installation requirement, will be removed after 1.0.0 -def maybe_install_qkeras(): - import subprocess - import sys - - QKERAS_PKG_NAME = 'QKeras' - # QKERAS_PKG_SOURCE = QKERAS_PKG_NAME - QKERAS_PKG_SOURCE = 'qkeras@git+https://github.com/fastmachinelearning/qkeras.git' - - def pip_list(): - p = subprocess.run([sys.executable, '-m', 'pip', 'list'], check=True, capture_output=True) - return p.stdout.decode() - - def pip_install(package): - subprocess.check_call([sys.executable, '-m', 'pip', 'install', package]) - - all_pkgs = pip_list() - if QKERAS_PKG_NAME not in all_pkgs: - print('QKeras installation not found, installing one...') - pip_install(QKERAS_PKG_SOURCE) - print('QKeras installed.') - - -try: - maybe_install_qkeras() -except Exception: - print('Could not find QKeras installation, make sure you have QKeras installed.') - -# End of workaround - from hls4ml import converters, report, utils # noqa: F401, E402 try: From 485442368b7a376436c09e24076587c15ca8945f Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 15 Dec 2024 22:57:36 +0000 Subject: [PATCH 07/69] switch to pyproject.toml switch to pyproject.toml include pyproject.toml after install --- .pre-commit-config.yaml | 12 +-- MANIFEST.in | 2 +- scripts/hls4ml => hls4ml/cli/__init__.py | 0 pyproject.toml | 101 ++++++++++++++++++++++- setup.cfg | 74 ----------------- setup.py | 4 - 6 files changed, 104 insertions(+), 89 deletions(-) rename scripts/hls4ml => hls4ml/cli/__init__.py (100%) delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0601a84b2d..9380ac1689 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,6 +9,11 @@ repos: args: ['--line-length=125', '--skip-string-normalization'] +- repo: https://github.com/tox-dev/pyproject-fmt + rev: v2.5.0 + hooks: + - id: pyproject-fmt + - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: @@ -16,6 +21,7 @@ repos: - id: check-case-conflict - id: 
check-merge-conflict - id: check-symlinks + - id: check-toml - id: check-yaml - id: debug-statements - id: end-of-file-fixer @@ -27,7 +33,6 @@ repos: rev: 5.13.2 hooks: - id: isort - args: ["--profile", "black", --line-length=125] - repo: https://github.com/asottile/pyupgrade rev: v3.19.0 @@ -35,11 +40,6 @@ repos: - id: pyupgrade args: ["--py36-plus"] -- repo: https://github.com/asottile/setup-cfg-fmt - rev: v2.7.0 - hooks: - - id: setup-cfg-fmt - - repo: https://github.com/pycqa/flake8 rev: 7.1.1 hooks: diff --git a/MANIFEST.in b/MANIFEST.in index 549cc6983c..7bcfbfaf6d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ -include LICENSE README.md CONTRIBUTING.md CITATION.cff pyproject.toml setup.py setup.cfg .clang-format +include LICENSE README.md CONTRIBUTING.md CITATION.cff pyproject.toml setup.py .clang-format graft example-models graft test graft contrib diff --git a/scripts/hls4ml b/hls4ml/cli/__init__.py similarity index 100% rename from scripts/hls4ml rename to hls4ml/cli/__init__.py diff --git a/pyproject.toml b/pyproject.toml index 6402ab0e7a..b713b41d80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,103 @@ [build-system] -# AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD! -requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5", "wheel"] build-backend = "setuptools.build_meta" +requires = [ "setuptools>=61", "setuptools-scm>=8" ] + +[project] +name = "hls4ml" +version = "1.0.0" +description = "Machine learning in FPGAs using HLS" +readme = "README.md" +license = { text = "Apache-2.0" } +authors = [ { name = "hls4ml Team" } ] +requires-python = ">=3.10" +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: C++", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", +] +dependencies = [ "h5py", "numpy", "pydigitalwavetools==1.1", "pyyaml" ] + +optional-dependencies.doc = [ + "sphinx", + "sphinx-contributors", + "sphinx-github-changelog", + "sphinx-rtd-theme", +] +optional-dependencies.HGQ = [ "hgq~=0.2.0" ] +optional-dependencies.optimization = [ + "keras-tuner==1.1.3", + "ortools==9.4.1874", + "packaging", +] +optional-dependencies.profiling = [ "matplotlib", "pandas", "seaborn" ] +optional-dependencies.qkeras = [ + "qkeras", + "tensorflow>=2.8,<=2.14.1", + "tensorflow-model-optimization<=0.7.5", +] +optional-dependencies.quantus_report = [ "calmjs-parse", "tabulate" ] +optional-dependencies.sr = [ "sympy" ] +optional-dependencies.testing = [ + "calmjs-parse", + "hgq~=0.2.0", + "onnx>=1.4", + "pytest", + "pytest-cov", + "pytest-randomly", + "qonnx", + "tabulate", + "torch", +] +urls.Homepage = "https://fastmachinelearning.org/hls4ml" +scripts.hls4ml = "hls4ml.cli:main" +entry-points.pytest_randomly.random_seeder = "hls4ml:reseed" + +[tool.setuptools] +packages = [ "hls4ml" ] +include-package-data = true + [tool.setuptools_scm] -# See configuration details in https://github.com/pypa/setuptools_scm + version_scheme = "release-branch-semver" -git_describe_command = "git describe --dirty --tags --long --match v* --first-parent" +git_describe_command = [ + "git", + "describe", + "--dirty", + "--tags", + 
"--long", + "--match", + "v*", + "--first-parent", +] write_to = "hls4ml/_version.py" + +[tool.black] +line-length = 125 +skip-string-normalization = true + +[tool.isort] +profile = "black" +line_length = 125 + +[tool.flake8] +max-line-length = 125 +extend-ignore = [ "E203", "T201" ] + +[tool.check-manifest] +ignore = [ + ".github/**", + "docs/**", + ".pre-commit-config.yaml", + "Jenkinsfile", + "hls4ml/_version.py", +] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 1d4241f063..0000000000 --- a/setup.cfg +++ /dev/null @@ -1,74 +0,0 @@ -[metadata] -name = hls4ml -description = Machine learning in FPGAs using HLS -long_description = file: README.md -long_description_content_type = text/markdown -url = https://fastmachinelearning.org/hls4ml -author = hls4ml Team -license = Apache-2.0 -license_files = LICENSE -classifiers = - Development Status :: 4 - Beta - Intended Audience :: Developers - Intended Audience :: Science/Research - License :: OSI Approved :: Apache Software License - Programming Language :: C++ - Programming Language :: Python :: 3 - Programming Language :: Python :: 3 :: Only - Topic :: Software Development :: Libraries - Topic :: Software Development :: Libraries :: Python Modules -description_file = README.md - -[options] -packages = find: -install_requires = - h5py - numpy - pydigitalwavetools==1.1 - pyyaml -python_requires = >=3.10 -include_package_data = True -scripts = scripts/hls4ml - -[options.entry_points] -pytest_randomly.random_seeder = - hls4ml = hls4ml:reseed - -[options.extras_require] -HGQ = - HGQ~=0.2.0 -optimization = - keras-tuner==1.1.3 - ortools==9.4.1874 - packaging -profiling = - matplotlib - pandas - seaborn -qkeras = - qkeras - tensorflow>=2.8.0,<=2.14.1 - tensorflow-model-optimization<=0.7.5 -quantus_report = - calmjs.parse - tabulate -sr = - sympy -testing = - HGQ~=0.2.0 - calmjs.parse - onnx>=1.4.0 - pytest - pytest-cov - pytest-randomly - qonnx - tabulate - torch - -[check-manifest] -ignore = - .github/** - docs/** - .pre-commit-config.yaml - Jenkinsfile - hls4ml/_version.py diff --git a/setup.py b/setup.py deleted file mode 100644 index 1abbd068c1..0000000000 --- a/setup.py +++ /dev/null @@ -1,4 +0,0 @@ -import setuptools - -if __name__ == "__main__": - setuptools.setup() From 06f9cda7a705c2c10e83c71c9bc28edc29af644a Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 15 Dec 2024 22:57:49 +0000 Subject: [PATCH 08/69] format --- hls4ml/converters/onnx_to_hls.py | 2 ++ hls4ml/writer/oneapi_writer.py | 49 ++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/hls4ml/converters/onnx_to_hls.py b/hls4ml/converters/onnx_to_hls.py index 99281888f3..f3b6acbaf3 100644 --- a/hls4ml/converters/onnx_to_hls.py +++ b/hls4ml/converters/onnx_to_hls.py @@ -76,6 +76,7 @@ def get_input_shape(graph, node): def get_constant_value(graph, constant_name): tensor = next((x for x in graph.initializer if x.name == constant_name), None) from onnx import numpy_helper + return numpy_helper.to_array(tensor) @@ -274,6 +275,7 @@ def onnx_to_hls(config): print('Interpreting Model ...') import onnx + onnx_model = onnx.load(config['OnnxModel']) if isinstance(config['OnnxModel'], str) else config['OnnxModel'] layer_list, input_layers, output_layers = parse_onnx_model(onnx_model) diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index fe633214f6..c9af2544bd 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -102,9 +102,10 @@ def write_project_cpp(self, model): 
project_name = model.config.get_project_name() filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.cpp')) as f, open( - f'{model.config.get_output_dir()}/src/firmware/{project_name}.cpp', 'w' - ) as fout: + with ( + open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.cpp')) as f, + open(f'{model.config.get_output_dir()}/src/firmware/{project_name}.cpp', 'w') as fout, + ): model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] @@ -207,9 +208,10 @@ def write_project_header(self, model): project_name = model.config.get_project_name() filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.h')) as f, open( - f'{model.config.get_output_dir()}/src/firmware/{project_name}.h', 'w' - ) as fout: + with ( + open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.h')) as f, + open(f'{model.config.get_output_dir()}/src/firmware/{project_name}.h', 'w') as fout, + ): model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() # model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] @@ -254,9 +256,10 @@ def write_defines(self, model): model (ModelGraph): the hls4ml model. """ filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/firmware/defines.h')) as f, open( - f'{model.config.get_output_dir()}/src/firmware/defines.h', 'w' - ) as fout: + with ( + open(os.path.join(filedir, '../templates/oneapi/firmware/defines.h')) as f, + open(f'{model.config.get_output_dir()}/src/firmware/defines.h', 'w') as fout, + ): for line in f.readlines(): # Insert numbers if '// hls-fpga-machine-learning insert numbers' in line: @@ -298,9 +301,10 @@ def write_parameters(self, model): model (ModelGraph): the hls4ml model. 
""" filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/firmware/parameters.h')) as f, open( - f'{model.config.get_output_dir()}/src/firmware/parameters.h', 'w' - ) as fout: + with ( + open(os.path.join(filedir, '../templates/oneapi/firmware/parameters.h')) as f, + open(f'{model.config.get_output_dir()}/src/firmware/parameters.h', 'w') as fout, + ): for line in f.readlines(): if '// hls-fpga-machine-learning insert includes' in line: newline = line @@ -376,9 +380,10 @@ def write_test_bench(self, model): output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat' ) - with open(os.path.join(filedir, '../templates/oneapi/myproject_test.cpp')) as f, open( - f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w' - ) as fout: + with ( + open(os.path.join(filedir, '../templates/oneapi/myproject_test.cpp')) as f, + open(f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w') as fout, + ): for line in f.readlines(): indent = ' ' * (len(line) - len(line.lstrip(' '))) @@ -434,9 +439,10 @@ def write_bridge(self, model): indent = ' ' filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/myproject_bridge.cpp')) as f, open( - f'{model.config.get_output_dir()}/src/{project_name}_bridge.cpp', 'w' - ) as fout: + with ( + open(os.path.join(filedir, '../templates/oneapi/myproject_bridge.cpp')) as f, + open(f'{model.config.get_output_dir()}/src/{project_name}_bridge.cpp', 'w') as fout, + ): for line in f.readlines(): if 'MYPROJECT' in line: newline = line.replace('MYPROJECT', format(project_name.upper())) @@ -511,9 +517,10 @@ def write_build_script(self, model): # Makefile filedir = os.path.dirname(os.path.abspath(__file__)) device = model.config.get_config_value('Part') - with open(os.path.join(filedir, '../templates/oneapi/CMakeLists.txt')) as f, open( - f'{model.config.get_output_dir()}/CMakeLists.txt', 'w' - ) as fout: + with ( + open(os.path.join(filedir, '../templates/oneapi/CMakeLists.txt')) as f, + open(f'{model.config.get_output_dir()}/CMakeLists.txt', 'w') as fout, + ): for line in f.readlines(): line = line.replace('myproject', model.config.get_project_name()) line = line.replace('mystamp', model.config.get_config_value('Stamp')) From 014c1dbc730a57241ca9eab9f402b2758edacd8d Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 15 Dec 2024 23:12:45 +0000 Subject: [PATCH 09/69] rm useless flake8 config in pyprject.toml --- pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b713b41d80..756e688d5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,10 +89,6 @@ skip-string-normalization = true profile = "black" line_length = 125 -[tool.flake8] -max-line-length = 125 -extend-ignore = [ "E203", "T201" ] - [tool.check-manifest] ignore = [ ".github/**", From d3c888145910629170bb2399403ce65e65b97ec0 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Mon, 16 Dec 2024 01:20:13 +0000 Subject: [PATCH 10/69] Add hint on import failure --- hls4ml/converters/__init__.py | 47 +++++++++-------------------- hls4ml/converters/onnx_to_hls.py | 4 +++ hls4ml/converters/pytorch_to_hls.py | 4 +++ hls4ml/model/__init__.py | 7 ----- hls4ml/model/quantizers.py | 4 +++ hls4ml/report/quartus_report.py | 4 +++ hls4ml/utils/config.py | 2 ++ pyproject.toml | 1 + 8 files changed, 34 insertions(+), 39 deletions(-) diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py index 1343907b54..693a76f666 
100644 --- a/hls4ml/converters/__init__.py +++ b/hls4ml/converters/__init__.py @@ -1,6 +1,5 @@ import importlib import os -import warnings import yaml @@ -10,33 +9,19 @@ from hls4ml.converters.keras_to_hls import get_supported_keras_layers # noqa: F401 from hls4ml.converters.keras_to_hls import parse_keras_model # noqa: F401 from hls4ml.converters.keras_to_hls import keras_to_hls, register_keras_layer_handler +from hls4ml.converters.onnx_to_hls import get_supported_onnx_layers # noqa: F401 from hls4ml.converters.onnx_to_hls import parse_onnx_model # noqa: F401 +from hls4ml.converters.onnx_to_hls import onnx_to_hls, register_onnx_layer_handler +from hls4ml.converters.pytorch_to_hls import ( # noqa: F401 + get_supported_pytorch_layers, + pytorch_to_hls, + register_pytorch_layer_handler, +) from hls4ml.model import ModelGraph from hls4ml.utils.config import create_config +from hls4ml.utils.dependency import requires from hls4ml.utils.symbolic_utils import LUTFunction -# ----------Make converters available if the libraries can be imported----------# -try: - from hls4ml.converters.pytorch_to_hls import ( # noqa: F401 - get_supported_pytorch_layers, - pytorch_to_hls, - register_pytorch_layer_handler, - ) - - __pytorch_enabled__ = True -except ImportError: - warnings.warn("WARNING: Pytorch converter is not enabled!", stacklevel=1) - __pytorch_enabled__ = False - -try: - from hls4ml.converters.onnx_to_hls import get_supported_onnx_layers # noqa: F401 - from hls4ml.converters.onnx_to_hls import onnx_to_hls, register_onnx_layer_handler - - __onnx_enabled__ = True -except ImportError: - warnings.warn("WARNING: ONNX converter is not enabled!", stacklevel=1) - __onnx_enabled__ = False - # ----------Layer handling register----------# model_types = ['keras', 'pytorch', 'onnx'] @@ -51,7 +36,7 @@ # and has 'handles' attribute # and is defined in this module (i.e., not imported) if callable(func) and hasattr(func, 'handles') and func.__module__ == lib.__name__: - for layer in func.handles: + for layer in func.handles: # type: ignore if model_type == 'keras': register_keras_layer_handler(layer, func) elif model_type == 'pytorch': @@ -124,15 +109,9 @@ def convert_from_config(config): model = None if 'OnnxModel' in yamlConfig: - if __onnx_enabled__: - model = onnx_to_hls(yamlConfig) - else: - raise Exception("ONNX not found. Please install ONNX.") + model = onnx_to_hls(yamlConfig) elif 'PytorchModel' in yamlConfig: - if __pytorch_enabled__: - model = pytorch_to_hls(yamlConfig) - else: - raise Exception("PyTorch not found. 
Please install PyTorch.") + model = pytorch_to_hls(yamlConfig) else: model = keras_to_hls(yamlConfig) @@ -174,6 +153,7 @@ def _check_model_config(model_config): return model_config +@requires('_keras') def convert_from_keras_model( model, output_dir='my-hls-test', @@ -237,6 +217,7 @@ def convert_from_keras_model( return keras_to_hls(config) +@requires('_torch') def convert_from_pytorch_model( model, output_dir='my-hls-test', @@ -308,6 +289,7 @@ def convert_from_pytorch_model( return pytorch_to_hls(config) +@requires('onnx') def convert_from_onnx_model( model, output_dir='my-hls-test', @@ -371,6 +353,7 @@ def convert_from_onnx_model( return onnx_to_hls(config) +@requires('sr') def convert_from_symbolic_expression( expr, n_symbols=None, diff --git a/hls4ml/converters/onnx_to_hls.py b/hls4ml/converters/onnx_to_hls.py index f3b6acbaf3..d51701e726 100644 --- a/hls4ml/converters/onnx_to_hls.py +++ b/hls4ml/converters/onnx_to_hls.py @@ -1,4 +1,5 @@ from hls4ml.model import ModelGraph +from hls4ml.utils.dependency import requires # ----------------------Helpers--------------------- @@ -17,6 +18,7 @@ def replace_char_inconsitency(name): return name.replace('.', '_') +@requires('onnx') def get_onnx_attribute(operation, name, default=None): from onnx import helper @@ -73,6 +75,7 @@ def get_input_shape(graph, node): return rv +@requires('onnx') def get_constant_value(graph, constant_name): tensor = next((x for x in graph.initializer if x.name == constant_name), None) from onnx import numpy_helper @@ -258,6 +261,7 @@ def parse_onnx_model(onnx_model): return layer_list, input_layers, output_layers +@requires('onnx') def onnx_to_hls(config): """Convert onnx model to hls model from configuration. diff --git a/hls4ml/converters/pytorch_to_hls.py b/hls4ml/converters/pytorch_to_hls.py index 3ec5b17691..f279a1970a 100644 --- a/hls4ml/converters/pytorch_to_hls.py +++ b/hls4ml/converters/pytorch_to_hls.py @@ -1,4 +1,5 @@ from hls4ml.model import ModelGraph +from hls4ml.utils.dependency import requires class PyTorchModelReader: @@ -22,6 +23,7 @@ def get_weights_data(self, layer_name, var_name): return data +@requires('_torch') class PyTorchFileReader(PyTorchModelReader): # Inherit get_weights_data method def __init__(self, config): import torch @@ -103,6 +105,7 @@ def decorator(function): # ---------------------------------------------------------------- +@requires('_torch') def parse_pytorch_model(config, verbose=True): """Convert PyTorch model to hls4ml ModelGraph. @@ -368,6 +371,7 @@ def parse_pytorch_model(config, verbose=True): return layer_list, input_layers +@requires('_torch') def pytorch_to_hls(config): layer_list, input_layers = parse_pytorch_model(config) print('Creating HLS model') diff --git a/hls4ml/model/__init__.py b/hls4ml/model/__init__.py index fc504392b6..4ca72e3cd6 100644 --- a/hls4ml/model/__init__.py +++ b/hls4ml/model/__init__.py @@ -1,8 +1 @@ from hls4ml.model.graph import HLSConfig, ModelGraph # noqa: F401 - -try: - from hls4ml.model import profiling # noqa: F401 - - __profiling_enabled__ = True -except ImportError: - __profiling_enabled__ = False diff --git a/hls4ml/model/quantizers.py b/hls4ml/model/quantizers.py index b445c70af3..eb313fc4ea 100644 --- a/hls4ml/model/quantizers.py +++ b/hls4ml/model/quantizers.py @@ -14,6 +14,7 @@ SaturationMode, XnorPrecisionType, ) +from hls4ml.utils.dependency import requires class Quantizer: @@ -84,6 +85,7 @@ class QKerasQuantizer(Quantizer): config (dict): Config of the QKeras quantizer to wrap. 
""" + @requires('qkeras') def __init__(self, config): from qkeras.quantizers import get_quantizer @@ -131,6 +133,7 @@ class QKerasBinaryQuantizer(Quantizer): config (dict): Config of the QKeras quantizer to wrap. """ + @requires('qkeras') def __init__(self, config, xnor=False): from qkeras.quantizers import get_quantizer @@ -155,6 +158,7 @@ class QKerasPO2Quantizer(Quantizer): config (dict): Config of the QKeras quantizer to wrap. """ + @requires('qkeras') def __init__(self, config): from qkeras.quantizers import get_quantizer diff --git a/hls4ml/report/quartus_report.py b/hls4ml/report/quartus_report.py index 47fc43c132..677a931402 100644 --- a/hls4ml/report/quartus_report.py +++ b/hls4ml/report/quartus_report.py @@ -2,6 +2,8 @@ import webbrowser from ast import literal_eval +from hls4ml.utils.dependency import requires + def parse_quartus_report(hls_dir, write_to_file=True): ''' @@ -39,6 +41,7 @@ def parse_quartus_report(hls_dir, write_to_file=True): return results +@requires('quantus-report') def read_quartus_report(hls_dir, open_browser=False): ''' Parse and print the Quartus report to print the report. Optionally open a browser. @@ -89,6 +92,7 @@ def _find_project_dir(hls_dir): return top_func_name + '-fpga.prj' +@requires('quantus-report') def read_js_object(js_script): ''' Reads the JavaScript file and return a dictionary of variables definded in the script. diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index 6a356f5f27..8c8ff3a069 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -1,6 +1,7 @@ import json import hls4ml +from hls4ml.utils.dependency import requires def create_config(output_dir='my-hls-test', project_name='myproject', backend='Vivado', version='1.0.0', **kwargs): @@ -44,6 +45,7 @@ def create_config(output_dir='my-hls-test', project_name='myproject', backend='V return config +@requires('qkeras') def _get_precision_from_quantizer(quantizer): if isinstance(quantizer, str): import qkeras diff --git a/pyproject.toml b/pyproject.toml index 756e688d5c..24175c9612 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ optional-dependencies.doc = [ "sphinx-rtd-theme", ] optional-dependencies.HGQ = [ "hgq~=0.2.0" ] +optional-dependencies.onnx = [ "onnx>=1.4" ] optional-dependencies.optimization = [ "keras-tuner==1.1.3", "ortools==9.4.1874", From 738e5b01ee8b7c8441870d467008d5e011ab14c7 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Mon, 16 Dec 2024 01:32:12 +0000 Subject: [PATCH 11/69] leftover --- hls4ml/utils/dependency.py | 55 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 hls4ml/utils/dependency.py diff --git a/hls4ml/utils/dependency.py b/hls4ml/utils/dependency.py new file mode 100644 index 0000000000..e546dcb8c9 --- /dev/null +++ b/hls4ml/utils/dependency.py @@ -0,0 +1,55 @@ +import sys +from functools import wraps +from importlib.metadata import metadata +from inspect import ismethod + +extra_requires: dict[str, list[str]] = {} +subpackage = None +for k, v in metadata('hls4ml')._headers: # type: ignore + if k != 'Requires-Dist': + continue + if '; extra == ' not in v: + continue + + req, pkg = v.split('; extra == ') + pkg = pkg.strip('"') + + extra_requires.setdefault(pkg, []).append(req) + + +def requires(pkg: str): + """Mark a function or method as requiring a package to be installed. + 'name': requires hls4ml[name] to be installed. + '_name': requires name to be installed. + + Parameters + ---------- + pkg : str + The package to require. 
+ """ + + def deco(f): + if ismethod(f): + qualifier = f"Method {f.__self__.__class__.__name__}.{f.__name__}" + else: + qualifier = f"Function {f.__name__}" + + if not pkg.startswith("_"): + reqs = ", ".join(extra_requires[pkg]) + msg = f"{qualifier} requires {reqs}, but package {{ename}} is missing" + "Please consider install it with `pip install hls4ml[{pkg}]` for full functionality with {pkg}." + else: + msg = f"{qualifier} requires {pkg[1:]}, but package {{ename}} is missing." + "Consider install it with `pip install {pkg}`." + + @wraps(f) + def inner(*args, **kwargs): + try: + return f(*args, **kwargs) + except ImportError as e: + print(msg.format(ename=e.name), file=sys.stderr) + raise e + + return inner + + return deco From bc7778bd13cdd6fc0d7ceb1e00be9bdfc195bcf3 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Mon, 16 Dec 2024 01:32:46 +0000 Subject: [PATCH 12/69] rm setup.py from manifest --- MANIFEST.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 7bcfbfaf6d..708e40c86b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ -include LICENSE README.md CONTRIBUTING.md CITATION.cff pyproject.toml setup.py .clang-format +include LICENSE README.md CONTRIBUTING.md CITATION.cff pyproject.toml .clang-format graft example-models graft test graft contrib From b76b5cb99e4928ba5f8791f406f5fc89276f5378 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Mon, 16 Dec 2024 01:48:01 +0000 Subject: [PATCH 13/69] manifest fix 2 --- MANIFEST.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 708e40c86b..5bec5fe2a6 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,5 +3,6 @@ graft example-models graft test graft contrib recursive-include hls4ml/templates * -global-exclude .git .gitmodules .gitlab-ci.yml +recursive-include hls4ml *.py +global-exclude .git .gitmodules .gitlab-ci.yml *.pyc include hls4ml/backends/vivado_accelerator/supported_boards.json From b7f60f5ae2f895acfe69e283850bc2be4b31db59 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Thu, 7 Nov 2024 05:50:56 +0000 Subject: [PATCH 14/69] keras v3 object based parser --- hls4ml/converters/keras_to_hls.py | 28 ++- hls4ml/converters/keras_v3/__init__.py | 4 + hls4ml/converters/keras_v3/_base.py | 144 +++++++++++++ hls4ml/converters/keras_v3/core.py | 91 +++++++++ hls4ml/converters/keras_v3_to_hls.py | 270 +++++++++++++++++++++++++ 5 files changed, 532 insertions(+), 5 deletions(-) create mode 100644 hls4ml/converters/keras_v3/__init__.py create mode 100644 hls4ml/converters/keras_v3/_base.py create mode 100644 hls4ml/converters/keras_v3/core.py create mode 100644 hls4ml/converters/keras_v3_to_hls.py diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index 9fc63cf398..a206da4da7 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -1,9 +1,12 @@ import json +from warnings import warn import h5py from hls4ml.model import ModelGraph +from .keras_v3_to_hls import parse_keras_v3_model + MAXMULT = 4096 @@ -228,8 +231,8 @@ def parse_keras_model(model_arch, reader): layer_config = model_arch['config'] if 'layers' in layer_config: # Newer Keras versions have 'layers' in 'config' key layer_config = layer_config['layers'] - # Sequential doesn't have InputLayer in TF < 2.3 (Keras 2.4.0) if layer_config[0]['class_name'] != 'InputLayer': + warn(DeprecationWarning('keras < 2.4.0 (tf 2.3) is deprecated. 
Please use a newer version.')) input_layer = {} input_layer['name'] = 'input1' input_layer['class_name'] = 'InputLayer' @@ -241,25 +244,33 @@ def parse_keras_model(model_arch, reader): layer_config = model_arch['config']['layers'] input_layers = [inp[0] for inp in model_arch['config']['input_layers']] output_layers = [out[0] for out in model_arch['config']['output_layers']] + else: + raise Exception(f'ERROR: Model class not supported: {model_arch["class_name"]}') # Get input shape and check for unsupported layer type for keras_layer in layer_config: if keras_layer['class_name'] not in supported_layers: - raise Exception('ERROR: Unsupported layer type: {}'.format(keras_layer['class_name'])) + raise Exception(f'ERROR: Unsupported layer type: {keras_layer["class_name"]}') output_shapes = {} output_shape = None print('Topology:') for keras_layer in layer_config: - if 'batch_input_shape' in keras_layer['config']: + if 'batch_input_shape' in keras_layer['config'] or 'batch_shape' in keras_layer['config']: if 'inbound_nodes' in keras_layer and len(keras_layer['inbound_nodes']) > 0: input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] else: - input_shapes = [keras_layer['config']['batch_input_shape']] + _input_shapes = keras_layer['config'].get('batch_input_shape', None) + input_shapes = _input_shapes or keras_layer['config']['batch_shape'] else: if 'inbound_nodes' in keras_layer: - input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] + if 'args' in keras_layer['inbound_nodes'][0]: + # keras v3 + input_shapes = [arg['config']['shape'] for arg in keras_layer['inbound_nodes'][0]['args']] + else: + # keras v2 + input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] else: # Sequential model, so output_shape from the previous layer is still valid input_shapes = [output_shape] @@ -323,6 +334,13 @@ def parse_keras_model(model_arch, reader): def keras_to_hls(config): + if 'KerasModel' in config: + import keras + + if keras.__version__ >= '3.0': + layer_list, input_layers, output_layers, _ = parse_keras_v3_model(config['KerasModel']) + return ModelGraph(config, layer_list, input_layers, output_layers) + model_arch, reader = get_model_arch(config) layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader) print('Creating HLS model') diff --git a/hls4ml/converters/keras_v3/__init__.py b/hls4ml/converters/keras_v3/__init__.py new file mode 100644 index 0000000000..d064a39cbd --- /dev/null +++ b/hls4ml/converters/keras_v3/__init__.py @@ -0,0 +1,4 @@ +from . import core # noqa: F401 +from ._base import registry as layer_handlers + +__all__ = ['layer_handlers'] diff --git a/hls4ml/converters/keras_v3/_base.py b/hls4ml/converters/keras_v3/_base.py new file mode 100644 index 0000000000..e68db860bc --- /dev/null +++ b/hls4ml/converters/keras_v3/_base.py @@ -0,0 +1,144 @@ +import typing +from typing import Any, Callable, Sequence + +if typing.TYPE_CHECKING: + import keras + from keras.api import KerasTensor + +T_kv3_handler = Callable[ + ['keras.Layer', Sequence['keras.KerasTensor'], Sequence['keras.KerasTensor']], tuple[dict[str, Any], ...] +] + +registry: dict[str, T_kv3_handler] = {} + + +def register(cls: str | type): + """Decorator to register a handler for a specific layer class. Suggested to decorate the `KerasV3LayerHandler` class. + + Parameters + ---------- + cls : str|type + If str, the key to register the handler under. 
If type, the class to register the handler for. + + Examples + -------- + ```python + @keras_dispatcher.register + class MyLayerHandler(KerasV3LayerHandler): + handles = ('my_package.src.submodule.MyLayer', 'MyLayer2') + + def handle(self, layer, inp_tensors, out_tensors): + # handler code + + + @keras_dispatcher.register('MyLayer3') + def my_layer_handler(layer, inp_tensors, out_tensors): + # handler code + ``` + """ + + def deco(func: T_kv3_handler): + if isinstance(cls, str): + registry[cls] = func + for k in getattr(func, 'handles', ()): + registry[k] = func + return func + + if isinstance(cls, type): + return deco(cls()) + return deco + + +def maybe_add_attrs(config: dict[str, Any], obj: Any, *attrs: str): + for attr in attrs: + if attr not in config and hasattr(obj, attr): + config[attr] = getattr(obj, attr) + + +class KerasV3LayerHandler: + """Base class for keras v3 layer handlers. Subclass this class to create a handler for a specific layer type.""" + + handles = () + + def __call__( + self, + layer: 'keras.Layer', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + """Handle a keras layer. Return a tuple of dictionaries, each + dictionary representing a layer (module) in the HLS model. One + layer may correspond one or more dictionaries (e.g., layers with + activation functions will be split into two layers). + + Some common attributes are automatically added to the dictionary + if the handler returns a single dictionary. If the handler + returns multiple dictionaries, the attributes must be added + manually. Anything returned by the handler will override the + automatic attributes. + + Automatic attributes: - name - class_name - module - + input_keras_tensor_names - input_shape - + output_keras_tensor_names + + If the layer has an activation function, an additional + dictionary will be added to the return value representing the + activation function. + + + Parameters + ---------- + layer : keras.Layer + The layer to be converted to HLS configuration(s). + in_tensors : Sequence[KerasTensor] + The list of input tensors to the layer. + out_tensors : Sequence[KerasTensor] + The list of output tensors from the layer. + + Returns + ------- + dict[str, Any] | tuple[dict[str, Any], ...] 
+ layer configuration(s) for the HLS model to be consumed by + the ModelGraph constructor + """ # noqa: E501 + import keras + + config0 = self.handle(layer, in_tensors, out_tensors) + if isinstance(config0, tuple): + return config0 + + name = layer.name + class_name = layer.__class__.__name__ + module = layer.__module__ + config1 = { + 'name': name, + 'class_name': class_name, + 'module': module, + 'input_keras_tensor_names': [t.name for t in in_tensors], + 'input_shape': [list(t.shape[1:]) for t in in_tensors], + 'output_keras_tensor_names': [t.name for t in out_tensors], + } + + maybe_add_attrs(config1, layer, 'epsilon', 'use_bias', 'data_format') + + config1.update(config0) + ret = (config1,) + + activation = getattr(layer, 'activation', None) + if activation not in (keras.activations.linear, None): + act_cls_name = activation.__class__.__name__ + act_config = { + 'class_name': 'Activation', + 'activation': act_cls_name, + 'name': f'{name}_{act_cls_name}', + } + ret = *ret, act_config + return ret + + def handle( + self, + layer: 'keras.Layer', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ) -> dict[str, Any] | tuple[dict[str, Any], ...]: + return {} diff --git a/hls4ml/converters/keras_v3/core.py b/hls4ml/converters/keras_v3/core.py new file mode 100644 index 0000000000..e6f1caa881 --- /dev/null +++ b/hls4ml/converters/keras_v3/core.py @@ -0,0 +1,91 @@ +import typing +from typing import Any, Sequence + +import numpy as np + +from ._base import KerasV3LayerHandler, register + +if typing.TYPE_CHECKING: + import keras + from keras.api import KerasTensor + from keras.src.layers.merging.base_merge import Merge + + +@register +class KV3DenseHandler(KerasV3LayerHandler): + handles = ('keras.src.layers.core.dense.Dense',) + + def handle( + self, + layer: 'keras.layers.Dense', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + kernel = np.array(layer.kernel) + assert layer._build_shapes_dict is not None, f"Layer {layer.name} is not built" + # inp_shape = layer._build_shapes_dict['input_shape'][1:] + config = { + 'data_format': 'channels_last', + 'weight_data': kernel, + 'bias_data': np.array(layer.bias) if layer.use_bias else None, + 'n_out': kernel.shape[1], + 'n_in': kernel.shape[0], + } + return config + + +@register +class KV3InputHandler(KerasV3LayerHandler): + handles = ('keras.src.layers.core.input_layer.InputLayer',) + + def handle( + self, + layer: 'keras.layers.InputLayer', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + config = {'input_shape': list(layer._batch_shape[1:])} + return config + + +@register +class KV3MergeHandler(KerasV3LayerHandler): + handles = ( + 'keras.src.layers.merging.add.Add', + 'keras.src.layers.merging.multiply.Multiply', + 'keras.src.layers.merging.average.Average', + 'keras.src.layers.merging.maximum.Maximum', + 'keras.src.layers.merging.minimum.Minimum', + 'keras.src.layers.merging.concatenate.Concatenate', + 'keras.src.layers.merging.subtract.Subtract', + 'keras.src.layers.merging.dot.Dot', + ) + + def handle( + self, + layer: 'Merge', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + assert len(out_tensors) == 1, f"Merge layer {layer.name} has more than one output" + output_shape = list(out_tensors[0].shape[1:]) + + config: dict[str, Any] = { + 'output_shape': output_shape, + 'op': layer.__class__.__name__.lower(), + } + + match layer.__class__.__name__: + case 'Concatenate': + rank = len(output_shape) + 
class_name = f'Concatenate{rank}d' + config['axis'] = layer.axis + case 'Dot': + class_name = f'Dot{len(output_shape)}d' + rank = len(output_shape) + assert rank == 1, f"Dot product only supported for 1D tensors, got {rank}D on layer {layer.name}" + case _: + class_name = 'Merge' + + config['class_name'] = class_name + return config diff --git a/hls4ml/converters/keras_v3_to_hls.py b/hls4ml/converters/keras_v3_to_hls.py new file mode 100644 index 0000000000..cf5b9c5d25 --- /dev/null +++ b/hls4ml/converters/keras_v3_to_hls.py @@ -0,0 +1,270 @@ +import typing +from itertools import chain +from typing import Any, Callable, Sequence + +if typing.TYPE_CHECKING: + import keras + from keras.api import KerasTensor + +import numpy as np + +from .keras_v3 import layer_handlers as v3_layer_handlers + +T_kv3_handler = Callable[ + ['keras.Layer', Sequence['keras.KerasTensor'], Sequence['keras.KerasTensor']], tuple[dict[str, Any], ...] +] + + +def get_io_tensors(layer: 'keras.Layer', node_whitelist: set[int] | None = None): + """Given a keras layer, return a list of tuples of input and output + tensors. If the layer is called only once (i.e., no shared layers), + the list will contain only one tuple. + + The layer must have been built before calling this function. + + Parameters + ---------- + layer : keras.Layer + The layer to get input and output tensors from. + node_whitelist : set[int]|None, optional + If not None, only return tensors from nodes with ids in this + set, used to filter out nodes that are not part of the model, by + default None + + + Returns + ------- + list[tuple[tuple['KerasTensor', ...], tuple['KerasTensor', ...]]] + A list of tuples of input and output tensors. + """ + in_nodes = layer._inbound_nodes + if node_whitelist is not None: + in_nodes = [node for node in in_nodes if id(node) in node_whitelist] + + ret: list[tuple[tuple['KerasTensor', ...], tuple['KerasTensor', ...]]] = [] + for node in in_nodes: + in_tensors = tuple(node.arguments.keras_tensors) + out_tensors = tuple(node.outputs) + ret.append((in_tensors, out_tensors)) + return ret + + +def resolve_dependency_relation(model: 'keras.Model'): + """Given a keras model, return the following information: + - A list of input tensor names + - A list of output tensor names + - A list of (layer_name, input_tensor_names, output_tensor_names) tuples + - A dictionary of tensor_name -> KerasTensor + + Parameters + ---------- + model : keras.Model + The keras model to analyze. 
+ + Returns + ------- + tuple[tuple[str, ...], tuple[str, ...], list[tuple[str, tuple[str, ...], tuple[str, ...]]], dict[str, KerasTensor]] + inp_tensor_names, out_tensor_names, layer_io, tensors + """ + tensors: dict[str, 'KerasTensor'] = {} + "tensor_name -> KerasTensor" + depends_on: dict[str, tuple[str, ...]] = {} + "tensor_name -> {tensor_name}" + layer_io: list[tuple[str, tuple[str, ...], tuple[str, ...]]] = [] + "layer_name -> ((input_tensor_names), (output_tensor_names))" + + inputs = tuple(t.name for t in model.inputs) + outputs = tuple(t.name for t in model.outputs) + node_whitelist = {id(node) for v in model._nodes_by_depth.values() for node in v} + + for layer in model.layers: + for in_tensors, out_tensors in get_io_tensors(layer, node_whitelist): + in_tensor_names = tuple(t.name for t in in_tensors) + out_tensor_names = tuple(t.name for t in out_tensors) + for t in chain(in_tensors, out_tensors): + tensors[t.name] = t + for o_name in out_tensor_names: + depends_on[o_name] = in_tensor_names + layer_io.append((layer.name, in_tensor_names, out_tensor_names)) + + return inputs, outputs, layer_io, tensors + + +class UniqueName: + """Helper class to generate unique names for layers, if one being used multiple times.""" + + def __init__(self): + self.used_names: set[str] = set() + + def next_name(self, name: str): + i = 0 + if name in self.used_names: + while f'{name}_{i}' in self.used_names: + i += 1 + name = f'{name}_{i}' + self.used_names.add(name) + return name + + def __call__(self, name: str): + return self.next_name(name) + + def reset(self): + self.used_names.clear() + + +class KerasV3HandlerDispatcher: + """Dispatcher class to handle different types of keras v3 layers.""" + + def __init__(self, layer_handlers: dict[str, T_kv3_handler], v2_layer_handlers=None): + self.registry = layer_handlers + self.v2_layer_handlers = v2_layer_handlers or {} + + def __call__( + self, layer: 'keras.Layer', in_tensors: Sequence['keras.KerasTensor'], out_tensors: Sequence['keras.KerasTensor'] + ) -> tuple[dict[str, Any], ...]: + assert layer.built, f"Layer {layer.name} is not built" + + ret = self.v3_call(layer, in_tensors, out_tensors) + if ret is not None: + return ret + ret = self.v2_call(layer, in_tensors, out_tensors) + if ret is not None: + return ret + + raise ValueError( + f"Layer {layer.__class__.__module__}.{layer.__class__.__name__} not found in either v3 or v2 handlers" + ) + + def v3_call( + self, layer: 'keras.layers.Layer', inp_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'] + ): + cls_name = layer.__class__.__name__ + module = layer.__module__ + key = f"{module}.{cls_name}" + + # keras v3 handlers + handler = self.registry.get(key, None) + handler = handler or self.registry.get(cls_name, None) + + if handler is None: + return None + return handler(layer, inp_tensors, out_tensors) + + def v2_call( + self, layer: 'keras.layers.Layer', inp_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'] + ): + # keras v2 handlers fallback + print("v2 handler") + config = layer.get_config() + layer_dict = {'config': config, 'class_name': layer.__class__.__name__} + + class DummyReader: + def get_weights_data(self, layer_name, var_name): + assert layer_name == layer.name, f"Processing {layer.name}, but handler tried to read {layer_name}" + for w in layer.weights: + if var_name in w.name: + return np.array(w) + raise ValueError(f"Variable {var_name} not found in layer {layer.name}") + + reader = DummyReader() + input_shapes = [list(t.shape) for t in 
inp_tensors] + input_names = [t.name for t in inp_tensors] + output_names = [t.name for t in out_tensors] + key = layer.__class__.__name__ + handler = self.v2_layer_handlers.get(key, None) + if handler is None: + return None + + ret, _ = handler(layer_dict, input_names, input_shapes, reader) + ret['outputs'] = output_names + ret = (ret,) + + activation = getattr(layer, 'activation', None) + if activation not in (keras.activations.linear, None): + act_cls_name = activation.__class__.__name__ + act_config = { + 'class_name': 'Activation', + 'activation': act_cls_name, + 'name': f'{layer.name}_{act_cls_name}', + } + ret = *ret, act_config + return ret + + +def parse_keras_v3_model(model: 'keras.Model'): + """Parse a keras model into a list of dictionaries, each + representing a layer in the HLS model, and a list of input and + output layer names. + + Parameters + ---------- + model : keras.Model + + Returns + ------- + tuple[list[dict[str, Any]], list[str], list[str], list[list[int]]] + layer_list, input_layer_names, output_layer_names, + batch_output_shapes + + Raises + ------ + ValueError + If a circular dependency is detected. + """ + + from .keras_to_hls import layer_handlers as v2_layer_handlers # Delayed import to avoid circular import + + keras_v3_dispatcher = KerasV3HandlerDispatcher(v3_layer_handlers, v2_layer_handlers) + + model_inputs, model_outputs, dependency, tensors = resolve_dependency_relation(model) + + satisfied = set() + total = len(tensors) + + unique_name = UniqueName() + + layer_list: list[dict[str, Any]] = [] + while len(satisfied) < total: + # Until all tensors in the model are satisfied + for i, (layer_name, in_tensor_names, out_tensor_names) in enumerate(dependency): + if not all(t in satisfied for t in in_tensor_names): + continue # Skip layer if some inputs are not ready + if all(t in satisfied for t in out_tensor_names): + continue # Skip layer if the outputs are already satisfied + + layer: 'keras.Layer' = model.get_layer(layer_name) + inp_tensors = [tensors[t] for t in in_tensor_names] + out_tensors = [tensors[t] for t in out_tensor_names] + + _configs = keras_v3_dispatcher(layer, inp_tensors, out_tensors) + # Dispatch to v3 handler if available, else fallback to v2 + # handler + + # Prevent name conflicts. 
If a layer is used multiple times, + # add a suffix to the name At this stage, connections + # between modules are recorded by i/o keras tensor names + # (guaranteed unique), thus we can safely rename the layers + for _conf in _configs: + _conf['name'] = unique_name(_conf['name']) + + layer_list.extend(_configs) # Add the layer to the list + satisfied.update(out_tensor_names) # Mark the outputs as satisfied + dependency.pop(i) + break # Restart the loop to add another layer + else: + # If no layer was added in the loop, then there is a circular dependency + raise ValueError("Circular dependency detected") + + # Mark inputs[inp layer name] for ModelGraph to parse from i/o keras tensor names + provides: dict[str, str] = {} # tensor_name -> src_layer_name + for conf in layer_list: + for out_name in conf['output_keras_tensor_names']: + provides[out_name] = conf['name'] + inputs = [provides[tname] for tname in conf['input_keras_tensor_names']] + conf['inputs'] = inputs + + input_layer_names = [provides[tname] for tname in model_inputs] + output_layer_names = [provides[tname] for tname in model_outputs] + batch_output_shapes = [list(tensors[tname].shape) for tname in model_outputs] + + return layer_list, input_layer_names, output_layer_names, batch_output_shapes From a7206b433a031afc578be2cf41885423442a63cd Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 8 Nov 2024 02:56:59 +0000 Subject: [PATCH 15/69] sequential and i/o tensor name parsing fix --- hls4ml/converters/keras_v3/_base.py | 57 ++++++++++++++++++++++------ hls4ml/converters/keras_v3_to_hls.py | 36 ++++++++++++------ hls4ml/utils/config.py | 14 +++++-- 3 files changed, 80 insertions(+), 27 deletions(-) diff --git a/hls4ml/converters/keras_v3/_base.py b/hls4ml/converters/keras_v3/_base.py index e68db860bc..6f69473073 100644 --- a/hls4ml/converters/keras_v3/_base.py +++ b/hls4ml/converters/keras_v3/_base.py @@ -1,5 +1,19 @@ import typing -from typing import Any, Callable, Sequence +from types import FunctionType +from typing import Any, Callable, Sequence, TypedDict + + +class DefaultConfig(TypedDict, total=False): + name: str + class_name: str + module: str + input_keras_tensor_names: list[str] + input_shape: list[list[int]] + output_keras_tensor_names: list[str] + epsilon: float + use_bias: bool + data_format: str + if typing.TYPE_CHECKING: import keras @@ -49,7 +63,7 @@ def deco(func: T_kv3_handler): return deco -def maybe_add_attrs(config: dict[str, Any], obj: Any, *attrs: str): +def maybe_add_attrs(config: dict[str, Any] | DefaultConfig, obj: Any, *attrs: str): for attr in attrs: if attr not in config and hasattr(obj, attr): config[attr] = getattr(obj, attr) @@ -103,36 +117,55 @@ def __call__( """ # noqa: E501 import keras - config0 = self.handle(layer, in_tensors, out_tensors) - if isinstance(config0, tuple): - return config0 - name = layer.name class_name = layer.__class__.__name__ module = layer.__module__ - config1 = { + + default_config: DefaultConfig = { 'name': name, 'class_name': class_name, 'module': module, 'input_keras_tensor_names': [t.name for t in in_tensors], - 'input_shape': [list(t.shape[1:]) for t in in_tensors], + 'input_shape': [list(t.shape[1:]) for t in in_tensors], # type: ignore 'output_keras_tensor_names': [t.name for t in out_tensors], } - maybe_add_attrs(config1, layer, 'epsilon', 'use_bias', 'data_format') + maybe_add_attrs(default_config, layer, 'epsilon', 'use_bias', 'data_format') - config1.update(config0) - ret = (config1,) + mandatory_keys = ['name', 'class_name', 'output_keras_tensor_names', 
'input_keras_tensor_names'] + self.default_config = default_config + config0 = self.handle(layer, in_tensors, out_tensors) + del self.default_config + + if isinstance(config0, tuple): + for conf in config0: + for key in mandatory_keys: + assert key in conf, f"Key {key} missing from layer {name} handled by {self.__class__.__name__}" + return config0 + + config = {} + config.update(default_config) + config.update(config0) + ret = (config,) + + # If activation exists, append it activation = getattr(layer, 'activation', None) if activation not in (keras.activations.linear, None): - act_cls_name = activation.__class__.__name__ + assert len(out_tensors) == 1, f"Layer {name} has more than one output, but has an activation function" + assert isinstance(activation, FunctionType), f"Activation function for layer {name} is not a function" + intermediate_tensor_name = f'{out_tensors[0].name}_activation' + ret[0]['output_keras_tensor_names'] = [intermediate_tensor_name] + act_cls_name = activation.__name__ act_config = { 'class_name': 'Activation', 'activation': act_cls_name, 'name': f'{name}_{act_cls_name}', + 'input_keras_tensor_names': [intermediate_tensor_name], + 'output_keras_tensor_names': [out_tensors[0].name], } ret = *ret, act_config + return ret def handle( diff --git a/hls4ml/converters/keras_v3_to_hls.py b/hls4ml/converters/keras_v3_to_hls.py index cf5b9c5d25..d602dcf5f3 100644 --- a/hls4ml/converters/keras_v3_to_hls.py +++ b/hls4ml/converters/keras_v3_to_hls.py @@ -1,5 +1,6 @@ import typing from itertools import chain +from types import FunctionType from typing import Any, Callable, Sequence if typing.TYPE_CHECKING: @@ -154,7 +155,10 @@ def v2_call( self, layer: 'keras.layers.Layer', inp_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'] ): # keras v2 handlers fallback - print("v2 handler") + print(f"v2 handler used for layer {layer.name}") + + import keras + config = layer.get_config() layer_dict = {'config': config, 'class_name': layer.__class__.__name__} @@ -176,16 +180,22 @@ def get_weights_data(self, layer_name, var_name): return None ret, _ = handler(layer_dict, input_names, input_shapes, reader) - ret['outputs'] = output_names + ret['output_keras_tensor_names'] = output_names + ret['input_keras_tensor_names'] = input_names ret = (ret,) activation = getattr(layer, 'activation', None) if activation not in (keras.activations.linear, None): - act_cls_name = activation.__class__.__name__ + assert isinstance(activation, FunctionType), f"Activation function for layer {layer.name} is not a function" + intermediate_tensor_name = f'{output_names[0]}_activation' + ret[0]['output_keras_tensor_names'] = (intermediate_tensor_name,) + act_cls_name = activation.__name__ act_config = { 'class_name': 'Activation', 'activation': act_cls_name, 'name': f'{layer.name}_{act_cls_name}', + 'input_keras_tensor_names': (intermediate_tensor_name,), + 'output_keras_tensor_names': output_names, } ret = *ret, act_config return ret @@ -212,6 +222,13 @@ def parse_keras_v3_model(model: 'keras.Model'): If a circular dependency is detected. 
""" + assert model.built, "Model must be built before parsing" + + import keras + + if isinstance(model, keras.Sequential): + model = model._functional # everything is functional under the hood lol + from .keras_to_hls import layer_handlers as v2_layer_handlers # Delayed import to avoid circular import keras_v3_dispatcher = KerasV3HandlerDispatcher(v3_layer_handlers, v2_layer_handlers) @@ -219,12 +236,12 @@ def parse_keras_v3_model(model: 'keras.Model'): model_inputs, model_outputs, dependency, tensors = resolve_dependency_relation(model) satisfied = set() - total = len(tensors) unique_name = UniqueName() layer_list: list[dict[str, Any]] = [] - while len(satisfied) < total: + + while any(t not in satisfied for t in model_outputs): # Until all tensors in the model are satisfied for i, (layer_name, in_tensor_names, out_tensor_names) in enumerate(dependency): if not all(t in satisfied for t in in_tensor_names): @@ -237,13 +254,10 @@ def parse_keras_v3_model(model: 'keras.Model'): out_tensors = [tensors[t] for t in out_tensor_names] _configs = keras_v3_dispatcher(layer, inp_tensors, out_tensors) - # Dispatch to v3 handler if available, else fallback to v2 - # handler + # Dispatch to v3 handler if available, else fallback to v2 handler - # Prevent name conflicts. If a layer is used multiple times, - # add a suffix to the name At this stage, connections - # between modules are recorded by i/o keras tensor names - # (guaranteed unique), thus we can safely rename the layers + # Prevent name conflicts. If a layer is used multiple times, add a suffix to the name. + # At this stage connections between modules are recorded by i/o keras tensor names for _conf in _configs: _conf['name'] = unique_name(_conf['name']) diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index 8c8ff3a069..f20aa49835 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -1,6 +1,7 @@ import json import hls4ml +import hls4ml.converters.keras_v3_to_hls from hls4ml.utils.dependency import requires @@ -159,12 +160,17 @@ def config_from_keras_model( if isinstance(model, dict): model_arch = model + reader = hls4ml.converters.KerasModelReader(model) + layer_list, _, _, _ = hls4ml.converters.parse_keras_model(model_arch, reader) else: - model_arch = json.loads(model.to_json()) + import keras - reader = hls4ml.converters.KerasModelReader(model) - - layer_list, _, _, _ = hls4ml.converters.parse_keras_model(model_arch, reader) + if keras.__version__ > '3.0': + layer_list, *_ = hls4ml.converters.parse_keras_v3_model(model) + else: + model_arch = json.loads(model.to_json()) + reader = hls4ml.converters.KerasModelReader(model) + layer_list, _, _, _ = hls4ml.converters.parse_keras_model(model_arch, reader) def make_layer_config(layer): cls_name = layer['class_name'] From 1605f96050350fca1592c763943dfd1445eaae64 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 8 Nov 2024 03:04:02 +0000 Subject: [PATCH 16/69] support activation layers --- hls4ml/converters/__init__.py | 3 + hls4ml/converters/keras_v3/core.py | 113 +++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py index 693a76f666..47569b1ad9 100644 --- a/hls4ml/converters/__init__.py +++ b/hls4ml/converters/__init__.py @@ -9,6 +9,7 @@ from hls4ml.converters.keras_to_hls import get_supported_keras_layers # noqa: F401 from hls4ml.converters.keras_to_hls import parse_keras_model # noqa: F401 from hls4ml.converters.keras_to_hls import keras_to_hls, register_keras_layer_handler +from 
hls4ml.converters.keras_v3_to_hls import parse_keras_v3_model # noqa: F401 from hls4ml.converters.onnx_to_hls import get_supported_onnx_layers # noqa: F401 from hls4ml.converters.onnx_to_hls import parse_onnx_model # noqa: F401 from hls4ml.converters.onnx_to_hls import onnx_to_hls, register_onnx_layer_handler @@ -17,6 +18,8 @@ pytorch_to_hls, register_pytorch_layer_handler, ) + +# from hls4ml.converters.pytorch_to_hls import parse_pytorch_model # noqa: F401 from hls4ml.model import ModelGraph from hls4ml.utils.config import create_config from hls4ml.utils.dependency import requires diff --git a/hls4ml/converters/keras_v3/core.py b/hls4ml/converters/keras_v3/core.py index e6f1caa881..ea63f97095 100644 --- a/hls4ml/converters/keras_v3/core.py +++ b/hls4ml/converters/keras_v3/core.py @@ -1,3 +1,4 @@ +import inspect import typing from typing import Any, Sequence @@ -89,3 +90,115 @@ def handle( config['class_name'] = class_name return config + + +@register +class KV3ActivationHandler(KerasV3LayerHandler): + handles = ('keras.src.layers.activations.activation.Activation',) + + def handle( + self, + layer: 'keras.layers.Activation', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + import keras + + config = {} + config.update(self.default_config) + + activation = getattr(layer, 'activation', keras.activations.linear) + match activation: + case keras.activations.softmax: + class_name = 'Softmax' + config['axis'] = -1 + case keras.activations.hard_sigmoid: + class_name = 'HardActivation' + case keras.activations.leaky_relu: + class_name = 'LeakyReLU' + signature = inspect.signature(keras.activations.leaky_relu) + config['activ_param'] = signature.parameters['negative_slope'].default + case keras.activations.elu: + class_name = 'ELU' + signature = inspect.signature(keras.activations.elu) + config['activ_param'] = signature.parameters['alpha'].default + case _: + class_name = 'Activation' + + config['activation'] = activation.__name__ + config['class_name'] = class_name + return (config,) + + +@register +class KV3ReLUHandler(KerasV3LayerHandler): + handles = ( + 'keras.src.layers.activations.leaky_relu.LeakyReLU', + 'keras.src.layers.activations.prelu.PReLU', + 'keras.src.layers.activations.relu.ReLU', + ) + + def handle( + self, + layer: 'keras.layers.ReLU', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + config = {} + config.update(self.default_config) + + if layer.__class__.__name__ == 'ReLU': + config['class_name'] = 'Activation' + config['activation'] = 'relu' + return config + + if layer.__class__.__name__ == 'PReLU': + config['class_name'] = 'PReLU' + config['param_data'] = np.array(layer.alpha) + config['activation'] = 'prelu' + else: + config['class_name'] = 'LeakyReLU' + config['activ_param'] = float(layer.negative_slope) + config['activation'] = 'leaky_relu' + + return (config,) + + +@register +class KV3SoftmaxHandler(KerasV3LayerHandler): + handles = ('keras.src.layers.activations.softmax.Softmax',) + + def handle( + self, + layer: 'keras.layers.Softmax', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + config = {} + config.update(self.default_config) + + config['class_name'] = 'Softmax' + config['axis'] = layer.axis + config['activation'] = 'softmax' + + return (config,) + + +@register +class KV3HardActivationHandler(KerasV3LayerHandler): + handles = ('keras.src.layers.activations.elu.ELU',) + + def handle( + self, + layer: 'keras.layers.ELU', + in_tensors: 
Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + config = {} + config.update(self.default_config) + + config['class_name'] = 'ELU' + config['activ_param'] = float(layer.alpha) + config['activation'] = 'elu' + + return (config,) From a8aa48967558aff62c0a074311aae27eece1bad8 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 8 Nov 2024 03:29:00 +0000 Subject: [PATCH 17/69] consistent v2 weight reader behavior --- hls4ml/converters/keras_v3_to_hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/converters/keras_v3_to_hls.py b/hls4ml/converters/keras_v3_to_hls.py index d602dcf5f3..5c0168cc1e 100644 --- a/hls4ml/converters/keras_v3_to_hls.py +++ b/hls4ml/converters/keras_v3_to_hls.py @@ -168,7 +168,7 @@ def get_weights_data(self, layer_name, var_name): for w in layer.weights: if var_name in w.name: return np.array(w) - raise ValueError(f"Variable {var_name} not found in layer {layer.name}") + return None reader = DummyReader() input_shapes = [list(t.shape) for t in inp_tensors] From eafe8b989e3208d35f7ffcc8af98972f1cf60cc6 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 8 Nov 2024 05:17:30 +0000 Subject: [PATCH 18/69] add v3 conv handlers --- hls4ml/converters/keras_v3/__init__.py | 1 + hls4ml/converters/keras_v3/conv.py | 122 +++++++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 hls4ml/converters/keras_v3/conv.py diff --git a/hls4ml/converters/keras_v3/__init__.py b/hls4ml/converters/keras_v3/__init__.py index d064a39cbd..f658faa1fb 100644 --- a/hls4ml/converters/keras_v3/__init__.py +++ b/hls4ml/converters/keras_v3/__init__.py @@ -1,3 +1,4 @@ +from . import conv # noqa: F401 from . import core # noqa: F401 from ._base import registry as layer_handlers diff --git a/hls4ml/converters/keras_v3/conv.py b/hls4ml/converters/keras_v3/conv.py new file mode 100644 index 0000000000..871bcb942d --- /dev/null +++ b/hls4ml/converters/keras_v3/conv.py @@ -0,0 +1,122 @@ +import typing +from math import ceil +from typing import Sequence + +import numpy as np + +from ._base import KerasV3LayerHandler, register + +if typing.TYPE_CHECKING: + import keras + from keras.api import KerasTensor + + +@register +class KV3ConvHandler(KerasV3LayerHandler): + handles = ( + 'keras.src.layers.convolutional.conv1d.Conv1D', + 'keras.src.layers.convolutional.conv2d.Conv2D', + 'keras.src.layers.convolutional.depthwise_conv1d.DepthwiseConv1D', + 'keras.src.layers.convolutional.depthwise_conv2d.DepthwiseConv2D', + 'keras.src.layers.convolutional.separable_conv1d.SeparableConv1D', + 'keras.src.layers.convolutional.separable_conv2d.SeparableConv2D', + ) + + def handle( + self, + layer: 'keras.layers.Conv1D|keras.layers.Conv2D|keras.layers.DepthwiseConv1D|keras.layers.DepthwiseConv2D', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + from keras.src.layers.convolutional.base_conv import BaseConv + from keras.src.layers.convolutional.base_depthwise_conv import BaseDepthwiseConv + from keras.src.layers.convolutional.base_separable_conv import BaseSeparableConv + + assert len(in_tensors) == 1, f"Layer {layer.name} has more than one input" + assert len(out_tensors) == 1, f"Layer {layer.name} has more than one output" + + in_shape: tuple[int, ...] = in_tensors[0].shape[1:] # type: ignore + out_shape: tuple[int, ...] 
= out_tensors[0].shape[1:] # type: ignore + assert all(isinstance(x, int) for x in in_shape), f"Layer {layer.name} has non-fixed size input: {in_shape}" + assert all(isinstance(x, int) for x in out_shape), f"Layer {layer.name} has non-fixed size output: {out_shape}" + + kernel = np.array(layer.kernel) + if layer.use_bias: + bias = np.array(layer.bias) + else: + bias = None + + ker_px_shape: tuple[int, ...] = layer.kernel_size + data_format = layer.data_format + + if data_format == 'channels_last': + *px_in_shape, ch_in = in_shape + *px_out_shape, ch_out = out_shape + else: + ch_in, *px_in_shape = in_shape + ch_out, *px_out_shape = out_shape + + if layer.padding == 'same': + n_padding = [ceil(N / n) * n - N for N, n in zip(px_in_shape, ker_px_shape)] + n_padding0 = [p // 2 for p in n_padding] + n_padding1 = [p - p0 for p, p0 in zip(n_padding, n_padding0)] + elif layer.padding == 'valid': + n_padding0 = [0] * len(px_in_shape) + n_padding1 = [0] * len(px_in_shape) + elif layer.padding == 'causal': + n_padding0 = [ker_px_shape[0] - 1] + [0] * (len(px_in_shape) - 1) + n_padding1 = [0] * len(px_in_shape) + else: + raise ValueError(f"Invalid padding mode {layer.padding} for layer {layer.name}") + + config = { + 'bias_data': bias, + 'data_format': data_format, + 'weight_data': kernel, + 'bias_data': bias, + 'n_filt': ch_out, + 'n_chan': ch_in, + } + + if layer.rank == 1: + config.update( + { + 'filt_width': ker_px_shape[0], + 'stride_width': layer.strides[0], + 'pad_left': n_padding0[0], + 'pad_right': n_padding1[0], + 'in_width': px_in_shape[0], + 'out_width': px_out_shape[0], + } + ) + elif layer.rank == 2: + config.update( + { + 'filt_height': ker_px_shape[0], + 'filt_width': ker_px_shape[1], + 'stride_height': layer.strides[0], + 'stride_width': layer.strides[1], + 'pad_top': n_padding0[0], + 'pad_bottom': n_padding1[0], + 'pad_left': n_padding0[1], + 'pad_right': n_padding1[1], + 'in_height': px_in_shape[0], + 'in_width': px_in_shape[1], + 'out_height': px_out_shape[0], + 'out_width': px_out_shape[1], + } + ) + else: + _cls = f"{layer.__class__.__module__}.{layer.__class__.__name__}" + raise ValueError(f"Only 1D and 2D conv layers are supported, got {_cls} (rank={layer.rank})") + if isinstance(layer, BaseDepthwiseConv): + config['depthwise_data'] = kernel + config['depth_multiplier'] = layer.depth_multiplier + elif isinstance(layer, BaseSeparableConv): + config['depthwise_data'] = kernel + config['pointwise_data'] = np.array(layer.pointwise_kernel) + config['depth_multiplier'] = layer.depth_multiplier + elif isinstance(layer, BaseConv): + config['weight_data'] = kernel + + return config From 6b8a44cdccef561eeca7c87ff9ce77427008faca Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 8 Nov 2024 14:01:18 +0000 Subject: [PATCH 19/69] add test --- test/pytest/test_keras_v3_api.py | 516 +++++++++++++++++++++++++++++++ 1 file changed, 516 insertions(+) create mode 100644 test/pytest/test_keras_v3_api.py diff --git a/test/pytest/test_keras_v3_api.py b/test/pytest/test_keras_v3_api.py new file mode 100644 index 0000000000..81ac5c240c --- /dev/null +++ b/test/pytest/test_keras_v3_api.py @@ -0,0 +1,516 @@ +import math +from pathlib import Path + +import keras +import numpy as np +import pytest + +if keras.__version__ < '3.0': + pytest.skip('Keras API tests are only for Keras 3.0 and above', allow_module_level=True) + +from keras.api.layers import ( + ELU, + Activation, + AveragePooling1D, + AveragePooling2D, + Conv1D, + Conv2D, + Dense, + DepthwiseConv1D, + DepthwiseConv2D, + LeakyReLU, + MaxPooling1D, 
+ MaxPooling2D, + PReLU, +) + +import hls4ml + +test_root_path = Path('/tmp/tests') + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'Catapult']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def test_dense(backend, io_type): + model = keras.Sequential( + [ + Dense( + 2, + input_shape=(1,), + name='Dense', + use_bias=True, + kernel_initializer=keras.initializers.RandomUniform(minval=1, maxval=10), # type: ignore + bias_initializer='zeros', + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + ), + Activation(activation='elu', name='Activation'), + ] + ) + model.compile(optimizer='adam', loss='mse') + + X_input = np.random.rand(1000, 1) + + keras_prediction = model.predict(X_input, verbose=0) # type: ignore + + config = hls4ml.utils.config_from_keras_model(model) + output_dir = str(test_root_path / f'hls4mlprj_keras_api_dense_{backend}_{io_type}') + + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + ) + + hls_model.compile() + + hls_prediction = hls_model.predict(X_input) + + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.02) + + assert len(model.layers) + 1 == len(hls_model.get_layers()) + assert list(hls_model.get_layers())[0].attributes['class_name'] == "InputLayer" + assert list(hls_model.get_layers())[1].attributes["class_name"] == model.layers[0].name + assert list(hls_model.get_layers())[2].attributes['class_name'] == 'ELU' + + +# TODO: add ThresholdedReLU test when it can be made to pass +# https://github.com/fastmachinelearning/hls4ml/issues/376 + + +@pytest.mark.parametrize( + "activation_function", + [ + Activation(activation='relu', name='relu'), + LeakyReLU(negative_slope=0.5), + ELU(alpha=1.0), + PReLU( + alpha_initializer="zeros", + ), + Activation(activation='sigmoid', name='sigmoid'), + ], +) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def test_activations(activation_function, backend, io_type): + model = keras.models.Sequential() + model.add(Dense(64, input_shape=(1,), name='Dense', kernel_initializer='lecun_uniform', kernel_regularizer=None)) + model.add(activation_function) + + model.compile(optimizer='adam', loss='mse') + + model.summary() + + X_input = np.random.rand(1000, 1) + keras_prediction = model.predict(X_input, verbose=0) # type: ignore + config = hls4ml.utils.config_from_keras_model(model) + output_dir = str(test_root_path / f'hls4mlprj_keras_api_activations_{activation_function.name}_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + ) + hls_model.compile() + hls_prediction = hls_model.predict(X_input) + + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.02) + + for layer in hls_model.get_layers(): + print(layer.attributes.attributes['class_name']) + assert len(model.layers) + 1 == len(hls_model.get_layers()) + + assert list(hls_model.get_layers())[2].attributes['class_name'] == activation_function.__class__.__name__ + + +padds_options = ['same', 'valid'] + + +@pytest.mark.parametrize('padds', padds_options) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'Catapult']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def 
test_conv1d(padds, backend, io_type): + model = keras.models.Sequential() + input_shape = (10, 128, 4) + model.add( + Conv1D( + filters=32, + kernel_size=3, + strides=2, + padding=padds, + activation='relu', + input_shape=input_shape[1:], + kernel_initializer='normal', + use_bias=False, + data_format='channels_last', + name='conv', + ) + ) + model.add(Activation(activation='relu')) + model.compile(optimizer='adam', loss='mse') + + X_input = np.random.rand(10, 128, 4) + keras_prediction = model.predict(X_input, verbose=0) # type: ignore + + config = hls4ml.utils.config_from_keras_model(model) + output_dir = str(test_root_path / f'hls4mlprj_keras_api_conv1d_{padds}_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + ) + hls_model.compile() + hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) # type: ignore + + # 5e-2 might be too high + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=5e-2) + + if backend in ('Vivado', 'Vitis', 'Catapult') and io_type == 'io_stream' and padds == 'same': + # Vivado/Vitis inserts and additional layer for 'same' padding in io_stream + return + + conv: keras.layers.Conv1D = model.layers[0] + ker_w, ch_in, ch_out = conv.kernel.shape + inp_shape = model.inputs[0].shape[1:] + out_shape = model.outputs[0].shape[1:] + hls_attr = hls_model.graph['conv'].attributes + _stride = conv.strides[0] + + assert len(model.layers) + 2 == len(hls_model.get_layers()) + + assert hls_attr['name'] == model.layers[0].name + assert hls_attr['class_name'] == 'Conv1D' + assert hls_attr["in_width"] == inp_shape[0] + assert hls_attr['filt_width'] == ker_w + assert hls_attr['n_chan'] == ch_in + assert hls_attr['n_filt'] == ch_out + assert hls_attr['stride_width'] == _stride + assert hls_attr['data_format'] == conv.data_format + assert hls_attr["out_width"] == out_shape[0] + + w_pad = math.ceil(inp_shape[0] / ker_w) * ker_w - inp_shape[0] + + pad_left = w_pad // 2 + pad_right = w_pad - pad_left + + if model.layers[0].padding == 'same': + assert hls_attr['pad_left'] == pad_left + assert hls_attr['pad_right'] == pad_right + elif model.layers[0].padding == 'valid': + assert hls_attr['pad_left'] == 0 + assert hls_attr['pad_right'] == 0 + + +chans_options = ['channels_last'] +padds_options = ['same', 'valid'] + + +@pytest.mark.parametrize('chans', chans_options) +@pytest.mark.parametrize('padds', padds_options) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'Catapult']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def test_conv2d(chans, padds, backend, io_type): + input_shape = (32, 32, 3) + model = keras.Sequential( + [ + keras.layers.InputLayer(input_shape), + Conv2D( + filters=32, + kernel_size=(2, 3), + strides=(4, 5), + padding=padds, + kernel_initializer='normal', + use_bias=False, + data_format=chans, + name='conv', + ), + ] + ) + model.compile(optimizer='adam', loss='mse') + + X_input = np.random.rand(1000, *input_shape) + keras_prediction = model.predict(X_input) + + config = hls4ml.utils.config_from_keras_model(model) + output_dir = str(test_root_path / f'hls4ml_project_keras_api_conv2d_{backend}_{chans}_{padds}_{io_type}') + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + ) + hls_model.compile() + hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) # type: 
ignore + + # A high tolerance, simply to verify correct functionality + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=5e-2) + + hls_conv_attr = hls_model.graph['conv'].attributes + + conv: keras.layers.Conv2D = model.get_layer('conv') + + kh, kw, ch_in, ch_out = conv.kernel.shape # type: ignore + _stride = conv.strides + inp_shape = model.inputs[0].shape[1:] + out_shape = model.outputs[0].shape[1:] + + if io_type == 'io_stream' and padds == 'same' and backend in ('Vivado', 'Vitis', 'Catapult'): + return + + assert len(model.layers) + 1 == len(hls_model.get_layers()) + assert hls_conv_attr['name'] == conv.name + assert hls_conv_attr['class_name'] == 'Conv2D' + assert hls_conv_attr['filt_width'] == kw + assert hls_conv_attr['filt_height'] == kh + assert hls_conv_attr['n_filt'] == ch_out + assert hls_conv_attr['stride_width'] == _stride[1] + assert hls_conv_attr['stride_height'] == _stride[0] + assert hls_conv_attr['data_format'] == conv.data_format + + if conv.data_format == 'channels_first': + assert hls_conv_attr['n_chan'] == inp_shape[0] + assert hls_conv_attr['in_height'] == inp_shape[1] + assert hls_conv_attr['in_width'] == inp_shape[2] + assert hls_conv_attr['out_height'] == out_shape[1] + assert hls_conv_attr['out_width'] == out_shape[2] + elif model.layers[0].data_format == 'channels_last': + assert hls_conv_attr['n_chan'] == inp_shape[2] + assert hls_conv_attr['in_height'] == inp_shape[0] + assert hls_conv_attr['in_width'] == inp_shape[1] + assert hls_conv_attr['out_height'] == out_shape[0] + assert hls_conv_attr['out_width'] == out_shape[1] + + if conv.padding == 'same': + if conv.data_format == 'channels_first': + h_pad = math.ceil(inp_shape[1] / kh) * kh - inp_shape[1] + w_pad = math.ceil(inp_shape[2] / kw) * kw - inp_shape[2] + elif model.layers[0].data_format == 'channels_last': + h_pad = math.ceil(inp_shape[0] / kh) * kh - inp_shape[0] + w_pad = math.ceil(inp_shape[1] / kw) * kw - inp_shape[1] + else: + raise ValueError('Invalid data_format') + pad_top = h_pad // 2 + pad_bottom = h_pad - pad_top + pad_left = w_pad // 2 + pad_right = w_pad - pad_left + assert hls_conv_attr['pad_top'] == pad_top + assert hls_conv_attr['pad_bottom'] == pad_bottom + assert hls_conv_attr['pad_left'] == pad_left + assert hls_conv_attr['pad_right'] == pad_right + elif model.layers[0].padding == 'valid': + assert hls_conv_attr['pad_top'] == 0 + assert hls_conv_attr['pad_bottom'] == 0 + assert hls_conv_attr['pad_left'] == 0 + assert hls_conv_attr['pad_right'] == 0 + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) +@pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) +def test_depthwise2d(backend, io_type): + ''' + Test proper handling of DepthwiseConv2D + ''' + X = np.random.rand(10, 32, 32, 3) + X = np.round(X * 2**10) * 2**-10 # make it an exact ap_fixed<16,6> + model = keras.models.Sequential([keras.layers.Input((32, 32, 3)), DepthwiseConv2D(kernel_size=(3, 3))]) + model.compile() + + config = hls4ml.utils.config_from_keras_model( + model, granularity='name', default_precision='fixed<32,12>', backend=backend + ) + output_dir = str(test_root_path / f'hls4mlprj_keras_api_depthwiseconv2d_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + ) + hls_model.compile() + + y_qkeras = model.predict(X) + y_hls4ml = hls_model.predict(X) + + np.testing.assert_allclose(y_qkeras, y_hls4ml.reshape(y_qkeras.shape), rtol=1e-2, atol=0.01) # 
type: ignore + + +# Currently only Vivado and Vitis is supported for io_stream. +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +@pytest.mark.parametrize('io_type', ['io_stream']) +def test_depthwise1d(backend, io_type): + ''' + Test proper handling of DepthwiseConv1D. + ''' + X = np.random.rand(10, 32, 3) + X = np.round(X * 2**10) * 2**-10 # make it an exact ap_fixed<16,6> + model = keras.Sequential([DepthwiseConv1D(kernel_size=3, input_shape=(32, 3))]) + model.compile() + + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend) + output_dir = str(test_root_path / f'hls4mlprj_keras_api_depthwiseconv1d_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + ) + hls_model.compile() + + y_qkeras = model.predict(X) + y_hls4ml = hls_model.predict(X) + + np.testing.assert_allclose(y_qkeras, y_hls4ml.reshape(y_qkeras.shape), rtol=1e-2, atol=0.01) # type: ignore + + +pooling_layers = [MaxPooling1D, MaxPooling2D, AveragePooling1D, AveragePooling2D] + + +@pytest.mark.parametrize('pooling', pooling_layers) +@pytest.mark.parametrize('padds', padds_options) +@pytest.mark.parametrize('chans', chans_options) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI', 'Catapult']) +def test_pooling(pooling, padds, chans, backend): + assert '1D' in pooling.__name__ or '2D' in pooling.__name__ + + input_shape = (18, 15, 3) if '2D' in pooling.__name__ else (121, 3) + pool_size = (4, 2) if '2D' in pooling.__name__ else 2 + + X_input = np.random.rand(100, *input_shape) + + keras_model = keras.Sequential([pooling(pool_size, padding=padds, input_shape=input_shape)]) + keras_model.compile() + + hls_cfg = hls4ml.utils.config_from_keras_model(keras_model) + output_dir = str( + test_root_path / f'hls4mlprj_keras_api_pooling_{pooling.__name__}_channels_{chans}_padds_{padds}_backend_{backend}' + ) + hls_model = hls4ml.converters.convert_from_keras_model( + keras_model, hls_config=hls_cfg, output_dir=output_dir, backend=backend + ) + hls_model.compile() + + # Verify accuracy + keras_prediction = keras_model.predict(X_input) + hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) # type: ignore + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=3e-2) + + # # Verify correct parsing of layer + # hls_pool = list(hls_model.get_layers())[-1] + # ker_pool = keras_model.layers[-1] + # if '2D' in pooling.__name__: + # assert hls_pool.attributes['name'] == ker_pool._name + # assert hls_pool.attributes['class_name'][-2] == str(2) + # assert hls_pool.attributes['stride_height'] == ker_pool.strides[0] + # assert hls_pool.attributes['stride_width'] == ker_pool.strides[1] + # assert hls_pool.attributes['pool_height'] == ker_pool.pool_size[1] + # assert hls_pool.attributes['pool_width'] == ker_pool.pool_size[0] + + # if hls_pool.attributes['data_format'] == 'channels_last': + # assert hls_pool.attributes['in_height'] == ker_pool.input_shape[1] + # assert hls_pool.attributes['in_width'] == ker_pool.input_shape[2] + # assert hls_pool.attributes['n_filt'] == ker_pool.input_shape[3] + # elif hls_pool.attributes['data_format'] == 'channels_first': + # assert hls_pool.attributes['in_height'] == ker_pool.input_shape[2] + # assert hls_pool.attributes['in_width'] == ker_pool.input_shape[3] + # assert hls_pool.attributes['n_filt'] == ker_pool.input_shape[1] + + # if ker_pool.padding == 'same': + # # Height + # in_height = 
ker_pool.input_shape[1] + # if ker_pool.data_format == 'channels_first': + # in_height = ker_pool.input_shape[2] + # out_height = int(math.ceil(float(in_height) / float(ker_pool.strides[0]))) + # assert out_height == hls_pool.attributes['out_height'] + # if in_height % ker_pool.strides[0] == 0: + # pad_along_height = max(ker_pool.pool_size[1] - ker_pool.strides[0], 0) + # else: + # pad_along_height = max(ker_pool.pool_size[1] - (in_height % ker_pool.strides[0]), 0) + # pad_top = pad_along_height // 2 + # pad_bottom = pad_along_height - pad_top + # assert pad_bottom == hls_pool.attributes['pad_bottom'] + # assert pad_top == hls_pool.attributes['pad_top'] + + # # Width + # in_width = ker_pool.input_shape[2] + # if ker_pool.data_format == 'channels_first': + # in_height = keras_model.layers[1].input_shape[-1] + # out_width = int(math.ceil(float(in_width) / float(ker_pool.strides[1]))) + # assert out_width == hls_pool.attributes['out_width'] + # if in_width % ker_pool.strides[1] == 0: + # pad_along_width = max(ker_pool.pool_size[0] - ker_pool.strides[1], 0) + # else: + # pad_along_width = max(ker_pool.pool_size[0] - (in_width % ker_pool.strides[1]), 0) + # pad_left = pad_along_width // 2 + # pad_right = pad_along_width - pad_left + # assert pad_left == hls_pool.attributes['pad_left'] + # assert pad_right == hls_pool.attributes['pad_right'] + + # elif ker_pool.padding == 'valid': + # if hls_pool.attributes['data_format'] == 'channels_first': + # in_height = ker_pool.input_shape[2] + # in_width = ker_pool.input_shape[3] + # elif hls_pool.attributes['data_format'] == 'channels_last': + # in_height = ker_pool.input_shape[1] + # in_width = ker_pool.input_shape[2] + # else: + # raise ValueError('Invalid data_format') + + # out_width = int(math.ceil(float(in_width - ker_pool.pool_size[0] + 1) / float(ker_pool.strides[1]))) + # out_height = int(math.ceil(float(in_height - ker_pool.pool_size[1] + 1) / float(ker_pool.strides[0]))) + + # assert hls_pool.attributes['out_height'] == out_height + # assert hls_pool.attributes['out_width'] == out_width + # assert hls_pool.attributes['pad_top'] == 0 + # assert hls_pool.attributes['pad_bottom'] == 0 + # assert hls_pool.attributes['pad_left'] == 0 + # assert hls_pool.attributes['pad_right'] == 0 + + # elif '1D' in pooling.__name__: + # assert hls_pool.attributes['name'] == ker_pool._name + # assert hls_pool.attributes['class_name'][-2] == str(1) + # assert hls_pool.attributes['n_in'] == ker_pool.input_shape[1] + # assert hls_pool.attributes['n_filt'] == ker_pool.input_shape[2] + # assert hls_pool.attributes['pool_width'] == ker_pool.pool_size[0] + # assert hls_pool.attributes['stride_width'] == ker_pool.strides[0] + + # out_same = math.ceil(float(ker_pool.input_shape[1]) / float(ker_pool.strides[0])) + # out_valid = math.ceil(float(ker_pool.input_shape[1] - ker_pool.pool_size[0] + 1) / ker_pool.strides[0]) + + # if ker_pool.padding == 'same': + # assert hls_pool.attributes['n_out'] == out_same + # if ker_pool.input_shape[1] % ker_pool.strides[0] == 0: + # pad_along_width = max(ker_pool.pool_size[0] - ker_pool.strides[0], 0) + # else: + # pad_along_width = max(ker_pool.pool_size[0] - (ker_pool.input_shape[1] % ker_pool.strides[0]), 0) + # assert hls_pool.attributes['pad_left'] == pad_along_width // 2 + # assert hls_pool.attributes['pad_right'] == pad_along_width - pad_along_width // 2 + + # elif ker_pool.padding == 'valid': + # assert hls_pool.attributes['n_out'] == out_valid + # assert hls_pool.attributes['pad_left'] == 0 + # assert 
hls_pool.attributes['pad_right'] == 0 + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult', 'oneAPI']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def test_reused_layer(backend, io_type): + + inp1 = keras.layers.Input(shape=(10, 10)) + inp2 = keras.layers.Input(shape=(10, 10)) + + conv = keras.layers.Conv1D(2, 3, activation='relu') + + o1 = conv(inp1) + o2 = conv(inp2) + o3 = keras.layers.Add()([o1, o2]) + o4 = keras.layers.Dense(5)(o3) + + _ = keras.layers.Dense(5)(o3) + + model = keras.models.Model(inputs=[inp1, inp2], outputs=[o1, o2, o3, o4]) + + _ = model([inp1, inp1]) + + hls_config = {'Model': {'Precision': 'ap_fixed<32,8>', 'ReuseFactor': 1}} + output_dir = str(test_root_path / f'hls4mlprj_keras_api_conv1d_{backend}_{io_type}') + + model_hls = hls4ml.converters.convert_from_keras_model( + model, backend=backend, io_type=io_type, hls_config=hls_config, output_dir=output_dir + ) + + model_hls.compile() + + data = [np.random.rand(1000, 10, 10).astype(np.float32), np.random.rand(1000, 10, 10).astype(np.float32)] + keras_pred = model.predict(data) + hls_pred = model_hls.predict(data) + + np.testing.assert_allclose(keras_pred[0].reshape(hls_pred[0].shape), hls_pred[0], rtol=0, atol=1e-5) + np.testing.assert_allclose(keras_pred[1].reshape(hls_pred[1].shape), hls_pred[1], rtol=0, atol=1e-5) + np.testing.assert_allclose(keras_pred[2].reshape(hls_pred[2].shape), hls_pred[2], rtol=0, atol=1e-5) + np.testing.assert_allclose(keras_pred[3].reshape(hls_pred[3].shape), hls_pred[3], rtol=0, atol=1e-2) From 3f8acb5d7187be1347734e62ed96595d4449cdec Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Tue, 17 Dec 2024 08:57:49 +0000 Subject: [PATCH 20/69] pre-commit fix --- hls4ml/converters/keras_v3/conv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hls4ml/converters/keras_v3/conv.py b/hls4ml/converters/keras_v3/conv.py index 871bcb942d..df226fc6b5 100644 --- a/hls4ml/converters/keras_v3/conv.py +++ b/hls4ml/converters/keras_v3/conv.py @@ -73,7 +73,6 @@ def handle( 'bias_data': bias, 'data_format': data_format, 'weight_data': kernel, - 'bias_data': bias, 'n_filt': ch_out, 'n_chan': ch_in, } From d2ccfb4be90994bf66a01480d45094f4828c48e6 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 6 Dec 2024 06:16:21 +0000 Subject: [PATCH 21/69] revert keras v2 converter --- hls4ml/converters/keras_to_hls.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index a206da4da7..aa7bfe8862 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -1,5 +1,4 @@ import json -from warnings import warn import h5py @@ -231,8 +230,8 @@ def parse_keras_model(model_arch, reader): layer_config = model_arch['config'] if 'layers' in layer_config: # Newer Keras versions have 'layers' in 'config' key layer_config = layer_config['layers'] + # Sequential doesn't have InputLayer in TF < 2.3 (Keras 2.4.0) if layer_config[0]['class_name'] != 'InputLayer': - warn(DeprecationWarning('keras < 2.4.0 (tf 2.3) is deprecated. 
Please use a newer version.')) input_layer = {} input_layer['name'] = 'input1' input_layer['class_name'] = 'InputLayer' @@ -244,33 +243,25 @@ def parse_keras_model(model_arch, reader): layer_config = model_arch['config']['layers'] input_layers = [inp[0] for inp in model_arch['config']['input_layers']] output_layers = [out[0] for out in model_arch['config']['output_layers']] - else: - raise Exception(f'ERROR: Model class not supported: {model_arch["class_name"]}') # Get input shape and check for unsupported layer type for keras_layer in layer_config: if keras_layer['class_name'] not in supported_layers: - raise Exception(f'ERROR: Unsupported layer type: {keras_layer["class_name"]}') + raise Exception('ERROR: Unsupported layer type: {}'.format(keras_layer['class_name'])) output_shapes = {} output_shape = None print('Topology:') for keras_layer in layer_config: - if 'batch_input_shape' in keras_layer['config'] or 'batch_shape' in keras_layer['config']: + if 'batch_input_shape' in keras_layer['config']: if 'inbound_nodes' in keras_layer and len(keras_layer['inbound_nodes']) > 0: input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] else: - _input_shapes = keras_layer['config'].get('batch_input_shape', None) - input_shapes = _input_shapes or keras_layer['config']['batch_shape'] + input_shapes = [keras_layer['config']['batch_input_shape']] else: if 'inbound_nodes' in keras_layer: - if 'args' in keras_layer['inbound_nodes'][0]: - # keras v3 - input_shapes = [arg['config']['shape'] for arg in keras_layer['inbound_nodes'][0]['args']] - else: - # keras v2 - input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] + input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] else: # Sequential model, so output_shape from the previous layer is still valid input_shapes = [output_shape] From 033496019f6e6a596121252815cd494d21daabd8 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 13 Nov 2024 05:26:59 +0000 Subject: [PATCH 22/69] make reshape handler compatiable with keras v3 --- hls4ml/converters/keras/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/converters/keras/reshape.py b/hls4ml/converters/keras/reshape.py index 1f6dc2a759..08803df828 100644 --- a/hls4ml/converters/keras/reshape.py +++ b/hls4ml/converters/keras/reshape.py @@ -24,7 +24,7 @@ def parse_reshape_layer(keras_layer, input_names, input_shapes, data_reader): layer = parse_default_keras_layer(keras_layer, input_names) layer['target_shape'] = keras_layer['config']['target_shape'] - output_shape = input_shapes[0][:1] + keras_layer['config']['target_shape'] + output_shape = input_shapes[0][:1] + list(keras_layer['config']['target_shape']) return layer, output_shape From 074b4b63f45f1084205fb5c29422722ede9cdbf0 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 13 Nov 2024 04:43:23 +0000 Subject: [PATCH 23/69] add general transpose for vivado/vitis --- .../vivado/passes/reshaping_templates.py | 61 +++++++++++++---- hls4ml/model/layers.py | 8 ++- .../templates/vivado/nnet_utils/nnet_array.h | 52 -------------- .../templates/vivado/nnet_utils/nnet_stream.h | 23 ------- .../vivado/nnet_utils/nnet_transpose.h | 39 +++++++++++ .../vivado/nnet_utils/nnet_transpose_stream.h | 67 +++++++++++++++++++ 6 files changed, 158 insertions(+), 92 deletions(-) delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_array.h create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_transpose.h 
create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_transpose_stream.h diff --git a/hls4ml/backends/vivado/passes/reshaping_templates.py b/hls4ml/backends/vivado/passes/reshaping_templates.py index ec6705eb29..f43d394cd9 100644 --- a/hls4ml/backends/vivado/passes/reshaping_templates.py +++ b/hls4ml/backends/vivado/passes/reshaping_templates.py @@ -1,3 +1,7 @@ +from math import prod + +import numpy as np + from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate from hls4ml.model.layers import Resize, Transpose, ZeroPadding1D, ZeroPadding2D @@ -97,16 +101,45 @@ def format(self, node): # Transpose templates -transpose_config_template = """struct config{index} : nnet::transpose_config {{ - static const unsigned depth = {depth}; - static const unsigned height = {height}; - static const unsigned width = {width}; - static constexpr unsigned perm[3] = {{{perm_str}}}; -}};\n""" -transpose_function_template = 'nnet::transpose_{dim}<{input_t}, {output_t}, {config}>({input}, {output});' +transpose_include_list = ['nnet_utils/nnet_transpose.h', 'nnet_utils/nnet_transpose_stream.h'] + +transpose_config_template = """struct {config_name} {{ + static const unsigned dims = {dims}; + static const unsigned N = {N}; + static const unsigned* const from_shape; + static const unsigned* const to_shape; + static const unsigned* const perm; + static const unsigned* const perm_strides; +}}; + +unsigned {config_name}_from_shape[{dims}] = {{{from_shape}}}; +unsigned {config_name}_to_shape[{dims}] = {{{to_shape}}}; +unsigned {config_name}_perm[{dims}] = {{{perm}}}; +unsigned {config_name}_perm_strides[{dims}] = {{{perm_strides}}}; + +const unsigned* const {config_name}::from_shape = {config_name}_from_shape; +const unsigned* const {config_name}::to_shape = {config_name}_to_shape; +const unsigned* const {config_name}::perm = {config_name}_perm; +const unsigned* const {config_name}::perm_strides = {config_name}_perm_strides; +""" + +transpose_function_template = 'nnet::transpose<{input_t}, {output_t}, {config_name}>({input}, {output});' -transpose_include_list = ['nnet_utils/nnet_array.h', 'nnet_utils/nnet_stream.h'] + +def permute_config_gen(name: str, shape: tuple[int, ...], perm: tuple[int, ...]): + new_shape = tuple(shape[i] for i in perm) + strides = np.cumprod((shape[1:] + (1,))[::-1])[::-1] + perm_strides = tuple(int(strides[i]) for i in perm) + return transpose_config_template.format( + dims=len(shape), + N=prod(shape), + from_shape=', '.join(str(x) for x in shape), + perm=', '.join(str(x) for x in perm), + perm_strides=', '.join(str(x) for x in perm_strides), + to_shape=', '.join(str(x) for x in new_shape), + config_name=name, + ) class TransposeConfigTemplate(LayerConfigTemplate): @@ -115,18 +148,18 @@ def __init__(self): self.template = transpose_config_template def format(self, node): - params = self._default_config_params(node) - - return self.template.format(**params) + shape = tuple(node.get_input_variable().shape) + perm = tuple(node.get_attr('perm')) + name = f'config{node.index}' + return permute_config_gen(name, shape, perm) class TransposeFunctionTemplate(FunctionCallTemplate): def __init__(self): - super().__init__(Transpose, include_header=transpose_include_list) self.template = transpose_function_template + super().__init__(Transpose, include_header=transpose_include_list) def format(self, node): params = self._default_function_params(node) - params['dim'] = node.get_attr('dim') - + params['config_name'] = f'config{node.index}' return self.template.format(**params) 
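As an illustration of the code this generator emits (the shape (2, 3, 4), permutation (2, 0, 1) and name 'config7' below are arbitrary example values, not taken from the patch), permute_config_gen('config7', (2, 3, 4), (2, 0, 1)) renders transpose_config_template to roughly:

    struct config7 {
        static const unsigned dims = 3;
        static const unsigned N = 24;
        static const unsigned* const from_shape;
        static const unsigned* const to_shape;
        static const unsigned* const perm;
        static const unsigned* const perm_strides;
    };

    unsigned config7_from_shape[3] = {2, 3, 4};
    unsigned config7_to_shape[3] = {4, 2, 3};
    unsigned config7_perm[3] = {2, 0, 1};
    unsigned config7_perm_strides[3] = {1, 12, 4};

    const unsigned* const config7::from_shape = config7_from_shape;
    const unsigned* const config7::to_shape = config7_to_shape;
    const unsigned* const config7::perm = config7_perm;
    const unsigned* const config7::perm_strides = config7_perm_strides;

perm_strides holds the row-major strides of the input shape reordered by perm, which is what lets nnet::transpose (and transfer_idx below) map each flat output index back to the corresponding flat input index without any further shape arithmetic.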
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 3847cda9cf..aac11cc7a3 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1221,8 +1221,7 @@ def initialize(self): perm = self.get_attr('perm') self.set_attr('dim', f'{len(inp.shape)}d') - if len(perm) > 3: - raise Exception('ERROR: Transpose of tensors with rank > 3 is not yet supported.') + # TODO: dim>3 is only supported for vivado/vitis backend # ONNX double transpose specific, sometimes ONNX injects # useless double transpose layers when converting @@ -1242,11 +1241,14 @@ def initialize(self): self.set_attr('depth', 1) self.set_attr('height', inp.shape[0]) self.set_attr('width', inp.shape[1]) - elif len(shape) > 2: + elif len(shape) == 3: dims = [f'OUT_DEPTH_{self.index}', f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}'] self.set_attr('depth', inp.shape[0]) self.set_attr('height', inp.shape[1]) self.set_attr('width', inp.shape[2]) + elif len(shape) > 3: + # Differentiate between 2/3/3+ dim does not really appear to be needed. To be removed? + dims = [f'OUT_DIM_{i}_{self.index}' for i in range(1, len(shape) + 1)] self.add_output_variable(shape, dims, precision=inp.type.precision) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_array.h b/hls4ml/templates/vivado/nnet_utils/nnet_array.h deleted file mode 100644 index d179102a99..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_array.h +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef NNET_ARRAY_H_ -#define NNET_ARRAY_H_ - -#include - -namespace nnet { - -struct transpose_config { - static const unsigned height = 10; - static const unsigned width = 10; - static const unsigned depth = 10; - static constexpr unsigned perm[3] = {2, 0, 1}; -}; - -template -void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[CONFIG_T::height * CONFIG_T::width]) { - #pragma HLS PIPELINE - - for (int i = 0; i < CONFIG_T::height; i++) { - for (int j = 0; j < CONFIG_T::width; j++) { - data_t[j * CONFIG_T::height + i] = data[i * CONFIG_T::width + j]; - } - } -} - -template -void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], - res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) { - unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; - unsigned dims_t[3]; - dims_t[0] = dims[CONFIG_T::perm[0]]; - dims_t[1] = dims[CONFIG_T::perm[1]]; - dims_t[2] = dims[CONFIG_T::perm[2]]; - - int idx[3] = {0}, idx_t[3] = {0}; - for (idx[0] = 0; idx[0] < dims[0]; idx[0]++) { - for (idx[1] = 0; idx[1] < dims[1]; idx[1]++) { - for (idx[2] = 0; idx[2] < dims[2]; idx[2]++) { - idx_t[0] = idx[CONFIG_T::perm[0]]; - idx_t[1] = idx[CONFIG_T::perm[1]]; - idx_t[2] = idx[CONFIG_T::perm[2]]; - - data_t[idx_t[0] * dims_t[1] * dims_t[2] + idx_t[1] * dims_t[2] + idx_t[2]] = - data[idx[0] * dims[1] * dims[2] + idx[1] * dims[2] + idx[2]]; - } - } - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_stream.h index 900db16c36..33538ede9f 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_stream.h @@ -179,29 +179,6 @@ void broadcast_stream(hls::stream &data, hls::stream &res) { } } -template -void transpose_2d(hls::stream &data, hls::stream &res) { - typename data_T::value_type data_array[CONFIG_T::height * CONFIG_T::width]; - #pragma HLS ARRAY_PARTITION variable=data_array complete - - for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_T::size; i++) { - #pragma 
HLS PIPELINE - data_T in_data = data.read(); - for (int j = 0; j < data_T::size; j++) { - data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); - } - } - - for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_T::size; i++) { - #pragma HLS PIPELINE - res_T out_data; - PRAGMA_DATA_PACK(out_data) - for (int j = 0; j < res_T::size; j++) { - out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); - } - res.write(out_data); - } -} } // namespace nnet #endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_transpose.h b/hls4ml/templates/vivado/nnet_utils/nnet_transpose.h new file mode 100644 index 0000000000..85238c25dd --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_transpose.h @@ -0,0 +1,39 @@ +#ifndef NNET_PERMUTE_H_ +#define NNET_PERMUTE_H_ + +namespace nnet { + +struct transpose_config { + static const unsigned dims; + static const unsigned N; + // vivado/vitis hls can't indexing constexpr array for some reason + // and vivado hls don't like template recursion either (vitis is fine) + // thus this appears to be the only workaround (or overkill it with codegen) + static const unsigned *const from_shape; + static const unsigned *const to_shape; + static const unsigned *const perm; + static const unsigned *const perm_strides; +}; + +template unsigned transfer_idx(int index) { + // Given output idx in c-order flat array, return input idx + int idx = 0; + for (int i = CONFIG_T::dims - 1; i >= 0; i--) { + idx += (index % CONFIG_T::to_shape[i]) * CONFIG_T::perm_strides[i]; + index /= CONFIG_T::to_shape[i]; + } + return idx; +} + +template +void transpose(const data_T data[CONFIG_T::N], res_T res[CONFIG_T::N]) { + for (int i = 0; i < CONFIG_T::N; i++) { + #pragma HLS UNROLL + int idx = transfer_idx(i); + res[i] = data[idx]; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_transpose_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_transpose_stream.h new file mode 100644 index 0000000000..7f46a68bd2 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_transpose_stream.h @@ -0,0 +1,67 @@ +#ifndef NNET_TRANSPOSE_STREAM_H +#define NNET_TRANSPOSE_STREAM_H + +#include "hls_stream.h" +#include "nnet_transpose.h" +#include + +namespace nnet { + +template +typename std::enable_if::type transpose(hls::stream &data, hls::stream &res) { + // #pragma HLS INLINE RECURSIVE + typename data_T::value_type data_array[CONFIG_T::N]; + #pragma HLS ARRAY_PARTITION variable=data_array complete + + for (int i = 0; i < CONFIG_T::N / data_T::size; i++) { + #pragma HLS PIPELINE + data_T in_data = data.read(); + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); + } + } + + for (int i = 0; i < CONFIG_T::N / res_T::size; i++) { + #pragma HLS PIPELINE + res_T out_data; + PRAGMA_DATA_PACK(out_data) + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = typename res_T::value_type(data_array[j * CONFIG_T::from_shape[1] + i]); + } + res.write(out_data); + } +} + +// This sfinae is for vivado_hls, which has some overhead using the transfer_idx in io_stream. +// In vitis both performs exactly the same, thus this is not removed out of convenience. 
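+
+// Illustrative only (this struct is not emitted verbatim by the code generator): for a
+// rank-2 transpose of a (3, 4) tensor with perm = (1, 0), the config consumed by
+// transfer_idx/transpose would carry roughly
+//
+//   dims = 2, N = 12,
+//   from_shape = {3, 4}, to_shape = {4, 3},
+//   perm       = {1, 0}, perm_strides = {1, 4},
+//
+// so transfer_idx maps output index r * 3 + c (r < 4, c < 3) back to input index c * 4 + r.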
+template +typename std::enable_if::type transpose(hls::stream &data, hls::stream &res) { + // #pragma HLS INLINE RECURSIVE + typename data_T::value_type data_array[CONFIG_T::N]; + #pragma HLS ARRAY_PARTITION variable=data_array complete + + for (int i = 0; i < CONFIG_T::N / data_T::size; i++) { + #pragma HLS PIPELINE + data_T in_data = data.read(); + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); + } + } + + for (int i = 0; i < CONFIG_T::N / res_T::size; i++) { + #pragma HLS PIPELINE + res_T out_data; + PRAGMA_DATA_PACK(out_data) + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = typename res_T::value_type(data_array[transfer_idx(i * res_T::size + j)]); + } + res.write(out_data); + } +} + +} // namespace nnet +#endif From 29674db676d095f615e5d0fe55869084c14341ff Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 15 Nov 2024 05:05:42 +0000 Subject: [PATCH 24/69] general einsum support for io_parallel and latency --- hls4ml/backends/vivado/passes/einsum_dense.py | 120 +++++++++ .../vivado/passes/reshaping_templates.py | 4 +- hls4ml/converters/keras_v3/__init__.py | 1 + hls4ml/converters/keras_v3/einsum_dense.py | 72 ++++++ hls4ml/model/layers.py | 66 ++++- .../vivado/nnet_utils/nnet_einsum_dense.h | 78 ++++++ hls4ml/utils/einsum_utils.py | 241 ++++++++++++++++++ 7 files changed, 579 insertions(+), 3 deletions(-) create mode 100644 hls4ml/backends/vivado/passes/einsum_dense.py create mode 100644 hls4ml/converters/keras_v3/einsum_dense.py create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_einsum_dense.h create mode 100644 hls4ml/utils/einsum_utils.py diff --git a/hls4ml/backends/vivado/passes/einsum_dense.py b/hls4ml/backends/vivado/passes/einsum_dense.py new file mode 100644 index 0000000000..fb52873814 --- /dev/null +++ b/hls4ml/backends/vivado/passes/einsum_dense.py @@ -0,0 +1,120 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import EinsumDense + +from .reshaping_templates import transpose_config_gen + +# Shared Dense template + +conv_dense_config_template = """struct config{index}_dense : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned reuse_factor = {reuse}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned n_zeros = {nzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + template + using kernel = nnet::{dense_function}; + template + using product = nnet::product::{product_type}; +}};\n""" + +# EinsumDense template + +einsum_dense_config_template = ''' +struct config{index} {{ + typedef config{index}_tpose_inp tpose_inp_conf; + typedef config{index}_tpose_out tpose_out_conf; + typedef config{index}_dense dense_conf; + + // Layer Sizes + static const unsigned n_free_data = {n_free_data}; + static const unsigned n_free_kernel = {n_free_kernel}; + static const unsigned n_contract = {n_contract}; + static const unsigned n_inplace = {n_inplace}; + + // Resource reuse info + static const unsigned io_type = nnet::{iotype}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned reuse_factor = {reuse_factor}; + static const unsigned parallelization_factor = 
{parallelization_factor}; // Only useful when n_inplace > 1 + static const bool store_weights_in_bram = false; // NOT USED +}}; +''' + +einsum_dense_function_template = 'nnet::einsum_dense<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' + +einsum_dense_include_list = ['nnet_utils/nnet_einsum_dense.h', 'nnet_utils/nnet_dense.h'] + + +class EinsumDenseConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(EinsumDense) + self.template = einsum_dense_config_template + self.dense_template = conv_dense_config_template + + def format(self, node: EinsumDense): + default_params = self._default_config_params(node) + + strategy = node.model.config.get_strategy(node) + io_type = node.model.config.get_config_value('IOType') + + assert io_type == 'io_parallel', 'EinsumDense layer only supports io_parallel for now' + assert strategy.lower() == 'latency', 'EinsumDense layer only supports Latency strategy for now' + + # EinsumDense config + params = default_params.copy() + params['strategy'] = strategy + params['n_free_data'] = node.attributes.attributes['n_free_data'] + params['n_free_kernel'] = node.attributes.attributes['n_free_kernel'] + params['n_contract'] = node.attributes.attributes['n_contract'] + params['n_inplace'] = node.attributes.attributes['n_inplace'] + params['parallelization_factor'] = node.attributes.attributes['parallelization_factor'] + + einsum_conf = self.template.format(**params) + + # inp/out transpose config + inp_shape = node.attributes.attributes['inp_shape'] + out_interpert_shape = node.attributes.attributes['out_interpert_shape'] + inp_tpose_idxs = node.attributes.attributes['inp_tpose_idxs'] + out_tpose_idxs = node.attributes.attributes['out_tpose_idxs'] + tpose_inp_conf_name = f'config{node.index}_tpose_inp' + tpose_out_conf_name = f'config{node.index}_tpose_out' + + inp_tpose_conf = transpose_config_gen(tpose_inp_conf_name, inp_shape, inp_tpose_idxs) + out_tpose_conf = transpose_config_gen(tpose_out_conf_name, out_interpert_shape, out_tpose_idxs) + + # Dense config + dense_params = default_params.copy() + dense_params['strategy'] = strategy + dense_params['n_in'] = node.attributes.attributes['n_contract'] + dense_params['n_out'] = node.attributes.attributes['n_free_kernel'] + if node.attributes.attributes['n_inplace'] == 1: + dense_params['nzeros'] = node.get_weights('weight').nzeros # type: ignore + else: + dense_params['nzeros'] = '-1; // Not making sense when kernels are switching' + dense_params['product_type'] = get_backend('vivado').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision # type: ignore + ) + + dense_params['dense_function'] = 'DenseLatency' # Latency only for now + + dense_config = self.dense_template.format(**dense_params) + + return '\n\n'.join((inp_tpose_conf, out_tpose_conf, dense_config, einsum_conf)) + + +class EinsumDenseFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(EinsumDense, include_header=einsum_dense_include_list) + self.template = einsum_dense_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) diff --git a/hls4ml/backends/vivado/passes/reshaping_templates.py b/hls4ml/backends/vivado/passes/reshaping_templates.py index f43d394cd9..e59d81c8c5 100644 --- a/hls4ml/backends/vivado/passes/reshaping_templates.py +++ 
b/hls4ml/backends/vivado/passes/reshaping_templates.py @@ -127,7 +127,7 @@ def format(self, node): transpose_function_template = 'nnet::transpose<{input_t}, {output_t}, {config_name}>({input}, {output});' -def permute_config_gen(name: str, shape: tuple[int, ...], perm: tuple[int, ...]): +def transpose_config_gen(name: str, shape: tuple[int, ...], perm: tuple[int, ...]): new_shape = tuple(shape[i] for i in perm) strides = np.cumprod((shape[1:] + (1,))[::-1])[::-1] perm_strides = tuple(int(strides[i]) for i in perm) @@ -151,7 +151,7 @@ def format(self, node): shape = tuple(node.get_input_variable().shape) perm = tuple(node.get_attr('perm')) name = f'config{node.index}' - return permute_config_gen(name, shape, perm) + return transpose_config_gen(name, shape, perm) class TransposeFunctionTemplate(FunctionCallTemplate): diff --git a/hls4ml/converters/keras_v3/__init__.py b/hls4ml/converters/keras_v3/__init__.py index f658faa1fb..6dffcb71d5 100644 --- a/hls4ml/converters/keras_v3/__init__.py +++ b/hls4ml/converters/keras_v3/__init__.py @@ -1,5 +1,6 @@ from . import conv # noqa: F401 from . import core # noqa: F401 +from . import einsum_dense # noqa: F401 from ._base import registry as layer_handlers __all__ = ['layer_handlers'] diff --git a/hls4ml/converters/keras_v3/einsum_dense.py b/hls4ml/converters/keras_v3/einsum_dense.py new file mode 100644 index 0000000000..f0f4c7223a --- /dev/null +++ b/hls4ml/converters/keras_v3/einsum_dense.py @@ -0,0 +1,72 @@ +import typing +from typing import Sequence + +from ._base import KerasV3LayerHandler, register + +if typing.TYPE_CHECKING: + import keras + from keras.api import KerasTensor + + +def strip_batch_dim(equation: str): + """Remove the batch dimension from the equation. + + Args: + equation (str): The einsum equation. + + Returns: + str: The einsum equation without the batch dimension. + """ + + _inps, out = equation.split('->') + inp0, inp1 = _inps.split(',') + if inp0.startswith('...'): + assert out.startswith('...'), f'Error in eq: {equation}: Batch dim mismatch for the input and output.' + else: + assert inp0[0] == out[0], f'Error in eq: {equation}: Batch dim mismatch for the input and output.' + assert inp0[0] not in inp1, f'Error in eq: {equation}: Batch dim is used in the kernel.' + inp0, out = inp0[1:], out[1:] + return f'{inp0},{inp1}->{out}' + + +@register +class KV3EinsumDenseHandler(KerasV3LayerHandler): + handles = ('keras.src.layers.core.einsum_dense.EinsumDense',) + + def handle( + self, + layer: 'keras.layers.EinsumDense', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + import keras + + assert len(in_tensors) == 1, 'EinsumDense layer must have exactly one input tensor' + assert len(out_tensors) == 1, 'EinsumDense layer must have exactly one output tensor' + + inp_shape: tuple[int, ...] = in_tensors[0].shape[1:] # type: ignore + out_shape: tuple[int, ...] 
= out_tensors[0].shape[1:] # type: ignore + + # fmt: off + assert all(d is not None for d in inp_shape), \ + f'Error when processing {layer.name}: EinsumDense layer requires fully inp shapes' + assert all(d is not None for d in out_shape), \ + f'Error when processing {layer.name}: EinsumDense layer requires fully out shapes' + # fmt: on + + equation = strip_batch_dim(layer.equation) + + kernel = keras.ops.convert_to_numpy(layer.kernel) + + bias = None + if layer.bias_axes: + bias = keras.ops.convert_to_numpy(layer.bias) + + return { + 'class_name': 'EinsumDense', + 'equation': equation, + 'weight_data': kernel, + 'bias_data': bias, + 'inp_shape': inp_shape, + 'out_shape': out_shape, + } diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index aac11cc7a3..5392e2ffe5 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -27,10 +27,12 @@ find_minimum_width, ) from hls4ml.utils import attribute_descriptions as descriptions +from hls4ml.utils.einsum_utils import parse_einsum from hls4ml.utils.string_utils import convert_to_snake_case - # TODO move this to some utility module + + class classproperty: def __init__(self, func): self.func = func @@ -1618,6 +1620,67 @@ def initialize(self): self.add_output_variable([len(self.get_attr('expression'))], [f'N_OUTPUTS_{self.index}'], var_name='y') +class EinsumDense(Layer): + _expected_attributes = [ + WeightAttribute('weight'), + WeightAttribute('bias'), + TypeAttribute('weight'), + TypeAttribute('bias'), + TypeAttribute('accum'), + Attribute('equation', value_type=str), + Attribute('inp_shape', value_type=tuple), + Attribute('out_shape', value_type=tuple), + ] + + def initialize(self): + out_shape = self.attributes['out_shape'] + if len(out_shape) > 1: + dims = [f'N_LAYER_{self.index}_D{i}' for i in range(1, len(out_shape) + 1)] + else: + dims = [f'N_LAYER_{self.index}'] + self.add_output_variable(list(out_shape), dims) + + kernel: np.ndarray = self.attributes.attributes['weight_data'] + bias: np.ndarray | None = self.attributes.attributes['bias_data'] + equation = self.attributes['equation'] + inp_shape = self.attributes['inp_shape'] + out_shape = self.attributes['out_shape'] + + recipe = parse_einsum(equation, inp_shape, kernel.shape) + inp_tpose_idxs, ker_tpose_idxs = recipe['in_transpose_idxs'] + out_tpose_idxs = recipe['out_transpose_idxs'] + + # Pre-transpose kernel (and bias) to save a transpose in cpp. Shouldn't matter for latency strategy though. + # hls4ml dense acts like i,ij->j + # parser assumes ij,j->i, so we need to transpose the kernel to match + kernel = kernel.transpose(ker_tpose_idxs) + kernel = kernel.reshape(recipe['I'], recipe['L1'], recipe['C']).transpose(0, 2, 1) + + # TODO: for weight in bram mode (resource), broadcasting bias here shall be avoided. + if bias is not None: + bias = np.broadcast_to(bias, out_shape).transpose(np.argsort(out_tpose_idxs)) + else: + # The automatically created bias is just the last dimension of the output shape + # Which is too small in general for einsum dense. + # The transpose is just to match the shape in case of have real bias, no real effect. 
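+            # Illustrative example (not from any real model): for equation 'bc,cd->bd'
+            # with inp_shape (4, 3) and kernel shape (3, 5), the recipe gives
+            # n_free_data L0 = 4, n_contract C = 3, n_free_kernel L1 = 5, n_inplace I = 1;
+            # the kernel is stored as (I, C, L1) = (1, 3, 5), and the zero bias created
+            # below is given the full output shape (4, 5), just like the broadcast real bias above.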
+ bias = np.zeros(out_shape).transpose(np.argsort(out_tpose_idxs)) + + self.attributes.attributes['weight_data'] = kernel + self.attributes.attributes['bias_data'] = bias + self.attributes['inp_tpose_idxs'] = inp_tpose_idxs + self.attributes['out_tpose_idxs'] = out_tpose_idxs + self.attributes['out_interpert_shape'] = recipe['out_interpert_shape'] + self.attributes['n_free_data'] = recipe['L0'] + self.attributes['n_free_kernel'] = recipe['L1'] + self.attributes['n_inplace'] = recipe['I'] + self.attributes['n_contract'] = recipe['C'] + pf = self.attributes.attributes.get('parallelization_factor', recipe['L0']) + self.attributes['parallelization_factor'] = pf + + self.add_weights(compression=self.model.config.get_compression(self)) + self.add_bias() + + layer_map = { 'Input': Input, 'InputLayer': Input, @@ -1686,6 +1749,7 @@ def initialize(self): 'SymbolicExpression': SymbolicExpression, # TensorFlow-specific layers: 'BiasAdd': BiasAdd, + 'EinsumDense': EinsumDense, } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_einsum_dense.h b/hls4ml/templates/vivado/nnet_utils/nnet_einsum_dense.h new file mode 100644 index 0000000000..1abb7c5d08 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_einsum_dense.h @@ -0,0 +1,78 @@ +#ifndef NNET_EINSUM_DENSE_H_ +#define NNET_EINSUM_DENSE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense_latency.h" +#include "nnet_dense_resource.h" +#include "nnet_function_stubs.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include "nnet_transpose.h" + +namespace nnet { + +struct einsum_dense_config { + // Internal data type definitions + + typedef void tpose_inp_conf; + typedef void tpose_out_conf; + typedef void dense_conf; + + // Layer Sizes + static const unsigned n_free_data = 1; + static const unsigned n_free_kernel = 1; + static const unsigned n_contract = 1; + static const unsigned n_inplace = 1; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned strategy = latency; + static const unsigned reuse_factor = 1; + static const unsigned parallelization_factor = 1000; // Only useful when n_inplace > 1 + static const bool store_weights_in_bram = false; // NOT USED + + // Product function to use + template using product = nnet::product::mult; +}; + +template +void einsum_dense( + data_T data[CONFIG_T::n_free_data * CONFIG_T::n_contract * CONFIG_T::n_inplace], + res_T res[CONFIG_T::n_free_data * CONFIG_T::n_free_kernel * CONFIG_T::n_inplace], + typename CONFIG_T::dense_conf::weight_t weights[CONFIG_T::n_free_kernel * CONFIG_T::n_contract * CONFIG_T::n_inplace], + typename CONFIG_T::dense_conf::bias_t biases[CONFIG_T::n_free_data * CONFIG_T::n_free_kernel * CONFIG_T::n_inplace]) { + data_T inp_tpose[CONFIG_T::n_free_data * CONFIG_T::n_contract * CONFIG_T::n_inplace]; + res_T out_tpose[CONFIG_T::n_free_data * CONFIG_T::n_free_kernel * CONFIG_T::n_inplace]; + res_T out_buffer[CONFIG_T::n_free_kernel]; + #pragma HLS ARRAY_PARTITION variable = inp_tpose complete + #pragma HLS ARRAY_PARTITION variable = out_tpose complete + + nnet::transpose(data, inp_tpose); + + constexpr unsigned L0 = CONFIG_T::n_free_data; + constexpr unsigned L1 = CONFIG_T::n_free_kernel; + constexpr unsigned C = CONFIG_T::n_contract; + constexpr unsigned I = CONFIG_T::n_inplace; + + for (unsigned l0 = 0; l0 < L0; l0++) { + #pragma HLS UNROLL factor = CONFIG_T::parallelization_factor + for (unsigned i = 0; i < I; i++) { + #pragma HLS UNROLL + // even w/o explicit distributed arithmetic optimization, latency kernels 
are partially implemented as such + // so reusing the same multiplier for different weights doesn't really help... only full unrolling for now + dense(&inp_tpose[(i * L0 + l0) * C], out_buffer, + &weights[(i * L1 * C)], &biases[((i * L0 + l0) * L1)]); + for (unsigned j = 0; j < L1; j++) { + #pragma HLS UNROLL + out_tpose[(i * L0 + l0) * L1 + j] = out_buffer[j]; + } + } + } + + nnet::transpose(out_tpose, res); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/utils/einsum_utils.py b/hls4ml/utils/einsum_utils.py new file mode 100644 index 0000000000..7d4253f763 --- /dev/null +++ b/hls4ml/utils/einsum_utils.py @@ -0,0 +1,241 @@ +from math import prod +from typing import TypedDict + +import numpy as np + + +class EinsumRecipe(TypedDict): + in_transpose_idxs: tuple[tuple[int, ...], tuple[int, ...]] + L0: int + L1: int + I: int + C: int + out_interpert_shape: tuple[int, ...] + out_transpose_idxs: tuple[int, ...] + + +def _validate_einsum_expr(fn: str, shape0: tuple[int, ...], shape1: tuple[int, ...]): + """Validate, resolve broadcasting, and compute output shape for einsum string + + Parameters + ---------- + fn : str + einsum string, e.g. 'ij,jk->ik' + shape0 : tuple[int,...] + shape of input0 + shape1 : tuple[int,...] + shape of input1 + + Returns + ------- + tuple[str, tuple[int,...]] + einsum string w/o broadcasting, and output shape + + Raises + ------ + ValueError + If the einsum string is invalid, or if it is incompatible with the input shapes + """ + inp, out = map(str.strip, fn.split('->')) + in0, in1 = map(str.strip, inp.split(',')) + alphabets = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + s_alphabets = set(alphabets) + + # Invalid characters + if not (s_alphabets >= set(in0.replace('...', '') + in1.replace('...', '') + out.replace('...', ''))): + raise ValueError(f"einsum string {fn} is invalid: subscripts should be in [a-zA-Z] and '...' only") + + in0 = in0.replace('...', '0') + in1 = in1.replace('...', '0') + out = out.replace('...', '0') + ax_in0, ax_in1, ax_out = list(in0), list(in1), list(out) + sax_in0, sax_in1, sax_out = set(ax_in0), set(ax_in1), set(ax_out) + free_indices = ''.join(sorted(s_alphabets - sax_in0 - sax_in1 - sax_out)) + + # Repeated indices + if len(sax_in0) != len(ax_in0): + for a in in0: + if in0.count(a) == 1: + continue + a = a if a != '0' else '...' + raise ValueError(f"einsum string {fn} is invalid: input0 subscripts includes '{a}' multiple times") + if len(sax_in1) != len(ax_in1): + for a in in1: + if in1.count(a) == 1: + continue + a = a if a != '0' else '...' + raise ValueError(f"einsum string {fn} is invalid: input1 subscripts includes '{a}' multiple times") + if len(sax_out) != len(ax_out): + for a in out: + if out.count(a) == 1: + continue + a = a if a != '0' else '...' 
+ raise ValueError(f"einsum string {fn} is invalid: output subscripts includes '{a}' multiple times") + + # Invalid broadcasting + if '0' in sax_in0 or '0' in sax_in1 or '0' in sax_out: + if '0' in sax_in0 and '0' in sax_in1: + raise ValueError(f"einsum string {fn} is invalid: both input0 and input1 allows broadcasting") + if '0' not in sax_out: + raise ValueError(f"einsum string {fn} is invalid: output does not allow broadcasting, but inputs do") + if '0' not in sax_in0 and '0' not in sax_in1: + raise ValueError(f"einsum string {fn} is invalid: output allows broadcasting, but inputs do not") + + # Output index out of nowhere + if remaining := sax_out - sax_in0 - sax_in1: + raise ValueError(f"einsum string {fn} is invalid: output subscripts {remaining} not found in inputs") + + _common_in = sax_in0 & sax_in1 + + # Invalid input dimensions + if '0' in sax_in0: + if len(sax_in0) - 1 > len(shape0): + raise ValueError(f"Input0 requires at least {len(sax_in0)-1} dimensions, but only {len(shape0)} given") + # Replace broadcasting indices with free indices + n_broadcast = len(shape0) - len(sax_in0) + 1 + in0 = in0.replace('0', free_indices[:n_broadcast]) + out = out.replace('0', free_indices[:n_broadcast]) + ax_in0 = list(in0) + ax_out = list(out) + else: + if len(sax_in0) != len(shape0): + raise ValueError(f"Input0 requires {len(sax_in0)} dimensions, but {len(shape0)} is given") + if '0' in sax_in1: + if len(sax_in1) - 1 > len(shape1): + raise ValueError(f"Input1 requires at least {len(sax_in1)-1} dimensions, but only {len(shape1)} given") + # Replace broadcasting indices with free indices + n_broadcast = len(shape1) - len(sax_in1) + 1 + in1 = in1.replace('0', free_indices[:n_broadcast]) + out = out.replace('0', free_indices[:n_broadcast]) + ax_in1 = list(in1) + ax_out = list(out) + else: + if len(sax_in1) != len(shape1): + raise ValueError(f"Input1 requires {len(sax_in1)} dimensions, but {len(shape1)} is given") + + # Input dimension mismatch + for a in _common_in: + ax_0 = ax_in0.index(a) + ax_1 = ax_in1.index(a) + if shape0[ax_0] != shape1[ax_1]: + raise ValueError( + f"Input dimension size mismatches for common subscript '{a}': {shape0[ax_0]} and {shape1[ax_1]}" + ) + + out_shape = tuple(shape0[ax_in0.index(a)] if a in ax_in0 else shape1[ax_in1.index(a)] for a in ax_out) + return f'{in0},{in1}->{out}', out_shape + + +def parse_einsum(fn: str, input_shape0: tuple[int, ...], input_shape1: tuple[int, ...]) -> EinsumRecipe: + """Execute einsum operation on two input arrays + + Parameters + ---------- + fn : str + einsum string, e.g. 
'ij,jk->ik' + input : np.ndarray + input0, the first input array + input1 : np.ndarray + input1, the second input array + + Returns + ------- + np.ndarray + output array + """ + + fn, _ = _validate_einsum_expr(fn, input_shape0, input_shape1) + + _in, _out = fn.split('->') + _in0, _in1 = _in.split(',') + + in0, in1, out = list(_in0), list(_in1), list(_out) + s_in0, s_in1, s_out = set(in0), set(in1), set(out) + _common = s_in0 & s_in1 + _contract = _common - s_out + _inplace = _common & s_out + contract = sorted(_contract, key=lambda x: in1.index(x)) + inplace = sorted(_inplace, key=lambda x: in1.index(x)) + invariant0 = sorted((s_out - _common) & s_in0, key=lambda x: in0.index(x)) + invariant1 = sorted((s_out - _common) & s_in1, key=lambda x: in1.index(x)) + + contract_idxs = tuple(map(in0.index, contract)), tuple(map(in1.index, contract)) + inplace_idxs = tuple(map(in0.index, inplace)), tuple(map(in1.index, inplace)) + invariant_idxs = tuple(map(in0.index, invariant0)), tuple(map(in1.index, invariant1)) + + inplace_shape = tuple(input_shape0[i] for i in inplace_idxs[0]) + inplace_size = prod(inplace_shape) + contract_size = prod(input_shape0[i] for i in contract_idxs[0]) + invariant_shape0 = tuple(input_shape0[i] for i in invariant_idxs[0]) + invariant_shape1 = tuple(input_shape1[i] for i in invariant_idxs[1]) + invariant_size0, invariant_size1 = prod(invariant_shape0), prod(invariant_shape1) + + transpose_idx0 = inplace_idxs[0] + invariant_idxs[0] + contract_idxs[0] + transpose_idx1 = inplace_idxs[1] + invariant_idxs[1] + contract_idxs[1] + + out_shape_pretranspose = inplace_shape + invariant_shape0 + invariant_shape1 + _out_transpose_idx = np.argsort(tuple(map(out.index, inplace + invariant0 + invariant1))) + out_transpose_idx = tuple(int(i) for i in _out_transpose_idx) + + return EinsumRecipe( + in_transpose_idxs=(transpose_idx0, transpose_idx1), + out_interpert_shape=out_shape_pretranspose, + out_transpose_idxs=out_transpose_idx, + L0=invariant_size0, + L1=invariant_size1, + I=inplace_size, + C=contract_size, + ) + + +def _exec_einsum(recipe: EinsumRecipe, input0: np.ndarray, input1: np.ndarray) -> np.ndarray: + """Execute einsum operation on two input arrays + + Parameters + ---------- + recipe : EinsumRecipe + einsum recipe + input0 : np.ndarray + input0, the first input array + input1 : np.ndarray + input1, the second input array + + Returns + ------- + np.ndarray + output array + """ + input0 = input0.transpose(recipe['in_transpose_idxs'][0]).ravel() + input1 = input1.transpose(recipe['in_transpose_idxs'][1]).ravel() + output = np.zeros(recipe['L0'] * recipe['L1'] * recipe['I'], dtype=input0.dtype) + + L0, L1, I, C = recipe['L0'], recipe['L1'], recipe['I'], recipe['C'] + + for l0 in range(L0): + for i in range(I): + output[(i * L0 + l0) * L1 : (i * L0 + l0 + 1) * L1] = ( + input1[i * L1 * C : (i + 1) * L1 * C].reshape((L1, C)) @ input0[(i * L0 + l0) * C : (i * L0 + l0 + 1) * C] + ) + + return output.reshape(recipe['out_interpert_shape']).transpose(recipe['out_transpose_idxs']) + + +def einsum(fn: str, input0: np.ndarray, input1: np.ndarray) -> np.ndarray: + """Execute einsum operation on two input arrays + + Parameters + ---------- + fn : str + einsum string, e.g. 
'ij,jk->ik' + input : np.ndarray + input0, the first input array + input1 : np.ndarray + input1, the second input array + + Returns + ------- + np.ndarray + output array + """ + recipe = parse_einsum(fn, input0.shape, input1.shape) + return _exec_einsum(recipe, input0, input1) From 1fb23b97cf093a96969856c5d5f5c104513f12bf Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 15 Nov 2024 07:09:02 +0000 Subject: [PATCH 25/69] add tests for einsumdense --- test/pytest/test_einsum_dense.py | 57 ++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 test/pytest/test_einsum_dense.py diff --git a/test/pytest/test_einsum_dense.py b/test/pytest/test_einsum_dense.py new file mode 100644 index 0000000000..f36a319ffb --- /dev/null +++ b/test/pytest/test_einsum_dense.py @@ -0,0 +1,57 @@ +from pathlib import Path + +import keras +import numpy as np +import pytest + +from hls4ml.converters import convert_from_keras_model + +if keras.__version__ < '3.0.0': + pytest.skip('Only keras v3 is supported for now', allow_module_level=True) + +from keras.api.layers import EinsumDense, Input + +test_root_path = Path(__file__).parent + + +@pytest.mark.parametrize('strategy', ['latency']) +@pytest.mark.parametrize('io_type', ['io_parallel']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +@pytest.mark.parametrize( + 'operation', + [ + # eq, inp, out + ('bi,j->bij', (8,), (8, 7), None), + ('bi,j->bij', (8,), (8, 7), 'i'), + ('bi,j->bij', (8,), (8, 7), 'j'), + ('bi,io->bo', (8,), 7, None), + ('...i,oi->...o', (4, 3), (5,), None), + ('...abcd,bcde->...aeb', (5, 4, 3, 2), (5, 6, 4), None), + ('...abcd,bcde->...aeb', (5, 4, 3, 2), (5, 6, 4), 'aeb'), + ('...abcd,bcde->...aeb', (5, 4, 3, 2), (5, 6, 4), 'ab'), + ('...abcd,bcde->...aeb', (5, 4, 3, 2), (5, 6, 4), 'a'), + ], +) +def test_einsum_dense(backend, io_type, strategy, operation): + eq, inp_shape, out_shape, bias_axes = operation + model = keras.Sequential( + [Input(inp_shape), EinsumDense(eq, output_shape=out_shape, bias_axes=bias_axes, name='einsum_dense')] + ) + + if bias_axes is not None: + layer = model.get_layer('einsum_dense') + layer.bias.assign(keras.ops.convert_to_tensor(np.random.rand(*layer.bias.shape))) + + data = np.random.rand(1000, *inp_shape) + eq_name = eq.replace(',', '_').replace('->', '_') + ('' if bias_axes is None else f'_{bias_axes}') + output_dir = str(test_root_path / f'hls4mlprj_einsum_dense_{eq_name}_{backend}_{io_type}_{strategy}') + hls_config = {'Model': {'Precision': 'ap_fixed<32,8>', 'ReuseFactor': 1}, 'Strategy': strategy} + model_hls = convert_from_keras_model( + model, backend=backend, output_dir=output_dir, hls_config=hls_config, io_type=io_type + ) + + model_hls.compile() + r_keras = model.predict(data, verbose=0, batch_size=1000) # type: ignore + r_hls = model_hls.predict(data).reshape(r_keras.shape) # type: ignore + + np.testing.assert_allclose(r_hls, r_keras, atol=2e-6, rtol=0) From 5489803bded6392bc64e570d2f8756541c131129 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Tue, 19 Nov 2024 03:08:27 +0000 Subject: [PATCH 26/69] keras v3 converter clean-up --- hls4ml/converters/keras_v3/_base.py | 56 ++++++++++++++++++---- hls4ml/converters/keras_v3/conv.py | 8 ++-- hls4ml/converters/keras_v3/core.py | 2 +- hls4ml/converters/keras_v3/einsum_dense.py | 6 +-- 4 files changed, 53 insertions(+), 19 deletions(-) diff --git a/hls4ml/converters/keras_v3/_base.py b/hls4ml/converters/keras_v3/_base.py index 6f69473073..28d7c7e1e4 100644 --- a/hls4ml/converters/keras_v3/_base.py +++ 
b/hls4ml/converters/keras_v3/_base.py @@ -1,6 +1,6 @@ import typing from types import FunctionType -from typing import Any, Callable, Sequence, TypedDict +from typing import Any, Callable, Sequence, TypedDict, overload class DefaultConfig(TypedDict, total=False): @@ -26,6 +26,14 @@ class DefaultConfig(TypedDict, total=False): registry: dict[str, T_kv3_handler] = {} +@overload +def register(cls: type) -> type: ... + + +@overload +def register(cls: str) -> Callable[[T_kv3_handler], T_kv3_handler]: ... + + def register(cls: str | type): """Decorator to register a handler for a specific layer class. Suggested to decorate the `KerasV3LayerHandler` class. @@ -51,11 +59,13 @@ def my_layer_handler(layer, inp_tensors, out_tensors): ``` """ - def deco(func: T_kv3_handler): + def deco(func): if isinstance(cls, str): registry[cls] = func for k in getattr(func, 'handles', ()): registry[k] = func + if isinstance(cls, type): + return cls return func if isinstance(cls, type): @@ -79,7 +89,7 @@ def __call__( layer: 'keras.Layer', in_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'], - ): + ) -> tuple[dict[str, Any], ...]: """Handle a keras layer. Return a tuple of dictionaries, each dictionary representing a layer (module) in the HLS model. One layer may correspond one or more dictionaries (e.g., layers with @@ -114,8 +124,7 @@ def __call__( dict[str, Any] | tuple[dict[str, Any], ...] layer configuration(s) for the HLS model to be consumed by the ModelGraph constructor - """ # noqa: E501 - import keras + """ name = layer.name class_name = layer.__class__.__name__ @@ -150,12 +159,23 @@ def __call__( ret = (config,) # If activation exists, append it + + act_config, intermediate_tensor_name = self.maybe_get_activation_config(layer, out_tensors) + if act_config is not None: + ret[0]['output_keras_tensor_names'] = [intermediate_tensor_name] + ret = *ret, act_config + + return ret + + def maybe_get_activation_config(self, layer, out_tensors): + import keras + activation = getattr(layer, 'activation', None) + name = layer.name if activation not in (keras.activations.linear, None): assert len(out_tensors) == 1, f"Layer {name} has more than one output, but has an activation function" assert isinstance(activation, FunctionType), f"Activation function for layer {name} is not a function" intermediate_tensor_name = f'{out_tensors[0].name}_activation' - ret[0]['output_keras_tensor_names'] = [intermediate_tensor_name] act_cls_name = activation.__name__ act_config = { 'class_name': 'Activation', @@ -164,9 +184,8 @@ def __call__( 'input_keras_tensor_names': [intermediate_tensor_name], 'output_keras_tensor_names': [out_tensors[0].name], } - ret = *ret, act_config - - return ret + return act_config, intermediate_tensor_name + return None, None def handle( self, @@ -175,3 +194,22 @@ def handle( out_tensors: Sequence['KerasTensor'], ) -> dict[str, Any] | tuple[dict[str, Any], ...]: return {} + + def load_weight(self, layer: 'keras.Layer', key: str): + """Load a weight from a layer. + + Parameters + ---------- + layer : keras.Layer + The layer to load the weight from. + key : str + The key of the weight to load. + + Returns + ------- + np.ndarray + The weight. 
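+
+        Examples
+        --------
+        A typical call (illustrative) is ``self.load_weight(layer, 'kernel')``, which
+        returns the layer's kernel as a numpy array regardless of the active keras backend.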
+ """ + import keras + + return keras.ops.convert_to_numpy(getattr(layer, key)) diff --git a/hls4ml/converters/keras_v3/conv.py b/hls4ml/converters/keras_v3/conv.py index df226fc6b5..adf6221822 100644 --- a/hls4ml/converters/keras_v3/conv.py +++ b/hls4ml/converters/keras_v3/conv.py @@ -2,8 +2,6 @@ from math import ceil from typing import Sequence -import numpy as np - from ._base import KerasV3LayerHandler, register if typing.TYPE_CHECKING: @@ -40,9 +38,9 @@ def handle( assert all(isinstance(x, int) for x in in_shape), f"Layer {layer.name} has non-fixed size input: {in_shape}" assert all(isinstance(x, int) for x in out_shape), f"Layer {layer.name} has non-fixed size output: {out_shape}" - kernel = np.array(layer.kernel) + kernel = self.load_weight(layer, 'kernel') if layer.use_bias: - bias = np.array(layer.bias) + bias = self.load_weight(layer, 'bias') else: bias = None @@ -113,7 +111,7 @@ def handle( config['depth_multiplier'] = layer.depth_multiplier elif isinstance(layer, BaseSeparableConv): config['depthwise_data'] = kernel - config['pointwise_data'] = np.array(layer.pointwise_kernel) + config['pointwise_data'] = self.load_weight(layer, 'pointwise_kernel') config['depth_multiplier'] = layer.depth_multiplier elif isinstance(layer, BaseConv): config['weight_data'] = kernel diff --git a/hls4ml/converters/keras_v3/core.py b/hls4ml/converters/keras_v3/core.py index ea63f97095..55a19945a9 100644 --- a/hls4ml/converters/keras_v3/core.py +++ b/hls4ml/converters/keras_v3/core.py @@ -28,7 +28,7 @@ def handle( config = { 'data_format': 'channels_last', 'weight_data': kernel, - 'bias_data': np.array(layer.bias) if layer.use_bias else None, + 'bias_data': self.load_weight(layer, 'bias') if layer.use_bias else None, 'n_out': kernel.shape[1], 'n_in': kernel.shape[0], } diff --git a/hls4ml/converters/keras_v3/einsum_dense.py b/hls4ml/converters/keras_v3/einsum_dense.py index f0f4c7223a..cb19272915 100644 --- a/hls4ml/converters/keras_v3/einsum_dense.py +++ b/hls4ml/converters/keras_v3/einsum_dense.py @@ -39,8 +39,6 @@ def handle( in_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'], ): - import keras - assert len(in_tensors) == 1, 'EinsumDense layer must have exactly one input tensor' assert len(out_tensors) == 1, 'EinsumDense layer must have exactly one output tensor' @@ -56,11 +54,11 @@ def handle( equation = strip_batch_dim(layer.equation) - kernel = keras.ops.convert_to_numpy(layer.kernel) + kernel = self.load_weight(layer, 'kernel') bias = None if layer.bias_axes: - bias = keras.ops.convert_to_numpy(layer.bias) + bias = self.load_weight(layer, 'bias') return { 'class_name': 'EinsumDense', From 5e187812c4cbfb1679b341ada14f88af33f34769 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Mon, 2 Dec 2024 23:07:29 +0000 Subject: [PATCH 27/69] add symbolic quantized interval --- hls4ml/utils/einsum_utils.py | 10 +- hls4ml/utils/qinterval.py | 326 ++++++++++++++++++++++++++++++++++ test/pytest/test_qinterval.py | 103 +++++++++++ 3 files changed, 435 insertions(+), 4 deletions(-) create mode 100644 hls4ml/utils/qinterval.py create mode 100644 test/pytest/test_qinterval.py diff --git a/hls4ml/utils/einsum_utils.py b/hls4ml/utils/einsum_utils.py index 7d4253f763..c175f9994a 100644 --- a/hls4ml/utils/einsum_utils.py +++ b/hls4ml/utils/einsum_utils.py @@ -213,15 +213,17 @@ def _exec_einsum(recipe: EinsumRecipe, input0: np.ndarray, input1: np.ndarray) - for l0 in range(L0): for i in range(I): - output[(i * L0 + l0) * L1 : (i * L0 + l0 + 1) * L1] = ( - input1[i * L1 * C : (i + 1) * L1 * 
C].reshape((L1, C)) @ input0[(i * L0 + l0) * C : (i * L0 + l0 + 1) * C] - ) + A = input1[i * L1 * C : (i + 1) * L1 * C].reshape((L1, C)) + B = input0[(i * L0 + l0) * C : (i * L0 + l0 + 1) * C] + output[(i * L0 + l0) * L1 : (i * L0 + l0 + 1) * L1] = A @ B return output.reshape(recipe['out_interpert_shape']).transpose(recipe['out_transpose_idxs']) def einsum(fn: str, input0: np.ndarray, input1: np.ndarray) -> np.ndarray: - """Execute einsum operation on two input arrays + """Execute einsum operation on two input arrays. + + WARNING: Order of multiplication is reversed -- watchout if you are using non-commutative operators Parameters ---------- diff --git a/hls4ml/utils/qinterval.py b/hls4ml/utils/qinterval.py new file mode 100644 index 0000000000..4fe3bca8e4 --- /dev/null +++ b/hls4ml/utils/qinterval.py @@ -0,0 +1,326 @@ +from functools import singledispatchmethod +from typing import Any, Sequence, overload + +import numpy as np + +from hls4ml.utils.einsum_utils import EinsumRecipe, parse_einsum + + +def _minimal_f(array: np.ndarray): + _low, _high = np.full(array.shape, -32, dtype=np.int8), np.full(array.shape, 32, dtype=np.int8) + while np.any(_low < _high - 1): + _mid = (_low + _high) // 2 + scaled = array * 2.0**_mid + mask = scaled != scaled.astype(np.int64) + _low = np.where(mask, _mid, _low) + _high = np.where(mask, _high, _mid) + return _high + + +def minimal_kif(array: np.ndarray): + """Given a constant array, determine the minimal k, i, f values that can contain it with no loss of precision. + + Parameters + ---------- + array : np.ndarray + The constant array to be represented. + + Returns + ------- + tuple[np.ndarray, np.ndarray, np.ndarray] + The minimal k, i, f values that can contain the array with no loss of precision. + """ + f = _minimal_f(array) + with np.errstate(divide='ignore', invalid='ignore'): + i = np.ceil(np.log2(np.maximum(array + 2.0**-f, -array))).astype(np.int8) + k = array < 0 + null_mask = array == 0 + i, f = np.where(null_mask, 0, i), np.where(null_mask, 0, f) + return k, i, f + + +class _QIntervalArray: + def __init__(self, min: np.ndarray, max: np.ndarray, delta: np.ndarray): + self.min = min.astype(np.float64) + self.max = max.astype(np.float64) + self.delta = delta.astype(np.float64) + self._validate() + + def _validate(self): + with np.errstate(divide='ignore', invalid='ignore'): + assert np.all(self.min <= self.max), "min must be less than or equal to max" + assert np.all( + (self.max % self.delta == 0) | ((self.max == 0) & (self.delta == 0)) + ), "max must be a multiple of delta" + assert np.all( + (self.min % self.delta == 0) | ((self.min == 0) & (self.delta == 0)) + ), "min must be a multiple of delta" + + +class QIntervalArray(_QIntervalArray): + """Symbolic array for quantized interval arithmetic. + + Available operations are: + - Addition + - Subtraction + - Multiplication + - Division (not recommended) + - Matrix multiplication + + Parameters + ---------- + min : np.ndarray + The minimum value of the interval. + max : np.ndarray + The maximum value of the interval. + delta : np.ndarray + The quantization step of the interval. 
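+
+    Examples
+    --------
+    A minimal sketch (values are illustrative):
+
+    >>> x = QIntervalArray.from_kif(k=1, i=2, f=3)  # covers [-4, 3.875] in steps of 0.125
+    >>> y = x * 0.5 + 1.0                           # bounds and step size propagate symbolically
+    >>> k, i, f = y.to_kif()                        # minimal fixed-point format containing y
+    >>> samples = y.sample(100)                     # random values guaranteed to be representable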
+ """ + + @singledispatchmethod + def __add__(self, other): + _min = self.min + other + _max = self.max + other + _delta = np.minimum(self.delta, 2.0 ** -_minimal_f(other)) + return QIntervalArray(_min, _max, _delta) + + @__add__.register + def _(self, other: _QIntervalArray): + _min = self.min + other.min + _max = self.max + other.max + _delta = np.minimum(self.delta, other.delta) + return QIntervalArray(_min, _max, _delta) + + def __sub__(self, other): + return self + (-other) + + @singledispatchmethod + def __mul__(self, other): + other = np.float64(other) + v1 = self.min * other + v2 = self.max * other + _min = np.minimum(v1, v2) + _max = np.maximum(v1, v2) + _delta = self.delta * other + return QIntervalArray(_min, _max, _delta) + + @__mul__.register + def _(self, other: _QIntervalArray): + v1 = self.min * other.min + v2 = self.min * other.max + v3 = self.max * other.min + v4 = self.max * other.max + _min = np.minimum(np.minimum(v1, v2), np.minimum(v3, v4)) + _max = np.maximum(np.maximum(v1, v2), np.maximum(v3, v4)) + _delta = self.delta * other.delta + return QIntervalArray(_min, _max, _delta) + + def __truediv__(self, other): + return self * (1 / other) + + def __neg__(self): + return QIntervalArray(-self.max, -self.min, self.delta) + + @singledispatchmethod + def __matmul__(self, other: np.ndarray): + v1 = np.einsum('ij,j...->ij...', self.min, other, optimize=True) + v2 = np.einsum('ij,j...->ij...', self.max, other, optimize=True) + other_delta = 2.0 ** -_minimal_f(other) + _delta = np.einsum('ij,j...->ij...', self.delta, other_delta, optimize=True) + delta = np.min(np.where(_delta == 0, np.inf, _delta), axis=1) + _min = np.sum(np.minimum(v1, v2), axis=1) + _max = np.sum(np.maximum(v1, v2), axis=1) + return QIntervalArray(_min, _max, delta) + + @__matmul__.register + def _(self, other: _QIntervalArray): + v1 = np.einsum('ij,j...->ij...', self.min, other.min, optimize=True) + v2 = np.einsum('ij,j...->ij...', self.max, other.max, optimize=True) + v3 = np.einsum('ij,j...->ij...', self.min, other.max, optimize=True) + v4 = np.einsum('ij,j...->ij...', self.max, other.min, optimize=True) + + _max = np.sum(np.maximum(np.maximum(v1, v2), np.maximum(v3, v4)), axis=1) + _min = np.sum(np.minimum(np.minimum(v1, v2), np.minimum(v3, v4)), axis=1) + + _delta = np.einsum('ij,j...->ij...', self.delta, other.delta, optimize=True) + delta = np.min(_delta, axis=1) + + return QIntervalArray(_min, _max, delta) + + def __rmatmul__(self, other: np.ndarray): + v1 = np.einsum('ij,j...->ij...', other, self.min, optimize=True) + v2 = np.einsum('ij,j...->ij...', other, self.max, optimize=True) + other_delta = 2.0 ** -_minimal_f(other) + _delta = np.einsum('ij,j...->ij...', other_delta, self.delta, optimize=True) + delta = np.min(np.where(_delta == 0, np.inf, _delta), axis=1) + _min = np.sum(np.minimum(v1, v2), axis=1) + _max = np.sum(np.maximum(v1, v2), axis=1) + return QIntervalArray(_min, _max, delta) + + def transpose(self, axes: Sequence[int]): + return QIntervalArray(self.min.transpose(axes), self.max.transpose(axes), self.delta.transpose(axes)) + + @property + def shape(self): + return self.min.shape + + def reshape(self, shape: Sequence[int]): + return QIntervalArray(self.min.reshape(shape), self.max.reshape(shape), self.delta.reshape(shape)) + + def ravel(self): + return QIntervalArray(self.min.ravel(), self.max.ravel(), self.delta.ravel()) + + @property + def dtype(self): + return self.min.dtype + + def __getitem__(self, key): + return QIntervalArray(self.min[key], self.max[key], self.delta[key]) + + 
def __array_function__(self, func, types, args, kwargs): + if func == np.concatenate: + return QIntervalArray( + np.concatenate([a.min for a in args[0]]), + np.concatenate([a.max for a in args[0]]), + np.concatenate([a.delta for a in args[0]]), + ) + return NotImplemented + + def rmatmul(self, other: np.ndarray): + """Right matrix multiplication (other @ self), with __rmatmul__ implemented in QIntervalArray. + This is to avoid using the @ operator defined in np.ndarray. + + Parameters + ---------- + other : np.ndarray + The operand matrix multiplied from the left. + + Returns + ------- + QIntervalArray + The result + """ + return self.__rmatmul__(other) + + @classmethod + def from_kif(cls, k: np.ndarray | int | bool, i: np.ndarray | int, f: np.ndarray | int): + """Create a QIntervalArray from k, i, f values. + + Parameters + ---------- + k : np.ndarray | int | bool + keep_negative + i : np.ndarray | int + integer_bits, excluding sign bit + f : np.ndarray | int + fractional_bits + + Returns + ------- + QIntervalArray + The created QIntervalArray. + """ + + _min = np.asarray(-(2.0**i) * k) + _max = np.asarray(2.0**i * k - 2.0**-f) + _delta = np.asarray(2.0**-f) + return cls(_min, _max, _delta) + + def sample(self, n: int | None = None): + if n is not None: + rand = np.random.rand(n, *self.min.shape) + else: + rand = np.random.rand(*self.min.shape) + v = rand * (self.max - self.min) + self.min + v = np.round(v / self.delta) * self.delta + return v + + def to_kif(self): + f = -np.log2(self.delta).astype(np.int8) + + with np.errstate(divide='ignore', invalid='ignore'): + i = np.ceil(np.log2(np.maximum(self.max + 2.0**-f, -self.min))).astype(np.int8) + k = self.min < 0 + null_mask = (self.max == 0) & (self.min == 0) + i, f = np.where(null_mask, 0, i), np.where(null_mask, 0, f) + return k, i, f + + +def _exec_einsum(recipe: EinsumRecipe, input0: np.ndarray | QIntervalArray, input1: np.ndarray | QIntervalArray, operator): + """Execute einsum operation on two input arrays + + Parameters + ---------- + recipe : EinsumRecipe + einsum recipe + input0 : np.ndarray + input0, the first input array + input1 : np.ndarray + input1, the second input array + + Returns + ------- + np.ndarray + output array + """ + input0 = input0.transpose(recipe['in_transpose_idxs'][0]).ravel() + input1 = input1.transpose(recipe['in_transpose_idxs'][1]).ravel() + # output = np.zeros(recipe['L0'] * recipe['L1'] * recipe['I'], dtype=input0.dtype) + output = [] + + L0, L1, I, C = recipe['L0'], recipe['L1'], recipe['I'], recipe['C'] + + for i in range(I): + for l0 in range(L0): + A = input1[i * L1 * C : (i + 1) * L1 * C].reshape((L1, C)) + B = input0[(i * L0 + l0) * C : (i * L0 + l0 + 1) * C] + output.append(operator(A, B)) + output = np.concatenate(output, axis=0) + + return output.reshape(recipe['out_interpert_shape']).transpose(recipe['out_transpose_idxs']) + + +@overload +def einsum(fn: str, input0: QIntervalArray, input1: QIntervalArray, operator=None) -> QIntervalArray: ... + + +@overload +def einsum(fn: str, input0: np.ndarray, input1: QIntervalArray, operator=None) -> QIntervalArray: ... + + +@overload +def einsum(fn: str, input0: QIntervalArray, input1: np.ndarray, operator=None) -> QIntervalArray: ... + + +@overload +def einsum(fn: str, input0: np.ndarray, input1: np.ndarray, operator=None) -> np.ndarray: ... 
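+# Note: the overloads above only refine the static return type; the runtime branching
+# between ndarray and QIntervalArray operands happens in the `operator` closure
+# defined inside `einsum` below.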
+ + +def einsum(fn: str, input0: np.ndarray | QIntervalArray, input1: np.ndarray | QIntervalArray) -> Any: # type: ignore + """Execute einsum operation on two input arrays + + WARNING: Order of multiplication is reversed -- watchout if you are using non-commutative operators + + Parameters + ---------- + fn : str + einsum string, e.g. 'ij,jk->ik' + input : np.ndarray + input0, the first input array + input1 : np.ndarray + input1, the second input array + + Returns + ------- + np.ndarray + output array + """ + + def operator(A, B): + if isinstance(A, np.ndarray): + return B.__rmatmul__(A) + else: + return A @ B + + recipe = parse_einsum(fn, input0.shape, input1.shape) + return _exec_einsum(recipe, input0, input1, operator) diff --git a/test/pytest/test_qinterval.py b/test/pytest/test_qinterval.py new file mode 100644 index 0000000000..78f565e129 --- /dev/null +++ b/test/pytest/test_qinterval.py @@ -0,0 +1,103 @@ +import numpy as np +import pytest +from quantizers.fixed_point import get_fixed_quantizer_np + +from hls4ml.utils.qinterval import QIntervalArray, einsum, minimal_kif + + +def assert_is_represented(qinterval: QIntervalArray, data: np.ndarray): + assert np.all(data <= qinterval.max), f'{np.max(data - qinterval.max)} > 0' + assert np.all(data >= qinterval.min), f'{np.min(data - qinterval.min)} < 0' + with np.errstate(divide='ignore', invalid='ignore'): + is_zero = (qinterval.max == 0) & (qinterval.min == 0) + assert np.all((data % qinterval.delta == 0) | is_zero) + + +@pytest.fixture(scope='module') +def data(): + arr = np.random.randint(-1024, 1024, size=1000000) + arr = arr * 2.0 ** np.random.randint(-20, 20, size=1000000) + return arr + + +def test_minimal_kif(data): + k, i, f = minimal_kif(data) + q = get_fixed_quantizer_np() + assert np.all(data == q(data, k, i, f)) + assert np.all((data != q(data, k, i, f - 1)) | (data == 0)) + assert np.all((data != q(data, k, i - 1, f)) | (data == 0) | (i + f == 0)) + + +def random_arr(seed=None): + rng = np.random.default_rng(seed) + shape = (64, 64) + + _delta = 2.0 ** rng.integers(-8, 8, shape) + _min = rng.integers(-1024, 1024, shape) * _delta + _max = rng.integers(0, 4096, shape) * _delta + _min + interval_arr = QIntervalArray(_min, _max, _delta) + return interval_arr + + +@pytest.fixture(scope='module') +def qint_arr1(): + return random_arr() + + +@pytest.fixture(scope='module') +def qint_arr2(): + return random_arr() + + +@pytest.mark.parametrize('oprstr', ['__add__', '__sub__', '__mul__', '__matmul__', '__rmatmul__']) +def test_qinterval_oprs(qint_arr1, qint_arr2, oprstr): + + sampled_arr1 = qint_arr1.sample(10000) + const_arr = qint_arr2.sample() + applied_symbolic = getattr(qint_arr1, oprstr)(const_arr) + applied_sampled = getattr(sampled_arr1, oprstr)(const_arr) + + assert_is_represented(applied_symbolic, applied_sampled) + + if oprstr != '__rmatmul__': + # rmatmul is only between const and intervals. 
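+        # so the symbolic-vs-symbolic check below is skipped for that operator.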
+ + sampled_arr2 = qint_arr2.sample(10000) + rapplied_symbolic = getattr(qint_arr1, oprstr)(qint_arr2) + rapplied_sampled = getattr(sampled_arr1, oprstr)(sampled_arr2) + + assert_is_represented(rapplied_symbolic, rapplied_sampled) + + +@pytest.mark.parametrize('eq', ['ij,jk->ik', 'ij,kj->ikj']) +def test_qinterval_einsum(qint_arr1, qint_arr2, eq): + + _in, out = eq.split('->', 1) + in0, in1 = _in.split(',', 1) + qint_arr1 = qint_arr1[:16, :16] + qint_arr2 = qint_arr2[:16, :16] + + sampled_arr1 = qint_arr1.sample(10000) + sampled_arr2 = qint_arr2.sample(10000) + + # symbolic - symbolic + einsum_symbolic = einsum(eq, qint_arr1, qint_arr2) + einsum_sampled = np.einsum(f'A{in0},A{in1}->A{out}', sampled_arr1, sampled_arr2) + assert_is_represented(einsum_symbolic, einsum_sampled) + + # symbolic - sampled + einsum_symbolic = einsum(eq, qint_arr1, sampled_arr2[0]) + einsum_sampled = np.einsum(f'A{in0},{in1}->A{out}', sampled_arr1, sampled_arr2[0]) + assert_is_represented(einsum_symbolic, einsum_sampled) + + # sampled - symbolic + einsum_symbolic = einsum(eq, sampled_arr1[0], qint_arr2) + einsum_sampled = np.einsum(f'{in0},A{in1}->A{out}', sampled_arr1[0], sampled_arr2) + assert_is_represented(einsum_symbolic, einsum_sampled) + + +def test_qinterval_to_kif(qint_arr1): + k, i, f = qint_arr1.to_kif() + samples = qint_arr1.sample(10000) + q = get_fixed_quantizer_np() + assert np.all(samples == q(samples, k, i, f)) From 02ff0c3afd447b04662c3b10a447c7e00827c39f Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 4 Dec 2024 05:10:04 +0000 Subject: [PATCH 28/69] preliminary bit-exact precision derivation opt pass --- .pre-commit-config.yaml | 6 +- hls4ml/model/optimizer/__init__.py | 1 + hls4ml/model/optimizer/passes/bit_exact.py | 224 +++++++++++++++++++++ 3 files changed, 230 insertions(+), 1 deletion(-) create mode 100644 hls4ml/model/optimizer/passes/bit_exact.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9380ac1689..d607959dab 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,7 +47,11 @@ repos: exclude: docs/conf.py additional_dependencies: [flake8-bugbear, flake8-print] args: ['--max-line-length=125', # github viewer width - '--extend-ignore=E203,T201'] # E203 is not PEP8 compliant + '--extend-ignore=E203,T201', # E203 is not PEP8 compliant + '--per-file-ignores=hls4ml/model/optimizer/passes/bit_exact.py:E741,hls4ml/converters/keras_v3/squark/_base.py:E741,__init__.py:F401', + # i for #int w/o sign, I for #int w/ sign when massively processing bw conversions ...... + # ignore unused imports in __init__.py ..... 
+ ] - repo: https://github.com/mgedmin/check-manifest rev: "0.50" diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index a745eceba1..87dff17678 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -72,6 +72,7 @@ 'fuse_batch_normalization', 'replace_multidimensional_dense_with_conv', 'enforce_proxy_model_embedded_config', + 'bit_exact', 'eliminate_linear_activation', 'merge_linear_activation', # many of the above optimzers need to be done before this diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py new file mode 100644 index 0000000000..4861a001de --- /dev/null +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -0,0 +1,224 @@ +import typing +from functools import singledispatch + +import numpy as np + +from hls4ml.model.layers import Dense, EinsumDense, GlobalPooling1D, Layer, Pooling1D, Reshape +from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer + +if typing.TYPE_CHECKING: + from hls4ml.model import ModelGraph + +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import FixedPrecisionType, NamedType +from hls4ml.utils.qinterval import QIntervalArray, einsum, minimal_kif + + +def to_hls4ml_fixed(k, i, f, name, *args): + signed, b, i = k != 0, int(k + i + f), int(k + i) + args = [arg.upper() for arg in args] + ptype = FixedPrecisionType(b, i, signed, *args) + return NamedType(name, ptype) + + +def get_input_layers(layer: Layer): + model: 'ModelGraph' = layer.model + inp_names = layer.attributes.attributes['inputs'] + return [model.graph[name] for name in inp_names] + + +def get_output_layers(layer: Layer): + model: 'ModelGraph' = layer.model + return [l for l in model.graph.values() if layer.name in l.attributes.attributes['inputs']] + + +def get_output_shape(layer: Layer): + return layer.attributes.attributes[layer.name].shape + + +def get_input_shapes(layer: Layer): + return [get_output_shape(inp) for inp in get_input_layers(layer)] + + +@singledispatch +def request_kif(layer: Layer): + output_shape = get_output_shape(layer) + k = np.ones(output_shape, dtype=np.int8) + i = f = np.full(output_shape, 127, dtype=np.int8) + return k, i, f + + +@request_kif.register +def _(layer: FixedPointQuantizer): + assert layer.mask_kbi is not None + k, b, I = layer.mask_kbi + k, i, f = k, I - k, b - I + if layer.SAT != 'WRAP': + k[:] = 1 + i[:] = 127 + if layer.RND == 'TRN': + pass + elif layer.RND == 'RND': + f += 1 + else: + f += 2 + return k, i, f + + +@request_kif.register(Pooling1D) +# @request_kif.register(Pooling2D) +@request_kif.register(GlobalPooling1D) +# @request_kif.register(GlobalPooling2D) +def _(layer: Pooling1D | GlobalPooling1D): + # inp_shape = get_input_shapes(layer)[0] + out_shape = get_output_shape(layer) + pool_width = layer.attributes.attributes['pool_width'] + stride_width = layer.attributes.attributes['stride_width'] + pool_op = layer.attributes.attributes['pool_op'] + if isinstance(layer, Pooling1D): + pad_0_0: int = layer.attributes.attributes['pad_left'] + else: + pad_0_0 = 0 + is_ch_last = layer.attributes.attributes['data_format'] == 'channels_last' + + k = np.ones(out_shape, dtype=np.int8) + i = np.full(out_shape, -128, dtype=np.int8) + f = np.full(out_shape, 127, dtype=np.int8) + + _, i_out, f_out = np.max([request_kif(next_layer) for next_layer in get_output_layers(layer)], axis=0) + + if not is_ch_last: + i = np.moveaxis(i, 0, -1) + f = np.moveaxis(f, 0, -1) + + for idx_out in range(k.shape[-1]): 
+ i_in_0 = i_out * stride_width - pad_0_0 + i_in_1 = i_in_0 + pool_width + if i_in_0 < 0: + i_in_0 = 0 + i[..., i_in_0:i_in_1] = i_out[..., idx_out] + f[..., i_in_0:i_in_1] = f_out[..., idx_out] + + if not is_ch_last: + i = np.moveaxis(i, -1, 0) + f = np.moveaxis(f, -1, 0) + + if pool_op == 'Average': + ln2_size = np.log2(pool_width) + i += np.ceil(ln2_size).astype(np.int8) + if not ln2_size.is_integer(): + f[:] = 127 + return k, i, f + + +@request_kif.register +def _(layer: Reshape): + inp_shape = get_input_shapes(layer)[0] + k, i, f = np.max([request_kif(next_layer) for next_layer in get_output_layers(layer)], axis=0) + return k.reshape(inp_shape), i.reshape(inp_shape), f.reshape(inp_shape) + + +def requested_kif(layer: Layer): + out_layers = get_output_layers(layer) + if not out_layers: + out_shape = get_output_shape(layer) + k = np.ones(out_shape, dtype=np.int8) + i = f = np.full(out_shape, 127, dtype=np.int8) + return k, i, f + return tuple(np.max([request_kif(l) for l in out_layers], axis=0)) + + +@singledispatch +def produce_kif(layer: Layer): + k = np.ones(get_output_shape(layer), dtype=np.int8) + i = f = np.full(get_output_shape(layer), 127, dtype=np.int8) + return k, i, f + + +def get_input_kifs(layer: Layer): + return [produce_kif(l) for l in get_input_layers(layer)] + + +@produce_kif.register +def _(layer: FixedPointQuantizer): + assert layer.mask_kbi is not None + k, b, I = layer.mask_kbi + k, i, f = k, I - k, b - I + return k[0], i[0], f[0] + + +@produce_kif.register +def _(layer: Reshape): + out_shape = get_output_shape(layer) + k, i, f = produce_kif(get_input_layers(layer)[0]) + return k.reshape(out_shape), i.reshape(out_shape), f.reshape(out_shape) + + +@produce_kif.register +def _(layer: EinsumDense): + kernel = layer.attributes.attributes['weight'].data[0] # unsqueezed on axis 0 for unknown reason + _bias = layer.attributes.attributes['bias'] + eq = layer.attributes.attributes['equation'] + k_in, i_in, f_in = get_input_kifs(layer)[0] + qint_in = QIntervalArray.from_kif(k_in, i_in, f_in) + qint_out = einsum(eq, qint_in, kernel) + if _bias is not None: + qint_out = qint_out + _bias.data + k, i, f = qint_out.to_kif() + return k.astype(np.int8), i, f + + +@produce_kif.register +def _(layer: Dense): + kernel = layer.attributes.attributes['weight'].data[0] # unsqueezed on axis 0 for unknown reason + _bias = layer.attributes.attributes['bias'] + k_in, i_in, f_in = get_input_kifs(layer)[0] + qint_in = QIntervalArray.from_kif(k_in, i_in, f_in) + qint_out = qint_in @ kernel + if _bias is not None: + qint_out = qint_out + _bias.data + k, i, f = qint_out.to_kif() + return k.astype(np.int8), i, f + + +def kif_arrs_to_ints(arr: tuple[np.ndarray, np.ndarray, np.ndarray]): + return tuple(int(np.max(a)) for a in arr) + + +def register_precision(layer: Layer): + _pk, _pi, _pf = produce_kif(layer) + _rk, _ri, _rf = requested_kif(layer) + _out_kif = np.minimum(_pk, _rk), np.minimum(_pi, _ri), np.minimum(_pf, _rf) + _out_kif[1][(_pf > _rf) & (_pi <= _ri)] += 1 + result_kif = kif_arrs_to_ints(_out_kif) + result_t = to_hls4ml_fixed(*result_kif, f'{layer.name}_result_t') + layer.attributes.attributes['result_t'] = result_t + layer.attributes.attributes[layer.name].type = result_t # Why?????? 
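+    # The remaining backend-declared types are narrowed below: accum_t is sized to the
+    # full produced (k, i, f) range, while weight_t and bias_t shrink to the minimal
+    # fixed-point format that represents the stored constants exactly (minimal_kif).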
+ + if 'accum_t' in layer.attributes.attributes: + accum_kif = kif_arrs_to_ints((_pk, _pi, _pf)) + accum_t = to_hls4ml_fixed(*accum_kif, f'{layer.name}_accum_t') + layer.attributes.attributes['accum_t'] = accum_t + + if 'weight_t' in layer.attributes.attributes: + kernel_kif = kif_arrs_to_ints(minimal_kif(layer.attributes.attributes['weight'].data)) + kernel_t = to_hls4ml_fixed(*kernel_kif, f'{layer.name}_weight_t') + layer.attributes.attributes['weight_t'] = kernel_t + + if 'bias_t' in layer.attributes.attributes: + _bias = layer.attributes.attributes.get('bias') + if _bias is None: + bias_t = to_hls4ml_fixed(0, 0, 1, f'{layer.name}_bias_t') + else: + bias_kif = kif_arrs_to_ints(minimal_kif(_bias.data)) + bias_t = to_hls4ml_fixed(*bias_kif, f'{layer.name}_bias_t') + layer.attributes.attributes['bias_t'] = bias_t + + +class BitExact(OptimizerPass): + def match(self, node): + return True + + def transform(self, model, node): + register_precision(node) + return False From 7c47be959f5b9cb176b78f31089126bc223a68e3 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 4 Dec 2024 05:38:05 +0000 Subject: [PATCH 29/69] squark layer support start --- hls4ml/converters/keras_v3/__init__.py | 1 + hls4ml/converters/keras_v3/squark/__init__.py | 1 + hls4ml/converters/keras_v3/squark/_base.py | 147 ++++++++++++++++++ 3 files changed, 149 insertions(+) create mode 100644 hls4ml/converters/keras_v3/squark/__init__.py create mode 100644 hls4ml/converters/keras_v3/squark/_base.py diff --git a/hls4ml/converters/keras_v3/__init__.py b/hls4ml/converters/keras_v3/__init__.py index 6dffcb71d5..eb9442ba91 100644 --- a/hls4ml/converters/keras_v3/__init__.py +++ b/hls4ml/converters/keras_v3/__init__.py @@ -1,6 +1,7 @@ from . import conv # noqa: F401 from . import core # noqa: F401 from . import einsum_dense # noqa: F401 +from . import squark # noqa: F401 from ._base import registry as layer_handlers __all__ = ['layer_handlers'] diff --git a/hls4ml/converters/keras_v3/squark/__init__.py b/hls4ml/converters/keras_v3/squark/__init__.py new file mode 100644 index 0000000000..0944ec2e74 --- /dev/null +++ b/hls4ml/converters/keras_v3/squark/__init__.py @@ -0,0 +1 @@ +from . import _base diff --git a/hls4ml/converters/keras_v3/squark/_base.py b/hls4ml/converters/keras_v3/squark/_base.py new file mode 100644 index 0000000000..f72563c383 --- /dev/null +++ b/hls4ml/converters/keras_v3/squark/_base.py @@ -0,0 +1,147 @@ +from typing import TYPE_CHECKING, Any, Sequence + +import numpy as np +from keras.api import Layer + +from hls4ml.converters.keras_v3._base import KerasV3LayerHandler, register +from hls4ml.converters.keras_v3.conv import KV3ConvHandler +from hls4ml.converters.keras_v3.core import KV3ActivationHandler, KV3DenseHandler +from hls4ml.converters.keras_v3.einsum_dense import KV3EinsumDenseHandler + +if TYPE_CHECKING: + import squark + from keras import KerasTensor + + +def extract_fixed_quantizer_config(q, tensor: 'KerasTensor', is_input: bool) -> dict[str, Any]: + from keras.api.ops import convert_to_numpy + from squark.quantizer.internal.fixed_point_quantizer import FixedPointQuantizerKBI, FixedPointQuantizerKIF + + internal_q: FixedPointQuantizerKIF | FixedPointQuantizerKBI = q.quantizer + + shape: tuple[int, ...] 
= tensor.shape[1:] # type: ignore + if any([s is None for s in shape]): + raise ValueError(f"Tensor {tensor.name} has at least one dimension with no fixed size") + k, i, f = internal_q.kif + k, B, I = k, k + i + f, k + i # type: ignore + k, B, I = convert_to_numpy(k), convert_to_numpy(B), convert_to_numpy(I) + + k = np.broadcast_to(k.astype(np.int8), (1,) + shape) + B = np.broadcast_to(B.astype(np.int8), (1,) + shape) + I = np.broadcast_to(I.astype(np.int8), (1,) + shape) + + overflow_mode = internal_q.overflow_mode + round_mode = internal_q.round_mode + fusible = np.unique(k).size == 1 and np.unique(B).size == 1 and np.unique(I).size == 1 + + input_keras_tensor_names = tensor.name if is_input else f'{tensor.name}_q' + output_keras_tensor_names = f'{tensor.name}_q' if is_input else tensor.name + return { + 'name': q.name, + 'class_name': 'FixedPointQuantizer', + 'mask_kbi': (k, B, I), + 'SAT': overflow_mode, + 'RND': round_mode, + 'fusible': fusible, + 'input_keras_tensor_names': [input_keras_tensor_names], + 'output_keras_tensor_names': [output_keras_tensor_names], + 'overrides': {}, + } + + +def override_io_tensor_confs(confs: tuple[dict[str, Any], ...], overrides: dict[str, str]): + for conf in confs: + inp_tensor_names = conf['input_keras_tensor_names'] + out_tensor_names = conf['output_keras_tensor_names'] + conf['input_keras_tensor_names'] = [overrides.get(name, name) for name in inp_tensor_names] + conf['output_keras_tensor_names'] = [overrides.get(name, name) for name in out_tensor_names] + + +@register +class SQLayerHandler(KerasV3LayerHandler): + def __call__( + self, + layer: 'squark.layers.QLayerBase', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + ret = super().__call__(layer, in_tensors, out_tensors) + + if layer._enable_iq: + if len(in_tensors) > 1: + iq_confs = [extract_fixed_quantizer_config(q, tensor, True) for q, tensor in zip(layer._iqs, in_tensors)] + else: + iq_confs = [extract_fixed_quantizer_config(layer._iq, in_tensors[0], True)] + else: + iq_confs = () + + if layer._enable_oq: + if len(out_tensors) > 1: + oq_confs = [extract_fixed_quantizer_config(q, tensor, False) for q, tensor in zip(layer._oqs, out_tensors)] + else: + oq_confs = [extract_fixed_quantizer_config(layer._oq, out_tensors[0], False)] + else: + oq_confs = () + + if iq_confs: + _froms = [t.name for t in in_tensors] + _tos = [f'{t.name}_q' for t in in_tensors] + overrides = dict(zip(_froms, _tos)) + override_io_tensor_confs(ret, overrides) + + if oq_confs: + _froms = [t.name for t in out_tensors] + _tos = [f'{t.name}_q' for t in out_tensors] + overrides = dict(zip(_froms, _tos)) + override_io_tensor_confs(ret, overrides) + + return *iq_confs, *ret, *oq_confs + + def load_weight(self, layer: Layer, key: str): + from keras.api.ops import convert_to_numpy + + if hasattr(layer, f'q{key}'): + return convert_to_numpy(getattr(layer, f'q{key}')) + return super().load_weight(layer, key) + + +@register +class SQEinsumDenseHandler(SQLayerHandler, KV3EinsumDenseHandler): + handles = ( + 'squark.layers.core.einsum_dense.QEinsumDense', + 'squark.layers.einsum_dense_batchnorm.QEinsumDenseBatchnorm', + ) + + +@register +class SQStandaloneQuantizerHandler(KerasV3LayerHandler): + handles = ('squark.quantizer.quantizer.Quantizer',) + + def handle( + self, + layer: 'squark.quantizer.Quantizer', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + conf = extract_fixed_quantizer_config(layer, in_tensors[0], True) + del conf['output_keras_tensor_names'] + 
return conf + + +@register +class SQConvHandler(SQLayerHandler, KV3ConvHandler): + handles = ( + 'squark.layers.conv.QConv1D', + 'squark.layers.conv.QConv2D', + # 'squark.layers.conv.QConv3D', + ) + + +@register +class SQDenseHandler(SQLayerHandler, KV3DenseHandler): + handles = ('squark.layers.core.QDense',) + + +@register +class SQActivationHandler(SQLayerHandler, KV3ActivationHandler): + handles = ('squark.layers.activation.QActivation',) From 43847c40a83d6f7711802423ce40d357c761c838 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 4 Dec 2024 06:03:03 +0000 Subject: [PATCH 30/69] fix einsum_dense precision computation --- hls4ml/model/layers.py | 9 ++++++++- hls4ml/model/optimizer/passes/bit_exact.py | 10 +++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 5392e2ffe5..94242b2284 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1646,7 +1646,8 @@ def initialize(self): inp_shape = self.attributes['inp_shape'] out_shape = self.attributes['out_shape'] - recipe = parse_einsum(equation, inp_shape, kernel.shape) + kernel_shape = kernel.shape + recipe = parse_einsum(equation, inp_shape, kernel_shape) inp_tpose_idxs, ker_tpose_idxs = recipe['in_transpose_idxs'] out_tpose_idxs = recipe['out_transpose_idxs'] @@ -1656,6 +1657,11 @@ def initialize(self): kernel = kernel.transpose(ker_tpose_idxs) kernel = kernel.reshape(recipe['I'], recipe['L1'], recipe['C']).transpose(0, 2, 1) + def to_original_kernel(tkernel: np.ndarray) -> np.ndarray: + _kernel = tkernel.transpose(0, 2, 1) + _kernel = _kernel.reshape(tuple(kernel_shape[i] for i in ker_tpose_idxs)) + return _kernel.transpose(np.argsort(ker_tpose_idxs)) + # TODO: for weight in bram mode (resource), broadcasting bias here shall be avoided. 
if bias is not None: bias = np.broadcast_to(bias, out_shape).transpose(np.argsort(out_tpose_idxs)) @@ -1666,6 +1672,7 @@ def initialize(self): bias = np.zeros(out_shape).transpose(np.argsort(out_tpose_idxs)) self.attributes.attributes['weight_data'] = kernel + self.attributes.attributes['to_original_kernel'] = to_original_kernel self.attributes.attributes['bias_data'] = bias self.attributes['inp_tpose_idxs'] = inp_tpose_idxs self.attributes['out_tpose_idxs'] = out_tpose_idxs diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 4861a001de..e767ed3420 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -156,21 +156,25 @@ def _(layer: Reshape): @produce_kif.register def _(layer: EinsumDense): - kernel = layer.attributes.attributes['weight'].data[0] # unsqueezed on axis 0 for unknown reason + t_kernel = layer.attributes.attributes['weight'].data + to_original_kernel = layer.attributes.attributes['to_original_kernel'] + kernel = to_original_kernel(t_kernel) _bias = layer.attributes.attributes['bias'] eq = layer.attributes.attributes['equation'] k_in, i_in, f_in = get_input_kifs(layer)[0] qint_in = QIntervalArray.from_kif(k_in, i_in, f_in) qint_out = einsum(eq, qint_in, kernel) if _bias is not None: - qint_out = qint_out + _bias.data + t_bias = _bias.data + bias = t_bias.transpose(layer.attributes.attributes['out_tpose_idxs']) + qint_out = qint_out + bias k, i, f = qint_out.to_kif() return k.astype(np.int8), i, f @produce_kif.register def _(layer: Dense): - kernel = layer.attributes.attributes['weight'].data[0] # unsqueezed on axis 0 for unknown reason + kernel = layer.attributes.attributes['weight'].data _bias = layer.attributes.attributes['bias'] k_in, i_in, f_in = get_input_kifs(layer)[0] qint_in = QIntervalArray.from_kif(k_in, i_in, f_in) From afdaf215a67d7c58c2f65952d45ed8cb69eac012 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 4 Dec 2024 15:19:52 +0000 Subject: [PATCH 31/69] add leftover --- hls4ml/model/optimizer/passes/bit_exact.py | 86 +++++++++++++++++++++- hls4ml/utils/qinterval.py | 51 +++++++------ 2 files changed, 112 insertions(+), 25 deletions(-) diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index e767ed3420..cd2f26a51b 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -1,9 +1,10 @@ import typing from functools import singledispatch +from typing import Sequence import numpy as np -from hls4ml.model.layers import Dense, EinsumDense, GlobalPooling1D, Layer, Pooling1D, Reshape +from hls4ml.model.layers import Conv1D, Conv2D, Dense, EinsumDense, GlobalPooling1D, Layer, Pooling1D, Reshape from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer if typing.TYPE_CHECKING: @@ -42,9 +43,9 @@ def get_input_shapes(layer: Layer): @singledispatch def request_kif(layer: Layer): - output_shape = get_output_shape(layer) - k = np.ones(output_shape, dtype=np.int8) - i = f = np.full(output_shape, 127, dtype=np.int8) + input_shape = get_input_shapes(layer)[0] + k = np.ones(input_shape, dtype=np.int8) + i = f = np.full(input_shape, 127, dtype=np.int8) return k, i, f @@ -185,6 +186,83 @@ def _(layer: Dense): return k.astype(np.int8), i, f +def r_im2col(kernel_size: Sequence[int], arr: np.ndarray, buffer: np.ndarray, axis: int): + w = kernel_size[0] + if len(kernel_size) == 3: # 1D + for i in range(arr.shape[axis] - w + 1): + patch = np.take(arr, range(i, i 
+ w), axis=axis) + buffer[i] = patch.flatten() + else: # 2D+ + for i in range(arr.shape[axis] - w + 1): + patch = arr[i : i + w] + r_im2col(kernel_size[1:], patch, buffer[i], axis + 1) + + +def _im2col(kernel_size: Sequence[int], arr: np.ndarray): + if len(kernel_size) < 3: + return arr + shape = [inp_d - ker_d + 1 for inp_d, ker_d in zip(arr.shape, kernel_size[:-2])] + shape.append(np.prod(kernel_size[:-1])) # type: ignore + buf = np.empty(shape, dtype=arr.dtype) + r_im2col(kernel_size, arr, buf, 0) + return buf + + +def im2col(kernel_size: Sequence[int], *arrs: np.ndarray): + """im2col for multidimensional arrays. Assumes Channel Last format. + + Parameters + ---------- + kernel_size : Sequence[int] + The size of the kernel, in the form (*kernel_shape, ch_in, ch_out) + + *arrs : np.ndarray + The input arrays to be transformed + + Returns + ------- + list[np.ndarray] + The transformed arrays + """ + return [_im2col(kernel_size, arr) for arr in arrs] + + +def pad_and_stride_inp_arr(node: Layer, arr: np.ndarray, pad_val: float = 0): + if node.class_name.endswith('Conv2D'): + pad_top = node.attributes.attributes['pad_top'] + pad_bottom = node.attributes.attributes['pad_bottom'] + pad_left = node.attributes.attributes['pad_left'] + pad_right = node.attributes.attributes['pad_right'] + st_h = node.attributes.attributes['stride_height'] + st_w = node.attributes.attributes['stride_width'] + return np.pad(arr, ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)), constant_values=pad_val)[::st_h, ::st_w] + if node.class_name.endswith('Conv1D'): + pad_left = node.attributes.attributes['pad_left'] + pad_right = node.attributes.attributes['pad_right'] + st_w = node.attributes.attributes['stride_width'] + return np.pad(arr, ((pad_left, pad_right), (0, 0)), constant_values=pad_val)[::st_w] + return arr + + +@produce_kif.register(Conv1D) +@produce_kif.register(Conv2D) +def _(layer: Conv1D | Conv2D): + kernel = layer.attributes.attributes['weight'].data + _bias = layer.attributes.attributes['bias'] + bias = _bias.data if _bias is not None else 0 + k_in, i_in, f_in = get_input_kifs(layer)[0] + k_in, i_in, f_in = im2col(kernel.shape, k_in, i_in, f_in) + k_in = pad_and_stride_inp_arr(layer, k_in, 0) + i_in = pad_and_stride_inp_arr(layer, i_in, 0) + f_in = pad_and_stride_inp_arr(layer, f_in, 0) + kernel = kernel.reshape(-1, kernel.shape[-1]) + qint_in = QIntervalArray.from_kif(k_in, i_in, f_in) + qint_out = qint_in @ kernel + qint_out = qint_out + bias + k, i, f = qint_out.to_kif() + return k.astype(np.int8), i, f + + def kif_arrs_to_ints(arr: tuple[np.ndarray, np.ndarray, np.ndarray]): return tuple(int(np.max(a)) for a in arr) diff --git a/hls4ml/utils/qinterval.py b/hls4ml/utils/qinterval.py index 4fe3bca8e4..54d47e7f23 100644 --- a/hls4ml/utils/qinterval.py +++ b/hls4ml/utils/qinterval.py @@ -123,38 +123,47 @@ def __neg__(self): @singledispatchmethod def __matmul__(self, other: np.ndarray): - v1 = np.einsum('ij,j...->ij...', self.min, other, optimize=True) - v2 = np.einsum('ij,j...->ij...', self.max, other, optimize=True) + seq = ''.join(chr(ord('a') + i) for i in range(self.min.ndim)) + eq = f'{seq},{seq[-1]}...->{seq}...' 
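+        # Rank-generic form of the old hard-coded 'ij,j...->ij...' contraction: for a
+        # 2-d interval array seq == 'ab' and eq == 'ab,b...->ab...', and the products
+        # are reduced along the last axis of self (ax) below.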
+ ax = self.min.ndim - 1 + v1 = np.einsum(eq, self.min, other, optimize=True) + v2 = np.einsum(eq, self.max, other, optimize=True) other_delta = 2.0 ** -_minimal_f(other) - _delta = np.einsum('ij,j...->ij...', self.delta, other_delta, optimize=True) - delta = np.min(np.where(_delta == 0, np.inf, _delta), axis=1) - _min = np.sum(np.minimum(v1, v2), axis=1) - _max = np.sum(np.maximum(v1, v2), axis=1) + _delta = np.einsum(eq, self.delta, other_delta, optimize=True) + delta = np.min(np.where(_delta == 0, np.inf, _delta), axis=ax) + _min = np.sum(np.minimum(v1, v2), axis=ax) + _max = np.sum(np.maximum(v1, v2), axis=ax) return QIntervalArray(_min, _max, delta) @__matmul__.register def _(self, other: _QIntervalArray): - v1 = np.einsum('ij,j...->ij...', self.min, other.min, optimize=True) - v2 = np.einsum('ij,j...->ij...', self.max, other.max, optimize=True) - v3 = np.einsum('ij,j...->ij...', self.min, other.max, optimize=True) - v4 = np.einsum('ij,j...->ij...', self.max, other.min, optimize=True) + seq = ''.join(chr(ord('a') + i) for i in range(self.min.ndim)) + eq = f'{seq},{seq[-1]}...->{seq}...' + ax = self.min.ndim - 1 + v1 = np.einsum(eq, self.min, other.min, optimize=True) + v2 = np.einsum(eq, self.max, other.max, optimize=True) + v3 = np.einsum(eq, self.min, other.max, optimize=True) + v4 = np.einsum(eq, self.max, other.min, optimize=True) - _max = np.sum(np.maximum(np.maximum(v1, v2), np.maximum(v3, v4)), axis=1) - _min = np.sum(np.minimum(np.minimum(v1, v2), np.minimum(v3, v4)), axis=1) + _max = np.sum(np.maximum(np.maximum(v1, v2), np.maximum(v3, v4)), axis=ax) + _min = np.sum(np.minimum(np.minimum(v1, v2), np.minimum(v3, v4)), axis=ax) - _delta = np.einsum('ij,j...->ij...', self.delta, other.delta, optimize=True) - delta = np.min(_delta, axis=1) + _delta = np.einsum(eq, self.delta, other.delta, optimize=True) + delta = np.min(_delta, axis=ax) return QIntervalArray(_min, _max, delta) def __rmatmul__(self, other: np.ndarray): - v1 = np.einsum('ij,j...->ij...', other, self.min, optimize=True) - v2 = np.einsum('ij,j...->ij...', other, self.max, optimize=True) + seq = ''.join(chr(ord('a') + i) for i in range(other.ndim)) + eq = f'{seq},{seq[-1]}...->{seq}...' 
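+        # Mirror of __matmul__ with the plain ndarray on the left-hand side; the
+        # reduction therefore runs along the last axis of `other`.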
+ ax = other.ndim - 1 + v1 = np.einsum(eq, other, self.min, optimize=True) + v2 = np.einsum(eq, other, self.max, optimize=True) other_delta = 2.0 ** -_minimal_f(other) - _delta = np.einsum('ij,j...->ij...', other_delta, self.delta, optimize=True) - delta = np.min(np.where(_delta == 0, np.inf, _delta), axis=1) - _min = np.sum(np.minimum(v1, v2), axis=1) - _max = np.sum(np.maximum(v1, v2), axis=1) + _delta = np.einsum(eq, other_delta, self.delta, optimize=True) + delta = np.min(np.where(_delta == 0, np.inf, _delta), axis=ax) + _min = np.sum(np.minimum(v1, v2), axis=ax) + _max = np.sum(np.maximum(v1, v2), axis=ax) return QIntervalArray(_min, _max, delta) def transpose(self, axes: Sequence[int]): @@ -222,7 +231,7 @@ def from_kif(cls, k: np.ndarray | int | bool, i: np.ndarray | int, f: np.ndarray """ _min = np.asarray(-(2.0**i) * k) - _max = np.asarray(2.0**i * k - 2.0**-f) + _max = np.asarray(2.0**i - 2.0**-f) _delta = np.asarray(2.0**-f) return cls(_min, _max, _delta) From 0da5cd01e3684c37b50023e306e5624a3d9950f7 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 4 Dec 2024 20:15:17 +0000 Subject: [PATCH 32/69] qdense fix --- hls4ml/converters/keras_v3/core.py | 14 ++++++++------ hls4ml/converters/keras_v3/squark/_base.py | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/hls4ml/converters/keras_v3/core.py b/hls4ml/converters/keras_v3/core.py index 55a19945a9..92770b195b 100644 --- a/hls4ml/converters/keras_v3/core.py +++ b/hls4ml/converters/keras_v3/core.py @@ -22,15 +22,17 @@ def handle( in_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'], ): - kernel = np.array(layer.kernel) - assert layer._build_shapes_dict is not None, f"Layer {layer.name} is not built" - # inp_shape = layer._build_shapes_dict['input_shape'][1:] + + kernel = self.load_weight(layer, 'kernel') + bias = self.load_weight(layer, 'bias') if layer.use_bias else None + n_in, n_out = kernel.shape + config = { 'data_format': 'channels_last', 'weight_data': kernel, - 'bias_data': self.load_weight(layer, 'bias') if layer.use_bias else None, - 'n_out': kernel.shape[1], - 'n_in': kernel.shape[0], + 'bias_data': bias, + 'n_out': n_out, + 'n_in': n_in, } return config diff --git a/hls4ml/converters/keras_v3/squark/_base.py b/hls4ml/converters/keras_v3/squark/_base.py index f72563c383..c4f65673d1 100644 --- a/hls4ml/converters/keras_v3/squark/_base.py +++ b/hls4ml/converters/keras_v3/squark/_base.py @@ -139,7 +139,7 @@ class SQConvHandler(SQLayerHandler, KV3ConvHandler): @register class SQDenseHandler(SQLayerHandler, KV3DenseHandler): - handles = ('squark.layers.core.QDense',) + handles = ('squark.layers.core.dense.QDense',) @register From 6b737744c002ebe05187abd422de234c8b9a0e4b Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 4 Dec 2024 20:56:26 +0000 Subject: [PATCH 33/69] support batch_norm --- hls4ml/converters/keras_v3/squark/_base.py | 27 ++++++++++++++++++++++ hls4ml/model/layers.py | 23 ++++++++++-------- hls4ml/model/optimizer/passes/bit_exact.py | 26 ++++++++++++++++++++- 3 files changed, 66 insertions(+), 10 deletions(-) diff --git a/hls4ml/converters/keras_v3/squark/_base.py b/hls4ml/converters/keras_v3/squark/_base.py index c4f65673d1..625f3b2d29 100644 --- a/hls4ml/converters/keras_v3/squark/_base.py +++ b/hls4ml/converters/keras_v3/squark/_base.py @@ -1,3 +1,4 @@ +from math import prod from typing import TYPE_CHECKING, Any, Sequence import numpy as np @@ -145,3 +146,29 @@ class SQDenseHandler(SQLayerHandler, KV3DenseHandler): @register class SQActivationHandler(SQLayerHandler, 
KV3ActivationHandler): handles = ('squark.layers.activation.QActivation',) + + +@register +class SQBatchNormalizationHandler(SQLayerHandler): + handles = ('squark.layers.batch_normalization.QBatchNormalization',) + + def handle( + self, + layer: 'squark.layers.QBatchNormalization', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + from keras import ops + + scale, offset = layer.qscaler_and_qoffset + scale = ops.convert_to_numpy(scale) + offset = ops.convert_to_numpy(offset) + + assert layer.axis in (len(in_tensors[0].shape) - 1, -1), 'Only batch_norm with axis=-1 is supported' + + return { + 'n_filt': scale.size, + 'n_in': prod(in_tensors[0].shape[1:]), # type: ignore + 'scale_data': scale, + 'bias_data': offset, + } diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 94242b2284..80652af613 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1018,16 +1018,21 @@ def initialize(self): dims = inp.dim_names self.add_output_variable(shape, dims) - gamma = self.get_attr('gamma_data') - beta = self.get_attr('beta_data') - mean = self.get_attr('mean_data') - var = self.get_attr('variance_data') - - scale = gamma / np.sqrt(var + self.get_attr('epsilon')) - bias = beta - scale * mean + if self.get_attr('scale_data') is None: + gamma = self.get_attr('gamma_data') + var = self.get_attr('variance_data') + scale = gamma / np.sqrt(var + self.get_attr('epsilon')) + self.add_weights_variable(name='scale', var_name='s{index}', data=scale) + else: + self.add_weights_variable(name='scale', var_name='s{index}') - self.add_weights_variable(name='scale', var_name='s{index}', data=scale) - self.add_weights_variable(name='bias', var_name='b{index}', data=bias) + if self.get_attr('bias_data') is None: + beta = self.get_attr('beta_data') + mean = self.get_attr('mean_data') + bias = beta - scale * mean + self.add_weights_variable(name='bias', var_name='b{index}', data=bias) + else: + self.add_weights_variable(name='bias', var_name='b{index}') # TODO: discuss whether this should be renamed to soemthing more descriptive, and whether the class hierarchy makes sense diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index cd2f26a51b..14db7b87a9 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -4,7 +4,17 @@ import numpy as np -from hls4ml.model.layers import Conv1D, Conv2D, Dense, EinsumDense, GlobalPooling1D, Layer, Pooling1D, Reshape +from hls4ml.model.layers import ( + BatchNormalization, + Conv1D, + Conv2D, + Dense, + EinsumDense, + GlobalPooling1D, + Layer, + Pooling1D, + Reshape, +) from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer if typing.TYPE_CHECKING: @@ -263,6 +273,20 @@ def _(layer: Conv1D | Conv2D): return k.astype(np.int8), i, f +@produce_kif.register +def _(layer: BatchNormalization): + k_in, i_in, f_in = get_input_kifs(layer)[0] + qint_in = QIntervalArray.from_kif(k_in, i_in, f_in) + scale = layer.attributes.attributes['scale'].data + + _bias = layer.attributes.attributes['bias'] + bias = _bias.data if _bias is not None else 0 + + qint_out = qint_in * scale + bias + k, i, f = qint_out.to_kif() + return k.astype(np.int8), i, f + + def kif_arrs_to_ints(arr: tuple[np.ndarray, np.ndarray, np.ndarray]): return tuple(int(np.max(a)) for a in arr) From 93043de8a0c63ce1ff326edb73d5c3bfde664f55 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 4 Dec 2024 22:28:38 +0000 Subject: [PATCH 34/69] support merge layers 
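In short: the keras-v3 Merge handler gains an optional class-name override, and
SQMergeHandler reuses it for the squark QAdd/QSubtract/QMultiply/QAverage/QMaximum/
QMinimum/QConcatenate layers by stripping the leading 'Q', e.g. (paraphrasing the
handler code below, not verbatim):

    cls_name = layer.__class__.__name__[1:]   # 'QAdd' -> 'Add'
    config['op'] = cls_name.lower()           # -> 'add', parsed by the existing Merge path

The quantizer extraction in SQLayerHandler is also switched from layer._iqs/_oqs to
layer._iq/_oq for the multi-tensor case.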
--- hls4ml/converters/keras_v3/core.py | 6 +++-- hls4ml/converters/keras_v3/squark/_base.py | 28 +++++++++++++++++++--- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/hls4ml/converters/keras_v3/core.py b/hls4ml/converters/keras_v3/core.py index 92770b195b..f01fd06550 100644 --- a/hls4ml/converters/keras_v3/core.py +++ b/hls4ml/converters/keras_v3/core.py @@ -69,16 +69,18 @@ def handle( layer: 'Merge', in_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'], + cls_name: str | None = None, ): assert len(out_tensors) == 1, f"Merge layer {layer.name} has more than one output" output_shape = list(out_tensors[0].shape[1:]) + cls_name = cls_name or layer.__class__.__name__ config: dict[str, Any] = { 'output_shape': output_shape, - 'op': layer.__class__.__name__.lower(), + 'op': cls_name.lower(), } - match layer.__class__.__name__: + match cls_name.lower(): case 'Concatenate': rank = len(output_shape) class_name = f'Concatenate{rank}d' diff --git a/hls4ml/converters/keras_v3/squark/_base.py b/hls4ml/converters/keras_v3/squark/_base.py index 625f3b2d29..f90352a888 100644 --- a/hls4ml/converters/keras_v3/squark/_base.py +++ b/hls4ml/converters/keras_v3/squark/_base.py @@ -6,7 +6,7 @@ from hls4ml.converters.keras_v3._base import KerasV3LayerHandler, register from hls4ml.converters.keras_v3.conv import KV3ConvHandler -from hls4ml.converters.keras_v3.core import KV3ActivationHandler, KV3DenseHandler +from hls4ml.converters.keras_v3.core import KV3ActivationHandler, KV3DenseHandler, KV3MergeHandler from hls4ml.converters.keras_v3.einsum_dense import KV3EinsumDenseHandler if TYPE_CHECKING: @@ -70,7 +70,7 @@ def __call__( if layer._enable_iq: if len(in_tensors) > 1: - iq_confs = [extract_fixed_quantizer_config(q, tensor, True) for q, tensor in zip(layer._iqs, in_tensors)] + iq_confs = [extract_fixed_quantizer_config(q, tensor, True) for q, tensor in zip(layer._iq, in_tensors)] else: iq_confs = [extract_fixed_quantizer_config(layer._iq, in_tensors[0], True)] else: @@ -78,7 +78,7 @@ def __call__( if layer._enable_oq: if len(out_tensors) > 1: - oq_confs = [extract_fixed_quantizer_config(q, tensor, False) for q, tensor in zip(layer._oqs, out_tensors)] + oq_confs = [extract_fixed_quantizer_config(q, tensor, False) for q, tensor in zip(layer._oq, out_tensors)] else: oq_confs = [extract_fixed_quantizer_config(layer._oq, out_tensors[0], False)] else: @@ -172,3 +172,25 @@ def handle( 'scale_data': scale, 'bias_data': offset, } + + +@register +class SQMergeHandler(SQLayerHandler, KV3MergeHandler): + handles = ( + 'squark.layers.ops.merge.QAdd', + 'squark.layers.ops.merge.QSubtract', + 'squark.layers.ops.merge.QMultiply', + 'squark.layers.ops.merge.QAverage', + 'squark.layers.ops.merge.QMaximum', + 'squark.layers.ops.merge.QMinimum', + 'squark.layers.ops.merge.QConcatenate', + ) + + def handle( + self, + layer: 'squark.layers.ops.merge.QMerge', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + cls_name = layer.__class__.__name__[1:] + return super().handle(layer, in_tensors, out_tensors, cls_name) From d8708f5b2d6411430026a5700406b1f962393e33 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Thu, 5 Dec 2024 23:07:04 +0000 Subject: [PATCH 35/69] support bit-exact q_einsum and fix precision trace for multi inp layers --- hls4ml/backends/vivado/passes/einsum.py | 105 ++++++++++++++++++ hls4ml/backends/vivado/passes/einsum_dense.py | 4 +- hls4ml/converters/keras_v3/squark/__init__.py | 2 +- hls4ml/converters/keras_v3/squark/einsum.py | 76 +++++++++++++ 
hls4ml/model/layers.py | 50 +++++++++ hls4ml/model/optimizer/passes/bit_exact.py | 104 +++++++++++++---- .../templates/vivado/nnet_utils/nnet_einsum.h | 84 ++++++++++++++ 7 files changed, 400 insertions(+), 25 deletions(-) create mode 100644 hls4ml/backends/vivado/passes/einsum.py create mode 100644 hls4ml/converters/keras_v3/squark/einsum.py create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_einsum.h diff --git a/hls4ml/backends/vivado/passes/einsum.py b/hls4ml/backends/vivado/passes/einsum.py new file mode 100644 index 0000000000..0d13a7078a --- /dev/null +++ b/hls4ml/backends/vivado/passes/einsum.py @@ -0,0 +1,105 @@ +from math import ceil + +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Einsum + +from .reshaping_templates import transpose_config_gen + +# Shared Dense template +# Einsum template + +einsum_config_template = ''' +struct config{index} {{ + typedef config{index}_tpose_inp0 tpose_inp0_conf; + typedef config{index}_tpose_inp1 tpose_inp1_conf; + typedef config{index}_tpose_out tpose_out_conf; + + typedef {accum_t.name} accum_t; + + // Layer Sizes + static const unsigned n_free0 = {n_free0}; + static const unsigned n_free1 = {n_free1}; + static const unsigned n_contract = {n_contract}; + static const unsigned n_inplace = {n_inplace}; + + // Resource reuse info + static const unsigned io_type = nnet::{iotype}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned reuse_factor = {reuse_factor}; + static const unsigned multiplier_limit = {multiplier_limit}; + static const bool store_weights_in_bram = false; // NOT USED + + template + using product = nnet::product::{product_type}; +}}; +''' + +einsum_function_template = 'nnet::einsum<{input0_t}, {input1_t}, {output_t}, {config}>({input0}, {input1}, {output});' + +einsum_include_list = ['nnet_utils/nnet_einsum.h'] + + +class EinsumConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Einsum) + self.template = einsum_config_template + + def format(self, node: Einsum): + default_params = self._default_config_params(node) + + strategy = node.model.config.get_strategy(node) + io_type = node.model.config.get_config_value('IOType') + + assert io_type == 'io_parallel', 'EinsumDense layer only supports io_parallel for now' + assert strategy.lower() == 'latency', 'EinsumDense layer only supports Latency strategy for now' + + # EinsumDense config + params = default_params.copy() + params['strategy'] = strategy + params['n_free0'] = node.attributes.attributes['n_free0'] + params['n_free1'] = node.attributes.attributes['n_free1'] + params['n_contract'] = node.attributes.attributes['n_contract'] + params['n_inplace'] = node.attributes.attributes['n_inplace'] + inp0_t = node.get_input_variable(node.inputs[0]).type.precision + inp1_t = node.get_input_variable(node.inputs[1]).type.precision + params['product_type'] = get_backend('vivado').product_type(inp0_t, inp1_t) + + total_mults = params['n_free0'] * params['n_free1'] * params['n_contract'] * params['n_inplace'] + params['multiplier_limit'] = ceil(total_mults / params['reuse_factor']) + + einsum_conf = self.template.format(**params) + + # inp/out transpose config + inp0_shape = node.attributes.attributes['inp0_shape'] + inp1_shape = node.attributes.attributes['inp1_shape'] + out_interpert_shape = node.attributes.attributes['out_interpert_shape'] + inp0_tpose_idxs = node.attributes.attributes['inp0_tpose_idxs'] + inp1_tpose_idxs 
= node.attributes.attributes['inp1_tpose_idxs'] + out_tpose_idxs = node.attributes.attributes['out_tpose_idxs'] + tpose_inp0_conf_name = f'config{node.index}_tpose_inp0' + tpose_inp1_conf_name = f'config{node.index}_tpose_inp1' + tpose_out_conf_name = f'config{node.index}_tpose_out' + + inp0_tpose_conf = transpose_config_gen(tpose_inp0_conf_name, inp0_shape, inp0_tpose_idxs) + inp1_tpose_conf = transpose_config_gen(tpose_inp1_conf_name, inp1_shape, inp1_tpose_idxs) + out_tpose_conf = transpose_config_gen(tpose_out_conf_name, out_interpert_shape, out_tpose_idxs) + + return '\n\n'.join((inp0_tpose_conf, inp1_tpose_conf, out_tpose_conf, einsum_conf)) + + +class EinsumFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Einsum, include_header=einsum_include_list) + self.template = einsum_function_template + + def format(self, node: Einsum): + params = {} + params['config'] = f'config{node.index}' + params['input0_t'] = node.get_input_variable(node.inputs[0]).type.name + params['input1_t'] = node.get_input_variable(node.inputs[1]).type.name + params['output_t'] = node.get_output_variable().type.name + params['input0'] = node.get_input_variable(node.inputs[0]).name + params['input1'] = node.get_input_variable(node.inputs[1]).name + params['output'] = node.get_output_variable().name + return self.template.format(**params) diff --git a/hls4ml/backends/vivado/passes/einsum_dense.py b/hls4ml/backends/vivado/passes/einsum_dense.py index fb52873814..4edafa7f42 100644 --- a/hls4ml/backends/vivado/passes/einsum_dense.py +++ b/hls4ml/backends/vivado/passes/einsum_dense.py @@ -6,7 +6,7 @@ # Shared Dense template -conv_dense_config_template = """struct config{index}_dense : nnet::dense_config {{ +dense_config_template = """struct config{index}_dense : nnet::dense_config {{ static const unsigned n_in = {n_in}; static const unsigned n_out = {n_out}; static const unsigned reuse_factor = {reuse}; @@ -54,7 +54,7 @@ class EinsumDenseConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(EinsumDense) self.template = einsum_dense_config_template - self.dense_template = conv_dense_config_template + self.dense_template = dense_config_template def format(self, node: EinsumDense): default_params = self._default_config_params(node) diff --git a/hls4ml/converters/keras_v3/squark/__init__.py b/hls4ml/converters/keras_v3/squark/__init__.py index 0944ec2e74..b89da3ebc9 100644 --- a/hls4ml/converters/keras_v3/squark/__init__.py +++ b/hls4ml/converters/keras_v3/squark/__init__.py @@ -1 +1 @@ -from . import _base +from . import _base, einsum diff --git a/hls4ml/converters/keras_v3/squark/einsum.py b/hls4ml/converters/keras_v3/squark/einsum.py new file mode 100644 index 0000000000..0ab2bd8e15 --- /dev/null +++ b/hls4ml/converters/keras_v3/squark/einsum.py @@ -0,0 +1,76 @@ +import typing +from typing import Sequence + +from ._base import SQLayerHandler, register + +if typing.TYPE_CHECKING: + import squark + from keras.api import KerasTensor + + +def strip_batch_dim(equation: str, einsum_dense: bool = True): + """Remove the batch dimension from the equation. + + Args: + equation (str): The einsum equation. + einsum_dense (bool): Whether the equation is for EinsumDense layer. + + Returns: + str: The einsum equation without the batch dimension. + """ + + _inps, out = equation.split('->') + inp0, inp1 = _inps.split(',') + if einsum_dense: + if inp0.startswith('...'): + assert out.startswith('...'), f'Error in eq: {equation}: Batch dim mismatch for the input and output.' 
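+        # Without an '...' ellipsis the first subscript of inp0 is taken as the batch
+        # axis: it must match the output's first subscript, must not appear in the
+        # kernel subscripts, and is stripped from inp0/out in the branch below.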
+ else: + assert inp0[0] == out[0], f'Error in eq: {equation}: Batch dim mismatch for the input and output.' + assert inp0[0] not in inp1, f'Error in eq: {equation}: Batch dim is used in the kernel.' + inp0, out = inp0[1:], out[1:] + else: + assert inp0[0] == inp1[0] == out[0], f'Error in eq: {equation}: Batch dim mismatch for the inputs and output.' + inp0, inp1, out = inp0[1:], inp1[1:], out[1:] + return f'{inp0},{inp1}->{out}' + + +@register +class KV3EinsumDenseHandler(SQLayerHandler): + handles = ('squark.layers.ops.einsum.QEinsum',) + + def handle( + self, + layer: 'squark.layers.QEinsum', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + assert len(in_tensors) == 2, 'EinsumDense layer must have exactly one input tensor' + assert len(out_tensors) == 1, 'EinsumDense layer must have exactly one output tensor' + + inp0_shape: tuple[int, ...] = in_tensors[0].shape[1:] # type: ignore + inp1_shape: tuple[int, ...] = in_tensors[1].shape[1:] # type: ignore + out_shape: tuple[int, ...] = out_tensors[0].shape[1:] # type: ignore + + # fmt: off + assert all(d is not None for d in inp0_shape), \ + f'Error when processing {layer.name}: Einsum layer requires fully inp shapes, got {inp0_shape} for inp1' + assert all(d is not None for d in inp1_shape), \ + f'Error when processing {layer.name}: Einsum layer requires fully inp shapes, got {inp1_shape} for inp2' + assert all(d is not None for d in out_shape), \ + f'Error when processing {layer.name}: EinsumDense layer requires fully out shapes. got {out_shape} for output' + # fmt: on + + equation = strip_batch_dim(layer.equation, einsum_dense=False) + + return { + 'class_name': 'Einsum', + 'equation': equation, + 'inp0_shape': inp0_shape, + 'inp1_shape': inp1_shape, + 'out_shape': out_shape, + } + + +# @register +# class SQEinsumDenseHandler(SQLayerHandler, KV3EinsumDenseHandler): +# handles = ('squark.layers.ops.einsum.QEinsum',) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 80652af613..5393c25244 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1693,6 +1693,55 @@ def to_original_kernel(tkernel: np.ndarray) -> np.ndarray: self.add_bias() +class Matmul(Layer): + _expected_attributes = [ + TypeAttribute('accum'), + Attribute('inup1_shape', value_type=tuple), + Attribute('inp2_shape', value_type=tuple), + ] + + +class Einsum(Layer): + _expected_attributes = [ + TypeAttribute('accum'), + Attribute('equation', value_type=str), + Attribute('inp0_shape', value_type=tuple), + Attribute('inp1_shape', value_type=tuple), + Attribute('out_shape', value_type=tuple), + ] + + def initialize(self): + out_shape = self.attributes['out_shape'] + if len(out_shape) > 1: + dims = [f'N_LAYER_{self.index}_D{i}' for i in range(1, len(out_shape) + 1)] + else: + dims = [f'N_LAYER_{self.index}'] + self.add_output_variable(list(out_shape), dims) + + equation = self.attributes['equation'] + inp0_shape = self.attributes['inp0_shape'] + inp1_shape = self.attributes['inp1_shape'] + out_shape = self.attributes['out_shape'] + + recipe = parse_einsum(equation, inp0_shape, inp1_shape) + inp0_tpose_idxs, inp1_tpose_idxs = recipe['in_transpose_idxs'] + out_tpose_idxs = recipe['out_transpose_idxs'] + + self.attributes.attributes.update(recipe) + self.attributes['n_free0'] = recipe['L0'] + self.attributes['n_free1'] = recipe['L1'] + self.attributes['n_inplace'] = recipe['I'] + self.attributes['n_contract'] = recipe['C'] + self.attributes['out_interpert_shape'] = recipe['out_interpert_shape'] + + 
self.attributes['inp0_tpose_idxs'] = inp0_tpose_idxs + self.attributes['inp1_tpose_idxs'] = inp1_tpose_idxs + self.attributes['out_tpose_idxs'] = out_tpose_idxs + + pf = self.attributes.attributes.get('parallelization_factor', recipe['L0']) + self.attributes['parallelization_factor'] = pf + + layer_map = { 'Input': Input, 'InputLayer': Input, @@ -1762,6 +1811,7 @@ def to_original_kernel(tkernel: np.ndarray) -> np.ndarray: # TensorFlow-specific layers: 'BiasAdd': BiasAdd, 'EinsumDense': EinsumDense, + 'Einsum': Einsum, } diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 14db7b87a9..ff3a63cf6d 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -3,15 +3,19 @@ from typing import Sequence import numpy as np +from numpy.typing import NDArray from hls4ml.model.layers import ( BatchNormalization, Conv1D, Conv2D, Dense, + Einsum, EinsumDense, GlobalPooling1D, + Input, Layer, + Merge, Pooling1D, Reshape, ) @@ -20,15 +24,22 @@ if typing.TYPE_CHECKING: from hls4ml.model import ModelGraph +from functools import reduce + from hls4ml.model.optimizer import OptimizerPass from hls4ml.model.types import FixedPrecisionType, NamedType from hls4ml.utils.qinterval import QIntervalArray, einsum, minimal_kif +KIF_t = tuple[NDArray[np.int8], NDArray[np.int8], NDArray[np.int8]] + def to_hls4ml_fixed(k, i, f, name, *args): - signed, b, i = k != 0, int(k + i + f), int(k + i) + signed, b, I = k != 0, int(k + i + f), int(k + i) + if b <= 0: + b = 1 + I = 0 args = [arg.upper() for arg in args] - ptype = FixedPrecisionType(b, i, signed, *args) + ptype = FixedPrecisionType(b, I, signed, *args) return NamedType(name, ptype) @@ -43,22 +54,27 @@ def get_output_layers(layer: Layer): return [l for l in model.graph.values() if layer.name in l.attributes.attributes['inputs']] -def get_output_shape(layer: Layer): - return layer.attributes.attributes[layer.name].shape +def get_output_shape(layer: Layer) -> tuple[int, ...]: + return tuple(layer.attributes.attributes[layer.name].shape) -def get_input_shapes(layer: Layer): +def get_input_shapes(layer: Layer) -> list[tuple[int, ...]]: return [get_output_shape(inp) for inp in get_input_layers(layer)] -@singledispatch -def request_kif(layer: Layer): - input_shape = get_input_shapes(layer)[0] - k = np.ones(input_shape, dtype=np.int8) - i = f = np.full(input_shape, 127, dtype=np.int8) +def _maximum_kif_at_shape(shape: tuple[int, ...]): + k = np.ones(shape, dtype=np.int8) + i = np.full(shape, 127, dtype=np.int8) + f = np.full(shape, 127, dtype=np.int8) return k, i, f +@singledispatch +def request_kif(layer: Layer) -> tuple[KIF_t, ...]: + input_shapes = get_input_shapes(layer) + return tuple(_maximum_kif_at_shape(shape) for shape in input_shapes) + + @request_kif.register def _(layer: FixedPointQuantizer): assert layer.mask_kbi is not None @@ -73,7 +89,7 @@ def _(layer: FixedPointQuantizer): f += 1 else: f += 2 - return k, i, f + return ((k, i, f),) @request_kif.register(Pooling1D) @@ -96,7 +112,7 @@ def _(layer: Pooling1D | GlobalPooling1D): i = np.full(out_shape, -128, dtype=np.int8) f = np.full(out_shape, 127, dtype=np.int8) - _, i_out, f_out = np.max([request_kif(next_layer) for next_layer in get_output_layers(layer)], axis=0) + _, i_out, f_out = requested_kif(layer) if not is_ch_last: i = np.moveaxis(i, 0, -1) @@ -119,28 +135,41 @@ def _(layer: Pooling1D | GlobalPooling1D): i += np.ceil(ln2_size).astype(np.int8) if not ln2_size.is_integer(): f[:] = 127 - return k, i, f + 
return ((k, i, f),) @request_kif.register def _(layer: Reshape): - inp_shape = get_input_shapes(layer)[0] - k, i, f = np.max([request_kif(next_layer) for next_layer in get_output_layers(layer)], axis=0) - return k.reshape(inp_shape), i.reshape(inp_shape), f.reshape(inp_shape) + return (requested_kif(layer),) def requested_kif(layer: Layer): out_layers = get_output_layers(layer) + out_shape = get_output_shape(layer) if not out_layers: - out_shape = get_output_shape(layer) - k = np.ones(out_shape, dtype=np.int8) - i = f = np.full(out_shape, 127, dtype=np.int8) - return k, i, f - return tuple(np.max([request_kif(l) for l in out_layers], axis=0)) + return _maximum_kif_at_shape(out_shape) + + k = np.zeros(out_shape, dtype=np.int8) + i = np.full(out_shape, -128, dtype=np.int8) + f = i.copy() + for out_layer in out_layers: + _kif_s = request_kif(out_layer) + out_layer_inp_layers = get_input_layers(out_layer) + idx = out_layer_inp_layers.index(layer) + k = np.maximum(k, _kif_s[idx][0]) + i = np.maximum(i, _kif_s[idx][1]) + f = np.maximum(f, _kif_s[idx][2]) + + return k, i, f @singledispatch -def produce_kif(layer: Layer): +def produce_kif(layer: Layer) -> KIF_t: + raise NotImplementedError(f'No implementation of produce_kif for {layer.__class__}') + + +@produce_kif.register +def _(layer: Input): k = np.ones(get_output_shape(layer), dtype=np.int8) i = f = np.full(get_output_shape(layer), 127, dtype=np.int8) return k, i, f @@ -165,6 +194,26 @@ def _(layer: Reshape): return k.reshape(out_shape), i.reshape(out_shape), f.reshape(out_shape) +@produce_kif.register +def _(layer: Merge): + op = layer.attributes.attributes['op'].lower() + kif_ins = get_input_kifs(layer) + match op: + case 'add': + qint_ins = [QIntervalArray.from_kif(*kif) for kif in kif_ins] + k, i, f = reduce(lambda a, b: a + b, qint_ins).to_kif() # type: ignore + return k.astype(np.int8), i, f + case 'concatename': + axis = layer.attributes.attributes['axis'] + _ks, _is, _fs = zip(*[kif for kif in kif_ins]) + k = np.concatenate(_ks, axis=axis) + i = np.concatenate(_is, axis=axis) + f = np.concatenate(_fs, axis=axis) + return k, i, f + case _: + raise NotImplementedError(f'No implementation of Merge for {op}') + + @produce_kif.register def _(layer: EinsumDense): t_kernel = layer.attributes.attributes['weight'].data @@ -183,6 +232,17 @@ def _(layer: EinsumDense): return k.astype(np.int8), i, f +@produce_kif.register +def _(layer: Einsum): + kif_in1, kif_in2 = get_input_kifs(layer) + qint_in1 = QIntervalArray.from_kif(*kif_in1) + qint_in2 = QIntervalArray.from_kif(*kif_in2) + eq = layer.attributes.attributes['equation'] + qint_out = einsum(eq, qint_in1, qint_in2) + k, i, f = qint_out.to_kif() + return k.astype(np.int8), i, f + + @produce_kif.register def _(layer: Dense): kernel = layer.attributes.attributes['weight'].data diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_einsum.h b/hls4ml/templates/vivado/nnet_utils/nnet_einsum.h new file mode 100644 index 0000000000..6fddd9b5fa --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_einsum.h @@ -0,0 +1,84 @@ +#ifndef NNET_DENSE_H_ +#define NNET_DENSE_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include "nnet_transpose.h" + +namespace nnet { + +struct config_einsum { + typedef void tpose_inp0_conf; + typedef void tpose_inp1_conf; + typedef void tpose_out_conf; + + // Layer Sizes + static const unsigned n_free0; + static const unsigned n_free1; + static const unsigned n_contract; + static const unsigned n_inplace; + + // Resource reuse info + static const unsigned io_type; 
+ static const unsigned strategy; + static const unsigned reuse_factor; + static const unsigned multiplier_limit; + static const bool store_weights_in_bram = false; // NOT USED + + template using product = nnet::product::mult; +}; + +template +void einsum(const data0_T data0[CONFIG_T::tpose_inp0_conf::N], const data1_T data1[CONFIG_T::tpose_inp1_conf::N], + res_T res[CONFIG_T::tpose_out_conf::N]) { + + #pragma HLS PIPELINE II = CONFIG_T::reuse_factor + #pragma HLS ALLOCATION operation instances = mul limit = CONFIG_T::multiplier_limit + + data0_T tpose_i0[CONFIG_T::tpose_inp0_conf::N]; + data1_T tpose_i1[CONFIG_T::tpose_inp1_conf::N]; + res_T tpose_o[CONFIG_T::tpose_out_conf::N]; + + #pragma HLS ARRAY_PARTITION variable = tpose_i0 complete + #pragma HLS ARRAY_PARTITION variable = tpose_i1 complete + #pragma HLS ARRAY_PARTITION variable = tpose_o complete + #pragma HLS ARRAY_PARTITION variable = res_buffer complete + + nnet::transpose(data0, tpose_i0); + nnet::transpose(data1, tpose_i1); + + // for l0 in range(L0): + // for i in range(I): + // output[(i*L0+l0)*L1:(i*L0+l0+1)*L1] = input1[i*L1*C:(i+1)*L1*C].reshape((L1,C)) @ + // input0[(i*L0+l0)*C:(i*L0+l0+1)*C] + + constexpr unsigned L0 = CONFIG_T::n_free0; + constexpr unsigned L1 = CONFIG_T::n_free1; + constexpr unsigned C = CONFIG_T::n_contract; + constexpr unsigned I = CONFIG_T::n_inplace; + + typename CONFIG_T::accum_t accum_buf; + for (unsigned i = 0; i < I; i++) { + #pragma HLS UNROLL + for (unsigned l0 = 0; l0 < L0; l0++) { + #pragma HLS UNROLL + for (unsigned l1 = 0; l1 < L1; l1++) { + #pragma HLS UNROLL + accum_buf = 0; + for (unsigned c = 0; c < C; c++) { + #pragma HLS UNROLL + data0_T a = tpose_i0[(i * L0 + l0) * C + c]; + data1_T b = tpose_i1[i * L1 * C + l1 * C + c]; + accum_buf += CONFIG_T::template product::product(a, b); + } + tpose_o[(i * L0 + l0) * L1 + l1] = accum_buf; + } + } + } + + nnet::transpose(tpose_o, res); +} + +} // namespace nnet + +#endif From cba141195f8df296b38b40d5cf9626ac7b0aca73 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Thu, 5 Dec 2024 23:35:08 +0000 Subject: [PATCH 36/69] add einsum test --- test/pytest/test_qeinsum.py | 57 +++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 test/pytest/test_qeinsum.py diff --git a/test/pytest/test_qeinsum.py b/test/pytest/test_qeinsum.py new file mode 100644 index 0000000000..fd264f23d6 --- /dev/null +++ b/test/pytest/test_qeinsum.py @@ -0,0 +1,57 @@ +from pathlib import Path + +import keras +import numpy as np +import pytest +from keras.api.layers import Input + +from hls4ml.converters import convert_from_keras_model + +if keras.__version__ < '3.0.0': + pytest.skip('Only keras v3 is supported for now', allow_module_level=True) + +try: + from squark.layers import QEinsum + from squark.utils import trace_mode +except ImportError: + pytest.skip('s-quark is not installed', allow_module_level=True) + +test_root_path = Path(__file__).parent + + +@pytest.mark.parametrize('strategy', ['latency']) +@pytest.mark.parametrize('io_type', ['io_parallel']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +@pytest.mark.parametrize( + 'operation', + [ + # eq, inp, out + ('xbi,xj->xbij', (8, 16), (16,)), + ('xbi,xio->xbo', (7, 8), (8, 9)), + ('xi,xoi->xo', (16,), (20, 16)), + ('xabcd,xbcde->xaeb', (2, 4, 8, 16), (4, 8, 16, 3)), + ], +) +def test_einsum_dense(backend, io_type, strategy, operation): + eq, inp0_shape, inp1_shape = operation + inp0 = Input(inp0_shape) + inp1 = Input(inp1_shape) + out = QEinsum(eq, name='einsum')([inp0, 
inp1]) + model = keras.Model(inputs=[inp0, inp1], outputs=out) + + data = np.random.randn(1000, *inp0_shape).astype(np.float32), np.random.randn(1000, *inp1_shape).astype(np.float32) + eq_name = eq.replace(',', '_').replace('->', '_') + output_dir = str(test_root_path / f'hls4mlprj_einsum_{eq_name}_{backend}_{io_type}_{strategy}') + hls_config = {'Model': {'Precision': 'ap_fixed<1,0>', 'ReuseFactor': 1}, 'Strategy': strategy} + + with trace_mode(model): + r_keras = model.predict(data, verbose=0, batch_size=1000) # type: ignore + + model_hls = convert_from_keras_model( + model, backend=backend, output_dir=output_dir, hls_config=hls_config, io_type=io_type + ) + + model_hls.compile() + r_hls = model_hls.predict(data).reshape(r_keras.shape) # type: ignore + + assert np.all(r_hls.ravel() == r_keras.ravel()) From f8ae9292e674e2adb7d5f5b55e4b30490db0fbf7 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 6 Dec 2024 06:02:15 +0000 Subject: [PATCH 37/69] declare all softmax attrs in layer class --- hls4ml/backends/fpga/fpga_backend.py | 33 +--------------------------- hls4ml/model/attributes.py | 2 +- hls4ml/model/layers.py | 17 ++++++++++++++ 3 files changed, 19 insertions(+), 33 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index fbfed71c5b..54d7fd6cd8 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -7,7 +7,7 @@ import numpy as np from hls4ml.backends.backend import Backend -from hls4ml.model.attributes import ChoiceAttribute, ConfigurableAttribute, TypeAttribute +from hls4ml.model.attributes import ConfigurableAttribute, TypeAttribute from hls4ml.model.layers import ( GRU, LSTM, @@ -32,7 +32,6 @@ SeparableConv1D, SeparableConv2D, SimpleRNN, - Softmax, ) from hls4ml.model.optimizer import model_optimizer from hls4ml.model.types import ( @@ -40,8 +39,6 @@ FixedPrecisionType, IntegerPrecisionType, PrecisionType, - RoundingMode, - SaturationMode, UnspecifiedPrecisionType, XnorPrecisionType, ) @@ -109,34 +106,6 @@ def __init__(self, name): act_attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type)) self.attribute_map[Activation] = act_attrs - softmax_attrs = self.attribute_map.get(Softmax, []) - softmax_attrs.append( - ChoiceAttribute( - 'implementation', - ['latency', 'stable', 'argmax', 'legacy'], - default='stable', - description=descriptions.softmax_implementation, - ) - ) - softmax_attrs.append( - ConfigurableAttribute('skip', value_type=bool, default=False, description=descriptions.softmax_skip) - ) - softmax_attrs.append( - TypeAttribute( - 'exp_table', - default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT), - description=descriptions.table_type, - ) - ) - softmax_attrs.append( - TypeAttribute( - 'inv_table', - default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT), - description=descriptions.table_type, - ) - ) - self.attribute_map[Softmax] = softmax_attrs - def create_layer_class(self, layer_class): new_attrubutes = [] for cls, attributes in self.attribute_map.items(): diff --git a/hls4ml/model/attributes.py b/hls4ml/model/attributes.py index d03d2bd108..9d7b78c9db 100644 --- a/hls4ml/model/attributes.py +++ b/hls4ml/model/attributes.py @@ -36,7 +36,7 @@ class Attribute: """ - def __init__(self, name, value_type=Integral, default=None, configurable=False, description=None): + def __init__(self, name, value_type: type = Integral, 
default=None, configurable=False, description=None): self.name = name self.value_type = value_type self.default = default diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 5393c25244..e166db017f 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -21,6 +21,8 @@ FixedPrecisionType, IntegerPrecisionType, NamedType, + RoundingMode, + SaturationMode, TensorVariable, UnspecifiedPrecisionType, WeightVariable, @@ -977,6 +979,21 @@ def initialize(self): class Softmax(Activation): + _expected_attributes = [ + Attribute('n_in'), + Attribute('activation', value_type=str), + ChoiceAttribute('implementation', ['latency', 'stable', 'argmax', 'legacy'], default='stable'), + ConfigurableAttribute('skip', value_type=bool, default=False), + TypeAttribute( + 'exp_table', + default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT), + ), + TypeAttribute( + 'inv_table', + default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT), + ), + ] + def initialize(self): super().initialize() From 9326ad5e662c6afe24ce4fe817c386e648e9b32b Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 6 Dec 2024 06:07:13 +0000 Subject: [PATCH 38/69] fix lazy import in handler --- hls4ml/converters/keras_v3/squark/_base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hls4ml/converters/keras_v3/squark/_base.py b/hls4ml/converters/keras_v3/squark/_base.py index f90352a888..12a4cc729f 100644 --- a/hls4ml/converters/keras_v3/squark/_base.py +++ b/hls4ml/converters/keras_v3/squark/_base.py @@ -2,7 +2,6 @@ from typing import TYPE_CHECKING, Any, Sequence import numpy as np -from keras.api import Layer from hls4ml.converters.keras_v3._base import KerasV3LayerHandler, register from hls4ml.converters.keras_v3.conv import KV3ConvHandler @@ -11,7 +10,7 @@ if TYPE_CHECKING: import squark - from keras import KerasTensor + from keras.api import KerasTensor, Layer def extract_fixed_quantizer_config(q, tensor: 'KerasTensor', is_input: bool) -> dict[str, Any]: @@ -98,7 +97,7 @@ def __call__( return *iq_confs, *ret, *oq_confs - def load_weight(self, layer: Layer, key: str): + def load_weight(self, layer: 'Layer', key: str): from keras.api.ops import convert_to_numpy if hasattr(layer, f'q{key}'): From 0cde312613bce8f2d8594d4974ef478fe5fa19c3 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 6 Dec 2024 06:39:24 +0000 Subject: [PATCH 39/69] cleanup einsum handler --- hls4ml/converters/keras_v3/squark/einsum.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/hls4ml/converters/keras_v3/squark/einsum.py b/hls4ml/converters/keras_v3/squark/einsum.py index 0ab2bd8e15..b1f7dc5bf5 100644 --- a/hls4ml/converters/keras_v3/squark/einsum.py +++ b/hls4ml/converters/keras_v3/squark/einsum.py @@ -35,7 +35,7 @@ def strip_batch_dim(equation: str, einsum_dense: bool = True): @register -class KV3EinsumDenseHandler(SQLayerHandler): +class SQEinsumDenseHandler(SQLayerHandler): handles = ('squark.layers.ops.einsum.QEinsum',) def handle( @@ -69,8 +69,3 @@ def handle( 'inp1_shape': inp1_shape, 'out_shape': out_shape, } - - -# @register -# class SQEinsumDenseHandler(SQLayerHandler, KV3EinsumDenseHandler): -# handles = ('squark.layers.ops.einsum.QEinsum',) From b97d01e4a07146c3e352c1abebb07888341b7cc1 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 6 Dec 2024 07:19:18 +0000 Subject: [PATCH 40/69] cleanup einsum handler --- hls4ml/converters/keras_v3/einsum_dense.py | 19 +++++++++------ 
hls4ml/converters/keras_v3/squark/einsum.py | 27 +-------------------- 2 files changed, 13 insertions(+), 33 deletions(-) diff --git a/hls4ml/converters/keras_v3/einsum_dense.py b/hls4ml/converters/keras_v3/einsum_dense.py index cb19272915..8eb000fcf7 100644 --- a/hls4ml/converters/keras_v3/einsum_dense.py +++ b/hls4ml/converters/keras_v3/einsum_dense.py @@ -8,11 +8,12 @@ from keras.api import KerasTensor -def strip_batch_dim(equation: str): +def strip_batch_dim(equation: str, einsum_dense: bool = True): """Remove the batch dimension from the equation. Args: equation (str): The einsum equation. + einsum_dense (bool): Whether the equation is for EinsumDense layer. Returns: str: The einsum equation without the batch dimension. @@ -20,12 +21,16 @@ def strip_batch_dim(equation: str): _inps, out = equation.split('->') inp0, inp1 = _inps.split(',') - if inp0.startswith('...'): - assert out.startswith('...'), f'Error in eq: {equation}: Batch dim mismatch for the input and output.' + if einsum_dense: + if inp0.startswith('...'): + assert out.startswith('...'), f'Error in eq: {equation}: Batch dim mismatch for the input and output.' + else: + assert inp0[0] == out[0], f'Error in eq: {equation}: Batch dim mismatch for the input and output.' + assert inp0[0] not in inp1, f'Error in eq: {equation}: Batch dim is used in the kernel.' + inp0, out = inp0[1:], out[1:] else: - assert inp0[0] == out[0], f'Error in eq: {equation}: Batch dim mismatch for the input and output.' - assert inp0[0] not in inp1, f'Error in eq: {equation}: Batch dim is used in the kernel.' - inp0, out = inp0[1:], out[1:] + assert inp0[0] == inp1[0] == out[0], f'Error in eq: {equation}: Batch dim mismatch for the inputs and output.' + inp0, inp1, out = inp0[1:], inp1[1:], out[1:] return f'{inp0},{inp1}->{out}' @@ -52,7 +57,7 @@ def handle( f'Error when processing {layer.name}: EinsumDense layer requires fully out shapes' # fmt: on - equation = strip_batch_dim(layer.equation) + equation = strip_batch_dim(layer.equation, True) kernel = self.load_weight(layer, 'kernel') diff --git a/hls4ml/converters/keras_v3/squark/einsum.py b/hls4ml/converters/keras_v3/squark/einsum.py index b1f7dc5bf5..10e1c0f5b5 100644 --- a/hls4ml/converters/keras_v3/squark/einsum.py +++ b/hls4ml/converters/keras_v3/squark/einsum.py @@ -1,6 +1,7 @@ import typing from typing import Sequence +from ..einsum_dense import strip_batch_dim from ._base import SQLayerHandler, register if typing.TYPE_CHECKING: @@ -8,32 +9,6 @@ from keras.api import KerasTensor -def strip_batch_dim(equation: str, einsum_dense: bool = True): - """Remove the batch dimension from the equation. - - Args: - equation (str): The einsum equation. - einsum_dense (bool): Whether the equation is for EinsumDense layer. - - Returns: - str: The einsum equation without the batch dimension. - """ - - _inps, out = equation.split('->') - inp0, inp1 = _inps.split(',') - if einsum_dense: - if inp0.startswith('...'): - assert out.startswith('...'), f'Error in eq: {equation}: Batch dim mismatch for the input and output.' - else: - assert inp0[0] == out[0], f'Error in eq: {equation}: Batch dim mismatch for the input and output.' - assert inp0[0] not in inp1, f'Error in eq: {equation}: Batch dim is used in the kernel.' - inp0, out = inp0[1:], out[1:] - else: - assert inp0[0] == inp1[0] == out[0], f'Error in eq: {equation}: Batch dim mismatch for the inputs and output.' 
- inp0, inp1, out = inp0[1:], inp1[1:], out[1:] - return f'{inp0},{inp1}->{out}' - - @register class SQEinsumDenseHandler(SQLayerHandler): handles = ('squark.layers.ops.einsum.QEinsum',) From c34abbe04fa5112576602cca8fee81f710d3c681 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 6 Dec 2024 09:45:03 +0000 Subject: [PATCH 41/69] more granular control over softmax for vivado --- .../backends/vivado/passes/core_templates.py | 19 +++- .../vivado/nnet_utils/nnet_activation.h | 98 +++++++++---------- .../nnet_utils/nnet_activation_stream.h | 50 +++++----- test/pytest/test_softmax.py | 36 ++++--- 4 files changed, 115 insertions(+), 88 deletions(-) diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index 836da6e68a..668d404c98 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -150,13 +150,17 @@ def format(self, node): softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ static const unsigned n_in = {n_in}; - static const unsigned table_size = {table_size}; + static const unsigned exp_table_size = {table_size}; + static const unsigned inv_table_size = {table_size}; static const unsigned io_type = nnet::{iotype}; static const unsigned reuse_factor = {reuse}; static const unsigned axis = {axis}; static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; typedef {exp_table_t.name} exp_table_t; typedef {inv_table_t.name} inv_table_t; + typedef {accum_t.name} accum_t; + typedef {inv_inp_t.name} inv_inp_t; + typedef {inp_norm_t_str} inp_norm_t; }};\n""" activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' @@ -208,6 +212,19 @@ def __init__(self): super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ self.template = softmax_config_template + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + if 'exp_table_size' not in params: + params['exp_table_size'] = params['table_size'] + if 'inv_table_size' not in params: + params['inv_table_size'] = params['table_size'] + if 'inp_norm_t_str' not in params: + input_t = node.get_input_variable().type.precision + width, iwidth = input_t.width, input_t.integer + params['inp_norm_t_str'] = f'ap_fixed<{width}, {iwidth}, AP_RND, AP_SAT>' + return self.template.format(**params) + class ActivationFunctionTemplate(FunctionCallTemplate): def __init__(self): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index 4683239d85..3e3a54322a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -130,37 +130,37 @@ enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax inline float exp_fcn_float(float input) { return std::exp(input); } -template inline float softmax_real_val_from_idx(unsigned i) { +template inline float softmax_real_val_from_idx(unsigned i) { // Treat the index as the top N bits - static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + static constexpr int N = ceillog2(table_size); // number of address bits for table data_T x(0); x(x.width - 1, x.width - N) = i; return (float)x; } -template inline unsigned softmax_idx_from_real_val(data_T x) { +template inline unsigned softmax_idx_from_real_val(data_T x) 
{ // Slice the top N bits to get an index into the table - static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table - ap_uint y = x(x.width - 1, x.width - N); // slice the top N bits of input + static constexpr int N = ceillog2(table_size); // number of address bits for table + ap_uint y = x(x.width - 1, x.width - N); // slice the top N bits of input return (unsigned)y(N - 1, 0); } template -void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_size]) { +void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::exp_table_size]) { // The template data_T is the data type used to address the table - for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + for (unsigned i = 0; i < CONFIG_T::exp_table_size; i++) { // Slicing bits for address is going to round towards 0, so take the central value - float x = softmax_real_val_from_idx(i); + float x = softmax_real_val_from_idx(i); typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); table_out[i] = exp_x; } } template -void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]) { +void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::inv_table_size]) { // The template data_T is the data type used to address the table - for (unsigned i = 0; i < CONFIG_T::table_size; i++) { - float x = softmax_real_val_from_idx(i); + for (unsigned i = 0; i < CONFIG_T::inv_table_size; i++) { + float x = softmax_real_val_from_idx(i); typename CONFIG_T::inv_table_t inv_x = 1 / x; table_out[i] = inv_x; } @@ -172,40 +172,39 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup tables #ifdef __HLS_SYN__ bool initialized = false; - typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; - typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::exp_table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::inv_table_size]; #else static bool initialized = false; - static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; - static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::exp_table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::inv_table_size]; #endif if (!initialized) { // Note we are exponentiating the inputs, which have type data_T init_exp_table(exp_table); // Note we are inverting the exponentials, which have type exp_table_t - init_invert_table(invert_table); + init_invert_table(invert_table); initialized = true; } // Calculate all the e^x's - typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + typename CONFIG_T::accum_t exp_res[CONFIG_T::n_in]; #pragma HLS array_partition variable=exp_res complete - typename CONFIG_T::exp_table_t exp_sum(0); + typename CONFIG_T::inv_inp_t exp_sum(0); for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll - unsigned x = softmax_idx_from_real_val(data[i]); + unsigned x = softmax_idx_from_real_val(data[i]); exp_res[i] = exp_table[x]; } // Explicitly sum the results with an adder tree. 
// Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing - Op_add op_add; - exp_sum = - reduce>(exp_res, op_add); + Op_add op_add; + exp_sum = reduce>(exp_res, op_add); typename CONFIG_T::inv_table_t inv_exp_sum = - invert_table[softmax_idx_from_real_val(exp_sum)]; + invert_table[softmax_idx_from_real_val(exp_sum)]; for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll res[i] = exp_res[i] * inv_exp_sum; @@ -218,19 +217,19 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup tables #ifdef __HLS_SYN__ bool initialized = false; - typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; - typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::exp_table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::inv_table_size]; #else static bool initialized = false; - static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; - static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::exp_table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::inv_table_size]; #endif if (!initialized) { // Note we are exponentiating the inputs, which have type data_T - init_exp_table(exp_table); + init_exp_table(exp_table); // Note we are inverting the exponentials, which have type exp_table_t - init_invert_table(invert_table); + init_invert_table(invert_table); initialized = true; } @@ -239,30 +238,29 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { data_T x_max = reduce>(data, op_max); // For the diffs, use the same type as the input but force rounding and saturation - ap_fixed d_xi_xmax[CONFIG_T::n_in]; + typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in]; for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll d_xi_xmax[i] = data[i] - x_max; } // Calculate all the e^x's - typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + typename CONFIG_T::accum_t exp_res[CONFIG_T::n_in]; #pragma HLS array_partition variable=exp_res complete - typename CONFIG_T::exp_table_t exp_sum(0); + typename CONFIG_T::inv_inp_t exp_sum(0); for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll - unsigned x = softmax_idx_from_real_val(d_xi_xmax[i]); + unsigned x = softmax_idx_from_real_val(d_xi_xmax[i]); exp_res[i] = exp_table[x]; } // Explicitly sum the results with an adder tree. 
// Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing - Op_add op_add; - exp_sum = - reduce>(exp_res, op_add); + Op_add op_add; + exp_sum = reduce>(exp_res, op_add); typename CONFIG_T::inv_table_t inv_exp_sum = - invert_table[softmax_idx_from_real_val(exp_sum)]; + invert_table[softmax_idx_from_real_val(exp_sum)]; for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll res[i] = exp_res[i] * inv_exp_sum; @@ -299,16 +297,16 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table #ifdef __HLS_SYN__ bool initialized = false; - typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; - typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t exp_table[CONFIG_T::exp_table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::inv_table_size]; #else static bool initialized = false; - static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; - static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t exp_table[CONFIG_T::exp_table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::inv_table_size]; #endif if (!initialized) { - init_exp_table_legacy(exp_table); - init_invert_table_legacy(invert_table); + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); initialized = true; } @@ -330,12 +328,12 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { if (ii == jj) exp_diff_res = 1; else { - data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::table_size / 16; - index = data_round + 8 * CONFIG_T::table_size / 16; + data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::exp_table_size / 16; + index = data_round + 8 * CONFIG_T::exp_table_size / 16; if (index < 0) index = 0; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; + if (index > CONFIG_T::exp_table_size - 1) + index = CONFIG_T::exp_table_size - 1; exp_diff_res = exp_table[index]; } exp_res[ii] += exp_diff_res; @@ -344,11 +342,11 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Second loop to invert for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - int exp_res_index = exp_res[ii] * CONFIG_T::table_size / 64; + int exp_res_index = exp_res[ii] * CONFIG_T::inv_table_size / 64; if (exp_res_index < 0) exp_res_index = 0; - if (exp_res_index > CONFIG_T::table_size - 1) - exp_res_index = CONFIG_T::table_size - 1; + if (exp_res_index > CONFIG_T::inv_table_size - 1) + exp_res_index = CONFIG_T::inv_table_size - 1; // typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; res[ii] = (res_T)invert_table[exp_res_index]; } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h index ef687243bf..13c065a313 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h @@ -109,19 +109,19 @@ void softmax_latency(hls::stream &data, hls::stream &res) { // Initialize the lookup tables #ifdef __HLS_SYN__ bool initialized = false; - typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; - typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::exp_table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::inv_table_size]; #else static bool initialized = false; - static typename 
CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; - static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::exp_table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::inv_table_size]; #endif if (!initialized) { // Note we are exponentiating the inputs, which have type data_T init_exp_table(exp_table); // Note we are inverting the exponentials, which have type exp_table_t - init_invert_table(invert_table); + init_invert_table(invert_table); initialized = true; } @@ -129,9 +129,9 @@ void softmax_latency(hls::stream &data, hls::stream &res) { constexpr unsigned ii = data_T::size / multiplier_limit; // Calculate all the e^x's - typename CONFIG_T::exp_table_t exp_res[data_T::size]; + typename CONFIG_T::accum_t exp_res[data_T::size]; #pragma HLS array_partition variable=exp_res complete - typename CONFIG_T::exp_table_t exp_sum(0); + typename CONFIG_T::inv_inp_t exp_sum(0); SoftmaxExpLoop: for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { #pragma HLS PIPELINE II=ii @@ -140,18 +140,17 @@ void softmax_latency(hls::stream &data, hls::stream &res) { SoftmaxExpPackLoop: for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL - unsigned x = softmax_idx_from_real_val(in_pack[j]); + unsigned x = softmax_idx_from_real_val(in_pack[j]); exp_res[j] = exp_table[x]; } // Explicitly sum the results with an adder tree. // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing - Op_add op_add; - exp_sum = - reduce>(exp_res, op_add); + Op_add op_add; + exp_sum = reduce>(exp_res, op_add); typename CONFIG_T::inv_table_t inv_exp_sum = - invert_table[softmax_idx_from_real_val(exp_sum)]; + invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; PRAGMA_DATA_PACK(out_pack) @@ -171,19 +170,19 @@ void softmax_stable(hls::stream &data, hls::stream &res) { // Initialize the lookup tables #ifdef __HLS_SYN__ bool initialized = false; - typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; - typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::exp_table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::inv_table_size]; #else static bool initialized = false; - static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; - static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::exp_table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::inv_table_size]; #endif if (!initialized) { // Note we are exponentiating the inputs, which have type data_T - init_exp_table(exp_table); + init_exp_table(exp_table); // Note we are inverting the exponentials, which have type exp_table_t - init_invert_table(invert_table); + init_invert_table(invert_table); initialized = true; } @@ -209,30 +208,29 @@ void softmax_stable(hls::stream &data, hls::stream &res) { reduce>(data_array, op_max); // For the diffs, use the same type as the input but force rounding and saturation - ap_fixed d_xi_xmax[data_T::size]; + typename CONFIG_T::inp_norm_t d_xi_xmax[data_T::size]; for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL d_xi_xmax[j] = data_array[j] - x_max; } // Calculate all the e^x's - typename CONFIG_T::exp_table_t exp_res[data_T::size]; + typename CONFIG_T::accum_t exp_res[data_T::size]; #pragma HLS ARRAY_PARTITION variable=exp_res complete - typename CONFIG_T::exp_table_t 
exp_sum(0); + typename CONFIG_T::inv_inp_t exp_sum(0); for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL - unsigned x = softmax_idx_from_real_val(d_xi_xmax[j]); + unsigned x = softmax_idx_from_real_val(d_xi_xmax[j]); exp_res[j] = exp_table[x]; } // Explicitly sum the results with an adder tree. // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing - Op_add op_add; - exp_sum = - reduce>(exp_res, op_add); + Op_add op_add; + exp_sum = reduce>(exp_res, op_add); typename CONFIG_T::inv_table_t inv_exp_sum = - invert_table[softmax_idx_from_real_val(exp_sum)]; + invert_table[softmax_idx_from_real_val(exp_sum)]; res_T out_pack; PRAGMA_DATA_PACK(out_pack) diff --git a/test/pytest/test_softmax.py b/test/pytest/test_softmax.py index 048b6832ee..73c54711c8 100644 --- a/test/pytest/test_softmax.py +++ b/test/pytest/test_softmax.py @@ -22,18 +22,20 @@ def generate_data(input_shape): @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('strategy', ['stable', 'latency', 'argmax']) @pytest.mark.parametrize( - 'input_bits,input_shape,table_bits,io_type', + 'input_bits,input_shape,table_bits,io_type,custom_accum', [ - ('16,6', (8,), '18,8', 'io_parallel'), - ('16,6', (8,), '18,8', 'io_stream'), - ('16,6', (8,), '9,6', 'io_parallel'), - ('16,6', (8,), '9,6', 'io_stream'), - ('9,6', (8,), '18,8', 'io_parallel'), - ('9,6', (8,), '18,8', 'io_stream'), - ('16,6', (8, 8, 3), '18,8', 'io_stream'), + ('16,6', (8,), '18,8', 'io_parallel', False), + ('16,6', (8,), '18,8', 'io_stream', False), + ('16,6', (8,), '18,8', 'io_parallel', True), + ('16,6', (8,), '18,8', 'io_stream', True), + ('16,6', (8,), '9,6', 'io_parallel', False), + ('16,6', (8,), '9,6', 'io_stream', False), + ('9,6', (8,), '18,8', 'io_parallel', False), + ('9,6', (8,), '18,8', 'io_stream', False), + ('16,6', (8, 8, 3), '18,8', 'io_stream', False), ], ) -def test_softmax(backend, strategy, generate_data, input_bits, input_shape, table_bits, io_type): +def test_softmax(backend, strategy, generate_data, input_bits, input_shape, table_bits, io_type, custom_accum): X = generate_data model = tf.keras.models.Sequential() model.add(tf.keras.layers.Activation(input_shape=input_shape, activation='softmax', name='softmax')) @@ -45,11 +47,23 @@ def test_softmax(backend, strategy, generate_data, input_bits, input_shape, tabl cfg['LayerName']['softmax']['Strategy'] = strategy cfg['LayerName']['softmax']['inv_table_t'] = table_type cfg['LayerName']['softmax']['exp_table_t'] = table_type - cfg['LayerName']['softmax_input']['Precision']['result'] = f'fixed<{input_bits}>' + cfg['LayerName']['softmax']['accum_t'] = table_type + cfg['LayerName']['softmax']['inv_inp_t'] = table_type + if custom_accum: + if backend not in ['Vivado', 'Vitis']: + pytest.skip('Custom accumulators are only supported for Vivado and Vitis backends') + W, I = map(int, input_bits.split(',')) # noqa: E741 + cfg['LayerName']['softmax']['accum_t'] = f'fixed<{W+3},{I+3}>' + cfg['LayerName']['softmax']['inv_inp_t'] = f'fixed<{W+2},{I+2}>' + inp_layer_name = next(iter(cfg['LayerName'].keys())) + cfg['LayerName'][inp_layer_name]['Precision']['result'] = f'fixed<{input_bits}>' odir = str( test_root_path - / f'hls4mlprj_softmax_{backend}_{io_type}_{strategy}_{input_shape}_input-bits={input_bits}_table-bits={table_bits}' + / ( + f'hls4mlprj_softmax_{backend}_{io_type}_{strategy}_{input_shape}' + f'_input-bits={input_bits}_table-bits={table_bits}_custom-accum={custom_accum}' + ) ) hls_model = 
hls4ml.converters.convert_from_keras_model( model, hls_config=cfg, io_type=io_type, output_dir=odir, backend=backend From 7ea631058338dc74a456b900b04474b27aa262c5 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sat, 7 Dec 2024 00:04:08 +0000 Subject: [PATCH 42/69] properly propagate inv/exp_table_size --- hls4ml/backends/vivado/passes/core_templates.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index 668d404c98..5f583c7f2f 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -150,8 +150,8 @@ def format(self, node): softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ static const unsigned n_in = {n_in}; - static const unsigned exp_table_size = {table_size}; - static const unsigned inv_table_size = {table_size}; + static const unsigned exp_table_size = {exp_table_size}; + static const unsigned inv_table_size = {inv_table_size}; static const unsigned io_type = nnet::{iotype}; static const unsigned reuse_factor = {reuse}; static const unsigned axis = {axis}; From 0ecd12e9bc904e6879efbc52c125bd7888e6ddc3 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sat, 7 Dec 2024 00:06:55 +0000 Subject: [PATCH 43/69] support bit-exact softmax for stable impl --- hls4ml/converters/keras_v3/squark/__init__.py | 2 +- hls4ml/converters/keras_v3/squark/softmax.py | 107 ++++++++++++++++++ hls4ml/model/graph.py | 2 + hls4ml/model/layers.py | 12 ++ hls4ml/model/optimizer/passes/bit_exact.py | 51 +++++++-- 5 files changed, 163 insertions(+), 11 deletions(-) create mode 100644 hls4ml/converters/keras_v3/squark/softmax.py diff --git a/hls4ml/converters/keras_v3/squark/__init__.py b/hls4ml/converters/keras_v3/squark/__init__.py index b89da3ebc9..98ea780642 100644 --- a/hls4ml/converters/keras_v3/squark/__init__.py +++ b/hls4ml/converters/keras_v3/squark/__init__.py @@ -1 +1 @@ -from . import _base, einsum +from . 
import _base, einsum, softmax
diff --git a/hls4ml/converters/keras_v3/squark/softmax.py b/hls4ml/converters/keras_v3/squark/softmax.py
new file mode 100644
index 0000000000..36f5366bf9
--- /dev/null
+++ b/hls4ml/converters/keras_v3/squark/softmax.py
@@ -0,0 +1,107 @@
+import typing
+from copy import copy
+from math import ceil, log2, prod
+from typing import Sequence
+
+from hls4ml.model.types import FixedPrecisionType, RoundingMode, SaturationMode
+
+from ..core import KV3SoftmaxHandler
+from ._base import SQLayerHandler, register
+
+if typing.TYPE_CHECKING:
+    import squark
+    from keras.api import KerasTensor
+    from squark.quantizer.internal import FixedPointQuantizerBase
+
+
+def fixed_quantizer_to_hls4ml_t(q: 'FixedPointQuantizerBase', take_max=False):
+    from keras import ops
+
+    k, i, f = q.kif
+    k = ops.convert_to_numpy(k)
+    i = ops.convert_to_numpy(i)
+    f = ops.convert_to_numpy(f)
+    if not take_max:
+        assert k.size == 1 and i.size == 1 and f.size == 1, 'Only homogeneous quantizer is supported'
+        k = bool(k.ravel().item())
+        i = int(i.ravel().item())
+        f = int(f.ravel().item())
+    else:
+        k = bool(k.max())
+        i = int(i.max())
+        f = int(f.max())
+
+    k, b, I = k, k + i + f, k + i  # noqa: E741
+    round_mode = q.round_mode
+    if round_mode.startswith('S_'):
+        round_mode = round_mode[2:]  # stochastic rounding
+    round_mode = getattr(RoundingMode, round_mode)
+    sat_mode = getattr(SaturationMode, q.overflow_mode)
+    return FixedPrecisionType(b, I, k, rounding_mode=round_mode, saturation_mode=sat_mode)
+
+
+@register
+class SQSoftmaxDenseHandler(SQLayerHandler, KV3SoftmaxHandler):
+    handles = ('squark.layers.softmax.QSoftmax',)
+
+    def handle(
+        self,
+        layer: 'squark.layers.QSoftmax',
+        in_tensors: Sequence['KerasTensor'],
+        out_tensors: Sequence['KerasTensor'],
+    ):
+        assert not layer._allow_heterogeneous_table, 'Heterogeneous table is not supported in QSoftmax layer'
+        assert len(layer.axis) == 1, 'Support softmax along one axis. Use transpose before & after softmax as workaround.'
+
+        from keras import ops
+        from squark.quantizer.internal import FixedPointQuantizerBase
+
+        impl = 'stable' if layer.stable else 'latency'
+
+        if impl == 'stable':
+            exp_table_size = 2 ** int(ops.convert_to_numpy(ops.max(layer.exp_table.iq.quantizer.bits)))
+        else:
+            exp_table_size = None
+
+        exp_oq = layer.exp_table.oq.quantizer
+        inv_oq = layer.inv_table.oq.quantizer
+        inv_iq = layer.inv_table.iq.quantizer
+        assert isinstance(exp_oq, FixedPointQuantizerBase), 'Only fixed-point quantizer is supported for exp_table'
+        exp_table_t = fixed_quantizer_to_hls4ml_t(exp_oq)
+        inv_table_t = fixed_quantizer_to_hls4ml_t(inv_oq)
+        inv_inp_t = fixed_quantizer_to_hls4ml_t(inv_iq)
+
+        inv_table_size = 2**inv_inp_t.width
+
+        # Set accum_t
+        accum_t = copy(inv_inp_t)
+        if inv_inp_t.saturation_mode != SaturationMode.WRAP:
+            accum_t.saturation_bits = SaturationMode.WRAP
+        L = prod(in_tensors[0].shape[ax] for ax in layer.axis)  # type: ignore
+        scale = ceil(log2(L))
+        accum_t.width += scale
+        accum_t.integer += scale
+        if inv_inp_t.rounding_mode == RoundingMode.TRN:
+            pass
+        elif inv_inp_t.rounding_mode == RoundingMode.RND:
+            accum_t.width += 1
+        else:
+            accum_t.width += 2
+
+        config = super().handle(layer, in_tensors, out_tensors)
+        assert len(config) == 1
+        config[0].update(
+            {
+                'axis': layer.axis[0],
+                'implementation': impl,
+                'exp_table_t': exp_table_t,
+                'exp_table_size': exp_table_size,
+                'inv_table_t': inv_table_t,
+                'inv_table_size': inv_table_size,
+                'inv_inp_t': inv_inp_t,
+                'accum_t': accum_t,
+            }
+        )
+        if layer.stable:
+            config[0]['inp_norm_t'] = fixed_quantizer_to_hls4ml_t(layer.exp_table.iq.quantizer, take_max=True)
+        return config
diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index 520f96ba5f..1e29a569ef 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -417,6 +417,8 @@ def _apply_sub_flow(self, flow_name, applied_flows):
         for sub_flow in flow.requires:
             if sub_flow not in applied_flows.keys():
+                # if sub_flow != 'convert':
+                #     continue
                 self._apply_sub_flow(sub_flow, applied_flows)
         if len(flow.optimizers) > 0:
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index e166db017f..59015bc88a 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -21,6 +21,7 @@
     FixedPrecisionType,
     IntegerPrecisionType,
     NamedType,
+    PrecisionType,
     RoundingMode,
     SaturationMode,
     TensorVariable,
@@ -149,6 +150,9 @@ def _validate_attributes(self):
         # Validate existing attributes
         for attr_name, attr_value in self.attributes.items():
+            if isinstance(attr_value, PrecisionType):
+                attr_value = self._wrap_precision_to_type(f'{self.name}_{attr_name}', attr_value)
+                self.set_attr(attr_name, attr_value)
             exp_attr = all_attributes.pop(attr_name, None)
             if exp_attr is not None:
                 if not exp_attr.validate_value(attr_value):
@@ -992,6 +996,14 @@ class Softmax(Activation):
             'inv_table',
             default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT),
         ),
+        TypeAttribute(
+            'inv_inp',
+            default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT),
+        ),
+        TypeAttribute(
+            'accum',
+            default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT),
+        ),
     ]
     def initialize(self):
diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py
index ff3a63cf6d..13fb9989a1 100644
--- a/hls4ml/model/optimizer/passes/bit_exact.py
+++ b/hls4ml/model/optimizer/passes/bit_exact.py
@@ -1,5 +1,5 @@
 import typing
-from functools import singledispatch
+from functools import reduce, singledispatch from typing import Sequence import numpy as np @@ -18,18 +18,16 @@ Merge, Pooling1D, Reshape, + Softmax, ) +from hls4ml.model.optimizer import OptimizerPass from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer +from hls4ml.model.types import FixedPrecisionType, NamedType +from hls4ml.utils.qinterval import QIntervalArray, einsum, minimal_kif if typing.TYPE_CHECKING: from hls4ml.model import ModelGraph -from functools import reduce - -from hls4ml.model.optimizer import OptimizerPass -from hls4ml.model.types import FixedPrecisionType, NamedType -from hls4ml.utils.qinterval import QIntervalArray, einsum, minimal_kif - KIF_t = tuple[NDArray[np.int8], NDArray[np.int8], NDArray[np.int8]] @@ -45,13 +43,13 @@ def to_hls4ml_fixed(k, i, f, name, *args): def get_input_layers(layer: Layer): model: 'ModelGraph' = layer.model - inp_names = layer.attributes.attributes['inputs'] + inp_names = layer.attributes.get('inputs', ()) return [model.graph[name] for name in inp_names] def get_output_layers(layer: Layer): model: 'ModelGraph' = layer.model - return [l for l in model.graph.values() if layer.name in l.attributes.attributes['inputs']] + return [l for l in model.graph.values() if layer.name in l.attributes.get('inputs', ())] def get_output_shape(layer: Layer) -> tuple[int, ...]: @@ -347,11 +345,30 @@ def _(layer: BatchNormalization): return k.astype(np.int8), i, f +@produce_kif.register +def _(layer: Softmax): + out_shape = get_output_shape(layer) + + inv_table_t: FixedPrecisionType = layer.attributes['inv_table_t'].precision + exp_table_t: FixedPrecisionType = layer.attributes['exp_table_t'].precision + + b_exp, I_exp = exp_table_t.width, exp_table_t.integer + b_inv, I_inv = inv_table_t.width, inv_table_t.integer + + i_exp, f_exp = I_exp, b_exp - I_exp + i_inv, f_inv = I_inv, b_inv - I_inv + k = np.zeros(out_shape, dtype=np.int8) + i = np.full(out_shape, i_exp + i_inv, dtype=np.int8) + f = np.full(out_shape, f_exp + f_inv, dtype=np.int8) + + return k, i, f + + def kif_arrs_to_ints(arr: tuple[np.ndarray, np.ndarray, np.ndarray]): return tuple(int(np.max(a)) for a in arr) -def register_precision(layer: Layer): +def default_register_precision(layer: Layer): _pk, _pi, _pf = produce_kif(layer) _rk, _ri, _rf = requested_kif(layer) _out_kif = np.minimum(_pk, _rk), np.minimum(_pi, _ri), np.minimum(_pf, _rf) @@ -380,6 +397,20 @@ def register_precision(layer: Layer): bias_t = to_hls4ml_fixed(*bias_kif, f'{layer.name}_bias_t') layer.attributes.attributes['bias_t'] = bias_t + return (_pk, _pi, _pf), (_rk, _ri, _rf), _out_kif + + +@singledispatch +def register_precision(node: Layer): + default_register_precision(node) + + +@register_precision.register +def _(node: Softmax): + accum_t = node.attributes['accum_t'] + default_register_precision(node) + node.attributes['accum_t'] = accum_t + class BitExact(OptimizerPass): def match(self, node): From fdfaac518f6179483436ac0cd2443e1161eba930 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sat, 7 Dec 2024 09:38:11 +0000 Subject: [PATCH 44/69] bit-exact softmax fix and leftovers --- .../backends/vivado/passes/core_templates.py | 4 ++- hls4ml/converters/keras_v3/squark/softmax.py | 23 +++----------- hls4ml/model/optimizer/passes/bit_exact.py | 31 ++++++++++++++++--- .../vivado/nnet_utils/nnet_activation.h | 9 ++++-- .../nnet_utils/nnet_activation_stream.h | 4 +-- 5 files changed, 42 insertions(+), 29 deletions(-) diff --git a/hls4ml/backends/vivado/passes/core_templates.py 
b/hls4ml/backends/vivado/passes/core_templates.py index 5f583c7f2f..da8fb87357 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -219,10 +219,12 @@ def format(self, node): params['exp_table_size'] = params['table_size'] if 'inv_table_size' not in params: params['inv_table_size'] = params['table_size'] - if 'inp_norm_t_str' not in params: + if 'inp_norm_t' not in params: input_t = node.get_input_variable().type.precision width, iwidth = input_t.width, input_t.integer params['inp_norm_t_str'] = f'ap_fixed<{width}, {iwidth}, AP_RND, AP_SAT>' + else: + params['inp_norm_t_str'] = params['inp_norm_t'].name # type: ignore return self.template.format(**params) diff --git a/hls4ml/converters/keras_v3/squark/softmax.py b/hls4ml/converters/keras_v3/squark/softmax.py index 36f5366bf9..3e2d5ff6ac 100644 --- a/hls4ml/converters/keras_v3/squark/softmax.py +++ b/hls4ml/converters/keras_v3/squark/softmax.py @@ -1,6 +1,4 @@ import typing -from copy import copy -from math import ceil, log2, prod from typing import Sequence from hls4ml.model.types import FixedPrecisionType, RoundingMode, SaturationMode @@ -73,21 +71,6 @@ def handle( inv_table_size = 2**inv_inp_t.width - # Set accum_t - accum_t = copy(inv_inp_t) - if inv_inp_t.saturation_mode != SaturationMode.WRAP: - accum_t.saturation_bits = SaturationMode.WRAP - L = prod(in_tensors[0].shape[ax] for ax in layer.axis) # type: ignore - scale = ceil(log2(L)) - accum_t.width += scale - accum_t.integer += scale - if inv_inp_t.rounding_mode == RoundingMode.TRN: - pass - elif inv_inp_t.rounding_mode == RoundingMode.RND: - accum_t.width += 1 - else: - accum_t.width += 2 - config = super().handle(layer, in_tensors, out_tensors) assert len(config) == 1 config[0].update( @@ -99,9 +82,11 @@ def handle( 'inv_table_t': inv_table_t, 'inv_table_size': inv_table_size, 'inv_inp_t': inv_inp_t, - 'accum_t': accum_t, } ) if layer.stable: - config[0]['inp_norm_t'] = fixed_quantizer_to_hls4ml_t(layer.exp_table.iq.quantizer, take_max=True) + inp_norm_t = fixed_quantizer_to_hls4ml_t(layer.exp_table.iq.quantizer) + inp_norm_t.saturation_mode = SaturationMode.WRAP + inp_norm_t.rounding_mode = RoundingMode.TRN + config[0]['inp_norm_t'] = inp_norm_t return config diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 13fb9989a1..04a7d46050 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -1,5 +1,7 @@ import typing +from copy import copy from functools import reduce, singledispatch +from math import ceil, log2 from typing import Sequence import numpy as np @@ -22,7 +24,7 @@ ) from hls4ml.model.optimizer import OptimizerPass from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer -from hls4ml.model.types import FixedPrecisionType, NamedType +from hls4ml.model.types import FixedPrecisionType, NamedType, RoundingMode, SaturationMode from hls4ml.utils.qinterval import QIntervalArray, einsum, minimal_kif if typing.TYPE_CHECKING: @@ -358,7 +360,7 @@ def _(layer: Softmax): i_exp, f_exp = I_exp, b_exp - I_exp i_inv, f_inv = I_inv, b_inv - I_inv k = np.zeros(out_shape, dtype=np.int8) - i = np.full(out_shape, i_exp + i_inv, dtype=np.int8) + i = np.full(out_shape, min(i_exp + i_inv, 1), dtype=np.int8) f = np.full(out_shape, f_exp + f_inv, dtype=np.int8) return k, i, f @@ -407,9 +409,30 @@ def register_precision(node: Layer): @register_precision.register def _(node: Softmax): - accum_t = 
node.attributes['accum_t'] + inv_inp_t: FixedPrecisionType = node.attributes['inv_inp_t'].precision + accum_t = copy(inv_inp_t) + if inv_inp_t.saturation_mode != SaturationMode.WRAP: + accum_t.saturation_bits = SaturationMode.WRAP + inp_shape = get_input_shapes(node)[0] + axis = node.attributes['axis'] + L = inp_shape[axis] # type: ignore + scale = ceil(log2(L)) + accum_t.width += scale + accum_t.integer += scale + if inv_inp_t.rounding_mode == RoundingMode.TRN: + pass + elif inv_inp_t.rounding_mode == RoundingMode.RND: + accum_t.width += 1 + else: + accum_t.width += 2 default_register_precision(node) - node.attributes['accum_t'] = accum_t + exp_table_size = node.attributes['exp_table_size'] + if exp_table_size is None: + k, i, f = get_input_kifs(node)[0] + b = np.max(k) + np.max(i) + np.max(f) + exp_table_size = 2 ** int(b) + node.attributes['exp_table_size'] = exp_table_size + node.attributes['accum_t'] = NamedType(f'{node.name}_accum_t', accum_t) class BitExact(OptimizerPass): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index 3e3a54322a..002e0c2e74 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -146,11 +146,14 @@ template inline unsigned softmax_idx_from_re } template -void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::exp_table_size]) { +void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::exp_table_size], bool negative = false) { // The template data_T is the data type used to address the table for (unsigned i = 0; i < CONFIG_T::exp_table_size; i++) { // Slicing bits for address is going to round towards 0, so take the central value float x = softmax_real_val_from_idx(i); + if (negative) { + x = -x; + } typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); table_out[i] = exp_x; } @@ -227,7 +230,7 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #endif if (!initialized) { // Note we are exponentiating the inputs, which have type data_T - init_exp_table(exp_table); + init_exp_table(exp_table, true); // Note we are inverting the exponentials, which have type exp_table_t init_invert_table(invert_table); initialized = true; @@ -241,7 +244,7 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in]; for (unsigned i = 0; i < CONFIG_T::n_in; i++) { #pragma HLS unroll - d_xi_xmax[i] = data[i] - x_max; + d_xi_xmax[i] = x_max - data[i]; } // Calculate all the e^x's diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h index 13c065a313..d117a565aa 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation_stream.h @@ -180,7 +180,7 @@ void softmax_stable(hls::stream &data, hls::stream &res) { #endif if (!initialized) { // Note we are exponentiating the inputs, which have type data_T - init_exp_table(exp_table); + init_exp_table(exp_table, true); // Note we are inverting the exponentials, which have type exp_table_t init_invert_table(invert_table); initialized = true; @@ -211,7 +211,7 @@ void softmax_stable(hls::stream &data, hls::stream &res) { typename CONFIG_T::inp_norm_t d_xi_xmax[data_T::size]; for (unsigned j = 0; j < data_T::size; j++) { #pragma HLS UNROLL - d_xi_xmax[j] = data_array[j] - x_max; + d_xi_xmax[j] = x_max - data_array[j]; } 
// Calculate all the e^x's From 3f4c6422402841887ac7b298653a4bdf628d9a98 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sat, 7 Dec 2024 09:38:44 +0000 Subject: [PATCH 45/69] softmax table fixer update --- hls4ml/backends/fpga/passes/fix_softmax_table_size.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hls4ml/backends/fpga/passes/fix_softmax_table_size.py b/hls4ml/backends/fpga/passes/fix_softmax_table_size.py index 4e04626d2e..860aa89597 100644 --- a/hls4ml/backends/fpga/passes/fix_softmax_table_size.py +++ b/hls4ml/backends/fpga/passes/fix_softmax_table_size.py @@ -6,7 +6,11 @@ class FixSoftmaxTableSize(OptimizerPass): def match(self, node): - return isinstance(node, Softmax) + if not isinstance(node, Softmax): + return False + if 'inv_table_size' in node.attributes: + return False # handler generating inv_table_size sets it properly + return True def transform(self, model, node: Layer): inp_layer = node.get_input_node() # type: ignore From bf99e83e335d578199051563cfe987ed35daa340 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 8 Dec 2024 04:25:49 +0000 Subject: [PATCH 46/69] support input scaler in softmax --- hls4ml/backends/vivado/passes/core_templates.py | 2 ++ hls4ml/converters/keras_v3/squark/softmax.py | 2 ++ hls4ml/templates/vivado/nnet_utils/nnet_activation.h | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index da8fb87357..5f39989a5d 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -156,6 +156,7 @@ def format(self, node): static const unsigned reuse_factor = {reuse}; static const unsigned axis = {axis}; static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; + static constexpr float exp_scale = {exp_scale}; typedef {exp_table_t.name} exp_table_t; typedef {inv_table_t.name} inv_table_t; typedef {accum_t.name} accum_t; @@ -225,6 +226,7 @@ def format(self, node): params['inp_norm_t_str'] = f'ap_fixed<{width}, {iwidth}, AP_RND, AP_SAT>' else: params['inp_norm_t_str'] = params['inp_norm_t'].name # type: ignore + params['exp_scale'] = node.get_attr('exp_scale', 1.0) return self.template.format(**params) diff --git a/hls4ml/converters/keras_v3/squark/softmax.py b/hls4ml/converters/keras_v3/squark/softmax.py index 3e2d5ff6ac..a9e8211294 100644 --- a/hls4ml/converters/keras_v3/squark/softmax.py +++ b/hls4ml/converters/keras_v3/squark/softmax.py @@ -68,6 +68,7 @@ def handle( exp_table_t = fixed_quantizer_to_hls4ml_t(exp_oq) inv_table_t = fixed_quantizer_to_hls4ml_t(inv_oq) inv_inp_t = fixed_quantizer_to_hls4ml_t(inv_iq) + exp_scale = layer.input_scaler inv_table_size = 2**inv_inp_t.width @@ -82,6 +83,7 @@ def handle( 'inv_table_t': inv_table_t, 'inv_table_size': inv_table_size, 'inv_inp_t': inv_inp_t, + 'exp_scale': exp_scale, } ) if layer.stable: diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index 002e0c2e74..a8ae404f76 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -150,7 +150,7 @@ void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::exp_table // The template data_T is the data type used to address the table for (unsigned i = 0; i < CONFIG_T::exp_table_size; i++) { // Slicing bits for address is going to round towards 0, so take the central value - 
float x = softmax_real_val_from_idx(i); + float x = softmax_real_val_from_idx(i) * CONFIG_T::exp_scale; if (negative) { x = -x; } From b925bc8c4fb57e2e6ab4c275aa6ce6d3715aeb61 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 8 Dec 2024 05:47:12 +0000 Subject: [PATCH 47/69] support multidim parallel softmax --- .../backends/vivado/passes/core_templates.py | 33 +++++++++++++++---- hls4ml/backends/vivado/vivado_backend.py | 8 ----- hls4ml/converters/keras_v3/squark/softmax.py | 21 ++++++++++-- hls4ml/model/layers.py | 8 +++-- .../vivado/nnet_utils/nnet_activation.h | 24 ++++++++++++++ 5 files changed, 76 insertions(+), 18 deletions(-) diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index 5f39989a5d..8249f88bb8 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -150,6 +150,9 @@ def format(self, node): softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ static const unsigned n_in = {n_in}; + static const unsigned n_outer = {n_outer}; + static const unsigned n_inner = {n_inner}; + static const unsigned parallelization_factor = {parallelization_factor}; static const unsigned exp_table_size = {exp_table_size}; static const unsigned inv_table_size = {inv_table_size}; static const unsigned io_type = nnet::{iotype}; @@ -216,23 +219,41 @@ def __init__(self): def format(self, node): params = self._default_config_params(node) params['type'] = node.get_attr('activation') - if 'exp_table_size' not in params: - params['exp_table_size'] = params['table_size'] - if 'inv_table_size' not in params: - params['inv_table_size'] = params['table_size'] + params.setdefault('exp_table_size', params['table_size']) + params.setdefault('inv_table_size', params['table_size']) + params.setdefault('n_inner', 1) + params.setdefault('n_outer', 1) + params.setdefault('exp_scale', 1.0) + params.setdefault('parallelization_factor', -1) + if 'inp_norm_t' not in params: input_t = node.get_input_variable().type.precision width, iwidth = input_t.width, input_t.integer params['inp_norm_t_str'] = f'ap_fixed<{width}, {iwidth}, AP_RND, AP_SAT>' else: params['inp_norm_t_str'] = params['inp_norm_t'].name # type: ignore - params['exp_scale'] = node.get_attr('exp_scale', 1.0) + + return self.template.format(**params) + + +class SoftmaxFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Softmax, include_header=activ_include_list) + self.template = activ_function_template + + def format(self, node): + params = self._default_function_params(node) + use_multidim = node.get_attr('n_inner', 1) > 1 or node.get_attr('n_outer', 1) > 1 + use_multidim = use_multidim and node.model.config.get_config_value('IOType') == 'io_parallel' + params['activation'] = 'softmax' if not use_multidim else 'softmax_multidim' + params['config'] = f'softmax_config{node.index}' + return self.template.format(**params) class ActivationFunctionTemplate(FunctionCallTemplate): def __init__(self): - super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list) + super().__init__((Activation, HardActivation), include_header=activ_include_list) self.template = activ_function_template def format(self, node): diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 117805dd86..d2ba498a73 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -26,7 +26,6 @@ 
SeparableConv1D, SeparableConv2D, SimpleRNN, - Softmax, ) from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType @@ -551,13 +550,6 @@ def init_pooling1d(self, layer): def init_pooling2d(self, layer): layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) - @layer_optimizer(Softmax) - def init_softmax(self, layer): - if layer.model.config.get_config_value('IOType') == 'io_parallel': - assert ( - len(layer.get_input_variable().shape) == 1 - ), 'Softmax with io_parallel strategy cannot be used on multidimensional tensors.' - @layer_optimizer(Embedding) def init_embed(self, layer): if layer.attributes['n_in'] is None: diff --git a/hls4ml/converters/keras_v3/squark/softmax.py b/hls4ml/converters/keras_v3/squark/softmax.py index a9e8211294..c615031540 100644 --- a/hls4ml/converters/keras_v3/squark/softmax.py +++ b/hls4ml/converters/keras_v3/squark/softmax.py @@ -1,4 +1,5 @@ import typing +from math import prod from typing import Sequence from hls4ml.model.types import FixedPrecisionType, RoundingMode, SaturationMode @@ -49,7 +50,7 @@ def handle( out_tensors: Sequence['KerasTensor'], ): assert not layer._allow_heterogeneous_table, 'Heterogeneous table is not supported in QSoftmax layer' - assert len(layer.axis) == 1, 'Support softmax along one axis. Use transpose before & after softmax as workaround.' + assert len(layer.axis) == 1, 'Support softmax along one axis. Use transpose & reshape as workaround.' from keras import ops from squark.quantizer.internal import FixedPointQuantizerBase @@ -74,9 +75,24 @@ def handle( config = super().handle(layer, in_tensors, out_tensors) assert len(config) == 1 + parallelization_factor = layer.parallelization_factor + + ax = layer.axis[0] + ax = ax if ax >= 0 else len(in_tensors[0].shape) + ax + # io_stream asserts axis=-1, convert to -1 when it is + n_outer: int = prod(in_tensors[0].shape[1:ax]) # type: ignore + n_inner: int = prod(in_tensors[0].shape[ax + 1 :]) # type: ignore + ax = -1 if ax == len(in_tensors[0].shape) - 1 else ax + n_in: int = in_tensors[0].shape[ax] # type: ignore + if parallelization_factor < 0: + parallelization_factor = n_outer * n_inner + config[0].update( { - 'axis': layer.axis[0], + 'axis': ax, + 'n_in': n_in, + 'n_outer': n_outer, + 'n_inner': n_inner, 'implementation': impl, 'exp_table_t': exp_table_t, 'exp_table_size': exp_table_size, @@ -84,6 +100,7 @@ def handle( 'inv_table_size': inv_table_size, 'inv_inp_t': inv_inp_t, 'exp_scale': exp_scale, + 'parallelization_factor': parallelization_factor, } ) if layer.stable: diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 59015bc88a..3894c218b3 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -33,6 +33,9 @@ from hls4ml.utils.einsum_utils import parse_einsum from hls4ml.utils.string_utils import convert_to_snake_case +if typing.TYPE_CHECKING: + from hls4ml.model import ModelGraph + # TODO move this to some utility module @@ -85,7 +88,7 @@ def __init__(self, model, name, attributes, inputs, outputs=None): "No model layer should be named 'input' because that is a reserved;" + "layer name in ModelGraph; Please rename the layer in your model" ) - self.model = model + self.model: 'ModelGraph' = model self.name = name self.index = model.next_layer() self.inputs = inputs @@ -918,7 +921,8 @@ def initialize(self): shape = inp.shape dims = inp.dim_names self.add_output_variable(shape, dims) - self.set_attr('n_in', 
self.get_input_variable().size()) + if 'n_in' not in self.attributes: + self.set_attr('n_in', self.get_input_variable().size()) class ParametrizedActivation(Activation): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index a8ae404f76..5b51f41a5a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -395,6 +395,30 @@ void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } } +template +void softmax_multidim(data_T data[CONFIG_T::outer * CONFIG_T::n_in * CONFIG_T::n_inner], + res_T res[CONFIG_T::outer * CONFIG_T::n_in * CONFIG_T::n_inner]) { + #pragma HLS inline + #pragma HLS allocation instances = softmax limit = CONFIG_T::parallelization_factor function + data_T buffer_in[CONFIG_T::n_in]; + res_T buffer_out[CONFIG_T::n_in]; + for (signed i = 0; i < CONFIG_T::n_outer; i++) { + #pragma HLS UNROLL + for (signed k = 0; k < CONFIG_T::n_inner; k++) { + #pragma HLS UNROLL + for (signed j = 0; j < CONFIG_T::n_in; j++) { + #pragma HLS UNROLL + buffer_in[j] = data[i * CONFIG_T::n_in * CONFIG_T::n_inner + j * CONFIG_T::n_inner + k]; + } + softmax(buffer_in, buffer_out); + for (signed j = 0; j < CONFIG_T::n_in; j++) { + #pragma HLS UNROLL + res[i * CONFIG_T::n_in * CONFIG_T::n_inner + j * CONFIG_T::n_inner + k] = buffer_out[j]; + } + } + } +} + // ************************************************* // TanH Activation // ************************************************* From c611c77173d274031bfb9d5ccb9c407fa8cd27f4 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 8 Dec 2024 10:20:14 +0000 Subject: [PATCH 48/69] fuse quantizer when possible --- .../backends/fpga/passes/hgq_proxy_model.py | 4 -- hls4ml/converters/keras/hgq_proxy_model.py | 13 +++-- hls4ml/model/optimizer/__init__.py | 3 +- hls4ml/model/optimizer/passes/bit_exact.py | 15 ++++- .../model/optimizer/passes/hgq_proxy_model.py | 56 ++++++++++++++++++- hls4ml/model/types.py | 12 ++++ test/pytest/test_hgq_layers.py | 12 ++-- 7 files changed, 93 insertions(+), 22 deletions(-) diff --git a/hls4ml/backends/fpga/passes/hgq_proxy_model.py b/hls4ml/backends/fpga/passes/hgq_proxy_model.py index 5ec1200ac7..50e24129ad 100644 --- a/hls4ml/backends/fpga/passes/hgq_proxy_model.py +++ b/hls4ml/backends/fpga/passes/hgq_proxy_model.py @@ -52,10 +52,6 @@ def match(self, node: Layer): return isinstance(node, FixedPointQuantizer) def transform(self, model, node: FixedPointQuantizer): - if node.fusible: - model.remove_node(node, rewire=True) - return True - if model.config.config['IOType'] != 'io_parallel': raise NotImplementedError('Heterogenous quantization for activations is only supported with IOType=io_parallel') diff --git a/hls4ml/converters/keras/hgq_proxy_model.py b/hls4ml/converters/keras/hgq_proxy_model.py index 1598759253..69055beb11 100644 --- a/hls4ml/converters/keras/hgq_proxy_model.py +++ b/hls4ml/converters/keras/hgq_proxy_model.py @@ -10,11 +10,14 @@ def fixedpoint_quantizer_handler(keras_layer, input_names, input_shapes, data_re config['RND'] = keras_layer['config']['RND'] config['SAT'] = keras_layer['config']['SAT'] config['fusible'] = fusible - if not fusible: - k = data_reader.get_weights_data(name, 'keep_negative') - b = data_reader.get_weights_data(name, 'bits') - i = data_reader.get_weights_data(name, 'integers') - config['mask_kbi'] = k, b, i + k = data_reader.get_weights_data(name, 'keep_negative') + b = data_reader.get_weights_data(name, 'bits') + i = 
data_reader.get_weights_data(name, 'integers') + + if fusible: + k, b, i = k.ravel()[0], b.ravel()[0], i.ravel()[0] + + config['mask_kbi'] = k, b, i config['overrides'] = keras_layer['config']['overrides'] layer = config diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 87dff17678..17042c5fbd 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -71,8 +71,9 @@ 'fuse_consecutive_batch_normalization', 'fuse_batch_normalization', 'replace_multidimensional_dense_with_conv', - 'enforce_proxy_model_embedded_config', + # 'enforce_proxy_model_embedded_config', 'bit_exact', + 'fuse_fixed_point_quantizer', 'eliminate_linear_activation', 'merge_linear_activation', # many of the above optimzers need to be done before this diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 04a7d46050..2c4c990aed 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -89,7 +89,7 @@ def _(layer: FixedPointQuantizer): f += 1 else: f += 2 - return ((k, i, f),) + return ((k[0], i[0], f[0]),) @request_kif.register(Pooling1D) @@ -140,7 +140,12 @@ def _(layer: Pooling1D | GlobalPooling1D): @request_kif.register def _(layer: Reshape): - return (requested_kif(layer),) + inp_shape = get_input_shapes(layer)[0] + k, i, f = requested_kif(layer) + k = k.reshape(inp_shape) + i = i.reshape(inp_shape) + f = f.reshape(inp_shape) + return ((k, i, f),) def requested_kif(layer: Layer): @@ -376,7 +381,7 @@ def default_register_precision(layer: Layer): _out_kif = np.minimum(_pk, _rk), np.minimum(_pi, _ri), np.minimum(_pf, _rf) _out_kif[1][(_pf > _rf) & (_pi <= _ri)] += 1 result_kif = kif_arrs_to_ints(_out_kif) - result_t = to_hls4ml_fixed(*result_kif, f'{layer.name}_result_t') + result_t = to_hls4ml_fixed(*result_kif, f'{layer.name}_t') layer.attributes.attributes['result_t'] = result_t layer.attributes.attributes[layer.name].type = result_t # Why?????? 
@@ -425,6 +430,7 @@ def _(node: Softmax): accum_t.width += 1 else: accum_t.width += 2 + accum_t.rounding_mode = RoundingMode.TRN default_register_precision(node) exp_table_size = node.attributes['exp_table_size'] if exp_table_size is None: @@ -437,8 +443,11 @@ def _(node: Softmax): class BitExact(OptimizerPass): def match(self, node): + if node.attributes.get('bit_exact_transformed'): + return False return True def transform(self, model, node): register_precision(node) + node.attributes['bit_exact_transformed'] = True return False diff --git a/hls4ml/model/optimizer/passes/hgq_proxy_model.py b/hls4ml/model/optimizer/passes/hgq_proxy_model.py index 13e48aac43..e023e280a6 100644 --- a/hls4ml/model/optimizer/passes/hgq_proxy_model.py +++ b/hls4ml/model/optimizer/passes/hgq_proxy_model.py @@ -1,11 +1,19 @@ import re +import typing +from copy import copy from warnings import warn +import numpy as np + from hls4ml.backends.fpga.fpga_types import NamedType -from hls4ml.model.layers import Layer, register_layer +from hls4ml.model.layers import Layer, Reshape, register_layer from hls4ml.model.optimizer import OptimizerPass, register_pass +from hls4ml.model.optimizer.passes.bit_exact import get_input_layers, get_output_layers from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType, WeightVariable +if typing.TYPE_CHECKING: + from hls4ml.model import ModelGraph + re_purge_prefix = re.compile(r'(?]+)>\s*', re.IGNORECASE) @@ -20,7 +28,7 @@ def initialize(self): self.overrides = self.attributes['overrides'] self.fusible = self.attributes['fusible'] self.SAT, self.RND = self.attributes['SAT'], self.attributes['RND'] - self.mask_kbi = self.attributes.get('mask_kbi', None) + self.mask_kbi = self.attributes['mask_kbi'] class UnaryLUT(Layer): @@ -74,6 +82,47 @@ def userconf_ifdef(key: str, layer_name: str, model): return key in layer_conf +class FuseFixedPointQuantizer(OptimizerPass): + def match(self, node: Layer): + if not isinstance(node, FixedPointQuantizer): + return False + if any(np.unique(x).size > 1 for x in node.mask_kbi): + return False + return True + + def propagate(self, node: Layer, precision: FixedPrecisionType): + node.attributes.attributes[node.name].type.precision = precision + node.attributes.attributes['result_t'].precision = precision + + if not isinstance(node, Reshape): + return + + inp_layer = get_input_layers(node)[0] + can_propagate = len(get_output_layers(inp_layer)) == 1 + + if not can_propagate: + return + + new_precision = copy(precision) + precision.saturation_bits = 0 + precision.rounding_mode = 'TRN' + precision.saturation_mode = 'WRAP' + self.propagate(inp_layer, new_precision) + + def transform(self, model: 'ModelGraph', node: FixedPointQuantizer): + precision: FixedPrecisionType = copy(node.attributes[node.name].type.precision) + # Rounding and saturation for FixedPointQuantizer are applied in generated code, thus not reflected in result_t. 
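# A minimal standalone sketch (plain Python, neither hls4ml nor HGQ API; the
# helper names are illustrative assumptions) of the (keep_negative, bits,
# integers) <-> (sign, integer, fraction) bookkeeping that the quantizer mask
# handling above relies on.
def kbi_to_kif(k: int, b: int, I: int) -> tuple[int, int, int]:  # noqa: E741
    """keep_negative / total bits / integer bits -> sign / integer / fraction bit counts."""
    return k, I - k, b - I

def kif_to_kbi(k: int, i: int, f: int) -> tuple[int, int, int]:
    """Inverse mapping, as used when emitting a fixed<b, I> type from a kif triple."""
    return k, k + i + f, k + i

# Example: a signed 8-bit quantizer with 3 bits (sign included) above the binary point
assert kbi_to_kif(1, 8, 3) == (1, 2, 5)  # sign + 2 integer bits + 5 fractional bits
assert kif_to_kbi(1, 2, 5) == (1, 8, 3)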
+ precision.rounding_mode = node.RND + precision.saturation_mode = node.SAT + ino_layer = get_input_layers(node)[0] + can_fuse = len(get_output_layers(ino_layer)) == 1 + if not can_fuse: + return False + self.propagate(ino_layer, precision) + model.remove_node(node) + return True + + class EnforceProxyModelEmbeddedConfig(OptimizerPass): def match(self, node: Layer): if not isinstance(node, FixedPointQuantizer): @@ -148,4 +197,5 @@ def register_hgq_proxy_model(): register_layer('HGQ>FixedPointQuantizer', FixedPointQuantizer) register_layer('UnaryLUT', UnaryLUT) register_layer('HGQ>UnaryLUT', UnaryLUT) - register_pass('enforce_proxy_model_embedded_config', EnforceProxyModelEmbeddedConfig) + # register_pass('enforce_proxy_model_embedded_config', EnforceProxyModelEmbeddedConfig) + register_pass('fuse_fixed_point_quantizer', FuseFixedPointQuantizer) diff --git a/hls4ml/model/types.py b/hls4ml/model/types.py index 9d0a97440f..b3b0dea383 100644 --- a/hls4ml/model/types.py +++ b/hls4ml/model/types.py @@ -206,6 +206,18 @@ def __eq__(self, other: object) -> bool: def __hash__(self) -> int: return super().__hash__() ^ hash((self.integer, self.rounding_mode, self.saturation_mode, self.saturation_bits)) + @property + def min(self): + if not self.signed: + return 0.0 + if self.saturation_mode == SaturationMode.SAT_SYM: + return -(2.0 ** (self.integer - 1)) + 2.0**-self.fractional + return -(2.0 ** (self.integer - 1)) + + @property + def max(self): + return 2.0 ** (self.integer - 1) - 2.0**-self.fractional + class XnorPrecisionType(PrecisionType): """ diff --git a/test/pytest/test_hgq_layers.py b/test/pytest/test_hgq_layers.py index 80d96fbcda..8321c0b78c 100644 --- a/test/pytest/test_hgq_layers.py +++ b/test/pytest/test_hgq_layers.py @@ -26,7 +26,7 @@ # tf.config.experimental_run_functions_eagerly(True) # noqa -test_path = Path(__file__).parent +test_path = Path('/tmp/test') def _run_synth_match_test(proxy: keras.Model, data, io_type: str, backend: str, dir: str, cond=None): @@ -154,12 +154,12 @@ def custom_activation_fn(x): "HConv2D(2, (3,3), padding='same', strides=2)", "HConv2DBatchNorm(2, (3,3), padding='valid')", "HAdd()", - "HActivation('relu')", - # "HActivation('leaky_relu')", - "HActivation('tanh')", - "HActivation('sigmoid')", + # "HActivation('relu')", + # "HActivation('leaky_relu')", + # "HActivation('tanh')", + # "HActivation('sigmoid')", # "HActivation('softmax')", - "HActivation(custom_activation_fn)", + # "HActivation(custom_activation_fn)", ], ) @pytest.mark.parametrize("N", [1000]) From b7975fa6f5bb2e18c771770bfe6e4c13d52f3287 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Mon, 9 Dec 2024 05:39:35 +0000 Subject: [PATCH 49/69] partial activation, fix input precision in SAT mode --- hls4ml/converters/keras/hgq_proxy_model.py | 10 +- .../converters/keras_v3/squark/unary_lut.py | 83 ++++++++++ hls4ml/model/optimizer/__init__.py | 1 + hls4ml/model/optimizer/passes/bit_exact.py | 156 +++++++++++++++--- .../model/optimizer/passes/hgq_proxy_model.py | 47 +++--- .../templates/vivado/nnet_utils/nnet_einsum.h | 4 +- test/pytest/test_hgq_layers.py | 12 +- 7 files changed, 262 insertions(+), 51 deletions(-) create mode 100644 hls4ml/converters/keras_v3/squark/unary_lut.py diff --git a/hls4ml/converters/keras/hgq_proxy_model.py b/hls4ml/converters/keras/hgq_proxy_model.py index 69055beb11..68b884a4fd 100644 --- a/hls4ml/converters/keras/hgq_proxy_model.py +++ b/hls4ml/converters/keras/hgq_proxy_model.py @@ -1,4 +1,5 @@ from hls4ml.converters.keras_to_hls import KerasReader, keras_handler, 
parse_default_keras_layer +from hls4ml.model.types import FixedPrecisionType @keras_handler('FixedPointQuantizer', 'HGQ>FixedPointQuantizer') @@ -15,7 +16,7 @@ def fixedpoint_quantizer_handler(keras_layer, input_names, input_shapes, data_re i = data_reader.get_weights_data(name, 'integers') if fusible: - k, b, i = k.ravel()[0], b.ravel()[0], i.ravel()[0] + k, b, i = k.ravel()[:1], b.ravel()[:1], i.ravel()[:1] config['mask_kbi'] = k, b, i config['overrides'] = keras_layer['config']['overrides'] @@ -30,10 +31,9 @@ def unary_lut_keras_handler(keras_layer, input_names, input_shapes, data_reader: table = data_reader.get_weights_data(config['name'], 'table') k, i, f = keras_layer['config']['kif_out'] - k, b, i = k, k + i + f, k + i - config['table_t'] = f'{"" if k else "u"}fixed<{b},{i}>' - config['table'] = table - config['table_size'] = len(table) + k, b, I = k, k + i + f, k + i # noqa: E741 + config['table_t'] = FixedPrecisionType(b, I, k) # noqa: E741 + config['table_data'] = table config['activation'] = 'unary_lut' layer = config diff --git a/hls4ml/converters/keras_v3/squark/unary_lut.py b/hls4ml/converters/keras_v3/squark/unary_lut.py new file mode 100644 index 0000000000..8d1f748914 --- /dev/null +++ b/hls4ml/converters/keras_v3/squark/unary_lut.py @@ -0,0 +1,83 @@ +import typing +from math import prod +from typing import Sequence + +import numpy as np +from quantizers import float_quantize_np, get_fixed_quantizer_np + +from hls4ml.model.types import FixedPrecisionType + +from ._base import SQLayerHandler, register + +if typing.TYPE_CHECKING: + import squark + from keras.api import KerasTensor + from squark.quantizer.internal import FixedPointQuantizerBase, FloatPointQuantizer + +from decimal import Decimal + +from hls4ml.utils.qinterval import minimal_kif + + +@register +class SQUnaryLUTHandler(SQLayerHandler): + handles = ('squark.layers.activation.QUnaryFunctionLUT',) + + def handle( + self, + layer: 'squark.layers.QUnaryFunctionLUT', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + + from keras import ops + + if not layer.enable_iq and not layer.enable_oq: + raise ValueError('Currently only support input_quantizer enabled UnaryFunctionLUT layer') + assert not layer._allow_heterogeneous_table, 'Heterogeneous table is not supported in QUnaryFunctionLUT layer' + + iq = layer.iq.quantizer + _min = Decimal(float(ops.min(iq.min))) # type: ignore + _max = Decimal(float(ops.max(iq.max))) # type: ignore + _eps = Decimal(float(ops.min(iq.epsilon))) # type: ignore + N = (_max - _min) / _eps + assert float(N).is_integer(), 'Invalid quantizer range' + N = int(N) + assert N <= 1e6, 'Too large quantizer range' + assert np.log2(N).is_integer(), f'Invalid quantizer range: N must be power of 2, got {N}' + + all_inputs = iq(ops.linspace(_min, _max, N)) + all_inputs = ops.array(np.unique(ops.convert_to_numpy(all_inputs))) + table = ops.convert_to_numpy(layer.activation(all_inputs)) + + if isinstance(iq, FixedPointQuantizerBase) and _min < 0: + # idx by binary repr, move the positive part to the front + table_pos, table_neg = table[N // 2 :], table[: N // 2] + table = np.concatenate([table_pos, table_neg]) + + oq = layer.oq.quantizer + if isinstance(oq, FixedPointQuantizerBase): + round_mode = oq.round_mode + if round_mode.startswith('S_'): + round_mode = round_mode[2:] + overflow_mode = oq.overflow_mode + fixed_q = get_fixed_quantizer_np(round_mode, overflow_mode) + k, i, f = (ops.convert_to_numpy(x).ravel().item() for x in oq.kif) + table = fixed_q(table, k, i, f) # 
type: ignore + + k, b, I = bool(k), k + i + f, k + i # noqa: E741 + table_t = FixedPrecisionType(b, I, k) + else: + assert isinstance(oq, FloatPointQuantizer) + m, e, e0 = (ops.convert_to_numpy(x).ravel().item() for x in (oq.m, oq.e, oq.e0)) + table = float_quantize_np(table, m, e, e0) + k, i, f = (int(np.min(x)) for x in minimal_kif(table)) + + raise NotImplementedError('FloatPointQuantizer is not supported yet') + table_t = FixedPrecisionType(k + i + f, k + i, bool(k)) + + return { + 'n_in': prod(layer.input_shape[1:]), + 'table': table, + 'table_t': table_t, + } diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 17042c5fbd..391a3934b5 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -74,6 +74,7 @@ # 'enforce_proxy_model_embedded_config', 'bit_exact', 'fuse_fixed_point_quantizer', + 'fix_input_precision', 'eliminate_linear_activation', 'merge_linear_activation', # many of the above optimzers need to be done before this diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 2c4c990aed..be5f225502 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -3,11 +3,13 @@ from functools import reduce, singledispatch from math import ceil, log2 from typing import Sequence +from warnings import warn import numpy as np from numpy.typing import NDArray from hls4ml.model.layers import ( + Activation, BatchNormalization, Conv1D, Conv2D, @@ -23,13 +25,14 @@ Softmax, ) from hls4ml.model.optimizer import OptimizerPass -from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer -from hls4ml.model.types import FixedPrecisionType, NamedType, RoundingMode, SaturationMode +from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer, UnaryLUT +from hls4ml.model.types import FixedPrecisionType, NamedType, RoundingMode, SaturationMode, WeightVariable from hls4ml.utils.qinterval import QIntervalArray, einsum, minimal_kif if typing.TYPE_CHECKING: from hls4ml.model import ModelGraph + KIF_t = tuple[NDArray[np.int8], NDArray[np.int8], NDArray[np.int8]] @@ -45,7 +48,7 @@ def to_hls4ml_fixed(k, i, f, name, *args): def get_input_layers(layer: Layer): model: 'ModelGraph' = layer.model - inp_names = layer.attributes.get('inputs', ()) + inp_names = layer.inputs return [model.graph[name] for name in inp_names] @@ -55,7 +58,7 @@ def get_output_layers(layer: Layer): def get_output_shape(layer: Layer) -> tuple[int, ...]: - return tuple(layer.attributes.attributes[layer.name].shape) + return tuple(layer.get_output_variable().shape) def get_input_shapes(layer: Layer) -> list[tuple[int, ...]]: @@ -64,8 +67,8 @@ def get_input_shapes(layer: Layer) -> list[tuple[int, ...]]: def _maximum_kif_at_shape(shape: tuple[int, ...]): k = np.ones(shape, dtype=np.int8) - i = np.full(shape, 127, dtype=np.int8) - f = np.full(shape, 127, dtype=np.int8) + i = np.full(shape, 126, dtype=np.int8) + f = np.full(shape, 126, dtype=np.int8) return k, i, f @@ -80,16 +83,22 @@ def _(layer: FixedPointQuantizer): assert layer.mask_kbi is not None k, b, I = layer.mask_kbi k, i, f = k, I - k, b - I + + out_shape = get_output_shape(layer) + k = np.broadcast_to(k[0], out_shape).astype(np.int8) + i = np.broadcast_to(i[0], out_shape).astype(np.int8) + f = np.broadcast_to(f[0], out_shape).astype(np.int8) + if layer.SAT != 'WRAP': k[:] = 1 - i[:] = 127 + i[:] = 126 if layer.RND == 'TRN': pass elif layer.RND == 'RND': f += 1 else: f += 2 - return 
((k[0], i[0], f[0]),) + return ((k, i, f),) @request_kif.register(Pooling1D) @@ -109,8 +118,8 @@ def _(layer: Pooling1D | GlobalPooling1D): is_ch_last = layer.attributes.attributes['data_format'] == 'channels_last' k = np.ones(out_shape, dtype=np.int8) - i = np.full(out_shape, -128, dtype=np.int8) - f = np.full(out_shape, 127, dtype=np.int8) + i = np.full(out_shape, -127, dtype=np.int8) + f = np.full(out_shape, 126, dtype=np.int8) _, i_out, f_out = requested_kif(layer) @@ -134,7 +143,7 @@ def _(layer: Pooling1D | GlobalPooling1D): ln2_size = np.log2(pool_width) i += np.ceil(ln2_size).astype(np.int8) if not ln2_size.is_integer(): - f[:] = 127 + f[:] = 126 return ((k, i, f),) @@ -148,14 +157,27 @@ def _(layer: Reshape): return ((k, i, f),) -def requested_kif(layer: Layer): +@request_kif.register +def _(layer: Activation): + fn_name = layer.attributes.attributes.get('activation') + if fn_name == 'linear': + return (requested_kif(layer),) + if fn_name == 'relu': + k, i, f = requested_kif(layer) + k[:] = 1 + return ((k, i, f),) + inp_shape = get_input_shapes(layer)[0] + return (_maximum_kif_at_shape(inp_shape),) + + +def requested_kif(layer: Layer) -> KIF_t: out_layers = get_output_layers(layer) out_shape = get_output_shape(layer) if not out_layers: return _maximum_kif_at_shape(out_shape) k = np.zeros(out_shape, dtype=np.int8) - i = np.full(out_shape, -128, dtype=np.int8) + i = np.full(out_shape, -127, dtype=np.int8) f = i.copy() for out_layer in out_layers: _kif_s = request_kif(out_layer) @@ -176,7 +198,7 @@ def produce_kif(layer: Layer) -> KIF_t: @produce_kif.register def _(layer: Input): k = np.ones(get_output_shape(layer), dtype=np.int8) - i = f = np.full(get_output_shape(layer), 127, dtype=np.int8) + i = f = np.full(get_output_shape(layer), 126, dtype=np.int8) return k, i, f @@ -189,7 +211,13 @@ def _(layer: FixedPointQuantizer): assert layer.mask_kbi is not None k, b, I = layer.mask_kbi k, i, f = k, I - k, b - I - return k[0], i[0], f[0] + + out_shape = get_output_shape(layer) + k = np.broadcast_to(k[0], out_shape) + i = np.broadcast_to(i[0], out_shape) + f = np.broadcast_to(f[0], out_shape) + + return k, i, f @produce_kif.register @@ -371,6 +399,42 @@ def _(layer: Softmax): return k, i, f +@produce_kif.register +def _(layer: Activation): + fn_name = layer.attributes.attributes['activation'] + k, i, f = get_input_kifs(layer)[0] + + if fn_name == 'linear': + return k, i, f + if fn_name == 'relu': + k[:] = 0 + return k, i, f + if fn_name == 'tanh': + i = np.minimum(i, 1) + f[:] = 126 + return k, i, f + if fn_name == 'sigmoid': + k[:] = 0 + i = np.minimum(i, 1) + f[:] = 126 + return k, i, f + + k[:] = 1 + i[:] = 126 + f[:] = 126 + return k, i, f + + +@produce_kif.register +def _(layer: UnaryLUT): + k, i, f = minimal_kif(layer.attributes['table'].data) + shape = get_output_shape(layer) + k = np.full(shape, np.max(k), dtype=np.int8) + i = np.full(shape, np.max(i), dtype=np.int8) + f = np.full(shape, np.max(f), dtype=np.int8) + return k, i, f + + def kif_arrs_to_ints(arr: tuple[np.ndarray, np.ndarray, np.ndarray]): return tuple(int(np.max(a)) for a in arr) @@ -383,17 +447,18 @@ def default_register_precision(layer: Layer): result_kif = kif_arrs_to_ints(_out_kif) result_t = to_hls4ml_fixed(*result_kif, f'{layer.name}_t') layer.attributes.attributes['result_t'] = result_t - layer.attributes.attributes[layer.name].type = result_t # Why?????? 
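# A hedged sketch of the produced/requested reconciliation performed just above:
# the stored result keeps no more bits than the layer can produce and no more
# than its consumers request. `combine_kif` is an illustrative name, not hls4ml
# API; inputs are assumed to be per-element integer arrays like _pk/_pi/_pf.
import numpy as np

def combine_kif(produced, requested):
    (pk, pi, pf), (rk, ri, rf) = produced, requested
    k = np.minimum(pk, rk)
    i = np.minimum(pi, ri)
    f = np.minimum(pf, rf)
    # Mirror of `_out_kif[1][(_pf > _rf) & (_pi <= _ri)] += 1`: where fractional
    # bits are dropped but integer bits are not, presumably one guard bit is
    # added to absorb carries introduced by rounding.
    i[(pf > rf) & (pi <= ri)] += 1
    return k, i, f

pk, pi, pf = np.array([1, 1]), np.array([4, 2]), np.array([8, 8])
rk, ri, rf = np.array([1, 1]), np.array([6, 6]), np.array([4, 4])
k, i, f = combine_kif((pk, pi, pf), (rk, ri, rf))
assert i.tolist() == [5, 3] and f.tolist() == [4, 4]  # both elements gain a guard bit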
+ layer.get_output_variable().type = result_t + overrides = {} if 'accum_t' in layer.attributes.attributes: accum_kif = kif_arrs_to_ints((_pk, _pi, _pf)) accum_t = to_hls4ml_fixed(*accum_kif, f'{layer.name}_accum_t') - layer.attributes.attributes['accum_t'] = accum_t + overrides['accum_t'] = accum_t if 'weight_t' in layer.attributes.attributes: kernel_kif = kif_arrs_to_ints(minimal_kif(layer.attributes.attributes['weight'].data)) kernel_t = to_hls4ml_fixed(*kernel_kif, f'{layer.name}_weight_t') - layer.attributes.attributes['weight_t'] = kernel_t + overrides['weight_t'] = kernel_t if 'bias_t' in layer.attributes.attributes: _bias = layer.attributes.attributes.get('bias') @@ -402,7 +467,20 @@ def default_register_precision(layer: Layer): else: bias_kif = kif_arrs_to_ints(minimal_kif(_bias.data)) bias_t = to_hls4ml_fixed(*bias_kif, f'{layer.name}_bias_t') - layer.attributes.attributes['bias_t'] = bias_t + overrides['bias_t'] = bias_t + + if 'table' in layer.attributes.attributes: + table_kif = kif_arrs_to_ints(minimal_kif(layer.attributes.attributes['table'].data)) + table_t = to_hls4ml_fixed(*table_kif, f'{layer.name}_table_t') + overrides['table_t'] = table_t + + for k, v in overrides.items(): + layer.attributes.attributes[k] = v + if k[:-2] in layer.attributes.attributes: + weight_var: WeightVariable = layer.attributes.attributes[k[:-2]] + weight_var.type = v + weight_var.update_precision(v.precision) + layer.model.config.layer_name_precision[f'{layer.name}_{k[:-2]}'] = str(v.precision) return (_pk, _pi, _pf), (_rk, _ri, _rf), _out_kif @@ -441,6 +519,15 @@ def _(node: Softmax): node.attributes['accum_t'] = NamedType(f'{node.name}_accum_t', accum_t) +@register_precision.register +def _(node: UnaryLUT): + k, i, f = minimal_kif(node.attributes['table'].data) + k, i, f = bool(np.max(k)), int(np.max(i)), int(np.max(f)) + table_t = to_hls4ml_fixed(k, i, f, f'{node.name}_table_t') + node.attributes['table_t'] = table_t + default_register_precision(node) + + class BitExact(OptimizerPass): def match(self, node): if node.attributes.get('bit_exact_transformed'): @@ -451,3 +538,34 @@ def transform(self, model, node): register_precision(node) node.attributes['bit_exact_transformed'] = True return False + + +class FixInputPrecision(OptimizerPass): + def match(self, node: Layer): + if not isinstance(node, Input): + return False + + # Unhandled input precision, usually by a heterogeneous quantizer with non-WRAP saturation + return node.get_output_variable().type.precision.width > 120 + + def transform(self, model, node: Layer): + out_layers: list[FixedPointQuantizer] = get_output_layers(node) + if not all(isinstance(l, FixedPointQuantizer) for l in out_layers): + warn(f'Input {node.name} has unhandled high precision. 
Consider setting it manually before synthesising.') + return False + + sat_modes = [l.SAT for l in out_layers] + sat_modes_set = set(sat_modes) + illegal_sat_modes = sat_modes_set - {'WRAP', 'SAT', 'SAT_SYM'} + if illegal_sat_modes: + raise ValueError(f'Input {node.name} has quantizer with illegal saturation mode {illegal_sat_modes} after.') + + kifs = [produce_kif(l) for l in out_layers] + i = np.max([np.max(i) for _, i, _ in kifs]) + k = np.max([np.max(k) for k, _, _ in kifs]) + f = node.get_output_variable().type.precision.fractional + new_type = to_hls4ml_fixed(k, i, f, f'{node.name}_t') + new_type.precision.saturation_mode = 'SAT' + node.get_output_variable().type = new_type + node.model.config.layer_name_precision[node.name] = str(new_type) + return False diff --git a/hls4ml/model/optimizer/passes/hgq_proxy_model.py b/hls4ml/model/optimizer/passes/hgq_proxy_model.py index e023e280a6..ef78507b6c 100644 --- a/hls4ml/model/optimizer/passes/hgq_proxy_model.py +++ b/hls4ml/model/optimizer/passes/hgq_proxy_model.py @@ -6,9 +6,9 @@ import numpy as np from hls4ml.backends.fpga.fpga_types import NamedType +from hls4ml.model.attributes import Attribute, TypeAttribute, WeightAttribute from hls4ml.model.layers import Layer, Reshape, register_layer from hls4ml.model.optimizer import OptimizerPass, register_pass -from hls4ml.model.optimizer.passes.bit_exact import get_input_layers, get_output_layers from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType, WeightVariable if typing.TYPE_CHECKING: @@ -32,29 +32,23 @@ def initialize(self): class UnaryLUT(Layer): + _expected_attributes = [ + Attribute('n_in'), + TypeAttribute('table_t', default=FixedPrecisionType(18, 8, True)), + WeightAttribute('table'), + ] + def initialize(self): inp = self.get_input_variable() shape = inp.shape dims = inp.dim_names self.add_output_variable(shape, dims) self.set_attr('n_in', inp.size()) - self.table = self.attributes['table'] - self.table_size = self.attributes['table_size'] - - table_t = to_hls4ml_fixed(self.attributes['table_t']) - self.add_weights_variable(name='table', var_name='table{index}', precision=table_t, data=self.table) - + self.table = self.attributes['table_data'] + self.attributes['table_size'] = len(self.table) + self.table_size = len(self.table) -def to_hls4ml_fixed(fixed: str): - matched = re_parse_fixed.match(re_purge_prefix.sub('', fixed)) - assert matched is not None, f'Cannot parse {fixed}' - signed = matched.group(1) != 'u' - b, i, *args = matched.group(2).split(',') - b, i = int(b), int(i) - args = [arg.upper() for arg in args] - new_type = FixedPrecisionType(b, i, signed, *args) - # For some reason, __class__ is overwritten in hls4ml - return new_type + self.add_weights_variable(name='table') def userconf_ifdef(key: str, layer_name: str, model): @@ -91,7 +85,9 @@ def match(self, node: Layer): return True def propagate(self, node: Layer, precision: FixedPrecisionType): - node.attributes.attributes[node.name].type.precision = precision + from hls4ml.model.optimizer.passes.bit_exact import get_input_layers, get_output_layers + + node.get_output_variable().type.precision = precision node.attributes.attributes['result_t'].precision = precision if not isinstance(node, Reshape): @@ -110,7 +106,9 @@ def propagate(self, node: Layer, precision: FixedPrecisionType): self.propagate(inp_layer, new_precision) def transform(self, model: 'ModelGraph', node: FixedPointQuantizer): - precision: FixedPrecisionType = copy(node.attributes[node.name].type.precision) + from 
hls4ml.model.optimizer.passes.bit_exact import get_input_layers, get_output_layers + + precision: FixedPrecisionType = copy(node.get_output_variable().type.precision) # Rounding and saturation for FixedPointQuantizer are applied in generated code, thus not reflected in result_t. precision.rounding_mode = node.RND precision.saturation_mode = node.SAT @@ -135,6 +133,17 @@ def transform(self, model, node: FixedPointQuantizer): if 'layers' not in node.overrides: return False + def to_hls4ml_fixed(fixed: str): + matched = re_parse_fixed.match(re_purge_prefix.sub('', fixed)) + assert matched is not None, f'Cannot parse {fixed}' + signed = matched.group(1) != 'u' + b, i, *args = matched.group(2).split(',') + b, i = int(b), int(i) + args = [arg.upper() for arg in args] + new_type = FixedPrecisionType(b, i, signed, *args) + # For some reason, __class__ is overwritten in hls4ml + return new_type + graph_changed = False layers = node.overrides['layers'] for name, conf in layers.items(): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_einsum.h b/hls4ml/templates/vivado/nnet_utils/nnet_einsum.h index 6fddd9b5fa..18f323f39d 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_einsum.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_einsum.h @@ -1,5 +1,5 @@ -#ifndef NNET_DENSE_H_ -#define NNET_DENSE_H_ +#ifndef NNET_EINSUM_H_ +#define NNET_EINSUM_H_ #include "nnet_common.h" #include "nnet_mult.h" diff --git a/test/pytest/test_hgq_layers.py b/test/pytest/test_hgq_layers.py index 8321c0b78c..80d96fbcda 100644 --- a/test/pytest/test_hgq_layers.py +++ b/test/pytest/test_hgq_layers.py @@ -26,7 +26,7 @@ # tf.config.experimental_run_functions_eagerly(True) # noqa -test_path = Path('/tmp/test') +test_path = Path(__file__).parent def _run_synth_match_test(proxy: keras.Model, data, io_type: str, backend: str, dir: str, cond=None): @@ -154,12 +154,12 @@ def custom_activation_fn(x): "HConv2D(2, (3,3), padding='same', strides=2)", "HConv2DBatchNorm(2, (3,3), padding='valid')", "HAdd()", - # "HActivation('relu')", - # "HActivation('leaky_relu')", - # "HActivation('tanh')", - # "HActivation('sigmoid')", + "HActivation('relu')", + # "HActivation('leaky_relu')", + "HActivation('tanh')", + "HActivation('sigmoid')", # "HActivation('softmax')", - # "HActivation(custom_activation_fn)", + "HActivation(custom_activation_fn)", ], ) @pytest.mark.parametrize("N", [1000]) From 3d1431e6b1b3df7442b036b215a0f7bcd25a00b5 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Mon, 9 Dec 2024 07:07:40 +0000 Subject: [PATCH 50/69] fix padded convXd precition derivation rule --- hls4ml/model/optimizer/passes/bit_exact.py | 32 ++++++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index be5f225502..d73ff3e1ee 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -330,21 +330,36 @@ def im2col(kernel_size: Sequence[int], *arrs: np.ndarray): return [_im2col(kernel_size, arr) for arr in arrs] -def pad_and_stride_inp_arr(node: Layer, arr: np.ndarray, pad_val: float = 0): +def pad_arrs(node: Layer, pad_val: float = 0, *arrs: np.ndarray): + out_arrs = [] if node.class_name.endswith('Conv2D'): pad_top = node.attributes.attributes['pad_top'] pad_bottom = node.attributes.attributes['pad_bottom'] pad_left = node.attributes.attributes['pad_left'] pad_right = node.attributes.attributes['pad_right'] + for arr in arrs: + r = np.pad(arr, ((pad_top, pad_bottom), (pad_left, pad_right), (0, 
0)), constant_values=pad_val) + out_arrs.append(r) + elif node.class_name.endswith('Conv1D'): + pad_left = node.attributes.attributes['pad_left'] + pad_right = node.attributes.attributes['pad_right'] + for arr in arrs: + r = np.pad(arr, ((pad_left, pad_right), (0, 0)), constant_values=pad_val) + out_arrs.append(r) + else: + raise ValueError(f'Layer {node.class_name} is not supported for pad_arrs') + return tuple(out_arrs) + + +def stride_arrs(node: Layer, *arrs: np.ndarray): + if node.class_name.endswith('Conv2D'): st_h = node.attributes.attributes['stride_height'] st_w = node.attributes.attributes['stride_width'] - return np.pad(arr, ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)), constant_values=pad_val)[::st_h, ::st_w] + return tuple(arr[::st_h, ::st_w] for arr in arrs) if node.class_name.endswith('Conv1D'): - pad_left = node.attributes.attributes['pad_left'] - pad_right = node.attributes.attributes['pad_right'] st_w = node.attributes.attributes['stride_width'] - return np.pad(arr, ((pad_left, pad_right), (0, 0)), constant_values=pad_val)[::st_w] - return arr + return tuple(arr[::st_w] for arr in arrs) + raise ValueError(f'Layer {node.class_name} is not supported for stride_arrs') @produce_kif.register(Conv1D) @@ -354,10 +369,9 @@ def _(layer: Conv1D | Conv2D): _bias = layer.attributes.attributes['bias'] bias = _bias.data if _bias is not None else 0 k_in, i_in, f_in = get_input_kifs(layer)[0] + k_in, i_in, f_in = pad_arrs(layer, 0, k_in, i_in, f_in) k_in, i_in, f_in = im2col(kernel.shape, k_in, i_in, f_in) - k_in = pad_and_stride_inp_arr(layer, k_in, 0) - i_in = pad_and_stride_inp_arr(layer, i_in, 0) - f_in = pad_and_stride_inp_arr(layer, f_in, 0) + k_in, i_in, f_in = stride_arrs(layer, k_in, i_in, f_in) kernel = kernel.reshape(-1, kernel.shape[-1]) qint_in = QIntervalArray.from_kif(k_in, i_in, f_in) qint_out = qint_in @ kernel From f97d4d8b03ce40f4734ceb949f1bca07f8e21e76 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Mon, 9 Dec 2024 10:14:42 +0000 Subject: [PATCH 51/69] add unary lut support --- .../backends/fpga/passes/hgq_proxy_model.py | 1 - hls4ml/converters/keras_v3/_base.py | 1 + hls4ml/converters/keras_v3/squark/__init__.py | 2 +- .../converters/keras_v3/squark/unary_lut.py | 60 ++++++++++++------- hls4ml/model/optimizer/passes/bit_exact.py | 21 ++++--- 5 files changed, 52 insertions(+), 33 deletions(-) diff --git a/hls4ml/backends/fpga/passes/hgq_proxy_model.py b/hls4ml/backends/fpga/passes/hgq_proxy_model.py index 50e24129ad..77773bf131 100644 --- a/hls4ml/backends/fpga/passes/hgq_proxy_model.py +++ b/hls4ml/backends/fpga/passes/hgq_proxy_model.py @@ -90,7 +90,6 @@ def __init__(self): def format(self, node): params = self._default_function_params(node) - node.attributes['result_t'].precision = node.attributes['table_t'].precision params['config'] = f'unary_lut_config{node.index}' params['table'] = node.get_weights('table').name diff --git a/hls4ml/converters/keras_v3/_base.py b/hls4ml/converters/keras_v3/_base.py index 28d7c7e1e4..6f50ed6523 100644 --- a/hls4ml/converters/keras_v3/_base.py +++ b/hls4ml/converters/keras_v3/_base.py @@ -83,6 +83,7 @@ class KerasV3LayerHandler: """Base class for keras v3 layer handlers. 
Subclass this class to create a handler for a specific layer type.""" handles = () + default_config: DefaultConfig def __call__( self, diff --git a/hls4ml/converters/keras_v3/squark/__init__.py b/hls4ml/converters/keras_v3/squark/__init__.py index 98ea780642..0ce9f5f672 100644 --- a/hls4ml/converters/keras_v3/squark/__init__.py +++ b/hls4ml/converters/keras_v3/squark/__init__.py @@ -1 +1 @@ -from . import _base, einsum, softmax +from . import _base, einsum, softmax, unary_lut diff --git a/hls4ml/converters/keras_v3/squark/unary_lut.py b/hls4ml/converters/keras_v3/squark/unary_lut.py index 8d1f748914..8dee49540f 100644 --- a/hls4ml/converters/keras_v3/squark/unary_lut.py +++ b/hls4ml/converters/keras_v3/squark/unary_lut.py @@ -1,18 +1,16 @@ import typing -from math import prod from typing import Sequence import numpy as np -from quantizers import float_quantize_np, get_fixed_quantizer_np +from quantizers import float_quantize, get_fixed_quantizer from hls4ml.model.types import FixedPrecisionType -from ._base import SQLayerHandler, register +from ._base import KerasV3LayerHandler, SQLayerHandler, register if typing.TYPE_CHECKING: import squark from keras.api import KerasTensor - from squark.quantizer.internal import FixedPointQuantizerBase, FloatPointQuantizer from decimal import Decimal @@ -20,7 +18,7 @@ @register -class SQUnaryLUTHandler(SQLayerHandler): +class SQUnaryLUTHandler(SQLayerHandler, KerasV3LayerHandler): handles = ('squark.layers.activation.QUnaryFunctionLUT',) def handle( @@ -29,8 +27,8 @@ def handle( in_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'], ): - from keras import ops + from squark.quantizer.internal import FixedPointQuantizerBase, FloatPointQuantizer if not layer.enable_iq and not layer.enable_oq: raise ValueError('Currently only support input_quantizer enabled UnaryFunctionLUT layer') @@ -40,20 +38,32 @@ def handle( _min = Decimal(float(ops.min(iq.min))) # type: ignore _max = Decimal(float(ops.max(iq.max))) # type: ignore _eps = Decimal(float(ops.min(iq.epsilon))) # type: ignore - N = (_max - _min) / _eps + N = (_max - _min) / _eps + 1 assert float(N).is_integer(), 'Invalid quantizer range' N = int(N) assert N <= 1e6, 'Too large quantizer range' assert np.log2(N).is_integer(), f'Invalid quantizer range: N must be power of 2, got {N}' - all_inputs = iq(ops.linspace(_min, _max, N)) - all_inputs = ops.array(np.unique(ops.convert_to_numpy(all_inputs))) - table = ops.convert_to_numpy(layer.activation(all_inputs)) + all_inputs = ops.linspace(float(_min), float(_max), N) + + config = {} + config.update(self.default_config) - if isinstance(iq, FixedPointQuantizerBase) and _min < 0: - # idx by binary repr, move the positive part to the front - table_pos, table_neg = table[N // 2 :], table[: N // 2] - table = np.concatenate([table_pos, table_neg]) + if isinstance(iq, FixedPointQuantizerBase): + table = ops.convert_to_numpy(layer.activation(all_inputs)) + if _min < 0: + # idx by binary repr, move the positive part to the front + table_pos, table_neg = table[N // 2 :], table[: N // 2] + table = np.concatenate([table_pos, table_neg]) + else: + assert isinstance(iq, FloatPointQuantizer), f'{layer.name}: Unknown quantizer class {type(iq)}' + mee0 = (ops.convert_to_numpy(x) for x in (iq.m, iq.e, iq.e0)) + assert all( + x.size == 1 for x in mee0 + ), f'{layer.name}: Only homogeneous input quantizer is supported for minifloat' + m, e, e0 = (int(x.ravel().item()) for x in mee0) + all_inputs = float_quantize(all_inputs, m, e, e0) + table = 
ops.convert_to_numpy(layer.activation(all_inputs)) oq = layer.oq.quantizer if isinstance(oq, FixedPointQuantizerBase): @@ -61,7 +71,7 @@ def handle( if round_mode.startswith('S_'): round_mode = round_mode[2:] overflow_mode = oq.overflow_mode - fixed_q = get_fixed_quantizer_np(round_mode, overflow_mode) + fixed_q = get_fixed_quantizer(round_mode, overflow_mode) k, i, f = (ops.convert_to_numpy(x).ravel().item() for x in oq.kif) table = fixed_q(table, k, i, f) # type: ignore @@ -70,14 +80,20 @@ def handle( else: assert isinstance(oq, FloatPointQuantizer) m, e, e0 = (ops.convert_to_numpy(x).ravel().item() for x in (oq.m, oq.e, oq.e0)) - table = float_quantize_np(table, m, e, e0) + table = float_quantize(table, m, e, e0) k, i, f = (int(np.min(x)) for x in minimal_kif(table)) raise NotImplementedError('FloatPointQuantizer is not supported yet') table_t = FixedPrecisionType(k + i + f, k + i, bool(k)) - - return { - 'n_in': prod(layer.input_shape[1:]), - 'table': table, - 'table_t': table_t, - } + table = ops.convert_to_numpy(table) + + config.update( + { + 'class_name': 'UnaryLUT', + 'table_data': table, + 'table_t': table_t, + 'activation': 'unary_lut', + } + ) + + return (config,) diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index d73ff3e1ee..4e3a18d0aa 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -164,7 +164,7 @@ def _(layer: Activation): return (requested_kif(layer),) if fn_name == 'relu': k, i, f = requested_kif(layer) - k[:] = 1 + k = np.ones_like(k) return ((k, i, f),) inp_shape = get_input_shapes(layer)[0] return (_maximum_kif_at_shape(inp_shape),) @@ -421,27 +421,30 @@ def _(layer: Activation): if fn_name == 'linear': return k, i, f if fn_name == 'relu': - k[:] = 0 + print(k.__class__) + k = np.zeros_like(k) return k, i, f if fn_name == 'tanh': i = np.minimum(i, 1) - f[:] = 126 + f = np.full_like(f, 126) return k, i, f if fn_name == 'sigmoid': - k[:] = 0 + k = np.zeros_like(k) i = np.minimum(i, 1) - f[:] = 126 + f = np.full_like(f, 126) return k, i, f - k[:] = 1 - i[:] = 126 - f[:] = 126 + k = np.zeros_like(k) + i = np.full_like(i, 1) + f = np.full_like(f, 126) return k, i, f @produce_kif.register def _(layer: UnaryLUT): - k, i, f = minimal_kif(layer.attributes['table'].data) + table_t = layer.attributes['table_t'].precision + k, I, f = table_t.signed, table_t.integer, table_t.fractional + i = I - k shape = get_output_shape(layer) k = np.full(shape, np.max(k), dtype=np.int8) i = np.full(shape, np.max(i), dtype=np.int8) From 61e76a28796a41fe15a42ec25aa37b9724c8e995 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Tue, 10 Dec 2024 05:42:17 +0000 Subject: [PATCH 52/69] fix bit-exact corner case introduced by reverse flow --- hls4ml/model/optimizer/passes/bit_exact.py | 4 ++-- .../model/optimizer/passes/hgq_proxy_model.py | 23 ++++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 4e3a18d0aa..6d741d2abc 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -97,7 +97,7 @@ def _(layer: FixedPointQuantizer): elif layer.RND == 'RND': f += 1 else: - f += 2 + f += 3 return ((k, i, f),) @@ -524,7 +524,7 @@ def _(node: Softmax): elif inv_inp_t.rounding_mode == RoundingMode.RND: accum_t.width += 1 else: - accum_t.width += 2 + accum_t.width += 3 accum_t.rounding_mode = RoundingMode.TRN 
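# Standalone illustration (assumed helper name, not part of the pass) of the
# accumulator sizing rule applied in the softmax precision handling above:
# summing n_in exponent terms can grow the magnitude by ceil(log2(n_in)) bits,
# and a non-truncating rounding mode on the inverse-table input needs extra
# guard bits (0 for TRN, 1 for RND, 3 otherwise, after this patch).
from math import ceil, log2

def softmax_accum_growth(n_in: int, rounding_mode: str) -> int:
    guard = {'TRN': 0, 'RND': 1}.get(rounding_mode, 3)
    return ceil(log2(n_in)) + guard

# e.g. a softmax over 10 classes with RND rounding widens the accumulator by 5 bits
assert softmax_accum_growth(10, 'RND') == 5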
default_register_precision(node) exp_table_size = node.attributes['exp_table_size'] diff --git a/hls4ml/model/optimizer/passes/hgq_proxy_model.py b/hls4ml/model/optimizer/passes/hgq_proxy_model.py index ef78507b6c..10ff48a680 100644 --- a/hls4ml/model/optimizer/passes/hgq_proxy_model.py +++ b/hls4ml/model/optimizer/passes/hgq_proxy_model.py @@ -76,6 +76,9 @@ def userconf_ifdef(key: str, layer_name: str, model): return key in layer_conf +q_kifRS_t = tuple[np.ndarray, np.ndarray, np.ndarray, str, str] + + class FuseFixedPointQuantizer(OptimizerPass): def match(self, node: Layer): if not isinstance(node, FixedPointQuantizer): @@ -91,13 +94,13 @@ def propagate(self, node: Layer, precision: FixedPrecisionType): node.attributes.attributes['result_t'].precision = precision if not isinstance(node, Reshape): - return + return node inp_layer = get_input_layers(node)[0] can_propagate = len(get_output_layers(inp_layer)) == 1 if not can_propagate: - return + return node new_precision = copy(precision) precision.saturation_bits = 0 @@ -108,15 +111,19 @@ def propagate(self, node: Layer, precision: FixedPrecisionType): def transform(self, model: 'ModelGraph', node: FixedPointQuantizer): from hls4ml.model.optimizer.passes.bit_exact import get_input_layers, get_output_layers - precision: FixedPrecisionType = copy(node.get_output_variable().type.precision) # Rounding and saturation for FixedPointQuantizer are applied in generated code, thus not reflected in result_t. - precision.rounding_mode = node.RND - precision.saturation_mode = node.SAT - ino_layer = get_input_layers(node)[0] - can_fuse = len(get_output_layers(ino_layer)) == 1 + if node.RND == 'TRN' and node.SAT == 'WRAP': + precision: FixedPrecisionType = copy(node.get_output_variable().type.precision) + else: + k, b, i = node.mask_kbi + k, b, i = bool(k.ravel()[0]), int(b.ravel()[0]), int(i.ravel()[0]) + precision = FixedPrecisionType(b, i, k, node.RND, node.SAT) + + inp_layer = get_input_layers(node)[0] + can_fuse = len(get_output_layers(inp_layer)) == 1 if not can_fuse: return False - self.propagate(ino_layer, precision) + self.propagate(inp_layer, precision) model.remove_node(node) return True From e50e731f138ea3d44681d02e66ae05dcbc2818ac Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Tue, 10 Dec 2024 07:59:26 +0000 Subject: [PATCH 53/69] general data_t inference --- hls4ml/model/optimizer/passes/bit_exact.py | 46 +++++++++++----------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 6d741d2abc..099ac56ffd 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -467,37 +467,35 @@ def default_register_precision(layer: Layer): layer.get_output_variable().type = result_t overrides = {} + if 'accum_t' in layer.attributes.attributes: accum_kif = kif_arrs_to_ints((_pk, _pi, _pf)) accum_t = to_hls4ml_fixed(*accum_kif, f'{layer.name}_accum_t') overrides['accum_t'] = accum_t - if 'weight_t' in layer.attributes.attributes: - kernel_kif = kif_arrs_to_ints(minimal_kif(layer.attributes.attributes['weight'].data)) - kernel_t = to_hls4ml_fixed(*kernel_kif, f'{layer.name}_weight_t') - overrides['weight_t'] = kernel_t - - if 'bias_t' in layer.attributes.attributes: - _bias = layer.attributes.attributes.get('bias') - if _bias is None: - bias_t = to_hls4ml_fixed(0, 0, 1, f'{layer.name}_bias_t') - else: - bias_kif = kif_arrs_to_ints(minimal_kif(_bias.data)) - bias_t = to_hls4ml_fixed(*bias_kif, 
f'{layer.name}_bias_t') - overrides['bias_t'] = bias_t - - if 'table' in layer.attributes.attributes: - table_kif = kif_arrs_to_ints(minimal_kif(layer.attributes.attributes['table'].data)) - table_t = to_hls4ml_fixed(*table_kif, f'{layer.name}_table_t') - overrides['table_t'] = table_t - - for k, v in overrides.items(): - layer.attributes.attributes[k] = v - if k[:-2] in layer.attributes.attributes: - weight_var: WeightVariable = layer.attributes.attributes[k[:-2]] + for w_name_t, v in layer.attributes.attributes.items(): + if isinstance(v, NamedType) and w_name_t.endswith('_t'): + w_name = w_name_t[:-2] + if w_name not in layer.attributes.attributes: + continue + _data = layer.attributes.attributes[w_name] + if _data is None: + precision = to_hls4ml_fixed(0, 0, 1, f'{layer.name}_{w_name_t}') + else: + data = _data.data + if not isinstance(data, np.ndarray): + raise ValueError(f'Expected data to be np.ndarray, got {type(data)} on layer {layer.name}') + k, i, f = kif_arrs_to_ints(minimal_kif(data)) + precision = to_hls4ml_fixed(k, i, f, f'{layer.name}_{w_name_t}') + overrides[w_name_t] = precision + + for w_name_t, v in overrides.items(): + layer.attributes.attributes[w_name_t] = v + if w_name_t[:-2] in layer.attributes.attributes: + weight_var: WeightVariable = layer.attributes.attributes[w_name_t[:-2]] weight_var.type = v weight_var.update_precision(v.precision) - layer.model.config.layer_name_precision[f'{layer.name}_{k[:-2]}'] = str(v.precision) + layer.model.config.layer_name_precision[f'{layer.name}_{w_name_t[:-2]}'] = str(v.precision) return (_pk, _pi, _pf), (_rk, _ri, _rf), _out_kif From 4a6b0b59d79db427a70fbde9c83c0b1b118f9307 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 11 Dec 2024 01:41:01 +0000 Subject: [PATCH 54/69] softmax compatbility --- hls4ml/converters/keras_v3/core.py | 18 ++++++- hls4ml/converters/keras_v3/squark/softmax.py | 51 ++++++++++++++------ hls4ml/model/layers.py | 2 + hls4ml/model/optimizer/passes/bit_exact.py | 29 +++++++---- 4 files changed, 72 insertions(+), 28 deletions(-) diff --git a/hls4ml/converters/keras_v3/core.py b/hls4ml/converters/keras_v3/core.py index f01fd06550..f3ac9a0d75 100644 --- a/hls4ml/converters/keras_v3/core.py +++ b/hls4ml/converters/keras_v3/core.py @@ -1,5 +1,6 @@ import inspect import typing +from math import prod from typing import Any, Sequence import numpy as np @@ -178,12 +179,25 @@ def handle( in_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'], ): + ax = layer.axis + ax = ax if ax >= 0 else len(in_tensors[0].shape) + ax + # io_stream asserts axis=-1, convert to -1 when it is + n_outer: int = prod(in_tensors[0].shape[1:ax]) # type: ignore + n_inner: int = prod(in_tensors[0].shape[ax + 1 :]) # type: ignore + ax = -1 if ax == len(in_tensors[0].shape) - 1 else ax config = {} config.update(self.default_config) - - config['class_name'] = 'Softmax' + if len(in_tensors) == 2: + raise NotImplementedError("Masked softmax not supported yet") + config['class_name'] = 'MaskedSoftmax' + elif len(in_tensors) == 1: + config['class_name'] = 'Softmax' + else: + raise ValueError(f"Too many inputs for softmax layer {layer.name}: expected 1 or 2, got {len(in_tensors)}") config['axis'] = layer.axis config['activation'] = 'softmax' + config['n_outer'] = (n_outer,) + config['n_inner'] = n_inner return (config,) diff --git a/hls4ml/converters/keras_v3/squark/softmax.py b/hls4ml/converters/keras_v3/squark/softmax.py index c615031540..8e3af35c58 100644 --- a/hls4ml/converters/keras_v3/squark/softmax.py +++ 
b/hls4ml/converters/keras_v3/squark/softmax.py @@ -4,7 +4,6 @@ from hls4ml.model.types import FixedPrecisionType, RoundingMode, SaturationMode -from ..core import KV3SoftmaxHandler from ._base import SQLayerHandler, register if typing.TYPE_CHECKING: @@ -40,7 +39,7 @@ def fixed_quantizer_to_hls4ml_t(q: 'FixedPointQuantizerBase', take_max=False): @register -class SQSoftmaxDenseHandler(SQLayerHandler, KV3SoftmaxHandler): +class SQSoftmaxDenseHandler(SQLayerHandler): handles = ('squark.layers.softmax.QSoftmax',) def handle( @@ -50,7 +49,22 @@ def handle( out_tensors: Sequence['KerasTensor'], ): assert not layer._allow_heterogeneous_table, 'Heterogeneous table is not supported in QSoftmax layer' - assert len(layer.axis) == 1, 'Support softmax along one axis. Use transpose & reshape as workaround.' + if len(layer.axis) == 1: + ax = layer.axis[0] + ax = ax if ax >= 0 else len(in_tensors[0].shape) + ax + # io_stream asserts axis=-1, convert to -1 when it is + n_outer: int = prod(in_tensors[0].shape[1:ax]) # type: ignore + n_inner: int = prod(in_tensors[0].shape[ax + 1 :]) # type: ignore + n_in: int = in_tensors[0].shape[ax] # type: ignore + ax = -1 if ax == len(in_tensors[0].shape) - 1 else ax + else: # softmax along multiple axes + axs = [ax if ax >= 0 else len(in_tensors[0].shape) + ax for ax in layer.axis] + axs = sorted(axs) + assert all(ax1 - ax0 == 1 for ax0, ax1 in zip(axs[:-1], axs[1:])), 'Softmax must act on adjacent axes' + n_outer: int = prod(in_tensors[0].shape[1 : axs[0]]) # type: ignore + n_inner: int = prod(in_tensors[0].shape[axs[-1] + 1 :]) # type: ignore + n_in: int = prod(in_tensors[0].shape[axs[0] : axs[-1] + 1]) # type: ignore + ax = -1 # if n_inner == 1 else 999 # 999 as placeholder from keras import ops from squark.quantizer.internal import FixedPointQuantizerBase @@ -60,7 +74,7 @@ def handle( if impl == 'stable': exp_table_size = 2 ** int(ops.convert_to_numpy(ops.max(layer.exp_table.iq.quantizer.bits))) else: - exp_table_size = None + exp_table_size = None # Placeholder, will be overridden in bit-exact pass exp_oq = layer.exp_table.oq.quantizer inv_oq = layer.inv_table.oq.quantizer @@ -73,24 +87,26 @@ def handle( inv_table_size = 2**inv_inp_t.width - config = super().handle(layer, in_tensors, out_tensors) - assert len(config) == 1 parallelization_factor = layer.parallelization_factor - ax = layer.axis[0] - ax = ax if ax >= 0 else len(in_tensors[0].shape) + ax - # io_stream asserts axis=-1, convert to -1 when it is - n_outer: int = prod(in_tensors[0].shape[1:ax]) # type: ignore - n_inner: int = prod(in_tensors[0].shape[ax + 1 :]) # type: ignore - ax = -1 if ax == len(in_tensors[0].shape) - 1 else ax - n_in: int = in_tensors[0].shape[ax] # type: ignore if parallelization_factor < 0: parallelization_factor = n_outer * n_inner - config[0].update( + if len(in_tensors) == 2: + raise NotImplementedError("Masked softmax not supported yet") + class_name = 'MaskedSoftmax' + elif len(in_tensors) == 1: + class_name = 'Softmax' + else: + raise ValueError(f"Too many inputs for softmax layer {layer.name}: expected 1 or 2, got {len(in_tensors)}") + + config = {} + config.update(self.default_config) + config.update( { 'axis': ax, 'n_in': n_in, + 'activation': 'softmax', 'n_outer': n_outer, 'n_inner': n_inner, 'implementation': impl, @@ -101,11 +117,14 @@ def handle( 'inv_inp_t': inv_inp_t, 'exp_scale': exp_scale, 'parallelization_factor': parallelization_factor, + 'class_name': class_name, } ) + if layer.stable: inp_norm_t = fixed_quantizer_to_hls4ml_t(layer.exp_table.iq.quantizer) 
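# Standalone sketch of the n_outer / n_in / n_inner factorisation built a few
# lines above: dims before the softmax axes, the axes themselves, and dims after
# them. `softmax_dims` is an illustrative helper (batch dimension assumed already
# stripped), not part of the handler.
from math import prod

def softmax_dims(shape: tuple[int, ...], axes: tuple[int, ...]) -> tuple[int, int, int]:
    axes = tuple(sorted(ax if ax >= 0 else len(shape) + ax for ax in axes))
    assert all(a1 - a0 == 1 for a0, a1 in zip(axes[:-1], axes[1:])), 'softmax axes must be adjacent'
    n_outer = prod(shape[: axes[0]])
    n_in = prod(shape[axes[0] : axes[-1] + 1])
    n_inner = prod(shape[axes[-1] + 1 :])
    return n_outer, n_in, n_inner

# A (4, 8, 16) activation with softmax over the last axis runs 32 independent
# 16-way softmaxes (n_outer * n_inner instances of the n_in-wide kernel).
assert softmax_dims((4, 8, 16), (-1,)) == (32, 16, 1)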
inp_norm_t.saturation_mode = SaturationMode.WRAP inp_norm_t.rounding_mode = RoundingMode.TRN - config[0]['inp_norm_t'] = inp_norm_t - return config + config['inp_norm_t'] = inp_norm_t + + return (config,) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 3894c218b3..35d9752999 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -990,6 +990,8 @@ class Softmax(Activation): _expected_attributes = [ Attribute('n_in'), Attribute('activation', value_type=str), + Attribute('n_outer', value_type=int, default=1), + Attribute('n_inner', value_type=int, default=1), ChoiceAttribute('implementation', ['latency', 'stable', 'argmax', 'legacy'], default='stable'), ConfigurableAttribute('skip', value_type=bool, default=False), TypeAttribute( diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 099ac56ffd..64d377c14a 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -510,11 +510,9 @@ def _(node: Softmax): inv_inp_t: FixedPrecisionType = node.attributes['inv_inp_t'].precision accum_t = copy(inv_inp_t) if inv_inp_t.saturation_mode != SaturationMode.WRAP: - accum_t.saturation_bits = SaturationMode.WRAP - inp_shape = get_input_shapes(node)[0] - axis = node.attributes['axis'] - L = inp_shape[axis] # type: ignore - scale = ceil(log2(L)) + accum_t.saturation_mode = SaturationMode.WRAP + n_in = node.attributes['n_in'] + scale = ceil(log2(n_in)) accum_t.width += scale accum_t.integer += scale if inv_inp_t.rounding_mode == RoundingMode.TRN: @@ -525,11 +523,22 @@ def _(node: Softmax): accum_t.width += 3 accum_t.rounding_mode = RoundingMode.TRN default_register_precision(node) - exp_table_size = node.attributes['exp_table_size'] - if exp_table_size is None: - k, i, f = get_input_kifs(node)[0] - b = np.max(k) + np.max(i) + np.max(f) - exp_table_size = 2 ** int(b) + impl = node.attributes['implementation'] + match impl: + case 'latency': + k, i, f = get_input_kifs(node)[0] + b = np.max(k) + np.max(i) + np.max(f) + case 'stable': + inp_norm_t: FixedPrecisionType = node.attributes['inp_norm_t'].precision + b = inp_norm_t.width + case 'lagency': + raise ValueError('lagency softmax is not supported') + case 'argmax': + b = 0 + case _: + raise ValueError(f'Unknown softmax implementation {impl}') + + exp_table_size = 2 ** int(b) node.attributes['exp_table_size'] = exp_table_size node.attributes['accum_t'] = NamedType(f'{node.name}_accum_t', accum_t) From a6128ae8d70ef4f2b830e236670976df8a7cac44 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 11 Dec 2024 02:08:34 +0000 Subject: [PATCH 55/69] fix typo in einsum handler --- hls4ml/converters/keras_v3/squark/einsum.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hls4ml/converters/keras_v3/squark/einsum.py b/hls4ml/converters/keras_v3/squark/einsum.py index 10e1c0f5b5..a04d10916b 100644 --- a/hls4ml/converters/keras_v3/squark/einsum.py +++ b/hls4ml/converters/keras_v3/squark/einsum.py @@ -28,11 +28,11 @@ def handle( # fmt: off assert all(d is not None for d in inp0_shape), \ - f'Error when processing {layer.name}: Einsum layer requires fully inp shapes, got {inp0_shape} for inp1' + f'Error when processing {layer.name}: Einsum layer requires full inp shapes, got {inp0_shape} for inp1' assert all(d is not None for d in inp1_shape), \ - f'Error when processing {layer.name}: Einsum layer requires fully inp shapes, got {inp1_shape} for inp2' + f'Error when processing {layer.name}: Einsum layer requires full inp 
shapes, got {inp1_shape} for inp2' assert all(d is not None for d in out_shape), \ - f'Error when processing {layer.name}: EinsumDense layer requires fully out shapes. got {out_shape} for output' + f'Error when processing {layer.name}: EinsumDense layer requires full out shapes. got {out_shape} for output' # fmt: on equation = strip_batch_dim(layer.equation, einsum_dense=False) From 5190c33426dfd350a7d8e758eceb37aaeb26a05b Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 11 Dec 2024 02:37:37 +0000 Subject: [PATCH 56/69] fix more typos --- hls4ml/converters/keras_v3/squark/einsum.py | 6 +++--- hls4ml/converters/keras_v3/squark/softmax.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hls4ml/converters/keras_v3/squark/einsum.py b/hls4ml/converters/keras_v3/squark/einsum.py index a04d10916b..0d0e0ed4c2 100644 --- a/hls4ml/converters/keras_v3/squark/einsum.py +++ b/hls4ml/converters/keras_v3/squark/einsum.py @@ -10,7 +10,7 @@ @register -class SQEinsumDenseHandler(SQLayerHandler): +class SQEinsumHandler(SQLayerHandler): handles = ('squark.layers.ops.einsum.QEinsum',) def handle( @@ -19,8 +19,8 @@ def handle( in_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'], ): - assert len(in_tensors) == 2, 'EinsumDense layer must have exactly one input tensor' - assert len(out_tensors) == 1, 'EinsumDense layer must have exactly one output tensor' + assert len(in_tensors) == 2, 'Einsum layer must have exactly two input tensors' + assert len(out_tensors) == 1, 'Einsum layer must have exactly one output tensor' inp0_shape: tuple[int, ...] = in_tensors[0].shape[1:] # type: ignore inp1_shape: tuple[int, ...] = in_tensors[1].shape[1:] # type: ignore diff --git a/hls4ml/converters/keras_v3/squark/softmax.py b/hls4ml/converters/keras_v3/squark/softmax.py index 8e3af35c58..a4b7179db6 100644 --- a/hls4ml/converters/keras_v3/squark/softmax.py +++ b/hls4ml/converters/keras_v3/squark/softmax.py @@ -39,7 +39,7 @@ def fixed_quantizer_to_hls4ml_t(q: 'FixedPointQuantizerBase', take_max=False): @register -class SQSoftmaxDenseHandler(SQLayerHandler): +class SQSoftmaxHandler(SQLayerHandler): handles = ('squark.layers.softmax.QSoftmax',) def handle( From 9cdb67c784272b006964f06b2196f9a27181ab25 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 11 Dec 2024 03:08:46 +0000 Subject: [PATCH 57/69] MHA :tada: --- hls4ml/converters/keras_v3/squark/__init__.py | 2 +- hls4ml/converters/keras_v3/squark/_base.py | 2 +- .../keras_v3/squark/multi_head_attention.py | 122 ++++++++++++++++++ 3 files changed, 124 insertions(+), 2 deletions(-) create mode 100644 hls4ml/converters/keras_v3/squark/multi_head_attention.py diff --git a/hls4ml/converters/keras_v3/squark/__init__.py b/hls4ml/converters/keras_v3/squark/__init__.py index 0ce9f5f672..f0f8d1c89b 100644 --- a/hls4ml/converters/keras_v3/squark/__init__.py +++ b/hls4ml/converters/keras_v3/squark/__init__.py @@ -1 +1 @@ -from . import _base, einsum, softmax, unary_lut +from . 
import _base, einsum, multi_head_attention, softmax, unary_lut diff --git a/hls4ml/converters/keras_v3/squark/_base.py b/hls4ml/converters/keras_v3/squark/_base.py index 12a4cc729f..383b617568 100644 --- a/hls4ml/converters/keras_v3/squark/_base.py +++ b/hls4ml/converters/keras_v3/squark/_base.py @@ -67,7 +67,7 @@ def __call__( ): ret = super().__call__(layer, in_tensors, out_tensors) - if layer._enable_iq: + if layer._enable_iq and hasattr(layer, '_iq'): if len(in_tensors) > 1: iq_confs = [extract_fixed_quantizer_config(q, tensor, True) for q, tensor in zip(layer._iq, in_tensors)] else: diff --git a/hls4ml/converters/keras_v3/squark/multi_head_attention.py b/hls4ml/converters/keras_v3/squark/multi_head_attention.py new file mode 100644 index 0000000000..f096ba76a7 --- /dev/null +++ b/hls4ml/converters/keras_v3/squark/multi_head_attention.py @@ -0,0 +1,122 @@ +import typing +from inspect import Signature +from typing import Sequence + +import numpy as np + +from ._base import SQEinsumDenseHandler, SQLayerHandler, register +from .einsum import SQEinsumHandler +from .softmax import SQSoftmaxHandler + +if typing.TYPE_CHECKING: + import squark + from keras.api import KerasTensor + + +@register +class SQMultiHeadAttentionHandler(SQLayerHandler): + handles = ('squark.layers.multi_head_attention.QMultiHeadAttention',) + + def handle( + self, + layer: 'squark.layers.QMultiHeadAttention', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + from keras import KerasTensor + from squark.layers import QEinsum + + assert len(in_tensors) in (3, 4), 'MultiHead layer must have 3 (Q, K, V) or 4 (Q, K, V, M) input tensors' + assert len(out_tensors) == 1, 'Attention score output is not supported yet' + assert len(in_tensors) == 3, 'Mask tensor is not supported yet' + tensor_q, tensor_k, tensor_v, *mask = in_tensors + tensor_O, *tensor_attn = out_tensors + unique_name: str = layer.name + + node_index = layer.input[0]._keras_history.node_index + assert all( + [node_index == inp._keras_history.node_index for inp in layer.input[1:]] + ), f'Critical error handling layer {layer.name}' + node = layer._inbound_nodes[node_index] + + args = node.arguments.args + kwargs = node.arguments.kwargs + sig: Signature = layer._call_signature + + # map everything to kwargs + bound = sig.bind(*args, **kwargs) + bound.apply_defaults() + + tensor_q = bound.arguments['query'] + tensor_k = bound.arguments['key'] + tensor_v = bound.arguments['value'] + tensor_q_mask = bound.arguments['query_mask'] + tensor_k_mask = bound.arguments['key_mask'] + tensor_v_mask = bound.arguments['value_mask'] + tensor_attn_mask = bound.arguments['attention_mask'] + return_scores = bound.arguments['return_attention_scores'] # noqa: F841 + + n_mask_def = ( + np.sum( + [ + tensor_q_mask is not None, + tensor_k_mask is not None, + tensor_v_mask is not None, + tensor_attn_mask is not None, + ] + ) + <= 1 + ) + assert n_mask_def, f'Layer {layer.name} has {n_mask_def} masks defined, expected at most 1' + + unique_name = f'{layer.name}_{node_index}' + to_Q = layer.query_dense + to_K = layer.key_dense + to_V = layer.value_dense + to_O = layer.output_dense + softmax = layer._softmax + + Q_batch_shape = to_Q.full_output_shape + K_batch_shape = to_K.full_output_shape + V_batch_shape = to_V.full_output_shape + # O_batch_shape = to_O.full_output_shape + n_head = layer.num_heads + score_batch_shape = (None, n_head, *Q_batch_shape[1:-2], *K_batch_shape[1:-2]) + + einsum_QK = QEinsum(layer._dot_product_equation, name=f'{layer.name}_QK', 
enable_iq=False, enable_oq=False) + einsum_sV = QEinsum(layer._combine_equation, name=f'{layer.name}_aV', enable_iq=False, enable_oq=False) + + tensor_Q = KerasTensor(name=f'{unique_name}_Q', shape=Q_batch_shape) + tensor_K = KerasTensor(name=f'{unique_name}_K', shape=K_batch_shape) + tensor_V = KerasTensor(name=f'{unique_name}_V', shape=V_batch_shape) + + pre_O_shape = (None, *tensor_q.shape[1:-1], layer.num_heads, layer.value_dim) + tensor_pre_O = KerasTensor(name=f'{unique_name}_pre_O', shape=pre_O_shape) + # tensor_O = KerasTensor(name=f'{name}_QK', shape=O_batch_shape) + tensor_pre_score = KerasTensor(name=f'{unique_name}_pre_score', shape=score_batch_shape) + tensor_score = KerasTensor(name=f'{unique_name}_score', shape=score_batch_shape) + + einsum_handler = SQEinsumHandler() + einsum_dense_handler = SQEinsumDenseHandler() + softmax_handler = SQSoftmaxHandler() + + config_to_Q = einsum_dense_handler(to_Q, [tensor_q], [tensor_Q]) + config_to_K = einsum_dense_handler(to_K, [tensor_k], [tensor_K]) + config_to_V = einsum_dense_handler(to_V, [tensor_v], [tensor_V]) + config_einsum_KQ = einsum_handler(einsum_QK, [tensor_K, tensor_Q], [tensor_pre_score]) + config_softmax = softmax_handler(softmax, [tensor_pre_score], [tensor_score]) + config_einsum_sV = einsum_handler(einsum_sV, [tensor_score, tensor_V], [tensor_pre_O]) + config_to_O = einsum_dense_handler(to_O, [tensor_pre_O], [tensor_O]) + + configs = ( + *config_to_Q, + *config_to_K, + *config_to_V, + *config_einsum_KQ, + *config_softmax, + *config_einsum_sV, + *config_to_O, + ) + for conf in configs: + conf['name'] = f'{layer.name}_{conf["name"]}' + return configs From 5bcae969bba4157f1f88ba9c92665c1e9ff9ba77 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 11 Dec 2024 19:39:56 +0000 Subject: [PATCH 58/69] fix einsum and softmax template typos --- hls4ml/templates/vivado/nnet_utils/nnet_activation.h | 4 ++-- hls4ml/templates/vivado/nnet_utils/nnet_einsum.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index 5b51f41a5a..7df968bd94 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -396,8 +396,8 @@ void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } template -void softmax_multidim(data_T data[CONFIG_T::outer * CONFIG_T::n_in * CONFIG_T::n_inner], - res_T res[CONFIG_T::outer * CONFIG_T::n_in * CONFIG_T::n_inner]) { +void softmax_multidim(data_T data[CONFIG_T::n_outer * CONFIG_T::n_in * CONFIG_T::n_inner], + res_T res[CONFIG_T::n_outer * CONFIG_T::n_in * CONFIG_T::n_inner]) { #pragma HLS inline #pragma HLS allocation instances = softmax limit = CONFIG_T::parallelization_factor function data_T buffer_in[CONFIG_T::n_in]; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_einsum.h b/hls4ml/templates/vivado/nnet_utils/nnet_einsum.h index 18f323f39d..cc2917783c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_einsum.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_einsum.h @@ -42,7 +42,6 @@ void einsum(const data0_T data0[CONFIG_T::tpose_inp0_conf::N], const data1_T dat #pragma HLS ARRAY_PARTITION variable = tpose_i0 complete #pragma HLS ARRAY_PARTITION variable = tpose_i1 complete #pragma HLS ARRAY_PARTITION variable = tpose_o complete - #pragma HLS ARRAY_PARTITION variable = res_buffer complete nnet::transpose(data0, tpose_i0); nnet::transpose(data1, tpose_i1); From d780de2c6743e8f70548902ec31d0a5dbaf8e82c Mon 
Sep 17 00:00:00 2001
From: Chang Sun
Date: Thu, 12 Dec 2024 06:40:31 +0000
Subject: [PATCH 59/69] assert einsum ops do not include direct sum operation

---
 hls4ml/model/layers.py       |  8 ++++++++
 hls4ml/utils/einsum_utils.py | 19 ++++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index 35d9752999..f0d20b824a 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -1688,6 +1688,10 @@ def initialize(self):
         kernel_shape = kernel.shape
 
         recipe = parse_einsum(equation, inp_shape, kernel_shape)
+        assert not any(recipe['direct_sum_axis']), (
+            'Do not put direct-sum indices (i.e., indices that appear in only one operand and not in the output) '
+            'in the equation. Apply an explicit sum over these axes before the einsum instead.'
+        )
         inp_tpose_idxs, ker_tpose_idxs = recipe['in_transpose_idxs']
         out_tpose_idxs = recipe['out_transpose_idxs']
 
@@ -1759,6 +1763,10 @@ def initialize(self):
         out_shape = self.attributes['out_shape']
 
         recipe = parse_einsum(equation, inp0_shape, inp1_shape)
+        assert not any(recipe['direct_sum_axis']), (
+            'Do not put direct-sum indices (i.e., indices that appear in only one operand and not in the output) '
+            'in the equation. Apply an explicit sum over these axes before the einsum instead.'
+        )
         inp0_tpose_idxs, inp1_tpose_idxs = recipe['in_transpose_idxs']
         out_tpose_idxs = recipe['out_transpose_idxs']
 
diff --git a/hls4ml/utils/einsum_utils.py b/hls4ml/utils/einsum_utils.py
index c175f9994a..43ceb2ba96 100644
--- a/hls4ml/utils/einsum_utils.py
+++ b/hls4ml/utils/einsum_utils.py
@@ -5,6 +5,7 @@
 
 
 class EinsumRecipe(TypedDict):
+    direct_sum_axis: tuple[tuple[int, ...], tuple[int, ...]]
     in_transpose_idxs: tuple[tuple[int, ...], tuple[int, ...]]
     L0: int
     L1: int
@@ -127,7 +128,7 @@ def _validate_einsum_expr(fn: str, shape0: tuple[int, ...], shape1: tuple[int, .
def parse_einsum(fn: str, input_shape0: tuple[int, ...], input_shape1: tuple[int, ...]) -> EinsumRecipe: - """Execute einsum operation on two input arrays + """Parse einsum operation on two input arrays, return a recipe for execution Parameters ---------- @@ -140,8 +141,8 @@ def parse_einsum(fn: str, input_shape0: tuple[int, ...], input_shape1: tuple[int Returns ------- - np.ndarray - output array + EinsumRecipe + einsum recipe; executed by _exec_einsum """ fn, _ = _validate_einsum_expr(fn, input_shape0, input_shape1) @@ -158,6 +159,12 @@ def parse_einsum(fn: str, input_shape0: tuple[int, ...], input_shape1: tuple[int inplace = sorted(_inplace, key=lambda x: in1.index(x)) invariant0 = sorted((s_out - _common) & s_in0, key=lambda x: in0.index(x)) invariant1 = sorted((s_out - _common) & s_in1, key=lambda x: in1.index(x)) + direct_sum0 = s_in0 - s_out - _common + direct_sum1 = s_in1 - s_out - _common + direct_sum_axis = ( + tuple(sorted(in0.index(x) for x in direct_sum0)), + tuple(sorted(in1.index(x) for x in direct_sum1)), + ) contract_idxs = tuple(map(in0.index, contract)), tuple(map(in1.index, contract)) inplace_idxs = tuple(map(in0.index, inplace)), tuple(map(in1.index, inplace)) @@ -178,6 +185,7 @@ def parse_einsum(fn: str, input_shape0: tuple[int, ...], input_shape1: tuple[int out_transpose_idx = tuple(int(i) for i in _out_transpose_idx) return EinsumRecipe( + direct_sum_axis=direct_sum_axis, in_transpose_idxs=(transpose_idx0, transpose_idx1), out_interpert_shape=out_shape_pretranspose, out_transpose_idxs=out_transpose_idx, @@ -205,6 +213,11 @@ def _exec_einsum(recipe: EinsumRecipe, input0: np.ndarray, input1: np.ndarray) - np.ndarray output array """ + sum_axis0, sum_axis1 = recipe['direct_sum_axis'] + if sum_axis0: + input0 = np.sum(input0, axis=sum_axis0) + if sum_axis1: + input1 = np.sum(input1, axis=sum_axis1) input0 = input0.transpose(recipe['in_transpose_idxs'][0]).ravel() input1 = input1.transpose(recipe['in_transpose_idxs'][1]).ravel() output = np.zeros(recipe['L0'] * recipe['L1'] * recipe['I'], dtype=input0.dtype) From e3cef20c888c0f0b1e1416827e0df52cceab31a3 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 13 Dec 2024 15:46:03 +0000 Subject: [PATCH 60/69] style --- .../keras_v3/squark/multi_head_attention.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/hls4ml/converters/keras_v3/squark/multi_head_attention.py b/hls4ml/converters/keras_v3/squark/multi_head_attention.py index f096ba76a7..4a1081435b 100644 --- a/hls4ml/converters/keras_v3/squark/multi_head_attention.py +++ b/hls4ml/converters/keras_v3/squark/multi_head_attention.py @@ -56,18 +56,15 @@ def handle( tensor_attn_mask = bound.arguments['attention_mask'] return_scores = bound.arguments['return_attention_scores'] # noqa: F841 - n_mask_def = ( - np.sum( - [ - tensor_q_mask is not None, - tensor_k_mask is not None, - tensor_v_mask is not None, - tensor_attn_mask is not None, - ] - ) - <= 1 + n_mask_def = np.sum( + [ + tensor_q_mask is not None, + tensor_k_mask is not None, + tensor_v_mask is not None, + tensor_attn_mask is not None, + ] ) - assert n_mask_def, f'Layer {layer.name} has {n_mask_def} masks defined, expected at most 1' + assert n_mask_def <= 1, f'Layer {layer.name} has {n_mask_def} masks defined, expected at most 1' unique_name = f'{layer.name}_{node_index}' to_Q = layer.query_dense From 2bcf9e7884d5446a9ff189d8a103940600a90228 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 13 Dec 2024 15:48:41 +0000 Subject: [PATCH 61/69] fix mha layer indexing --- 
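Note on the fix below: `_keras_history` is Keras' bookkeeping attached to every symbolic tensor; it is a namedtuple recording the producing operation, the index of the call that produced the tensor (`node_index`), and the tensor's position among that call's outputs. Reading the index off `tensor_q`, the tensor handed to this particular invocation, rather than off `layer.input[0]` (which always reflects the layer's first recorded call) keeps the handler consistent when the attention layer is called more than once. A rough sketch of the lookup, assuming Keras 3 internals as already used elsewhere in this handler:

    # sketch only: recover the Node describing the call this handler is processing
    op, node_index, tensor_index = tensor_q._keras_history  # KerasHistory namedtuple
    node = layer._inbound_nodes[node_index]                 # holds the call's args/kwargs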
hls4ml/converters/keras_v3/squark/multi_head_attention.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/converters/keras_v3/squark/multi_head_attention.py b/hls4ml/converters/keras_v3/squark/multi_head_attention.py index 4a1081435b..b580bf90f2 100644 --- a/hls4ml/converters/keras_v3/squark/multi_head_attention.py +++ b/hls4ml/converters/keras_v3/squark/multi_head_attention.py @@ -29,11 +29,11 @@ def handle( assert len(in_tensors) in (3, 4), 'MultiHead layer must have 3 (Q, K, V) or 4 (Q, K, V, M) input tensors' assert len(out_tensors) == 1, 'Attention score output is not supported yet' assert len(in_tensors) == 3, 'Mask tensor is not supported yet' - tensor_q, tensor_k, tensor_v, *mask = in_tensors + tensor_q, *_ = in_tensors tensor_O, *tensor_attn = out_tensors unique_name: str = layer.name - node_index = layer.input[0]._keras_history.node_index + node_index: int = tensor_q._keras_history.node_index # type: ignore assert all( [node_index == inp._keras_history.node_index for inp in layer.input[1:]] ), f'Critical error handling layer {layer.name}' From c426ddc4092eff84b9f9a941c7ceafa5d6ab21b4 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sat, 14 Dec 2024 21:32:55 +0000 Subject: [PATCH 62/69] switch to model opt --- hls4ml/model/optimizer/passes/bit_exact.py | 26 +++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 64d377c14a..32f1334023 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -24,7 +24,7 @@ Reshape, Softmax, ) -from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.optimizer import ModelOptimizerPass, OptimizerPass from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer, UnaryLUT from hls4ml.model.types import FixedPrecisionType, NamedType, RoundingMode, SaturationMode, WeightVariable from hls4ml.utils.qinterval import QIntervalArray, einsum, minimal_kif @@ -545,22 +545,32 @@ def _(node: Softmax): @register_precision.register def _(node: UnaryLUT): - k, i, f = minimal_kif(node.attributes['table'].data) + k, i, f = minimal_kif(node.attributes['table'].data) # type: ignore k, i, f = bool(np.max(k)), int(np.max(i)), int(np.max(f)) table_t = to_hls4ml_fixed(k, i, f, f'{node.name}_table_t') node.attributes['table_t'] = table_t default_register_precision(node) -class BitExact(OptimizerPass): - def match(self, node): - if node.attributes.get('bit_exact_transformed'): +class BitExact(ModelOptimizerPass): + def __init__(self): + pass + + def _match(self, model: 'ModelGraph'): + if not any(isinstance(node, FixedPointQuantizer) for node in model.graph.values()): return False return True - def transform(self, model, node): - register_precision(node) - node.attributes['bit_exact_transformed'] = True + def transform(self, model): + if not self._match(model): + return False + + for node in model.graph.values(): + if node.attributes.get('bit_exact_transformed'): + return False + register_precision(node) + node.attributes['bit_exact_transformed'] = True + return False From a749c2735d10763d223849eb8716ee87cd868b2a Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 15 Dec 2024 01:55:35 +0000 Subject: [PATCH 63/69] pooling layers --- hls4ml/model/optimizer/passes/bit_exact.py | 92 ++++++++++------------ 1 file changed, 41 insertions(+), 51 deletions(-) diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py 
index 32f1334023..c4ba1bb2e0 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -1,7 +1,7 @@ import typing from copy import copy from functools import reduce, singledispatch -from math import ceil, log2 +from math import ceil, log2, prod from typing import Sequence from warnings import warn @@ -17,10 +17,12 @@ Einsum, EinsumDense, GlobalPooling1D, + GlobalPooling2D, Input, Layer, Merge, Pooling1D, + Pooling2D, Reshape, Softmax, ) @@ -101,52 +103,6 @@ def _(layer: FixedPointQuantizer): return ((k, i, f),) -@request_kif.register(Pooling1D) -# @request_kif.register(Pooling2D) -@request_kif.register(GlobalPooling1D) -# @request_kif.register(GlobalPooling2D) -def _(layer: Pooling1D | GlobalPooling1D): - # inp_shape = get_input_shapes(layer)[0] - out_shape = get_output_shape(layer) - pool_width = layer.attributes.attributes['pool_width'] - stride_width = layer.attributes.attributes['stride_width'] - pool_op = layer.attributes.attributes['pool_op'] - if isinstance(layer, Pooling1D): - pad_0_0: int = layer.attributes.attributes['pad_left'] - else: - pad_0_0 = 0 - is_ch_last = layer.attributes.attributes['data_format'] == 'channels_last' - - k = np.ones(out_shape, dtype=np.int8) - i = np.full(out_shape, -127, dtype=np.int8) - f = np.full(out_shape, 126, dtype=np.int8) - - _, i_out, f_out = requested_kif(layer) - - if not is_ch_last: - i = np.moveaxis(i, 0, -1) - f = np.moveaxis(f, 0, -1) - - for idx_out in range(k.shape[-1]): - i_in_0 = i_out * stride_width - pad_0_0 - i_in_1 = i_in_0 + pool_width - if i_in_0 < 0: - i_in_0 = 0 - i[..., i_in_0:i_in_1] = i_out[..., idx_out] - f[..., i_in_0:i_in_1] = f_out[..., idx_out] - - if not is_ch_last: - i = np.moveaxis(i, -1, 0) - f = np.moveaxis(f, -1, 0) - - if pool_op == 'Average': - ln2_size = np.log2(pool_width) - i += np.ceil(ln2_size).astype(np.int8) - if not ln2_size.is_integer(): - f[:] = 126 - return ((k, i, f),) - - @request_kif.register def _(layer: Reshape): inp_shape = get_input_shapes(layer)[0] @@ -332,7 +288,7 @@ def im2col(kernel_size: Sequence[int], *arrs: np.ndarray): def pad_arrs(node: Layer, pad_val: float = 0, *arrs: np.ndarray): out_arrs = [] - if node.class_name.endswith('Conv2D'): + if node.class_name.endswith('2D'): pad_top = node.attributes.attributes['pad_top'] pad_bottom = node.attributes.attributes['pad_bottom'] pad_left = node.attributes.attributes['pad_left'] @@ -340,7 +296,7 @@ def pad_arrs(node: Layer, pad_val: float = 0, *arrs: np.ndarray): for arr in arrs: r = np.pad(arr, ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)), constant_values=pad_val) out_arrs.append(r) - elif node.class_name.endswith('Conv1D'): + elif node.class_name.endswith('1D'): pad_left = node.attributes.attributes['pad_left'] pad_right = node.attributes.attributes['pad_right'] for arr in arrs: @@ -352,11 +308,11 @@ def pad_arrs(node: Layer, pad_val: float = 0, *arrs: np.ndarray): def stride_arrs(node: Layer, *arrs: np.ndarray): - if node.class_name.endswith('Conv2D'): + if node.class_name.endswith('2D'): st_h = node.attributes.attributes['stride_height'] st_w = node.attributes.attributes['stride_width'] return tuple(arr[::st_h, ::st_w] for arr in arrs) - if node.class_name.endswith('Conv1D'): + if node.class_name.endswith('1D'): st_w = node.attributes.attributes['stride_width'] return tuple(arr[::st_w] for arr in arrs) raise ValueError(f'Layer {node.class_name} is not supported for stride_arrs') @@ -365,6 +321,7 @@ def stride_arrs(node: Layer, *arrs: np.ndarray): @produce_kif.register(Conv1D) 
@produce_kif.register(Conv2D) def _(layer: Conv1D | Conv2D): + assert layer.attributes.attributes['data_format'] == 'channels_last', 'Only channels_last format is supported' kernel = layer.attributes.attributes['weight'].data _bias = layer.attributes.attributes['bias'] bias = _bias.data if _bias is not None else 0 @@ -380,6 +337,39 @@ def _(layer: Conv1D | Conv2D): return k.astype(np.int8), i, f +@produce_kif.register(Pooling1D) +@produce_kif.register(Pooling2D) +@produce_kif.register(GlobalPooling1D) +@produce_kif.register(GlobalPooling2D) +def _(layer: Pooling1D | Pooling2D | GlobalPooling1D | GlobalPooling2D): + if isinstance(layer, (Pooling1D, GlobalPooling1D)): + px_shape = (layer.attributes['pool_width'],) + else: + px_shape = (layer.attributes['pool_height'], layer.attributes['pool_width']) + ch_out = ch_in = layer.attributes['n_filt'] + + im2col_shape = *px_shape, ch_in, ch_out # conv kernel shape + k_in, i_in, f_in = get_input_kifs(layer)[0] + if isinstance(layer, (Pooling1D, Pooling2D)): + k_in, i_in, f_in = pad_arrs(layer, 0, k_in, i_in, f_in) + k_in, i_in, f_in = im2col(im2col_shape, k_in, i_in, f_in) + if isinstance(layer, (Pooling1D, Pooling2D)): + k_in, i_in, f_in = stride_arrs(layer, k_in, i_in, f_in) + + k_out = k_in.reshape(*k_in.shape[:-1], -1, ch_in).max(axis=-2).astype(np.int8) + i_out = i_in.reshape(*i_in.shape[:-1], -1, ch_in).max(axis=-2).astype(np.int8) + f_out = f_in.reshape(*f_in.shape[:-1], -1, ch_in).max(axis=-2).astype(np.int8) + + pool_op = layer.attributes['pool_op'] + if pool_op == 'Average': + f_add = log2(prod(px_shape)) + if not f_add.is_integer(): + raise ValueError('Average pooling with non-power-of-2 pool size cannot be bit-exact') + f_out += int(f_add) + + return k_out, i_out, f_out + + @produce_kif.register def _(layer: BatchNormalization): k_in, i_in, f_in = get_input_kifs(layer)[0] From 0317b5b6a13e361e658dacd305be3a44976a21fd Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 15 Dec 2024 02:19:34 +0000 Subject: [PATCH 64/69] handle stray inputs --- hls4ml/model/optimizer/passes/bit_exact.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index c4ba1bb2e0..66d14ceb6b 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -574,6 +574,13 @@ def match(self, node: Layer): def transform(self, model, node: Layer): out_layers: list[FixedPointQuantizer] = get_output_layers(node) + + if len(out_layers) == 0: # Input connected to nothing + new_type = to_hls4ml_fixed(0, 0, 1, f'{node.name}_t') + node.get_output_variable().type = new_type + node.model.config.layer_name_precision[node.name] = str(new_type) + return False + if not all(isinstance(l, FixedPointQuantizer) for l in out_layers): warn(f'Input {node.name} has unhandled high precision. 
Consider setting it manually before synthesising.') return False From b38420d25a4b6d97fdeae8113d8170ae39abd074 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 15 Dec 2024 21:27:06 +0000 Subject: [PATCH 65/69] fix pooling layer accum_t --- hls4ml/model/optimizer/passes/bit_exact.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 66d14ceb6b..084b673607 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -542,6 +542,24 @@ def _(node: UnaryLUT): default_register_precision(node) +@register_precision.register(Pooling1D) +@register_precision.register(Pooling2D) +@register_precision.register(GlobalPooling1D) +@register_precision.register(GlobalPooling2D) +def _(node: Pooling1D | Pooling2D | GlobalPooling1D | GlobalPooling2D): + default_register_precision(node) + pool_op = node.attributes['pool_op'] + if pool_op != 'Average': + return + if isinstance(node, (Pooling1D, GlobalPooling1D)): + px_shape = (node.attributes['pool_width'],) + else: + px_shape = (node.attributes['pool_height'], node.attributes['pool_width']) + i_add = int(log2(prod(px_shape))) + node.attributes['accum_t'].precision.width += i_add + node.attributes['accum_t'].precision.integer += i_add + + class BitExact(ModelOptimizerPass): def __init__(self): pass From a2d6e1aa067b81565e4179491e0c691a89a0e1e0 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Sun, 15 Dec 2024 21:48:14 +0000 Subject: [PATCH 66/69] bit-exact concatenate --- hls4ml/model/optimizer/passes/bit_exact.py | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 084b673607..9b16c72cce 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -11,6 +11,7 @@ from hls4ml.model.layers import ( Activation, BatchNormalization, + Concatenate, Conv1D, Conv2D, Dense, @@ -126,6 +127,20 @@ def _(layer: Activation): return (_maximum_kif_at_shape(inp_shape),) +@request_kif.register +def _(layer: Concatenate): + inp_shape0, inp_shape1 = get_input_shapes(layer) + k, i, f = requested_kif(layer) + ax = layer.attributes['axis'] + n_split = inp_shape0[ax] + + k0, k1 = np.split(k, [n_split], axis=ax) + i0, i1 = np.split(i, [n_split], axis=ax) + f0, f1 = np.split(f, [n_split], axis=ax) + + return ((k0, i0, f0), (k1, i1, f1)) + + def requested_kif(layer: Layer) -> KIF_t: out_layers = get_output_layers(layer) out_shape = get_output_shape(layer) @@ -403,6 +418,17 @@ def _(layer: Softmax): return k, i, f +@produce_kif.register +def _(layer: Concatenate): + kifs_in = get_input_kifs(layer) + ks, is_, fs = zip(*kifs_in) + ax = layer.attributes.attributes['axis'] + k = np.concatenate(ks, axis=ax) + i = np.concatenate(is_, axis=ax) + f = np.concatenate(fs, axis=ax) + return k, i, f + + @produce_kif.register def _(layer: Activation): fn_name = layer.attributes.attributes['activation'] From af5c79841fcc2c12103b745c4f8cf35487bfa50e Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Fri, 17 Jan 2025 13:30:42 +0000 Subject: [PATCH 67/69] rm np.float_ in favor of numpy >=2.0 --- hls4ml/model/graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 1e29a569ef..07339c9709 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -734,7 +734,7 @@ def _get_top_function(self, x): if x0.dtype in [np.single, 
np.float32]:
             top_function = getattr(self._top_function_lib, self.config.get_project_name() + '_float')
             ctype = ctypes.c_float
-        elif x0.dtype in [np.double, np.float64, np.float_]:
+        elif x0.dtype in [np.double, np.float64]:
             top_function = getattr(self._top_function_lib, self.config.get_project_name() + '_double')
             ctype = ctypes.c_double
         else:

From c32df4bfaedbad57a4d6392f5aad8b5125eb4f95 Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Sat, 18 Jan 2025 21:26:59 +0000
Subject: [PATCH 68/69] add comments

---
 hls4ml/model/optimizer/passes/bit_exact.py | 50 +++++++++++++---------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py
index 9b16c72cce..8d9c7c5f5a 100644
--- a/hls4ml/model/optimizer/passes/bit_exact.py
+++ b/hls4ml/model/optimizer/passes/bit_exact.py
@@ -473,47 +473,55 @@ def kif_arrs_to_ints(arr: tuple[np.ndarray, np.ndarray, np.ndarray]):
 
 
 def default_register_precision(layer: Layer):
-    _pk, _pi, _pf = produce_kif(layer)
-    _rk, _ri, _rf = requested_kif(layer)
-    _out_kif = np.minimum(_pk, _rk), np.minimum(_pi, _ri), np.minimum(_pf, _rf)
-    _out_kif[1][(_pf > _rf) & (_pi <= _ri)] += 1
-    result_kif = kif_arrs_to_ints(_out_kif)
+    _pk, _pi, _pf = produce_kif(layer)  # Maximum possible k,i,f output from this layer
+    _rk, _ri, _rf = requested_kif(layer)  # Maximum k,i,f that may be utilized by the next layer
+    _ok, _oi, _of = np.minimum(_pk, _rk), np.minimum(_pi, _ri), np.minimum(_pf, _rf)
+    _oi += ((_pf > _rf) & (_pi <= _ri)).astype(np.int8)  # Corner-case overflow prevention
+
+    result_kif = kif_arrs_to_ints((_ok, _oi, _of))
     result_t = to_hls4ml_fixed(*result_kif, f'{layer.name}_t')
     layer.attributes.attributes['result_t'] = result_t
     layer.get_output_variable().type = result_t
 
     overrides = {}
 
+    # Set accum_t, if it exists; ONLY for layers with accum_t directly at the output (in general, linear DSP operations)
     if 'accum_t' in layer.attributes.attributes:
         accum_kif = kif_arrs_to_ints((_pk, _pi, _pf))
         accum_t = to_hls4ml_fixed(*accum_kif, f'{layer.name}_accum_t')
         overrides['accum_t'] = accum_t
 
+    # Set precision for fixed arrays (weight_t, bias_t, table_t, etc.)
     for w_name_t, v in layer.attributes.attributes.items():
-        if isinstance(v, NamedType) and w_name_t.endswith('_t'):
-            w_name = w_name_t[:-2]
-            if w_name not in layer.attributes.attributes:
-                continue
-            _data = layer.attributes.attributes[w_name]
-            if _data is None:
-                precision = to_hls4ml_fixed(0, 0, 1, f'{layer.name}_{w_name_t}')
-            else:
-                data = _data.data
-                if not isinstance(data, np.ndarray):
-                    raise ValueError(f'Expected data to be np.ndarray, got {type(data)} on layer {layer.name}')
-                k, i, f = kif_arrs_to_ints(minimal_kif(data))
-                precision = to_hls4ml_fixed(k, i, f, f'{layer.name}_{w_name_t}')
-            overrides[w_name_t] = precision
-
+        if not isinstance(v, NamedType) and w_name_t.endswith('_t'):
+            continue  # Not a precision, skip
+
+        w_name = w_name_t[:-2]
+        if w_name not in layer.attributes.attributes:
+            continue  # No matching data found, skip
+
+        weight_var: WeightVariable = layer.attributes.attributes[w_name]
+        if weight_var is None:  # Corresponding weight does not exist; the precision is used nowhere. Put a dummy.
+            precision = to_hls4ml_fixed(0, 0, 1, f'{layer.name}_{w_name_t}')
+        else:
+            data = weight_var.data
+            if not isinstance(data, np.ndarray):
+                raise ValueError(f'Expected data to be np.ndarray, got {type(data)} on layer {layer.name}')
+            k, i, f = kif_arrs_to_ints(minimal_kif(data))
+            precision = to_hls4ml_fixed(k, i, f, f'{layer.name}_{w_name_t}')
+        overrides[w_name_t] = precision
+
+    # Apply overrides
     for w_name_t, v in overrides.items():
         layer.attributes.attributes[w_name_t] = v
         if w_name_t[:-2] in layer.attributes.attributes:
+            # Weight variables need extra steps to update their precision
             weight_var: WeightVariable = layer.attributes.attributes[w_name_t[:-2]]
             weight_var.type = v
             weight_var.update_precision(v.precision)
             layer.model.config.layer_name_precision[f'{layer.name}_{w_name_t[:-2]}'] = str(v.precision)
 
-    return (_pk, _pi, _pf), (_rk, _ri, _rf), _out_kif
+    return (_pk, _pi, _pf), (_rk, _ri, _rf), (_ok, _oi, _of)
 
 
 @singledispatch

From fe0ff2f8e6e0bab529aa02cf871bc01efdaadf1c Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Sat, 18 Jan 2025 22:04:31 +0000
Subject: [PATCH 69/69] skip non-bit-exact-compatible softmax in bit-exact pass

---
 hls4ml/converters/keras_v3/squark/softmax.py | 1 +
 hls4ml/model/optimizer/passes/bit_exact.py   | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/hls4ml/converters/keras_v3/squark/softmax.py b/hls4ml/converters/keras_v3/squark/softmax.py
index a4b7179db6..d27e4ede2a 100644
--- a/hls4ml/converters/keras_v3/squark/softmax.py
+++ b/hls4ml/converters/keras_v3/squark/softmax.py
@@ -118,6 +118,7 @@ def handle(
                 'exp_scale': exp_scale,
                 'parallelization_factor': parallelization_factor,
                 'class_name': class_name,
+                '_bit_exact': True,
             }
         )
 
diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py
index 8d9c7c5f5a..85c212119c 100644
--- a/hls4ml/model/optimizer/passes/bit_exact.py
+++ b/hls4ml/model/optimizer/passes/bit_exact.py
@@ -531,6 +531,11 @@ def register_precision(node: Layer):
 
 @register_precision.register
 def _(node: Softmax):
+    if not node.attributes.get('_bit_exact', False):
+        # Softmax is not bit-exact by default
+        warn(f'Softmax layer {node.name} is converted from a frontend not supporting bit-exact softmax.')
+        return
+
     inv_inp_t: FixedPrecisionType = node.attributes['inv_inp_t'].precision
     accum_t = copy(inv_inp_t)
     if inv_inp_t.saturation_mode != SaturationMode.WRAP: