diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0601a84b2d..d45ffbdd27 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -30,7 +30,7 @@ repos:
    args: ["--profile", "black", --line-length=125]

 - repo: https://github.com/asottile/pyupgrade
-  rev: v3.19.0
+  rev: v3.19.1
   hooks:
   - id: pyupgrade
     args: ["--py36-plus"]
diff --git a/CITATION.cff b/CITATION.cff
index 9e1880f03f..91bf036a1d 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -4,7 +4,7 @@ type: software
 authors:
   - given-names: "FastML Team"
 title: "hls4ml"
-version: "v0.8.1"
+version: "v1.0.0"
 doi: 10.5281/zenodo.1201549
 repository-code: "https://github.com/fastmachinelearning/hls4ml"
 url: "https://fastmachinelearning.org/hls4ml"
diff --git a/README.md b/README.md
index 606e824d09..fd96763476 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,9 @@ If you have any questions, comments, or ideas regarding hls4ml or just want to s
 # Documentation & Tutorial

-For more information visit the webpage: [https://fastmachinelearning.org/hls4ml/](https://fastmachinelearning.org/hls4ml/)
+For more information visit the webpage: [https://fastmachinelearning.org/hls4ml/](https://fastmachinelearning.org/hls4ml/).
+
+For introductory material on FPGAs, HLS and ML inference using hls4ml, check out the [video](https://www.youtube.com/watch?v=2y3GNY4tf7A&ab_channel=SystemsGroupatETHZ%C3%BCrich).

 Detailed tutorials on how to use `hls4ml`'s various functionalities can be found [here](https://github.com/hls-fpga-machine-learning/hls4ml-tutorial).
@@ -49,8 +51,8 @@ hls_model = hls4ml.converters.keras_to_hls(config)
 hls4ml.utils.fetch_example_list()
 ```

-### Building a project with Xilinx Vivado HLS (after downloading and installing from [here](https://www.xilinx.com/products/design-tools/vivado/integration/esl-design.html))
-Note: Vitis HLS is not yet supported. Vivado HLS versions between 2018.2 and 2020.1 are recommended.
+### Building a project
+We will build the project using Xilinx Vivado HLS, which can be downloaded and installed from [here](https://www.xilinx.com/products/design-tools/vivado/integration/esl-design.html). Alongside Vivado HLS, hls4ml also supports Vitis HLS, Intel HLS and Catapult HLS, and has some experimental support for Intel oneAPI. The target backend can be changed using the `backend` argument when building the model.

 ```Python
 # Use Vivado HLS to synthesize the model
@@ -61,15 +63,19 @@ hls_model.build()
 hls4ml.report.read_vivado_report('my-hls-test')
 ```

+# FAQ
+
+A list of frequently asked questions and common HLS synthesis issues can be found [here](https://fastmachinelearning.org/hls4ml/faq.html).
+
 # Citation
 If you use this software in a publication, please cite the software
 ```bibtex
 @software{fastml_hls4ml,
   author = {{FastML Team}},
   title = {fastmachinelearning/hls4ml},
-  year = 2023,
+  year = 2024,
   publisher = {Zenodo},
-  version = {v0.8.1},
+  version = {v1.0.0},
   doi = {10.5281/zenodo.1201549},
   url = {https://github.com/fastmachinelearning/hls4ml}
 }
diff --git a/docs/advanced/auto.rst b/docs/advanced/auto.rst
new file mode 100644
index 0000000000..f944a11e54
--- /dev/null
+++ b/docs/advanced/auto.rst
@@ -0,0 +1,22 @@
+=============================
+Automatic precision inference
+=============================
+
+The automatic precision inference (implemented in :py:class:`~hls4ml.model.optimizer.passes.infer_precision.InferPrecisionTypes`) attempts to infer the appropriate
+widths for a given precision. It is initiated by setting a precision in the configuration as ``'auto'``. (Note, only layer-level precisions can be set to ``'auto'``,
+not model-level.) Functions like :py:class:`~hls4ml.utils.config.config_from_keras_model`, :py:class:`~hls4ml.utils.config.config_from_onnx_model`,
+and :py:class:`~hls4ml.utils.config.config_from_pytorch_model` automatically set most precisions to ``'auto'`` if the ``'name'`` granularity is used.
+
+.. note::
+    It is recommended to pass the backend to the ``config_from_*`` functions so that they can properly extract all the configurable precisions.
+
+The approach taken by the precision inference is to set the accumulator (the internal variable used to accumulate values in the matrix multiplications) and other precisions
+so that they never truncate, using only the bitwidths of the inputs (not the values). This is quite conservative, especially in cases where post-training quantization is used, or
+if the bit widths were set fairly loosely. The recommended action in that case is to edit the configuration and explicitly set some widths in it, potentially in an iterative process
+after profiling the data. Another option is to pass a maximum precision using the ``max_precision`` parameter of the ``config_from_*`` functions. The automatic precision
+inference will then never set a bitwidth larger than the bitwidth of ``max_precision``, or an integer part larger than the integer part of the ``max_precision`` that is passed.
+(The bitwidth and integer parts of ``max_precision`` are treated separately.)
+
+When manually setting bitwidths, the accumulator can overflow, and the precision may need to be reduced. For the accumulator, it is usually a bad idea to explicitly
+enable rounding or saturation modes since it dramatically increases the execution time. For other types (e.g. output types or weight types), however, rounding and saturation handling
+can be enabled as needed.
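+
+As a rough sketch of how this looks in practice, the inference is driven entirely from the configuration dictionary. The layer name, backend and ``max_precision`` value below are placeholders, and ``max_precision`` is assumed to accept a fixed-point string:
+
+.. code-block:: Python
+
+    import hls4ml
+
+    # 'name' granularity sets most layer-level precisions to 'auto'
+    config = hls4ml.utils.config_from_keras_model(
+        model,
+        granularity='name',
+        backend='Vitis',
+        max_precision='fixed<24,8>',  # optional cap on the inferred widths
+    )
+
+    # individual precisions can also be set to 'auto' by hand
+    config['LayerName']['dense_1']['Precision']['accum'] = 'auto'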
diff --git a/docs/advanced/bramfactor.rst b/docs/advanced/bramfactor.rst
new file mode 100644
index 0000000000..37fe766060
--- /dev/null
+++ b/docs/advanced/bramfactor.rst
@@ -0,0 +1,42 @@
+==================================
+Loading weights from external BRAM
+==================================
+
+.. note::
+    This feature is being evaluated for re-implementation. We welcome feedback from users on how to make the implementation more flexible.
+
+``hls4ml`` can optionally store weights in BRAMs external to the design. This is supported in the Vivado/Vitis and Catapult backends. It is the responsibility of the user to ensure the weights are properly loaded during the operation of the design.
+
+The feature works as a threshold, exposed through the ``BramFactor`` config parameter. Layers whose number of weights exceeds the threshold will expose their weights through a BRAM interface. Consider the following code:
+
+.. code-block:: Python
+
+    model = tf.keras.models.Sequential()
+    model.add(Dense(10, activation="relu", input_shape=(12,), name="dense_1"))
+    model.add(Dense(20, activation="relu", name="dense_2"))
+    model.add(Dense(5, activation="softmax", name="dense_3"))
+    model.compile(optimizer='adam', loss='mse')
+
+    config = hls4ml.utils.config_from_keras_model(model)
+    config["Model"]["Strategy"] = "Resource"
+    config["Model"]["BramFactor"] = 100
+
+    hls_model = hls4ml.converters.convert_from_keras_model(
+        model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend
+    )
+
+Having set ``BramFactor=100``, only layers with more than 100 weights will be exposed as external BRAM, in this case layers ``dense_1`` and ``dense_2``. ``BramFactor`` can currently be set only at the model level.
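+
+As a rough cross-check of which layers will cross the threshold, one can count the kernel weights of each layer of the model above (a short sketch using plain Keras; biases are not part of the arrays exposed in the generated interface below):
+
+.. code-block:: Python
+
+    for layer in model.layers:
+        kernel = layer.get_weights()[0]  # first array is the kernel matrix
+        print(f'{layer.name}: {kernel.size} weights')
+
+    # dense_1: 120 weights (12 x 10) -> above BramFactor=100, exposed as BRAM
+    # dense_2: 200 weights (10 x 20) -> above BramFactor=100, exposed as BRAM
+    # dense_3: 100 weights (20 x 5)  -> not above the threshold, kept internal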
The generated code will now have weights as part of the interface. + +.. code-block:: C++ + + void myproject( + hls::stream &dense_1_input, + hls::stream &layer7_out, + model_default_t w2[120], + model_default_t w4[200] + ) { + #pragma HLS INTERFACE axis port=dense_1_input,layer7_out + #pragma HLS INTERFACE bram port=w2,w4 + ... + +When integrating the design, users can use the exposed interface to implement weight reloading scheme. diff --git a/docs/advanced/hgq.rst b/docs/advanced/hgq.rst new file mode 100644 index 0000000000..dd0faad7dc --- /dev/null +++ b/docs/advanced/hgq.rst @@ -0,0 +1,49 @@ +=================================== +High Granularity Quantization (HGQ) +=================================== + +.. image:: https://github.com/calad0i/HGQ/actions/workflows/sphinx-build.yml/badge.svg + :target: https://calad0i.github.io/HGQ/ +.. image:: https://badge.fury.io/py/hgq.svg + :target: https://badge.fury.io/py/hgq +.. image:: https://img.shields.io/badge/arXiv-2405.00645-b31b1b.svg + :target: https://arxiv.org/abs/2405.00645 + +`High Granularity Quantization (HGQ) `_ is a library that performs gradient-based automatic bitwidth optimization and quantization-aware training algorithm for neural networks to be deployed on FPGAs. By leveraging gradients, it allows for bitwidth optimization at arbitrary granularity, up to per-weight and per-activation level. + +.. image:: https://calad0i.github.io/HGQ/_images/overview.svg + :alt: Overview of HGQ + :align: center + +Conversion of models made with HGQ library is fully supported. The HGQ models are first converted to proxy model format, which can then be parsed by hls4ml bit-accurately. Below is an example of how to create a model with HGQ and convert it to hls4ml model. + +.. code-block:: Python + + import keras + from HGQ.layers import HDense, HDenseBatchNorm, HQuantize + from HGQ import ResetMinMax, FreeBOPs + + model = keras.models.Sequential([ + HQuantize(beta=1.e-5), + HDenseBatchNorm(32, beta=1.e-5, activation='relu'), + HDenseBatchNorm(32, beta=1.e-5, activation='relu'), + HDense(10, beta=1.e-5), + ]) + + opt = keras.optimizers.Adam(learning_rate=0.001) + loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + model.compile(optimizer=opt, loss=loss, metrics=['accuracy']) + callbacks = [ResetMinMax(), FreeBOPs()] + + model.fit(..., callbacks=callbacks) + + from HGQ import trace_minmax, to_proxy_model + from hls4ml.converters import convert_from_keras_model + + trace_minmax(model, x_train, cover_factor=1.0) + proxy = to_proxy_model(model, aggressive=True) + + model_hls = convert_from_keras_model(proxy, backend='vivado',output_dir=... ,part=...) + + +An interactive example of HGQ can be found in the `kaggle notebook `_. Full documentation can be found at `calad0i.github.io/HGQ `_. diff --git a/docs/advanced/model_optimization.rst b/docs/advanced/model_optimization.rst index 41132ab619..302d646023 100644 --- a/docs/advanced/model_optimization.rst +++ b/docs/advanced/model_optimization.rst @@ -124,8 +124,8 @@ Finally, optimizing Vivado DSPs is possible, given a hls4ml config: acc_optimized = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_optimized, axis=1)) print(f'Optimized Keras accuracy: {acc_optimized}') -There are two more Vivado "optimizers" - VivadoFFEstimator, aimed at reducing register utilisation and VivadoMultiObjectiveEstimator, aimed at optimising BRAM and DSP utilisation. 
-Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used before synthesing HLS, by modifying the config: +There are two more Vivado "optimizers" - VivadoFFEstimator, aimed at reducing register utilization and VivadoMultiObjectiveEstimator, aimed at optimizing BRAM and DSP utilization. +Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used before synthesizing HLS, by modifying the config: .. code-block:: Python diff --git a/docs/api/profiling.rst b/docs/advanced/profiling.rst similarity index 100% rename from docs/api/profiling.rst rename to docs/advanced/profiling.rst diff --git a/docs/command.rst b/docs/api/command.rst similarity index 97% rename from docs/command.rst rename to docs/api/command.rst index cb9d346e31..1f821b7f35 100644 --- a/docs/command.rst +++ b/docs/api/command.rst @@ -50,7 +50,7 @@ hls4ml config hls4ml config [-h] [-m MODEL] [-w WEIGHTS] [-o OUTPUT] -This creates a conversion configuration file. Visit Configuration section of the :doc:`Setup ` page for more details on how to write a configuration file. +This creates a conversion configuration file. Visit Configuration section of the :doc:`Setup <../intro/setup>` page for more details on how to write a configuration file. **Arguments** diff --git a/docs/api/concepts.rst b/docs/api/concepts.rst new file mode 100644 index 0000000000..9087470cf3 --- /dev/null +++ b/docs/api/concepts.rst @@ -0,0 +1,78 @@ +======== +Concepts +======== + +How it Works +---------------------- + +.. image:: ../img/nn_map_paper_fig_2.png + :width: 70% + :align: center + + +Consider a multilayer neural network. At each neuron in a layer :math:`m` (containing :math:`N_m` neurons), we calculate an output value (part of the output vector :math:`\mathbf{x}_m` of said layer) using the sum of output values of the previous layer multiplied by independent weights for each of these values and a bias value. An activation function is performed on the result to get the final output value for the neuron. Representing the weights as a :math:`N_m` by :math:`N_{m-1}` matrix :math:`W_{m,m-1}`, the bias values as :math:`\mathbf{b}_m`, and the activation function as :math:`g_m`, we can express this compactly as: + + +.. math:: + + \mathbf{x}_m = g_m (W_{m,m-1} \mathbf{x}_{m-1} +\mathbf{b}_m) + +With hls4ml, each layer of output values is calculated independently in sequence, using pipelining to speed up the process by accepting new inputs after an initiation interval. +The activations, if nontrivial, are precomputed. + +To ensure optimal performance, the user can control aspects of their model, principally: + + +* **Size/Compression** - Though not explicitly part of the ``hls4ml`` package, this is an important optimization to efficiently use the FPGA resources +* **Precision** - Define the :doc:`precision <../advanced/profiling>` of the calculations in your model +* **Dataflow/Resource Reuse** - Control parallel or streaming model implementations with varying levels of pipelining +* **Quantization Aware Training** - Achieve best performance at low precision with tools like QKeras, and benefit automatically during inference with ``hls4ml`` parsing of QKeras models + + +.. image:: ../img/reuse_factor_paper_fig_8.png + :width: 70% + :align: center + + +Often, these decisions will be hardware dependent to maximize performance. +Of note is that simplifying the input network must be done before using ``hls4ml`` to generate HLS code, for optimal compression to provide a sizable speedup. 
+Also important to note is the use of fixed point arithmetic in ``hls4ml``.
+This improves processing speed relative to floating point implementations.
+The ``hls4ml`` package also offers the functionality of configuring binning and output bit width of the precomputed activation functions as necessary. With respect to parallelization and resource reuse, ``hls4ml`` offers a "reuse factor" parameter that determines the number of times each multiplier is used in order to compute a layer of neurons' values. Therefore, a reuse factor of one would split the computation so each multiplier had to only perform one multiplication in the computation of the output values of a layer, as shown above. Conversely, a reuse factor of four, in this case, uses a single multiplier four times sequentially. A low reuse factor achieves the lowest latency and highest throughput but uses the most resources, while a high reuse factor saves resources at the expense of longer latency and lower throughput.
+
+
+Frontends and Backends
+----------------------
+
+``hls4ml`` has a concept of a **frontend** that parses the input NN into an internal model graph, and a **backend** that controls
+what type of output is produced from the graph. Frontends and backends can be independently chosen. Examples of frontends are the
+parsers for Keras or ONNX, and examples of backends are Vivado HLS, Intel HLS, and Vitis HLS. See :ref:`Status and Features` for the
+currently supported frontends and backends or the dedicated sections for each frontend/backend.
+
+
+I/O Types
+---------
+
+``hls4ml`` supports multiple styles for handling data transfer to/from the network and between layers, known as the ``io_type``.
+
+io_parallel
+^^^^^^^^^^^
+In this processing style, data is passed in parallel between the layers. Conceptually this corresponds to a C/C++ array whose elements can all be accessed at any time. This style allows for maximum parallelism and is well suited for MLP networks and small CNNs that aim for the lowest latency. Due to the impact of parallel processing on resource utilization on FPGAs, the synthesis may fail for larger networks.
+
+io_stream
+^^^^^^^^^
+As opposed to the parallel processing style, in ``io_stream`` mode data is passed one "pixel" at a time. Each pixel is an array of channels, which are always sent in parallel. This method for sending data between layers is recommended for larger CNN and RNN networks. For one-dimensional ``Dense`` layers, all the inputs are streamed in parallel as a single array.
+
+With the ``io_stream`` IO type, each layer is connected with the subsequent layer through first-in first-out (FIFO) buffers.
+The implementation of the FIFO buffers contributes to the overall resource utilization of the design, impacting in particular the BRAM or LUT utilization.
+Because neural networks can generally have complex architectures, it is hard to know a priori the correct depth of each FIFO buffer.
+By default ``hls4ml`` chooses the most conservative possible depth for each FIFO buffer, which can result in an unnecessary overutilization of resources.
+
+In order to reduce the impact on the resources used for FIFO buffer implementation, we have a FIFO depth optimization flow. This is described
+in the :ref:`FIFO Buffer Depth Optimization` section.
+
+
+Strategy
+---------
+
+**Strategy** in ``hls4ml`` refers to the implementation of the core matrix-vector multiplication routine, which can be latency-oriented, resource-saving oriented, or specialized. Different strategies will have an impact on the overall latency and resource consumption of each layer, and users are advised to choose based on their design goals. The availability of a particular strategy for a layer varies across backends; see the :doc:`Attributes <../ir/attributes>` section for a complete list of available strategies per-layer and per-backend.
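+
+For example, the strategy can be set model-wide and then overridden for individual layers through the configuration dictionary (a short sketch; ``dense1`` is a placeholder layer name):
+
+.. code-block:: Python
+
+    config = hls4ml.utils.config_from_keras_model(model, granularity='name')
+    config['Model']['Strategy'] = 'Resource'               # model-wide default
+    config['LayerName']['dense1']['Strategy'] = 'Latency'  # override for a single layer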
diff --git a/docs/api/configuration.rst b/docs/api/configuration.rst
index f0db50a9b6..1bc8f0676c 100644
--- a/docs/api/configuration.rst
+++ b/docs/api/configuration.rst
@@ -34,20 +34,46 @@ Using hls4ml, you can quickly generate a simple configuration dictionary from a
    import hls4ml
    config = hls4ml.utils.config_from_keras_model(model, granularity='model')

-This python dictionary can be edited as needed. A more advanced configuration can be generated by, for example:
+This python dictionary can be edited as needed. A more advanced configuration can be generated, for example, for ONNX models:

 .. code-block:: python

    import hls4ml
-   config = hls4ml.utils.config_from_keras_model(
+   config = hls4ml.utils.config_from_onnx_model(
      model,
      granularity='name',
      default_precision='fixed<16,6>',
      backend='Vitis')

-This will include per-layer configuration based on the model. Including the backend is recommended because some configation options depend on the backend. Note, the precisions at the
-higher granularites usually default to 'auto', which means that ``hls4ml`` will try to set it automatically. Note that higher granularity settings take precendence
-over model-level settings. See :py:class:`~hls4ml.utils.config.config_from_keras_model` for more information on the various options.
+for Keras models:
+
+.. code-block:: python
+
+   import hls4ml
+   config = hls4ml.utils.config_from_keras_model(
+     model,
+     granularity='name',
+     default_precision='fixed<16,6>',
+     backend='oneAPI')
+
+or for PyTorch models:
+
+.. code-block:: python
+
+   import hls4ml
+   config = hls4ml.utils.config_from_pytorch_model(
+     model,
+     granularity='name',
+     default_precision='fixed<16,6>',
+     backend='Catapult')
+
+
+The ``name`` granularity includes per-layer configuration based on the model. The ``'name'`` granularity is generally recommended because it allows for more tuning, and also because it allows
+for automatic setting of precisions. The layer-level precisions with the ``'name'`` granularity default to ``'auto'``, which means that hls4ml will try to set them automatically
+(see :ref:`Automatic precision inference`). Note that layer-level settings take precedence over model-level settings. A ``'name'`` granularity is required for QKeras
+and QONNX model parsing. Passing the backend to these functions is recommended because some configuration options depend on the backend. See :py:class:`~hls4ml.utils.config.config_from_keras_model`
+and similar for more information on the various options. Note specifically the documentation of :py:class:`~hls4ml.utils.config.config_from_pytorch_model` on how to handle differences in input data
+formats between PyTorch and Keras (hls4ml follows Keras conventions internally).

 One can override specific values before using the configuration:

@@ -59,7 +85,7 @@ Or to set the precision of a specific layer's weight:

 .. code-block:: python

-    config['LayerName']['fc1']['Precision']['weight'] = 'ap_fixed<8,4>'
+    config['LayerName']['fc1']['Precision']['weight'] = 'fixed<8,4>'

 To better understand how the configuration hierachy works, refer to the next section for more details.
@@ -75,7 +101,7 @@ Finally, one then uses the configuration to create an hls model: backend='Vitis' ) -See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information on the various options. +See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information on the various options. Similar functions exist for ONNX and PyTorch. ---- @@ -85,7 +111,7 @@ See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information 2.1 Top Level Configuration --------------------------- -Configuration files are YAML files in hls4ml (\ ``*.yml``\ ). An example configuration file is `here `__. +One can also use YAML configuration files in hls4ml (\ ``*.yml``\ ). An example configuration file is `here `__. It looks like this: @@ -108,7 +134,7 @@ It looks like this: HLSConfig: Model: - Precision: ap_fixed<16,6> + Precision: fixed<16,6> ReuseFactor: 1 Strategy: Latency LayerType: @@ -124,7 +150,7 @@ There are a number of configuration options that you have. Let's go through the * **ProjectName**\ : the name of the HLS project IP that is produced * **KerasJson/KerasH5**\ : for Keras, the model architecture and weights are stored in a ``json`` and ``h5`` file. The path to those files are required here. We also support keras model's file obtained just from ``model.save()``. In this case you can just supply the ``h5`` file in ``KerasH5:`` field. -* **InputData/OutputPredictions**\ : path to your input/predictions of the model. If none is supplied, then hls4ml will create aritificial data for simulation. The data used above in the example can be found `here `__. We also support ``npy`` data files. We welcome suggestions on more input data types to support. +* **InputData/OutputPredictions**\ : path to your input/predictions of the model. If none is supplied, then hls4ml will create artificial data for simulation. The data used above in the example can be found `here `__. We also support ``npy`` data files. We welcome suggestions on more input data types to support. The backend-specific section of the configuration depends on the backend. You can get a starting point for the necessary settings using, for example `hls4ml.templates.get_backend('Vivado').create_initial_config()`. For Vivado backend the options are: @@ -134,13 +160,13 @@ For Vivado backend the options are: Then you have some optimization parameters for how your algorithm runs: * **IOType**\ : your options are ``io_parallel`` or ``io_stream`` which defines the type of data structure used for inputs, intermediate activations between layers, and outputs. For ``io_parallel``, arrays are used that, in principle, can be fully unrolled and are typically implemented in RAMs. For ``io_stream``, HLS streams are used, which are a more efficient/scalable mechanism to represent data that are produced and consumed in a sequential manner. Typically, HLS streams are implemented with FIFOs instead of RAMs. For more information see `here `__. * **HLSConfig**\: the detailed configuration of precision and parallelism, including: + * **ReuseFactor**\ : in the case that you are pipelining, this defines the pipeline interval or initiation interval * **ParallelizationFactor**\ : The number of output "pixels" to compute in parallel in convolutional layers. Increasing this parameter results in significant increase in resources required on the FPGA. * **Strategy**\ : Optimization strategy on FPGA, either "Latency", "Resource" or "Unrolled". If none is supplied then hl4ml uses "Latency" as default. 
Note that a reuse factor larger than 1 should be specified when using "resource" or "unrolled" strategy. An example of using larger reuse factor can be found `here. `__ * **PipelineStyle**\ : Set the top level pipeline style. Valid options are "auto", "pipeline" and "dataflow". If unspecified, it defaults to "auto". * **PipelineInterval**\ : Optionally override the desired initiation interval of the design. Only valid in combination with "pipeline" style. If unspecified, it is left to the compiler to decide, ideally matching the largest reuse factor of the network. - * **Precision**\ : this defines the precsion of your inputs, outputs, weights and biases. It is denoted by ``ap_fixed``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits. - Additionally, integers in fixed precision data type (\ ``ap_int``\ , where ``N`` is a bit-size from 1 to 1024) can also be used. You have a chance to further configure this more finely with per-layer configuration described below. + * **Precision**\ : this defines the precision of your inputs, outputs, weights and biases. It is denoted by ``fixed``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits. Additionally, integers in the type (\ ``int``\ , where ``N`` is a bit-size from 1 to 1024) can also be used. The format follows ``ap_fixed`` and ``ap_int`` conventions. You have a chance to further configure this more finely with per-layer configuration described below. In the per-layer configuration (but not globally) one can also use ``'auto'`` precision. 2.2 Per-Layer Configuration --------------------------- @@ -153,10 +179,10 @@ Under the ``HLSConfig`` heading, these can be set for the ``Model``\ , per ``Lay HLSConfig: Model: - Precision: ap_fixed<16,6> + Precision: fixed<16,6> ReuseFactor: 1 -This configuration use ``ap_fixed<16,6>`` for every variable and a ReuseFactor of 1 throughout. +This configuration use ``fixed<16,6>`` for every variable and a ReuseFactor of 1 throughout. Specify all ``Dense`` layers to use a different precision like this: @@ -164,13 +190,13 @@ Specify all ``Dense`` layers to use a different precision like this: HLSConfig: Model: - Precision: ap_fixed<16,6> + Precision: fixed<16,6> ReuseFactor: 1 LayerType: Dense: - Precision: ap_fixed<14,5> + Precision: fixed<14,5> -In this case, all variables in any ``Dense`` layers will be represented with ``ap_fixed<14,5>`` while any other layer types will use ``ap_fixed<16,6>``. +In this case, all variables in any ``Dense`` layers will be represented with ``fixed<14,5>`` while any other layer types will use ``fixed<16,6>``. A specific layer can be targeted like this: @@ -178,18 +204,18 @@ A specific layer can be targeted like this: HLSConfig: Model: - Precision: ap_fixed<16,6> + Precision: fixed<16,6> ReuseFactor: 16 LayerName: dense1: Precision: - weight: ap_fixed<14,2> - bias: ap_fixed<14,4> - result: ap_fixed<16,6> + weight: fixed<14,2> + bias: fixed<14,4> + result: fixed<16,6> ReuseFactor: 12 Strategy: Resource -In this case, the default model configuration will use ``ap_fixed<16,6>`` and a ``ReuseFactor`` of 16. 
The layer named ``dense1`` (defined in the user provided model architecture file) will instead use different precision for the ``weight``\ , ``bias``\ , and ``result`` (output) variables, a ``ReuseFactor`` of 12, and the ``Resource`` strategy (while the model default is ``Latency`` strategy. +In this case, the default model configuration will use ``fixed<16,6>`` and a ``ReuseFactor`` of 16. The layer named ``dense1`` (defined in the user provided model architecture file) will instead use different precision for the ``weight``\ , ``bias``\ , and ``result`` (output) variables, a ``ReuseFactor`` of 12, and the ``Resource`` strategy (while the model default is ``Latency`` strategy. More than one layer can have a configuration specified, e.g.: @@ -206,7 +232,7 @@ More than one layer can have a configuration specified, e.g.: dense2: ... -For more information on the optimization parameters and what they mean, you can visit the :doc:`Concepts <../concepts>` chapter. +For more information on the optimization parameters and what they mean, you can visit the :doc:`Concepts <../api/concepts>` section. ---- @@ -235,7 +261,7 @@ In your project, the file ``/firmware/.cpp`` is your top nnet::sigmoid(layer4_out, layer5_out); -You can see, for the simple 1-layer DNN, the computation (\ ``nnet::dense_latency``\ ) and activation (\ ``nnet::relu``\ /\ ``nnet::sigmoid``\ ) caluclation for each layer. For each layer, it has its own additional configuration parameters, e.g. ``config2``. +You can see, for the simple 1-layer DNN, the computation (\ ``nnet::dense_latency``\ ) and activation (\ ``nnet::relu``\ /\ ``nnet::sigmoid``\ ) calculation for each layer. For each layer, it has its own additional configuration parameters, e.g. ``config2``. In your project, the file ``/firmware/parameters.h`` stores all the configuration options for each neural network library. An example is `here `__. 
So for example, the detailed configuration options for an example DNN layer is: diff --git a/docs/attr_doc_gen.py b/docs/attr_doc_gen.py new file mode 100644 index 0000000000..0ba2a5b77e --- /dev/null +++ b/docs/attr_doc_gen.py @@ -0,0 +1,149 @@ +import numbers + +import hls4ml.backends as backends +import hls4ml.model.attributes as attributes +import hls4ml.model.layers as layers + + +class AttrList: + def __init__(self, cls_name, cls_attrs) -> None: + self.cls_name = cls_name + self.config_attrs = [attr for attr in cls_attrs if attr.configurable is True] + self.type_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'TypeAttribute'] + self.weight_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'WeightAttribute'] + self.base_attrs = [attr for attr in cls_attrs if attr not in self.config_attrs + self.type_attrs + self.weight_attrs] + self.backend_attrs = {} + self.reverse_backend_attrs = [] # Will hold (attr, backend_name) pairs, used temporarily + self.unique_backend_attrs = [] + + def add_backend_attrs(self, backend_name, backend_attrs): + self.backend_attrs[backend_name] = backend_attrs + + for attr in backend_attrs: + self.reverse_backend_attrs.append((attr, backend_name)) + + def sift_backend_attrs(self): + grouped_dict = {} + for attr, backend_name in self.reverse_backend_attrs: + if attr not in grouped_dict: + grouped_dict[attr] = [] + grouped_dict[attr].append(backend_name) + + for attr, backend_names in grouped_dict.items(): + attr.available_in = backend_names + self.unique_backend_attrs.append(attr) + + @property + def only_configurable(self): + all_attrs = self.config_attrs + self.type_attrs + self.unique_backend_attrs + return [attr for attr in all_attrs if attr.configurable is True] + + +def convert_to_attr_list(): + all_backends = backends.get_available_backends() + # Removing duplicates but preserving order + all_layers = list(dict.fromkeys(layers.layer_map.values())) + all_layers_attrs = [] + + for layer_cls in all_layers: + base_attrs = layer_cls.expected_attributes + + attr_list = AttrList(layer_cls.__name__, base_attrs) + + for backend_name in all_backends: + backend = backends.get_backend(backend_name) + + backend_cls = backend.create_layer_class(layer_cls) + backend_attrs = backend_cls.expected_attributes + + diff_atts = [ + attr for attr in backend_attrs if attr not in base_attrs + ] # Sets are faster, but don't preserve order + if len(diff_atts) > 0: + attr_list.add_backend_attrs(backend.name, diff_atts) + + all_layers_attrs.append(attr_list) + + for attr_list in all_layers_attrs: + attr_list.sift_backend_attrs() + + return all_layers_attrs + + +def print_attrs(attrs, file): + for attr in attrs: + if attr.value_type == numbers.Integral: + vtype = 'int' + elif attr.__class__ == attributes.ChoiceAttribute: + choices = ','.join([str(c) for c in attr.choices]) + vtype = f'list [{choices}]' + else: + vtype = attr.value_type.__name__ if hasattr(attr.value_type, '__name__') else str(attr.value_type) + + if attr.default is None: + file.write('* ' + attr.name + ': ' + vtype + '\n\n') + else: + file.write('* ' + attr.name + ': ' + vtype + ' (Default: ' + str(attr.default) + ')\n\n') + + if attr.description is not None: + file.write(' * ' + attr.description + '\n\n') + + if hasattr(attr, 'available_in'): + file.write(' * Available in: ' + ', '.join(attr.available_in) + '\n\n') + + +def write_all_attributes(all_layers_attrs): + with open('attributes.rst', mode='w') as file: + file.write('================\n') + file.write('Layer attributes\n') + 
        file.write('================\n\n\n')
+
+        for attr_list in all_layers_attrs:
+            file.write(attr_list.cls_name + '\n')
+            file.write('=' * len(attr_list.cls_name) + '\n')
+
+            if len(attr_list.base_attrs) > 0:
+                file.write('Base attributes\n')
+                file.write('---------------\n')
+                print_attrs(attr_list.base_attrs, file)
+
+            if len(attr_list.type_attrs) > 0:
+                file.write('Type attributes\n')
+                file.write('---------------\n')
+                print_attrs(attr_list.type_attrs, file)
+
+            if len(attr_list.weight_attrs) > 0:
+                file.write('Weight attributes\n')
+                file.write('-----------------\n')
+                print_attrs(attr_list.weight_attrs, file)
+
+            if len(attr_list.config_attrs) > 0:
+                file.write('Configurable attributes\n')
+                file.write('-----------------------\n')
+                print_attrs(attr_list.config_attrs, file)
+
+            if len(attr_list.backend_attrs) > 0:
+                file.write('Backend-specific attributes\n')
+                file.write('---------------------------\n')
+                print_attrs(attr_list.unique_backend_attrs, file)
+
+
+def write_only_configurable(all_layers_attrs):
+    with open('attributes.rst', mode='w') as file:
+        file.write('================\n')
+        file.write('Layer attributes\n')
+        file.write('================\n\n\n')
+
+        for attr_list in all_layers_attrs:
+            file.write(attr_list.cls_name + '\n')
+            file.write('=' * len(attr_list.cls_name) + '\n')
+
+            config_attrs = attr_list.only_configurable
+            if len(config_attrs) > 0:
+                print_attrs(config_attrs, file)
+
+
+if __name__ == '__main__':
+    all_layers_attrs = convert_to_attr_list()
+    write_all_attributes(all_layers_attrs)
+    # write_only_configurable(all_layers_attrs)
diff --git a/docs/advanced/accelerator.rst b/docs/backend/accelerator.rst
similarity index 95%
rename from docs/advanced/accelerator.rst
rename to docs/backend/accelerator.rst
index 7a79d9dbdc..187bccaa2c 100644
--- a/docs/advanced/accelerator.rst
+++ b/docs/backend/accelerator.rst
@@ -1,8 +1,8 @@
-=========================
-VivadoAccelerator Backend
-=========================
+=================
+VivadoAccelerator
+=================

-The ``VivadoAccelerator`` backend of ``hls4ml`` leverages the `PYNQ `_ software stack to easily deploy models on supported devices.
+The **VivadoAccelerator** backend of ``hls4ml`` leverages the `PYNQ `_ software stack to easily deploy models on supported devices.

 Currently ``hls4ml`` supports the following boards:

 * `pynq-z2 `_ (part: ``xc7z020clg400-1``)
@@ -13,7 +13,7 @@ Currently ``hls4ml`` supports the following boards:
 * `alveo-u280 `_ (part: ``xcu280-fsvh2892-2L-e``)

 but, in principle, support can be extended to `any board supported by PYNQ `_.

-For the Zynq-based boards, there are two components: an ARM-based processing system (PS) and FPGA-based programmable logic (PL), with various intefaces between the two.
+For the Zynq-based boards, there are two components: an ARM-based processing system (PS) and FPGA-based programmable logic (PL), with various interfaces between the two.

 .. image:: ../img/zynq_interfaces.png
     :height: 300px
diff --git a/docs/backend/catapult.rst b/docs/backend/catapult.rst
new file mode 100644
index 0000000000..00cf0fb98b
--- /dev/null
+++ b/docs/backend/catapult.rst
@@ -0,0 +1,7 @@
+========
+Catapult
+========
+
+Support for the Siemens Catapult HLS compiler was added in ``hls4ml`` version 1.0.0.
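+
+As with the other backends, Catapult is selected through the ``backend`` argument of the configuration and conversion functions. The following is a minimal sketch; the output directory is a placeholder and further Catapult-specific options are not shown:
+
+.. code-block:: Python
+
+    import hls4ml
+
+    config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Catapult')
+    hls_model = hls4ml.converters.convert_from_keras_model(
+        model, hls_config=config, output_dir='my-Catapult-test', backend='Catapult'
+    )
+    hls_model.compile()  # build the C++ library used by predict()
+    hls_model.build()    # run Catapult HLS synthesis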
+ +*TODO expand this section* diff --git a/docs/advanced/oneapi.rst b/docs/backend/oneapi.rst similarity index 58% rename from docs/advanced/oneapi.rst rename to docs/backend/oneapi.rst index ae0e0bc56b..585bfc27cb 100644 --- a/docs/advanced/oneapi.rst +++ b/docs/backend/oneapi.rst @@ -1,25 +1,24 @@ -============== -oneAPI Backend -============== +====== +oneAPI +====== -The ``oneAPI`` backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It will eventually -replace the ``Quartus`` backend, which should really have been called the Intel HLS backend. (The actual Quartus -program continues to be used with IP produced by the ``oneAPI`` backend.) -This section discusses details of the ``oneAPI`` backend. +The **oneAPI** backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It will eventually +replace the **Quartus** backend, which targeted Intel HLS. (Quartus continues to be used with IP produced by the +**oneAPI** backend.) This section discusses details of the **oneAPI** backend. -The ``oneAPI`` code uses SYCL kernels to implement the logic that is deployed on FPGAs. It naturally leads to the -accelerator style of programming. In the IP Component flow, which is currently the only flow supported, the +The **oneAPI** code uses SYCL kernels to implement the logic that is deployed on FPGAs. It naturally leads to the +accelerator style of programming. In the SYCL HLS (IP Component) flow, which is currently the only flow supported, the kernel becomes the IP, and the "host code" becomes the testbench. An accelerator flow, with easier deployment on PCIe accelerator boards, is planned to be added in the future. The produced work areas use cmake to build the projects in a style based `oneAPI-samples `_. -The standard ``fpga_emu``, ``report``, ``fpga_sim``, and ``fpga`` are supported. Additionally, ``make lib`` +The standard ``fpga_emu``, ``report``, ``fpga_sim``, and ``fpga`` make targets are supported. Additionally, ``make lib`` produces the library used for calling the ``predict`` function from hls4ml. The ``compile`` and ``build`` commands in hls4ml interact with the cmake system, so one does not need to manually use the build system, but it there if desired. -The ``oneAPI`` backend, like the ``Quartus`` backend, only implements the ``Resource`` strategy for the layers. There +The **oneAPI** backend, like the **Quartus** backend, only implements the ``Resource`` strategy for the layers. There is no ``Latency`` implementation of any of the layers. Note: currently tracing and external weights (i.e. setting BramFactor) are not supported. @@ -30,6 +29,7 @@ io_parallel and io_stream As mentioned in the :ref:`I/O Types` section, ``io_parallel`` is for small models, while ``io_stream`` is for larger models. In ``oneAPI``, there is an additional difference: ``io_stream`` implements each layer on its own ``task_sequence``. Thus, the layers run in parallel, with pipes connecting the inputs and outputs. This -is similar in style to the `dataflow` implementation on Vitis, but more explicit. On the other hand, ``io_parallel`` -always uses a single task, relying on pipelining within the task for good performance. In contrast, the Vitis -backend sometimes uses dataflow with ``io_parallel``. +is similar in style to the `dataflow` implementation on Vitis HLS, but more explicit. It is also a change +relative to the Intel HLS-based ``Quartus`` backend. On the other hand, ``io_parallel`` always uses a single task, +relying on pipelining within the task for good performance. 
In contrast, the Vitis backend sometimes uses dataflow
+with ``io_parallel``.
diff --git a/docs/backend/quartus.rst b/docs/backend/quartus.rst
new file mode 100644
index 0000000000..8cde5f97b2
--- /dev/null
+++ b/docs/backend/quartus.rst
@@ -0,0 +1,12 @@
+=======
+Quartus
+=======
+
+.. warning::
+    The **Quartus** backend is deprecated and will be removed in a future version. Users should migrate to the **oneAPI** backend.
+
+The **Quartus** backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It uses the discontinued Intel HLS compiler. The **oneAPI** backend should be preferred for new projects.
+The **oneAPI** backend contains the HLS code migrated from this backend, with significantly better io_stream support, though the **oneAPI** backend does not yet support profiling, tracing,
+or the BramFactor option supported by the **Quartus** backend. Nevertheless, little or no further development is expected for this backend.
+
+The **Quartus** backend only implements the ``Resource`` strategy for the layers. There is no ``Latency`` implementation of any of the layers.
diff --git a/docs/backend/sr.rst b/docs/backend/sr.rst
new file mode 100644
index 0000000000..93a247b63d
--- /dev/null
+++ b/docs/backend/sr.rst
@@ -0,0 +1,7 @@
+==================
+SymbolicExpression
+==================
+
+This backend can be used to implement expressions obtained through symbolic regression tools such as `PySR `_ or `SymbolNet `_. The backend targets Vivado/Vitis HLS and relies on HLS math libraries provided with a licensed installation of these tools.
+
+*TODO expand this section*
diff --git a/docs/backend/vitis.rst b/docs/backend/vitis.rst
new file mode 100644
index 0000000000..9528e89a93
--- /dev/null
+++ b/docs/backend/vitis.rst
@@ -0,0 +1,11 @@
+============
+Vivado/Vitis
+============
+
+The **Vivado** and **Vitis** backends are intended for use with AMD/Xilinx FPGAs. The **Vivado** backend targets the discontinued ``Vivado HLS`` compiler, while
+the **Vitis** backend targets the ``Vitis HLS`` compiler. Both are designed to produce IP for incorporation in ``Vivado`` designs. (See :doc:`VivadoAccelerator `
+for generating easily-deployable models with ``Vivado HLS``.) The ``Vitis`` accelerator flow is not directly supported, though HLS produced with the **Vitis**
+backend can be easily incorporated into a Vitis kernel.
+
+Users should generally use the **Vitis** backend for new designs that target AMD/Xilinx FPGAs; new ``hls4ml`` developments will not necessarily be backported to
+the **Vivado** backend.
diff --git a/docs/concepts.rst b/docs/concepts.rst
deleted file mode 100644
index b788d5ba5d..0000000000
--- a/docs/concepts.rst
+++ /dev/null
@@ -1,69 +0,0 @@
-========
-Concepts
-========
-
-The goal of ``hls4ml`` is to provide an efficient and fast translation of machine learning models from open-source packages (like Keras and PyTorch) for training machine learning algorithms to high level synthesis (HLS) code that can then be transpiled to run on an FPGA. The resulting HLS project can be then used to produce an IP which can be plugged into more complex designs or be used to create a kernel for CPU co-processing. The user has freedom to define many of the parameters of their algorithm to best suit their needs.
- -The ``hls4ml`` package enables fast prototyping of a machine learning algorithm implementation in FPGAs, -greatly reducing the time to results and giving the user intuition for how to best design a machine learning algorithm for their application while balancing performance, resource utilization and latency requirements. - -The Inspiration -=============== - -The inspiration for the creation of the ``hls4ml`` package stems from the high energy physics community at the CERN Large Hadron Collider (LHC). -While machine learning has already been proven to be extremely useful in analysis of data from detectors at the LHC, it is typically performed in an "offline" environment after the data is taken and agglomerated. -However, one of the largest problems at detectors on the LHC is that collisions, or "events", generate too much data for everything to be saved. -As such, filters called "triggers" are used to determine whether a given event should be kept. -Using FPGAs allows for significantly lower latency so machine learning algorithms can essentially be run "live" at the detector level for event selection. As a result, more events with potential signs of new physics can be preserved for analysis. - -The Solution: ``hls4ml`` -======================== - -.. image:: img/overview.jpg - - -With this in mind, let's take a look at how ``hls4ml`` helps to achieve such a goal. First, it's important to realize the architecture differences between an FPGA and a CPU or GPU. -An FPGA can be specifically programmed to do a certain task, in this case evaluate neural networks given a set of inputs, and as such can be highly optimized for the task, with tricks like pipelining and parallel evaluation. However, this means dynamic remapping while running isn't really a possibility. -FPGAs also often come at a comparatively low power cost with respect to CPUs and GPUs. This allows ``hls4ml`` to build HLS code from compressed neural networks that results in predictions on the microsecond scale for latency. -The ``hls4ml`` tool saves the time investment needed to convert a neural network to a hardware design language or even HLS code, thus allowing for rapid prototyping. - -How it Works -============= - -.. image:: img/nn_map_paper_fig_2.png - :width: 70% - :align: center - - -Consider a multilayer neural network. At each neuron in a layer :math:`m` (containing :math:`N_m` neurons), we calculate an output value (part of the output vector :math:`\mathbf{x}_m` of said layer) using the sum of output values of the previous layer multiplied by independent weights for each of these values and a bias value. An activation function is performed on the result to get the final output value for the neuron. Representing the weights as a :math:`N_m` by :math:`N_{m-1}` matrix :math:`W_{m,m-1}`, the bias values as :math:`\mathbf{b}_m`, and the activation function as :math:`g_m`, we can express this compactly as: - - -.. math:: - - \mathbf{x}_m = g_m (W_{m,m-1} \mathbf{x}_{m-1} +\mathbf{b}_m) - -With hls4ml, each layer of output values is calculated independently in sequence, using pipelining to speed up the process by accepting new inputs after an initiation interval. -The activations, if nontrivial, are precomputed. 
- -To ensure optimal performance, the user can control aspects of their model, principally: - - -* **Size/Compression** - Though not explicitly part of the ``hls4ml`` package, this is an important optimization to efficiently use the FPGA resources -* **Precision** - Define the :doc:`precision ` of the calculations in your model -* **Dataflow/Resource Reuse** - Control parallel or streaming model implementations with varying levels of pipelining -* **Quantization Aware Training** - Achieve best performance at low precision with tools like QKeras, and benefit automatically during inference with ``hls4ml`` parsing of QKeras models - - -.. image:: img/reuse_factor_paper_fig_8.png - :width: 70% - :align: center - - -Often, these decisions will be hardware dependent to maximize performance. -Of note is that simplifying the input network must be done before using ``hls4ml`` to generate HLS code, for optimal compression to provide a sizable speedup. -Also important to note is the use of fixed point arithmetic in ``hls4ml``. -This improves processing speed relative to floating point implementations. -The ``hls4ml`` package also offers the functionality of configuring binning and output bit width of the precomputed activation functions as necessary. With respect to parallelization and resource reuse, ``hls4ml`` offers a "reuse factor" parameter that determines the number of times each multiplier is used in order to compute a layer of neuron's values. Therefore, a reuse factor of one would split the computation so each multiplier had to only perform one multiplication in the computation of the output values of a layer, as shown above. Conversely, a reuse factor of four, in this case, uses a single multiplier four times sequentially. Low reuse factor achieves the lowest latency and highest throughput but uses the most resources, while high reuse factor save resources at the expense of longer latency and lower throughput. -The reuse factor can be set using the configuration options defined on the :doc:`Setup ` page. - -Thereby, the ``hls4ml`` package builds efficient HLS code to implement neural networks on FPGAs for microsecond-scale latency on predictions. For more detailed information, take a look at our :doc:`References ` page. All figures on this page are taken from the following paper: `JINST 13 P07027 (2018) `_. diff --git a/docs/details.rst b/docs/details.rst deleted file mode 100644 index 750833001d..0000000000 --- a/docs/details.rst +++ /dev/null @@ -1,33 +0,0 @@ -================ -Software Details -================ - -Frontends and Backends ----------------------- - -In ``hls4ml`` there is a a concept of a *frontend* to parse the input NN into an internal model graph, and a *backend* that controls -what type of output is produced from the graph. Frontends and backends can be independently chosen. Examples of frontends are the -parsers for Keras or ONNX, and examples of backends are Vivado HLS, Intel HLS, and Vitis HLS. See :ref:`Status and Features` for the -currently supported frontends and backends. - -I/O Types ---------- - -``hls4ml`` supports multiple styles for handling data between layers, known as the ``io_type``. - -io_parallel -^^^^^^^^^^^ -Data is passed in parallel between the layers. This is good for MLP networks and small CNNs. Synthesis may fail for larger networks. - -io_stream -^^^^^^^^^ -Data is passed one "pixel" at a time. Each pixel is an array of channels, which are always sent in parallel. This method for sending -data between layers is recommended for larger CNNs. 
For ``Dense`` layers, all the inputs are streamed in parallel as a single array.
-
-With the ``io_stream`` IO type, each layer is connected with the subsequent layer through first-in first-out (FIFO) buffers.
-The implementation of the FIFO buffers contribute to the overall resource utilization of the design, impacting in particular the BRAM or LUT utilization.
-Because the neural networks can have complex architectures generally, it is hard to know a priori the correct depth of each FIFO buffer.
-By default ``hls4ml`` choses the most conservative possible depth for each FIFO buffer, which can result in a an unnecessary overutilization of resources.
-
-In order to reduce the impact on the resources used for FIFO buffer implementation, we have a FIFO depth optimization flow. This is described
-in the :ref:`FIFO Buffer Depth Optimization` section.
diff --git a/docs/frontend/keras.rst b/docs/frontend/keras.rst
new file mode 100644
index 0000000000..d6d42cb4b8
--- /dev/null
+++ b/docs/frontend/keras.rst
@@ -0,0 +1,11 @@
+================
+Keras and QKeras
+================
+
+Keras and the quantization library QKeras are well supported in ``hls4ml``. Currently, Keras v2 (``tf.keras``) is the preferred version, and future versions of ``hls4ml`` will expand support for Keras v3. The frontend is based on parsing the serialized JSON representation of the model.
+
+Currently, ``hls4ml`` can parse most Keras layers, including core layers, convolutional layers, pooling layers, recurrent layers, merging/reshaping layers and activation layers, implemented via either the Sequential or functional API. Notably missing are the attention and normalization layers. The equivalent QKeras API and quantizers are also supported. ``Lambda`` layers don't save their state in the serialized format and are thus impossible to parse. In this case, the ``Lambda`` layers can be implemented as custom layers and parsed via the :ref:`Extension API`.
+
+The ``data_format='channels_first'`` parameter of Keras layers is supported, but not extensively tested. All HLS implementations in ``hls4ml`` are based on the ``channels_last`` data format and need to be converted to that format before the HLS code can be emitted. We encourage users of ``channels_first`` to report their experiences to developers on GitHub.
+
+The development team of ``hls4ml`` is currently exploring options for a QKeras alternative and will provide a drop-in replacement API compatible with Keras v3.
diff --git a/docs/frontend/pytorch.rst b/docs/frontend/pytorch.rst
new file mode 100644
index 0000000000..6e91d0c44e
--- /dev/null
+++ b/docs/frontend/pytorch.rst
@@ -0,0 +1,20 @@
+====================
+PyTorch and Brevitas
+====================
+
+The PyTorch frontend in ``hls4ml`` is implemented by parsing the symbolic trace of the ``torch.fx`` framework. This ensures the proper execution graph is captured. Therefore, only models that can be traced with the FX framework can be parsed by ``hls4ml``.
+
+Provided the underlying operation is supported in ``hls4ml``, we generally aim to support the use of both ``torch.nn`` classes and ``torch.nn.functional`` functions in the construction of PyTorch models. Generally, the use of classes is more thoroughly
+tested. Please reach out if you experience any issues with either case.
+
+The PyTorch/Brevitas parser is under heavy development and doesn't yet have the same feature set as the Keras parser. Feel free to reach out to developers if you find a missing feature that is present in the Keras parser and would like it implemented.
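+
+Since parsing relies on a successful ``torch.fx`` symbolic trace, a quick way to check up front whether a model can be ingested at all is to trace it yourself. This is a minimal sketch using only PyTorch, independent of any ``hls4ml`` API; ``MyModel`` is a placeholder for your own ``torch.nn.Module``:
+
+.. code-block:: Python
+
+    import torch.fx
+
+    model = MyModel()
+    model.eval()
+
+    # raises an error if the model cannot be traced by the FX framework
+    graph_module = torch.fx.symbolic_trace(model)
+    print(graph_module.graph)  # inspect the captured execution graph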
+
+.. note::
+    The direct ingestion of models quantized with brevitas is not supported currently. Instead, brevitas models should be exported in the ONNX format (see `here `_) and read with the ``hls4ml``
+    QONNX frontend. Issues may arise, for example when non power-of-2 or non-scalar quantization scales are used. Please reach out if you encounter any problems with this workflow.
+
+For multi-dimensional tensors, ``hls4ml`` follows the channels-last convention adopted by Keras, whereas PyTorch uses channels-first. By default, ``hls4ml`` will automatically transpose any tensors associated with weights and biases of the internal layers
+of the model. If the ``io_parallel`` I/O type (see :ref:`Concepts`) is used, a transpose node will be added to the model that also adjusts the input tensors. This is not available in the ``io_stream`` case and inputs must be transposed by the user.
+Outputs are not transposed back by default, but in the ``io_parallel`` case, a transpose node can be added. If not needed, these adjustments can also be switched off. See :py:class:`~hls4ml.utils.config.config_from_pytorch_model` for details.
+
+The equivalent of the Keras extension API is not yet available for the PyTorch parser, and will be provided in the future.
diff --git a/docs/advanced/qonnx.rst b/docs/frontend/qonnx.rst
similarity index 100%
rename from docs/advanced/qonnx.rst
rename to docs/frontend/qonnx.rst
diff --git a/docs/index.rst b/docs/index.rst
index 339c4cfd42..ff92a3d543 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -2,32 +2,64 @@
    :hidden:
    :caption: Introduction

-   concepts
-   status
-   setup
-   release_notes
-   details
-   flows
-   command
-   reference
+   intro/introduction
+   intro/status
+   intro/setup
+   intro/faq
+   intro/release_notes
+   intro/reference

 .. toctree::
    :hidden:
    :glob:
-   :caption: Quick API Reference
+   :caption: User Guide

-   api/*
+   api/concepts
+   api/configuration
+   api/command
+
+.. toctree::
+   :hidden:
+   :glob:
+   :caption: Frontends
+
+   frontend/keras
+   frontend/pytorch
+   frontend/qonnx
+
+.. toctree::
+   :hidden:
+   :glob:
+   :caption: Backends
+
+   backend/vitis
+   backend/accelerator
+   backend/oneapi
+   backend/catapult
+   backend/quartus
+   backend/sr

 .. toctree::
    :hidden:
    :caption: Advanced Features

-   advanced/qonnx
+   advanced/profiling
+   advanced/auto
+   advanced/hgq
    advanced/fifo_depth
    advanced/extension
-   advanced/oneapi
-   advanced/accelerator
    advanced/model_optimization
+   advanced/bramfactor
+
+.. toctree::
+   :hidden:
+   :glob:
+   :caption: Internals
+
+   ir/ir
+   ir/modelgraph
+   ir/flows
+   ir/attributes

 .. toctree::
    :hidden:
@@ -61,6 +93,4 @@ For the latest status including current and planned features, see the :ref:`Stat
 Tutorials
 =================================

-Detailed tutorials on how to use ``hls4ml``'s various functionalities can be found at:
-
-https://github.com/fastmachinelearning/hls4ml-tutorial
+Detailed tutorials on how to use ``hls4ml``'s various functionalities can be found `here `_.
diff --git a/docs/intro/faq.rst b/docs/intro/faq.rst
new file mode 100644
index 0000000000..22b4c6c99a
--- /dev/null
+++ b/docs/intro/faq.rst
@@ -0,0 +1,52 @@
+Frequently asked questions
+==========================
+
+**What is hls4ml?**
+
+``hls4ml`` is a tool for converting neural network models into FPGA firmware.
hls4ml is aimed at low-latency applications, such as triggering at the Large Hadron Collider (LHC) at CERN, but is applicable to other domains requiring microsecond latency. See the full documentation for more details. + +**How does hls4ml work?** + +``hls4ml`` takes models from Keras, PyTorch and ONNX (optionally quantized with the respective quantization libraries) and produces high-level synthesis code (based on C++) that can be converted to FPGA firmware using the HLS compilers from different vendors (AMD/Xilinx, Intel/Altera, Catapult...). + +**How is hls4ml so fast?** + +``hls4ml`` stores all weights on-chip for fast access and has tuneable parallelism. As a consequence, the size of the model that can be successfully converted into firmware with hls4ml largely depends on the amount of available resources on the target FPGA. Therefore, it is highly recommended to compress the model with quantization (via QKeras or HGQ for Keras, or Brevitas for PyTorch) and pruning. Additionally, ``hls4ml`` exploits the parallelism available in an FPGA or ASIC by implementing a spatial dataflow architecture. + +**Will my model work with hls4ml?** + +``hls4ml`` supports many common layers found in MLP, CNN and RNN architectures; however, some seldom-used features of these layers may not be supported. Novel architectures such as graph networks or transformers are in various stages of development and are currently not stable for end-users. See the status and features page for more information. Models with custom layers can be supported through the extension API. If you encounter a feature not yet supported, open a new issue. + +**Will my model with X parameters fit on FPGA model Y?** + +It depends. ``hls4ml`` has been successfully used with quantized models with O(10k) parameters, while for some architectures going beyond O(1000) parameters is not doable even on the largest FPGAs. The number of parameters of a model is generally not a good estimate of the performance on an FPGA, as the computational complexity of different types of NN layers has a big effect on the resource consumption. For example, a CNN or GNN may reuse the same parameter in many operations. Furthermore, model compression in the form of quantization and pruning can significantly change the footprint of the model on the FPGA. For these reasons, we discourage the use of this metric for estimating performance. + +If you're looking for a quick estimate of the resource usage and latency for a given model without synthesis, look into the `rule4ml `_ and `wa-hls4ml `_ projects. + +LLMs and large vision transformers are neither supported nor planned. + +**How do I get started with hls4ml?** + +We strongly recommend that interested users unfamiliar with FPGAs or model compression techniques review the `hls4ml tutorials `_ to get an overview of the features and the conversion workflow. + +**How do I contribute to hls4ml development?** + +We always welcome new contributions. If you have an interesting feature in mind, feel free to start a new discussion thread with your proposal. We also hold regular online meetings to discuss the status of developments, where you can be invited to present your work. To receive announcements, `request to be added to our CERN e-group `_. Furthermore, check the `CONTRIBUTING `_ document for a set of technical requirements for making contributions to the hls4ml project. + + +Common HLS synthesis issues +*************************** + +**Stop unrolling loop ...
because it may cause large runtime and excessive memory usage due to increase in code size.** + +This error is common with models that are too large to fit on the FPGA given the ``IOType`` used. If you are using ``io_parallel``, consider switching to ``io_stream``, which prevents unrolling all arrays. It may help to also use the ``Resource`` strategy. Pruning or quantizing the model may not help, as the issue is related to the size of the loops. If possible, try to reduce the number of neurons/filters of your model to reduce the size of the activation tensors and thus the number of loop iterations. + +**cannot open shared object file ...: No such file or directory.** + +This is usually an indication that the compilation failed due to incorrect HLS code being produced. It is most likely a bug in hls4ml. Please open a bug report. Note that the displayed error message may be the same but the cause can be different. Unless you're sure that the existing bug reports show the same underlying issue, it is better to open a separate bug report. + +**My hls4ml predictions don't match the original Keras/PyTorch/ONNX ones** + +``hls4ml`` uses fixed-point precision types to represent internal data structures, unlike the floating-point precision types used for computation in upstream ML toolkits. If the used bit width is not sufficiently wide, you may encounter issues with computation accuracy that propagate through the layers. This is especially true for models that are not fully quantized, or models with insufficient ``accum_t`` bitwidth. Look into automatic precision inference and profiling tools to resolve the issue. + +Note that bit-exact behavior is not always possible, as many math functions (used by activation functions) are approximated with lookup tables. diff --git a/docs/intro/introduction.rst b/docs/intro/introduction.rst new file mode 100644 index 0000000000..8d603bd78f --- /dev/null +++ b/docs/intro/introduction.rst @@ -0,0 +1,30 @@ +============ +Introduction +============ + +The goal of ``hls4ml`` is to provide an efficient and fast translation of machine learning models from open-source packages used for training machine learning algorithms (like Keras and PyTorch) to high-level synthesis (HLS) code that can then be transpiled to run on an FPGA. The resulting HLS project can then be used to produce an IP which can be plugged into more complex designs or be used to create a kernel for CPU co-processing. The user has the freedom to define many of the parameters of their algorithm to best suit their needs. + +The ``hls4ml`` package enables fast prototyping of a machine learning algorithm implementation in FPGAs, +greatly reducing the time to results and giving the user intuition for how to best design a machine learning algorithm for their application while balancing performance, resource utilization and latency requirements. + +The Inspiration +=============== + +The inspiration for the creation of the ``hls4ml`` package stems from the high energy physics community at the CERN Large Hadron Collider (LHC). +While machine learning has already been proven to be extremely useful in the analysis of data from detectors at the LHC, it is typically performed in an "offline" environment after the data is taken and agglomerated. +However, one of the largest problems at detectors at the LHC is that collisions, or "events", generate too much data for everything to be saved. +As such, filters called "triggers" are used to determine whether a given event should be kept.
+Using FPGAs allows for significantly lower latency so machine learning algorithms can essentially be run "live" at the detector level for event selection. As a result, more events with potential signs of new physics can be preserved for analysis. + +The Solution: ``hls4ml`` +======================== + +.. image:: ../img/overview.jpg + + +With this in mind, let's take a look at how ``hls4ml`` helps to achieve such a goal. First, it's important to realize the architecture differences between an FPGA and a CPU or GPU. +An FPGA can be specifically programmed to do a certain task, in this case evaluate neural networks given a set of inputs, and as such can be highly optimized for the task, with tricks like pipelining and parallel evaluation. However, this means dynamic remapping while running isn't really a possibility. +FPGAs also often come at a comparatively low power cost with respect to CPUs and GPUs. This allows ``hls4ml`` to build HLS code from compressed neural networks that results in predictions with microsecond-scale latency. +The ``hls4ml`` tool saves the time investment needed to convert a neural network to a hardware design language or even HLS code, thus allowing for rapid prototyping. + +For more information on the technical details of ``hls4ml``, read the "Internals" section of our documentation or our :doc:`References ` page. All figures on this page are taken from the following paper: `JINST 13 P07027 (2018) `_. diff --git a/docs/reference.rst b/docs/intro/reference.rst similarity index 99% rename from docs/reference.rst rename to docs/intro/reference.rst index f271679620..0bd5912bb1 100644 --- a/docs/reference.rst +++ b/docs/intro/reference.rst @@ -12,9 +12,9 @@ If you use this software in a publication, please cite the software @software{fastml_hls4ml, author = {{FastML Team}}, title = {fastmachinelearning/hls4ml}, - year = 2023, + year = 2024, publisher = {Zenodo}, - version = {v0.8.1}, + version = {v1.0.0}, doi = {10.5281/zenodo.1201549}, url = {https://github.com/fastmachinelearning/hls4ml} } diff --git a/docs/release_notes.rst b/docs/intro/release_notes.rst similarity index 100% rename from docs/release_notes.rst rename to docs/intro/release_notes.rst diff --git a/docs/setup.rst b/docs/intro/setup.rst similarity index 50% rename from docs/setup.rst rename to docs/intro/setup.rst index a735281c3f..6ba0c4ce0e 100644 --- a/docs/setup.rst +++ b/docs/intro/setup.rst @@ -14,7 +14,7 @@ The latest release of ``hls4ml`` can be installed with ``pip``: pip install hls4ml -If you want to use our :doc:`profiling ` toolbox, you might need to install extra dependencies: +If you want to use our :doc:`profiling <../advanced/profiling>` toolbox, you might need to install extra dependencies: .. code-block:: @@ -43,29 +43,36 @@ version can be installed directly from ``git``: Dependencies ============ -The ``hls4ml`` library depends on a number of Python packages and external tools for synthesis and simulation. Python dependencies are automatically managed +The ``hls4ml`` library requires Python 3.10 or later, and depends on a number of Python packages and external tools for synthesis and simulation. Python dependencies are automatically managed by ``pip`` or ``conda``. -* `TensorFlow `_ (version 2.4 and newer) and `QKeras `_ are required by the Keras converter. +* `TensorFlow `_ (version 2.8 to 2.14) and `QKeras `_ are required by the Keras converter. One may want to install newer versions of QKeras from GitHub.
Newer versions of TensorFlow can be used, but QKeras and hls4ml do not currently support Keras v3. + * `ONNX `_ (version 1.4.0 and newer) is required by the ONNX converter. + * The `PyTorch `_ package is optional. If not installed, the PyTorch converter will not be available. Running C simulation from Python requires a C++11-compatible compiler. On Linux, a GCC C++ compiler ``g++`` is required. Any version from a recent -Linux should work. On MacOS, the *clang*-based ``g++`` is enough. +Linux should work. On MacOS, the *clang*-based ``g++`` is enough. For the oneAPI backend, one must have oneAPI installed, along with the FPGA compiler, +to run C/SYCL simulations. To run FPGA synthesis, installation of the following tools is required: -* Xilinx Vivado HLS 2018.2 to 2020.1 for synthesis for Xilinx FPGAs +* Xilinx Vivado HLS 2018.2 to 2020.1 for synthesis for Xilinx FPGAs using the ``Vivado`` backend. + +* Vitis HLS 2022.2 or newer is required for synthesis for Xilinx FPGAs using the ``Vitis`` backend. - * Vitis HLS 2022.2 or newer is required for synthesis for Xilinx FPGAs using the ``Vitis`` backend. +* Intel Quartus 20.1 to 21.4 for the synthesis for Intel/Altera FPGAs using the ``Quartus`` backend. -* Intel Quartus 20.1 to 21.4 for the synthesis for Intel FPGAs + +* oneAPI 2024.1 to 2025.0 with the FPGA compiler and recent Intel/Altera Quartus for Intel/Altera FPGAs using the ``oneAPI`` backend. + +Catapult HLS 2024.1_1 or 2024.2 can be used to synthesize for both ASICs and FPGAs. Quick Start ============= -For basic concepts to understand the tool, please visit the :doc:`Concepts ` chapter. +For basic concepts to understand the tool, please visit the :doc:`Concepts <../api/concepts>` chapter. Here we give line-by-line instructions to demonstrate the general workflow. .. code-block:: python @@ -98,78 +105,79 @@ After that, you can use :code:`Vivado HLS` to synthesize the model: Done! You've built your first project using ``hls4ml``! To learn more about our various API functionalities, check out our tutorials `here `__. -If you want to configure your model further, check out our :doc:`Configuration ` page. +If you want to configure your model further, check out our :doc:`Configuration <../api/configuration>` page. -Apart from our main API, we also support model conversion using a command line interface, check out our next section to find out more: +.. Apart from our main API, we also support model conversion using a command line interface, check out our next section to find out more: -Getting started with hls4ml CLI (deprecated) -------------------------------------------- + Getting started with hls4ml CLI (deprecated) + -------------------------------------------- -As an alternative to the recommended Python PI, the command-line interface is provided via the ``hls4ml`` command. + As an alternative to the recommended Python API, the command-line interface is provided via the ``hls4ml`` command. -To follow this tutorial, you must first download our ``example-models`` repository: + To follow this tutorial, you must first download our ``example-models`` repository: -.. code-block:: bash + .. code-block:: bash - git clone https://github.com/fastmachinelearning/example-models + git clone https://github.com/fastmachinelearning/example-models -Alternatively, you can clone the ``hls4ml`` repository with submodules + Alternatively, you can clone the ``hls4ml`` repository with submodules -.. code-block:: bash + ..
code-block:: bash - git clone --recurse-submodules https://github.com/fastmachinelearning/hls4ml + git clone --recurse-submodules https://github.com/fastmachinelearning/hls4ml -The model files, along with other configuration parameters, are defined in the ``.yml`` files. -Further information about ``.yml`` files can be found in :doc:`Configuration ` page. + The model files, along with other configuration parameters, are defined in the ``.yml`` files. + Further information about ``.yml`` files can be found in :doc:`Configuration ` page. -In order to create an example HLS project, first go to ``example-models/`` from the main directory: + In order to create an example HLS project, first go to ``example-models/`` from the main directory: -.. code-block:: bash + .. code-block:: bash - cd example-models/ + cd example-models/ -And use this command to translate a Keras model: + And use this command to translate a Keras model: -.. code-block:: bash + .. code-block:: bash - hls4ml convert -c keras-config.yml + hls4ml convert -c keras-config.yml -This will create a new HLS project directory with an implementation of a model from the ``example-models/keras/`` directory. -To build the HLS project, do: + This will create a new HLS project directory with an implementation of a model from the ``example-models/keras/`` directory. + To build the HLS project, do: -.. code-block:: bash + .. code-block:: bash - hls4ml build -p my-hls-test -a + hls4ml build -p my-hls-test -a -This will create a Vivado HLS project with your model implementation! + This will create a Vivado HLS project with your model implementation! -**NOTE:** For the last step, you can alternatively do the following to build the HLS project: + **NOTE:** For the last step, you can alternatively do the following to build the HLS project: -.. code-block:: Bash + .. code-block:: Bash - cd my-hls-test - vivado_hls -f build_prj.tcl + cd my-hls-test + vivado_hls -f build_prj.tcl -``vivado_hls`` can be controlled with: + ``vivado_hls`` can be controlled with: -.. code-block:: bash + .. code-block:: bash - vivado_hls -f build_prj.tcl "csim=1 synth=1 cosim=1 export=1 vsynth=1" + vivado_hls -f build_prj.tcl "csim=1 synth=1 cosim=1 export=1 vsynth=1" -Setting the additional parameters from ``1`` to ``0`` disables that step, but disabling ``synth`` also disables ``cosim`` and ``export``. + Setting the additional parameters from ``1`` to ``0`` disables that step, but disabling ``synth`` also disables ``cosim`` and ``export``. -Further help -^^^^^^^^^^^^ + Further help + ^^^^^^^^^^^^ -* For further information about how to use ``hls4ml``\ , do: ``hls4ml --help`` or ``hls4ml -h`` -* If you need help for a particular ``command``\ , ``hls4ml command -h`` will show help for the requested ``command`` -* We provide a detailed documentation for each of the command in the :doc:`Command Help <../command>` section + * For further information about how to use ``hls4ml``\ , do: ``hls4ml --help`` or ``hls4ml -h`` + * If you need help for a particular ``command``\ , ``hls4ml command -h`` will show help for the requested ``command`` + * We provide a detailed documentation for each of the command in the :doc:`Command Help ` section Existing examples ----------------- -* Examples of model files and weights can be found in `example_models `_ directory. * Training codes and examples of resources needed to train the models can be found in the `tutorial `__. +* Examples of model files and weights can be found in `example_models `_ directory. 
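For a quick way to try one of these examples directly from Python, the example models can also be fetched with the utility functions sketched below. This is a hedged sketch assuming the ``fetch_example_list``/``fetch_example_model`` utilities and the ``KERAS_3layer.json`` example name; consult the API reference if these helpers or names have changed.

.. code-block:: Python

    import hls4ml

    # Print the list of example models available in the example-models repository
    hls4ml.utils.fetch_example_list()

    # Download one of the listed models together with its default configuration
    config = hls4ml.utils.fetch_example_model('KERAS_3layer.json')

    # Convert the example into an HLS project and compile it for C simulation
    hls_model = hls4ml.converters.keras_to_hls(config)
    hls_model.compile()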
Uninstalling ------------ diff --git a/docs/status.rst b/docs/intro/status.rst similarity index 81% rename from docs/status.rst rename to docs/intro/status.rst index 4ff4d33282..5d3f3591f2 100644 --- a/docs/status.rst +++ b/docs/intro/status.rst @@ -18,8 +18,8 @@ A list of supported ML frameworks, HLS backends, and neural network architecture ML framework support: * (Q)Keras -* PyTorch (limited) -* (Q)ONNX (in development) +* PyTorch +* (Q)ONNX Neural network architectures: @@ -32,7 +32,9 @@ HLS backends: * Vivado HLS * Intel HLS -* Vitis HLS (experimental) +* Vitis HLS +* Catapult HLS +* oneAPI (experimental) A summary of the on-going status of the ``hls4ml`` tool is in the table below. @@ -46,35 +48,44 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below. - Vivado HLS - Intel HLS - Vitis HLS + - Catapult HLS + - oneAPI * - MLP - ``supported`` - - ``limited`` - - ``in development`` + - ``supported`` + - ``supported`` + - ``supported`` + - ``supported`` - ``supported`` - ``supported`` - ``experimental`` * - CNN - ``supported`` - - ``limited`` - - ``in development`` + - ``supported`` + - ``supported`` + - ``supported`` + - ``supported`` - ``supported`` - ``supported`` - ``experimental`` * - RNN (LSTM) + - ``supported`` - ``supported`` - ``N/A`` - - ``in development`` - ``supported`` - ``supported`` - - ``N/A`` + - ``supported`` + - ``supported`` + - ``experimental`` * - GNN (GarNet) - ``supported`` + - ``in development`` + - ``N/A`` - ``N/A`` - ``N/A`` - ``N/A`` - ``N/A`` - ``N/A`` - Other feature notes: @@ -82,6 +93,9 @@ Other feature notes: * Vivado HLS versions 2018.2 to 2020.1 * Intel HLS versions 20.1 to 21.4 * Vitis HLS versions 2022.2 to 2024.1 + * Catapult HLS versions 2024.1_1 to 2024.2 + * oneAPI versions 2024.1 to 2025.0 + * Windows and macOS are not supported * BDT support has moved to the `Conifer `__ package diff --git a/docs/ir/attributes.rst b/docs/ir/attributes.rst new file mode 100644 index 0000000000..dfbec51b1c --- /dev/null +++ b/docs/ir/attributes.rst @@ -0,0 +1,2802 @@ +================ +Layer attributes +================ + + +Input +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Constant +======== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* value: ndarray + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Activation +========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. 
+ +* n_in: int + +* activation: str + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +ParametrizedActivation +====================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* param_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +* n_in: int + +* activation: str + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* param_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +PReLU +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* param_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +* n_in: int + +* activation: str + +Weight attributes +----------------- +* param: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +* param_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Softmax +======= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* implementation: list [latency,stable,argmax,legacy] (Default: stable) + + * Choice of implementation of softmax function. "latency" provides good latency at the expense of extra resources. performs well on small number of classes. "stable" may require extra clock cycles but has better accuracy. "legacy" is the older implementation which has bad accuracy, but is fast and has low resource use. It is superseded by the "latency" implementation for most applications. "argmax" is a special implementation that can be used if only the output with the highest probability is important. Using this implementation will save resources and clock cycles. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* skip: bool (Default: False) + + * If enabled, skips the softmax node and returns the raw outputs. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* exp_table_t: NamedType (Default: fixed<18,8,RND,SAT,0>) + + * The datatype (precision) used for the values of the lookup table. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* inv_table_t: NamedType (Default: fixed<18,8,RND,SAT,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +TernaryTanh +=========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +HardActivation +============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* slope_t: NamedType + +* shift_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +* slope: float (Default: 0.2) + +* shift: float (Default: 0.5) + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* slope_t: NamedType + +* shift_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Reshape +======= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* target_shape: Sequence + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Dense +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_out: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Conv +==== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Conv1D +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +Conv2D +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +Conv2DBatchnorm +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. 
This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +SeparableConv1D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* depthwise_t: NamedType + +* pointwise_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* depth_multiplier: int (Default: 1) + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* depthwise: WeightVariable + +* pointwise: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* depthwise_t: NamedType + +* pointwise_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* depthwise_accum_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* pointwise_accum_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* depthwise_result_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* depthwise_reuse_factor: int (Default: 1) + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* pointwise_reuse_factor: int (Default: 1) + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* dw_output_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * Available in: Catapult + +DepthwiseConv1D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +* in_width: int + +* out_width: int + +* n_chan: int + +* depth_multiplier: int (Default: 1) + +* n_filt: int + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +* weight_t: NamedType + +* bias_t: NamedType + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +SeparableConv2D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* depthwise_t: NamedType + +* pointwise_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* depth_multiplier: int (Default: 1) + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* depthwise: WeightVariable + +* pointwise: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +* depthwise_t: NamedType + +* pointwise_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* depthwise_accum_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* pointwise_accum_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* depthwise_result_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* depthwise_reuse_factor: int (Default: 1) + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* pointwise_reuse_factor: int (Default: 1) + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* dw_output_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * Available in: Catapult + +DepthwiseConv2D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* depth_multiplier: int (Default: 1) + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. 
Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +BatchNormalization +================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* scale_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_filt: int (Default: -1) + +* use_gamma: bool (Default: True) + +* use_beta: bool (Default: True) + +Weight attributes +----------------- +* scale: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* scale_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Pooling1D +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_out: int + +* n_filt: int + +* pool_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +* count_pad: bool (Default: False) + +* pool_op: list [Max,Average] + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. 
This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +Pooling2D +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_filt: int + +* pool_height: int + +* pool_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +* count_pad: bool (Default: False) + +* pool_op: list [Max,Average] + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +GlobalPooling1D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_filt: int + +* pool_op: list [Max,Average] + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +GlobalPooling2D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* n_filt: int + +* pool_op: list [Max,Average] + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +ZeroPadding1D +============= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* pad_left: int + +* pad_right: int + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +ZeroPadding2D +============= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Merge +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +MatMul +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Dot +=== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, Vivado, VivadoAccelerator, VivadoAccelerator, Vitis, Vitis, Quartus, Quartus, Catapult, Catapult, SymbolicExpression, SymbolicExpression, oneAPI, oneAPI + +Concatenate +=========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. 
+ +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Resize +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* align_corners: bool (Default: False) + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* algorithm: list [nearest,bilinear] (Default: nearest) + +Transpose +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Embedding +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* embeddings_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_out: int + +* vocab_size: int + +Weight attributes +----------------- +* embeddings: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* embeddings_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +SimpleRNN +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_out: int + +* activation: str + +* return_sequences: bool (Default: False) + +* return_state: bool (Default: False) + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* recurrent_weight: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* direction: list [forward,backward] (Default: forward) + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* recurrent_reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* static: bool (Default: True) + + * If set to True, will reuse the the same recurrent block for computation, resulting in lower resource usage at the expense of serialized computation and higher latency/II. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +LSTM +==== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +* recurrent_bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. 
+ +* n_out: int + +* activation: str + +* recurrent_activation: str + +* return_sequences: bool (Default: False) + +* return_state: bool (Default: False) + +* time_major: bool (Default: False) + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* recurrent_weight: WeightVariable + +* recurrent_bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* direction: list [forward,backward] (Default: forward) + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +* recurrent_bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* recurrent_reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* static: bool (Default: True) + + * If set to True, will reuse the the same recurrent block for computation, resulting in lower resource usage at the expense of serialized computation and higher latency/II. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +GRU +=== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +* recurrent_bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_out: int + +* activation: str + +* recurrent_activation: str + +* return_sequences: bool (Default: False) + +* return_state: bool (Default: False) + +* time_major: bool (Default: False) + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* recurrent_weight: WeightVariable + +* recurrent_bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) 
+ +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* direction: list [forward,backward] (Default: forward) + +* apply_reset_gate: list [before,after] (Default: after) + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +* recurrent_bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* recurrent_reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* static: bool (Default: True) + + * If set to True, will reuse the the same recurrent block for computation, resulting in lower resource usage at the expense of serialized computation and higher latency/II. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +GarNet +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, Vivado, VivadoAccelerator, VivadoAccelerator, Vitis, Vitis, Quartus, Quartus, Catapult, Catapult, SymbolicExpression, SymbolicExpression, oneAPI, oneAPI + +GarNetStack +=========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, Vivado, VivadoAccelerator, VivadoAccelerator, Vitis, Vitis, Quartus, Quartus, Catapult, Catapult, SymbolicExpression, SymbolicExpression, oneAPI, oneAPI + +Quant +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* narrow: bool + +* rounding_mode: str + +* signed: bool + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +ApplyAlpha +========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* scale_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_filt: int (Default: -1) + +* use_gamma: bool (Default: True) + +* use_beta: bool (Default: True) + +Weight attributes +----------------- +* scale: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* scale_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. 
Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +BatchNormOnnx +============= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +LayerGroup +========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* layer_list: list + +* input_layers: list + +* output_layers: list + +* data_reader: object + +* output_shape: list + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +SymbolicExpression +================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* expression: list + +* n_symbols: int + +* lut_functions: list (Default: []) + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +BiasAdd +======= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +FixedPointQuantizer +=================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +UnaryLUT +======== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Repack +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Clone +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +BatchNormalizationQuantizedTanh +=============================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* accum_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_filt: int (Default: 0) + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* accum_t: NamedType + +* reuse_factor: int (Default: 1) + +PointwiseConv1D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +PointwiseConv2D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +Broadcast +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. diff --git a/docs/flows.rst b/docs/ir/flows.rst similarity index 84% rename from docs/flows.rst rename to docs/ir/flows.rst index 37b8b44ff9..dbdef58896 100644 --- a/docs/flows.rst +++ b/docs/ir/flows.rst @@ -2,17 +2,6 @@ Optimizer Passes and Flows ========================== -Internal Structure ------------------- - -The ``hls4ml`` library will parse models from Keras, PyTorch or ONNX into an internal execution graph. This model graph is represented with the -:py:class:`~hls4ml.model.graph.ModelGraph` class. The nodes in this graph, corresponding to the layer and operations of the input model are represented -by classes derived from the :py:class:`~hls4ml.model.layers.Layer` base class. - -Layers are required to have defined inputs and outputs that define how they are connected in the graph and what is the shape of their output. All information -about the layer's state and configuration is stored in its attributes. All weights, variables and data types are attributes and there are mapping views to sort through them. -Layers can define expected attributes and can be verified for correctness, or to produce a list of configurable attributes that user can tweak. - Optimizer passes ---------------- diff --git a/docs/ir/ir.rst b/docs/ir/ir.rst new file mode 100644 index 0000000000..18b0a1c679 --- /dev/null +++ b/docs/ir/ir.rst @@ -0,0 +1,90 @@ +======================= +Internal representation +======================= + +The ``hls4ml`` library will parse models from Keras, PyTorch or ONNX into an internal execution graph. This model graph is represented with the +:py:class:`~hls4ml.model.graph.ModelGraph` class. 
The nodes in this graph, loosely corresponding to the layers and operations of the input model, are represented
+by classes derived from the :py:class:`~hls4ml.model.layers.Layer` base class.
+
+Layers are required to have defined inputs and outputs that determine how they are connected in the graph and what the shape of their output is. All information
+about a layer's state and configuration is stored in its attributes. All weights, variables and data types are attributes, and there are mapping views to sort through them.
+Layers can define expected attributes and can be verified for correctness, or used to produce a list of configurable attributes that the user can tweak. The complete list of attributes can be found on the :doc:`Attributes ` page.
+
+
+Layers
+======
+
+The backends of ``hls4ml`` are independent of each other and free to implement features in any suitable way, but most implementations share common concepts, which we describe here.
+
+Dense Layers
+------------
+
+One-dimensional Dense Layers
+****************************
+
+Dense layers over one-dimensional data perform a matrix-vector multiplication followed by an elementwise addition of the bias tensor. This routine is the underlying computation of many other layers as well and is reused as much as possible. It exists in several implementations across different backends, for the different ``io_type`` settings and strategies.
+
+io_parallel
+^^^^^^^^^^^
+
+All the backends have a ``Resource`` implementation, which divides the computation into a loop of ``reuse_factor`` iterations, each iteration simultaneously accessing a different part of the weight array partitioned in BRAM. There are different implementations depending on whether the reuse factor is smaller or larger than the input size. The two Xilinx backends and Catapult also provide a ``Latency`` implementation, which uses the reuse factor to control the amount of pipelining/unrolling of the whole function, while the weight array is fully partitioned into registers.
+
+io_stream
+^^^^^^^^^
+
+The io_stream implementation only wraps the io_parallel implementation with streams or pipes for communication. Internally, data is still accessed in parallel as an array.
+
+Multi-dimensional Dense Layers
+******************************
+
+Multi-dimensional Dense layers are converted to pointwise convolutions and do not directly use the above implementation.
+
+
+Convolution Layers
+------------------
+
+Standard convolution
+********************
+
+By *standard* convolution we refer to the operation represented by the ``Conv1D/2D`` layer in Keras (``Conv1d/2d`` in PyTorch). Depending on the ``io_type`` option used, there are two classes of implementations in ``hls4ml``.
+
+io_parallel
+^^^^^^^^^^^
+
+Parallel IO is applicable to small models that require a low-latency implementation; larger models face synthesizability limits very quickly.
+
+In the Vivado/Vitis backends, parallel convolution relies on the *im2col* transformation of the input, which turns convolution into a matrix-multiplication task. This task is then implemented as a sequence of matrix-vector multiplications using the routine mentioned above. The ``Latency`` and ``Resource`` strategies refer to the function used for the matrix-vector multiplication routine, with ``Resource`` allowing slightly larger models to be synthesized. Parallelism can be further controlled via the ``ParallelizationFactor``, as shown in the sketch below. The Catapult backend, in turn, uses a direct implementation of convolution via nested loops.
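+
+As an illustration, the parallelism of an ``io_parallel`` convolution layer can be tuned through the layer-level configuration. The following is only a sketch: it assumes ``keras_model`` is an already-defined Keras model, the layer name ``conv1`` is hypothetical, and the exact set of available keys depends on the backend and version, so consult the configuration documentation for the authoritative names.
+
+.. code-block:: python
+
+   import hls4ml
+
+   # Name-level granularity exposes per-layer keys such as ReuseFactor and ParallelizationFactor
+   config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name', backend='Vitis')
+
+   # 'conv1' is a hypothetical layer name; use the names of the layers in your model
+   config['LayerName']['conv1']['Strategy'] = 'Resource'
+   config['LayerName']['conv1']['ParallelizationFactor'] = 4
+
+   hls_model = hls4ml.converters.convert_from_keras_model(
+       keras_model, hls_config=config, backend='Vitis', io_type='io_parallel', output_dir='my-hls-test'
+   )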
+The ``Quartus``, ``oneAPI``, and ``Catapult`` backends also implement a ``Winograd`` algorithm, selected by setting the ``implementation`` to ``Winograd`` or ``combination``. The Winograd implementation is available only for a handful of filter-size configurations, and it provides weaker guarantees on bit accuracy and overflow. Under certain conditions it can be faster.
+
+io_stream
+^^^^^^^^^
+
+There are two main classes of io_stream implementations, ``LineBuffer`` and ``Encoded``. ``LineBuffer`` is the default and generally produces marginally better results,
+while ``Catapult`` and ``Vivado`` also implement ``Encoded``, selectable with the ``ConvImplementation`` configuration option. In all cases, the data is processed serially, one pixel at a time, with each pixel represented as an array of all its channel values.
+
+Depthwise convolution
+*********************
+
+The depthwise implementation replaces the matrix-vector multiplication in the kernel with an elementwise multiplication. The only implementation available is based on the ``Latency`` strategy and is used by both ``io_parallel`` and ``io_stream``.
+
+Pointwise convolution
+*********************
+
+Pointwise convolutions are a special case of convolution where the filter size is ``1`` for 1D or ``1x1`` for 2D.
+
+For the Vivado/Vitis backends, there is a dedicated ``io_parallel``/``Latency`` strategy implementation of 1D pointwise convolutional layers, originally developed for `arXiv:2402.01876 <https://arxiv.org/abs/2402.01876>`_.
+The reuse factor (RF) is used to split the layer execution and reuse the existing module RF times. The RF also limits the number of multipliers in each module.
+The initiation interval scales with the RF. One limitation is that it assumes ``in_width`` is divisible by the RF.
+
+Activations
+-----------
+
+Most activations without extra parameters are represented with the ``Activation`` layer, and those with a single parameter (leaky ReLU, thresholded ReLU, ELU) with ``ParametrizedActivation``. ``PReLU`` has its own class because it has a parameter matrix (stored as a weight). The hard (piecewise linear) sigmoid and tanh functions are implemented in a ``HardActivation`` layer, and ``Softmax`` has its own layer class.
+
+Backends have four softmax implementations that the user can choose from by setting the ``implementation`` parameter, as sketched below:
+
+* **latency**: Good latency, but somewhat high resource usage. It does not work well if there are many output classes.
+* **stable**: Slower but more accurate, useful in scenarios where higher accuracy is needed.
+* **legacy**: An older implementation with poor accuracy but good performance. Usually the latency implementation is preferred.
+* **argmax**: If you don't care about normalized outputs and only care about which output has the highest value, using argmax saves a lot of resources. This sets the highest value to 1 and the others to 0.
+
+The Vivado/Vitis backends additionally support skipping the softmax activation entirely and returning the raw outputs.
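+
+As a sketch of how the softmax behaviour might be selected from the user configuration: it assumes ``keras_model`` is an already-defined Keras model whose final layer is named ``softmax``, and that the attribute is exposed in the per-layer configuration as ``Implementation`` (the key name may differ between versions, so check the configuration documentation).
+
+.. code-block:: python
+
+   import hls4ml
+
+   config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name', backend='Vitis')
+
+   # Trade normalized probabilities for resources: only the largest output is set to 1
+   config['LayerName']['softmax']['Implementation'] = 'argmax'
+
+   hls_model = hls4ml.converters.convert_from_keras_model(
+       keras_model, hls_config=config, backend='Vitis', output_dir='my-hls-test'
+   )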
diff --git a/docs/api/hls-model.rst b/docs/ir/modelgraph.rst
similarity index 58%
rename from docs/api/hls-model.rst
rename to docs/ir/modelgraph.rst
index bf0d8ee3ce..048e67e101 100644
--- a/docs/api/hls-model.rst
+++ b/docs/ir/modelgraph.rst
@@ -1,8 +1,8 @@
 ================
-HLS Model Class
+ModelGraph Class
 ================

-This page documents our hls_model class usage. You can generate generate an hls model object from a keras model through ``hls4ml``'s API:
+This page documents our ``ModelGraph`` class usage. You can generate an instance of this class through ``hls4ml``'s API, for example by converting a Keras model:

 .. code-block:: python

@@ -11,10 +11,10 @@ This page documents our hls_model class usage. You can generate generate an hls
    # Generate a simple configuration from keras model
    config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name')

-   # Convert to an hls model
+   # Convert to a ModelGraph instance (hls_model)
    hls_model = hls4ml.converters.convert_from_keras_model(keras_model, hls_config=config, output_dir='test_prj')

-After that, you can use several methods in that object. Here is a list of all the methods:
+This object can be used to perform common simulation and firmware-generation tasks. Here is a list of important user-facing methods:

 * :ref:`write `

@@ -23,8 +23,6 @@ After that, you can use several methods in that th
 * :ref:`build `
 * :ref:`trace `

-Similar functionalities are also supported through command line interface. If you prefer using them, please refer to Command Help section.
-
 ----

 .. _write-method:

 ``write`` method
 ====================

-Write your keras model as a hls project to ``hls_model``\ 's ``output_dir``\ :
+Write the ``ModelGraph`` to the output directory specified in the config:

 .. code-block:: python

@@ -45,7 +43,7 @@ Write your keras model as a hls project to ``hls_model``\ 's ``output_dir``\ :
 ``compile`` method
 ======================

-Compile your hls project.
+Compiles the written C++/HLS code and links it into the Python runtime. The compiled model can be used to evaluate performance (accuracy) through the ``predict()`` method.

 .. code-block:: python

@@ -58,7 +56,7 @@ Compile your hls project.
 ``predict`` method
 ======================

-Similar to ``keras``\ 's predict API, you can get the predictions of ``hls_model`` just by supplying an input ``numpy`` array:
+Similar to ``keras``\ 's predict API, you can get the predictions just by supplying an input ``numpy`` array:

 .. code-block:: python

@@ -67,7 +65,7 @@ Similar to ``keras``\ 's predict API, you can get the predictions of ``hls_model
    y = hls_model.predict(X)

-This is similar to doing ``csim`` simulation, but you can get your prediction results much faster. It's very helpful when you want to quickly prototype different configurations for your model.
+This is similar to doing a ``csim`` simulation, but without having to create the testbench and supply data. It's very helpful when you want to quickly prototype different configurations for your model.

 ----

@@ -76,13 +74,17 @@ This is similar to doing ``csim`` simulation, but you can get your prediction re
 ``build`` method
 ====================

+This method "builds" the generated HLS project. The parameters of ``build()`` are backend-specific and usually include simulation and synthesis; refer to each backend for a complete list of supported parameters.
+
 .. code-block:: python

-   hls_model.build()
+   report = hls_model.build()

    #You can also read the report of the build
    hls4ml.report.read_vivado_report('hls4ml_prj')

+The returned ``report`` object will contain the result of the build step, which may include C-simulation results, HLS synthesis estimates, co-simulation latency, etc., depending on the backend used.
+
 ----

 ..
_trace-method: diff --git a/docs/requirements.txt b/docs/requirements.txt index 66aa579ea6..fe3c4f2544 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,5 +4,4 @@ sphinx>=3.2.1 sphinx_contributors sphinx_github_changelog sphinx_rtd_theme -tensorflow<=2.15 toposort>=1.5.0 diff --git a/example-models b/example-models index d40894b03f..c6bb3c0686 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit d40894b03f840a32da43a5adea0531ffc1db216e +Subproject commit c6bb3c0686d52439d8c53d7407903bf78e852562 diff --git a/hls4ml/__init__.py b/hls4ml/__init__.py index 81b2859551..e3a7247b0d 100644 --- a/hls4ml/__init__.py +++ b/hls4ml/__init__.py @@ -1,4 +1,34 @@ -from hls4ml import converters, report, utils # noqa: F401 +# Temporary workaround for QKeras installation requirement, will be removed after 1.0.0 +def maybe_install_qkeras(): + import subprocess + import sys + + QKERAS_PKG_NAME = 'QKeras' + # QKERAS_PKG_SOURCE = QKERAS_PKG_NAME + QKERAS_PKG_SOURCE = 'qkeras@git+https://github.com/fastmachinelearning/qkeras.git' + + def pip_list(): + p = subprocess.run([sys.executable, '-m', 'pip', 'list'], check=True, capture_output=True) + return p.stdout.decode() + + def pip_install(package): + subprocess.check_call([sys.executable, '-m', 'pip', 'install', package]) + + all_pkgs = pip_list() + if QKERAS_PKG_NAME not in all_pkgs: + print('QKeras installation not found, installing one...') + pip_install(QKERAS_PKG_SOURCE) + print('QKeras installed.') + + +try: + maybe_install_qkeras() +except Exception: + print('Could not find QKeras installation, make sure you have QKeras installed.') + +# End of workaround + +from hls4ml import converters, report, utils # noqa: F401, E402 try: from ._version import version as __version__ diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py index 5c85bf9b7e..030016d6cd 100644 --- a/hls4ml/backends/catapult/catapult_backend.py +++ b/hls4ml/backends/catapult/catapult_backend.py @@ -32,6 +32,7 @@ from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType from hls4ml.report import parse_catapult_report +from hls4ml.utils import attribute_descriptions as descriptions from hls4ml.utils.fixed_point_utils import ceil_log2 @@ -51,10 +52,12 @@ def _register_layer_attributes(self): for layer in rnn_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) - attrs.append(ConfigurableAttribute('static', value_type=bool, default=True)) - attrs.append(ConfigurableAttribute('table_size', default=1024)) - attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor)) + attrs.append( + ConfigurableAttribute('static', value_type=bool, default=True, description=descriptions.recurrent_static) + ) + attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type)) self.attribute_map[layer] = attrs # Add ParallelizationFactor to Conv1D/2D @@ -65,7 +68,7 @@ def _register_layer_attributes(self): for layer in pf_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('parallelization_factor', default=1)) + 
attrs.append(ConfigurableAttribute('parallelization_factor', default=1, description=descriptions.conv_pf)) self.attribute_map[layer] = attrs # Add ConvImplementation to Convolution+Pooling layers @@ -73,8 +76,14 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) - # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) - attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) + attrs.append( + ChoiceAttribute( + 'conv_implementation', + choices=['LineBuffer', 'Encoded'], + default='LineBuffer', + description=descriptions.conv_implementation, + ) + ) self.attribute_map[layer] = attrs sep_conv_layers = [SeparableConv1D, SeparableConv2D] @@ -88,6 +97,7 @@ def _register_flows(self): init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) streaming_passes = [ + 'catapult:inplace_stream_flatten', # Inform downstream changed packsize in case of skipping flatten 'catapult:reshape_stream', 'catapult:clone_output', 'catapult:insert_zero_padding_before_conv1d', diff --git a/hls4ml/backends/catapult/passes/merge_templates.py b/hls4ml/backends/catapult/passes/merge_templates.py index ff6928679c..b6548c5112 100755 --- a/hls4ml/backends/catapult/passes/merge_templates.py +++ b/hls4ml/backends/catapult/passes/merge_templates.py @@ -6,6 +6,7 @@ merge_config_template = """struct config{index} : nnet::merge_config {{ static const unsigned n_elem = {n_elem}; + static const unsigned reuse_factor = {reuse}; }};\n""" merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index a9fc09b7aa..b20fdf1228 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -45,6 +45,7 @@ UnspecifiedPrecisionType, XnorPrecisionType, ) +from hls4ml.utils import attribute_descriptions as descriptions from hls4ml.writer import get_writer @@ -74,7 +75,7 @@ def __init__(self, name): for layer in accum_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(TypeAttribute('accum')) + attrs.append(TypeAttribute('accum', description=descriptions.accum_type)) self.attribute_map[layer] = attrs rf_layers = accum_layers + [ @@ -90,10 +91,10 @@ def __init__(self, name): for layer in rf_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('reuse_factor', default=1)) + attrs.append(ConfigurableAttribute('reuse_factor', default=1, description=descriptions.reuse_factor)) self.attribute_map[layer] = attrs - # seperable is kind of special because it is effectively two layers that will be split + # separable is kind of special because it is effectively two layers that will be split for layer in (SeparableConv1D, SeparableConv2D): attrs = self.attribute_map.get(layer, []) attrs.append(TypeAttribute('depthwise_accum')) @@ -104,23 +105,34 @@ def __init__(self, name): self.attribute_map[layer] = attrs act_attrs = self.attribute_map.get(Activation, []) - act_attrs.append(ConfigurableAttribute('table_size', default=1024)) - act_attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + act_attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size)) + act_attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type)) 
self.attribute_map[Activation] = act_attrs softmax_attrs = self.attribute_map.get(Softmax, []) - softmax_attrs.append(ChoiceAttribute('implementation', ['latency', 'stable', 'argmax', 'legacy'], default='stable')) - softmax_attrs.append(ConfigurableAttribute('skip', value_type=bool, default=False)) + softmax_attrs.append( + ChoiceAttribute( + 'implementation', + ['latency', 'stable', 'argmax', 'legacy'], + default='stable', + description=descriptions.softmax_implementation, + ) + ) + softmax_attrs.append( + ConfigurableAttribute('skip', value_type=bool, default=False, description=descriptions.softmax_skip) + ) softmax_attrs.append( TypeAttribute( 'exp_table', default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT), + description=descriptions.table_type, ) ) softmax_attrs.append( TypeAttribute( 'inv_table', default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT), + description=descriptions.table_type, ) ) self.attribute_map[Softmax] = softmax_attrs diff --git a/hls4ml/backends/fpga/fpga_layers.py b/hls4ml/backends/fpga/fpga_layers.py index 356973517c..0026ebe213 100644 --- a/hls4ml/backends/fpga/fpga_layers.py +++ b/hls4ml/backends/fpga/fpga_layers.py @@ -73,12 +73,14 @@ def set_thresholds(self, scale, bias, ternary_threshold=0.5): class PointwiseConv1D(Conv1D): '''Optimized Conv1D implementation for 1x1 kernels.''' - # Nothing to do, will pick up function and config from class name - pass + def initialize(self): + # Do nothing, values are copied + pass class PointwiseConv2D(Conv2D): '''Optimized Conv2D implementation for 1x1 kernels.''' - # Nothing to do, will pick up function and config from class name - pass + def initialize(self): + # Do nothing, values are copied + pass diff --git a/hls4ml/backends/fpga/passes/clone.py b/hls4ml/backends/fpga/passes/clone.py index 306e839900..a36d96dfa8 100644 --- a/hls4ml/backends/fpga/passes/clone.py +++ b/hls4ml/backends/fpga/passes/clone.py @@ -1,4 +1,4 @@ -import numpy as np +from math import prod from hls4ml.backends.template import FunctionCallTemplate from hls4ml.model.layers import Layer, register_layer @@ -54,41 +54,60 @@ def match(self, node): if isinstance(node, Clone): return False - return True + # Not needed for io_parallel + io_type = node.model.config.get_config_value('IOType') + if io_type != 'io_stream': + return False + + # Check if the output is used more than once + output_map = node.get_output_use_map() + in_output = node.name in node.model.outputs + for output in node.outputs: + if len(output_map[output]) + in_output > 1: + # a model output also needs a stream + return True + + return False def transform(self, model, node): - if model.config.get_config_value('IOType') != 'io_stream': - return False output_map = node.get_output_use_map() + in_output = node.name in node.model.outputs transformed = False for output in node.outputs: - if len(output_map[output]) > 1: - if len(output_map[output]) > 3: - print( - 'WARNING: Cloning output {} of {} ({}) more than 3 times not currently supported'.format( - output, node.__class__.__name__, node.name - ) - ) - return False - out_var = node.get_output_variable(output) - for i, layer in enumerate(output_map[output], 1): - attrs = {'size': np.prod(out_var.shape)} - idx = layer.inputs.index(output) - layer.inputs[idx] = output + '_cpy' + str(i) - - clone_layer: Clone = model.make_node( - Clone, - 'clone_' + node.name, - attrs, - [output], - [output + '_cpy' + str(i + 1) for i in range(len(output_map[output]))], - ) - for i in range(len(output_map[output])): - key = output + '_cpy' + str(i + 1) - clone_layer.attributes[key].type = node.attributes['result_t'] - model.insert_node(clone_layer) - transformed = True + n_outputs = len(output_map[output]) + in_output + if n_outputs == 1: + continue + if n_outputs > 3: + msg = f'ERROR: Cloning output {output} of {node.class_name}\ + ({node.name}) more than 3 times not currently supported' + raise ValueError(msg) + + out_var = node.get_output_variable(output) + attrs = {'size': prod(out_var.shape)} + + init_stream_idx = 1 + if in_output: + # If the value is used as output, add one extra stream + idx = node.model.outputs.index(node.name) + node.model.outputs[idx] = node.name + '_cpy1' + init_stream_idx = 2 + for i, layer in enumerate(output_map[output], init_stream_idx): + idx = layer.inputs.index(output) + layer.inputs[idx] = output + f'_cpy{i}' + + clone_layer: Clone = model.make_node( + Clone, + 'clone_' + node.name, + attrs, + [output], + [output + '_cpy' + str(i + 1) for i in range(n_outputs)], + ) + for i in range(n_outputs): + key = output + '_cpy' + str(i + 1) + clone_layer.attributes[key].type = node.attributes['result_t'] + model.insert_node(clone_layer) + transformed = True return transformed diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/im2col_codegen.py similarity index 100% rename from hls4ml/backends/fpga/passes/codegen.py rename to hls4ml/backends/fpga/passes/im2col_codegen.py diff --git a/hls4ml/backends/fpga/passes/inplace_parallel_reshape.py b/hls4ml/backends/fpga/passes/inplace_parallel_reshape.py index 532becc9db..82efe67100 100644 --- a/hls4ml/backends/fpga/passes/inplace_parallel_reshape.py +++ b/hls4ml/backends/fpga/passes/inplace_parallel_reshape.py @@ -11,14 +11,21 @@ class InplaceParallelReshape(OptimizerPass): """ def match(self, node): - return isinstance(node, Reshape) - - def transform(self, model, node): - if model.config.get_config_value('IOType') != 'io_parallel': + if not isinstance(node, Reshape): return False + return node.model.config.get_config_value('IOType') == 'io_parallel' + def transform(self, model, node): outvar = node.get_output_variable() invar = node.get_input_variable() newoutvar = InplaceTensorVariable(outvar, invar) node.set_attr(node.outputs[0], newoutvar) + if node.name in model.outputs: + prev_node = node.get_input_node() + assert ( + prev_node.name not in model.outputs + ), f"Cannot output node {prev_node.name}: reshape is a no-op in io_parallel.\ + As a result, the previous node {prev_node.name}'s output will be used as the\ + output. However, this node is already an output." + model.outputs = [name if name != node.name else prev_node.name for name in model.outputs] return False diff --git a/hls4ml/backends/fpga/passes/inplace_stream_flatten.py b/hls4ml/backends/fpga/passes/inplace_stream_flatten.py index a16ffefc4a..be4994e96e 100644 --- a/hls4ml/backends/fpga/passes/inplace_stream_flatten.py +++ b/hls4ml/backends/fpga/passes/inplace_stream_flatten.py @@ -11,13 +11,20 @@ class InplaceStreamFlatten(OptimizerPass): """ def match(self, node): - # Reshape acts as a Flatten layer when the result has 1 dimension - return isinstance(node, Reshape) and len(node.get_output_variable().shape) == 1 + # Layers that require flattened data can gather it from the stream, so no repacking is needed. + # Reshape acts as a Flatten layer when the result has 1 dimension. Make it an inplace tensor if that happens.
- def transform(self, model, node): - if model.config.get_config_value('IOType') != 'io_stream': + if node.model.config.get_config_value('IOType') != 'io_stream': + return False + if not (isinstance(node, Reshape) and len(node.get_output_variable().shape) == 1): + # If is not flatten return False + if node.name in node.model.outputs: + # If used as model output. Output shape shall be preserved in this case. + return False + return True + def transform(self, model, node): outvar = node.get_output_variable() invar = node.get_input_variable() newoutvar = InplaceTensorVariable(outvar, invar) diff --git a/hls4ml/backends/fpga/passes/repack_stream.py b/hls4ml/backends/fpga/passes/repack_stream.py index 2408ec5ebe..9a77dddb29 100644 --- a/hls4ml/backends/fpga/passes/repack_stream.py +++ b/hls4ml/backends/fpga/passes/repack_stream.py @@ -49,7 +49,9 @@ class ReshapeStream(OptimizerPass): def match(self, node): # do not run optimizer pass for a flatten layer (1 output dimension) - return isinstance(node, Reshape) and len(node.get_output_variable().shape) > 1 + if not isinstance(node, Reshape): + return False + return len(node.get_output_variable().shape) > 1 or node.name in node.model.outputs def transform(self, model, node): if model.config.get_config_value('IOType') != 'io_stream': diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py index c85a8c0e94..7d0f0d48e2 100644 --- a/hls4ml/backends/oneapi/oneapi_backend.py +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -10,6 +10,7 @@ from hls4ml.model.layers import GRU, LSTM, Activation, Conv1D, Conv2D, Dense, Embedding, Layer, SimpleRNN, Softmax from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType +from hls4ml.utils import attribute_descriptions as descriptions # from hls4ml.report import parse_oneapi_report @@ -30,9 +31,9 @@ def _register_layer_attributes(self): for layer in rnn_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) - attrs.append(ConfigurableAttribute('table_size', default=1024)) - attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor)) + attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type)) self.attribute_map[layer] = attrs # Add ParallelizationFactor to Conv1D/2D @@ -43,7 +44,7 @@ def _register_layer_attributes(self): for layer in pf_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('parallelization_factor', default=1)) + attrs.append(ConfigurableAttribute('parallelization_factor', default=1, description=descriptions.conv_pf)) self.attribute_map[layer] = attrs def _register_flows(self): diff --git a/hls4ml/backends/oneapi/passes/convolution_templates.py b/hls4ml/backends/oneapi/passes/convolution_templates.py index 17154559d8..64d9e42228 100644 --- a/hls4ml/backends/oneapi/passes/convolution_templates.py +++ b/hls4ml/backends/oneapi/passes/convolution_templates.py @@ -1,7 +1,7 @@ from hls4ml.backends.backend import get_backend from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate from hls4ml.backends.template import FunctionCallTemplate, 
LayerConfigTemplate -from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm +from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm, DepthwiseConv1D, DepthwiseConv2D # TODO - Dilation rate ? @@ -70,9 +70,20 @@ conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h'] +depthconv1d_function_template = ( + 'nnet::depthwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) +depthconv1d_include_list = [ + 'nnet_utils/nnet_conv1d.h', + 'nnet_utils/nnet_conv1d_resource.h', + 'nnet_utils/nnet_depthconv1d.h', + 'nnet_utils/nnet_depthconv1d_resource.h', +] + + class Conv1DConfigTemplate(LayerConfigTemplate): def __init__(self): - super().__init__(Conv1D) + super().__init__((Conv1D, DepthwiseConv1D)) self.template = conv1d_config_template self.mult_template = conv_mult_config_template @@ -137,6 +148,12 @@ def format(self, node): return self.template.format(**params) +class DepthwiseConv1DFunctionTemplate(Conv1DFunctionTemplate): + def __init__(self): + super(Conv1DFunctionTemplate, self).__init__(DepthwiseConv1D, include_header=depthconv1d_include_list) + self.template = depthconv1d_function_template + + ''' 2D Conv ''' conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ static const unsigned in_height = {in_height}; @@ -183,7 +200,7 @@ def format(self, node): class Conv2DConfigTemplate(LayerConfigTemplate): def __init__(self): - super().__init__((Conv2D, Conv2DBatchnorm)) + super().__init__((Conv2D, Conv2DBatchnorm, DepthwiseConv2D)) self.template = conv2d_config_template self.mult_template = conv_mult_config_template @@ -233,3 +250,20 @@ def format(self, node): raise RuntimeError('channels_first not supported on oneAPI') params['data_format'] = 'cl' return self.template.format(**params) + + +depthconv2d_function_template = ( + 'nnet::depthwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) +depthconv2d_include_list = [ + 'nnet_utils/nnet_conv2d.h', + 'nnet_utils/nnet_conv2d_resource.h', + 'nnet_utils/nnet_depthconv2d.h', + 'nnet_utils/nnet_depthconv2d_resource.h', +] + + +class DepthwiseConv2DFunctionTemplate(Conv2DFunctionTemplate): + def __init__(self): + super(Conv2DFunctionTemplate, self).__init__(DepthwiseConv2D, include_header=depthconv2d_include_list) + self.template = depthconv2d_function_template diff --git a/hls4ml/backends/oneapi/passes/merge_templates.py b/hls4ml/backends/oneapi/passes/merge_templates.py index c38e1e055f..9d261e1f74 100644 --- a/hls4ml/backends/oneapi/passes/merge_templates.py +++ b/hls4ml/backends/oneapi/passes/merge_templates.py @@ -10,6 +10,7 @@ # Merge templates merge_config_template = """struct config{index} : nnet::merge_config {{ static const unsigned n_elem = {n_elem}; + static const unsigned reuse_factor = {reuse}; }};\n""" merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' diff --git a/hls4ml/backends/quartus/passes/merge_templates.py b/hls4ml/backends/quartus/passes/merge_templates.py index 0cf6121666..f71489a5cf 100644 --- a/hls4ml/backends/quartus/passes/merge_templates.py +++ b/hls4ml/backends/quartus/passes/merge_templates.py @@ -9,6 +9,7 @@ # Merge templates merge_config_template = """struct config{index} : nnet::merge_config {{ static const unsigned n_elem = {n_elem}; + static const unsigned reuse_factor = {reuse}; }};\n""" merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' 
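The oneAPI additions above (the depthwise-convolution templates and the `reuse_factor` member now emitted into `nnet::merge_config`) are driven by the usual Python-side configuration. A minimal, untested sketch of how they could be exercised follows; the model, layer names and output directory are illustrative assumptions, not taken from this patch:

```Python
import hls4ml
from tensorflow.keras import layers, models

# Toy model with a DepthwiseConv1D layer and a Merge-type (Add) layer;
# shapes and names are purely illustrative
inp = layers.Input(shape=(32, 4), name='inp')
dw = layers.DepthwiseConv1D(kernel_size=3, padding='same', name='dw_conv')(inp)
out = layers.Add(name='add_1')([inp, dw])
model = models.Model(inp, out)

config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='oneAPI')
# With the template change above, the layer's reuse factor should now show up
# as the reuse_factor member of the generated merge_config struct
config['LayerName']['add_1']['ReuseFactor'] = 4

hls_model = hls4ml.converters.convert_from_keras_model(
    model, hls_config=config, backend='oneAPI', io_type='io_parallel', output_dir='oneapi-dwconv-merge-test'
)
```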
diff --git a/hls4ml/backends/quartus/quartus_backend.py b/hls4ml/backends/quartus/quartus_backend.py index aecad642c6..6e596fe2d1 100644 --- a/hls4ml/backends/quartus/quartus_backend.py +++ b/hls4ml/backends/quartus/quartus_backend.py @@ -11,6 +11,7 @@ from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType from hls4ml.report import parse_quartus_report +from hls4ml.utils import attribute_descriptions as descriptions @contextmanager @@ -39,16 +40,21 @@ def _register_layer_attributes(self): for layer in rnn_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) - attrs.append(ConfigurableAttribute('table_size', default=1024)) - attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor)) + attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type)) self.attribute_map[layer] = attrs def _register_flows(self): initializers = self._get_layer_initializers() init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) - streaming_passes = ['quartus:reshape_stream', 'quartus:clone_output'] + streaming_passes = [ + 'quartus:inplace_stream_flatten', # Inform downstream changed packsize in case of skipping flatten + 'quartus:reshape_stream', + 'quartus:clone_output', + ] + streaming_flow = register_flow('streaming', streaming_passes, requires=[init_flow], backend=self.name) quartus_types = [ diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index dd77bee85e..e098107eae 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -60,6 +60,8 @@ typedef {config_t} mult_config; template using scale_index = nnet::{scale_index_type}; + template + using conv_kernel = nnet::{conv_fn}; }}; const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" @@ -93,11 +95,30 @@ def format(self, node): else: params['fill_fn'] = 'FillConv1DBuffer' + is_pointwise_parallel_latency = ( + node.get_attr('filt_width') == 1 + and node.get_attr('strategy').lower() == 'latency' + and node.model.config.get_config_value('IOType') == 'io_parallel' + ) + if is_pointwise_parallel_latency: + params['conv_fn'] = f'pointwise_conv_{node.index}' + else: + if node.get_attr('strategy').lower() == 'latency': + params['conv_fn'] = 'Conv1DLatency' + else: + params['conv_fn'] = 'Conv1DResource' + conv_config = self.template.format(**params) mult_params = self._default_config_params(node) - mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') - mult_params['n_out'] = node.get_attr('n_filt') + if is_pointwise_parallel_latency: + mult_params['n_in'] = int( + node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse'] + ) + mult_params['n_out'] = int(node.get_attr('in_width') * node.get_attr('n_filt') / mult_params['reuse']) + else: + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_filt') mult_params['nzeros'] = node.get_weights('weight').nzeros mult_params['product_type'] = 
get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('weight').type.precision diff --git a/hls4ml/backends/vivado/passes/merge_templates.py b/hls4ml/backends/vivado/passes/merge_templates.py index 078e004d33..35aa5d3640 100644 --- a/hls4ml/backends/vivado/passes/merge_templates.py +++ b/hls4ml/backends/vivado/passes/merge_templates.py @@ -6,6 +6,7 @@ merge_config_template = """struct config{index} : nnet::merge_config {{ static const unsigned n_elem = {n_elem}; + static const unsigned reuse_factor = {reuse}; }};\n""" merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py new file mode 100644 index 0000000000..d41d51f82f --- /dev/null +++ b/hls4ml/backends/vivado/passes/pointwise_codegen.py @@ -0,0 +1,84 @@ +from hls4ml.model.layers import Conv1D +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import Source + + +def generate_pointwise_conv1d_fn(layer_idx, reuse_factor=1): + """Generate a C++ function for a pointwise convolution layer. + + Args: + layer_idx (int): Index of layer ('index' attribute). + reuse_factor (int): Number of partitions to divide the input into. + + Returns: + str: Generated C++ function + """ + + generated_code = ( + 'template\n' + 'class pointwise_conv_{index} : public Conv1DKernel {{\n' + ' public:\n' + ' static void conv(\n' + ' data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n' + ' res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n' + ' typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n' + ' typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n' + ' data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n' # noqa: E501 + ' #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n' + ' res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n' # noqa: E501 + ' #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n' + ' RFInputLoop:\n' + ' for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n' + ' #pragma HLS UNROLL\n' + ' InnerInputLoop:\n' + ' for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n' + ' #pragma HLS UNROLL\n' + ' data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n' # noqa: E501 + ' }}\n' + ' }}\n\n' + ).format(index=layer_idx) + indent = ' ' + for i in range(reuse_factor): + generated_code += indent + generated_code += ( + f'pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n' + ) + + generated_code += ( + '\n' + ' RFOutputLoop:\n' + ' for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n' + ' #pragma HLS UNROLL\n' + ' InnerOutputLoop:\n' + ' for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n' + ' #pragma HLS UNROLL\n' + ' res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n' # noqa: E501 + ' }\n' + ' }\n' + ' }\n' + '};\n' + ) + + return generated_code + + +class GeneratePointwiseConv1D(OptimizerPass): + '''Generates code for pointwise 1D convolution''' + + def match(self, node): + return ( + isinstance(node, Conv1D) + and node.model.config.get_config_value('IOType') == 'io_parallel' + and node.get_attr('filt_width') == 1 + ) + + def transform(self, 
model, node): + self._generate_pointwise_conv1d(node) + + def _generate_pointwise_conv1d(self, node): + code_str = generate_pointwise_conv1d_fn( + node.get_attr('index'), + node.get_attr('reuse_factor'), + ) + + node.set_attr('pointwise_conv1d_codegen', Source(code_str)) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 9f8a5171d3..117805dd86 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -31,6 +31,7 @@ from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType from hls4ml.report import parse_vivado_report +from hls4ml.utils import attribute_descriptions as descriptions class VivadoBackend(FPGABackend): @@ -49,10 +50,12 @@ def _register_layer_attributes(self): for layer in rnn_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) - attrs.append(ConfigurableAttribute('static', value_type=bool, default=True)) - attrs.append(ConfigurableAttribute('table_size', default=1024)) - attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor)) + attrs.append( + ConfigurableAttribute('static', value_type=bool, default=True, description=descriptions.recurrent_static) + ) + attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type)) self.attribute_map[layer] = attrs # Add ParallelizationFactor to Conv1D/2D @@ -63,15 +66,21 @@ def _register_layer_attributes(self): for layer in pf_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('parallelization_factor', default=1)) + attrs.append(ConfigurableAttribute('parallelization_factor', default=1, description=descriptions.conv_pf)) self.attribute_map[layer] = attrs # Add ConvImplementation to Convolution+Pooling layers cnn_layers = [Conv1D, Conv2D, SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Pooling1D, Pooling2D] for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) - # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) - attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) + attrs.append( + ChoiceAttribute( + 'conv_implementation', + choices=['LineBuffer', 'Encoded'], + default='LineBuffer', + description=descriptions.conv_implementation, + ) + ) self.attribute_map[layer] = attrs def _register_flows(self): @@ -79,6 +88,7 @@ def _register_flows(self): init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) streaming_passes = [ + 'vivado:inplace_stream_flatten', # Inform downstream changed packsize in case of skipping flatten 'vivado:reshape_stream', 'vivado:clone_output', 'vivado:insert_zero_padding_before_conv1d', @@ -113,6 +123,7 @@ def _register_flows(self): 'vivado:generate_conv_streaming_instructions', 'vivado:apply_resource_strategy', 'vivado:generate_conv_im2col', + 'vivado:generate_pointwise_conv1_d', 'vivado:generate_unrolled_dense_resource', 'vivado:set_pipeline_style', ] diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py index 13e90df687..3d7ce1fe56 100644 --- 
a/hls4ml/converters/__init__.py +++ b/hls4ml/converters/__init__.py @@ -278,9 +278,10 @@ def convert_from_pytorch_model( Notes: Pytorch uses the "channels_first" data format for its tensors, while hls4ml expects the "channels_last" format used by keras. By default, hls4ml will automatically add layers to the model which transpose the inputs to the - "channels_last"format. Not that this is not supported for the "io_stream" io_type, for which the user will have - to transpose the input by hand before passing it to hls4ml. In that case the "inputs_channel_last" argument of - the "config_from_pytorch_model" function needs to be set to True. By default, the output of the model remains + "channels_last" format. Note that this is not supported for the "io_stream" io_type, for which the user will have + to transpose the input by hand before passing it to hls4ml. In that case the "channels_last_conversion" argument of + the "config_from_pytorch_model" function needs to be set to "internal". This argument can be used to completely + disable this internal conversion. By default, the output of the model remains in the "channels_last" data format. The "transpose_outputs" argument of the "config_from_pytorch_model" can be used to add a layer to the model that transposes back to "channels_first". As before, this will not work for io_stream. diff --git a/hls4ml/converters/onnx/reshape.py b/hls4ml/converters/onnx/reshape.py index 9ef20f03d7..f11796b6db 100644 --- a/hls4ml/converters/onnx/reshape.py +++ b/hls4ml/converters/onnx/reshape.py @@ -1,4 +1,4 @@ -from hls4ml.converters.onnx_to_hls import onnx_handler +from hls4ml.converters.onnx_to_hls import get_onnx_attribute, onnx_handler @onnx_handler('Transpose') @@ -36,3 +36,25 @@ def parse_flatten_layer(node, input_names, input_shapes, graph): layer['target_shape'] = [-1] # does not contain batch dimension return layer + + +@onnx_handler('Resize') +def parse_resize_layer(node, input_names, input_shapes, graph): + layer = {} + layer['name'] = node.name + layer['class_name'] = 'Resize' + layer['inputs'] = input_names + layer['outputs'] = list(node.output) + layer['in_height'] = input_shapes[0][2] + layer['in_width'] = input_shapes[0][1] + layer['out_width'] = input_shapes[0][1] + layer['out_height'] = input_shapes[0][2] + layer['n_chan'] = input_shapes[0][3] + layer['algorithm'] = get_onnx_attribute(node, 'mode') + # The following is used in initialize() method.
+ # Probably a better solution would be to have a channels last parameter at QONNX level + layer['data_format'] = ( + 'channels_last' if any(node.domain == 'qonnx.custom_op.channels_last' for node in graph.node) else 'channels_first' + ) + + return layer diff --git a/hls4ml/converters/pytorch/pooling.py b/hls4ml/converters/pytorch/pooling.py index 8256a9ff87..3757b2c82e 100644 --- a/hls4ml/converters/pytorch/pooling.py +++ b/hls4ml/converters/pytorch/pooling.py @@ -90,15 +90,19 @@ def parse_pooling_layer(operation, layer_name, input_names, input_shapes, node, layer['stride_height'] = node.kwargs['stride'][0] layer['stride_width'] = node.kwargs['stride'][1] else: - layer['stride_height'] = node.kwargs['stride'] - layer['stride_width'] = node.kwargs['stride'] - if type(node.kwargs['kernel_size']) is tuple: - layer['pool_height'] = node.kwargs['kernel_size'][0] - layer['pool_width'] = node.kwargs['kernel_size'][1] + if node.kwargs['stride'] is None: + # if stride is not set it is supposed to default to the kernel size + layer['stride_height'] = node.args[1] + layer['stride_width'] = node.args[1] + else: + layer['stride_height'] = node.kwargs['stride'] + layer['stride_width'] = node.kwargs['stride'] + if type(node.args[1]) is tuple: + layer['pool_height'] = node.args[1][0] + layer['pool_width'] = node.args[1][1] else: - layer['pool_height'] = node.kwargs['kernel_size'] - layer['pool_width'] = node.kwargs['kernel_size'] - + layer['pool_height'] = node.args[1] + layer['pool_width'] = node.args[1] if type(node.kwargs['padding']) is tuple: padding = node.kwargs['padding'] else: diff --git a/hls4ml/converters/pytorch/reshape.py b/hls4ml/converters/pytorch/reshape.py index 37191135a1..3d415e7832 100644 --- a/hls4ml/converters/pytorch/reshape.py +++ b/hls4ml/converters/pytorch/reshape.py @@ -93,13 +93,23 @@ def parse_flatten_layer(operation, layer_name, input_names, input_shapes, node, layer['class_name'] = 'Reshape' layer['name'] = layer_name layer['inputs'] = input_names - - start_dim = class_object.start_dim - end_dim = class_object.end_dim - if end_dim + 1 == 0 or end_dim + 1 > len(input_shapes[0]): - end_dim = len(input_shapes[0]) + if node.op == 'call_module': + start_dim = class_object.start_dim + end_dim = class_object.end_dim + if end_dim + 1 == 0 or end_dim + 1 > len(input_shapes[0]): + end_dim = len(input_shapes[0]) + else: + end_dim = end_dim + 1 else: - end_dim = end_dim + 1 + start_dim = node.args[1] + if len(node.args) == 3: + end_dim = node.args[2] + else: + end_dim = -1 + if end_dim + 1 == 0 or end_dim + 1 > len(input_shapes[0]): + end_dim = len(input_shapes[0]) + else: + end_dim = end_dim + 1 layer['target_shape'] = ( input_shapes[0][0:start_dim] + [np.prod(input_shapes[0][start_dim:end_dim])] + input_shapes[0][end_dim:] diff --git a/hls4ml/converters/utils.py b/hls4ml/converters/utils.py index d1c9e050d5..f365916b55 100644 --- a/hls4ml/converters/utils.py +++ b/hls4ml/converters/utils.py @@ -45,7 +45,7 @@ def compute_padding_1d(pad_type, in_size, stride, filt_size): is odd, it will add the extra column to the right. Args: - pad_type (str): Padding type, one of ``same``, `valid`` or ``causal`` (case insensitive). + pad_type (str): Padding type, one of ``same``, ``valid`` or ``causal`` (case insensitive). in_size (int): Input size. stride (int): Stride length. filt_size (int): Length of the kernel window. 
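As a quick illustration of the `same` convention documented in these helpers, here is a hand-computed sketch mirroring the description above (not code from this patch); the odd column of padding ends up on the right:

```Python
import math

# 'same' padding for a 1D input, per the convention described above:
# pad evenly on both sides, and if the total padding is odd, put the extra column on the right
in_size, stride, filt_size = 8, 2, 3
n_out = math.ceil(in_size / stride)                             # 4 output positions
pad_total = max((n_out - 1) * stride + filt_size - in_size, 0)  # 1 column of padding in total
pad_left = pad_total // 2                                       # 0
pad_right = pad_total - pad_left                                # 1, the extra column goes right
```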
@@ -135,6 +135,23 @@ def compute_padding_2d(pad_type, in_height, in_width, stride_height, stride_widt def compute_padding_1d_pytorch(pad_type, in_size, stride, filt_size, dilation): + """Computes the amount of padding required on each side of the 1D input tensor following pytorch conventions. + + In case of ``same`` padding, this routine tries to pad evenly left and right, but if the amount of columns to be added + is odd, it will add the extra column to the right. + + Args: + pad_type (str or int): Padding type. If string, one of ``same``, ``valid`` or ``causal`` (case insensitive). + in_size (int): Input size. + stride (int): Stride length. + filt_size (int): Length of the kernel window. + + Raises: + Exception: Raised if the padding type is unknown. + + Returns: + tuple: Tuple containing the padded input size, left and right padding values. + """ if isinstance(pad_type, str): if pad_type.lower() == 'same': n_out = int( @@ -176,6 +193,26 @@ def compute_padding_1d_pytorch(pad_type, in_size, stride, filt_size, dilation): def compute_padding_2d_pytorch( pad_type, in_height, in_width, stride_height, stride_width, filt_height, filt_width, dilation_height, dilation_width ): + """Computes the amount of padding required on each side of the 2D input tensor following pytorch conventions. + + In case of ``same`` padding, this routine tries to pad evenly left and right (top and bottom), but if the amount of + columns to be added is odd, it will add the extra column to the right/bottom. + + Args: + pad_type (str or int): Padding type. If string, one of ``same`` or ``valid`` (case insensitive). + in_height (int): The height of the input tensor. + in_width (int): The width of the input tensor. + stride_height (int): Stride height. + stride_width (int): Stride width. + filt_height (int): Height of the kernel window. + filt_width (int): Width of the kernel window. + + Raises: + Exception: Raised if the padding type is unknown. + + Returns: + tuple: Tuple containing the padded input height, width, and top, bottom, left and right padding values. + """ if isinstance(pad_type, str): if pad_type.lower() == 'same': # Height diff --git a/hls4ml/model/attributes.py b/hls4ml/model/attributes.py index 0e8df6e10a..d03d2bd108 100644 --- a/hls4ml/model/attributes.py +++ b/hls4ml/model/attributes.py @@ -36,11 +36,12 @@ class Attribute: """ - def __init__(self, name, value_type=Integral, default=None, configurable=False): + def __init__(self, name, value_type=Integral, default=None, configurable=False, description=None): self.name = name self.value_type = value_type self.default = default self.configurable = configurable + self.description = description def validate_value(self, value): if self.value_type is not None: @@ -59,6 +60,20 @@ def config_name(self): """ return convert_to_pascal_case(self.name) + def __eq__(self, other: object) -> bool: + if not isinstance(other, Attribute): + return NotImplemented + return ( + self.name == other.name + and self.value_type == other.value_type + and self.default == other.default + and self.configurable == other.configurable + and self.description == other.description + ) + + def __hash__(self) -> int: + return hash((self.name, self.value_type, self.default, self.configurable, self.description)) + class ConfigurableAttribute(Attribute): """ @@ -68,8 +83,8 @@ class ConfigurableAttribute(Attribute): when defining the expected attributes of layer classes. 
""" - def __init__(self, name, value_type=int, default=None): - super().__init__(name, value_type, default, configurable=True) + def __init__(self, name, value_type=Integral, default=None, description=None): + super().__init__(name, value_type, default, configurable=True, description=description) class TypeAttribute(Attribute): @@ -79,10 +94,10 @@ class TypeAttribute(Attribute): As a convention, the name of the attribute storing a type will end in ``_t``. """ - def __init__(self, name, default=None, configurable=True): + def __init__(self, name, default=None, configurable=True, description=None): if not name.endswith('_t'): name += '_t' - super().__init__(name, value_type=NamedType, default=default, configurable=configurable) + super().__init__(name, value_type=NamedType, default=default, configurable=configurable, description=description) class ChoiceAttribute(Attribute): @@ -90,25 +105,31 @@ class ChoiceAttribute(Attribute): Represents an attribute whose value can be one of several predefined values. """ - def __init__(self, name, choices, default=None, configurable=True): - super().__init__(name, value_type=list, default=default, configurable=configurable) + def __init__(self, name, choices, default=None, configurable=True, description=None): + super().__init__(name, value_type=list, default=default, configurable=configurable, description=description) assert len(choices) > 0 if default is not None: assert default in choices self.choices = choices - self.value_type = str(self.choices) def validate_value(self, value): return value in self.choices + def __eq__(self, other: object) -> bool: + base_eq = super().__eq__(other) + return base_eq and hasattr(other, 'choices') and set(self.choices) == set(other.choices) + + def __hash__(self) -> int: + return super().__hash__() ^ hash(tuple(sorted(self.choices))) + class WeightAttribute(Attribute): """ Represents an attribute that will store a weight variable. """ - def __init__(self, name): - super().__init__(name, value_type=WeightVariable, default=None, configurable=False) + def __init__(self, name, description=None): + super().__init__(name, value_type=WeightVariable, default=None, configurable=False, description=description) class CodeAttrubute(Attribute): @@ -116,8 +137,8 @@ class CodeAttrubute(Attribute): Represents an attribute that will store generated source code block. """ - def __init__(self, name): - super(WeightAttribute, self).__init__(name, value_type=Source, default=None, configurable=False) + def __init__(self, name, description=None): + super().__init__(name, value_type=Source, default=None, configurable=False, description=description) # endregion diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index cf715fd767..520f96ba5f 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -506,6 +506,8 @@ def insert_node(self, node, before=None, input_idx=0): if next_node is not None: next_node.inputs[input_idx] = node.outputs[0] + else: + self.outputs = [node.outputs[0] if name == prev_node.outputs[0] else name for name in self.outputs] new_graph = OrderedDict() for k, v in self.graph.items(): @@ -514,47 +516,57 @@ def insert_node(self, node, before=None, input_idx=0): new_graph[node.name] = node self.graph = new_graph - self._update_model_outputs() def remove_node(self, node, rewire=True): - """Remove a node from a graph. + """Removes a node from the graph. - By default, this function can connect the outputs of previous node to the input of next one. 
- Note that when removing a leaf node `rewire` should be set to `False`. + By default, this function connects the outputs of the previous + node to the inputs of the next node. If the removed node has multiple + input/output tensors, an exception is raised. Args: - node (Layer): The node to remove - rewire (bool, optional): If `True`, connects the outputs of the previous node - to the inputs of the next node + node (Layer): The node to remove. + rewire (bool, optional): Deprecated, has no effect. Raises: - Exception: If an attempt is made to rewire a leaf node or a node with multiple - inputs/outputs. + Exception: If an attempt is made to rewire a node with + multiple inputs/outputs. + Note: + The `rewire` parameter is deprecated and has no effect. """ - if rewire: - inputs = [inp for inp in node.inputs if inp] - outputs = [outp for outp in node.outputs if outp] - if len(inputs) > 1 or len(outputs) > 1: - raise Exception('Cannot rewire a node with multiple inputs/outputs') - prev_node = node.get_input_node(node.inputs[0]) + + inputs = [inp for inp in node.inputs if inp] + outputs = [outp for outp in node.outputs if outp] + + if len(inputs) > 1 or len(outputs) > 1: + raise Exception('Cannot delete a node with multiple inputs/outputs') + + if len(inputs) == 1: + # Connect inputs -> $outputs + if node.name in self.outputs: + msg = f'Remove leaf node {node.name} will connect its input node {inputs[0]} to output, but it already is.' + assert inputs[0] not in self.outputs, msg + self.outputs = [inputs[0] if name == node.name else name for name in self.outputs] + + if len(outputs) == 1 and len(inputs) == 1: + inp_var = node.get_input_variable() + out_var = node.get_output_variable() + + # fmt: off + assert (np.prod(inp_var.shape) == np.prod(out_var.shape)), \ + f'Input and output shapes do not match for {node.name}: {inp_var.shape} -> {out_var.shape}' + # fmt: on + next_nodes = [x for x in self.graph.values() if node.outputs[0] in x.inputs] - if prev_node is not None: - if len(next_nodes) > 0: - for next_node in next_nodes: - for i, _ in enumerate(next_node.inputs): - if node.outputs[0] == next_node.inputs[i]: - next_node.inputs[i] = prev_node.outputs[0] - break - else: - if not node.outputs[0] in self.outputs: - raise Exception('Cannot rewire a node without child') - else: - raise Exception('Cannot rewire a node without a parent') + for next_node in next_nodes: + # Connect inputs -> next + for i, nxt_inp in enumerate(next_node.inputs): + if outputs[0] == nxt_inp: + next_node.inputs[i] = inputs[0] del self.output_vars[node.outputs[0]] del self.graph[node.name] - self._update_model_outputs() def replace_node(self, old_node, new_node): """Replace an existing node in the graph with a new one. @@ -584,7 +596,11 @@ def replace_node(self, old_node, new_node): node.outputs[i] = repl[n] self.graph = OrderedDict((new_node.name, new_node) if k == old_node.name else (k, v) for k, v in self.graph.items()) - self._update_model_outputs() + + old_name = old_node.name + if old_name in self.outputs: + new_name = new_node.name + self.outputs = [new_name if name == old_name else name for name in self.outputs] def split_node(self, old_node, new_node1, new_node2): """Replace an existing node in the graph with two nodes in sequence. @@ -622,17 +638,9 @@ def split_node(self, old_node, new_node1, new_node2): else: new_graph[key] = value self.graph = new_graph - self._update_model_outputs() - - def _update_model_outputs(self): - '''Update the model outputs - All node outputs and inputs are found. 
The model outputs are set to all node outputs - that are not also node inputs. - ''' - node_outputs = [out for node in self.graph.values() for out in node.outputs] - node_inputs = [inp for node in self.graph.values() for inp in node.inputs] - self.outputs = [out for out in node_outputs if out not in node_inputs] + if old_node.name in self.outputs: + self.outputs = [new_node2.name if name == old_node.name else name for name in self.outputs] def next_layer(self): self.index += 1 diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index fb548aa164..3847cda9cf 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -26,6 +26,7 @@ WeightVariable, find_minimum_width, ) +from hls4ml.utils import attribute_descriptions as descriptions from hls4ml.utils.string_utils import convert_to_snake_case @@ -53,9 +54,9 @@ class Layer: """ _expected_attributes = [ - Attribute('index'), - ConfigurableAttribute('trace', default=False), - TypeAttribute('result'), + Attribute('index', description=descriptions.index), + ConfigurableAttribute('trace', default=False, description=descriptions.trace), + TypeAttribute('result', description=descriptions.result_type), ] @classproperty @@ -175,10 +176,12 @@ def _wrap_precision_to_type(self, name, precision): return NamedType(name=name, precision=precision) def _set_accum_t(self): - has_accum_t = any(a for a in self.expected_attributes if a.name == 'accum_t' and isinstance(a, TypeAttribute)) - if has_accum_t: - accum_t = NamedType(*reversed(self.model.config.get_precision(self, 'accum'))) - self.set_attr('accum_t', accum_t) + """Set the accumulator, but don't overwrite an existing one""" + if self.get_attr('accum_t') is None: + has_accum_t = any(a for a in self.expected_attributes if a.name == 'accum_t' and isinstance(a, TypeAttribute)) + if has_accum_t: + accum_t = NamedType(*reversed(self.model.config.get_precision(self, 'accum'))) + self.set_attr('accum_t', accum_t) def _set_type_t(self, name): has_type_t = any(a for a in self.expected_attributes if a.name == name + '_t' and isinstance(a, TypeAttribute)) @@ -357,7 +360,7 @@ def initialize(self): class Constant(Layer): - # one could consider making this a weight attribute, but given it's transient nature, I am not sure it helps + # one could consider making this a weight attribute, but given its transient nature, I am not sure it helps _expected_attributes = [ Attribute('value', value_type=np.ndarray), ] @@ -371,6 +374,10 @@ def initialize(self): dims = [f'{self.name}_{i}' for i in range(len(shape))] quantizer = self.get_attr('quantizer') + # the graph._make_graph function sets the input node to the previous node + # if it is not set. 
That is incorrect for Constant nodes, so remove the input node + self.inputs = [] + # Should the else clause below be None or UnspecifiedPrecisionType precision = quantizer.hls_type if quantizer is not None else UnspecifiedPrecisionType() @@ -930,7 +937,7 @@ def _get_act_function_name(self): class HardActivation(Activation): ''' - Implements the hard sigmoid and tan function in keras and qkeras + Implements the hard sigmoid and tanh function in keras and qkeras (Default parameters in qkeras are different, so should be configured) The hard sigmoid unction is clip(slope * x + shift, 0, 1), and the hard tanh function is 2 * hard_sigmoid - 1 @@ -1143,20 +1150,67 @@ class Resize(Layer): def initialize(self): inp = self.get_input_variable() - if self.get_attr('data_format') == 'channels_last': - if len(inp.shape) == 2: # 1D -> width + chan - shape = [self.get_attr('out_width'), self.get_attr('n_chan')] - dims = [f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] - elif len(inp.shape) == 3: # 2D -> height + width + chan - shape = [self.get_attr('out_height'), self.get_attr('out_width'), self.get_attr('n_chan')] - dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + if len(self.inputs) > 1: + # In order to be correctly ingested by hls4ml the QONNX resize node should have 3 inputs set with RoI left empty + if len(self.inputs) == 2: + raise Exception( + 'The number of inputs to Resize node is equal to 2. ' + 'In this case, either one is trying to use a version 10 node ' + 'or one is using the RoI parameter only to perform the resize operation, ' + 'both not supported in hls4ml' + ) + if len(self.inputs) == 4: + raise Exception('Sizes parameter is not supported by hls4ml. Use scales instead') + # get the scales of Resize node from QONNX frontend + # see doc here: https://onnx.ai/onnx/operators/onnx__Resize.html + scales_idx = 2 if len(self.inputs) == 3 or len(self.inputs) == 4 else 1 + scales = self.get_input_node(self.inputs[scales_idx]).get_attr('value') + if len(scales) == 4: # Resize 2D + self.set_attr('out_width', int(self.get_attr('in_width') * scales[1])) + self.set_attr('out_height', int(self.get_attr('in_height') * scales[2])) + self.set_attr('n_chan', int(self.get_attr('n_chan') * scales[3])) + elif len(scales) == 3: # Resize 1D + self.set_attr('out_width', int(self.get_attr('in_width') * scales[1])) + self.set_attr('n_chan', int(self.get_attr('n_chan') * scales[2])) + else: + raise Exception('Resize 1D and Resize 2D are the ones supported in hls4ml') + if self.get_attr('data_format') == 'channels_last': + if len(inp.shape) == 2: # 1D -> width + chan + shape = [int(self.get_attr('out_width')), int(self.get_attr('n_chan'))] + dims = [f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + elif len(inp.shape) == 3: # 2D -> height + width + chan + shape = [ + int(self.get_attr('out_height')), + int(self.get_attr('out_width')), + int(self.get_attr('n_chan')), + ] + dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + else: + if len(inp.shape) == 2: # 1D -> width + chan + shape = [int(self.get_attr('n_chan')), int(self.get_attr('out_width'))] + dims = [f'N_CHAN_{self.index}', f'OUT_WIDTH_{self.index}'] + elif len(inp.shape) == 3: # 2D -> height + width + chan + shape = [ + int(self.get_attr('n_chan')), + int(self.get_attr('out_height')), + int(self.get_attr('out_width')), + ] + dims = [f'N_CHAN_{self.index}', f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}'] else: - if len(inp.shape) == 2: # 1D -> width + chan - shape = 
[self.get_attr('n_chan'), self.get_attr('out_width')] - dims = [f'N_CHAN_{self.index}', f'OUT_WIDTH_{self.index}'] - elif len(inp.shape) == 3: # 2D -> height + width + chan - shape = [self.get_attr('n_chan'), self.get_attr('out_height'), self.get_attr('out_width')] - dims = [f'N_CHAN_{self.index}', f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}'] + if self.get_attr('data_format') == 'channels_last': + if len(inp.shape) == 2: # 1D -> width + chan + shape = [self.get_attr('out_width'), self.get_attr('n_chan')] + dims = [f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + elif len(inp.shape) == 3: # 2D -> height + width + chan + shape = [self.get_attr('out_height'), self.get_attr('out_width'), self.get_attr('n_chan')] + dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + else: + if len(inp.shape) == 2: # 1D -> width + chan + shape = [self.get_attr('n_chan'), self.get_attr('out_width')] + dims = [f'N_CHAN_{self.index}', f'OUT_WIDTH_{self.index}'] + elif len(inp.shape) == 3: # 2D -> height + width + chan + shape = [self.get_attr('n_chan'), self.get_attr('out_height'), self.get_attr('out_width')] + dims = [f'N_CHAN_{self.index}', f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}'] self.add_output_variable(shape, dims, precision=inp.type.precision) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 0edd549b29..7e9325ccd0 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -34,6 +34,7 @@ 'parse_qonnx', [ 'reshape_constant', + 'resize_remove_constants', 'quant_constant_parameters', 'quant_to_activation', 'fuse_quant_with_constant', @@ -58,8 +59,7 @@ 'convert', [ 'channels_last_converter', - 'merge_linear_activation', - 'seperable_to_depthwise_and_conv', + 'separable_to_depthwise_and_conv', 'remove_transpose_before_flatten', 'remove_nop_transpose', 'remove_single_channel_transpose', @@ -73,6 +73,7 @@ 'replace_multidimensional_dense_with_conv', 'enforce_proxy_model_embedded_config', 'eliminate_linear_activation', + 'merge_linear_activation', # many of the above optimzers need to be done before this 'infer_precision_types', ], diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py index e18d79ff4a..26b7b18e38 100644 --- a/hls4ml/model/optimizer/passes/batchnorm_opt.py +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -166,7 +166,7 @@ class FuseConsecutiveBatchNormalization(OptimizerPass): """ def match(self, node): - prev_node = node.get_input_node(node.inputs[0]) + prev_node = node.get_input_node() basic_match = ( isinstance(node, BatchNormalization) and isinstance(prev_node, BatchNormalization) @@ -194,7 +194,7 @@ def match(self, node): return False def transform(self, model, node): - prev_node = node.get_input_node(node.inputs[0]) + prev_node = node.get_input_node() prev_map = prev_node.get_output_use_map() if len(prev_map[prev_node.outputs[0]]) > 1: diff --git a/hls4ml/model/optimizer/passes/bn_fuse.py b/hls4ml/model/optimizer/passes/bn_fuse.py index 000d8380ce..be81d5fb3d 100644 --- a/hls4ml/model/optimizer/passes/bn_fuse.py +++ b/hls4ml/model/optimizer/passes/bn_fuse.py @@ -18,7 +18,7 @@ class FuseBatchNormalization(OptimizerPass): """ def match(self, node): - prev_node = node.get_input_node(node.inputs[0]) + prev_node = node.get_input_node() basic_match = ( isinstance(node, BatchNormalization) and isinstance(prev_node, (Dense, Conv1D, Conv2D)) diff --git 
a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index 0b5f12c008..606f42e54b 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -97,12 +97,17 @@ def transform(self, model, node): if ( isinstance(node, Reshape) and len(node.attributes['target_shape']) == 1 - and not model.config.config['HLSConfig']['Model']['ChannelsLastConversion'] == "internal" + and not model.config.config['HLSConfig']['Model']['ChannelsLastConversion'] == "off" ): previous_node = node.get_input_node(node.inputs[0]) input = previous_node.name outshape = previous_node.get_output_variable().shape + if (model.config.config['IOType'] == 'io_stream') and len(outshape) == 3: + raise Exception( + 'No 3D transpose available in io_stream, this model cannot be converted to channels-last' + ) + if len(outshape) == 2: attributes = {'perm': [1, 0]} else: diff --git a/hls4ml/model/optimizer/passes/linear.py b/hls4ml/model/optimizer/passes/linear.py index b1aee7adc7..ce0308eb66 100644 --- a/hls4ml/model/optimizer/passes/linear.py +++ b/hls4ml/model/optimizer/passes/linear.py @@ -40,7 +40,6 @@ def transform(self, model, node): # if the activation has a quantizer (usually from a QONNX Quant node), set the previous node's output precision if quantizer is not None: prev_node.set_attr("quantizer", quantizer) - prev_node.types['result_t'] = quantizer.hls_type prev_node.get_output_variable().type.precision = quantizer.hls_type model.remove_node(node) return True diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py index a75ed27aca..bdf7447838 100644 --- a/hls4ml/model/optimizer/passes/merge_const.py +++ b/hls4ml/model/optimizer/passes/merge_const.py @@ -54,7 +54,6 @@ def transform(self, model, node): const_node0.set_attr('quantizer', quantizer) # overwrite the quantizer if quantizer: const_node0.set_attr('quantizer', quantizer) - const_node0.types['result_t'] = quantizer.hls_type const_node0.get_output_variable().type.precision = quantizer.hls_type const_node0.set_attr('value', new_val) diff --git a/hls4ml/model/optimizer/passes/move_scales.py b/hls4ml/model/optimizer/passes/move_scales.py index 43fcaa0da7..8fba1ec405 100644 --- a/hls4ml/model/optimizer/passes/move_scales.py +++ b/hls4ml/model/optimizer/passes/move_scales.py @@ -5,6 +5,8 @@ ''' +import warnings + import numpy as np from hls4ml.model.layers import ApplyAlpha, Constant, Conv, MatMul, Merge @@ -85,6 +87,9 @@ def transform(self, model, node): can_propagate = False if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down MatMul node; model probably not supported.', stacklevel=1 + ) return False model.remove_node(apply_alpha) @@ -124,6 +129,9 @@ def transform(self, model, node): try: bias = bias0 + bias1 except ValueError: + warnings.warn( + 'Failed to propagate quantization scales down Add node; model probably not supported.', stacklevel=1 + ) return False model.remove_node(in0) @@ -169,6 +177,7 @@ def transform(self, model, node): model.insert_node(new_node) return True else: + warnings.warn('Failed to propagate quantization bias down Add node; model probably not supported.', stacklevel=1) return False @@ -243,6 +252,9 @@ def transform(self, model, node): except ValueError: can_propagate = False if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1 + ) return False # to remove warning, since these get set again @@ -287,6 +299,9 @@ def transform(self, model, node): except ValueError: can_propagate = False if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1 + ) return False # to remove warning, since these get set again @@ -308,6 +323,9 @@ def transform(self, model, node): can_propagate = False if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1 + ) return False # to remove warning, since these get set again @@ -367,6 +385,9 @@ def transform(self, model, node): except ValueError: can_propagate = False if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1 + ) return False # to remove warning, since these get set again @@ -388,6 +409,9 @@ def transform(self, model, node): except ValueError: can_propagate = False if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1 + ) return False # to remove warning, since these get set again @@ -412,6 +436,9 @@ def transform(self, model, node): except ValueError: can_propagate = False if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1 + ) return False # to remove warning, since these get set again @@ -445,6 +472,9 @@ def transform(self, model, node): except ValueError: can_propagate = False if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not supported.', stacklevel=1 + ) return False # to remove warning, since these get set again diff --git a/hls4ml/model/optimizer/passes/quant_opt.py b/hls4ml/model/optimizer/passes/quant_opt.py index cac29b5040..04d5393748 100644 --- a/hls4ml/model/optimizer/passes/quant_opt.py +++ b/hls4ml/model/optimizer/passes/quant_opt.py @@ -167,8 +167,8 @@ def match(self, node): scale_unit_or_po2 = (scale == np.ones_like(scale)).all() if not scale_unit_or_po2 and _ALSO_MATCH_PO2: # This optimization only works if all scales are the same - if np.all(scale[0] == scale): - mantissa, _ = np.frexp(scale[0]) + if np.all(scale.item(0) == scale): + mantissa, _ = np.frexp(scale.item(0)) scale_unit_or_po2 = mantissa == 0.5 is_match = scale_unit_or_po2 @@ -187,14 +187,13 @@ def transform(self, model, node): integer = bitwidth scale = node.get_attr('scale') if _ALSO_MATCH_PO2 and not (scale == np.ones_like(scale)).all(): - _, exp = np.frexp(scale[0]) # know that np.all(scale[0] == scale) must be true + _, exp = np.frexp(scale.item(0)) # know that np.all(scale.item(0) == scale) must be true integer = bitwidth + exp - 1 precision, quantizer = _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode) const_node = node.get_input_node(node.inputs[0]) const_node.set_attr('quantizer', quantizer) - const_node.set_attr('result_t', precision) const_node.get_output_variable().type.precision = precision # Should we update the configuration to reflect the new precision?
I don't think it's necessary @@ -331,7 +330,6 @@ def transform(self, model, node): const_node.set_attr('value', new_val) const_node.set_attr('quantizer', quantizer) - const_node.types['result_t'].precision = precision const_node.get_output_variable().type.precision = precision inshape = node.get_input_variable().shape diff --git a/hls4ml/model/optimizer/passes/resize_remove_constants.py b/hls4ml/model/optimizer/passes/resize_remove_constants.py new file mode 100644 index 0000000000..69039c60a2 --- /dev/null +++ b/hls4ml/model/optimizer/passes/resize_remove_constants.py @@ -0,0 +1,38 @@ +from warnings import warn + +from hls4ml.model.layers import Constant, Resize +from hls4ml.model.optimizer import OptimizerPass + + +class ResizeRemoveConstants(OptimizerPass): + """ + This optimizer is intended to clean the Resize node from RoI and Scales parameters that if left cause issues in hls4ml. + """ + + def match(self, node): + is_match = isinstance(node, Resize) and len(node.inputs) > 1 + return is_match + + def transform(self, model, node): + """ + Remove RoI and Scale Constant from new shape input. + """ + # see doc here: https://onnx.ai/onnx/operators/onnx__Resize.html + roi_index = 1 + scales_idx = 2 + scales_node = node.get_input_node(node.inputs[scales_idx]) + node.inputs[scales_idx] = '' + if not isinstance(scales_node, Constant): + raise RuntimeError("Non-constant shape inputs are not supported") + model.remove_node(scales_node, rewire=False) + # RoI position is always 1 when present + roi_node = node.get_input_node(node.inputs[roi_index]) + if roi_node.get_attr('value'): + warn('RoI value vector is not empty. Consider that RoI is not supported in hls4ml', stacklevel=2) + node.inputs[roi_index] = '' + if not isinstance(roi_node, Constant): + raise RuntimeError("Non-constant RoI inputs are not supported") + model.remove_node(roi_node, rewire=False) + # Clean all the '' inputs + node.inputs = list(filter(None, node.inputs)) + return True diff --git a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py index 38eef1e7d0..10840ec410 100644 --- a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py +++ b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py @@ -1,5 +1,5 @@ """ -This optimizer converts a seperable convolution to a depthwise followed by a regular convolution. +This optimizer converts a separable convolution to a depthwise followed by a regular convolution. For backends with a custom pointwise implementations the regular convolution will subsequently be converted to a pointwise convolution by a different optimizer. 
""" @@ -10,8 +10,8 @@ from hls4ml.model.optimizer import OptimizerPass -class SeperableToDepthwiseAndConv(OptimizerPass): - """Convert Seperable to DepthwiseConv + Conv (potentially later Pointwise)""" +class SeparableToDepthwiseAndConv(OptimizerPass): + """Convert Separable to DepthwiseConv + Conv (potentially later Pointwise)""" _dw_attributes = ( 'in_width', @@ -70,7 +70,7 @@ def transform(self, model, node): model.config.parse_name_config(dw_name, dw_layer_config) # creating the attributes - dw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._dw_attributes if k in node.attributes} + dw_attributes = {k: node.attributes[k] for k in SeparableToDepthwiseAndConv._dw_attributes if k in node.attributes} dw_attributes['n_filt'] = dw_attributes['n_chan'] * dw_attributes['depth_multiplier'] dw_attributes['use_bias'] = False @@ -100,7 +100,7 @@ def transform(self, model, node): model.config.parse_name_config(pw_name, pw_layer_config) # creating the attributes - pw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._pw_attributes if k in node.attributes} + pw_attributes = {k: node.attributes[k] for k in SeparableToDepthwiseAndConv._pw_attributes if k in node.attributes} pw_attributes['filt_width'] = 1 pw_attributes['filt_height'] = 1 pw_attributes['stride_width'] = 1 diff --git a/hls4ml/model/types.py b/hls4ml/model/types.py index 9fb257a1ef..9d0a97440f 100644 --- a/hls4ml/model/types.py +++ b/hls4ml/model/types.py @@ -64,12 +64,15 @@ def __init__(self, width, signed): self.width = width self.signed = signed - def __eq__(self, other): + def __eq__(self, other: object) -> bool: eq = self.width == other.width eq = eq and self.signed == other.signed return eq + def __hash__(self) -> int: + return hash((self.width, self.signed)) + class IntegerPrecisionType(PrecisionType): """Arbitrary precision integer data type. @@ -89,12 +92,15 @@ def __str__(self): return typestring # Does this need to make sure other is also an IntegerPrecisionType? 
I could see a match between Fixed and Integer - def __eq__(self, other): + def __eq__(self, other: object) -> bool: if isinstance(other, IntegerPrecisionType): return super().__eq__(other) return False + def __hash__(self) -> int: + return super().__hash__() + @property def integer(self): return self.width @@ -186,7 +192,7 @@ def __str__(self): typestring = '{signed}fixed<{args}>'.format(signed='u' if not self.signed else '', args=args) return typestring - def __eq__(self, other): + def __eq__(self, other: object) -> bool: if isinstance(other, FixedPrecisionType): eq = super().__eq__(other) eq = eq and self.integer == other.integer @@ -197,6 +203,9 @@ def __eq__(self, other): return False + def __hash__(self) -> int: + return super().__hash__() ^ hash((self.integer, self.rounding_mode, self.saturation_mode, self.saturation_bits)) + class XnorPrecisionType(PrecisionType): """ diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_merge.h b/hls4ml/templates/catapult/nnet_utils/nnet_merge.h index 00c2cf5e12..9cba030710 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_merge.h @@ -11,6 +11,7 @@ namespace nnet { struct merge_config { static const unsigned n_elem = 10; + static const unsigned reuse_factor = 1; }; struct dot_config { diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_stream.h index c76bfba5a6..ec2e9bfb1a 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_stream.h @@ -41,6 +41,26 @@ void clone_stream(ac_channel &data, ac_channel &res1, ac_channel< } } +template +void clone_stream(ac_channel &data, ac_channel &res1, ac_channel &res2, ac_channel &res3) { +#ifndef __SYNTHESIS__ + while (data.available(1)) +#endif + { + data_T in_data = data.read(); + res_T out_data; + + ClonePack: + for (int j = 0; j < data_T::size; j++) { + out_data[j] = in_data[j]; + } + + res1.write(out_data); + res2.write(out_data); + res3.write(out_data); + } +} + template void repack_stream(ac_channel &data, ac_channel &res) { if (data_T::size == res_T::size) { for (int i = 0; i < N / data_T::size; i++) { diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h new file mode 100644 index 0000000000..d2c774fcf8 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h @@ -0,0 +1,19 @@ +#ifndef NNET_DEPTH_CONV1D_H_ +#define NNET_DEPTH_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d.h" +#include "nnet_depthconv1d_resource.h" + +namespace nnet { + +template +void depthwise_conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + + depthwise_conv_1d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h new file mode 100644 index 0000000000..c06b6b14e7 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h @@ -0,0 +1,60 @@ +#ifndef NNET_DEPTH_CONV1D_LATENCY_H_ +#define NNET_DEPTH_CONV1D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_conv1d_resource.h" +#include "nnet_mult.h" + +namespace nnet { + +template +void depthwise_conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const 
typename CONFIG_T::bias_t &biases) { + + int depth_multiplier = CONFIG_T::n_filt / CONFIG_T::n_chan; + [[intel::fpga_register]] int res_idx = 0; + + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::out_width * CONFIG_T::n_filt]; + +DM_LOOP: + #pragma unroll + for (int dm = 0; dm < depth_multiplier; dm++) { + + WIDTH_LOOP: + #pragma unroll + for (int w = 0; w < CONFIG_T::out_width; w++) { + + CHAN_LOOP: + #pragma unroll + for (int c = 0; c < CONFIG_T::n_chan; c++) { + + res_idx = (w * CONFIG_T::n_filt) + (c * depth_multiplier) + dm; + + acc[res_idx] = biases[c * depth_multiplier + dm]; + + KERNEL_W_LOOP: + #pragma unroll + for (int kw = 0; kw < CONFIG_T::filt_width; kw++) { + + int w_in = w * CONFIG_T::stride_width + kw - CONFIG_T::pad_left; + + if ((w_in >= 0) && (w_in < CONFIG_T::in_width)) { + + acc[res_idx] += CONFIG_T::mult_config:: + template product::product( + data[(w_in)*CONFIG_T::n_chan + c], + weights[(dm * CONFIG_T::filt_width * CONFIG_T::n_chan) + (kw * CONFIG_T::n_chan) + c]); + } + } + } + } + } + +RESULT: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::out_width * CONFIG_T::n_filt; ires++) { + res[ires] = cast(acc[ires]); + } +} +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h new file mode 100644 index 0000000000..87dc1805d9 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h @@ -0,0 +1,19 @@ +#ifndef NNET_DEPTH_CONV2D_H_ +#define NNET_DEPTH_CONV2D_H_ + +#include "nnet_common.h" +#include "nnet_conv2d.h" +#include "nnet_depthconv2d_resource.h" + +namespace nnet { + +template +void depthwise_conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + + depthwise_conv_2d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h new file mode 100644 index 0000000000..91ddc28f65 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h @@ -0,0 +1,76 @@ +#ifndef NNET_SEPARABLE_CONV2D_LATENCY_H_ +#define NNET_SEPARABLE_CONV2D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_conv2d_resource.h" +#include "nnet_mult.h" + +namespace nnet { + +template +void depthwise_conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + + int depth_multiplier = CONFIG_T::n_filt / CONFIG_T::n_chan; + [[intel::fpga_register]] int res_idx = 0; + + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::out_width * CONFIG_T::out_height * CONFIG_T::n_filt]; + +DM_LOOP: + #pragma unroll + for (int dm = 0; dm < depth_multiplier; dm++) { + + HEIGHT_LOOP: + #pragma unroll + for (int h = 0; h < CONFIG_T::out_height; h++) { + WIDTH_LOOP: + #pragma unroll + for (int w = 0; w < CONFIG_T::out_width; w++) { + + CHAN_LOOP: + #pragma unroll + for (int c = 0; c < CONFIG_T::n_chan; c++) { + + res_idx = + (h * CONFIG_T::out_width * CONFIG_T::n_filt) + (w * CONFIG_T::n_filt) + (c * depth_multiplier) + dm; + + acc[res_idx] = biases[c * depth_multiplier + dm]; + + KERNEL_H_LOOP: + #pragma unroll + for (int kh = 0; kh < CONFIG_T::filt_height; kh++) { + KERNEL_W_LOOP: + #pragma unroll + for (int kw = 0; kw < CONFIG_T::filt_width; kw++) { + + int h_in = h * 
CONFIG_T::stride_height + kh - CONFIG_T::pad_top; + int w_in = w * CONFIG_T::stride_width + kw - CONFIG_T::pad_left; + + if ((h_in >= 0) && (h_in < CONFIG_T::in_height) && (w_in >= 0) && (w_in < CONFIG_T::in_width)) { + + acc[res_idx] += + CONFIG_T::mult_config::template product:: + product( + data[(h_in)*CONFIG_T::in_width * CONFIG_T::n_chan + (w_in)*CONFIG_T::n_chan + c], + weights[(dm * CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan) + + (kh * CONFIG_T::filt_width * CONFIG_T::n_chan) + + (kw * CONFIG_T::n_chan) + c]); + + ; + } + } + } + } + } + } + } + +RESULT: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::out_width * CONFIG_T::out_height * CONFIG_T::n_filt; ires++) { + res[ires] = cast(acc[ires]); + } +} +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h index 550663b881..d1262f4377 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h @@ -7,6 +7,7 @@ namespace nnet { struct merge_config { static const unsigned n_elem = 10; + static const unsigned reuse_factor = 1; }; struct dot_config { diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h index 766ef2e208..1ee9a9f564 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h @@ -7,6 +7,7 @@ namespace nnet { struct merge_config { static const unsigned n_elem = 10; + static const unsigned reuse_factor = 1; }; struct dot_config { diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h index 52a404672c..46beeacb03 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h @@ -4,6 +4,7 @@ #include "nnet_common.h" #include "nnet_conv1d_latency.h" #include "nnet_conv1d_resource.h" +#include "nnet_function_stubs.h" #include namespace nnet { @@ -38,11 +39,7 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully. //#pragma HLS INLINE recursive - if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); - } else { - conv_1d_resource_cl(data, res, weights, biases); - } + CONFIG_T::template conv_kernel::conv(data, res, weights, biases); } template @@ -55,13 +52,28 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully. 
//#pragma HLS INLINE recursive - // Nothing special to be done for io_parallel implementation - if (CONFIG_T::strategy == nnet::latency) { + CONFIG_T::template conv_kernel::conv(data, res, weights, biases); +} + +template class Conv1DLatency : public Conv1DKernel { + public: + static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE region conv_1d_latency_cl(data, res, weights, biases); - } else { + } +}; + +template class Conv1DResource : public Conv1DKernel { + public: + static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE region conv_1d_resource_cl(data, res, weights, biases); } -} +}; } // namespace nnet diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index 1bf25cc89c..e166cdd470 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -85,5 +85,83 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = CONFIG_T::mult_config::template product::product( + data[index_data], weights[index_weight]); + } + } // end channel loop + } 
// end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + res[ii * CONFIG_T::n_filt + ff] = cast(acc[ii][ff]); + } + } +} + } // namespace nnet #endif diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index 7d0420611a..05d4b8a4d5 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -161,7 +161,7 @@ if {$opt(reset)} { } else { open_solution "solution1" } -catch {config_array_partition -maximum_size 4096} +catch {config_array_partition -maximum_size $maximum_size} config_compile -name_max_length 80 set_part $part config_schedule -enable_dsp_full_reg=false diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index 4a8a40cd10..6011e20cca 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -1,6 +1,7 @@ #ifndef NNET_INSTR_GEN_H_ #define NNET_INSTR_GEN_H_ +#include "nnet_conv1d_latency.h" #include "nnet_helpers.h" #include "hls_stream.h" @@ -10,6 +11,16 @@ namespace nnet { +template class PointwiseConv1D { + public: + static void pointwise_conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // To be implemented in subclasses + } +}; + // hls4ml insert code } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index a14517df5b..6db3f62f6e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -2,6 +2,7 @@ #define NNET_COMMON_H_ #include "ap_fixed.h" +#include "nnet_helpers.h" // This is a substitute for "ceil(n/(float)d)". 
#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index e2e0211b49..72bce78067 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -4,6 +4,7 @@ #include "nnet_common.h" #include "nnet_conv1d_latency.h" #include "nnet_conv1d_resource.h" +#include "nnet_function_stubs.h" #include namespace nnet { @@ -37,11 +38,7 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { #pragma HLS INLINE region - if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); - } else { - conv_1d_resource_cl(data, res, weights, biases); - } + CONFIG_T::template conv_kernel::conv(data, res, weights, biases); } template @@ -53,13 +50,28 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], #pragma HLS INLINE region - // Nothing special to be done for io_parallel implementation - if (CONFIG_T::strategy == nnet::latency) { + CONFIG_T::template conv_kernel::conv(data, res, weights, biases); +} + +template class Conv1DLatency : public Conv1DKernel { + public: + static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE region conv_1d_latency_cl(data, res, weights, biases); - } else { + } +}; + +template class Conv1DResource : public Conv1DKernel { + public: + static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE region conv_1d_resource_cl(data, res, weights, biases); } -} +}; } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 0d9afb10cb..ef2f94dcaf 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,5 +84,83 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit 
multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = CONFIG_T::mult_config::template product::product( + data[index_data], weights[index_weight]); + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + res[ii * CONFIG_T::n_filt + ff] = cast(acc[ii][ff]); + } + } +} + } // namespace nnet #endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h index 1316bbe776..97774bc95b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h @@ -37,6 +37,15 @@ template class DenseKernel { } }; +template class Conv1DKernel { + public: + static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // To be implemented in subclasses + } +}; + } // namespace nnet #endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h index 8005682978..979c447825 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h @@ -10,6 +10,7 @@ namespace nnet { struct merge_config { static const unsigned n_elem = 10; + static const unsigned reuse_factor = 1; }; struct dot_config { diff --git a/hls4ml/utils/attribute_descriptions.py b/hls4ml/utils/attribute_descriptions.py new file mode 100644 index 0000000000..756f276fa1 --- /dev/null +++ b/hls4ml/utils/attribute_descriptions.py @@ -0,0 +1,51 @@ +"""Strings holding attribute descriptions.""" + +# Common attributes + +reuse_factor = ( + 'The number of times each multiplier is used by controlling the 
amount of pipelining/unrolling. '
+    'Lower number results in more parallelism and lower latency at the expense of the resources used. '
+    'Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.'
+)
+
+index = 'Internal node counter used for bookkeeping and variable/tensor naming.'
+trace = 'Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)'
+
+result_type = 'The datatype (precision) of the output tensor.'
+accum_type = 'The datatype (precision) used to store intermediate results of the computation within the layer.'
+
+# Activation-related attributes
+
+table_size = 'The size of the lookup table used to approximate the function.'
+table_type = 'The datatype (precision) used for the values of the lookup table.'
+
+softmax_implementation = (
+    'Choice of implementation of softmax function. '
+    '"latency" provides good latency at the expense of extra resources. Performs well on a small number of classes. '
+    '"stable" may require extra clock cycles but has better accuracy. '
+    '"legacy" is the older implementation which has bad accuracy, but is fast and has low resource use. '
+    'It is superseded by the "latency" implementation for most applications. '
+    '"argmax" is a special implementation that can be used if only the output with the highest probability is important. '
+    'Using this implementation will save resources and clock cycles.'
+)
+softmax_skip = 'If enabled, skips the softmax node and returns the raw outputs.'
+
+# Convolution-related attributes
+
+conv_pf = (
+    'The number of outputs computed in parallel. Essentially the number of multiplications of the input window with the '
+    'convolution kernel occurring in parallel. '
+    'Higher number results in more parallelism (lower latency and II) at the expense of resources used. '
+    'Currently only supported in io_parallel.'
+)
+conv_implementation = (
+    '"LineBuffer" implementation is preferred over "Encoded" for most use cases. '
+    'This attribute only applies to io_stream.'
+)
+
+# Recurrent-related attributes
+
+recurrent_static = (
+    'If set to True, will reuse the same recurrent block for computation, resulting in lower resource '
+    'usage at the expense of serialized computation and higher latency/II.'
+)
diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py
index 1bd9ff25ef..1db8e3c731 100644
--- a/hls4ml/utils/config.py
+++ b/hls4ml/utils/config.py
@@ -283,7 +283,8 @@ def config_from_pytorch_model(
     default_precision='ap_fixed<16,6>',
     default_reuse_factor=1,
     channels_last_conversion='full',
-    transpose_outputs=True,
+    transpose_outputs=False,
+    max_precision=None,
 ):
     """Create an HLS conversion config given the PyTorch model.

@@ -291,6 +292,15 @@
     Users are advised to inspect the returned object to tweak the conversion configuration.
     The return object can be passed as `hls_config` parameter to `convert_from_pytorch_model`.

+    Note that hls4ml internally follows the keras convention for nested tensors known as
+    "channels last", whereas pytorch uses the "channels first" convention.
+    For example, for a tensor encoding an image with 3 channels, pytorch will expect the data
+    to be encoded as (Number_Of_Channels, Height, Width), whereas hls4ml expects
+    (Height, Width, Number_Of_Channels). 
By default, hls4ml will perform the necessary + conversions of the inputs and internal tensors automatically, but will return the output + in "channels last" However, this behavior can be controlled by the user using the + related arguments discussed below. + Args: model: PyTorch model input_shape (tuple or list of tuples): The shape of the input tensor, excluding the batch size. @@ -304,15 +314,19 @@ def config_from_pytorch_model( will generate config keys for every layer separately, allowing for highly specific configuration tweaks. backend(str, optional): Name of the backend to use - default_precision (str, optional): Default precision to use. Defaults to 'fixed<16,6>'. + default_precision (str, optional): Default precision to use. Defaults to 'fixed<16,6>'. Note, this must + be an explicit precision: 'auto' is not allowed. default_reuse_factor (int, optional): Default reuse factor. Defaults to 1. channels_last_conversion (string, optional): Configures the conversion of pytorch layers to - 'channels_last' dataformate. Can be set to 'full', 'internal', or 'off'. If 'full', both the inputs - and internal layers will be converted. If 'internal', only internal layers will be converted; this - assumes the inputs are converted by the user. If 'off', no conversion is performed. + 'channels_last' data format used by hls4ml internally. Can be set to 'full' (default), 'internal', + or 'off'. If 'full', both the inputs and internal layers will be converted. If 'internal', + only internal layers will be converted; this assumes the inputs are converted by the user. + If 'off', no conversion is performed. transpose_outputs (bool, optional): Set to 'False' if the output should not be transposed from channels_last into channels_first data format. Defaults to 'False'. If False, outputs needs to be transposed manually. + max_precision (str or None, optional): Maximum width precision to use. Defaults to None, meaning no maximum. + Note: Only integer and fixed precisions are supported Raises: Exception: If PyTorch model has layers not supported by hls4ml. @@ -324,11 +338,16 @@ def config_from_pytorch_model( config = {} model_config = {} - model_config['Precision'] = default_precision + model_config['Precision'] = {} + model_config['Precision']['default'] = default_precision + if max_precision is not None: + model_config['Precision']['maximum'] = max_precision model_config['ReuseFactor'] = default_reuse_factor model_config['ChannelsLastConversion'] = channels_last_conversion model_config['TransposeOutputs'] = transpose_outputs model_config['Strategy'] = 'Latency' + model_config['BramFactor'] = 1_000_000_000 + model_config['TraceOutput'] = False config['Model'] = model_config config['PytorchModel'] = model @@ -372,7 +391,7 @@ def make_layer_config(layer): if name.endswith('_t'): name = name[:-2] if attr.default is None: - precision_cfg[name] = default_precision + precision_cfg[name] = 'auto' else: precision_cfg[name] = str(attr.default) elif attr.name == 'reuse_factor': @@ -413,7 +432,7 @@ def make_layer_config(layer): def config_from_onnx_model( - model, granularity='name', backend=None, default_precision='ap_fixed<16,6>', default_reuse_factor=1 + model, granularity='name', backend=None, default_precision='fixed<16,6>', default_reuse_factor=1, max_precision=None ): """Create an HLS conversion config given the ONNX model. @@ -435,6 +454,8 @@ def config_from_onnx_model( backend(str, optional): Name of the backend to use default_precision (str, optional): Default precision to use. Defaults to 'fixed<16,6>'. 
default_reuse_factor (int, optional): Default reuse factor. Defaults to 1. + max_precision (str or None, optional): Maximum width precision to use. Defaults to None, meaning no maximum. + Note: Only integer and fixed precisions are supported Raises: Exception: If ONNX model has layers not supported by hls4ml. @@ -456,9 +477,14 @@ def config_from_onnx_model( config = {} model_config = {} - model_config['Precision'] = default_precision + model_config['Precision'] = {} + model_config['Precision']['default'] = default_precision + if max_precision is not None: + model_config['Precision']['maximum'] = max_precision model_config['ReuseFactor'] = default_reuse_factor model_config['Strategy'] = 'Latency' + model_config['BramFactor'] = 1_000_000_000 + model_config['TraceOutput'] = False config['Model'] = model_config diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py index cefa158e11..817847887d 100644 --- a/hls4ml/writer/vivado_accelerator_writer.py +++ b/hls4ml/writer/vivado_accelerator_writer.py @@ -394,6 +394,8 @@ def write_board_script(self, model): f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) f.write('variable version\n') f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) + f.write('variable maximum_size\n') + f.write('set maximum_size {}\n'.format(model.config.get_config_value('MaximumSize', '4096'))) if self.vivado_accelerator_config.get_interface() == 'axi_stream': in_bit, out_bit = self.vivado_accelerator_config.get_io_bitwidth() f.write(f'set bit_width_hls_output {in_bit}\n') diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 5ab13736ec..0341959045 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -717,6 +717,8 @@ def write_build_script(self, model): f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) f.write('variable version\n') f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) + f.write('variable maximum_size\n') + f.write('set maximum_size {}\n'.format(model.config.get_config_value('MaximumSize', '4096'))) # build_prj.tcl srcpath = (filedir / '../templates/vivado/build_prj.tcl').resolve() diff --git a/setup.cfg b/setup.cfg index 9b7ef45f8f..0b81e7b592 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,12 +27,12 @@ install_requires = numpy onnx>=1.4.0 pydigitalwavetools==1.1 + pyparsing pyyaml - qkeras tabulate - tensorflow + tensorflow>=2.8.0,<=2.14.1 tensorflow-model-optimization<=0.7.5 -python_requires = >=3.10 +python_requires = >=3.10, <3.12 include_package_data = True scripts = scripts/hls4ml diff --git a/test/pytest/generate_ci_yaml.py b/test/pytest/generate_ci_yaml.py index b130b43cef..adc3d680ab 100644 --- a/test/pytest/generate_ci_yaml.py +++ b/test/pytest/generate_ci_yaml.py @@ -18,13 +18,14 @@ EXAMPLEMODEL: {} """ + n_test_files_per_yml = int(os.environ.get('N_TESTS_PER_YAML', 4)) # Blacklisted tests will be skipped BLACKLIST = {'test_reduction'} # Long-running tests will not be bundled with other tests -LONGLIST = {'test_hgq_layers'} +LONGLIST = {'test_hgq_layers', 'test_hgq_players', 'test_qkeras', 'test_pytorch_api'} def path_to_name(test_path): @@ -71,7 +72,7 @@ def generate_test_yaml(test_root='.'): name = path.stem.replace('test_', '') test_file = str(path.relative_to(test_root)) needs_examples = uses_example_model(path) - diff_yml = yaml.safe_load(template.format(name, 
test_file, needs_examples)) + diff_yml = yaml.safe_load(template.format(name, test_file, int(needs_examples))) yml.update(diff_yml) return yml diff --git a/test/pytest/test_depthconv1d.py b/test/pytest/test_depthconv1d.py index 3734815af0..85c8e2ac4f 100644 --- a/test/pytest/test_depthconv1d.py +++ b/test/pytest/test_depthconv1d.py @@ -23,6 +23,7 @@ @pytest.mark.parametrize( 'backend, io_type', [ + ('oneAPI', 'io_parallel'), ('Vivado', 'io_parallel'), ('Vitis', 'io_parallel'), ('Vivado', 'io_stream'), diff --git a/test/pytest/test_depthconv2d.py b/test/pytest/test_depthconv2d.py index 9178edf368..4832cb1ae9 100644 --- a/test/pytest/test_depthconv2d.py +++ b/test/pytest/test_depthconv2d.py @@ -24,6 +24,7 @@ @pytest.mark.parametrize( 'backend, io_type', [ + ('oneAPI', 'io_parallel'), ('Vivado', 'io_parallel'), ('Vitis', 'io_parallel'), ('Vivado', 'io_stream'), diff --git a/test/pytest/test_hgq_layers.py b/test/pytest/test_hgq_layers.py index 92a7ea1876..80d96fbcda 100644 --- a/test/pytest/test_hgq_layers.py +++ b/test/pytest/test_hgq_layers.py @@ -19,7 +19,6 @@ Signature, ) from HGQ.proxy import to_proxy_model -from HGQ.proxy.fixed_point_quantizer import gfixed from tensorflow import keras from hls4ml.converters import convert_from_keras_model @@ -79,51 +78,6 @@ def run_model_test( _run_synth_match_test(proxy, data, io_type, backend, dir, cond=cond) -def create_player_model(layer: str, rnd_strategy: str, io_type: str): - pa_config = get_default_paq_conf() - pa_config['rnd_strategy'] = rnd_strategy - pa_config['skip_dims'] = 'all' if io_type == 'io_stream' else 'batch' - set_default_paq_conf(pa_config) - - inp = keras.Input(shape=(15)) - if 'PConcatenate' in layer: - _inp = [HQuantize()(inp)] * 2 - out = eval(layer)(_inp) - out = HDense(15)(out) - return keras.Model(inp, out) - elif 'Signature' in layer: - _inp = eval(layer)(inp) - out = HDense(15)(_inp) - return keras.Model(inp, out) - elif 'Pool2D' in layer: - _inp = PReshape((3, 5, 1))(HQuantize()(inp)) - elif 'Pool1D' in layer: - _inp = PReshape((5, 3))(HQuantize()(inp)) - elif 'Dense' in layer or 'Activation' in layer: - _inp = HQuantize()(inp) - elif 'Flatten' in layer: - out = HQuantize()(inp) - out = PReshape((3, 5))(out) - out = HConv1D(2, 2)(out) - out = eval(layer)(out) - out = HDense(15)(out) - return keras.Model(inp, out) - else: - raise Exception(f'Please add test for {layer}') - - out = eval(layer)(_inp) - model = keras.Model(inp, out) - - for layer in model.layers: - # No weight bitwidths to randomize - # And activation bitwidths - if hasattr(layer, 'paq'): - fbw: tf.Variable = layer.paq.fbw - fbw.assign(tf.constant(np.random.uniform(4, 6, fbw.shape).astype(np.float32))) - - return model - - def create_hlayer_model(layer: str, rnd_strategy: str, io_type: str): pa_config = get_default_paq_conf() pa_config['rnd_strategy'] = rnd_strategy @@ -222,43 +176,3 @@ def test_syn_hlayers(layer, N: int, rnd_strategy: str, io_type: str, cover_facto path = test_path / f'hls4mlprj_hgq_{layer}_{rnd_strategy}_{io_type}_{aggressive}_{backend}' run_model_test(model, cover_factor, data, io_type, backend, str(path), aggressive, cond=cond) - - -@pytest.mark.parametrize( - 'layer', - [ - "PConcatenate()", - "PMaxPool1D(2, padding='same')", - "PMaxPool1D(4, padding='same')", - "PMaxPool2D((5,3), padding='same')", - "PMaxPool1D(2, padding='valid')", - "PMaxPool2D((2,3), padding='valid')", - "Signature(1,6,3)", - "PAvgPool1D(2, padding='same')", - "PAvgPool2D((1,2), padding='same')", - "PAvgPool2D((2,2), padding='same')", - "PAvgPool1D(2, 
padding='valid')", - "PAvgPool2D((1,2), padding='valid')", - "PAvgPool2D((2,2), padding='valid')", - "PFlatten()", - ], -) -@pytest.mark.parametrize("N", [1000]) -@pytest.mark.parametrize("rnd_strategy", ['floor', 'standard_round']) -@pytest.mark.parametrize("io_type", ['io_parallel', 'io_stream']) -@pytest.mark.parametrize("cover_factor", [1.0]) -@pytest.mark.parametrize("aggressive", [True, False]) -@pytest.mark.parametrize("backend", ['vivado', 'vitis']) -def test_syn_players(layer, N: int, rnd_strategy: str, io_type: str, cover_factor: float, aggressive: bool, backend: str): - model = create_player_model(layer=layer, rnd_strategy=rnd_strategy, io_type=io_type) - data = get_data((N, 15), 7, 1) - - path = test_path / f'hls4mlprj_hgq_{layer}_{rnd_strategy}_{io_type}_{aggressive}_{backend}' - - if 'Signature' in layer: - q = gfixed(1, 6, 3) - data = q(data).numpy() - if "padding='same'" in layer and io_type == 'io_stream': - pytest.skip("io_stream does not support padding='same' for pools at the moment") - - run_model_test(model, cover_factor, data, io_type, backend, str(path), aggressive) diff --git a/test/pytest/test_hgq_players.py b/test/pytest/test_hgq_players.py new file mode 100644 index 0000000000..9c4b40f97f --- /dev/null +++ b/test/pytest/test_hgq_players.py @@ -0,0 +1,171 @@ +from pathlib import Path + +import HGQ # noqa: F401 +import numpy as np +import pytest +import tensorflow as tf +from HGQ import get_default_paq_conf, set_default_paq_conf, trace_minmax +from HGQ.layers import ( # noqa: F401 + HConv1D, + HDense, + HQuantize, + PAvgPool1D, + PAvgPool2D, + PConcatenate, + PFlatten, + PMaxPool1D, + PMaxPool2D, + PReshape, + Signature, +) +from HGQ.proxy import to_proxy_model +from HGQ.proxy.fixed_point_quantizer import gfixed +from tensorflow import keras + +from hls4ml.converters import convert_from_keras_model + +# tf.config.experimental_run_functions_eagerly(True) # noqa + + +test_path = Path(__file__).parent + + +def _run_synth_match_test(proxy: keras.Model, data, io_type: str, backend: str, dir: str, cond=None): + + output_dir = dir + '/hls4ml_prj' + hls_model = convert_from_keras_model( + proxy, + io_type=io_type, + output_dir=output_dir, + backend=backend, + hls_config={'Model': {'Precision': 'fixed<1,0>', 'ReuseFactor': 1}}, + ) + hls_model.compile() + + data_len = data.shape[0] if isinstance(data, np.ndarray) else data[0].shape[0] + # Multiple output case. Check each output separately + if len(proxy.outputs) > 1: # type: ignore + r_proxy: list[np.ndarray] = [x.numpy() for x in proxy(data)] # type: ignore + r_hls: list[np.ndarray] = hls_model.predict(data) # type: ignore + r_hls = [x.reshape(r_proxy[i].shape) for i, x in enumerate(r_hls)] + else: + r_proxy: list[np.ndarray] = [proxy(data).numpy()] # type: ignore + r_hls: list[np.ndarray] = [hls_model.predict(data).reshape(r_proxy[0].shape)] # type: ignore + + errors = [] + for i, (p, h) in enumerate(zip(r_proxy, r_hls)): + try: + if cond is None: + mismatch_ph = p != h + assert ( + np.sum(mismatch_ph) == 0 + ), f"Proxy-HLS4ML mismatch for out {i}: {np.sum(np.any(mismatch_ph, axis=1))} out of {data_len} samples are different. 
Sample: {p[mismatch_ph].ravel()[:5]} vs {h[mismatch_ph].ravel()[:5]}" # noqa: E501 + else: + cond(p, h) + except AssertionError as e: + errors.append(e) + if len(errors) > 0: + msgs = [str(e) for e in errors] + raise AssertionError('\n'.join(msgs)) + + +def run_model_test( + model: keras.Model, cover_factor: float | None, data, io_type: str, backend: str, dir: str, aggressive: bool, cond=None +): + data_len = data.shape[0] if isinstance(data, np.ndarray) else data[0].shape[0] + if cover_factor is not None: + trace_minmax(model, data, cover_factor=cover_factor, bsz=data_len) + proxy = to_proxy_model(model, aggressive=aggressive, unary_lut_max_table_size=4096) + _run_synth_match_test(proxy, data, io_type, backend, dir, cond=cond) + + +def create_player_model(layer: str, rnd_strategy: str, io_type: str): + pa_config = get_default_paq_conf() + pa_config['rnd_strategy'] = rnd_strategy + pa_config['skip_dims'] = 'all' if io_type == 'io_stream' else 'batch' + set_default_paq_conf(pa_config) + + inp = keras.Input(shape=(15)) + if 'PConcatenate' in layer: + _inp = [HQuantize()(inp)] * 2 + out = eval(layer)(_inp) + out = HDense(15)(out) + return keras.Model(inp, out) + elif 'Signature' in layer: + _inp = eval(layer)(inp) + out = HDense(15)(_inp) + return keras.Model(inp, out) + elif 'Pool2D' in layer: + _inp = PReshape((3, 5, 1))(HQuantize()(inp)) + elif 'Pool1D' in layer: + _inp = PReshape((5, 3))(HQuantize()(inp)) + elif 'Dense' in layer or 'Activation' in layer: + _inp = HQuantize()(inp) + elif 'Flatten' in layer: + out = HQuantize()(inp) + out = PReshape((3, 5))(out) + out = HConv1D(2, 2)(out) + out = eval(layer)(out) + out = HDense(15)(out) + return keras.Model(inp, out) + else: + raise Exception(f'Please add test for {layer}') + + out = eval(layer)(_inp) + model = keras.Model(inp, out) + + for layer in model.layers: + # No weight bitwidths to randomize + # And activation bitwidths + if hasattr(layer, 'paq'): + fbw: tf.Variable = layer.paq.fbw + fbw.assign(tf.constant(np.random.uniform(4, 6, fbw.shape).astype(np.float32))) + + return model + + +def get_data(shape: tuple[int, ...], v: float, max_scale: float): + rng = np.random.default_rng() + a1 = rng.uniform(-v, v, shape).astype(np.float32) + a2 = rng.uniform(0, max_scale, (1, shape[1])).astype(np.float32) + return (a1 * a2).astype(np.float32) + + +@pytest.mark.parametrize( + 'layer', + [ + "PConcatenate()", + "PMaxPool1D(2, padding='same')", + "PMaxPool1D(4, padding='same')", + "PMaxPool2D((5,3), padding='same')", + "PMaxPool1D(2, padding='valid')", + "PMaxPool2D((2,3), padding='valid')", + "Signature(1,6,3)", + "PAvgPool1D(2, padding='same')", + "PAvgPool2D((1,2), padding='same')", + "PAvgPool2D((2,2), padding='same')", + "PAvgPool1D(2, padding='valid')", + "PAvgPool2D((1,2), padding='valid')", + "PAvgPool2D((2,2), padding='valid')", + "PFlatten()", + ], +) +@pytest.mark.parametrize("N", [1000]) +@pytest.mark.parametrize("rnd_strategy", ['floor', 'standard_round']) +@pytest.mark.parametrize("io_type", ['io_parallel', 'io_stream']) +@pytest.mark.parametrize("cover_factor", [1.0]) +@pytest.mark.parametrize("aggressive", [True, False]) +@pytest.mark.parametrize("backend", ['vivado', 'vitis']) +def test_syn_players(layer, N: int, rnd_strategy: str, io_type: str, cover_factor: float, aggressive: bool, backend: str): + model = create_player_model(layer=layer, rnd_strategy=rnd_strategy, io_type=io_type) + data = get_data((N, 15), 7, 1) + + path = test_path / f'hls4mlprj_hgq_{layer}_{rnd_strategy}_{io_type}_{aggressive}_{backend}' + + if 'Signature' 
in layer: + q = gfixed(1, 6, 3) + data = q(data).numpy() + if "padding='same'" in layer and io_type == 'io_stream': + pytest.skip("io_stream does not support padding='same' for pools at the moment") + + run_model_test(model, cover_factor, data, io_type, backend, str(path), aggressive) diff --git a/test/pytest/test_multiout_network.py b/test/pytest/test_multiout_network.py index 15e23ff79a..366fac7fb5 100644 --- a/test/pytest/test_multiout_network.py +++ b/test/pytest/test_multiout_network.py @@ -19,6 +19,21 @@ def model(): return model +@pytest.fixture(scope='module') +def model_corner_cases(): + in1 = keras.layers.Input(shape=(24, 8)) + in2 = keras.layers.Input(shape=(16)) + out1 = keras.layers.Conv1D(1, 3)(in1) + out1 = keras.layers.Flatten()(out1) + out2 = keras.layers.Dense(16, activation='relu')(out1) + out2 = keras.layers.Add()([out2, in2]) + out3 = keras.layers.Dense(2)(out1) + out4 = keras.layers.Dense(2)(out2) + out4 = keras.layers.Flatten()(out4) + model = keras.models.Model(inputs=[in1, in2], outputs=[out1, out2, out3, out4]) + return model + + @pytest.fixture(scope='module') def data(): X = np.random.normal(0, 1, (1000, 10)) @@ -26,18 +41,20 @@ def data(): return X +@pytest.fixture(scope='module') +def data_corner_cases(): + X1 = np.random.normal(0, 1, (1000, 24, 8)) + X2 = np.random.normal(0, 1, (1000, 16)) + X1 = np.clip(X1, -16, 15) + X2 = np.clip(X2, -16, 15) + return X1, X2 + + @pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Vitis']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -def test_multi_clone(model, data, backend: str, io_type: str): +def test_multi_output_nn(model, data, backend: str, io_type: str): output_dir = str(test_root_path / f'hls4mlprj_multiout_network_{backend}_{io_type}') hls_config = {'Model': {'Precision': 'fixed<32,5>', 'ReuseFactor': 1}} - layer_config = { - 'dense1': {'Precision': {'result': 'fixed<35,5>'}}, - 'dense2': {'Precision': {'result': 'fixed<40,5>'}}, - 'dense1_linear': {'Precision': {'result': 'fixed<35,5>'}}, - 'dense2_linear': {'Precision': {'result': 'fixed<40,5>'}}, - } - hls_config['LayerName'] = layer_config model_hls = convert_from_keras_model( model, backend=backend, output_dir=output_dir, hls_config=hls_config, io_type=io_type ) @@ -50,3 +67,32 @@ def test_multi_clone(model, data, backend: str, io_type: str): assert np.allclose(r_hls[0], r_keras[0], atol=1e-5, rtol=0) assert np.allclose(r_hls[1], r_keras[1], atol=1e-5, rtol=0) + + +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Vitis', 'Catapult', 'OneAPI']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +@pytest.mark.parametrize('strategy', ['latency', 'resource']) +def test_multi_output_nn_corner_cases(model_corner_cases, data_corner_cases, backend: str, io_type: str, strategy: str): + """Cover corner cases, when: + - a layer outputs both to the next layer(s) and to the model output + - when an node removal/insertion is triggered internally + - a reshape in io_parallel, or flatten in io_stream layer's output is used multiple times + - and as layer output + - and by layer taking multiple inputs + - a Flatten layer outputs to the model output in io_stream + """ + output_dir = str(test_root_path / f'hls4mlprj_multiout_network_2_{backend}_{io_type}_{strategy}') + hls_config = {'Model': {'Precision': 'fixed<32,5>', 'ReuseFactor': 1}, 'Strategy': strategy} + + model_hls = convert_from_keras_model( + model_corner_cases, backend=backend, output_dir=output_dir, hls_config=hls_config, io_type=io_type + ) + + 
model_hls.compile() + r_hls = model_hls.predict(data_corner_cases) + r_keras = model_corner_cases.predict(data_corner_cases, verbose=0, batch_size=1000) + + assert np.allclose(r_hls[0], r_keras[0], atol=1e-5, rtol=0) + assert np.allclose(r_hls[1], r_keras[1], atol=1e-5, rtol=0) + assert np.allclose(r_hls[2], r_keras[2], atol=1e-5, rtol=0) + assert np.allclose(r_hls[3], r_keras[3], atol=1e-5, rtol=0) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 678b22bfeb..1cfb43e4cd 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -19,25 +19,27 @@ @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) @pytest.mark.parametrize( - 'backend, io_type, strategy', + 'backend, io_type, strategy, rf', [ - ('Quartus', 'io_parallel', 'resource'), - ('Quartus', 'io_stream', 'resource'), - ('oneAPI', 'io_parallel', 'resource'), - ('oneAPI', 'io_stream', 'resource'), - ('Vivado', 'io_parallel', 'resource'), - ('Vitis', 'io_parallel', 'resource'), - ('Vivado', 'io_parallel', 'latency'), - ('Vitis', 'io_parallel', 'latency'), - ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource'), - ('Vitis', 'io_stream', 'latency'), - ('Vitis', 'io_stream', 'resource'), - ('Catapult', 'io_stream', 'latency'), - ('Catapult', 'io_stream', 'resource'), + ('Quartus', 'io_parallel', 'resource', 1), + ('Quartus', 'io_stream', 'resource', 1), + ('oneAPI', 'io_parallel', 'resource', 1), + ('oneAPI', 'io_stream', 'resource', 1), + ('Vivado', 'io_parallel', 'resource', 1), + ('Vitis', 'io_parallel', 'resource', 1), + ('Vivado', 'io_parallel', 'latency', 1), + ('Vitis', 'io_parallel', 'latency', 1), + ('Vivado', 'io_parallel', 'latency', 14), + ('Vitis', 'io_parallel', 'latency', 14), + ('Vivado', 'io_stream', 'latency', 1), + ('Vivado', 'io_stream', 'resource', 1), + ('Vitis', 'io_stream', 'latency', 1), + ('Vitis', 'io_stream', 'resource', 1), + ('Catapult', 'io_stream', 'latency', 1), + ('Catapult', 'io_stream', 'resource', 1), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, rf): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -50,6 +52,7 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, + name='pointwise1d', ) ) model.compile(optimizer='adam', loss='mse') @@ -58,14 +61,12 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): keras_prediction = model.predict(X_input) default_precision = 'fixed<32,16>' - config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision) + config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy + config['LayerName']['pointwise1d']['ReuseFactor'] = rf output_dir = str( - test_root_path - / 'hls4mlprj_pointwise1d_{}_strides_{}_{}_padding_{}_{}_{}'.format( - chans, strides[0], padds, backend, io_type, strategy - ) + test_root_path / f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_rf{rf}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend @@ -110,6 +111,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', 
use_bias=False, data_format=chans, + name='pointwise2d', ) ) @@ -123,10 +125,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): config['Model']['Strategy'] = strategy stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( - test_root_path - / 'hls4mlprj_pointwise2d_{}_strides_{}_{}_padding_{}_{}_{}'.format( - chans, stride_cfg, padds, backend, io_type, strategy - ) + test_root_path / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' ) hls_model = hls4ml.converters.convert_from_keras_model( diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index 3056bd13f8..3de0b3f193 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -498,7 +498,7 @@ def test_pooling(pooling, padds, backend): model.eval() pytorch_prediction = model(torch.Tensor(X_input)).detach().numpy() - config = config_from_pytorch_model(model, input_shape_forHLS) + config = config_from_pytorch_model(model, input_shape_forHLS, transpose_outputs=True) output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_pooling_{pooling.__name__}_padds_{padds}_backend_{backend}') hls_model = convert_from_pytorch_model(model, hls_config=config, output_dir=output_dir, backend=backend) hls_model.compile() diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py index f822c591a7..f48f268626 100644 --- a/test/pytest/test_qonnx.py +++ b/test/pytest/test_qonnx.py @@ -101,6 +101,32 @@ def sep_conv_model(): return model +@pytest.fixture(scope='module') +def branched_model(): + """ + Load branched model using separable convs, already channels-last and cleaned + """ + dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + + return model + + +@pytest.fixture(scope='module') +def tiny_unet_model(): + """ + Load tiny unet model, already channels-last and cleaned + """ + dl_file = str(example_model_path / "onnx/tiny_unet_ch_last.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + + return model + + @pytest.fixture(scope='module') def two_layer_keras_model(): """ @@ -309,6 +335,58 @@ def test_sep_conv(sep_conv_model, backend): np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) +@pytest.mark.parametrize('backend', ['Vitis']) +def test_branched_model(branched_model, backend): + model = branched_model + ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) + X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='fixed<32,16>' + ) + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=str(test_root_path / f'hls4mlprj_qonnx_branched_model_{backend}'), + io_type='io_stream', + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) + + np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) + + +@pytest.mark.parametrize('backend', ['Vitis']) +def test_tiny_unet_model(tiny_unet_model, backend): + + model = tiny_unet_model + ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) + X = np.random.uniform(low=0, high=1, 
size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='fixed<32,16>' + ) + + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=str(test_root_path / f'hls4mlprj_qonnx_tiny_unet_model_{backend}'), + io_type='io_stream', + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) + + np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) + + @pytest.mark.parametrize( 'model_name', [ diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index 64312e9932..aef24db040 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -23,6 +23,7 @@ @pytest.mark.parametrize( 'backend, io_type', [ + ('oneAPI', 'io_parallel'), ('Vivado', 'io_parallel'), ('Vitis', 'io_parallel'), ('Vivado', 'io_stream'), diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 4732c7c7f1..1d056f15c9 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -23,6 +23,7 @@ @pytest.mark.parametrize( 'backend, io_type', [ + ('oneAPI', 'io_parallel'), ('Vivado', 'io_parallel'), ('Vitis', 'io_parallel'), ('Vivado', 'io_stream'),
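The per-layer `ReuseFactor` handling for pointwise convolutions exercised in `test_pointwiseconv.py` above can be driven directly from the Python API. The following is a minimal sketch mirroring the calls used in that test; the model definition, the layer name `pointwise1d`, the output directory and the choice of the Vivado backend are illustrative placeholders rather than anything prescribed by the patch.

```Python
import tensorflow as tf
import hls4ml

# Toy single-layer model with a 1x1 (pointwise) Conv1D named 'pointwise1d'
model = tf.keras.models.Sequential(
    [tf.keras.layers.Conv1D(4, 1, input_shape=(28, 3), use_bias=False, name='pointwise1d')]
)
model.compile(optimizer='adam', loss='mse')

# Name-granularity config so the layer can be targeted individually
config = hls4ml.utils.config_from_keras_model(model, default_precision='fixed<32,16>', granularity='name')
config['Model']['Strategy'] = 'Latency'
config['LayerName']['pointwise1d']['ReuseFactor'] = 14  # reuse multipliers across the 28 output positions

hls_model = hls4ml.converters.convert_from_keras_model(
    model, hls_config=config, output_dir='hls4mlprj_pointwise1d_rf14', io_type='io_parallel', backend='Vivado'
)
hls_model.compile()
```

Similarly, the new `max_precision` argument documented in the `config_from_onnx_model` and `config_from_pytorch_model` changes above caps the widths chosen by automatic precision inference. A hedged sketch, assuming an already cleaned, channels-last ONNX model loaded with qonnx's `ModelWrapper` (the file name, backend and precision strings are placeholders):

```Python
import hls4ml
from qonnx.core.modelwrapper import ModelWrapper

onnx_model = ModelWrapper('model_ch_last.onnx')  # hypothetical cleaned, channels-last model

config = hls4ml.utils.config.config_from_onnx_model(
    onnx_model,
    granularity='name',            # per-layer keys; most precisions are set to 'auto'
    backend='Vitis',
    default_precision='fixed<32,16>',
    max_precision='fixed<24,12>',  # upper bound for automatically inferred widths
)

hls_model = hls4ml.converters.convert_from_onnx_model(
    onnx_model, hls_config=config, io_type='io_stream', backend='Vitis', output_dir='hls4mlprj_onnx_auto'
)
hls_model.compile()
```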